From 44e6a2069122dbec29d25b8ced3f59f5626a97d7 Mon Sep 17 00:00:00 2001 From: Michal Marek Date: Tue, 17 May 2011 15:36:46 +0200 Subject: ftrace/kbuild: Add recordmcount files to force full build Modifications to recordmcount must be performed on all object files to stay consistent with what the kernel code may expect. Add the recordmcount files to the main dependencies to make sure any change to them causes a full recompile. Signed-off-by: Michal Marek Link: http://lkml.kernel.org/r/20110517133646.GP13293@sepie.suse.cz Signed-off-by: Steven Rostedt diff --git a/scripts/Makefile.build b/scripts/Makefile.build index d5f925a..7d3f903 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -252,6 +252,8 @@ sub_cmd_record_mcount = \ if [ $(@) != "scripts/mod/empty.o" ]; then \ $(objtree)/scripts/recordmcount "$(@)"; \ fi; +recordmcount_source := $(srctree)/scripts/recordmcount.c \ + $(srctree)/scripts/recordmcount.h else sub_cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \ "$(if $(CONFIG_CPU_BIG_ENDIAN),big,little)" \ @@ -259,6 +261,7 @@ sub_cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH "$(OBJDUMP)" "$(OBJCOPY)" "$(CC) $(KBUILD_CFLAGS)" \ "$(LD)" "$(NM)" "$(RM)" "$(MV)" \ "$(if $(part-of-module),1,0)" "$(@)"; +recordmcount_source := $(srctree)/scripts/recordmcount.pl endif cmd_record_mcount = \ if [ "$(findstring -pg,$(_c_flags))" = "-pg" ]; then \ @@ -279,13 +282,13 @@ define rule_cc_o_c endef # Built-in and composite module parts -$(obj)/%.o: $(src)/%.c FORCE +$(obj)/%.o: $(src)/%.c $(recordmcount_source) FORCE $(call cmd,force_checksrc) $(call if_changed_rule,cc_o_c) # Single-part modules are special since we need to mark them in $(MODVERDIR) -$(single-used-m): $(obj)/%.o: $(src)/%.c FORCE +$(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) FORCE $(call cmd,force_checksrc) $(call if_changed_rule,cc_o_c) @{ echo $(@:.o=.ko); echo $@; } > $(MODVERDIR)/$(@F:.o=.mod) -- cgit v0.10.2 From c6fece27dfa46abc8c4b87d94a8c2f8cd6c5d8b1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 Apr 2011 13:21:17 -0400 Subject: ftrace/trivial: Clean up recordmcount.c to use Linux style comparisons The Linux ftrace subsystem style for comparing is: var == 1 var > 0 and not: 1 == var 0 < var It is considered that Linux developers are smart enough not to do the if (var = 1) mistake. Cc: John Reiser Link: http://lkml.kernel.org/r/20110421023737.290712238@goodmis.org Signed-off-by: Steven Rostedt diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index f9f6f52..37afe0e 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -78,7 +78,7 @@ static off_t ulseek(int const fd, off_t const offset, int const whence) { off_t const w = lseek(fd, offset, whence); - if ((off_t)-1 == w) { + if (w == (off_t)-1) { perror("lseek"); fail_file(); } @@ -111,7 +111,7 @@ static void * umalloc(size_t size) { void *const addr = malloc(size); - if (0 == addr) { + if (addr == 0) { fprintf(stderr, "malloc failed: %zu bytes\n", size); fail_file(); } @@ -136,7 +136,7 @@ static void *mmap_file(char const *fname) void *addr; fd_map = open(fname, O_RDWR); - if (0 > fd_map || 0 > fstat(fd_map, &sb)) { + if (fd_map < 0 || fstat(fd_map, &sb) < 0) { perror(fname); fail_file(); } @@ -147,7 +147,7 @@ static void *mmap_file(char const *fname) addr = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd_map, 0); mmap_failed = 0; - if (MAP_FAILED == addr) { + if (addr == MAP_FAILED) { mmap_failed = 1; addr = umalloc(sb.st_size); uread(fd_map, addr, sb.st_size); @@ -206,12 +206,12 @@ static uint32_t (*w2)(uint16_t); static int is_mcounted_section_name(char const *const txtname) { - return 0 == strcmp(".text", txtname) || - 0 == strcmp(".ref.text", txtname) || - 0 == strcmp(".sched.text", txtname) || - 0 == strcmp(".spinlock.text", txtname) || - 0 == strcmp(".irqentry.text", txtname) || - 0 == strcmp(".text.unlikely", txtname); + return strcmp(".text", txtname) == 0 || + strcmp(".ref.text", txtname) == 0 || + strcmp(".sched.text", txtname) == 0 || + strcmp(".spinlock.text", txtname) == 0 || + strcmp(".irqentry.text", txtname) == 0 || + strcmp(".text.unlikely", txtname) == 0; } /* 32 bit and 64 bit are very similar */ @@ -270,7 +270,7 @@ do_file(char const *const fname) fail_file(); } break; case ELFDATA2LSB: { - if (1 != *(unsigned char const *)&endian) { + if (*(unsigned char const *)&endian != 1) { /* main() is big endian, file.o is little endian. */ w = w4rev; w2 = w2rev; @@ -278,7 +278,7 @@ do_file(char const *const fname) } } break; case ELFDATA2MSB: { - if (0 != *(unsigned char const *)&endian) { + if (*(unsigned char const *)&endian != 0) { /* main() is little endian, file.o is big endian. */ w = w4rev; w2 = w2rev; @@ -286,9 +286,9 @@ do_file(char const *const fname) } } break; } /* end switch */ - if (0 != memcmp(ELFMAG, ehdr->e_ident, SELFMAG) - || ET_REL != w2(ehdr->e_type) - || EV_CURRENT != ehdr->e_ident[EI_VERSION]) { + if (memcmp(ELFMAG, ehdr->e_ident, SELFMAG) != 0 + || w2(ehdr->e_type) != ET_REL + || ehdr->e_ident[EI_VERSION] != EV_CURRENT) { fprintf(stderr, "unrecognized ET_REL file %s\n", fname); fail_file(); } @@ -321,15 +321,15 @@ do_file(char const *const fname) fail_file(); } break; case ELFCLASS32: { - if (sizeof(Elf32_Ehdr) != w2(ehdr->e_ehsize) - || sizeof(Elf32_Shdr) != w2(ehdr->e_shentsize)) { + if (w2(ehdr->e_ehsize) != sizeof(Elf32_Ehdr) + || w2(ehdr->e_shentsize) != sizeof(Elf32_Shdr)) { fprintf(stderr, "unrecognized ET_REL file: %s\n", fname); fail_file(); } - if (EM_S390 == w2(ehdr->e_machine)) + if (w2(ehdr->e_machine) == EM_S390) reltype = R_390_32; - if (EM_MIPS == w2(ehdr->e_machine)) { + if (w2(ehdr->e_machine) == EM_MIPS) { reltype = R_MIPS_32; is_fake_mcount32 = MIPS32_is_fake_mcount; } @@ -337,15 +337,15 @@ do_file(char const *const fname) } break; case ELFCLASS64: { Elf64_Ehdr *const ghdr = (Elf64_Ehdr *)ehdr; - if (sizeof(Elf64_Ehdr) != w2(ghdr->e_ehsize) - || sizeof(Elf64_Shdr) != w2(ghdr->e_shentsize)) { + if (w2(ghdr->e_ehsize) != sizeof(Elf64_Ehdr) + || w2(ghdr->e_shentsize) != sizeof(Elf64_Shdr)) { fprintf(stderr, "unrecognized ET_REL file: %s\n", fname); fail_file(); } - if (EM_S390 == w2(ghdr->e_machine)) + if (w2(ghdr->e_machine) == EM_S390) reltype = R_390_64; - if (EM_MIPS == w2(ghdr->e_machine)) { + if (w2(ghdr->e_machine) == EM_MIPS) { reltype = R_MIPS_64; Elf64_r_sym = MIPS64_r_sym; Elf64_r_info = MIPS64_r_info; @@ -371,7 +371,7 @@ main(int argc, char const *argv[]) } /* Process each file in turn, allowing deep failure. */ - for (--argc, ++argv; 0 < argc; --argc, ++argv) { + for (--argc, ++argv; argc > 0; --argc, ++argv) { int const sjval = setjmp(jmpenv); int len; diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h index baf187b..ac7b330 100644 --- a/scripts/recordmcount.h +++ b/scripts/recordmcount.h @@ -275,12 +275,12 @@ static uint_t *sift_rel_mcount(uint_t *mlocp, Elf_Sym const *const symp = &sym0[Elf_r_sym(relp)]; char const *symname = &str0[w(symp->st_name)]; - char const *mcount = '_' == gpfx ? "_mcount" : "mcount"; + char const *mcount = gpfx == '_' ? "_mcount" : "mcount"; - if ('.' == symname[0]) + if (symname[0] == '.') ++symname; /* ppc64 hack */ - if (0 == strcmp(mcount, symname) || - (altmcount && 0 == strcmp(altmcount, symname))) + if (strcmp(mcount, symname) == 0 || + (altmcount && strcmp(altmcount, symname) == 0)) mcountsym = Elf_r_sym(relp); } @@ -290,7 +290,7 @@ static uint_t *sift_rel_mcount(uint_t *mlocp, mrelp->r_offset = _w(offbase + ((void *)mlocp - (void *)mloc0)); Elf_r_info(mrelp, recsym, reltype); - if (sizeof(Elf_Rela) == rel_entsize) { + if (rel_entsize == sizeof(Elf_Rela)) { ((Elf_Rela *)mrelp)->r_addend = addend; *mlocp++ = 0; } else @@ -354,12 +354,12 @@ __has_rel_mcount(Elf_Shdr const *const relhdr, /* is SHT_REL or SHT_RELA */ Elf_Shdr const *const txthdr = &shdr0[w(relhdr->sh_info)]; char const *const txtname = &shstrtab[w(txthdr->sh_name)]; - if (0 == strcmp("__mcount_loc", txtname)) { + if (strcmp("__mcount_loc", txtname) == 0) { fprintf(stderr, "warning: __mcount_loc already exists: %s\n", fname); succeed_file(); } - if (SHT_PROGBITS != w(txthdr->sh_type) || + if (w(txthdr->sh_type) != SHT_PROGBITS || !is_mcounted_section_name(txtname)) return NULL; return txtname; @@ -370,7 +370,7 @@ static char const *has_rel_mcount(Elf_Shdr const *const relhdr, char const *const shstrtab, char const *const fname) { - if (SHT_REL != w(relhdr->sh_type) && SHT_RELA != w(relhdr->sh_type)) + if (w(relhdr->sh_type) != SHT_REL && w(relhdr->sh_type) != SHT_RELA) return NULL; return __has_rel_mcount(relhdr, shdr0, shstrtab, fname); } -- cgit v0.10.2 From d7e8623ae9cf93a9b517336085a86cabb014ea77 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 Apr 2011 13:32:24 -0400 Subject: ftrace/trivial: Clean up record mcount to use Linux switch style The Linux style for switch statements is: switch (var) { case x: [...] break; } Not: switch (var) { case x: { [...] } break; Cc: John Reiser Link: http://lkml.kernel.org/r/20110421023737.523968644@goodmis.org Signed-off-by: Steven Rostedt diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 37afe0e..4ebd839 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -264,27 +264,27 @@ do_file(char const *const fname) w8 = w8nat; switch (ehdr->e_ident[EI_DATA]) { static unsigned int const endian = 1; - default: { + default: fprintf(stderr, "unrecognized ELF data encoding %d: %s\n", ehdr->e_ident[EI_DATA], fname); fail_file(); - } break; - case ELFDATA2LSB: { + break; + case ELFDATA2LSB: if (*(unsigned char const *)&endian != 1) { /* main() is big endian, file.o is little endian. */ w = w4rev; w2 = w2rev; w8 = w8rev; } - } break; - case ELFDATA2MSB: { + break; + case ELFDATA2MSB: if (*(unsigned char const *)&endian != 0) { /* main() is little endian, file.o is big endian. */ w = w4rev; w2 = w2rev; w8 = w8rev; } - } break; + break; } /* end switch */ if (memcmp(ELFMAG, ehdr->e_ident, SELFMAG) != 0 || w2(ehdr->e_type) != ET_REL @@ -295,11 +295,11 @@ do_file(char const *const fname) gpfx = 0; switch (w2(ehdr->e_machine)) { - default: { + default: fprintf(stderr, "unrecognized e_machine %d %s\n", w2(ehdr->e_machine), fname); fail_file(); - } break; + break; case EM_386: reltype = R_386_32; break; case EM_ARM: reltype = R_ARM_ABS32; altmcount = "__gnu_mcount_nc"; @@ -315,12 +315,12 @@ do_file(char const *const fname) } /* end switch */ switch (ehdr->e_ident[EI_CLASS]) { - default: { + default: fprintf(stderr, "unrecognized ELF class %d %s\n", ehdr->e_ident[EI_CLASS], fname); fail_file(); - } break; - case ELFCLASS32: { + break; + case ELFCLASS32: if (w2(ehdr->e_ehsize) != sizeof(Elf32_Ehdr) || w2(ehdr->e_shentsize) != sizeof(Elf32_Shdr)) { fprintf(stderr, @@ -334,7 +334,7 @@ do_file(char const *const fname) is_fake_mcount32 = MIPS32_is_fake_mcount; } do32(ehdr, fname, reltype); - } break; + break; case ELFCLASS64: { Elf64_Ehdr *const ghdr = (Elf64_Ehdr *)ehdr; if (w2(ghdr->e_ehsize) != sizeof(Elf64_Ehdr) @@ -352,7 +352,8 @@ do_file(char const *const fname) is_fake_mcount64 = MIPS64_is_fake_mcount; } do64(ghdr, fname, reltype); - } break; + break; + } } /* end switch */ cleanup(); @@ -386,23 +387,23 @@ main(int argc, char const *argv[]) continue; switch (sjval) { - default: { + default: fprintf(stderr, "internal error: %s\n", argv[0]); exit(1); - } break; - case SJ_SETJMP: { /* normal sequence */ + break; + case SJ_SETJMP: /* normal sequence */ /* Avoid problems if early cleanup() */ fd_map = -1; ehdr_curr = NULL; mmap_failed = 1; do_file(argv[0]); - } break; - case SJ_FAIL: { /* error in do_file or below */ + break; + case SJ_FAIL: /* error in do_file or below */ ++n_error; - } break; - case SJ_SUCCEED: { /* premature success */ + break; + case SJ_SUCCEED: /* premature success */ /* do nothing */ - } break; + break; } /* end switch */ } return !!n_error; -- cgit v0.10.2 From bde66c3f45d777082a0df01e64870f47ba2a5055 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 Apr 2011 14:10:22 -0400 Subject: ftrace: Add .kprobe.text section to whitelist for recordmcount.c The .kprobe.text section is safe to modify mcount to nop and tracing. Add it to the whitelist in recordmcount.c and recordmcount.pl. Cc: John Reiser Cc: Masami Hiramatsu Link: http://lkml.kernel.org/r/20110421023737.743350547@goodmis.org Signed-off-by: Steven Rostedt diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 4ebd839..37c5965 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -211,6 +211,7 @@ is_mcounted_section_name(char const *const txtname) strcmp(".sched.text", txtname) == 0 || strcmp(".spinlock.text", txtname) == 0 || strcmp(".irqentry.text", txtname) == 0 || + strcmp(".kprobes.text", txtname) == 0 || strcmp(".text.unlikely", txtname) == 0; } diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 4be0dee..a871cd4 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -134,6 +134,7 @@ my %text_sections = ( ".sched.text" => 1, ".spinlock.text" => 1, ".irqentry.text" => 1, + ".kprobes.text" => 1, ".text.unlikely" => 1, ); -- cgit v0.10.2 From df2ccb69454d022ce99e3a3b7ee7f9fb4a4e9563 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Apr 2011 13:31:08 -0400 Subject: ftrace/recordmcount: Modify only executable sections PROGBITS is not enough to determine if the section should be modified or not. Only process sections that are marked as executable. Cc: John Reiser Link: http://lkml.kernel.org/r/20110421023737.991485123@goodmis.org Signed-off-by: Steven Rostedt diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h index ac7b330..7f8d5c4 100644 --- a/scripts/recordmcount.h +++ b/scripts/recordmcount.h @@ -360,6 +360,7 @@ __has_rel_mcount(Elf_Shdr const *const relhdr, /* is SHT_REL or SHT_RELA */ succeed_file(); } if (w(txthdr->sh_type) != SHT_PROGBITS || + !(w(txthdr->sh_flags) & SHF_EXECINSTR) || !is_mcounted_section_name(txtname)) return NULL; return txtname; -- cgit v0.10.2 From 68eb1d2e5370c6da4c871c80fdccbe315dc2f3c1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 Apr 2011 03:58:48 -0400 Subject: ftrace/recordmcount: Make ignored mcount calls into nops at compile time There are sections that are ignored by ftrace for the function tracing because the text is in a section that can be removed without notice. The mcount calls in these sections are ignored and ftrace never sees them. The downside of this is that the functions in these sections still call mcount. Although the mcount function is defined in assembly simply as a return, this added overhead is unnecessary. The solution is to convert these callers into nops at compile time. A better solution is to add 'notrace' to the section markers, but as new sections come up all the time, it would be nice that they are delt with when they are created. Later patches will deal with finding these sections and doing the proper solution. Thanks to H. Peter Anvin for giving me the right nops to use for x86. Cc: "H. Peter Anvin" Cc: John Reiser Link: http://lkml.kernel.org/r/20110421023738.237101176@goodmis.org Signed-off-by: Steven Rostedt diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 37c5965..78054a4 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -118,6 +118,34 @@ umalloc(size_t size) return addr; } +static unsigned char ideal_nop5_x86_64[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; +static unsigned char ideal_nop5_x86_32[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 }; +static unsigned char *ideal_nop; + +static char rel_type_nop; + +static int (*make_nop)(void *map, size_t const offset); + +static int make_nop_x86(void *map, size_t const offset) +{ + uint32_t *ptr; + unsigned char *op; + + /* Confirm we have 0xe8 0x0 0x0 0x0 0x0 */ + ptr = map + offset; + if (*ptr != 0) + return -1; + + op = map + offset - 1; + if (*op != 0xe8) + return -1; + + /* convert to nop */ + ulseek(fd_map, offset - 1, SEEK_SET); + uwrite(fd_map, ideal_nop, 5); + return 0; +} + /* * Get the whole file as a programming convenience in order to avoid * malloc+lseek+read+free of many pieces. If successful, then mmap @@ -301,7 +329,11 @@ do_file(char const *const fname) w2(ehdr->e_machine), fname); fail_file(); break; - case EM_386: reltype = R_386_32; break; + case EM_386: + reltype = R_386_32; + make_nop = make_nop_x86; + ideal_nop = ideal_nop5_x86_32; + break; case EM_ARM: reltype = R_ARM_ABS32; altmcount = "__gnu_mcount_nc"; break; @@ -312,7 +344,11 @@ do_file(char const *const fname) case EM_S390: /* reltype: e_class */ gpfx = '_'; break; case EM_SH: reltype = R_SH_DIR32; break; case EM_SPARCV9: reltype = R_SPARC_64; gpfx = '_'; break; - case EM_X86_64: reltype = R_X86_64_64; break; + case EM_X86_64: + make_nop = make_nop_x86; + ideal_nop = ideal_nop5_x86_64; + reltype = R_X86_64_64; + break; } /* end switch */ switch (ehdr->e_ident[EI_CLASS]) { diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h index 7f8d5c4..657dbed 100644 --- a/scripts/recordmcount.h +++ b/scripts/recordmcount.h @@ -23,6 +23,7 @@ #undef fn_is_fake_mcount #undef MIPS_is_fake_mcount #undef sift_rel_mcount +#undef nop_mcount #undef find_secsym_ndx #undef __has_rel_mcount #undef has_rel_mcount @@ -49,6 +50,7 @@ #ifdef RECORD_MCOUNT_64 # define append_func append64 # define sift_rel_mcount sift64_rel_mcount +# define nop_mcount nop_mcount_64 # define find_secsym_ndx find64_secsym_ndx # define __has_rel_mcount __has64_rel_mcount # define has_rel_mcount has64_rel_mcount @@ -77,6 +79,7 @@ #else # define append_func append32 # define sift_rel_mcount sift32_rel_mcount +# define nop_mcount nop_mcount_32 # define find_secsym_ndx find32_secsym_ndx # define __has_rel_mcount __has32_rel_mcount # define has_rel_mcount has32_rel_mcount @@ -304,6 +307,70 @@ static uint_t *sift_rel_mcount(uint_t *mlocp, return mlocp; } +/* + * Read the relocation table again, but this time its called on sections + * that are not going to be traced. The mcount calls here will be converted + * into nops. + */ +static void nop_mcount(Elf_Shdr const *const relhdr, + Elf_Ehdr const *const ehdr) +{ + Elf_Shdr *const shdr0 = (Elf_Shdr *)(_w(ehdr->e_shoff) + + (void *)ehdr); + unsigned const symsec_sh_link = w(relhdr->sh_link); + Elf_Shdr const *const symsec = &shdr0[symsec_sh_link]; + Elf_Sym const *const sym0 = (Elf_Sym const *)(_w(symsec->sh_offset) + + (void *)ehdr); + + Elf_Shdr const *const strsec = &shdr0[w(symsec->sh_link)]; + char const *const str0 = (char const *)(_w(strsec->sh_offset) + + (void *)ehdr); + + Elf_Rel const *const rel0 = (Elf_Rel const *)(_w(relhdr->sh_offset) + + (void *)ehdr); + unsigned rel_entsize = _w(relhdr->sh_entsize); + unsigned const nrel = _w(relhdr->sh_size) / rel_entsize; + Elf_Rel const *relp = rel0; + + Elf_Shdr const *const shdr = &shdr0[w(relhdr->sh_info)]; + + unsigned mcountsym = 0; + unsigned t; + + for (t = nrel; t; --t) { + int ret = -1; + + if (!mcountsym) { + Elf_Sym const *const symp = + &sym0[Elf_r_sym(relp)]; + char const *symname = &str0[w(symp->st_name)]; + char const *mcount = gpfx == '_' ? "_mcount" : "mcount"; + + if (symname[0] == '.') + ++symname; /* ppc64 hack */ + if (strcmp(mcount, symname) == 0 || + (altmcount && strcmp(altmcount, symname) == 0)) + mcountsym = Elf_r_sym(relp); + } + + if (mcountsym == Elf_r_sym(relp) && !is_fake_mcount(relp)) + ret = make_nop((void *)ehdr, shdr->sh_offset + relp->r_offset); + + /* + * If we successfully removed the mcount, mark the relocation + * as a nop (don't do anything with it). + */ + if (!ret) { + Elf_Rel rel; + rel = *(Elf_Rel *)relp; + Elf_r_info(&rel, Elf_r_sym(relp), rel_type_nop); + ulseek(fd_map, (void *)relp - (void *)ehdr, SEEK_SET); + uwrite(fd_map, &rel, sizeof(rel)); + } + relp = (Elf_Rel const *)(rel_entsize + (void *)relp); + } +} + /* * Find a symbol in the given section, to be used as the base for relocating @@ -360,8 +427,7 @@ __has_rel_mcount(Elf_Shdr const *const relhdr, /* is SHT_REL or SHT_RELA */ succeed_file(); } if (w(txthdr->sh_type) != SHT_PROGBITS || - !(w(txthdr->sh_flags) & SHF_EXECINSTR) || - !is_mcounted_section_name(txtname)) + !(w(txthdr->sh_flags) & SHF_EXECINSTR)) return NULL; return txtname; } @@ -384,9 +450,11 @@ static unsigned tot_relsize(Elf_Shdr const *const shdr0, { unsigned totrelsz = 0; Elf_Shdr const *shdrp = shdr0; + char const *txtname; for (; nhdr; --nhdr, ++shdrp) { - if (has_rel_mcount(shdrp, shdr0, shstrtab, fname)) + txtname = has_rel_mcount(shdrp, shdr0, shstrtab, fname); + if (txtname && is_mcounted_section_name(txtname)) totrelsz += _w(shdrp->sh_size); } return totrelsz; @@ -422,7 +490,7 @@ do_func(Elf_Ehdr *const ehdr, char const *const fname, unsigned const reltype) for (relhdr = shdr0, k = nhdr; k; --k, ++relhdr) { char const *const txtname = has_rel_mcount(relhdr, shdr0, shstrtab, fname); - if (txtname) { + if (txtname && is_mcounted_section_name(txtname)) { uint_t recval = 0; unsigned const recsym = find_secsym_ndx( w(relhdr->sh_info), txtname, &recval, @@ -433,6 +501,12 @@ do_func(Elf_Ehdr *const ehdr, char const *const fname, unsigned const reltype) mlocp = sift_rel_mcount(mlocp, (void *)mlocp - (void *)mloc0, &mrelp, relhdr, ehdr, recsym, recval, reltype); + } else if (make_nop && txtname) { + /* + * This section is ignored by ftrace, but still + * has mcount calls. Convert them to nops now. + */ + nop_mcount(relhdr, ehdr); } } if (mloc0 != mlocp) { -- cgit v0.10.2 From 06fd6f6301cce9af4a03dd9e40bfca167e931ff1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 12 Apr 2011 18:53:25 -0400 Subject: ftrace/recordmcount: Add warning logic to warn on mcount not recorded There's some sections that should not have mcount recorded and should not have modifications to the that code. But currently they waste some time by calling mcount anyway (which simply returns). As the real answer should be to either whitelist the section or have gcc ignore it fully. This change adds a option to recordmcount to warn when it finds a section that is ignored by ftrace but still contains mcount callers. This is not on by default as developers may not know if the section should be completely ignored or added to the whitelist. Cc: John Reiser Link: http://lkml.kernel.org/r/20110421023738.476989377@goodmis.org Signed-off-by: Steven Rostedt diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 78054a4..0e18975 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -39,6 +40,7 @@ static char gpfx; /* prefix for global symbol name (sometimes '_') */ static struct stat sb; /* Remember .st_size, etc. */ static jmp_buf jmpenv; /* setjmp/longjmp per-file error escape */ static const char *altmcount; /* alternate mcount symbol name */ +static int warn_on_notrace_sect; /* warn when section has mcount not being recorded */ /* setjmp() return values */ enum { @@ -397,19 +399,33 @@ do_file(char const *const fname) } int -main(int argc, char const *argv[]) +main(int argc, char *argv[]) { const char ftrace[] = "/ftrace.o"; int ftrace_size = sizeof(ftrace) - 1; int n_error = 0; /* gcc-4.3.0 false positive complaint */ + int c; + int i; - if (argc <= 1) { - fprintf(stderr, "usage: recordmcount file.o...\n"); + while ((c = getopt(argc, argv, "w")) >= 0) { + switch (c) { + case 'w': + warn_on_notrace_sect = 1; + break; + default: + fprintf(stderr, "usage: recordmcount [-w] file.o...\n"); + return 0; + } + } + + if ((argc - optind) < 1) { + fprintf(stderr, "usage: recordmcount [-w] file.o...\n"); return 0; } /* Process each file in turn, allowing deep failure. */ - for (--argc, ++argv; argc > 0; --argc, ++argv) { + for (i = optind; i < argc; i++) { + char *file = argv[i]; int const sjval = setjmp(jmpenv); int len; @@ -418,14 +434,14 @@ main(int argc, char const *argv[]) * function but does not call it. Since ftrace.o should * not be traced anyway, we just skip it. */ - len = strlen(argv[0]); + len = strlen(file); if (len >= ftrace_size && - strcmp(argv[0] + (len - ftrace_size), ftrace) == 0) + strcmp(file + (len - ftrace_size), ftrace) == 0) continue; switch (sjval) { default: - fprintf(stderr, "internal error: %s\n", argv[0]); + fprintf(stderr, "internal error: %s\n", file); exit(1); break; case SJ_SETJMP: /* normal sequence */ @@ -433,7 +449,7 @@ main(int argc, char const *argv[]) fd_map = -1; ehdr_curr = NULL; mmap_failed = 1; - do_file(argv[0]); + do_file(file); break; case SJ_FAIL: /* error in do_file or below */ ++n_error; diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h index 657dbed..22033d5 100644 --- a/scripts/recordmcount.h +++ b/scripts/recordmcount.h @@ -313,7 +313,8 @@ static uint_t *sift_rel_mcount(uint_t *mlocp, * into nops. */ static void nop_mcount(Elf_Shdr const *const relhdr, - Elf_Ehdr const *const ehdr) + Elf_Ehdr const *const ehdr, + const char *const txtname) { Elf_Shdr *const shdr0 = (Elf_Shdr *)(_w(ehdr->e_shoff) + (void *)ehdr); @@ -336,6 +337,7 @@ static void nop_mcount(Elf_Shdr const *const relhdr, unsigned mcountsym = 0; unsigned t; + int once = 0; for (t = nrel; t; --t) { int ret = -1; @@ -353,8 +355,18 @@ static void nop_mcount(Elf_Shdr const *const relhdr, mcountsym = Elf_r_sym(relp); } - if (mcountsym == Elf_r_sym(relp) && !is_fake_mcount(relp)) - ret = make_nop((void *)ehdr, shdr->sh_offset + relp->r_offset); + if (mcountsym == Elf_r_sym(relp) && !is_fake_mcount(relp)) { + if (make_nop) + ret = make_nop((void *)ehdr, shdr->sh_offset + relp->r_offset); + if (warn_on_notrace_sect && !once) { + printf("Section %s has mcount callers being ignored\n", + txtname); + once = 1; + /* just warn? */ + if (!make_nop) + return; + } + } /* * If we successfully removed the mcount, mark the relocation @@ -501,12 +513,12 @@ do_func(Elf_Ehdr *const ehdr, char const *const fname, unsigned const reltype) mlocp = sift_rel_mcount(mlocp, (void *)mlocp - (void *)mloc0, &mrelp, relhdr, ehdr, recsym, recval, reltype); - } else if (make_nop && txtname) { + } else if (txtname && (warn_on_notrace_sect || make_nop)) { /* * This section is ignored by ftrace, but still * has mcount calls. Convert them to nops now. */ - nop_mcount(relhdr, ehdr); + nop_mcount(relhdr, ehdr, txtname); } } if (mloc0 != mlocp) { -- cgit v0.10.2 From a3a8350ab2f588f3a7a08dc86658bf90773f9a52 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 12 Apr 2011 18:59:10 -0400 Subject: kbuild/recordmcount: Add RECORDMCOUNT_WARN to warn about mcount callers When mcount is called in a section that ftrace will not modify it into a nop, we want to warn about this. But not warn about this always. Now if the user builds the kernel with the option RECORDMCOUNT_WARN=1 then the build will warn about mcount callers that are ignored and will just waste execution time. Acked-by: Michal Marek Cc: linux-kbuild@vger.kernel.org Link: http://lkml.kernel.org/r/20110421023738.714956282@goodmis.org Signed-off-by: Steven Rostedt diff --git a/Makefile b/Makefile index 41ea6fb..e7d01ada 100644 --- a/Makefile +++ b/Makefile @@ -1268,6 +1268,7 @@ help: @echo ' make C=1 [targets] Check all c source with $$CHECK (sparse by default)' @echo ' make C=2 [targets] Force check of all c source with $$CHECK' @echo ' make W=1 [targets] Enable extra gcc checks' + @echo ' make RECORDMCOUNT_WARN=1 [targets] Warn about ignored mcount sections' @echo '' @echo 'Execute "make" or "make all" to build all targets marked with [*] ' @echo 'For further info see the ./README file' diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 7d3f903..6165622 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -244,13 +244,16 @@ endif ifdef CONFIG_FTRACE_MCOUNT_RECORD ifdef BUILD_C_RECORDMCOUNT +ifeq ("$(origin RECORDMCOUNT_WARN)", "command line") + RECORDMCOUNT_FLAGS = -w +endif # Due to recursion, we must skip empty.o. # The empty.o file is created in the make process in order to determine # the target endianness and word size. It is made before all other C # files, including recordmcount. sub_cmd_record_mcount = \ if [ $(@) != "scripts/mod/empty.o" ]; then \ - $(objtree)/scripts/recordmcount "$(@)"; \ + $(objtree)/scripts/recordmcount $(RECORDMCOUNT_FLAGS) "$(@)"; \ fi; recordmcount_source := $(srctree)/scripts/recordmcount.c \ $(srctree)/scripts/recordmcount.h -- cgit v0.10.2 From 5f9b93c39239507d2af03f79c8b0df9ebb9be768 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 12 Apr 2011 19:06:39 -0400 Subject: ftrace: Avoid recording mcount on .init sections directly The init and exit sections should not be traced and adding a call to mcount to them is a waste of text and instruction cache. Have the macro section attributes include notrace to ignore these functions for tracing from the build. Link: http://lkml.kernel.org/r/20110421023738.953028219@goodmis.org Signed-off-by: Steven Rostedt diff --git a/include/linux/init.h b/include/linux/init.h index 577671c..9146f39 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -79,29 +79,29 @@ #define __exitused __used #endif -#define __exit __section(.exit.text) __exitused __cold +#define __exit __section(.exit.text) __exitused __cold notrace /* Used for HOTPLUG */ -#define __devinit __section(.devinit.text) __cold +#define __devinit __section(.devinit.text) __cold notrace #define __devinitdata __section(.devinit.data) #define __devinitconst __section(.devinit.rodata) -#define __devexit __section(.devexit.text) __exitused __cold +#define __devexit __section(.devexit.text) __exitused __cold notrace #define __devexitdata __section(.devexit.data) #define __devexitconst __section(.devexit.rodata) /* Used for HOTPLUG_CPU */ -#define __cpuinit __section(.cpuinit.text) __cold +#define __cpuinit __section(.cpuinit.text) __cold notrace #define __cpuinitdata __section(.cpuinit.data) #define __cpuinitconst __section(.cpuinit.rodata) -#define __cpuexit __section(.cpuexit.text) __exitused __cold +#define __cpuexit __section(.cpuexit.text) __exitused __cold notrace #define __cpuexitdata __section(.cpuexit.data) #define __cpuexitconst __section(.cpuexit.rodata) /* Used for MEMORY_HOTPLUG */ -#define __meminit __section(.meminit.text) __cold +#define __meminit __section(.meminit.text) __cold notrace #define __meminitdata __section(.meminit.data) #define __meminitconst __section(.meminit.rodata) -#define __memexit __section(.memexit.text) __exitused __cold +#define __memexit __section(.memexit.text) __exitused __cold notrace #define __memexitdata __section(.memexit.data) #define __memexitconst __section(.memexit.rodata) -- cgit v0.10.2 From 406a733db0052fed66d17008430b6047ecea7b92 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Apr 2011 16:43:29 -0400 Subject: ftrace/x86: Do not trace .discard.text section The section called .discard.text has tracing attached to it and is currently ignored by ftrace. But it does include a call to the mcount stub. Adding a notrace to the code keeps gcc from adding the useless mcount caller to it. Link: http://lkml.kernel.org/r/20110421023739.243651696@goodmis.org Signed-off-by: Steven Rostedt diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index db8aa19..647d8a0 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -88,7 +88,7 @@ void *extend_brk(size_t size, size_t align); * executable.) */ #define RESERVE_BRK(name,sz) \ - static void __section(.discard.text) __used \ + static void __section(.discard.text) __used notrace \ __brk_reservation_fn_##name##__(void) { \ asm volatile ( \ ".pushsection .brk_reservation,\"aw\",@nobits;" \ -- cgit v0.10.2 From f6d4f08194d77c86fe3af6e2e4b9184347014926 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Apr 2011 20:47:34 -0400 Subject: ftrace/recordmcount: Remove duplicate code to find mcount symbol The code in sift_rel_mcount() and nop_mcount() to get the mcount symbol number is identical. Replace the two locations with a call to a function that does the work. Cc: John Reiser Link: http://lkml.kernel.org/r/20110421023739.488093407@goodmis.org Signed-off-by: Steven Rostedt diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h index 22033d5..deb6a51 100644 --- a/scripts/recordmcount.h +++ b/scripts/recordmcount.h @@ -28,6 +28,7 @@ #undef __has_rel_mcount #undef has_rel_mcount #undef tot_relsize +#undef get_mcountsym #undef do_func #undef Elf_Addr #undef Elf_Ehdr @@ -56,6 +57,7 @@ # define has_rel_mcount has64_rel_mcount # define tot_relsize tot64_relsize # define do_func do64 +# define get_mcountsym get_mcountsym_64 # define is_fake_mcount is_fake_mcount64 # define fn_is_fake_mcount fn_is_fake_mcount64 # define MIPS_is_fake_mcount MIPS64_is_fake_mcount @@ -85,6 +87,7 @@ # define has_rel_mcount has32_rel_mcount # define tot_relsize tot32_relsize # define do_func do32 +# define get_mcountsym get_mcountsym_32 # define is_fake_mcount is_fake_mcount32 # define fn_is_fake_mcount fn_is_fake_mcount32 # define MIPS_is_fake_mcount MIPS32_is_fake_mcount @@ -237,6 +240,26 @@ static void append_func(Elf_Ehdr *const ehdr, uwrite(fd_map, ehdr, sizeof(*ehdr)); } +static unsigned get_mcountsym(Elf_Sym const *const sym0, + Elf_Rel const *relp, + char const *const str0) +{ + unsigned mcountsym = 0; + + Elf_Sym const *const symp = + &sym0[Elf_r_sym(relp)]; + char const *symname = &str0[w(symp->st_name)]; + char const *mcount = gpfx == '_' ? "_mcount" : "mcount"; + + if (symname[0] == '.') + ++symname; /* ppc64 hack */ + if (strcmp(mcount, symname) == 0 || + (altmcount && strcmp(altmcount, symname) == 0)) + mcountsym = Elf_r_sym(relp); + + return mcountsym; +} + /* * Look at the relocations in order to find the calls to mcount. * Accumulate the section offsets that are found, and their relocation info, @@ -274,18 +297,8 @@ static uint_t *sift_rel_mcount(uint_t *mlocp, unsigned t; for (t = nrel; t; --t) { - if (!mcountsym) { - Elf_Sym const *const symp = - &sym0[Elf_r_sym(relp)]; - char const *symname = &str0[w(symp->st_name)]; - char const *mcount = gpfx == '_' ? "_mcount" : "mcount"; - - if (symname[0] == '.') - ++symname; /* ppc64 hack */ - if (strcmp(mcount, symname) == 0 || - (altmcount && strcmp(altmcount, symname) == 0)) - mcountsym = Elf_r_sym(relp); - } + if (!mcountsym) + mcountsym = get_mcountsym(sym0, relp, str0); if (mcountsym == Elf_r_sym(relp) && !is_fake_mcount(relp)) { uint_t const addend = _w(_w(relp->r_offset) - recval); @@ -342,18 +355,8 @@ static void nop_mcount(Elf_Shdr const *const relhdr, for (t = nrel; t; --t) { int ret = -1; - if (!mcountsym) { - Elf_Sym const *const symp = - &sym0[Elf_r_sym(relp)]; - char const *symname = &str0[w(symp->st_name)]; - char const *mcount = gpfx == '_' ? "_mcount" : "mcount"; - - if (symname[0] == '.') - ++symname; /* ppc64 hack */ - if (strcmp(mcount, symname) == 0 || - (altmcount && strcmp(altmcount, symname) == 0)) - mcountsym = Elf_r_sym(relp); - } + if (!mcountsym) + mcountsym = get_mcountsym(sym0, relp, str0); if (mcountsym == Elf_r_sym(relp) && !is_fake_mcount(relp)) { if (make_nop) -- cgit v0.10.2 From 0286d2d12d78ea8aaf0722473dd20cb609a99709 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Apr 2011 21:13:06 -0400 Subject: ftrace/recordmcount: Add helper function get_sym_str_and_relp() The code to get the symbol, string, and relp pointers in the two functions sift_rel_mcount() and nop_mcount() are identical and also non-trivial. Moving this duplicate code into a single helper function makes the code easier to read and more maintainable. Cc: John Reiser Link: http://lkml.kernel.org/r/20110421023739.723658553@goodmis.org Signed-off-by: Steven Rostedt diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h index deb6a51..3c00fab 100644 --- a/scripts/recordmcount.h +++ b/scripts/recordmcount.h @@ -29,6 +29,7 @@ #undef has_rel_mcount #undef tot_relsize #undef get_mcountsym +#undef get_sym_str_and_relp #undef do_func #undef Elf_Addr #undef Elf_Ehdr @@ -56,6 +57,7 @@ # define __has_rel_mcount __has64_rel_mcount # define has_rel_mcount has64_rel_mcount # define tot_relsize tot64_relsize +# define get_sym_str_and_relp get_sym_str_and_relp_64 # define do_func do64 # define get_mcountsym get_mcountsym_64 # define is_fake_mcount is_fake_mcount64 @@ -86,6 +88,7 @@ # define __has_rel_mcount __has32_rel_mcount # define has_rel_mcount has32_rel_mcount # define tot_relsize tot32_relsize +# define get_sym_str_and_relp get_sym_str_and_relp_32 # define do_func do32 # define get_mcountsym get_mcountsym_32 # define is_fake_mcount is_fake_mcount32 @@ -260,6 +263,29 @@ static unsigned get_mcountsym(Elf_Sym const *const sym0, return mcountsym; } +static void get_sym_str_and_relp(Elf_Shdr const *const relhdr, + Elf_Ehdr const *const ehdr, + Elf_Sym const **sym0, + char const **str0, + Elf_Rel const **relp) +{ + Elf_Shdr *const shdr0 = (Elf_Shdr *)(_w(ehdr->e_shoff) + + (void *)ehdr); + unsigned const symsec_sh_link = w(relhdr->sh_link); + Elf_Shdr const *const symsec = &shdr0[symsec_sh_link]; + Elf_Shdr const *const strsec = &shdr0[w(symsec->sh_link)]; + Elf_Rel const *const rel0 = (Elf_Rel const *)(_w(relhdr->sh_offset) + + (void *)ehdr); + + *sym0 = (Elf_Sym const *)(_w(symsec->sh_offset) + + (void *)ehdr); + + *str0 = (char const *)(_w(strsec->sh_offset) + + (void *)ehdr); + + *relp = rel0; +} + /* * Look at the relocations in order to find the calls to mcount. * Accumulate the section offsets that are found, and their relocation info, @@ -276,26 +302,16 @@ static uint_t *sift_rel_mcount(uint_t *mlocp, { uint_t *const mloc0 = mlocp; Elf_Rel *mrelp = *mrelpp; - Elf_Shdr *const shdr0 = (Elf_Shdr *)(_w(ehdr->e_shoff) - + (void *)ehdr); - unsigned const symsec_sh_link = w(relhdr->sh_link); - Elf_Shdr const *const symsec = &shdr0[symsec_sh_link]; - Elf_Sym const *const sym0 = (Elf_Sym const *)(_w(symsec->sh_offset) - + (void *)ehdr); - - Elf_Shdr const *const strsec = &shdr0[w(symsec->sh_link)]; - char const *const str0 = (char const *)(_w(strsec->sh_offset) - + (void *)ehdr); - - Elf_Rel const *const rel0 = (Elf_Rel const *)(_w(relhdr->sh_offset) - + (void *)ehdr); + Elf_Sym const *sym0; + char const *str0; + Elf_Rel const *relp; unsigned rel_entsize = _w(relhdr->sh_entsize); unsigned const nrel = _w(relhdr->sh_size) / rel_entsize; - Elf_Rel const *relp = rel0; - unsigned mcountsym = 0; unsigned t; + get_sym_str_and_relp(relhdr, ehdr, &sym0, &str0, &relp); + for (t = nrel; t; --t) { if (!mcountsym) mcountsym = get_mcountsym(sym0, relp, str0); @@ -331,27 +347,18 @@ static void nop_mcount(Elf_Shdr const *const relhdr, { Elf_Shdr *const shdr0 = (Elf_Shdr *)(_w(ehdr->e_shoff) + (void *)ehdr); - unsigned const symsec_sh_link = w(relhdr->sh_link); - Elf_Shdr const *const symsec = &shdr0[symsec_sh_link]; - Elf_Sym const *const sym0 = (Elf_Sym const *)(_w(symsec->sh_offset) - + (void *)ehdr); - - Elf_Shdr const *const strsec = &shdr0[w(symsec->sh_link)]; - char const *const str0 = (char const *)(_w(strsec->sh_offset) - + (void *)ehdr); - - Elf_Rel const *const rel0 = (Elf_Rel const *)(_w(relhdr->sh_offset) - + (void *)ehdr); + Elf_Sym const *sym0; + char const *str0; + Elf_Rel const *relp; + Elf_Shdr const *const shdr = &shdr0[w(relhdr->sh_info)]; unsigned rel_entsize = _w(relhdr->sh_entsize); unsigned const nrel = _w(relhdr->sh_size) / rel_entsize; - Elf_Rel const *relp = rel0; - - Elf_Shdr const *const shdr = &shdr0[w(relhdr->sh_info)]; - unsigned mcountsym = 0; unsigned t; int once = 0; + get_sym_str_and_relp(relhdr, ehdr, &sym0, &str0, &relp); + for (t = nrel; t; --t) { int ret = -1; -- cgit v0.10.2 From f215efc50a3c1430735e89915fae188f927d8773 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 10 May 2011 10:10:40 +0200 Subject: ftrace/recordmcount: mcount address adjustment Introduce mcount_adjust{,_32,_64} to the C implementation of recordmcount analog to $mcount_adjust in the perl script. The adjustment is added to the address of the relocations against the mcount symbol. If this adjustment is done by recordmcount at compile time the ftrace_call_adjust function can be turned into a nop. Cc: John Reiser Signed-off-by: Martin Schwidefsky Signed-off-by: Steven Rostedt diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h index 3c00fab..4be6036 100644 --- a/scripts/recordmcount.h +++ b/scripts/recordmcount.h @@ -22,6 +22,7 @@ #undef is_fake_mcount #undef fn_is_fake_mcount #undef MIPS_is_fake_mcount +#undef mcount_adjust #undef sift_rel_mcount #undef nop_mcount #undef find_secsym_ndx @@ -63,6 +64,7 @@ # define is_fake_mcount is_fake_mcount64 # define fn_is_fake_mcount fn_is_fake_mcount64 # define MIPS_is_fake_mcount MIPS64_is_fake_mcount +# define mcount_adjust mcount_adjust_64 # define Elf_Addr Elf64_Addr # define Elf_Ehdr Elf64_Ehdr # define Elf_Shdr Elf64_Shdr @@ -94,6 +96,7 @@ # define is_fake_mcount is_fake_mcount32 # define fn_is_fake_mcount fn_is_fake_mcount32 # define MIPS_is_fake_mcount MIPS32_is_fake_mcount +# define mcount_adjust mcount_adjust_32 # define Elf_Addr Elf32_Addr # define Elf_Ehdr Elf32_Ehdr # define Elf_Shdr Elf32_Shdr @@ -132,6 +135,8 @@ static void fn_ELF_R_INFO(Elf_Rel *const rp, unsigned sym, unsigned type) } static void (*Elf_r_info)(Elf_Rel *const rp, unsigned sym, unsigned type) = fn_ELF_R_INFO; +static int mcount_adjust = 0; + /* * MIPS mcount long call has 2 _mcount symbols, only the position of the 1st * _mcount symbol is needed for dynamic function tracer, with it, to disable @@ -317,8 +322,8 @@ static uint_t *sift_rel_mcount(uint_t *mlocp, mcountsym = get_mcountsym(sym0, relp, str0); if (mcountsym == Elf_r_sym(relp) && !is_fake_mcount(relp)) { - uint_t const addend = _w(_w(relp->r_offset) - recval); - + uint_t const addend = + _w(_w(relp->r_offset) - recval + mcount_adjust); mrelp->r_offset = _w(offbase + ((void *)mlocp - (void *)mloc0)); Elf_r_info(mrelp, recsym, reltype); -- cgit v0.10.2 From 414eb44dda6b9893b212b2306ccb423ab01eef8e Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 10 May 2011 10:10:41 +0200 Subject: ftrace/x86: mcount offset calculation Do the mcount offset adjustment in the recordmcount.pl/recordmcount.[ch] at compile time and not in ftrace_call_adjust at run time. Signed-off-by: Martin Schwidefsky Signed-off-by: Steven Rostedt diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index db24c22..268c783 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -38,11 +38,10 @@ extern void mcount(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) { /* - * call mcount is "e8 <4 byte offset>" - * The addr points to the 4 byte offset and the caller of this - * function wants the pointer to e8. Simply subtract one. + * addr is the address of the mcount call instruction. + * recordmcount does the necessary offset calculation. */ - return addr - 1; + return addr; } #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 0e18975..7648a5d 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -335,6 +335,7 @@ do_file(char const *const fname) reltype = R_386_32; make_nop = make_nop_x86; ideal_nop = ideal_nop5_x86_32; + mcount_adjust_32 = -1; break; case EM_ARM: reltype = R_ARM_ABS32; altmcount = "__gnu_mcount_nc"; @@ -350,6 +351,7 @@ do_file(char const *const fname) make_nop = make_nop_x86; ideal_nop = ideal_nop5_x86_64; reltype = R_X86_64_64; + mcount_adjust_64 = -1; break; } /* end switch */ diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index a871cd4..414e7f5 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -223,6 +223,7 @@ if ($arch eq "x86_64") { $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount([+-]0x[0-9a-zA-Z]+)?\$"; $type = ".quad"; $alignment = 8; + $mcount_adjust = -1; # force flags for this arch $ld .= " -m elf_x86_64"; @@ -232,6 +233,7 @@ if ($arch eq "x86_64") { } elsif ($arch eq "i386") { $alignment = 4; + $mcount_adjust = -1; # force flags for this arch $ld .= " -m elf_i386"; -- cgit v0.10.2 From 0ef5bf1cd3bd2e3eacb11fca80efe3674927ed1a Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 10 May 2011 10:10:43 +0200 Subject: ftrace/s390: mcount offset calculation Do the mcount offset adjustment in the recordmcount.pl/recordmcount.[ch] at compile time and not in ftrace_call_adjust at run time. Signed-off-by: Martin Schwidefsky Signed-off-by: Steven Rostedt diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 3c29be4..b7931fa 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -11,15 +11,13 @@ struct dyn_arch_ftrace { }; #ifdef CONFIG_64BIT #define MCOUNT_INSN_SIZE 12 -#define MCOUNT_OFFSET 8 #else #define MCOUNT_INSN_SIZE 20 -#define MCOUNT_OFFSET 4 #endif static inline unsigned long ftrace_call_adjust(unsigned long addr) { - return addr - MCOUNT_OFFSET; + return addr; } #endif /* __ASSEMBLY__ */ diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 7648a5d..ee52cb8 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -368,8 +368,10 @@ do_file(char const *const fname) "unrecognized ET_REL file: %s\n", fname); fail_file(); } - if (w2(ehdr->e_machine) == EM_S390) + if (w2(ehdr->e_machine) == EM_S390) { reltype = R_390_32; + mcount_adjust_32 = -4; + } if (w2(ehdr->e_machine) == EM_MIPS) { reltype = R_MIPS_32; is_fake_mcount32 = MIPS32_is_fake_mcount; @@ -384,8 +386,10 @@ do_file(char const *const fname) "unrecognized ET_REL file: %s\n", fname); fail_file(); } - if (w2(ghdr->e_machine) == EM_S390) + if (w2(ghdr->e_machine) == EM_S390) { reltype = R_390_64; + mcount_adjust_64 = -8; + } if (w2(ghdr->e_machine) == EM_MIPS) { reltype = R_MIPS_64; Elf64_r_sym = MIPS64_r_sym; diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 414e7f5..858966a 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -243,12 +243,14 @@ if ($arch eq "x86_64") { } elsif ($arch eq "s390" && $bits == 32) { $mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_390_32\\s+_mcount\$"; + $mcount_adjust = -4; $alignment = 4; $ld .= " -m elf_s390"; $cc .= " -m31"; } elsif ($arch eq "s390" && $bits == 64) { $mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_390_(PC|PLT)32DBL\\s+_mcount\\+0x2\$"; + $mcount_adjust = -8; $alignment = 8; $type = ".quad"; $ld .= " -m elf64_s390"; -- cgit v0.10.2 From 075e0b00857e166dcc3e39037a1fc5a90acac709 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 9 Apr 2011 21:17:40 +0200 Subject: perf: Optimize ctx_sched_out() Oleg noted that ctx_sched_out() disables the PMU even though it might not actually do something, avoid needless PMU-disabling. Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110409192141.665385503@chello.nl Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index d863b3c..4d9a1f01 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1760,7 +1760,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct perf_event *event; raw_spin_lock(&ctx->lock); - perf_pmu_disable(ctx->pmu); ctx->is_active = 0; if (likely(!ctx->nr_events)) goto out; @@ -1770,6 +1769,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (!ctx->nr_active) goto out; + perf_pmu_disable(ctx->pmu); if (event_type & EVENT_PINNED) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); @@ -1779,8 +1779,8 @@ static void ctx_sched_out(struct perf_event_context *ctx, list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } -out: perf_pmu_enable(ctx->pmu); +out: raw_spin_unlock(&ctx->lock); } -- cgit v0.10.2 From 9137fb28ac74d05eb66d1d8e6778eaa14e6fed43 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 9 Apr 2011 21:17:41 +0200 Subject: perf: Clean up 'ctx' reference counting Small cleanup to how we refcount in find_get_context(), this also allows us to use put_ctx() to free things instead of using kfree(). Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110409192141.719340481@chello.nl Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index 4d9a1f01..d665ac4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2835,16 +2835,12 @@ retry: unclone_ctx(ctx); ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); - } - - if (!ctx) { + } else { ctx = alloc_perf_context(pmu, task); err = -ENOMEM; if (!ctx) goto errout; - get_ctx(ctx); - err = 0; mutex_lock(&task->perf_event_mutex); /* @@ -2856,14 +2852,14 @@ retry: else if (task->perf_event_ctxp[ctxn]) err = -EAGAIN; else { + get_ctx(ctx); ++ctx->pin_count; rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); } mutex_unlock(&task->perf_event_mutex); if (unlikely(err)) { - put_task_struct(task); - kfree(ctx); + put_ctx(ctx); if (err == -EAGAIN) goto retry; -- cgit v0.10.2 From facc43071cc0d4821c176d7d34570714eb348df9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 9 Apr 2011 21:17:42 +0200 Subject: perf: Optimize event scheduling locking Currently we only hold one ctx->lock at a time, which results in us flipping back and forth between cpuctx->ctx.lock and task_ctx->lock. Avoid this and gain large atomic regions by holding both locks. We nest the task lock inside the cpu lock, since with task scheduling we might have to change task ctx while holding the cpu ctx lock. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110409192141.769881865@chello.nl Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index d665ac4..d243af9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -200,6 +200,22 @@ __get_cpu_context(struct perf_event_context *ctx) return this_cpu_ptr(ctx->pmu->pmu_cpu_context); } +static void perf_ctx_lock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + raw_spin_lock(&cpuctx->ctx.lock); + if (ctx) + raw_spin_lock(&ctx->lock); +} + +static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (ctx) + raw_spin_unlock(&ctx->lock); + raw_spin_unlock(&cpuctx->ctx.lock); +} + #ifdef CONFIG_CGROUP_PERF /* @@ -340,11 +356,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - perf_pmu_disable(cpuctx->ctx.pmu); - /* * perf_cgroup_events says at least one * context on this CPU has cgroup events. @@ -353,6 +366,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode) * events for a context. */ if (cpuctx->ctx.nr_cgroups > 0) { + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(cpuctx->ctx.pmu); if (mode & PERF_CGROUP_SWOUT) { cpu_ctx_sched_out(cpuctx, EVENT_ALL); @@ -372,9 +387,9 @@ void perf_cgroup_switch(struct task_struct *task, int mode) cpuctx->cgrp = perf_cgroup_from_task(task); cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); } + perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } - - perf_pmu_enable(cpuctx->ctx.pmu); } rcu_read_unlock(); @@ -1759,15 +1774,14 @@ static void ctx_sched_out(struct perf_event_context *ctx, { struct perf_event *event; - raw_spin_lock(&ctx->lock); ctx->is_active = 0; if (likely(!ctx->nr_events)) - goto out; + return; + update_context_time(ctx); update_cgrp_time_from_cpuctx(cpuctx); - if (!ctx->nr_active) - goto out; + return; perf_pmu_disable(ctx->pmu); if (event_type & EVENT_PINNED) { @@ -1780,8 +1794,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, group_sched_out(event, cpuctx, ctx); } perf_pmu_enable(ctx->pmu); -out: - raw_spin_unlock(&ctx->lock); } /* @@ -1929,8 +1941,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, rcu_read_unlock(); if (do_switch) { + raw_spin_lock(&ctx->lock); ctx_sched_out(ctx, cpuctx, EVENT_ALL); cpuctx->task_ctx = NULL; + raw_spin_unlock(&ctx->lock); } } @@ -2056,10 +2070,9 @@ ctx_sched_in(struct perf_event_context *ctx, { u64 now; - raw_spin_lock(&ctx->lock); ctx->is_active = 1; if (likely(!ctx->nr_events)) - goto out; + return; now = perf_clock(); ctx->timestamp = now; @@ -2074,9 +2087,6 @@ ctx_sched_in(struct perf_event_context *ctx, /* Then walk through the lower prio flexible groups */ if (event_type & EVENT_FLEXIBLE) ctx_flexible_sched_in(ctx, cpuctx); - -out: - raw_spin_unlock(&ctx->lock); } static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, @@ -2110,6 +2120,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, if (cpuctx->task_ctx == ctx) return; + perf_ctx_lock(cpuctx, ctx); perf_pmu_disable(ctx->pmu); /* * We want to keep the following priority order: @@ -2124,12 +2135,14 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, cpuctx->task_ctx = ctx; + perf_pmu_enable(ctx->pmu); + perf_ctx_unlock(cpuctx, ctx); + /* * Since these rotations are per-cpu, we need to ensure the * cpu-context we got scheduled on is actually rotating. */ perf_pmu_rotate_start(ctx->pmu); - perf_pmu_enable(ctx->pmu); } /* @@ -2269,7 +2282,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) u64 interrupts, now; s64 delta; - raw_spin_lock(&ctx->lock); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; @@ -2301,7 +2313,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) if (delta > 0) perf_adjust_period(event, period, delta); } - raw_spin_unlock(&ctx->lock); } /* @@ -2309,16 +2320,12 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) */ static void rotate_ctx(struct perf_event_context *ctx) { - raw_spin_lock(&ctx->lock); - /* * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code. */ if (!ctx->rotate_disable) list_rotate_left(&ctx->flexible_groups); - - raw_spin_unlock(&ctx->lock); } /* @@ -2345,6 +2352,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) rotate = 1; } + perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); perf_ctx_adjust_freq(&cpuctx->ctx, interval); if (ctx) @@ -2370,6 +2378,7 @@ done: list_del_init(&cpuctx->rotation_list); perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } void perf_event_task_tick(void) @@ -2424,9 +2433,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) * in. */ perf_cgroup_sched_out(current); - task_ctx_sched_out(ctx, EVENT_ALL); raw_spin_lock(&ctx->lock); + task_ctx_sched_out(ctx, EVENT_ALL); list_for_each_entry(event, &ctx->pinned_groups, group_entry) { ret = event_enable_on_exec(event, ctx); @@ -5982,6 +5991,7 @@ free_dev: } static struct lock_class_key cpuctx_mutex; +static struct lock_class_key cpuctx_lock; int perf_pmu_register(struct pmu *pmu, char *name, int type) { @@ -6032,6 +6042,7 @@ skip_type: cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); __perf_event_init_context(&cpuctx->ctx); lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); + lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; cpuctx->jiffies_interval = 1; @@ -6776,7 +6787,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * our context. */ child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); - task_ctx_sched_out(child_ctx, EVENT_ALL); /* * Take the context lock here so that if find_get_context is @@ -6784,6 +6794,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * incremented the context's refcount before we do put_ctx below. */ raw_spin_lock(&child_ctx->lock); + task_ctx_sched_out(child_ctx, EVENT_ALL); child->perf_event_ctxp[ctxn] = NULL; /* * If this context is a clone; unclone it so it can't get -- cgit v0.10.2 From 04dc2dbbfe1c6f81b996d4dab255da75f9efbb4a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 9 Apr 2011 21:17:43 +0200 Subject: perf: Remove task_ctx_sched_in() Make task_ctx_sched_*() imply EVENT_ALL, since anything less will not actually have scheduled the task in/out at all. Since there's no site that schedules all of a task in (due to the interleave with flexible cpuctx) we can remove this function. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110409192141.817893268@chello.nl Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index d243af9..66b3dd8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1979,8 +1979,7 @@ void __perf_event_task_sched_out(struct task_struct *task, perf_cgroup_sched_out(task); } -static void task_ctx_sched_out(struct perf_event_context *ctx, - enum event_type_t event_type) +static void task_ctx_sched_out(struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); @@ -1990,7 +1989,7 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - ctx_sched_out(ctx, cpuctx, event_type); + ctx_sched_out(ctx, cpuctx, EVENT_ALL); cpuctx->task_ctx = NULL; } @@ -2098,19 +2097,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, event_type, task); } -static void task_ctx_sched_in(struct perf_event_context *ctx, - enum event_type_t event_type) -{ - struct perf_cpu_context *cpuctx; - - cpuctx = __get_cpu_context(ctx); - if (cpuctx->task_ctx == ctx) - return; - - ctx_sched_in(ctx, cpuctx, event_type, NULL); - cpuctx->task_ctx = ctx; -} - static void perf_event_context_sched_in(struct perf_event_context *ctx, struct task_struct *task) { @@ -2363,7 +2349,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (ctx) - task_ctx_sched_out(ctx, EVENT_FLEXIBLE); + ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx); if (ctx) @@ -2371,7 +2357,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); if (ctx) - task_ctx_sched_in(ctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, current); done: if (remove) @@ -2435,7 +2421,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) perf_cgroup_sched_out(current); raw_spin_lock(&ctx->lock); - task_ctx_sched_out(ctx, EVENT_ALL); + task_ctx_sched_out(ctx); list_for_each_entry(event, &ctx->pinned_groups, group_entry) { ret = event_enable_on_exec(event, ctx); @@ -6794,7 +6780,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * incremented the context's refcount before we do put_ctx below. */ raw_spin_lock(&child_ctx->lock); - task_ctx_sched_out(child_ctx, EVENT_ALL); + task_ctx_sched_out(child_ctx); child->perf_event_ctxp[ctxn] = NULL; /* * If this context is a clone; unclone it so it can't get -- cgit v0.10.2 From 2c29ef0fef8aaff1f91263fc75c749d659da6972 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 9 Apr 2011 21:17:44 +0200 Subject: perf: Simplify and fix __perf_install_in_context() Currently __perf_install_in_context() will try and schedule in the event irrespective of our event scheduling rules, that is, we try to schedule CPU-pinned, TASK-pinned, CPU-flexible, TASK-flexible, but when creating a new event we simply try and schedule it on top of whatever is already on the PMU, this can lead to errors for pinned events. Therefore, simplify things and simply schedule everything out, add the event to the corresponding context and schedule everything back in. This also nicely handles the case where with __ARCH_WANT_INTERRUPTS_ON_CTXSW the IPI can come right in the middle of schedule, before we managed to call perf_event_task_sched_in(). Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110409192141.870894224@chello.nl Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index 66b3dd8..60b333a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1469,8 +1469,12 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } -static void perf_event_context_sched_in(struct perf_event_context *ctx, - struct task_struct *tsk); +static void task_ctx_sched_out(struct perf_event_context *ctx); +static void +ctx_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type, + struct task_struct *task); /* * Cross CPU call to install and enable a performance event @@ -1481,20 +1485,31 @@ static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; - struct perf_event *leader = event->group_leader; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - int err; + struct perf_event_context *task_ctx = cpuctx->task_ctx; + struct task_struct *task = current; + + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(cpuctx->ctx.pmu); /* - * In case we're installing a new context to an already running task, - * could also happen before perf_event_task_sched_in() on architectures - * which do context switches with IRQs enabled. + * If there was an active task_ctx schedule it out. */ - if (ctx->task && !cpuctx->task_ctx) - perf_event_context_sched_in(ctx, ctx->task); + if (task_ctx) { + task_ctx_sched_out(task_ctx); + /* + * If the context we're installing events in is not the + * active task_ctx, flip them. + */ + if (ctx->task && task_ctx != ctx) { + raw_spin_unlock(&cpuctx->ctx.lock); + raw_spin_lock(&ctx->lock); + cpuctx->task_ctx = task_ctx = ctx; + } + task = task_ctx->task; + } + cpu_ctx_sched_out(cpuctx, EVENT_ALL); - raw_spin_lock(&ctx->lock); - ctx->is_active = 1; update_context_time(ctx); /* * update cgrp time only if current cgrp @@ -1505,43 +1520,18 @@ static int __perf_install_in_context(void *info) add_event_to_ctx(event, ctx); - if (!event_filter_match(event)) - goto unlock; - - /* - * Don't put the event on if it is disabled or if - * it is in a group and the group isn't on. - */ - if (event->state != PERF_EVENT_STATE_INACTIVE || - (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) - goto unlock; - /* - * An exclusive event can't go on if there are already active - * hardware events, and no hardware event can go on if there - * is already an exclusive event on. + * Schedule everything back in */ - if (!group_can_go_on(event, cpuctx, 1)) - err = -EEXIST; - else - err = event_sched_in(event, cpuctx, ctx); - - if (err) { - /* - * This event couldn't go on. If it is in a group - * then we have to pull the whole group off. - * If the event group is pinned then put it in error state. - */ - if (leader != event) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_EVENT_STATE_ERROR; - } - } + cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); + if (task_ctx) + ctx_sched_in(task_ctx, cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + if (task_ctx) + ctx_sched_in(task_ctx, cpuctx, EVENT_FLEXIBLE, task); -unlock: - raw_spin_unlock(&ctx->lock); + perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, task_ctx); return 0; } -- cgit v0.10.2 From db24d33e08b88e990991760a44d72006a5dc6102 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 9 Apr 2011 21:17:45 +0200 Subject: perf: Change and simplify ctx::is_active semantics Instead of tracking if a context is active or not, track which events of the context are active. By making it a bitmask of EVENT_PINNED|EVENT_FLEXIBLE we can simplify some of the scheduling routines since it can avoid adding events that are already active. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110409192141.930282378@chello.nl Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index 60b333a..71c2d44 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1763,8 +1763,9 @@ static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) { struct perf_event *event; + int is_active = ctx->is_active; - ctx->is_active = 0; + ctx->is_active &= ~event_type; if (likely(!ctx->nr_events)) return; @@ -1774,12 +1775,12 @@ static void ctx_sched_out(struct perf_event_context *ctx, return; perf_pmu_disable(ctx->pmu); - if (event_type & EVENT_PINNED) { + if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); } - if (event_type & EVENT_FLEXIBLE) { + if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) { list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } @@ -2058,8 +2059,9 @@ ctx_sched_in(struct perf_event_context *ctx, struct task_struct *task) { u64 now; + int is_active = ctx->is_active; - ctx->is_active = 1; + ctx->is_active |= event_type; if (likely(!ctx->nr_events)) return; @@ -2070,11 +2072,11 @@ ctx_sched_in(struct perf_event_context *ctx, * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - if (event_type & EVENT_PINNED) + if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) ctx_pinned_sched_in(ctx, cpuctx); /* Then walk through the lower prio flexible groups */ - if (event_type & EVENT_FLEXIBLE) + if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) ctx_flexible_sched_in(ctx, cpuctx); } -- cgit v0.10.2 From dce5855bba5df9e87bb04584d505c1f1b103c652 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 9 Apr 2011 21:17:46 +0200 Subject: perf: Collect the schedule-in rules in one function This was scattered out - refactor it into a single function. No change in functionality. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110409192141.979862055@chello.nl Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index 71c2d44..802f3b2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1476,6 +1476,18 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type, struct task_struct *task); +static void perf_event_sched_in(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + struct task_struct *task) +{ + cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); + if (ctx) + ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + if (ctx) + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); +} + /* * Cross CPU call to install and enable a performance event * @@ -1523,12 +1535,7 @@ static int __perf_install_in_context(void *info) /* * Schedule everything back in */ - cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); - if (task_ctx) - ctx_sched_in(task_ctx, cpuctx, EVENT_PINNED, task); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); - if (task_ctx) - ctx_sched_in(task_ctx, cpuctx, EVENT_FLEXIBLE, task); + perf_event_sched_in(cpuctx, task_ctx, task); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, task_ctx); @@ -2107,9 +2114,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); + perf_event_sched_in(cpuctx, ctx, task); cpuctx->task_ctx = ctx; @@ -2347,9 +2352,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) if (ctx) rotate_ctx(ctx); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); - if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, current); + perf_event_sched_in(cpuctx, ctx, current); done: if (remove) -- cgit v0.10.2 From e03a9a55b4e45377af9ca3d464135f9ea280b8f8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 9 Apr 2011 21:17:47 +0200 Subject: perf: Change close() semantics for group events In order to always call list_del_event() on the correct cpu if the event is part of an active context and avoid having to do two IPIs, change the close() semantics slightly. The current perf_event_disable() call would disable a whole group if the event that's being closed is the group leader, whereas the new code keeps the group siblings enabled. People should not rely on this behaviour and I don't think they do, but in case we find they do, the fix is easy and we have to take the double IPI cost. Signed-off-by: Peter Zijlstra Cc: Vince Weaver Link: http://lkml.kernel.org/r/20110409192142.038377551@chello.nl Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index 802f3b2..c378062 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2920,12 +2920,6 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; - /* - * Remove from the PMU, can't get re-enabled since we got - * here because the last ref went. - */ - perf_event_disable(event); - WARN_ON_ONCE(ctx->parent_ctx); /* * There are two ways this annotation is useful: @@ -2942,8 +2936,8 @@ int perf_event_release_kernel(struct perf_event *event) mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); raw_spin_lock_irq(&ctx->lock); perf_group_detach(event); - list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); + perf_remove_from_context(event); mutex_unlock(&ctx->mutex); free_event(event); -- cgit v0.10.2 From 64ce312618ef0e11d88def80effcefd1b59fdb1e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 9 Apr 2011 21:17:48 +0200 Subject: perf: De-schedule a task context when removing the last event Since perf_install_in_context() will now install a context when we add the first event, we can de-schedule the context when the last event is removed. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110409192142.090431763@chello.nl Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index c378062..cc5d57d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1120,6 +1120,10 @@ static int __perf_remove_from_context(void *info) raw_spin_lock(&ctx->lock); event_sched_out(event, cpuctx, ctx); list_del_event(event, ctx); + if (!ctx->nr_events && cpuctx->task_ctx == ctx) { + ctx->is_active = 0; + cpuctx->task_ctx = NULL; + } raw_spin_unlock(&ctx->lock); return 0; -- cgit v0.10.2 From bccdaba044a445eb2c391295587aa47977994777 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 2 Jun 2011 10:39:43 -0300 Subject: perf evlist: Remove dependency on debug routines So far we avoided having to link debug.o in the python binding, keep it that way by not using ui__warning() in evlist.c. Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-4wtew8hd3g7ejnlehtspys2t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 50aa348..04c8a60 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -12,7 +12,6 @@ #include "evlist.h" #include "evsel.h" #include "util.h" -#include "debug.h" #include @@ -257,19 +256,15 @@ int perf_evlist__alloc_mmap(struct perf_evlist *evlist) return evlist->mmap != NULL ? 0 : -ENOMEM; } -static int __perf_evlist__mmap(struct perf_evlist *evlist, struct perf_evsel *evsel, +static int __perf_evlist__mmap(struct perf_evlist *evlist, int idx, int prot, int mask, int fd) { evlist->mmap[idx].prev = 0; evlist->mmap[idx].mask = mask; evlist->mmap[idx].base = mmap(NULL, evlist->mmap_len, prot, MAP_SHARED, fd, 0); - if (evlist->mmap[idx].base == MAP_FAILED) { - if (evlist->cpus->map[idx] == -1 && evsel->attr.inherit) - ui__warning("Inherit is not allowed on per-task " - "events using mmap.\n"); + if (evlist->mmap[idx].base == MAP_FAILED) return -1; - } perf_evlist__add_pollfd(evlist, fd); return 0; @@ -289,7 +284,7 @@ static int perf_evlist__mmap_per_cpu(struct perf_evlist *evlist, int prot, int m if (output == -1) { output = fd; - if (__perf_evlist__mmap(evlist, evsel, cpu, + if (__perf_evlist__mmap(evlist, cpu, prot, mask, output) < 0) goto out_unmap; } else { @@ -329,7 +324,7 @@ static int perf_evlist__mmap_per_thread(struct perf_evlist *evlist, int prot, in if (output == -1) { output = fd; - if (__perf_evlist__mmap(evlist, evsel, thread, + if (__perf_evlist__mmap(evlist, thread, prot, mask, output) < 0) goto out_unmap; } else { -- cgit v0.10.2 From 5c6970af2f4be4e04b06fe78214f6809777a8354 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 2 Jun 2011 10:55:10 -0300 Subject: perf python: Use exception to propagate errors We were using pr_debug to tell the user about not being able to parse a sample where we should really use the python way of reporting errors: exceptions. Fixes this problem: [root@emilia ~]# python >>> import perf Traceback (most recent call last): File "", line 1, in ImportError: /home/acme/git/build/perf/python/perf.so: undefined symbol: eprintf >>> [root@emilia ~] As we want to keep the objects linked in the python binding (and in the future in a shared library) minimal. Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-m9dba9kaluas0kq8r58z191c@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 69436b3..2dd1698 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -694,14 +694,12 @@ static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist, err = perf_event__parse_sample(event, first->attr.sample_type, perf_sample_size(first->attr.sample_type), sample_id_all, &pevent->sample); - if (err) { - pr_err("Can't parse sample, err = %d\n", err); - goto end; - } - + if (err) + return PyErr_Format(PyExc_OSError, + "perf: can't parse sample, err=%d", err); return pyevent; } -end: + Py_INCREF(Py_None); return Py_None; } -- cgit v0.10.2 From c2a70653af45c9cbb0cab900e8931b062e57b1ae Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 2 Jun 2011 11:04:54 -0300 Subject: perf evlist: Don't die if sample_{id_all|type} is invalid Fixes two more cases where the python binding would not load: . Not finding die(), which it shouldn't anyway, not good to just stop the world because some particular perf.data file is invalid, just propagate the error to the caller. . Not finding perf_sample_size: fix it by moving it from event.c to evsel, where it belongs, as most cases are moving to operate on an evsel object.o One of the fixed problems: [root@emilia ~]# python >>> import perf Traceback (most recent call last): File "", line 1, in ImportError: /home/acme/git/build/perf/python/perf.so: undefined symbol: perf_sample_size >>> [root@emilia ~]# Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-1hkj7b2cvgbfnoizsekjb6c9@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index b671862..2da9162 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -474,7 +474,7 @@ static int test__basic_mmap(void) unsigned int nr_events[nsyscalls], expected_nr_events[nsyscalls], i, j; struct perf_evsel *evsels[nsyscalls], *evsel; - int sample_size = perf_sample_size(attr.sample_type); + int sample_size = __perf_evsel__sample_size(attr.sample_type); for (i = 0; i < nsyscalls; ++i) { char name[64]; diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 0fe9adf..3c1b8a6 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -35,22 +35,6 @@ const char *perf_event__name(unsigned int id) return perf_event__names[id]; } -int perf_sample_size(u64 sample_type) -{ - u64 mask = sample_type & PERF_SAMPLE_MASK; - int size = 0; - int i; - - for (i = 0; i < 64; i++) { - if (mask & (1ULL << i)) - size++; - } - - size *= sizeof(u64); - - return size; -} - static struct perf_sample synth_sample = { .pid = -1, .tid = -1, diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index c083328..1d7f664 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -82,8 +82,6 @@ struct perf_sample { struct ip_callchain *callchain; }; -int perf_sample_size(u64 sample_type); - #define BUILD_ID_SIZE 20 struct build_id_event { diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 04c8a60..b021ea9 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -455,33 +455,46 @@ int perf_evlist__set_filters(struct perf_evlist *evlist) return 0; } -u64 perf_evlist__sample_type(struct perf_evlist *evlist) +bool perf_evlist__valid_sample_type(const struct perf_evlist *evlist) { - struct perf_evsel *pos; - u64 type = 0; - - list_for_each_entry(pos, &evlist->entries, node) { - if (!type) - type = pos->attr.sample_type; - else if (type != pos->attr.sample_type) - die("non matching sample_type"); + struct perf_evsel *pos, *first; + + pos = first = list_entry(evlist->entries.next, struct perf_evsel, node); + + list_for_each_entry_continue(pos, &evlist->entries, node) { + if (first->attr.sample_type != pos->attr.sample_type) + return false; } - return type; + return true; } -bool perf_evlist__sample_id_all(const struct perf_evlist *evlist) +u64 perf_evlist__sample_type(const struct perf_evlist *evlist) +{ + struct perf_evsel *first; + + first = list_entry(evlist->entries.next, struct perf_evsel, node); + return first->attr.sample_type; +} + +bool perf_evlist__valid_sample_id_all(const struct perf_evlist *evlist) { - bool value = false, first = true; - struct perf_evsel *pos; - - list_for_each_entry(pos, &evlist->entries, node) { - if (first) { - value = pos->attr.sample_id_all; - first = false; - } else if (value != pos->attr.sample_id_all) - die("non matching sample_id_all"); + struct perf_evsel *pos, *first; + + pos = first = list_entry(evlist->entries.next, struct perf_evsel, node); + + list_for_each_entry_continue(pos, &evlist->entries, node) { + if (first->attr.sample_id_all != pos->attr.sample_id_all) + return false; } - return value; + return true; +} + +bool perf_evlist__sample_id_all(const struct perf_evlist *evlist) +{ + struct perf_evsel *first; + + first = list_entry(evlist->entries.next, struct perf_evsel, node); + return first->attr.sample_id_all; } diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 0a1ef1f..b2b8623 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -66,7 +66,9 @@ int perf_evlist__create_maps(struct perf_evlist *evlist, pid_t target_pid, void perf_evlist__delete_maps(struct perf_evlist *evlist); int perf_evlist__set_filters(struct perf_evlist *evlist); -u64 perf_evlist__sample_type(struct perf_evlist *evlist); -bool perf_evlist__sample_id_all(const struct perf_evlist *evlist); +u64 perf_evlist__sample_type(const struct perf_evlist *evlist); +bool perf_evlist__sample_id_all(const const struct perf_evlist *evlist); +bool perf_evlist__valid_sample_type(const struct perf_evlist *evlist); +bool perf_evlist__valid_sample_id_all(const struct perf_evlist *evlist); #endif /* __PERF_EVLIST_H */ diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index cca29ed..0239eb8 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -15,6 +15,22 @@ #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) +int __perf_evsel__sample_size(u64 sample_type) +{ + u64 mask = sample_type & PERF_SAMPLE_MASK; + int size = 0; + int i; + + for (i = 0; i < 64; i++) { + if (mask & (1ULL << i)) + size++; + } + + size *= sizeof(u64); + + return size; +} + void perf_evsel__init(struct perf_evsel *evsel, struct perf_event_attr *attr, int idx) { diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index f79bb2c..7e9366e 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -149,4 +149,11 @@ static inline int perf_evsel__read_scaled(struct perf_evsel *evsel, return __perf_evsel__read(evsel, ncpus, nthreads, true); } +int __perf_evsel__sample_size(u64 sample_type); + +static inline int perf_evsel__sample_size(struct perf_evsel *evsel) +{ + return __perf_evsel__sample_size(evsel->attr.sample_type); +} + #endif /* __PERF_EVSEL_H */ diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 2dd1698..24063b4 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -692,7 +692,7 @@ static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist, first = list_entry(evlist->entries.next, struct perf_evsel, node); err = perf_event__parse_sample(event, first->attr.sample_type, - perf_sample_size(first->attr.sample_type), + perf_evsel__sample_size(first), sample_id_all, &pevent->sample); if (err) return PyErr_Format(PyExc_OSError, diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 64500fc..f5a8fbd 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -58,6 +58,16 @@ static int perf_session__open(struct perf_session *self, bool force) goto out_close; } + if (!perf_evlist__valid_sample_type(self->evlist)) { + pr_err("non matching sample_type"); + goto out_close; + } + + if (!perf_evlist__valid_sample_id_all(self->evlist)) { + pr_err("non matching sample_id_all"); + goto out_close; + } + self->size = input_stat.st_size; return 0; @@ -97,7 +107,7 @@ out: void perf_session__update_sample_type(struct perf_session *self) { self->sample_type = perf_evlist__sample_type(self->evlist); - self->sample_size = perf_sample_size(self->sample_type); + self->sample_size = __perf_evsel__sample_size(self->sample_type); self->sample_id_all = perf_evlist__sample_id_all(self->evlist); perf_session__id_header_size(self); } -- cgit v0.10.2 From e95cc02880947e9f77540b03e166470e8ac14cbc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 31 Mar 2011 18:27:42 +0200 Subject: perf python: Fix argument name list of read_on_cpu() Mandatory arguments need to be present in the argument name list, as well as optional arguments, otherwise python barfs: # ./python/twatch.py Traceback (most recent call last): File "./python/twatch.py", line 41, in main() File "./python/twatch.py", line 32, in main event = evlist.read_on_cpu(cpu) RuntimeError: more argument specifiers than keyword list entries Hence, add cpu to the name list. Cc: David Ahern Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi Link: http://lkml.kernel.org/r/1301588863-20210-1-git-send-email-fweisbec@gmail.com Signed-off-by: Frederic Weisbecker Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 24063b4..a9ac050 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -674,7 +674,7 @@ static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist, struct perf_evlist *evlist = &pevlist->evlist; union perf_event *event; int sample_id_all = 1, cpu; - static char *kwlist[] = {"sample_id_all", NULL, NULL}; + static char *kwlist[] = {"cpu", "sample_id_all", NULL, NULL}; int err; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|i", kwlist, -- cgit v0.10.2 From 64348153c63b8c1f99f19f14a9c3cbd5df70c9d3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 31 Mar 2011 18:27:43 +0200 Subject: perf python: Cleanup useless double NULL termination in method arg names The list of methods argument names only needs to be NULL terminated once. Remove the second ones. Cc: David Ahern Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi Link: http://lkml.kernel.org/r/1301588863-20210-2-git-send-email-fweisbec@gmail.com Signed-off-by: Frederic Weisbecker Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index a9ac050..8e0b5a3 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -247,7 +247,7 @@ struct pyrf_cpu_map { static int pyrf_cpu_map__init(struct pyrf_cpu_map *pcpus, PyObject *args, PyObject *kwargs) { - static char *kwlist[] = { "cpustr", NULL, NULL, }; + static char *kwlist[] = { "cpustr", NULL }; char *cpustr = NULL; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|s", @@ -316,7 +316,7 @@ struct pyrf_thread_map { static int pyrf_thread_map__init(struct pyrf_thread_map *pthreads, PyObject *args, PyObject *kwargs) { - static char *kwlist[] = { "pid", "tid", NULL, NULL, }; + static char *kwlist[] = { "pid", "tid", NULL }; int pid = -1, tid = -1; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ii", @@ -418,7 +418,9 @@ static int pyrf_evsel__init(struct pyrf_evsel *pevsel, "wakeup_events", "bp_type", "bp_addr", - "bp_len", NULL, NULL, }; + "bp_len", + NULL + }; u64 sample_period = 0; u32 disabled = 0, inherit = 0, @@ -499,7 +501,7 @@ static PyObject *pyrf_evsel__open(struct pyrf_evsel *pevsel, struct thread_map *threads = NULL; PyObject *pcpus = NULL, *pthreads = NULL; int group = 0, inherit = 0; - static char *kwlist[] = {"cpus", "threads", "group", "inherit", NULL, NULL}; + static char *kwlist[] = { "cpus", "threads", "group", "inherit", NULL }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|OOii", kwlist, &pcpus, &pthreads, &group, &inherit)) @@ -582,8 +584,7 @@ static PyObject *pyrf_evlist__mmap(struct pyrf_evlist *pevlist, PyObject *args, PyObject *kwargs) { struct perf_evlist *evlist = &pevlist->evlist; - static char *kwlist[] = {"pages", "overwrite", - NULL, NULL}; + static char *kwlist[] = { "pages", "overwrite", NULL }; int pages = 128, overwrite = false; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ii", kwlist, @@ -603,7 +604,7 @@ static PyObject *pyrf_evlist__poll(struct pyrf_evlist *pevlist, PyObject *args, PyObject *kwargs) { struct perf_evlist *evlist = &pevlist->evlist; - static char *kwlist[] = {"timeout", NULL, NULL}; + static char *kwlist[] = { "timeout", NULL }; int timeout = -1, n; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|i", kwlist, &timeout)) @@ -674,7 +675,7 @@ static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist, struct perf_evlist *evlist = &pevlist->evlist; union perf_event *event; int sample_id_all = 1, cpu; - static char *kwlist[] = {"cpu", "sample_id_all", NULL, NULL}; + static char *kwlist[] = { "cpu", "sample_id_all", NULL }; int err; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|i", kwlist, -- cgit v0.10.2 From 2cee77c4505fc581f41b44e18ffc0953b67a414c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 30 May 2011 08:55:59 -0600 Subject: perf stat: clarify unsupported events from uncounted events perf stat continues running even if the event list contains counters that are not supported. The resulting output then contains for those events which gets confusing as to which events are supported, but not counted and which are not supported. Before: perf stat -ddd -- sleep 1 Performance counter stats for 'sleep 1': 0.571283 task-clock # 0.001 CPUs utilized 1 context-switches # 0.002 M/sec 0 CPU-migrations # 0.000 M/sec 157 page-faults # 0.275 M/sec 1,037,707 cycles # 1.816 GHz stalled-cycles-frontend stalled-cycles-backend 654,499 instructions # 0.63 insns per cycle 136,129 branches # 238.286 M/sec branch-misses L1-dcache-loads L1-dcache-load-misses LLC-loads LLC-load-misses L1-icache-loads L1-icache-load-misses dTLB-loads dTLB-load-misses iTLB-loads iTLB-load-misses L1-dcache-prefetches L1-dcache-prefetch-misses 1.001004836 seconds time elapsed After: perf stat -ddd -- sleep 1 Performance counter stats for 'sleep 1': 1.350326 task-clock # 0.001 CPUs utilized 2 context-switches # 0.001 M/sec 0 CPU-migrations # 0.000 M/sec 157 page-faults # 0.116 M/sec 11,986 cycles # 0.009 GHz stalled-cycles-frontend stalled-cycles-backend 496,986 instructions # 41.46 insns per cycle 138,065 branches # 102.246 M/sec 7,245 branch-misses # 5.25% of all branches L1-dcache-loads L1-dcache-load-misses LLC-loads LLC-load-misses L1-icache-loads L1-icache-load-misses dTLB-loads dTLB-load-misses iTLB-loads iTLB-load-misses L1-dcache-prefetches L1-dcache-prefetch-misses 1.002397333 seconds time elapsed v1->v2: changed supported type from int to bool v2->v3 fixed vertical alignment of new struct element Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1306767359-13221-1-git-send-email-dsahern@gmail.com Signed-off-by: David Ahern Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a9f0671..784ed6d 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -61,6 +61,8 @@ #include #define DEFAULT_SEPARATOR " " +#define CNTR_NOT_SUPPORTED "" +#define CNTR_NOT_COUNTED "" static struct perf_event_attr default_attrs[] = { @@ -448,6 +450,7 @@ static int run_perf_stat(int argc __used, const char **argv) if (verbose) ui__warning("%s event is not supported by the kernel.\n", event_name(counter)); + counter->supported = false; continue; } @@ -466,6 +469,7 @@ static int run_perf_stat(int argc __used, const char **argv) die("Not all events could be opened.\n"); return -1; } + counter->supported = true; } if (perf_evlist__set_filters(evsel_list)) { @@ -861,7 +865,7 @@ static void print_counter_aggr(struct perf_evsel *counter) if (scaled == -1) { fprintf(stderr, "%*s%s%*s", csv_output ? 0 : 18, - "", + counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, csv_sep, csv_output ? 0 : -24, event_name(counter)); @@ -914,7 +918,8 @@ static void print_counter(struct perf_evsel *counter) csv_output ? 0 : -4, evsel_list->cpus->map[cpu], csv_sep, csv_output ? 0 : 18, - "", csv_sep, + counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, + csv_sep, csv_output ? 0 : -24, event_name(counter)); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 7e9366e..e9a3155 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -61,6 +61,7 @@ struct perf_evsel { off_t id_offset; }; struct cgroup_sel *cgrp; + bool supported; }; struct cpu_map; -- cgit v0.10.2 From 787bef174f055343c69a9639e6e05a564980ed4c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 27 May 2011 14:28:43 -0600 Subject: perf script: "sym" field really means show IP data Currently the "sym" output field is used to dump instruction pointers and callchain stack. Sample addresses can also be converted to symbols, so the meaning of "sym" needs to be fixed. This patch adds an "ip" option and if it is selected the user can also opt to dump symbols for them. If the user opts to dump IP without syms only the address is shown. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1306528124-25861-2-git-send-email-dsahern@gmail.com Signed-off-by: David Ahern Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 86c87e2..67a4e5c 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -115,10 +115,10 @@ OPTIONS -f:: --fields:: Comma separated list of fields to print. Options are: - comm, tid, pid, time, cpu, event, trace, sym. Field + comm, tid, pid, time, cpu, event, trace, ip, sym. Field list can be prepended with the type, trace, sw or hw, to indicate to which event type the field list applies. - e.g., -f sw:comm,tid,time,sym and -f trace:time,cpu,trace + e.g., -f sw:comm,tid,time,ip,sym and -f trace:time,cpu,trace perf script -f @@ -132,17 +132,17 @@ OPTIONS The arguments are processed in the order received. A later usage can reset a prior request. e.g.: - -f trace: -f comm,tid,time,sym + -f trace: -f comm,tid,time,ip,sym The first -f suppresses trace events (field list is ""), but then the - second invocation sets the fields to comm,tid,time,sym. In this case a + second invocation sets the fields to comm,tid,time,ip,sym. In this case a warning is given to the user: "Overriding previous field request for all events." Alternativey, consider the order: - -f comm,tid,time,sym -f trace: + -f comm,tid,time,ip,sym -f trace: The first -f sets the fields for all events and the second -f suppresses trace events. The user is given a warning message about diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 22747de..0852db2 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -30,7 +30,8 @@ enum perf_output_field { PERF_OUTPUT_CPU = 1U << 4, PERF_OUTPUT_EVNAME = 1U << 5, PERF_OUTPUT_TRACE = 1U << 6, - PERF_OUTPUT_SYM = 1U << 7, + PERF_OUTPUT_IP = 1U << 7, + PERF_OUTPUT_SYM = 1U << 8, }; struct output_option { @@ -44,6 +45,7 @@ struct output_option { {.str = "cpu", .field = PERF_OUTPUT_CPU}, {.str = "event", .field = PERF_OUTPUT_EVNAME}, {.str = "trace", .field = PERF_OUTPUT_TRACE}, + {.str = "ip", .field = PERF_OUTPUT_IP}, {.str = "sym", .field = PERF_OUTPUT_SYM}, }; @@ -60,7 +62,8 @@ static struct { .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | - PERF_OUTPUT_EVNAME | PERF_OUTPUT_SYM, + PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP | + PERF_OUTPUT_SYM, .invalid_fields = PERF_OUTPUT_TRACE, }, @@ -70,7 +73,8 @@ static struct { .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | - PERF_OUTPUT_EVNAME | PERF_OUTPUT_SYM, + PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP | + PERF_OUTPUT_SYM, .invalid_fields = PERF_OUTPUT_TRACE, }, @@ -88,7 +92,8 @@ static struct { .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | - PERF_OUTPUT_EVNAME | PERF_OUTPUT_SYM, + PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP | + PERF_OUTPUT_SYM, .invalid_fields = PERF_OUTPUT_TRACE, }, @@ -157,15 +162,20 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel, !perf_session__has_traces(session, "record -R")) return -EINVAL; - if (PRINT_FIELD(SYM)) { + if (PRINT_FIELD(IP)) { if (perf_event_attr__check_stype(attr, PERF_SAMPLE_IP, "IP", - PERF_OUTPUT_SYM)) + PERF_OUTPUT_IP)) return -EINVAL; if (!no_callchain && !(attr->sample_type & PERF_SAMPLE_CALLCHAIN)) symbol_conf.use_callchain = false; } + if (PRINT_FIELD(SYM) && !PRINT_FIELD(IP)) { + pr_err("Display of symbols requested but IP is not selected.\n" + "No addresses to convert to symbols.\n"); + return -EINVAL; + } if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) && perf_event_attr__check_stype(attr, PERF_SAMPLE_TID, "TID", @@ -230,7 +240,7 @@ static void print_sample_start(struct perf_sample *sample, if (PRINT_FIELD(COMM)) { if (latency_format) printf("%8.8s ", thread->comm); - else if (PRINT_FIELD(SYM) && symbol_conf.use_callchain) + else if (PRINT_FIELD(IP) && symbol_conf.use_callchain) printf("%s ", thread->comm); else printf("%16s ", thread->comm); @@ -288,12 +298,13 @@ static void process_event(union perf_event *event __unused, print_trace_event(sample->cpu, sample->raw_data, sample->raw_size); - if (PRINT_FIELD(SYM)) { + if (PRINT_FIELD(IP)) { if (!symbol_conf.use_callchain) printf(" "); else printf("\n"); - perf_session__print_symbols(event, sample, session); + perf_session__print_ip(event, sample, session, + PRINT_FIELD(SYM)); } printf("\n"); @@ -985,7 +996,7 @@ static const struct option options[] = { OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", "Look for files with symbols relative to this directory"), OPT_CALLBACK('f', "fields", NULL, "str", - "comma separated output fields prepend with 'type:'. Valid types: hw,sw,trace,raw. Fields: comm,tid,pid,time,cpu,event,trace,sym", + "comma separated output fields prepend with 'type:'. Valid types: hw,sw,trace,raw. Fields: comm,tid,pid,time,cpu,event,trace,ip,sym", parse_output_fields), OPT_END() diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index f5a8fbd..ad33650 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1202,9 +1202,10 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, return NULL; } -void perf_session__print_symbols(union perf_event *event, - struct perf_sample *sample, - struct perf_session *session) +void perf_session__print_ip(union perf_event *event, + struct perf_sample *sample, + struct perf_session *session, + int print_sym) { struct addr_location al; const char *symname, *dsoname; @@ -1233,32 +1234,39 @@ void perf_session__print_symbols(union perf_event *event, if (!node) break; - if (node->sym && node->sym->name) - symname = node->sym->name; - else - symname = ""; + printf("\t%16" PRIx64, node->ip); + if (print_sym) { + if (node->sym && node->sym->name) + symname = node->sym->name; + else + symname = ""; - if (node->map && node->map->dso && node->map->dso->name) - dsoname = node->map->dso->name; - else - dsoname = ""; + if (node->map && node->map->dso && node->map->dso->name) + dsoname = node->map->dso->name; + else + dsoname = ""; - printf("\t%16" PRIx64 " %s (%s)\n", node->ip, symname, dsoname); + printf(" %s (%s)", symname, dsoname); + } + printf("\n"); callchain_cursor_advance(cursor); } } else { - if (al.sym && al.sym->name) - symname = al.sym->name; - else - symname = ""; + printf("%16" PRIx64, al.addr); + if (print_sym) { + if (al.sym && al.sym->name) + symname = al.sym->name; + else + symname = ""; - if (al.map && al.map->dso && al.map->dso->name) - dsoname = al.map->dso->name; - else - dsoname = ""; + if (al.map && al.map->dso && al.map->dso->name) + dsoname = al.map->dso->name; + else + dsoname = ""; - printf("%16" PRIx64 " %s (%s)", al.addr, symname, dsoname); + printf(" %s (%s)", symname, dsoname); + } } } diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 66d4e14..d76af0f 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -167,8 +167,9 @@ static inline int perf_session__parse_sample(struct perf_session *session, struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, unsigned int type); -void perf_session__print_symbols(union perf_event *event, +void perf_session__print_ip(union perf_event *event, struct perf_sample *sample, - struct perf_session *session); + struct perf_session *session, + int print_sym); #endif /* __PERF_SESSION_H */ -- cgit v0.10.2 From 610723f24eeb842025178a6722fa9108c4e157b6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 27 May 2011 14:28:44 -0600 Subject: perf script: Make printing of dso a separate field option The 'sym' option displays both the function name and the DSO it comes from. Split the display of the dso into a separate option. This allows display of the ip address and symbol without the dso, thus shortening line lengths - and decluttering the output a bit. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1306528124-25861-3-git-send-email-dsahern@gmail.com Signed-off-by: David Ahern Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 67a4e5c..1e744c2 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -115,7 +115,7 @@ OPTIONS -f:: --fields:: Comma separated list of fields to print. Options are: - comm, tid, pid, time, cpu, event, trace, ip, sym. Field + comm, tid, pid, time, cpu, event, trace, ip, sym, dso. Field list can be prepended with the type, trace, sw or hw, to indicate to which event type the field list applies. e.g., -f sw:comm,tid,time,ip,sym and -f trace:time,cpu,trace diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 0852db2..a8bd00f 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -32,6 +32,7 @@ enum perf_output_field { PERF_OUTPUT_TRACE = 1U << 6, PERF_OUTPUT_IP = 1U << 7, PERF_OUTPUT_SYM = 1U << 8, + PERF_OUTPUT_DSO = 1U << 9, }; struct output_option { @@ -47,6 +48,7 @@ struct output_option { {.str = "trace", .field = PERF_OUTPUT_TRACE}, {.str = "ip", .field = PERF_OUTPUT_IP}, {.str = "sym", .field = PERF_OUTPUT_SYM}, + {.str = "dso", .field = PERF_OUTPUT_DSO}, }; /* default set to maintain compatibility with current format */ @@ -63,7 +65,7 @@ static struct { .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP | - PERF_OUTPUT_SYM, + PERF_OUTPUT_SYM | PERF_OUTPUT_DSO, .invalid_fields = PERF_OUTPUT_TRACE, }, @@ -74,7 +76,7 @@ static struct { .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP | - PERF_OUTPUT_SYM, + PERF_OUTPUT_SYM | PERF_OUTPUT_DSO, .invalid_fields = PERF_OUTPUT_TRACE, }, @@ -93,7 +95,7 @@ static struct { .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP | - PERF_OUTPUT_SYM, + PERF_OUTPUT_SYM | PERF_OUTPUT_DSO, .invalid_fields = PERF_OUTPUT_TRACE, }, @@ -176,6 +178,11 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel, "No addresses to convert to symbols.\n"); return -EINVAL; } + if (PRINT_FIELD(DSO) && !PRINT_FIELD(IP)) { + pr_err("Display of DSO requested but IP is not selected.\n" + "No addresses to convert to dso.\n"); + return -EINVAL; + } if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) && perf_event_attr__check_stype(attr, PERF_SAMPLE_TID, "TID", @@ -304,7 +311,7 @@ static void process_event(union perf_event *event __unused, else printf("\n"); perf_session__print_ip(event, sample, session, - PRINT_FIELD(SYM)); + PRINT_FIELD(SYM), PRINT_FIELD(DSO)); } printf("\n"); @@ -996,7 +1003,7 @@ static const struct option options[] = { OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", "Look for files with symbols relative to this directory"), OPT_CALLBACK('f', "fields", NULL, "str", - "comma separated output fields prepend with 'type:'. Valid types: hw,sw,trace,raw. Fields: comm,tid,pid,time,cpu,event,trace,ip,sym", + "comma separated output fields prepend with 'type:'. Valid types: hw,sw,trace,raw. Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso", parse_output_fields), OPT_END() diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index ad33650..0dd4182 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1205,7 +1205,7 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, void perf_session__print_ip(union perf_event *event, struct perf_sample *sample, struct perf_session *session, - int print_sym) + int print_sym, int print_dso) { struct addr_location al; const char *symname, *dsoname; @@ -1241,12 +1241,15 @@ void perf_session__print_ip(union perf_event *event, else symname = ""; + printf(" %s", symname); + } + if (print_dso) { if (node->map && node->map->dso && node->map->dso->name) dsoname = node->map->dso->name; else dsoname = ""; - printf(" %s (%s)", symname, dsoname); + printf(" (%s)", dsoname); } printf("\n"); @@ -1261,12 +1264,16 @@ void perf_session__print_ip(union perf_event *event, else symname = ""; + printf(" %s", symname); + } + + if (print_dso) { if (al.map && al.map->dso && al.map->dso->name) dsoname = al.map->dso->name; else dsoname = ""; - printf(" %s (%s)", symname, dsoname); + printf(" (%s)", dsoname); } } } diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index d76af0f..de4178d 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -170,6 +170,6 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, void perf_session__print_ip(union perf_event *event, struct perf_sample *sample, struct perf_session *session, - int print_sym); + int print_sym, int print_dso); #endif /* __PERF_SESSION_H */ -- cgit v0.10.2 From 7cec0922389e080d11ec43dd23aa778e136bd1e1 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 30 May 2011 13:08:23 -0600 Subject: perf script: Add printing of sample address Resolve to a function or variable if possible and if the sym option is enabled. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1306782503-22002-1-git-send-email-dsahern@gmail.com Signed-off-by: David Ahern Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 1e744c2..c6068cb 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -115,8 +115,8 @@ OPTIONS -f:: --fields:: Comma separated list of fields to print. Options are: - comm, tid, pid, time, cpu, event, trace, ip, sym, dso. Field - list can be prepended with the type, trace, sw or hw, + comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr. + Field list can be prepended with the type, trace, sw or hw, to indicate to which event type the field list applies. e.g., -f sw:comm,tid,time,ip,sym and -f trace:time,cpu,trace diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index a8bd00f..3056b45 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -33,6 +33,7 @@ enum perf_output_field { PERF_OUTPUT_IP = 1U << 7, PERF_OUTPUT_SYM = 1U << 8, PERF_OUTPUT_DSO = 1U << 9, + PERF_OUTPUT_ADDR = 1U << 10, }; struct output_option { @@ -49,6 +50,7 @@ struct output_option { {.str = "ip", .field = PERF_OUTPUT_IP}, {.str = "sym", .field = PERF_OUTPUT_SYM}, {.str = "dso", .field = PERF_OUTPUT_DSO}, + {.str = "addr", .field = PERF_OUTPUT_ADDR}, }; /* default set to maintain compatibility with current format */ @@ -173,14 +175,22 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel, !(attr->sample_type & PERF_SAMPLE_CALLCHAIN)) symbol_conf.use_callchain = false; } - if (PRINT_FIELD(SYM) && !PRINT_FIELD(IP)) { - pr_err("Display of symbols requested but IP is not selected.\n" - "No addresses to convert to symbols.\n"); + + if (PRINT_FIELD(ADDR) && + perf_event_attr__check_stype(attr, PERF_SAMPLE_ADDR, "ADDR", + PERF_OUTPUT_ADDR)) + return -EINVAL; + + if (PRINT_FIELD(SYM) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR)) { + pr_err("Display of symbols requested but neither sample IP nor " + "sample address\nis selected. Hence, no addresses to convert " + "to symbols.\n"); return -EINVAL; } - if (PRINT_FIELD(DSO) && !PRINT_FIELD(IP)) { - pr_err("Display of DSO requested but IP is not selected.\n" - "No addresses to convert to dso.\n"); + if (PRINT_FIELD(DSO) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR)) { + pr_err("Display of DSO requested but neither sample IP nor " + "sample address\nis selected. Hence, no addresses to convert " + "to DSO.\n"); return -EINVAL; } @@ -288,6 +298,63 @@ static void print_sample_start(struct perf_sample *sample, } } +static bool sample_addr_correlates_sym(struct perf_event_attr *attr) +{ + if ((attr->type == PERF_TYPE_SOFTWARE) && + ((attr->config == PERF_COUNT_SW_PAGE_FAULTS) || + (attr->config == PERF_COUNT_SW_PAGE_FAULTS_MIN) || + (attr->config == PERF_COUNT_SW_PAGE_FAULTS_MAJ))) + return true; + + return false; +} + +static void print_sample_addr(union perf_event *event, + struct perf_sample *sample, + struct perf_session *session, + struct thread *thread, + struct perf_event_attr *attr) +{ + struct addr_location al; + u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; + const char *symname, *dsoname; + + printf("%16" PRIx64, sample->addr); + + if (!sample_addr_correlates_sym(attr)) + return; + + thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION, + event->ip.pid, sample->addr, &al); + if (!al.map) + thread__find_addr_map(thread, session, cpumode, MAP__VARIABLE, + event->ip.pid, sample->addr, &al); + + al.cpu = sample->cpu; + al.sym = NULL; + + if (al.map) + al.sym = map__find_symbol(al.map, al.addr, NULL); + + if (PRINT_FIELD(SYM)) { + if (al.sym && al.sym->name) + symname = al.sym->name; + else + symname = ""; + + printf(" %16s", symname); + } + + if (PRINT_FIELD(DSO)) { + if (al.map && al.map->dso && al.map->dso->name) + dsoname = al.map->dso->name; + else + dsoname = ""; + + printf(" (%s)", dsoname); + } +} + static void process_event(union perf_event *event __unused, struct perf_sample *sample, struct perf_evsel *evsel, @@ -305,6 +372,9 @@ static void process_event(union perf_event *event __unused, print_trace_event(sample->cpu, sample->raw_data, sample->raw_size); + if (PRINT_FIELD(ADDR)) + print_sample_addr(event, sample, session, thread, attr); + if (PRINT_FIELD(IP)) { if (!symbol_conf.use_callchain) printf(" "); @@ -1003,7 +1073,7 @@ static const struct option options[] = { OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", "Look for files with symbols relative to this directory"), OPT_CALLBACK('f', "fields", NULL, "str", - "comma separated output fields prepend with 'type:'. Valid types: hw,sw,trace,raw. Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso", + "comma separated output fields prepend with 'type:'. Valid types: hw,sw,trace,raw. Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,addr", parse_output_fields), OPT_END() diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 0239eb8..a03a36b 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -377,6 +377,7 @@ int perf_event__parse_sample(const union perf_event *event, u64 type, array++; } + data->addr = 0; if (type & PERF_SAMPLE_ADDR) { data->addr = *array; array++; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 0dd4182..b723f21 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -708,9 +708,9 @@ static void dump_sample(struct perf_session *session, union perf_event *event, if (!dump_trace) return; - printf("(IP, %d): %d/%d: %#" PRIx64 " period: %" PRIu64 "\n", + printf("(IP, %d): %d/%d: %#" PRIx64 " period: %" PRIu64 " addr: %#" PRIx64 "\n", event->header.misc, sample->pid, sample->tid, sample->ip, - sample->period); + sample->period, sample->addr); if (session->sample_type & PERF_SAMPLE_CALLCHAIN) callchain__printf(sample); -- cgit v0.10.2 From b58f6b0dd3d677338b9065388cc2cc942b86338e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 7 Jun 2011 00:23:28 +0200 Subject: perf, core: Fix initial task_ctx/event installation A lost Quilt refresh of 2c29ef0fef8 (perf: Simplify and fix __perf_install_in_context()) is causing grief and lockups, reported by Jiri Olsa. When installing an event in a task context, there's a number of issues: - there might not be an existing task context, in which case we should install the now current context; - there might already be a context, not the current one, in which case we should de-schedule the old and install the new; these cases were dealt with in the lost refresh, however there is one further case that was found in testing: - there might already be a context, the current one, in which case we should still de-schedule, and should take care to re-install it (note that task_ctx_sched_out() clears cpuctx->task_ctx). Reported-by: Jiri Olsa Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1307399008.2497.971.camel@laptop Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index ba89f40..5e8c7b1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1505,25 +1505,31 @@ static int __perf_install_in_context(void *info) struct perf_event_context *task_ctx = cpuctx->task_ctx; struct task_struct *task = current; - perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_ctx_lock(cpuctx, task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); /* * If there was an active task_ctx schedule it out. */ - if (task_ctx) { + if (task_ctx) task_ctx_sched_out(task_ctx); - /* - * If the context we're installing events in is not the - * active task_ctx, flip them. - */ - if (ctx->task && task_ctx != ctx) { - raw_spin_unlock(&cpuctx->ctx.lock); - raw_spin_lock(&ctx->lock); - cpuctx->task_ctx = task_ctx = ctx; - } + + /* + * If the context we're installing events in is not the + * active task_ctx, flip them. + */ + if (ctx->task && task_ctx != ctx) { + if (task_ctx) + raw_spin_unlock(&task_ctx->lock); + raw_spin_lock(&ctx->lock); + task_ctx = ctx; + } + + if (task_ctx) { + cpuctx->task_ctx = task_ctx; task = task_ctx->task; } + cpu_ctx_sched_out(cpuctx, EVENT_ALL); update_context_time(ctx); -- cgit v0.10.2 From 76369139ceb955deefc509e6e12ce9d6ce50ccab Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 19 May 2011 19:55:04 +0200 Subject: perf: Split up buffer handling from core code And create the internal perf events header. v2: Keep an internal inlined perf_output_copy() Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Borislav Petkov Cc: Stephane Eranian Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1305827704-5607-1-git-send-email-fweisbec@gmail.com [ v3: use clearer 'ring_buffer' and 'rb' naming ] Signed-off-by: Ingo Molnar diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3412684..779f6ed 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -680,33 +680,6 @@ enum perf_event_active_state { }; struct file; - -#define PERF_BUFFER_WRITABLE 0x01 - -struct perf_buffer { - atomic_t refcount; - struct rcu_head rcu_head; -#ifdef CONFIG_PERF_USE_VMALLOC - struct work_struct work; - int page_order; /* allocation order */ -#endif - int nr_pages; /* nr of data pages */ - int writable; /* are we writable */ - - atomic_t poll; /* POLL_ for wakeups */ - - local_t head; /* write position */ - local_t nest; /* nested writers */ - local_t events; /* event limit */ - local_t wakeup; /* wakeup stamp */ - local_t lost; /* nr records lost */ - - long watermark; /* wakeup watermark */ - - struct perf_event_mmap_page *user_page; - void *data_pages[0]; -}; - struct perf_sample_data; typedef void (*perf_overflow_handler_t)(struct perf_event *, int, @@ -745,6 +718,8 @@ struct perf_cgroup { }; #endif +struct ring_buffer; + /** * struct perf_event - performance event kernel representation: */ @@ -834,7 +809,7 @@ struct perf_event { atomic_t mmap_count; int mmap_locked; struct user_struct *mmap_user; - struct perf_buffer *buffer; + struct ring_buffer *rb; /* poll related */ wait_queue_head_t waitq; @@ -945,7 +920,7 @@ struct perf_cpu_context { struct perf_output_handle { struct perf_event *event; - struct perf_buffer *buffer; + struct ring_buffer *rb; unsigned long wakeup; unsigned long size; void *addr; diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 1ce23d3..89e5e8a 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_core.o = -pg endif -obj-y := core.o +obj-y := core.o ring_buffer.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/kernel/events/core.c b/kernel/events/core.c index 5e8c7b1..5e70f62 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -36,6 +36,8 @@ #include #include +#include "internal.h" + #include struct remote_function_call { @@ -2886,7 +2888,7 @@ static void free_event_rcu(struct rcu_head *head) kfree(event); } -static void perf_buffer_put(struct perf_buffer *buffer); +static void ring_buffer_put(struct ring_buffer *rb); static void free_event(struct perf_event *event) { @@ -2909,9 +2911,9 @@ static void free_event(struct perf_event *event) } } - if (event->buffer) { - perf_buffer_put(event->buffer); - event->buffer = NULL; + if (event->rb) { + ring_buffer_put(event->rb); + event->rb = NULL; } if (is_cgroup_event(event)) @@ -3139,13 +3141,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; - struct perf_buffer *buffer; + struct ring_buffer *rb; unsigned int events = POLL_HUP; rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (buffer) - events = atomic_xchg(&buffer->poll, 0); + rb = rcu_dereference(event->rb); + if (rb) + events = atomic_xchg(&rb->poll, 0); rcu_read_unlock(); poll_wait(file, &event->waitq, wait); @@ -3356,14 +3358,14 @@ static int perf_event_index(struct perf_event *event) void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; - struct perf_buffer *buffer; + struct ring_buffer *rb; rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (!buffer) + rb = rcu_dereference(event->rb); + if (!rb) goto unlock; - userpg = buffer->user_page; + userpg = rb->user_page; /* * Disable preemption so as to not let the corresponding user-space @@ -3390,220 +3392,10 @@ unlock: rcu_read_unlock(); } -static unsigned long perf_data_size(struct perf_buffer *buffer); - -static void -perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) -{ - long max_size = perf_data_size(buffer); - - if (watermark) - buffer->watermark = min(max_size, watermark); - - if (!buffer->watermark) - buffer->watermark = max_size / 2; - - if (flags & PERF_BUFFER_WRITABLE) - buffer->writable = 1; - - atomic_set(&buffer->refcount, 1); -} - -#ifndef CONFIG_PERF_USE_VMALLOC - -/* - * Back perf_mmap() with regular GFP_KERNEL-0 pages. - */ - -static struct page * -perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) -{ - if (pgoff > buffer->nr_pages) - return NULL; - - if (pgoff == 0) - return virt_to_page(buffer->user_page); - - return virt_to_page(buffer->data_pages[pgoff - 1]); -} - -static void *perf_mmap_alloc_page(int cpu) -{ - struct page *page; - int node; - - node = (cpu == -1) ? cpu : cpu_to_node(cpu); - page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); - if (!page) - return NULL; - - return page_address(page); -} - -static struct perf_buffer * -perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) -{ - struct perf_buffer *buffer; - unsigned long size; - int i; - - size = sizeof(struct perf_buffer); - size += nr_pages * sizeof(void *); - - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - goto fail; - - buffer->user_page = perf_mmap_alloc_page(cpu); - if (!buffer->user_page) - goto fail_user_page; - - for (i = 0; i < nr_pages; i++) { - buffer->data_pages[i] = perf_mmap_alloc_page(cpu); - if (!buffer->data_pages[i]) - goto fail_data_pages; - } - - buffer->nr_pages = nr_pages; - - perf_buffer_init(buffer, watermark, flags); - - return buffer; - -fail_data_pages: - for (i--; i >= 0; i--) - free_page((unsigned long)buffer->data_pages[i]); - - free_page((unsigned long)buffer->user_page); - -fail_user_page: - kfree(buffer); - -fail: - return NULL; -} - -static void perf_mmap_free_page(unsigned long addr) -{ - struct page *page = virt_to_page((void *)addr); - - page->mapping = NULL; - __free_page(page); -} - -static void perf_buffer_free(struct perf_buffer *buffer) -{ - int i; - - perf_mmap_free_page((unsigned long)buffer->user_page); - for (i = 0; i < buffer->nr_pages; i++) - perf_mmap_free_page((unsigned long)buffer->data_pages[i]); - kfree(buffer); -} - -static inline int page_order(struct perf_buffer *buffer) -{ - return 0; -} - -#else - -/* - * Back perf_mmap() with vmalloc memory. - * - * Required for architectures that have d-cache aliasing issues. - */ - -static inline int page_order(struct perf_buffer *buffer) -{ - return buffer->page_order; -} - -static struct page * -perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) -{ - if (pgoff > (1UL << page_order(buffer))) - return NULL; - - return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); -} - -static void perf_mmap_unmark_page(void *addr) -{ - struct page *page = vmalloc_to_page(addr); - - page->mapping = NULL; -} - -static void perf_buffer_free_work(struct work_struct *work) -{ - struct perf_buffer *buffer; - void *base; - int i, nr; - - buffer = container_of(work, struct perf_buffer, work); - nr = 1 << page_order(buffer); - - base = buffer->user_page; - for (i = 0; i < nr + 1; i++) - perf_mmap_unmark_page(base + (i * PAGE_SIZE)); - - vfree(base); - kfree(buffer); -} - -static void perf_buffer_free(struct perf_buffer *buffer) -{ - schedule_work(&buffer->work); -} - -static struct perf_buffer * -perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) -{ - struct perf_buffer *buffer; - unsigned long size; - void *all_buf; - - size = sizeof(struct perf_buffer); - size += sizeof(void *); - - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - goto fail; - - INIT_WORK(&buffer->work, perf_buffer_free_work); - - all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); - if (!all_buf) - goto fail_all_buf; - - buffer->user_page = all_buf; - buffer->data_pages[0] = all_buf + PAGE_SIZE; - buffer->page_order = ilog2(nr_pages); - buffer->nr_pages = 1; - - perf_buffer_init(buffer, watermark, flags); - - return buffer; - -fail_all_buf: - kfree(buffer); - -fail: - return NULL; -} - -#endif - -static unsigned long perf_data_size(struct perf_buffer *buffer) -{ - return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); -} - static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct perf_event *event = vma->vm_file->private_data; - struct perf_buffer *buffer; + struct ring_buffer *rb; int ret = VM_FAULT_SIGBUS; if (vmf->flags & FAULT_FLAG_MKWRITE) { @@ -3613,14 +3405,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (!buffer) + rb = rcu_dereference(event->rb); + if (!rb) goto unlock; if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) goto unlock; - vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); + vmf->page = perf_mmap_to_page(rb, vmf->pgoff); if (!vmf->page) goto unlock; @@ -3635,35 +3427,35 @@ unlock: return ret; } -static void perf_buffer_free_rcu(struct rcu_head *rcu_head) +static void rb_free_rcu(struct rcu_head *rcu_head) { - struct perf_buffer *buffer; + struct ring_buffer *rb; - buffer = container_of(rcu_head, struct perf_buffer, rcu_head); - perf_buffer_free(buffer); + rb = container_of(rcu_head, struct ring_buffer, rcu_head); + rb_free(rb); } -static struct perf_buffer *perf_buffer_get(struct perf_event *event) +static struct ring_buffer *ring_buffer_get(struct perf_event *event) { - struct perf_buffer *buffer; + struct ring_buffer *rb; rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (buffer) { - if (!atomic_inc_not_zero(&buffer->refcount)) - buffer = NULL; + rb = rcu_dereference(event->rb); + if (rb) { + if (!atomic_inc_not_zero(&rb->refcount)) + rb = NULL; } rcu_read_unlock(); - return buffer; + return rb; } -static void perf_buffer_put(struct perf_buffer *buffer) +static void ring_buffer_put(struct ring_buffer *rb) { - if (!atomic_dec_and_test(&buffer->refcount)) + if (!atomic_dec_and_test(&rb->refcount)) return; - call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); + call_rcu(&rb->rcu_head, rb_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) @@ -3678,16 +3470,16 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->buffer); + unsigned long size = perf_data_size(event->rb); struct user_struct *user = event->mmap_user; - struct perf_buffer *buffer = event->buffer; + struct ring_buffer *rb = event->rb; atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); vma->vm_mm->locked_vm -= event->mmap_locked; - rcu_assign_pointer(event->buffer, NULL); + rcu_assign_pointer(event->rb, NULL); mutex_unlock(&event->mmap_mutex); - perf_buffer_put(buffer); + ring_buffer_put(rb); free_uid(user); } } @@ -3705,7 +3497,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; - struct perf_buffer *buffer; + struct ring_buffer *rb; unsigned long vma_size; unsigned long nr_pages; long user_extra, extra; @@ -3714,7 +3506,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) /* * Don't allow mmap() of inherited per-task counters. This would * create a performance issue due to all children writing to the - * same buffer. + * same rb. */ if (event->cpu == -1 && event->attr.inherit) return -EINVAL; @@ -3726,7 +3518,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) nr_pages = (vma_size / PAGE_SIZE) - 1; /* - * If we have buffer pages ensure they're a power-of-two number, so we + * If we have rb pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. */ if (nr_pages != 0 && !is_power_of_2(nr_pages)) @@ -3740,9 +3532,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->mmap_mutex); - if (event->buffer) { - if (event->buffer->nr_pages == nr_pages) - atomic_inc(&event->buffer->refcount); + if (event->rb) { + if (event->rb->nr_pages == nr_pages) + atomic_inc(&event->rb->refcount); else ret = -EINVAL; goto unlock; @@ -3772,18 +3564,18 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - WARN_ON(event->buffer); + WARN_ON(event->rb); if (vma->vm_flags & VM_WRITE) - flags |= PERF_BUFFER_WRITABLE; + flags |= RING_BUFFER_WRITABLE; - buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, + rb = rb_alloc(nr_pages, event->attr.wakeup_watermark, event->cpu, flags); - if (!buffer) { + if (!rb) { ret = -ENOMEM; goto unlock; } - rcu_assign_pointer(event->buffer, buffer); + rcu_assign_pointer(event->rb, rb); atomic_long_add(user_extra, &user->locked_vm); event->mmap_locked = extra; @@ -3882,117 +3674,6 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); -/* - * Output - */ -static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, - unsigned long offset, unsigned long head) -{ - unsigned long mask; - - if (!buffer->writable) - return true; - - mask = perf_data_size(buffer) - 1; - - offset = (offset - tail) & mask; - head = (head - tail) & mask; - - if ((int)(head - offset) < 0) - return false; - - return true; -} - -static void perf_output_wakeup(struct perf_output_handle *handle) -{ - atomic_set(&handle->buffer->poll, POLL_IN); - - if (handle->nmi) { - handle->event->pending_wakeup = 1; - irq_work_queue(&handle->event->pending); - } else - perf_event_wakeup(handle->event); -} - -/* - * We need to ensure a later event_id doesn't publish a head when a former - * event isn't done writing. However since we need to deal with NMIs we - * cannot fully serialize things. - * - * We only publish the head (and generate a wakeup) when the outer-most - * event completes. - */ -static void perf_output_get_handle(struct perf_output_handle *handle) -{ - struct perf_buffer *buffer = handle->buffer; - - preempt_disable(); - local_inc(&buffer->nest); - handle->wakeup = local_read(&buffer->wakeup); -} - -static void perf_output_put_handle(struct perf_output_handle *handle) -{ - struct perf_buffer *buffer = handle->buffer; - unsigned long head; - -again: - head = local_read(&buffer->head); - - /* - * IRQ/NMI can happen here, which means we can miss a head update. - */ - - if (!local_dec_and_test(&buffer->nest)) - goto out; - - /* - * Publish the known good head. Rely on the full barrier implied - * by atomic_dec_and_test() order the buffer->head read and this - * write. - */ - buffer->user_page->data_head = head; - - /* - * Now check if we missed an update, rely on the (compiler) - * barrier in atomic_dec_and_test() to re-read buffer->head. - */ - if (unlikely(head != local_read(&buffer->head))) { - local_inc(&buffer->nest); - goto again; - } - - if (handle->wakeup != local_read(&buffer->wakeup)) - perf_output_wakeup(handle); - -out: - preempt_enable(); -} - -__always_inline void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) -{ - do { - unsigned long size = min_t(unsigned long, handle->size, len); - - memcpy(handle->addr, buf, size); - - len -= size; - handle->addr += size; - buf += size; - handle->size -= size; - if (!handle->size) { - struct perf_buffer *buffer = handle->buffer; - - handle->page++; - handle->page &= buffer->nr_pages - 1; - handle->addr = buffer->data_pages[handle->page]; - handle->size = PAGE_SIZE << page_order(buffer); - } - } while (len); -} - static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) @@ -4023,9 +3704,9 @@ static void __perf_event_header__init_id(struct perf_event_header *header, } } -static void perf_event_header__init_id(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event) +void perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) { if (event->attr.sample_id_all) __perf_event_header__init_id(header, data, event); @@ -4052,121 +3733,14 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle, perf_output_put(handle, data->cpu_entry); } -static void perf_event__output_id_sample(struct perf_event *event, - struct perf_output_handle *handle, - struct perf_sample_data *sample) +void perf_event__output_id_sample(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data *sample) { if (event->attr.sample_id_all) __perf_event__output_id_sample(handle, sample); } -int perf_output_begin(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size, - int nmi, int sample) -{ - struct perf_buffer *buffer; - unsigned long tail, offset, head; - int have_lost; - struct perf_sample_data sample_data; - struct { - struct perf_event_header header; - u64 id; - u64 lost; - } lost_event; - - rcu_read_lock(); - /* - * For inherited events we send all the output towards the parent. - */ - if (event->parent) - event = event->parent; - - buffer = rcu_dereference(event->buffer); - if (!buffer) - goto out; - - handle->buffer = buffer; - handle->event = event; - handle->nmi = nmi; - handle->sample = sample; - - if (!buffer->nr_pages) - goto out; - - have_lost = local_read(&buffer->lost); - if (have_lost) { - lost_event.header.size = sizeof(lost_event); - perf_event_header__init_id(&lost_event.header, &sample_data, - event); - size += lost_event.header.size; - } - - perf_output_get_handle(handle); - - do { - /* - * Userspace could choose to issue a mb() before updating the - * tail pointer. So that all reads will be completed before the - * write is issued. - */ - tail = ACCESS_ONCE(buffer->user_page->data_tail); - smp_rmb(); - offset = head = local_read(&buffer->head); - head += size; - if (unlikely(!perf_output_space(buffer, tail, offset, head))) - goto fail; - } while (local_cmpxchg(&buffer->head, offset, head) != offset); - - if (head - local_read(&buffer->wakeup) > buffer->watermark) - local_add(buffer->watermark, &buffer->wakeup); - - handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); - handle->page &= buffer->nr_pages - 1; - handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); - handle->addr = buffer->data_pages[handle->page]; - handle->addr += handle->size; - handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; - - if (have_lost) { - lost_event.header.type = PERF_RECORD_LOST; - lost_event.header.misc = 0; - lost_event.id = event->id; - lost_event.lost = local_xchg(&buffer->lost, 0); - - perf_output_put(handle, lost_event); - perf_event__output_id_sample(event, handle, &sample_data); - } - - return 0; - -fail: - local_inc(&buffer->lost); - perf_output_put_handle(handle); -out: - rcu_read_unlock(); - - return -ENOSPC; -} - -void perf_output_end(struct perf_output_handle *handle) -{ - struct perf_event *event = handle->event; - struct perf_buffer *buffer = handle->buffer; - - int wakeup_events = event->attr.wakeup_events; - - if (handle->sample && wakeup_events) { - int events = local_inc_return(&buffer->events); - if (events >= wakeup_events) { - local_sub(wakeup_events, &buffer->events); - local_inc(&buffer->wakeup); - } - } - - perf_output_put_handle(handle); - rcu_read_unlock(); -} - static void perf_output_read_one(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) @@ -4187,7 +3761,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); - perf_output_copy(handle, values, n * sizeof(u64)); + __output_copy(handle, values, n * sizeof(u64)); } /* @@ -4217,7 +3791,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); - perf_output_copy(handle, values, n * sizeof(u64)); + __output_copy(handle, values, n * sizeof(u64)); list_for_each_entry(sub, &leader->sibling_list, group_entry) { n = 0; @@ -4229,7 +3803,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); - perf_output_copy(handle, values, n * sizeof(u64)); + __output_copy(handle, values, n * sizeof(u64)); } } @@ -4309,7 +3883,7 @@ void perf_output_sample(struct perf_output_handle *handle, size *= sizeof(u64); - perf_output_copy(handle, data->callchain, size); + __output_copy(handle, data->callchain, size); } else { u64 nr = 0; perf_output_put(handle, nr); @@ -4319,8 +3893,8 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_RAW) { if (data->raw) { perf_output_put(handle, data->raw->size); - perf_output_copy(handle, data->raw->data, - data->raw->size); + __output_copy(handle, data->raw->data, + data->raw->size); } else { struct { u32 size; @@ -4617,7 +4191,7 @@ static void perf_event_comm_output(struct perf_event *event, comm_event->event_id.tid = perf_event_tid(event, comm_event->task); perf_output_put(&handle, comm_event->event_id); - perf_output_copy(&handle, comm_event->comm, + __output_copy(&handle, comm_event->comm, comm_event->comm_size); perf_event__output_id_sample(event, &handle, &sample); @@ -4763,7 +4337,7 @@ static void perf_event_mmap_output(struct perf_event *event, mmap_event->event_id.tid = perf_event_tid(event, current); perf_output_put(&handle, mmap_event->event_id); - perf_output_copy(&handle, mmap_event->file_name, + __output_copy(&handle, mmap_event->file_name, mmap_event->file_size); perf_event__output_id_sample(event, &handle, &sample); @@ -4819,7 +4393,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) if (file) { /* - * d_path works from the end of the buffer backwards, so we + * d_path works from the end of the rb backwards, so we * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. */ @@ -6346,7 +5920,7 @@ err_size: static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct perf_buffer *buffer = NULL, *old_buffer = NULL; + struct ring_buffer *rb = NULL, *old_rb = NULL; int ret = -EINVAL; if (!output_event) @@ -6363,7 +5937,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) goto out; /* - * If its not a per-cpu buffer, it must be the same task. + * If its not a per-cpu rb, it must be the same task. */ if (output_event->cpu == -1 && output_event->ctx != event->ctx) goto out; @@ -6375,20 +5949,20 @@ set: goto unlock; if (output_event) { - /* get the buffer we want to redirect to */ - buffer = perf_buffer_get(output_event); - if (!buffer) + /* get the rb we want to redirect to */ + rb = ring_buffer_get(output_event); + if (!rb) goto unlock; } - old_buffer = event->buffer; - rcu_assign_pointer(event->buffer, buffer); + old_rb = event->rb; + rcu_assign_pointer(event->rb, rb); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); - if (old_buffer) - perf_buffer_put(old_buffer); + if (old_rb) + ring_buffer_put(old_rb); out: return ret; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h new file mode 100644 index 0000000..114f27f --- /dev/null +++ b/kernel/events/internal.h @@ -0,0 +1,97 @@ +#ifndef _KERNEL_EVENTS_INTERNAL_H +#define _KERNEL_EVENTS_INTERNAL_H + +#define RING_BUFFER_WRITABLE 0x01 + +struct ring_buffer { + atomic_t refcount; + struct rcu_head rcu_head; +#ifdef CONFIG_PERF_USE_VMALLOC + struct work_struct work; + int page_order; /* allocation order */ +#endif + int nr_pages; /* nr of data pages */ + int writable; /* are we writable */ + + atomic_t poll; /* POLL_ for wakeups */ + + local_t head; /* write position */ + local_t nest; /* nested writers */ + local_t events; /* event limit */ + local_t wakeup; /* wakeup stamp */ + local_t lost; /* nr records lost */ + + long watermark; /* wakeup watermark */ + + struct perf_event_mmap_page *user_page; + void *data_pages[0]; +}; + + +extern void rb_free(struct ring_buffer *rb); +extern struct ring_buffer * +rb_alloc(int nr_pages, long watermark, int cpu, int flags); +extern void perf_event_wakeup(struct perf_event *event); + +extern void +perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event); +extern void +perf_event__output_id_sample(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data *sample); + +extern struct page * +perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff); + +#ifdef CONFIG_PERF_USE_VMALLOC +/* + * Back perf_mmap() with vmalloc memory. + * + * Required for architectures that have d-cache aliasing issues. + */ + +static inline int page_order(struct ring_buffer *rb) +{ + return rb->page_order; +} + +#else + +static inline int page_order(struct ring_buffer *rb) +{ + return 0; +} +#endif + +static unsigned long perf_data_size(struct ring_buffer *rb) +{ + return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); +} + +static inline void +__output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) +{ + do { + unsigned long size = min_t(unsigned long, handle->size, len); + + memcpy(handle->addr, buf, size); + + len -= size; + handle->addr += size; + buf += size; + handle->size -= size; + if (!handle->size) { + struct ring_buffer *rb = handle->rb; + + handle->page++; + handle->page &= rb->nr_pages - 1; + handle->addr = rb->data_pages[handle->page]; + handle->size = PAGE_SIZE << page_order(rb); + } + } while (len); +} + +#endif /* _KERNEL_EVENTS_INTERNAL_H */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c new file mode 100644 index 0000000..fde5259 --- /dev/null +++ b/kernel/events/ring_buffer.c @@ -0,0 +1,399 @@ +/* + * Performance events ring-buffer code: + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra + * Copyright © 2009 Paul Mackerras, IBM Corp. + * + * For licensing details see kernel-base/COPYING + */ + +#include +#include +#include + +#include "internal.h" + +static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, + unsigned long offset, unsigned long head) +{ + unsigned long mask; + + if (!rb->writable) + return true; + + mask = perf_data_size(rb) - 1; + + offset = (offset - tail) & mask; + head = (head - tail) & mask; + + if ((int)(head - offset) < 0) + return false; + + return true; +} + +static void perf_output_wakeup(struct perf_output_handle *handle) +{ + atomic_set(&handle->rb->poll, POLL_IN); + + if (handle->nmi) { + handle->event->pending_wakeup = 1; + irq_work_queue(&handle->event->pending); + } else + perf_event_wakeup(handle->event); +} + +/* + * We need to ensure a later event_id doesn't publish a head when a former + * event isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event completes. + */ +static void perf_output_get_handle(struct perf_output_handle *handle) +{ + struct ring_buffer *rb = handle->rb; + + preempt_disable(); + local_inc(&rb->nest); + handle->wakeup = local_read(&rb->wakeup); +} + +static void perf_output_put_handle(struct perf_output_handle *handle) +{ + struct ring_buffer *rb = handle->rb; + unsigned long head; + +again: + head = local_read(&rb->head); + + /* + * IRQ/NMI can happen here, which means we can miss a head update. + */ + + if (!local_dec_and_test(&rb->nest)) + goto out; + + /* + * Publish the known good head. Rely on the full barrier implied + * by atomic_dec_and_test() order the rb->head read and this + * write. + */ + rb->user_page->data_head = head; + + /* + * Now check if we missed an update, rely on the (compiler) + * barrier in atomic_dec_and_test() to re-read rb->head. + */ + if (unlikely(head != local_read(&rb->head))) { + local_inc(&rb->nest); + goto again; + } + + if (handle->wakeup != local_read(&rb->wakeup)) + perf_output_wakeup(handle); + +out: + preempt_enable(); +} + +int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + int nmi, int sample) +{ + struct ring_buffer *rb; + unsigned long tail, offset, head; + int have_lost; + struct perf_sample_data sample_data; + struct { + struct perf_event_header header; + u64 id; + u64 lost; + } lost_event; + + rcu_read_lock(); + /* + * For inherited events we send all the output towards the parent. + */ + if (event->parent) + event = event->parent; + + rb = rcu_dereference(event->rb); + if (!rb) + goto out; + + handle->rb = rb; + handle->event = event; + handle->nmi = nmi; + handle->sample = sample; + + if (!rb->nr_pages) + goto out; + + have_lost = local_read(&rb->lost); + if (have_lost) { + lost_event.header.size = sizeof(lost_event); + perf_event_header__init_id(&lost_event.header, &sample_data, + event); + size += lost_event.header.size; + } + + perf_output_get_handle(handle); + + do { + /* + * Userspace could choose to issue a mb() before updating the + * tail pointer. So that all reads will be completed before the + * write is issued. + */ + tail = ACCESS_ONCE(rb->user_page->data_tail); + smp_rmb(); + offset = head = local_read(&rb->head); + head += size; + if (unlikely(!perf_output_space(rb, tail, offset, head))) + goto fail; + } while (local_cmpxchg(&rb->head, offset, head) != offset); + + if (head - local_read(&rb->wakeup) > rb->watermark) + local_add(rb->watermark, &rb->wakeup); + + handle->page = offset >> (PAGE_SHIFT + page_order(rb)); + handle->page &= rb->nr_pages - 1; + handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1); + handle->addr = rb->data_pages[handle->page]; + handle->addr += handle->size; + handle->size = (PAGE_SIZE << page_order(rb)) - handle->size; + + if (have_lost) { + lost_event.header.type = PERF_RECORD_LOST; + lost_event.header.misc = 0; + lost_event.id = event->id; + lost_event.lost = local_xchg(&rb->lost, 0); + + perf_output_put(handle, lost_event); + perf_event__output_id_sample(event, handle, &sample_data); + } + + return 0; + +fail: + local_inc(&rb->lost); + perf_output_put_handle(handle); +out: + rcu_read_unlock(); + + return -ENOSPC; +} + +void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) +{ + __output_copy(handle, buf, len); +} + +void perf_output_end(struct perf_output_handle *handle) +{ + struct perf_event *event = handle->event; + struct ring_buffer *rb = handle->rb; + + int wakeup_events = event->attr.wakeup_events; + + if (handle->sample && wakeup_events) { + int events = local_inc_return(&rb->events); + if (events >= wakeup_events) { + local_sub(wakeup_events, &rb->events); + local_inc(&rb->wakeup); + } + } + + perf_output_put_handle(handle); + rcu_read_unlock(); +} + +static void +ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) +{ + long max_size = perf_data_size(rb); + + if (watermark) + rb->watermark = min(max_size, watermark); + + if (!rb->watermark) + rb->watermark = max_size / 2; + + if (flags & RING_BUFFER_WRITABLE) + rb->writable = 1; + + atomic_set(&rb->refcount, 1); +} + +#ifndef CONFIG_PERF_USE_VMALLOC + +/* + * Back perf_mmap() with regular GFP_KERNEL-0 pages. + */ + +struct page * +perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +{ + if (pgoff > rb->nr_pages) + return NULL; + + if (pgoff == 0) + return virt_to_page(rb->user_page); + + return virt_to_page(rb->data_pages[pgoff - 1]); +} + +static void *perf_mmap_alloc_page(int cpu) +{ + struct page *page; + int node; + + node = (cpu == -1) ? cpu : cpu_to_node(cpu); + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) + return NULL; + + return page_address(page); +} + +struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) +{ + struct ring_buffer *rb; + unsigned long size; + int i; + + size = sizeof(struct ring_buffer); + size += nr_pages * sizeof(void *); + + rb = kzalloc(size, GFP_KERNEL); + if (!rb) + goto fail; + + rb->user_page = perf_mmap_alloc_page(cpu); + if (!rb->user_page) + goto fail_user_page; + + for (i = 0; i < nr_pages; i++) { + rb->data_pages[i] = perf_mmap_alloc_page(cpu); + if (!rb->data_pages[i]) + goto fail_data_pages; + } + + rb->nr_pages = nr_pages; + + ring_buffer_init(rb, watermark, flags); + + return rb; + +fail_data_pages: + for (i--; i >= 0; i--) + free_page((unsigned long)rb->data_pages[i]); + + free_page((unsigned long)rb->user_page); + +fail_user_page: + kfree(rb); + +fail: + return NULL; +} + +static void perf_mmap_free_page(unsigned long addr) +{ + struct page *page = virt_to_page((void *)addr); + + page->mapping = NULL; + __free_page(page); +} + +void rb_free(struct ring_buffer *rb) +{ + int i; + + perf_mmap_free_page((unsigned long)rb->user_page); + for (i = 0; i < rb->nr_pages; i++) + perf_mmap_free_page((unsigned long)rb->data_pages[i]); + kfree(rb); +} + +#else + +struct page * +perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +{ + if (pgoff > (1UL << page_order(rb))) + return NULL; + + return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); +} + +static void perf_mmap_unmark_page(void *addr) +{ + struct page *page = vmalloc_to_page(addr); + + page->mapping = NULL; +} + +static void rb_free_work(struct work_struct *work) +{ + struct ring_buffer *rb; + void *base; + int i, nr; + + rb = container_of(work, struct ring_buffer, work); + nr = 1 << page_order(rb); + + base = rb->user_page; + for (i = 0; i < nr + 1; i++) + perf_mmap_unmark_page(base + (i * PAGE_SIZE)); + + vfree(base); + kfree(rb); +} + +void rb_free(struct ring_buffer *rb) +{ + schedule_work(&rb->work); +} + +struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) +{ + struct ring_buffer *rb; + unsigned long size; + void *all_buf; + + size = sizeof(struct ring_buffer); + size += sizeof(void *); + + rb = kzalloc(size, GFP_KERNEL); + if (!rb) + goto fail; + + INIT_WORK(&rb->work, rb_free_work); + + all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); + if (!all_buf) + goto fail_all_buf; + + rb->user_page = all_buf; + rb->data_pages[0] = all_buf + PAGE_SIZE; + rb->page_order = ilog2(nr_pages); + rb->nr_pages = 1; + + ring_buffer_init(rb, watermark, flags); + + return rb; + +fail_all_buf: + kfree(rb); + +fail: + return NULL; +} + +#endif -- cgit v0.10.2 From e7e2ee89a9dbf48d70a922d5625cd7320a27cbff Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Tue, 10 May 2011 13:27:21 -0700 Subject: tracing: Schedule a delayed work to call wakeup() In using syscall tracing by concurrent processes, the wakeup() that is called in the event commit function causes contention on the spin lock of the waitqueue. I enabled sys_enter_getuid and sys_exit_getuid tracepoints, and by running getuid_microbench from autotest in parallel I found that the contention causes exponential latency increase in the tracing path. The autotest binary getuid_microbench calls getuid() in a tight loop for the given number of iterations and measures the average time required to complete a single invocation of syscall. The patch schedules a delayed work after 2 ms once an event commit calls to wake up the trace wait_queue. This removes the delay caused by contention on spin lock in wakeup() and amortizes the wakeup() calls scheduled over the 2 ms period. In the following example, the script enables the sys_enter_getuid and sys_exit_getuid tracepoints and runs the getuid_microbench in parallel with the given number of processes. The output clearly shows the latency increase caused by contentions. $ ~/getuid.sh 1 1000000 calls in 0.720974253 s (720.974253 ns/call) $ ~/getuid.sh 2 1000000 calls in 1.166457554 s (1166.457554 ns/call) 1000000 calls in 1.168933765 s (1168.933765 ns/call) $ ~/getuid.sh 3 1000000 calls in 1.783827516 s (1783.827516 ns/call) 1000000 calls in 1.795553270 s (1795.553270 ns/call) 1000000 calls in 1.796493376 s (1796.493376 ns/call) $ ~/getuid.sh 4 1000000 calls in 4.483041796 s (4483.041796 ns/call) 1000000 calls in 4.484165388 s (4484.165388 ns/call) 1000000 calls in 4.484850762 s (4484.850762 ns/call) 1000000 calls in 4.485643576 s (4485.643576 ns/call) $ ~/getuid.sh 5 1000000 calls in 6.497521653 s (6497.521653 ns/call) 1000000 calls in 6.502000236 s (6502.000236 ns/call) 1000000 calls in 6.501709115 s (6501.709115 ns/call) 1000000 calls in 6.502124100 s (6502.124100 ns/call) 1000000 calls in 6.502936358 s (6502.936358 ns/call) After the patch, the latencies scale better. 1000000 calls in 0.728720455 s (728.720455 ns/call) 1000000 calls in 0.842782857 s (842.782857 ns/call) 1000000 calls in 0.883803135 s (883.803135 ns/call) 1000000 calls in 0.902077764 s (902.077764 ns/call) 1000000 calls in 0.902838202 s (902.838202 ns/call) 1000000 calls in 0.908896885 s (908.896885 ns/call) 1000000 calls in 0.932523515 s (932.523515 ns/call) 1000000 calls in 0.958009672 s (958.009672 ns/call) 1000000 calls in 0.986188020 s (986.188020 ns/call) 1000000 calls in 0.989771102 s (989.771102 ns/call) 1000000 calls in 0.933518391 s (933.518391 ns/call) 1000000 calls in 0.958897947 s (958.897947 ns/call) 1000000 calls in 1.031038897 s (1031.038897 ns/call) 1000000 calls in 1.089516025 s (1089.516025 ns/call) 1000000 calls in 1.141998347 s (1141.998347 ns/call) Signed-off-by: Vaibhav Nagarnaik Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Michael Rubin Cc: David Sharp Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1305059241-7629-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ee9c921..71777c8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -343,26 +343,27 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); +static void wakeup_work_handler(struct work_struct *work) +{ + wake_up(&trace_wait); +} + +static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); + /** * trace_wake_up - wake up tasks waiting for trace input * - * Simply wakes up any task that is blocked on the trace_wait - * queue. These is used with trace_poll for tasks polling the trace. + * Schedules a delayed work to wake up any task that is blocked on the + * trace_wait queue. These is used with trace_poll for tasks polling the + * trace. */ void trace_wake_up(void) { - int cpu; + const unsigned long delay = msecs_to_jiffies(2); if (trace_flags & TRACE_ITER_BLOCK) return; - /* - * The runqueue_is_locked() can fail, but this is the best we - * have for now: - */ - cpu = get_cpu(); - if (!runqueue_is_locked(cpu)) - wake_up(&trace_wait); - put_cpu(); + schedule_delayed_work(&wakeup_work, delay); } static int __init set_buf_size(char *str) -- cgit v0.10.2 From 7ea5906405a1f3fc1c0033dfd7e02f2cfd1de5e5 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Tue, 3 May 2011 17:56:42 -0700 Subject: tracing: Use NUMA allocation for per-cpu ring buffer pages The tracing ring buffer is a group of per-cpu ring buffers where allocation and logging is done on a per-cpu basis. The events that are generated on a particular CPU are logged in the corresponding buffer. This is to provide wait-free writes between CPUs and good NUMA node locality while accessing the ring buffer. However, the allocation routines consider NUMA locality only for buffer page metadata and not for the actual buffer page. This causes the pages to be allocated on the NUMA node local to the CPU where the allocation routine is running at the time. This patch fixes the problem by using a NUMA node specific allocation routine so that the pages are allocated from a NUMA node local to the logging CPU. I tested with the getuid_microbench from autotest. It is a simple binary that calls getuid() in a loop and measures the average time for the syscall to complete. The following command was used to test: $ getuid_microbench 1000000 Compared the numbers found on kernel with and without this patch and found that logging latency decreases by 30-50 ns/call. tracing with non-NUMA allocation - 569 ns/call tracing with NUMA allocation - 512 ns/call Signed-off-by: Vaibhav Nagarnaik Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Michael Rubin Cc: David Sharp Link: http://lkml.kernel.org/r/1304470602-20366-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index ab38ac8..b891de9 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -169,7 +169,7 @@ void ring_buffer_set_clock(struct ring_buffer *buffer, size_t ring_buffer_page_len(void *page); -void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); +void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu); void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, size_t len, int cpu, int full); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b0c7aa4..725153d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -997,13 +997,14 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { struct buffer_page *bpage, *tmp; - unsigned long addr; LIST_HEAD(pages); unsigned i; WARN_ON(!nr_pages); for (i = 0; i < nr_pages; i++) { + struct page *page; + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); if (!bpage) @@ -1013,10 +1014,11 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, &pages); - addr = __get_free_page(GFP_KERNEL); - if (!addr) + page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), + GFP_KERNEL, 0); + if (!page) goto free_pages; - bpage->page = (void *)addr; + bpage->page = page_address(page); rb_init_page(bpage->page); } @@ -1045,7 +1047,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; - unsigned long addr; + struct page *page; int ret; cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), @@ -1067,10 +1069,10 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) rb_check_bpage(cpu_buffer, bpage); cpu_buffer->reader_page = bpage; - addr = __get_free_page(GFP_KERNEL); - if (!addr) + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + if (!page) goto fail_free_reader; - bpage->page = (void *)addr; + bpage->page = page_address(page); rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); @@ -1314,7 +1316,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) unsigned nr_pages, rm_pages, new_pages; struct buffer_page *bpage, *tmp; unsigned long buffer_size; - unsigned long addr; LIST_HEAD(pages); int i, cpu; @@ -1375,16 +1376,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) for_each_buffer_cpu(buffer, cpu) { for (i = 0; i < new_pages; i++) { + struct page *page; bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); if (!bpage) goto free_pages; list_add(&bpage->list, &pages); - addr = __get_free_page(GFP_KERNEL); - if (!addr) + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + if (!page) goto free_pages; - bpage->page = (void *)addr; + bpage->page = page_address(page); rb_init_page(bpage->page); } } @@ -3730,16 +3732,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); * Returns: * The page allocated, or NULL on error. */ -void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) +void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) { struct buffer_data_page *bpage; - unsigned long addr; + struct page *page; - addr = __get_free_page(GFP_KERNEL); - if (!addr) + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + if (!page) return NULL; - bpage = (void *)addr; + bpage = page_address(page); rb_init_page(bpage); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 302f8a6..a5457d5 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -106,7 +106,7 @@ static enum event_status read_page(int cpu) int inc; int i; - bpage = ring_buffer_alloc_read_page(buffer); + bpage = ring_buffer_alloc_read_page(buffer, cpu); if (!bpage) return EVENT_DROPPED; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 71777c8..61fda6b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3697,7 +3697,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return 0; if (!info->spare) - info->spare = ring_buffer_alloc_read_page(info->tr->buffer); + info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); if (!info->spare) return -ENOMEM; @@ -3854,7 +3854,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, ref->ref = 1; ref->buffer = info->tr->buffer; - ref->page = ring_buffer_alloc_read_page(ref->buffer); + ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); if (!ref->page) { kfree(ref); break; @@ -3863,8 +3863,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, r = ring_buffer_read_page(ref->buffer, &ref->page, len, info->cpu, 1); if (r < 0) { - ring_buffer_free_read_page(ref->buffer, - ref->page); + ring_buffer_free_read_page(ref->buffer, ref->page); kfree(ref); break; } -- cgit v0.10.2 From 4f271a2a60c748599b30bb4dafff30d770439b96 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Mon, 13 Jun 2011 17:51:57 -0700 Subject: tracing: Add a proc file to stop tracing and free buffer The proc file entry buffer_size_kb is used to set the size of tracing buffer. The memory to expand the buffer size is kernel memory. Consider a use case where tracing is handled by a user space utility, which acts as a gate keeper for tracing requests. In an OOM condition, tracing is considered a low priority task and if the utility gets killed the ring buffer memory cannot be released back to the kernel. This patch adds a proc file called "free_buffer" whose purpose is to stop tracing and free up the ring buffer when it is closed. The user space process can then set the desired size in buffer_size_kb file and open the fd to the "free_buffer" file. Under OOM condition, if the process gets killed, the kernel closes the file descriptor. The release handler stops the tracing and releases the kernel memory automatically. Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Michael Rubin Cc: David Sharp Signed-off-by: Vaibhav Nagarnaik Link: http://lkml.kernel.org/r/1308012717-11148-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 61fda6b..9c557ae 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2768,7 +2768,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr) return t->init(tr); } -static int tracing_resize_ring_buffer(unsigned long size) +static int __tracing_resize_ring_buffer(unsigned long size) { int ret; @@ -2820,6 +2820,41 @@ static int tracing_resize_ring_buffer(unsigned long size) return ret; } +static ssize_t tracing_resize_ring_buffer(unsigned long size) +{ + int cpu, ret = size; + + mutex_lock(&trace_types_lock); + + tracing_stop(); + + /* disable all cpu buffers */ + for_each_tracing_cpu(cpu) { + if (global_trace.data[cpu]) + atomic_inc(&global_trace.data[cpu]->disabled); + if (max_tr.data[cpu]) + atomic_inc(&max_tr.data[cpu]->disabled); + } + + if (size != global_trace.entries) + ret = __tracing_resize_ring_buffer(size); + + if (ret < 0) + ret = -ENOMEM; + + for_each_tracing_cpu(cpu) { + if (global_trace.data[cpu]) + atomic_dec(&global_trace.data[cpu]->disabled); + if (max_tr.data[cpu]) + atomic_dec(&max_tr.data[cpu]->disabled); + } + + tracing_start(); + mutex_unlock(&trace_types_lock); + + return ret; +} + /** * tracing_update_buffers - used by tracing facility to expand ring buffers @@ -2837,7 +2872,7 @@ int tracing_update_buffers(void) mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) - ret = tracing_resize_ring_buffer(trace_buf_size); + ret = __tracing_resize_ring_buffer(trace_buf_size); mutex_unlock(&trace_types_lock); return ret; @@ -2861,7 +2896,7 @@ static int tracing_set_tracer(const char *buf) mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) { - ret = tracing_resize_ring_buffer(trace_buf_size); + ret = __tracing_resize_ring_buffer(trace_buf_size); if (ret < 0) goto out; ret = 0; @@ -3436,7 +3471,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, { unsigned long val; char buf[64]; - int ret, cpu; + int ret; if (cnt >= sizeof(buf)) return -EINVAL; @@ -3454,48 +3489,43 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, if (!val) return -EINVAL; - mutex_lock(&trace_types_lock); - - tracing_stop(); - - /* disable all cpu buffers */ - for_each_tracing_cpu(cpu) { - if (global_trace.data[cpu]) - atomic_inc(&global_trace.data[cpu]->disabled); - if (max_tr.data[cpu]) - atomic_inc(&max_tr.data[cpu]->disabled); - } - /* value is in KB */ val <<= 10; - if (val != global_trace.entries) { - ret = tracing_resize_ring_buffer(val); - if (ret < 0) { - cnt = ret; - goto out; - } - } + ret = tracing_resize_ring_buffer(val); + if (ret < 0) + return ret; *ppos += cnt; - /* If check pages failed, return ENOMEM */ - if (tracing_disabled) - cnt = -ENOMEM; - out: - for_each_tracing_cpu(cpu) { - if (global_trace.data[cpu]) - atomic_dec(&global_trace.data[cpu]->disabled); - if (max_tr.data[cpu]) - atomic_dec(&max_tr.data[cpu]->disabled); - } + return cnt; +} - tracing_start(); - mutex_unlock(&trace_types_lock); +static ssize_t +tracing_free_buffer_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + /* + * There is no need to read what the user has written, this function + * is just to make sure that there is no error when "echo" is used + */ + + *ppos += cnt; return cnt; } +static int +tracing_free_buffer_release(struct inode *inode, struct file *filp) +{ + /* disable tracing */ + tracing_off(); + /* resize the ring buffer to 0 */ + tracing_resize_ring_buffer(0); + + return 0; +} + static int mark_printk(const char *fmt, ...) { int ret; @@ -3641,6 +3671,11 @@ static const struct file_operations tracing_entries_fops = { .llseek = generic_file_llseek, }; +static const struct file_operations tracing_free_buffer_fops = { + .write = tracing_free_buffer_write, + .release = tracing_free_buffer_release, +}; + static const struct file_operations tracing_mark_fops = { .open = tracing_open_generic, .write = tracing_mark_write, @@ -4365,6 +4400,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("buffer_size_kb", 0644, d_tracer, &global_trace, &tracing_entries_fops); + trace_create_file("free_buffer", 0644, d_tracer, + &global_trace, &tracing_free_buffer_fops); + trace_create_file("trace_marker", 0220, d_tracer, NULL, &tracing_mark_fops); -- cgit v0.10.2 From cf30cf67d6c7592c670ec946d89fc15ee0deb0eb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Jun 2011 22:44:07 -0400 Subject: tracing: Add disable_on_free option Add a trace option to disable tracing on free. When this option is set, a write into the free_buffer file will not only shrink the ring buffer down to zero, but it will also disable tracing. Cc: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9c557ae..42fdf3a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -425,6 +425,7 @@ static const char *trace_options[] = { "graph-time", "record-cmd", "overwrite", + "disable_on_free", NULL }; @@ -3518,8 +3519,9 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf, static int tracing_free_buffer_release(struct inode *inode, struct file *filp) { - /* disable tracing */ - tracing_off(); + /* disable tracing ? */ + if (trace_flags & TRACE_ITER_STOP_ON_FREE) + tracing_off(); /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(0); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 229f859..742f545 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -609,6 +609,7 @@ enum trace_iterator_flags { TRACE_ITER_GRAPH_TIME = 0x80000, TRACE_ITER_RECORD_CMD = 0x100000, TRACE_ITER_OVERWRITE = 0x200000, + TRACE_ITER_STOP_ON_FREE = 0x400000, }; /* -- cgit v0.10.2 From bd38c0e6f98326132a691d73b2056b426423c638 Mon Sep 17 00:00:00 2001 From: Paul McQuade Date: Tue, 31 May 2011 20:51:55 +0100 Subject: ftrace: Fixed an include coding style issue Removed because was already declared. Braces of struct's coding style fixed. Cc: Frederic Weisbecker Signed-off-by: Paul McQuade Link: http://lkml.kernel.org/r/4DE59711.3090900@gmail.com Signed-off-by: Steven Rostedt diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1ee417f..e153807 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -32,7 +32,6 @@ #include -#include #include #include "trace_output.h" @@ -82,8 +81,7 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops ftrace_list_end __read_mostly = -{ +static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, }; @@ -785,8 +783,7 @@ static void unregister_ftrace_profiler(void) unregister_ftrace_graph(); } #else -static struct ftrace_ops ftrace_profile_ops __read_mostly = -{ +static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, }; -- cgit v0.10.2 From 84c15027a7f2fbd7f1180d7cbb60e41abbbaedd2 Mon Sep 17 00:00:00 2001 From: Paul McQuade Date: Tue, 31 May 2011 20:51:55 +0100 Subject: async: Fixed an include coding style issue Added , and Removed . Added KERN_DEBUG to printk() functions. Acked-by: Arjan van de Ven Signed-off-by: Paul McQuade Link: http://lkml.kernel.org/r/4DE596B4.7030904@gmail.com Signed-off-by: Steven Rostedt diff --git a/kernel/async.c b/kernel/async.c index cd9dbb9..d5fe7af 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -49,12 +49,13 @@ asynchronous and synchronous parts of the kernel. */ #include +#include +#include #include #include #include #include #include -#include static async_cookie_t next_cookie = 1; @@ -128,7 +129,8 @@ static void async_run_entry_fn(struct work_struct *work) /* 2) run (and print duration) */ if (initcall_debug && system_state == SYSTEM_BOOTING) { - printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, + printk(KERN_DEBUG "calling %lli_%pF @ %i\n", + (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); } @@ -136,7 +138,7 @@ static void async_run_entry_fn(struct work_struct *work) if (initcall_debug && system_state == SYSTEM_BOOTING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - printk("initcall %lli_%pF returned 0 after %lld usecs\n", + printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n", (long long)entry->cookie, entry->func, (long long)ktime_to_ns(delta) >> 10); @@ -270,7 +272,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, ktime_t starttime, delta, endtime; if (initcall_debug && system_state == SYSTEM_BOOTING) { - printk("async_waiting @ %i\n", task_pid_nr(current)); + printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); } @@ -280,7 +282,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, endtime = ktime_get(); delta = ktime_sub(endtime, starttime); - printk("async_continuing @ %i after %lli usec\n", + printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n", task_pid_nr(current), (long long)ktime_to_ns(delta) >> 10); } -- cgit v0.10.2 From 321e68b095addc473ca52ced9acfa59be88f76e6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:47 +0200 Subject: tracing, function_graph: Remove dependency of abstime and duration fields on latency The display of absolute time and duration fields is based on the latency field. This was added during the irqsoff/wakeup tracers graph support changes. It's causing confusion in what fields will be displayed for the function_graph tracer itself. So I'm removing this depency, and adding absolute time and duration fields to the preemptirqsoff preemptoff irqsoff wakeup tracers. With following commands: # echo function_graph > ./current_tracer # cat trace This is what it looked like before: # tracer: function_graph # # TIME CPU DURATION FUNCTION CALLS # | | | | | | | | 0) 0.068 us | } /* page_add_file_rmap */ 0) | _raw_spin_unlock() { ... This is what it looks like now: # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 0) 0.068 us | } /* add_preempt_count */ 0) 0.993 us | } /* vfsmount_lock_local_lock */ ... For preemptirqsoff preemptoff irqsoff wakeup tracers, this is what it looked like before: SNIP # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / _-=> lock-depth # |||| / # CPU TASK/PID ||||| DURATION FUNCTION CALLS # | | | ||||| | | | | | | 1) -0 | d..1 0.000 us | acpi_idle_enter_simple(); ... This is what it looks like now: SNIP # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / # TIME CPU TASK/PID |||| DURATION FUNCTION CALLS # | | | | |||| | | | | | | 19.847735 | 1) -0 | d..1 0.000 us | acpi_idle_enter_simple(); ... Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-2-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 962cdb2..352652e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1207,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t -__print_graph_function_flags(struct trace_iterator *iter, u32 flags) +print_graph_function_flags(struct trace_iterator *iter, u32 flags) { struct ftrace_graph_ent_entry *field; struct fgraph_data *data = iter->private; @@ -1270,18 +1270,7 @@ __print_graph_function_flags(struct trace_iterator *iter, u32 flags) static enum print_line_t print_graph_function(struct trace_iterator *iter) { - return __print_graph_function_flags(iter, tracer_flags.val); -} - -enum print_line_t print_graph_function_flags(struct trace_iterator *iter, - u32 flags) -{ - if (trace_flags & TRACE_ITER_LATENCY_FMT) - flags |= TRACE_GRAPH_PRINT_DURATION; - else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; - - return __print_graph_function_flags(iter, flags); + return print_graph_function_flags(iter, tracer_flags.val); } static enum print_line_t @@ -1364,9 +1353,7 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) return; print_trace_header(s, iter); - flags |= TRACE_GRAPH_PRINT_DURATION; - } else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; + } __print_graph_headers_flags(s, flags); } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index c77424b..667aa8c 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -226,7 +226,9 @@ static void irqsoff_trace_close(struct trace_iterator *iter) } #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ - TRACE_GRAPH_PRINT_PROC) + TRACE_GRAPH_PRINT_PROC | \ + TRACE_GRAPH_PRINT_ABS_TIME | \ + TRACE_GRAPH_PRINT_DURATION) static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index f029dd4..e4a70c0 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -227,7 +227,9 @@ static void wakeup_trace_close(struct trace_iterator *iter) graph_trace_close(iter); } -#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \ + TRACE_GRAPH_PRINT_ABS_TIME | \ + TRACE_GRAPH_PRINT_DURATION) static enum print_line_t wakeup_print_line(struct trace_iterator *iter) { -- cgit v0.10.2 From ffeb80fc30acbf6bd51cb47a1815f621a9d017dc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:48 +0200 Subject: tracing, function_graph: Merge overhead and duration display functions Functions print_graph_overhead() and print_graph_duration() displays data for one field - DURATION. I merged them into single function print_graph_duration(), and added a way to display the empty parts of the field. This way the print_graph_irq() function can use this column to display the IRQ signs if needed and the DURATION field details stays inside the print_graph_duration() function. Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-3-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 352652e..44b955f 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -74,6 +74,20 @@ static struct tracer_flags tracer_flags = { static struct trace_array *graph_array; +/* + * DURATION column is being also used to display IRQ signs, + * following values are used by print_graph_irq and others + * to fill in space into DURATION column. + */ +enum { + DURATION_FILL_FULL = -1, + DURATION_FILL_START = -2, + DURATION_FILL_END = -3, +}; + +static enum print_line_t +print_graph_duration(unsigned long long duration, struct trace_seq *s, + u32 flags); /* Add a function return address to the trace stack on thread info.*/ int @@ -577,32 +591,6 @@ get_return_for_leaf(struct trace_iterator *iter, return next; } -/* Signal a overhead of time execution to the output */ -static int -print_graph_overhead(unsigned long long duration, struct trace_seq *s, - u32 flags) -{ - /* If duration disappear, we don't need anything */ - if (!(flags & TRACE_GRAPH_PRINT_DURATION)) - return 1; - - /* Non nested entry or return */ - if (duration == -1) - return trace_seq_printf(s, " "); - - if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { - /* Duration exceeded 100 msecs */ - if (duration > 100000ULL) - return trace_seq_printf(s, "! "); - - /* Duration exceeded 10 msecs */ - if (duration > 10000ULL) - return trace_seq_printf(s, "+ "); - } - - return trace_seq_printf(s, " "); -} - static int print_graph_abs_time(u64 t, struct trace_seq *s) { unsigned long usecs_rem; @@ -650,9 +638,9 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* No overhead */ - ret = print_graph_overhead(-1, s, flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + ret = print_graph_duration(DURATION_FILL_START, s, flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; if (type == TRACE_GRAPH_ENT) ret = trace_seq_printf(s, "==========>"); @@ -662,9 +650,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Don't close the duration column if haven't one */ - if (flags & TRACE_GRAPH_PRINT_DURATION) - trace_seq_printf(s, " |"); + ret = print_graph_duration(DURATION_FILL_END, s, flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; + ret = trace_seq_printf(s, "\n"); if (!ret) @@ -716,9 +705,48 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) } static enum print_line_t -print_graph_duration(unsigned long long duration, struct trace_seq *s) +print_graph_duration(unsigned long long duration, struct trace_seq *s, + u32 flags) { - int ret; + int ret = -1; + + if (!(flags & TRACE_GRAPH_PRINT_DURATION)) + return TRACE_TYPE_HANDLED; + + /* No real adata, just filling the column with spaces */ + switch (duration) { + case DURATION_FILL_FULL: + ret = trace_seq_printf(s, " | "); + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + case DURATION_FILL_START: + ret = trace_seq_printf(s, " "); + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + case DURATION_FILL_END: + ret = trace_seq_printf(s, " |"); + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + } + + /* Signal a overhead of time execution to the output */ + if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { + /* Duration exceeded 100 msecs */ + if (duration > 100000ULL) + ret = trace_seq_printf(s, "! "); + /* Duration exceeded 10 msecs */ + else if (duration > 10000ULL) + ret = trace_seq_printf(s, "+ "); + } + + /* + * The -1 means we either did not exceed the duration tresholds + * or we dont want to print out the overhead. Either way we need + * to fill out the space. + */ + if (ret == -1) + ret = trace_seq_printf(s, " "); + + /* Catching here any failure happenned above */ + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; ret = trace_print_graph_duration(duration, s); if (ret != TRACE_TYPE_HANDLED) @@ -767,18 +795,11 @@ print_graph_entry_leaf(struct trace_iterator *iter, cpu_data->enter_funcs[call->depth] = 0; } - /* Overhead */ - ret = print_graph_overhead(duration, s, flags); - if (!ret) + /* Overhead and duration */ + ret = print_graph_duration(duration, s, flags); + if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - /* Duration */ - if (flags & TRACE_GRAPH_PRINT_DURATION) { - ret = print_graph_duration(duration, s); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); @@ -815,17 +836,10 @@ print_graph_entry_nested(struct trace_iterator *iter, cpu_data->enter_funcs[call->depth] = call->func; } - /* No overhead */ - ret = print_graph_overhead(-1, s, flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - /* No time */ - if (flags & TRACE_GRAPH_PRINT_DURATION) { - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = print_graph_duration(DURATION_FILL_FULL, s, flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { @@ -1078,18 +1092,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (print_graph_prologue(iter, s, 0, 0, flags)) return TRACE_TYPE_PARTIAL_LINE; - /* Overhead */ - ret = print_graph_overhead(duration, s, flags); - if (!ret) + /* Overhead and duration */ + ret = print_graph_duration(duration, s, flags); + if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - /* Duration */ - if (flags & TRACE_GRAPH_PRINT_DURATION) { - ret = print_graph_duration(duration, s); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - /* Closing brace */ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); @@ -1146,17 +1153,10 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, if (print_graph_prologue(iter, s, 0, 0, flags)) return TRACE_TYPE_PARTIAL_LINE; - /* No overhead */ - ret = print_graph_overhead(-1, s, flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - /* No time */ - if (flags & TRACE_GRAPH_PRINT_DURATION) { - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = print_graph_duration(DURATION_FILL_FULL, s, flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; /* Indentation */ if (depth > 0) -- cgit v0.10.2 From f56e7f8efb4ec200364f690a9902713410e24d47 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:49 +0200 Subject: tracing, function: Fix trace header to follow context-info option The header display of function tracer does not follow the context-info option, so field names are displayed even if this option is off. Added check for TRACE_ITER_CONTEXT_INFO trace_flags. With following commands: # echo function > ./current_tracer # echo 0 > options/context-info # cat trace This is what it looked like before: # tracer: function # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | add_preempt_count <-schedule rcu_note_context_switch <-schedule ... This is what it looks like now: # tracer: function # _raw_spin_unlock_irqrestore <-hrtimer_try_to_cancel ... Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-4-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42fdf3a..cf22b4b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2053,6 +2053,9 @@ void trace_default_header(struct seq_file *m) { struct trace_iterator *iter = m->private; + if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return; + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) -- cgit v0.10.2 From 199abfab40389963b397c2982222e68ea782b2cf Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:50 +0200 Subject: tracing, function_graph: Remove lock-depth from latency trace The lock_depth was removed in commit e6e1e25 tracing: Remove lock_depth from event entry Removing the lock_depth info from function_graph latency header. With following commands: # echo function_graph > ./current_tracer # echo 1 > options/latency-format # cat trace This is what it looked like before: # tracer: function_graph # # function_graph latency trace v1.1.5 on 3.0.0-rc1-tip+ # -------------------------------------------------------------------- # latency: 0 us, #59756/311298, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) # ----------------- # | task: -0 (uid:0 nice:0 policy:0 rt_prio:0) # ----------------- # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / _-=> lock-depth # |||| / # CPU||||| DURATION FUNCTION CALLS # | ||||| | | | | | | 0) .... 0.068 us | } /* __rcu_read_unlock */ ... This is what it looks like now: # tracer: function_graph # # function_graph latency trace v1.1.5 on 3.0.0-rc1-tip+ # -------------------------------------------------------------------- # latency: 0 us, #59747/1744610, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) # ----------------- # | task: -0 (uid:0 nice:0 policy:0 rt_prio:0) # ----------------- # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / # CPU|||| DURATION FUNCTION CALLS # | |||| | | | | | | 0) ..s. 1.641 us | } /* __rcu_process_callbacks */ ... Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-5-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 44b955f..1da5b97 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1298,8 +1298,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); - seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); - seq_printf(s, "#%.*s|||| / \n", size, spaces); + seq_printf(s, "#%.*s||| / \n", size, spaces); } static void __print_graph_headers_flags(struct seq_file *s, u32 flags) @@ -1318,7 +1317,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) if (flags & TRACE_GRAPH_PRINT_PROC) seq_printf(s, " TASK/PID "); if (lat) - seq_printf(s, "|||||"); + seq_printf(s, "||||"); if (flags & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " DURATION "); seq_printf(s, " FUNCTION CALLS\n"); @@ -1332,7 +1331,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) if (flags & TRACE_GRAPH_PRINT_PROC) seq_printf(s, " | | "); if (lat) - seq_printf(s, "|||||"); + seq_printf(s, "||||"); if (flags & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " | | "); seq_printf(s, " | | | |\n"); -- cgit v0.10.2 From 749230b06a753a22f6ed96e5dd60815d6ab12865 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:51 +0200 Subject: tracing, function_graph: Add context-info support for function_graph tracer The function_graph tracer does not follow global context-info option. Adding TRACE_ITER_CONTEXT_INFO trace_flags check to enable it. With following commands: # echo function_graph > ./current_tracer # echo 0 > options/context-info # cat trace This is what it looked like before: # tracer: function_graph # # TIME CPU DURATION FUNCTION CALLS # | | | | | | | | 1) 0.079 us | } /* __vma_link_rb */ 1) 0.056 us | copy_page_range(); 1) | security_vm_enough_memory() { ... This is what it looks like now: # tracer: function_graph # } /* update_ts_time_stats */ timekeeping_max_deferment(); ... Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-6-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 1da5b97..e8d6bb5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -613,28 +613,30 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, addr >= (unsigned long)__irqentry_text_end) return TRACE_TYPE_UNHANDLED; - /* Absolute time */ - if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { - ret = print_graph_abs_time(iter->ts, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + /* Absolute time */ + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { + ret = print_graph_abs_time(iter->ts, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } - /* Cpu */ - if (flags & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } + /* Cpu */ + if (flags & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } - /* Proc */ - if (flags & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + /* Proc */ + if (flags & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } } /* No overhead */ @@ -710,8 +712,9 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, { int ret = -1; - if (!(flags & TRACE_GRAPH_PRINT_DURATION)) - return TRACE_TYPE_HANDLED; + if (!(flags & TRACE_GRAPH_PRINT_DURATION) || + !(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return TRACE_TYPE_HANDLED; /* No real adata, just filling the column with spaces */ switch (duration) { @@ -879,6 +882,9 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } + if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return 0; + /* Absolute time */ if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { ret = print_graph_abs_time(iter->ts, s); @@ -1346,6 +1352,9 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) { struct trace_iterator *iter = s->private; + if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return; + if (trace_flags & TRACE_ITER_LATENCY_FMT) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) -- cgit v0.10.2 From 22fe9b54d859e53bfbbbdc1a0a77a82bc453927c Mon Sep 17 00:00:00 2001 From: Peter Huewe Date: Tue, 7 Jun 2011 21:58:27 +0200 Subject: tracing: Convert to kstrtoul_from_user This patch replaces the code for getting an unsigned long from a userspace buffer by a simple call to kstroul_from_user. This makes it easier to read and less error prone. Signed-off-by: Peter Huewe Link: http://lkml.kernel.org/r/1307476707-14762-1-git-send-email-peterhuewe@gmx.de Signed-off-by: Steven Rostedt diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e153807..458018a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -803,19 +803,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long val; - char buf[64]; /* big enough to hold a number */ int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; val = !!val; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 725153d..f00ede3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3980,20 +3980,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long *p = filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; if (val) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cf22b4b..c977018 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2706,20 +2706,11 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; val = !!val; @@ -3006,20 +2997,11 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long *ptr = filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; *ptr = val * 1000; @@ -3474,19 +3456,10 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long val; - char buf[64]; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; /* must have at least 1 entry */ @@ -4139,19 +4112,10 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, { struct trace_option_dentry *topt = filp->private_data; unsigned long val; - char buf[64]; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; if (val != 0 && val != 1) @@ -4199,20 +4163,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { long index = (long)filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; if (val != 0 && val != 1) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 686ec39..4d7e149 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -486,20 +486,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_call *call = filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; ret = tracing_update_buffers(); @@ -571,19 +562,10 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, { const char *system = filp->private_data; unsigned long val; - char buf[64]; ssize_t ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; ret = tracing_update_buffers(); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b0b53b8..77575b3 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -156,20 +156,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, { long *ptr = filp->private_data; unsigned long val, flags; - char buf[64]; int ret; int cpu; - if (count >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, count)) - return -EFAULT; - - buf[count] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, count, 10, &val); + if (ret) return ret; local_irq_save(flags); -- cgit v0.10.2 From d7ec4bfed6c97405c6417970ba06c439e08ab8e3 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Tue, 7 Jun 2011 17:01:42 -0700 Subject: ring-buffer: Set __GFP_NORETRY flag for ring buffer allocating process The tracing ring buffer is allocated from kernel memory. While allocating a large chunk of memory, OOM might happen which destabilizes the system. Thus random processes might get killed during the allocation. This patch adds __GFP_NORETRY flag to the ring buffer allocation calls to make it fail more gracefully if the system will not be able to complete the allocation request. Acked-by: David Rientjes Signed-off-by: Vaibhav Nagarnaik Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Michael Rubin Cc: David Sharp Link: http://lkml.kernel.org/r/1307491302-9236-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f00ede3..731201b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1004,9 +1004,14 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, for (i = 0; i < nr_pages; i++) { struct page *page; - + /* + * __GFP_NORETRY flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is + * not destabilized. + */ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); + GFP_KERNEL | __GFP_NORETRY, + cpu_to_node(cpu_buffer->cpu)); if (!bpage) goto free_pages; @@ -1015,7 +1020,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, &pages); page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), - GFP_KERNEL, 0); + GFP_KERNEL | __GFP_NORETRY, 0); if (!page) goto free_pages; bpage->page = page_address(page); @@ -1377,13 +1382,20 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) for_each_buffer_cpu(buffer, cpu) { for (i = 0; i < new_pages; i++) { struct page *page; + /* + * __GFP_NORETRY flag makes sure that the allocation + * fails gracefully without invoking oom-killer and + * the system is not destabilized. + */ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu)); + GFP_KERNEL | __GFP_NORETRY, + cpu_to_node(cpu)); if (!bpage) goto free_pages; list_add(&bpage->list, &pages); - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY, 0); if (!page) goto free_pages; bpage->page = page_address(page); @@ -3737,7 +3749,8 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) struct buffer_data_page *bpage; struct page *page; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY, 0); if (!page) return NULL; -- cgit v0.10.2 From 395810627b6a43c8d0ec948884043946fa162308 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 8 Jun 2011 16:09:21 +0900 Subject: x86: Swap save_stack_trace_regs parameters Swap the 1st and 2nd parameters of save_stack_trace_regs() as same as the parameters of save_stack_trace_tsk(). Signed-off-by: Masami Hiramatsu Cc: yrl.pp-manager.tt@hitachi.com Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20110608070921.17777.31103.stgit@fedora15 Signed-off-by: Steven Rostedt diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 55d9bc0..fdd0c64 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -66,7 +66,7 @@ void save_stack_trace(struct stack_trace *trace) } EXPORT_SYMBOL_GPL(save_stack_trace); -void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs) +void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) { dump_trace(current, regs, NULL, 0, &save_stack_ops, trace); if (trace->nr_entries < trace->max_entries) diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c index 704a37c..dab4187 100644 --- a/arch/x86/mm/kmemcheck/error.c +++ b/arch/x86/mm/kmemcheck/error.c @@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state, e->trace.entries = e->trace_entries; e->trace.max_entries = ARRAY_SIZE(e->trace_entries); e->trace.skip = 0; - save_stack_trace_regs(&e->trace, regs); + save_stack_trace_regs(regs, &e->trace); /* Round address down to nearest 16 bytes */ shadow_copy = kmemcheck_shadow_lookup(address diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 25310f1..115b570 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -14,8 +14,8 @@ struct stack_trace { }; extern void save_stack_trace(struct stack_trace *trace); -extern void save_stack_trace_regs(struct stack_trace *trace, - struct pt_regs *regs); +extern void save_stack_trace_regs(struct pt_regs *regs, + struct stack_trace *trace); extern void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace); -- cgit v0.10.2 From c624d33f61cd05241e85b906311f0b712fdb0f32 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 8 Jun 2011 16:09:27 +0900 Subject: stack_trace: Add weak save_stack_trace_regs() Add weak symbol of save_stack_trace_regs() as same as save_stack_trace_tsk() since that is not implemented except x86 yet. Signed-off-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: yrl.pp-manager.tt@hitachi.com Cc: Peter Zijlstra Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20110608070927.17777.37895.stgit@fedora15 Signed-off-by: Steven Rostedt diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index eb212f8..d20c698 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -26,12 +26,18 @@ void print_stack_trace(struct stack_trace *trace, int spaces) EXPORT_SYMBOL_GPL(print_stack_trace); /* - * Architectures that do not implement save_stack_trace_tsk get this - * weak alias and a once-per-bootup warning (whenever this facility - * is utilized - for example by procfs): + * Architectures that do not implement save_stack_trace_tsk or + * save_stack_trace_regs get this weak alias and a once-per-bootup warning + * (whenever this facility is utilized - for example by procfs): */ __weak void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n"); } + +__weak void +save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) +{ + WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); +} -- cgit v0.10.2 From 1fd8df2c3970c9e7e4e262354154ee39e58bdd7c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 8 Jun 2011 16:09:34 +0900 Subject: tracing/kprobes: Fix kprobe-tracer to support stack trace Fix to support kernel stack trace correctly on kprobe-tracer. Since the execution path of kprobe-based dynamic events is different from other tracepoint-based events, normal ftrace_trace_stack() doesn't work correctly. To fix that, this introduces ftrace_trace_stack_regs() which traces stack via pt_regs instead of current stack register. e.g. # echo p schedule+4 > /sys/kernel/debug/tracing/kprobe_events # echo 1 > /sys/kernel/debug/tracing/options/stacktrace # echo 1 > /sys/kernel/debug/tracing/events/kprobes/enable # head -n 20 /sys/kernel/debug/tracing/trace bash-2968 [000] 10297.050245: p_schedule_4: (schedule+0x4/0x4ca) bash-2968 [000] 10297.050247: => schedule_timeout => n_tty_read => tty_read => vfs_read => sys_read => system_call_fastpath kworker/0:1-2940 [000] 10297.050265: p_schedule_4: (schedule+0x4/0x4ca) kworker/0:1-2940 [000] 10297.050266: => worker_thread => kthread => kernel_thread_helper sshd-1132 [000] 10297.050365: p_schedule_4: (schedule+0x4/0x4ca) sshd-1132 [000] 10297.050365: => sysret_careful Note: Even with this fix, the first entry will be skipped if the probe is put on the function entry area before the frame pointer is set up (usually, that is 4 bytes (push %bp; mov %sp %bp) on x86), because stack unwinder depends on the frame pointer. Signed-off-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: yrl.pp-manager.tt@hitachi.com Cc: Peter Zijlstra Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20110608070934.17777.17116.stgit@fedora15 Signed-off-by: Steven Rostedt diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 59d3ef1..b1e69ee 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -129,6 +129,10 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc); +void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + struct pt_regs *regs); void trace_current_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c977018..d9c1612 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1193,6 +1193,18 @@ void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); +void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + struct pt_regs *regs) +{ + ring_buffer_unlock_commit(buffer, event); + + ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); + ftrace_trace_userstack(buffer, flags, pc); +} +EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs); + void trace_current_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) { @@ -1238,7 +1250,7 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, #ifdef CONFIG_STACKTRACE static void __ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc) + int skip, int pc, struct pt_regs *regs) { struct ftrace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; @@ -1257,24 +1269,36 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, trace.skip = skip; trace.entries = entry->caller; - save_stack_trace(&trace); + if (regs) + save_stack_trace_regs(regs, &trace); + else + save_stack_trace(&trace); if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); } +void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc, struct pt_regs *regs) +{ + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; + + __ftrace_trace_stack(buffer, flags, skip, pc, regs); +} + void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc) { if (!(trace_flags & TRACE_ITER_STACKTRACE)) return; - __ftrace_trace_stack(buffer, flags, skip, pc); + __ftrace_trace_stack(buffer, flags, skip, pc, NULL); } void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { - __ftrace_trace_stack(tr->buffer, flags, skip, pc); + __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); } /** @@ -1290,7 +1314,7 @@ void trace_dump_stack(void) local_save_flags(flags); /* skipping 3 traces, seems to get us at the caller of this function */ - __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); + __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); } static DEFINE_PER_CPU(int, user_stack_count); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 742f545..a3e2db7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -389,6 +389,9 @@ void update_max_tr_single(struct trace_array *tr, void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc); +void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc, struct pt_regs *regs); + void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc); @@ -400,6 +403,12 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer, { } +static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer, + unsigned long flags, int skip, + int pc, struct pt_regs *regs) +{ +} + static inline void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f925c45..7053ef3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1397,7 +1397,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) - trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); + trace_nowake_buffer_unlock_commit_regs(buffer, event, + irq_flags, pc, regs); } /* Kretprobe handler */ @@ -1429,7 +1430,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) - trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); + trace_nowake_buffer_unlock_commit_regs(buffer, event, + irq_flags, pc, regs); } /* Event entry printers */ -- cgit v0.10.2 From a0e3e70243f5b270bc3eca718f0a9fa5e6b8262e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 3 Jun 2011 16:37:47 +0200 Subject: oprofile, x86: Fix nmi-unsafe callgraph support Current oprofile's x86 callgraph support may trigger page faults throwing the BUG_ON(in_nmi()) message below. This patch fixes this by using the same nmi-safe copy-from-user code as in perf. ------------[ cut here ]------------ kernel BUG at .../arch/x86/kernel/traps.c:436! invalid opcode: 0000 [#1] SMP last sysfs file: /sys/devices/pci0000:00/0000:00:0a.0/0000:07:00.0/0000:08:04.0/net/eth0/broadcast CPU 5 Modules linked in: Pid: 8611, comm: opcontrol Not tainted 2.6.39-00007-gfe47ae7 #1 Advanced Micro Device Anaheim/Anaheim RIP: 0010:[] [] do_nmi+0x22/0x1ee RSP: 0000:ffff88042fd47f28 EFLAGS: 00010002 RAX: ffff88042c0a7fd8 RBX: 0000000000000001 RCX: 00000000c0000101 RDX: 00000000ffff8804 RSI: ffffffffffffffff RDI: ffff88042fd47f58 RBP: ffff88042fd47f48 R08: 0000000000000004 R09: 0000000000001484 R10: 0000000000000001 R11: 0000000000000000 R12: ffff88042fd47f58 R13: 0000000000000000 R14: ffff88042fd47d98 R15: 0000000000000020 FS: 00007fca25e56700(0000) GS:ffff88042fd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000074 CR3: 000000042d28b000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process opcontrol (pid: 8611, threadinfo ffff88042c0a6000, task ffff88042c532310) Stack: 0000000000000000 0000000000000001 ffff88042c0a7fd8 0000000000000000 ffff88042fd47de8 ffffffff813e897a 0000000000000020 ffff88042fd47d98 0000000000000000 ffff88042c0a7fd8 ffff88042fd47de8 0000000000000074 Call Trace: [] nmi+0x1a/0x20 [] ? bad_to_user+0x25/0x771 <> Code: ff 59 5b 41 5c 41 5d c9 c3 55 65 48 8b 04 25 88 b5 00 00 48 89 e5 41 55 41 54 49 89 fc 53 48 83 ec 08 f6 80 47 e0 ff ff 04 74 04 <0f> 0b eb fe 81 80 44 e0 ff ff 00 00 01 04 65 ff 04 25 c4 0f 01 RIP [] do_nmi+0x22/0x1ee RSP ---[ end trace ed6752185092104b ]--- Kernel panic - not syncing: Fatal exception in interrupt Pid: 8611, comm: opcontrol Tainted: G D 2.6.39-00007-gfe47ae7 #1 Call Trace: [] panic+0x8c/0x188 [] oops_end+0x81/0x8e [] die+0x55/0x5e [] do_trap+0x11c/0x12b [] do_invalid_op+0x91/0x9a [] ? do_nmi+0x22/0x1ee [] ? oprofile_add_sample+0x83/0x95 [] ? op_amd_check_ctrs+0x4f/0x2cf [] invalid_op+0x15/0x20 [] ? do_nmi+0x22/0x1ee [] ? do_nmi+0x67/0x1ee [] nmi+0x1a/0x20 [] ? bad_to_user+0x25/0x771 <> Cc: John Lumby Cc: Maynard Johnson Cc: # .37+ Signed-off-by: Robert Richter diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index a5b64ab..32f78eb 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -11,10 +11,12 @@ #include #include #include +#include +#include + #include #include #include -#include static int backtrace_stack(void *data, char *name) { @@ -36,17 +38,53 @@ static struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack, }; +/* from arch/x86/kernel/cpu/perf_event.c: */ + +/* + * best effort, GUP based copy_from_user() that assumes IRQ or NMI context + */ +static unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n) +{ + unsigned long offset, addr = (unsigned long)from; + unsigned long size, len = 0; + struct page *page; + void *map; + int ret; + + do { + ret = __get_user_pages_fast(addr, 1, 0, &page); + if (!ret) + break; + + offset = addr & (PAGE_SIZE - 1); + size = min(PAGE_SIZE - offset, n - len); + + map = kmap_atomic(page); + memcpy(to, map+offset, size); + kunmap_atomic(map); + put_page(page); + + len += size; + to += size; + addr += size; + + } while (len < n); + + return len; +} + #ifdef CONFIG_COMPAT static struct stack_frame_ia32 * dump_user_backtrace_32(struct stack_frame_ia32 *head) { + /* Also check accessibility of one struct frame_head beyond: */ struct stack_frame_ia32 bufhead[2]; struct stack_frame_ia32 *fp; + unsigned long bytes; - /* Also check accessibility of one struct frame_head beyond */ - if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) - return NULL; - if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) + bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead)); + if (bytes != sizeof(bufhead)) return NULL; fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame); @@ -87,12 +125,12 @@ x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) static struct stack_frame *dump_user_backtrace(struct stack_frame *head) { + /* Also check accessibility of one struct frame_head beyond: */ struct stack_frame bufhead[2]; + unsigned long bytes; - /* Also check accessibility of one struct stack_frame beyond */ - if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) - return NULL; - if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) + bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead)); + if (bytes != sizeof(bufhead)) return NULL; oprofile_add_trace(bufhead[0].return_address); -- cgit v0.10.2 From 94ea09c6a8e6c5ffb59bb1d5ca10008d37544e1a Mon Sep 17 00:00:00 2001 From: Daniel Kalmar Date: Fri, 13 May 2011 08:38:04 -0400 Subject: MIPS: Add new unwind_stack variant The unwind_stack_by_address variant supports unwinding based on any kernel code address. This symbol is also exported so it can be called from modules. Signed-off-by: Daniel Kalmar Signed-off-by: Gergely Kis Signed-off-by: Robert Richter diff --git a/arch/mips/include/asm/stacktrace.h b/arch/mips/include/asm/stacktrace.h index 0bf8281..780ee2c 100644 --- a/arch/mips/include/asm/stacktrace.h +++ b/arch/mips/include/asm/stacktrace.h @@ -7,6 +7,10 @@ extern int raw_show_trace; extern unsigned long unwind_stack(struct task_struct *task, unsigned long *sp, unsigned long pc, unsigned long *ra); +extern unsigned long unwind_stack_by_address(unsigned long stack_page, + unsigned long *sp, + unsigned long pc, + unsigned long *ra); #else #define raw_show_trace 1 static inline unsigned long unwind_stack(struct task_struct *task, diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index d2112d3..c28fbe6 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -373,18 +373,18 @@ unsigned long thread_saved_pc(struct task_struct *tsk) #ifdef CONFIG_KALLSYMS -/* used by show_backtrace() */ -unsigned long unwind_stack(struct task_struct *task, unsigned long *sp, - unsigned long pc, unsigned long *ra) +/* generic stack unwinding function */ +unsigned long notrace unwind_stack_by_address(unsigned long stack_page, + unsigned long *sp, + unsigned long pc, + unsigned long *ra) { - unsigned long stack_page; struct mips_frame_info info; unsigned long size, ofs; int leaf; extern void ret_from_irq(void); extern void ret_from_exception(void); - stack_page = (unsigned long)task_stack_page(task); if (!stack_page) return 0; @@ -443,6 +443,15 @@ unsigned long unwind_stack(struct task_struct *task, unsigned long *sp, *ra = 0; return __kernel_text_address(pc) ? pc : 0; } +EXPORT_SYMBOL(unwind_stack_by_address); + +/* used by show_backtrace() */ +unsigned long unwind_stack(struct task_struct *task, unsigned long *sp, + unsigned long pc, unsigned long *ra) +{ + unsigned long stack_page = (unsigned long)task_stack_page(task); + return unwind_stack_by_address(stack_page, sp, pc, ra); +} #endif /* -- cgit v0.10.2 From 5f307491f3a0c8551cedf5d90d660d656e0d56ec Mon Sep 17 00:00:00 2001 From: Daniel Kalmar Date: Fri, 13 May 2011 08:38:05 -0400 Subject: MIPS: oprofile: Add callgraph support Stack unwinding is done by code examination. For kernelspace, the already existing unwind function is utilized that uses kallsyms to quickly find the beginning of functions. For userspace a new function was added that examines code at and before the pc. Signed-off-by: Daniel Kalmar Signed-off-by: Gergely Kis Signed-off-by: Robert Richter diff --git a/arch/mips/oprofile/Makefile b/arch/mips/oprofile/Makefile index 4b9d704..29f2f13 100644 --- a/arch/mips/oprofile/Makefile +++ b/arch/mips/oprofile/Makefile @@ -8,7 +8,7 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ oprofilefs.o oprofile_stats.o \ timer_int.o ) -oprofile-y := $(DRIVER_OBJS) common.o +oprofile-y := $(DRIVER_OBJS) common.o backtrace.o oprofile-$(CONFIG_CPU_MIPS32) += op_model_mipsxx.o oprofile-$(CONFIG_CPU_MIPS64) += op_model_mipsxx.o diff --git a/arch/mips/oprofile/backtrace.c b/arch/mips/oprofile/backtrace.c new file mode 100644 index 0000000..6854ed5 --- /dev/null +++ b/arch/mips/oprofile/backtrace.c @@ -0,0 +1,175 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct stackframe { + unsigned long sp; + unsigned long pc; + unsigned long ra; +}; + +static inline int get_mem(unsigned long addr, unsigned long *result) +{ + unsigned long *address = (unsigned long *) addr; + if (!access_ok(VERIFY_READ, addr, sizeof(unsigned long))) + return -1; + if (__copy_from_user_inatomic(result, address, sizeof(unsigned long))) + return -3; + return 0; +} + +/* + * These two instruction helpers were taken from process.c + */ +static inline int is_ra_save_ins(union mips_instruction *ip) +{ + /* sw / sd $ra, offset($sp) */ + return (ip->i_format.opcode == sw_op || ip->i_format.opcode == sd_op) + && ip->i_format.rs == 29 && ip->i_format.rt == 31; +} + +static inline int is_sp_move_ins(union mips_instruction *ip) +{ + /* addiu/daddiu sp,sp,-imm */ + if (ip->i_format.rs != 29 || ip->i_format.rt != 29) + return 0; + if (ip->i_format.opcode == addiu_op || ip->i_format.opcode == daddiu_op) + return 1; + return 0; +} + +/* + * Looks for specific instructions that mark the end of a function. + * This usually means we ran into the code area of the previous function. + */ +static inline int is_end_of_function_marker(union mips_instruction *ip) +{ + /* jr ra */ + if (ip->r_format.func == jr_op && ip->r_format.rs == 31) + return 1; + /* lui gp */ + if (ip->i_format.opcode == lui_op && ip->i_format.rt == 28) + return 1; + return 0; +} + +/* + * TODO for userspace stack unwinding: + * - handle cases where the stack is adjusted inside a function + * (generally doesn't happen) + * - find optimal value for max_instr_check + * - try to find a way to handle leaf functions + */ + +static inline int unwind_user_frame(struct stackframe *old_frame, + const unsigned int max_instr_check) +{ + struct stackframe new_frame = *old_frame; + off_t ra_offset = 0; + size_t stack_size = 0; + unsigned long addr; + + if (old_frame->pc == 0 || old_frame->sp == 0 || old_frame->ra == 0) + return -9; + + for (addr = new_frame.pc; (addr + max_instr_check > new_frame.pc) + && (!ra_offset || !stack_size); --addr) { + union mips_instruction ip; + + if (get_mem(addr, (unsigned long *) &ip)) + return -11; + + if (is_sp_move_ins(&ip)) { + int stack_adjustment = ip.i_format.simmediate; + if (stack_adjustment > 0) + /* This marks the end of the previous function, + which means we overran. */ + break; + stack_size = (unsigned) stack_adjustment; + } else if (is_ra_save_ins(&ip)) { + int ra_slot = ip.i_format.simmediate; + if (ra_slot < 0) + /* This shouldn't happen. */ + break; + ra_offset = ra_slot; + } else if (is_end_of_function_marker(&ip)) + break; + } + + if (!ra_offset || !stack_size) + return -1; + + if (ra_offset) { + new_frame.ra = old_frame->sp + ra_offset; + if (get_mem(new_frame.ra, &(new_frame.ra))) + return -13; + } + + if (stack_size) { + new_frame.sp = old_frame->sp + stack_size; + if (get_mem(new_frame.sp, &(new_frame.sp))) + return -14; + } + + if (new_frame.sp > old_frame->sp) + return -2; + + new_frame.pc = old_frame->ra; + *old_frame = new_frame; + + return 0; +} + +static inline void do_user_backtrace(unsigned long low_addr, + struct stackframe *frame, + unsigned int depth) +{ + const unsigned int max_instr_check = 512; + const unsigned long high_addr = low_addr + THREAD_SIZE; + + while (depth-- && !unwind_user_frame(frame, max_instr_check)) { + oprofile_add_trace(frame->ra); + if (frame->sp < low_addr || frame->sp > high_addr) + break; + } +} + +#ifndef CONFIG_KALLSYMS +static inline void do_kernel_backtrace(unsigned long low_addr, + struct stackframe *frame, + unsigned int depth) { } +#else +static inline void do_kernel_backtrace(unsigned long low_addr, + struct stackframe *frame, + unsigned int depth) +{ + while (depth-- && frame->pc) { + frame->pc = unwind_stack_by_address(low_addr, + &(frame->sp), + frame->pc, + &(frame->ra)); + oprofile_add_trace(frame->ra); + } +} +#endif + +void notrace op_mips_backtrace(struct pt_regs *const regs, unsigned int depth) +{ + struct stackframe frame = { .sp = regs->regs[29], + .pc = regs->cp0_epc, + .ra = regs->regs[31] }; + const int userspace = user_mode(regs); + const unsigned long low_addr = ALIGN(frame.sp, THREAD_SIZE); + + if (userspace) + do_user_backtrace(low_addr, &frame, depth); + else + do_kernel_backtrace(low_addr, &frame, depth); +} diff --git a/arch/mips/oprofile/common.c b/arch/mips/oprofile/common.c index f9eb1ab..d1f2d4c 100644 --- a/arch/mips/oprofile/common.c +++ b/arch/mips/oprofile/common.c @@ -115,6 +115,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) ops->start = op_mips_start; ops->stop = op_mips_stop; ops->cpu_type = lmodel->cpu_type; + ops->backtrace = op_mips_backtrace; printk(KERN_INFO "oprofile: using %s performance monitoring.\n", lmodel->cpu_type); diff --git a/arch/mips/oprofile/op_impl.h b/arch/mips/oprofile/op_impl.h index f04b54f..7c2da27 100644 --- a/arch/mips/oprofile/op_impl.h +++ b/arch/mips/oprofile/op_impl.h @@ -36,4 +36,6 @@ struct op_mips_model { unsigned char num_counters; }; +void op_mips_backtrace(struct pt_regs * const regs, unsigned int depth); + #endif -- cgit v0.10.2 From d797fdc5c5c245fbb05f553e68cb95d962fbdd01 Mon Sep 17 00:00:00 2001 From: Sam Liao Date: Tue, 7 Jun 2011 23:49:46 +0800 Subject: perf tools: Add inverted call graph report support. Add "caller/callee" option to support inverted butterfly report, in the inverted report (with caller option), the call graph start from the callee's ancestor. Users can use such view to catch system's performance bottleneck from a sysprof like view. Using this option with specified sort order like pid gives us high level view of call graph statistics. Also add "-G" alias for inverted call graph. Signed-off-by: Sam Liao Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: David Ahern Signed-off-by: Frederic Weisbecker diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 8ba03d6..cfa8e51 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -80,15 +80,24 @@ OPTIONS --dump-raw-trace:: Dump raw trace in ASCII. --g [type,min]:: +-g [type,min,order]:: --call-graph:: - Display call chains using type and min percent threshold. + Display call chains using type, min percent threshold and order. type can be either: - flat: single column, linear exposure of call chains. - graph: use a graph tree, displaying absolute overhead rates. - fractal: like graph, but displays relative rates. Each branch of the tree is considered as a new profiled object. + - Default: fractal,0.5. + + order can be either: + - callee: callee based call graph. + - caller: inverted caller based call graph. + + Default: fractal,0.5,callee. + +-G:: +--inverted:: + alias for inverted caller based call graph. --pretty=:: Pretty printing style. key: normal, raw diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 287a173..271e252 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -45,7 +45,8 @@ static struct perf_read_values show_threads_values; static const char default_pretty_printing_style[] = "normal"; static const char *pretty_printing_style = default_pretty_printing_style; -static char callchain_default_opt[] = "fractal,0.5"; +static char callchain_default_opt[] = "fractal,0.5,callee"; +static bool inverted_callchain; static symbol_filter_t annotate_init; static int perf_session__add_hist_entry(struct perf_session *session, @@ -386,13 +387,29 @@ parse_callchain_opt(const struct option *opt __used, const char *arg, if (!tok) goto setup; - tok2 = strtok(NULL, ","); callchain_param.min_percent = strtod(tok, &endptr); if (tok == endptr) return -1; - if (tok2) + /* get the print limit */ + tok2 = strtok(NULL, ","); + if (!tok2) + goto setup; + + if (tok2[0] != 'c') { callchain_param.print_limit = strtod(tok2, &endptr); + tok2 = strtok(NULL, ","); + if (!tok2) + goto setup; + } + + /* get the call chain order */ + if (!strcmp(tok2, "caller")) + callchain_param.order = ORDER_CALLER; + else if (!strcmp(tok2, "callee")) + callchain_param.order = ORDER_CALLEE; + else + return -1; setup: if (callchain_register_param(&callchain_param) < 0) { fprintf(stderr, "Can't register callchain params\n"); @@ -436,9 +453,10 @@ static const struct option options[] = { "regex filter to identify parent, see: '--sort parent'"), OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other, "Only display entries with parent-match"), - OPT_CALLBACK_DEFAULT('g', "call-graph", NULL, "output_type,min_percent", - "Display callchains using output_type (graph, flat, fractal, or none) and min percent threshold. " - "Default: fractal,0.5", &parse_callchain_opt, callchain_default_opt), + OPT_CALLBACK_DEFAULT('g', "call-graph", NULL, "output_type,min_percent, call_order", + "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold and callchain order. " + "Default: fractal,0.5,callee", &parse_callchain_opt, callchain_default_opt), + OPT_BOOLEAN('G', "inverted", &inverted_callchain, "alias for inverted call graph"), OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]", "only consider symbols in these dsos"), OPT_STRING('C', "comms", &symbol_conf.comm_list_str, "comm[,comm...]", @@ -467,6 +485,9 @@ int cmd_report(int argc, const char **argv, const char *prefix __used) else if (use_tui) use_browser = 1; + if (inverted_callchain) + callchain_param.order = ORDER_CALLER; + if (strcmp(input_name, "-") != 0) setup_browser(true); else diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 1a79df9..9b4ff16c 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -14,6 +14,11 @@ enum chain_mode { CHAIN_GRAPH_REL }; +enum chain_order { + ORDER_CALLER, + ORDER_CALLEE +}; + struct callchain_node { struct callchain_node *parent; struct list_head siblings; @@ -41,6 +46,7 @@ struct callchain_param { u32 print_limit; double min_percent; sort_chain_func_t sort; + enum chain_order order; }; struct callchain_list { diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 627a02e..dae4202 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -14,7 +14,8 @@ enum hist_filter { struct callchain_param callchain_param = { .mode = CHAIN_GRAPH_REL, - .min_percent = 0.5 + .min_percent = 0.5, + .order = ORDER_CALLEE }; u16 hists__col_len(struct hists *self, enum hist_column col) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index b723f21..558bcf9 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -247,9 +247,14 @@ int perf_session__resolve_callchain(struct perf_session *self, callchain_cursor_reset(&self->callchain_cursor); for (i = 0; i < chain->nr; i++) { - u64 ip = chain->ips[i]; + u64 ip; struct addr_location al; + if (callchain_param.order == ORDER_CALLEE) + ip = chain->ips[i]; + else + ip = chain->ips[chain->nr - i - 1]; + if (ip >= PERF_CONTEXT_MAX) { switch (ip) { case PERF_CONTEXT_HV: -- cgit v0.10.2 From 872a878fb1ee53e21c90040de2c01b3fc53b5942 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jun 2011 03:14:52 +0200 Subject: perf tools: Make sort operations static These don't need to be globally visible. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: David Ahern Cc: Sam Liao diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index f44fa54..f5dba56 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -15,95 +15,6 @@ char * field_sep; LIST_HEAD(hist_entry__sort_list); -static int hist_entry__thread_snprintf(struct hist_entry *self, char *bf, - size_t size, unsigned int width); -static int hist_entry__comm_snprintf(struct hist_entry *self, char *bf, - size_t size, unsigned int width); -static int hist_entry__dso_snprintf(struct hist_entry *self, char *bf, - size_t size, unsigned int width); -static int hist_entry__sym_snprintf(struct hist_entry *self, char *bf, - size_t size, unsigned int width); -static int hist_entry__parent_snprintf(struct hist_entry *self, char *bf, - size_t size, unsigned int width); -static int hist_entry__cpu_snprintf(struct hist_entry *self, char *bf, - size_t size, unsigned int width); - -struct sort_entry sort_thread = { - .se_header = "Command: Pid", - .se_cmp = sort__thread_cmp, - .se_snprintf = hist_entry__thread_snprintf, - .se_width_idx = HISTC_THREAD, -}; - -struct sort_entry sort_comm = { - .se_header = "Command", - .se_cmp = sort__comm_cmp, - .se_collapse = sort__comm_collapse, - .se_snprintf = hist_entry__comm_snprintf, - .se_width_idx = HISTC_COMM, -}; - -struct sort_entry sort_dso = { - .se_header = "Shared Object", - .se_cmp = sort__dso_cmp, - .se_snprintf = hist_entry__dso_snprintf, - .se_width_idx = HISTC_DSO, -}; - -struct sort_entry sort_sym = { - .se_header = "Symbol", - .se_cmp = sort__sym_cmp, - .se_snprintf = hist_entry__sym_snprintf, - .se_width_idx = HISTC_SYMBOL, -}; - -struct sort_entry sort_parent = { - .se_header = "Parent symbol", - .se_cmp = sort__parent_cmp, - .se_snprintf = hist_entry__parent_snprintf, - .se_width_idx = HISTC_PARENT, -}; - -struct sort_entry sort_cpu = { - .se_header = "CPU", - .se_cmp = sort__cpu_cmp, - .se_snprintf = hist_entry__cpu_snprintf, - .se_width_idx = HISTC_CPU, -}; - -struct sort_dimension { - const char *name; - struct sort_entry *entry; - int taken; -}; - -static struct sort_dimension sort_dimensions[] = { - { .name = "pid", .entry = &sort_thread, }, - { .name = "comm", .entry = &sort_comm, }, - { .name = "dso", .entry = &sort_dso, }, - { .name = "symbol", .entry = &sort_sym, }, - { .name = "parent", .entry = &sort_parent, }, - { .name = "cpu", .entry = &sort_cpu, }, -}; - -int64_t cmp_null(void *l, void *r) -{ - if (!l && !r) - return 0; - else if (!l) - return -1; - else - return 1; -} - -/* --sort pid */ - -int64_t -sort__thread_cmp(struct hist_entry *left, struct hist_entry *right) -{ - return right->thread->pid - left->thread->pid; -} - static int repsep_snprintf(char *bf, size_t size, const char *fmt, ...) { int n; @@ -125,6 +36,24 @@ static int repsep_snprintf(char *bf, size_t size, const char *fmt, ...) return n; } +static int64_t cmp_null(void *l, void *r) +{ + if (!l && !r) + return 0; + else if (!l) + return -1; + else + return 1; +} + +/* --sort pid */ + +static int64_t +sort__thread_cmp(struct hist_entry *left, struct hist_entry *right) +{ + return right->thread->pid - left->thread->pid; +} + static int hist_entry__thread_snprintf(struct hist_entry *self, char *bf, size_t size, unsigned int width) { @@ -132,15 +61,50 @@ static int hist_entry__thread_snprintf(struct hist_entry *self, char *bf, self->thread->comm ?: "", self->thread->pid); } +struct sort_entry sort_thread = { + .se_header = "Command: Pid", + .se_cmp = sort__thread_cmp, + .se_snprintf = hist_entry__thread_snprintf, + .se_width_idx = HISTC_THREAD, +}; + +/* --sort comm */ + +static int64_t +sort__comm_cmp(struct hist_entry *left, struct hist_entry *right) +{ + return right->thread->pid - left->thread->pid; +} + +static int64_t +sort__comm_collapse(struct hist_entry *left, struct hist_entry *right) +{ + char *comm_l = left->thread->comm; + char *comm_r = right->thread->comm; + + if (!comm_l || !comm_r) + return cmp_null(comm_l, comm_r); + + return strcmp(comm_l, comm_r); +} + static int hist_entry__comm_snprintf(struct hist_entry *self, char *bf, size_t size, unsigned int width) { return repsep_snprintf(bf, size, "%*s", width, self->thread->comm); } +struct sort_entry sort_comm = { + .se_header = "Command", + .se_cmp = sort__comm_cmp, + .se_collapse = sort__comm_collapse, + .se_snprintf = hist_entry__comm_snprintf, + .se_width_idx = HISTC_COMM, +}; + /* --sort dso */ -int64_t +static int64_t sort__dso_cmp(struct hist_entry *left, struct hist_entry *right) { struct dso *dso_l = left->ms.map ? left->ms.map->dso : NULL; @@ -173,9 +137,16 @@ static int hist_entry__dso_snprintf(struct hist_entry *self, char *bf, return repsep_snprintf(bf, size, "%-*s", width, "[unknown]"); } +struct sort_entry sort_dso = { + .se_header = "Shared Object", + .se_cmp = sort__dso_cmp, + .se_snprintf = hist_entry__dso_snprintf, + .se_width_idx = HISTC_DSO, +}; + /* --sort symbol */ -int64_t +static int64_t sort__sym_cmp(struct hist_entry *left, struct hist_entry *right) { u64 ip_l, ip_r; @@ -211,29 +182,16 @@ static int hist_entry__sym_snprintf(struct hist_entry *self, char *bf, return ret; } -/* --sort comm */ - -int64_t -sort__comm_cmp(struct hist_entry *left, struct hist_entry *right) -{ - return right->thread->pid - left->thread->pid; -} - -int64_t -sort__comm_collapse(struct hist_entry *left, struct hist_entry *right) -{ - char *comm_l = left->thread->comm; - char *comm_r = right->thread->comm; - - if (!comm_l || !comm_r) - return cmp_null(comm_l, comm_r); - - return strcmp(comm_l, comm_r); -} +struct sort_entry sort_sym = { + .se_header = "Symbol", + .se_cmp = sort__sym_cmp, + .se_snprintf = hist_entry__sym_snprintf, + .se_width_idx = HISTC_SYMBOL, +}; /* --sort parent */ -int64_t +static int64_t sort__parent_cmp(struct hist_entry *left, struct hist_entry *right) { struct symbol *sym_l = left->parent; @@ -252,9 +210,16 @@ static int hist_entry__parent_snprintf(struct hist_entry *self, char *bf, self->parent ? self->parent->name : "[other]"); } +struct sort_entry sort_parent = { + .se_header = "Parent symbol", + .se_cmp = sort__parent_cmp, + .se_snprintf = hist_entry__parent_snprintf, + .se_width_idx = HISTC_PARENT, +}; + /* --sort cpu */ -int64_t +static int64_t sort__cpu_cmp(struct hist_entry *left, struct hist_entry *right) { return right->cpu - left->cpu; @@ -266,6 +231,28 @@ static int hist_entry__cpu_snprintf(struct hist_entry *self, char *bf, return repsep_snprintf(bf, size, "%-*d", width, self->cpu); } +struct sort_entry sort_cpu = { + .se_header = "CPU", + .se_cmp = sort__cpu_cmp, + .se_snprintf = hist_entry__cpu_snprintf, + .se_width_idx = HISTC_CPU, +}; + +struct sort_dimension { + const char *name; + struct sort_entry *entry; + int taken; +}; + +static struct sort_dimension sort_dimensions[] = { + { .name = "pid", .entry = &sort_thread, }, + { .name = "comm", .entry = &sort_comm, }, + { .name = "dso", .entry = &sort_dso, }, + { .name = "symbol", .entry = &sort_sym, }, + { .name = "parent", .entry = &sort_parent, }, + { .name = "cpu", .entry = &sort_cpu, }, +}; + int sort_dimension__add(const char *tok) { unsigned int i; diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 0b91053..4a6d309 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -108,14 +108,6 @@ extern size_t sort__thread_print(FILE *, struct hist_entry *, unsigned int); extern size_t sort__comm_print(FILE *, struct hist_entry *, unsigned int); extern size_t sort__dso_print(FILE *, struct hist_entry *, unsigned int); extern size_t sort__sym_print(FILE *, struct hist_entry *, unsigned int __used); -extern int64_t cmp_null(void *, void *); -extern int64_t sort__thread_cmp(struct hist_entry *, struct hist_entry *); -extern int64_t sort__comm_cmp(struct hist_entry *, struct hist_entry *); -extern int64_t sort__comm_collapse(struct hist_entry *, struct hist_entry *); -extern int64_t sort__dso_cmp(struct hist_entry *, struct hist_entry *); -extern int64_t sort__sym_cmp(struct hist_entry *, struct hist_entry *); -extern int64_t sort__parent_cmp(struct hist_entry *, struct hist_entry *); -int64_t sort__cpu_cmp(struct hist_entry *left, struct hist_entry *right); extern size_t sort__parent_print(FILE *, struct hist_entry *, unsigned int); extern int sort_dimension__add(const char *); void sort_entry__setup_elide(struct sort_entry *self, struct strlist *list, -- cgit v0.10.2 From 2fd701bc782fad8792059dd586e1f00b64f6a52e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jun 2011 03:25:14 +0200 Subject: perf tools: Remove sort print helpers declarations These are probably some old leftovers. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: David Ahern Cc: Sam Liao diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 4a6d309..77d0388 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -103,12 +103,6 @@ extern struct sort_entry sort_thread; extern struct list_head hist_entry__sort_list; void setup_sorting(const char * const usagestr[], const struct option *opts); - -extern size_t sort__thread_print(FILE *, struct hist_entry *, unsigned int); -extern size_t sort__comm_print(FILE *, struct hist_entry *, unsigned int); -extern size_t sort__dso_print(FILE *, struct hist_entry *, unsigned int); -extern size_t sort__sym_print(FILE *, struct hist_entry *, unsigned int __used); -extern size_t sort__parent_print(FILE *, struct hist_entry *, unsigned int); extern int sort_dimension__add(const char *); void sort_entry__setup_elide(struct sort_entry *self, struct strlist *list, const char *list_name, FILE *fp); -- cgit v0.10.2 From e84d21227c6865fe1f3d0c79d1539b6877f54c84 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jun 2011 22:23:03 +0200 Subject: perf tools: Don't display ignored entries on stdio ui As for newt ui, don't display entries that have been marked as ignored. The practical current effect of this is to make parent filtering really working. Before, entries that were ignored were given a null parent but were still displayed. This resulted in some weird effects: # Overhead Command Shared Object Symbol # ........ ........... ................. ............ # ^A | --- __lock_acquire | |--95.97%-- lock_acquire | | | |--30.75%-- _raw_spin_lock Discard these from the stdio display. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: David Ahern Cc: Sam Liao diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index dae4202..677e1da 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -847,6 +847,9 @@ print_entries: for (nd = rb_first(&self->entries); nd; nd = rb_next(nd)) { struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); + if (h->filtered) + continue; + if (show_displacement) { if (h->pair != NULL) displacement = ((long)h->pair->position - -- cgit v0.10.2 From fd8ea21276adefc7f0133bd42fcf3b2faf0b15f8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jun 2011 23:08:14 +0200 Subject: perf tools: Allow sort dimensions to be registered more than once So that the parent sort dimension can be registered twice: once if we add it as an explicit sort dimension (-s parent) and twice if we request a parent filter (-p foo). We'll have only one parent sort dimension in the end but this allows to override the default parent filter with we gave in "-p" option. The goal of this is to prepare to allow the use of "-s parent" and "-p foo" at the same time, ie: sort by filtered parent. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: David Ahern Cc: Sam Liao diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index f5dba56..401e220 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -260,15 +260,9 @@ int sort_dimension__add(const char *tok) for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) { struct sort_dimension *sd = &sort_dimensions[i]; - if (sd->taken) - continue; - if (strncasecmp(tok, sd->name, strlen(tok))) continue; - if (sd->entry->se_collapse) - sort__need_collapse = 1; - if (sd->entry == &sort_parent) { int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED); if (ret) { @@ -281,6 +275,12 @@ int sort_dimension__add(const char *tok) sort__has_parent = 1; } + if (sd->taken) + return 0; + + if (sd->entry->se_collapse) + sort__need_collapse = 1; + if (list_empty(&hist_entry__sort_list)) { if (!strcmp(sd->name, "pid")) sort__first_dimension = SORT_PID; -- cgit v0.10.2 From cb1955b86c86782ff20037da42ef030057501c34 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jun 2011 23:52:52 +0200 Subject: perf tools: Only display parent field if explictly sorted We don't need to display the parent field if the parent sorting machinery is only used for parent filtering (as in "-p foo"). However if parent filtering is used in combination with explicit parent sorting ( -s parent), we want to display it. Result with: perf report -p kernel_thread -s parent Before: # Overhead Parent symbol # ........ ............. # 0.07% | --- ioread8 ata_sff_check_status ata_sff_tf_load ata_sff_qc_issue ata_bmdma_qc_issue ata_qc_issue ata_scsi_translate ata_scsi_queuecmd scsi_dispatch_cmd scsi_request_fn __blk_run_queue __make_request generic_make_request submit_bio submit_bh journal_submit_commit_record jbd2_journal_commit_transaction kjournald2 kthread kernel_thread_helpe After: # Overhead Parent symbol # ........ ............. # 0.07% kernel_thread_helper | --- ioread8 ata_sff_check_status ata_sff_tf_load ata_sff_qc_issue ata_bmdma_qc_issue ata_qc_issue ata_scsi_translate ata_scsi_queuecmd scsi_dispatch_cmd scsi_request_fn __blk_run_queue __make_request generic_make_request submit_bio submit_bh journal_submit_commit_record jbd2_journal_commit_transaction kjournald2 kthread kernel_thread_helper Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: David Ahern Cc: Sam Liao diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 271e252..5d43d01 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -525,7 +525,14 @@ int cmd_report(int argc, const char **argv, const char *prefix __used) if (parent_pattern != default_parent_pattern) { if (sort_dimension__add("parent") < 0) return -1; - sort_parent.elide = 1; + + /* + * Only show the parent fields if we explicitly + * sort that way. If we only use parent machinery + * for filtering, we don't want it. + */ + if (!strstr(sort_order, "parent")) + sort_parent.elide = 1; } else symbol_conf.exclude_other = false; -- cgit v0.10.2 From 4f8b50bbbe63ae4ec6bea28a90a9a603c745ea71 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Jun 2011 17:22:43 +0200 Subject: irq_work, ppc: Fix up arch hooks Commit e360adbe29 ("irq_work: Add generic hardirq context callbacks") fouled up the ppc bit, not properly naming the arch specific function that raises the 'self-IPI'. Cc: Huang Ying Cc: Benjamin Herrenschmidt Cc: Anton Blanchard Cc: Eric B Munson Cc: stable@kernel.org # 37+ Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-eg0aqien8p1aqvzu9dft6dtv@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index f33acfd..03b29a6 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -544,7 +544,7 @@ DEFINE_PER_CPU(u8, irq_work_pending); #endif /* 32 vs 64 bit */ -void set_irq_work_pending(void) +void arch_irq_work_raise(void) { preempt_disable(); set_irq_work_pending_flag(); -- cgit v0.10.2 From 0f933625e7b6c3d91878ae95e341bf1984db7eaf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 28 Jun 2011 12:15:51 +0200 Subject: irq_work, alpha: Fix up arch hooks Commit e360adbe29 ("irq_work: Add generic hardirq context callbacks") fouled up the Alpha bit, not properly naming the arch specific function that raises the 'self-IPI'. Signed-off-by: Peter Zijlstra Cc: Michael Cree Cc: stable@kernel.org # 37+ Link: http://lkml.kernel.org/n/tip-gukh0txmql2l4thgrekzzbfy@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index 818e74e..f20d1b5 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ -91,7 +91,7 @@ DEFINE_PER_CPU(u8, irq_work_pending); #define test_irq_work_pending() __get_cpu_var(irq_work_pending) #define clear_irq_work_pending() __get_cpu_var(irq_work_pending) = 0 -void set_irq_work_pending(void) +void arch_irq_work_raise(void) { set_irq_work_pending_flag(); } -- cgit v0.10.2 From 4ec8363dfc1451f8c8f86825731fe712798ada02 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Wed, 1 Jun 2011 15:15:36 -0400 Subject: perf_events: Fix perf buffer watermark setting Since 2.6.36 (specifically commit d57e34fdd60b ("perf: Simplify the ring-buffer logic: make perf_buffer_alloc() do everything needed"), the perf_buffer_init_code() has been mis-setting the buffer watermark if perf_event_attr.wakeup_events has a non-zero value. This is because perf_event_attr.wakeup_events is a union with perf_event_attr.wakeup_watermark. This commit re-enables the check for perf_event_attr.watermark being set before continuing with setting a non-default watermark. This bug is most noticable when you are trying to use PERF_IOC_REFRESH with a value larger than one and perf_event_attr.wakeup_events is set to one. In this case the buffer watermark will be set to 1 and you will get extraneous POLL_IN overflows rather than POLL_HUP as expected. [ avoid using attr.wakeup_events when attr.watermark is set ] Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: Link: http://lkml.kernel.org/r/alpine.DEB.2.00.1106011506390.5384@cl320.eecs.utk.edu Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index 5e70f62..e4aee51 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3569,8 +3569,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; - rb = rb_alloc(nr_pages, event->attr.wakeup_watermark, - event->cpu, flags); + rb = rb_alloc(nr_pages, + event->attr.watermark ? event->attr.wakeup_watermark : 0, + event->cpu, flags); + if (!rb) { ret = -ENOMEM; goto unlock; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index fde5259..fc2701c 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -199,13 +199,15 @@ void perf_output_end(struct perf_output_handle *handle) struct perf_event *event = handle->event; struct ring_buffer *rb = handle->rb; - int wakeup_events = event->attr.wakeup_events; - - if (handle->sample && wakeup_events) { - int events = local_inc_return(&rb->events); - if (events >= wakeup_events) { - local_sub(wakeup_events, &rb->events); - local_inc(&rb->wakeup); + if (handle->sample && !event->attr.watermark) { + int wakeup_events = event->attr.wakeup_events; + + if (wakeup_events) { + int events = local_inc_return(&rb->events); + if (events >= wakeup_events) { + local_sub(wakeup_events, &rb->events); + local_inc(&rb->wakeup); + } } } -- cgit v0.10.2 From 28009ce4a8130af7260a9271901b4419834ad152 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Tue, 7 Jun 2011 16:33:38 +0100 Subject: perf: Remove 64-bit alignment padding from perf_event_context Reorder perf_event_context to remove 8 bytes of 64 bit alignment padding shrinking its size to 192 bytes, allowing it to fit into a smaller slab and use one fewer cache lines. Signed-off-by: Richard Kennedy Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1307460819.1950.5.camel@castor.rsk Signed-off-by: Ingo Molnar diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e76a410..2f7b5d4 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -894,8 +894,8 @@ struct perf_event_context { u64 parent_gen; u64 generation; int pin_count; - struct rcu_head rcu_head; int nr_cgroups; /* cgroup events present */ + struct rcu_head rcu_head; }; /* -- cgit v0.10.2 From b7526f0ca6dc68f57ca467ce503151b1d476a3e4 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 23 Jun 2011 16:34:37 -0400 Subject: events: Add note to update_event_times comment about holding ctx->lock Signed-off-by: Eric B Munson Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1308861279-15216-1-git-send-email-emunson@mgebm.net Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index e4aee51..2293e0b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -748,6 +748,7 @@ static u64 perf_event_time(struct perf_event *event) /* * Update the total_time_enabled and total_time_running fields for a event. + * The caller of this function needs to hold the ctx->lock. */ static void update_event_times(struct perf_event *event) { -- cgit v0.10.2 From c4794295917ebeda8013b6cb9c8d71ab4f74a1fa Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 23 Jun 2011 16:34:38 -0400 Subject: events: Move lockless timer calculation into helper function Take the timer calculation from perf_output_read and move it to a helper function for any place that needs timer values but cannot take the ctx->lock. Signed-off-by: Eric B Munson Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1308861279-15216-2-git-send-email-emunson@mgebm.net Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index 2293e0b..c851d70 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3351,6 +3351,18 @@ static int perf_event_index(struct perf_event *event) return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; } +static void calc_timer_values(struct perf_event *event, + u64 *running, + u64 *enabled) +{ + u64 now, ctx_time; + + now = perf_clock(); + ctx_time = event->shadow_ctx_time + now; + *enabled = ctx_time - event->tstamp_enabled; + *running = ctx_time - event->tstamp_running; +} + /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch @@ -3816,7 +3828,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { - u64 enabled = 0, running = 0, now, ctx_time; + u64 enabled = 0, running = 0; u64 read_format = event->attr.read_format; /* @@ -3828,12 +3840,8 @@ static void perf_output_read(struct perf_output_handle *handle, * because of locking issue as we are called in * NMI context */ - if (read_format & PERF_FORMAT_TOTAL_TIMES) { - now = perf_clock(); - ctx_time = event->shadow_ctx_time + now; - enabled = ctx_time - event->tstamp_enabled; - running = ctx_time - event->tstamp_running; - } + if (read_format & PERF_FORMAT_TOTAL_TIMES) + calc_timer_values(event, &enabled, &running); if (event->attr.read_format & PERF_FORMAT_GROUP) perf_output_read_group(handle, event, enabled, running); -- cgit v0.10.2 From 0d6412085b7ff58612af52e51ffa864f0df4b8fd Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Fri, 24 Jun 2011 12:26:26 -0400 Subject: events: Ensure that timers are updated without requiring read() call The event tracing infrastructure exposes two timers which should be updated each time the value of the counter is updated. Currently, these counters are only updated when userspace calls read() on the fd associated with an event. This means that counters which are read via the mmap'd page exclusively never have their timers updated. This patch adds ensures that the timers are updated each time the values in the mmap'd page are updated. Signed-off-by: Eric B Munson Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1308932786-5111-1-git-send-email-emunson@mgebm.net Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index c851d70..270e32f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3372,8 +3372,19 @@ void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct ring_buffer *rb; + u64 enabled, running; rcu_read_lock(); + /* + * compute total_time_enabled, total_time_running + * based on snapshot values taken when the event + * was last scheduled in. + * + * we cannot simply called update_context_time() + * because of locking issue as we can be called in + * NMI context + */ + calc_timer_values(event, &enabled, &running); rb = rcu_dereference(event->rb); if (!rb) goto unlock; @@ -3392,10 +3403,10 @@ void perf_event_update_userpage(struct perf_event *event) if (event->state == PERF_EVENT_STATE_ACTIVE) userpg->offset -= local64_read(&event->hw.prev_count); - userpg->time_enabled = event->total_time_enabled + + userpg->time_enabled = enabled + atomic64_read(&event->child_total_time_enabled); - userpg->time_running = event->total_time_running + + userpg->time_running = running + atomic64_read(&event->child_total_time_running); barrier(); -- cgit v0.10.2 From 1880c4ae182afb5650c5678949ecfe7ff66a724e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 23 Jun 2011 16:49:18 +0400 Subject: perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4 Due to restriction and specifics of Netburst PMU we need a separated event for NMI watchdog. In particular every Netburst event consumes not just a counter and a config register, but also an additional ESCR register. Since ESCR registers are grouped upon counters (i.e. if ESCR is occupied for some event there is no room for another event to enter until its released) we need to pick up the "least" used ESCR (or the most available one) for nmi-watchdog purposes -- so MSR_P4_CRU_ESCR2/3 was chosen. With this patch nmi-watchdog and perf top should be able to run simultaneously. Signed-off-by: Cyrill Gorcunov CC: Lin Ming CC: Arnaldo Carvalho de Melo CC: Frederic Weisbecker Tested-and-reviewed-by: Don Zickus Tested-and-reviewed-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110623124918.GC13050@sun Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 3a0338b..8a57f9a 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -233,6 +233,7 @@ struct x86_pmu { void (*enable_all)(int added); void (*enable)(struct perf_event *); void (*disable)(struct perf_event *); + void (*hw_watchdog_set_attr)(struct perf_event_attr *attr); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; @@ -315,6 +316,12 @@ static u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; +void hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) +{ + if (x86_pmu.hw_watchdog_set_attr) + x86_pmu.hw_watchdog_set_attr(wd_attr); +} + /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index ead584f..f76fddf 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -705,6 +705,31 @@ static int p4_validate_raw_event(struct perf_event *event) return 0; } +static void p4_hw_watchdog_set_attr(struct perf_event_attr *wd_attr) +{ + /* + * Watchdog ticks are special on Netburst, we use + * that named "non-sleeping" ticks as recommended + * by Intel SDM Vol3b. + */ + WARN_ON_ONCE(wd_attr->type != PERF_TYPE_HARDWARE || + wd_attr->config != PERF_COUNT_HW_CPU_CYCLES); + + wd_attr->type = PERF_TYPE_RAW; + wd_attr->config = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3)) | + p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | + P4_CCCR_COMPARE); +} + static int p4_hw_config(struct perf_event *event) { int cpu = get_cpu(); @@ -1179,6 +1204,7 @@ static __initconst const struct x86_pmu p4_pmu = { .cntval_bits = ARCH_P4_CNTRVAL_BITS, .cntval_mask = ARCH_P4_CNTRVAL_MASK, .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, + .hw_watchdog_set_attr = p4_hw_watchdog_set_attr, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, /* diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 3d0c56a..752b75b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -200,6 +200,8 @@ static int is_softlockup(unsigned long touch_ts) } #ifdef CONFIG_HARDLOCKUP_DETECTOR +void __weak hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) { } + static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -368,9 +370,11 @@ static int watchdog_nmi_enable(int cpu) if (event != NULL) goto out_enable; - /* Try to register using hardware perf events */ wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + hw_nmi_watchdog_set_attr(wd_attr); + + /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); if (!IS_ERR(event)) { printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); -- cgit v0.10.2 From a8b0ca17b80e92faab46ee7179ba9e99ccb61233 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Jun 2011 14:41:57 +0200 Subject: perf: Remove the nmi parameter from the swevent and overflow interface The nmi parameter indicated if we could do wakeups from the current context, if not, we would set some state and self-IPI and let the resulting interrupt do the wakeup. For the various event classes: - hardware: nmi=0; PMI is in fact an NMI or we run irq_work_run from the PMI-tail (ARM etc.) - tracepoint: nmi=0; since tracepoint could be from NMI context. - software: nmi=[0,1]; some, like the schedule thing cannot perform wakeups, and hence need 0. As one can see, there is very little nmi=1 usage, and the down-side of not using it is that on some platforms some software events can have a jiffy delay in wakeup (when arch_irq_work_raise isn't implemented). The up-side however is that we can remove the nmi parameter and save a bunch of conditionals in fast paths. Signed-off-by: Peter Zijlstra Cc: Michael Cree Cc: Will Deacon Cc: Deng-Cheng Zhu Cc: Anton Blanchard Cc: Eric B Munson Cc: Heiko Carstens Cc: Paul Mundt Cc: David S. Miller Cc: Frederic Weisbecker Cc: Jason Wessel Cc: Don Zickus Link: http://lkml.kernel.org/n/tip-agjev8eu666tvknpb3iaj0fg@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c index 90561c4..8e47709 100644 --- a/arch/alpha/kernel/perf_event.c +++ b/arch/alpha/kernel/perf_event.c @@ -847,7 +847,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr, data.period = event->hw.last_period; if (alpha_perf_event_set_period(event, hwc, idx)) { - if (perf_event_overflow(event, 1, &data, regs)) { + if (perf_event_overflow(event, &data, regs)) { /* Interrupts coming too quickly; "throttle" the * counter, i.e., disable it for a little while. */ diff --git a/arch/arm/kernel/perf_event_v6.c b/arch/arm/kernel/perf_event_v6.c index f1e8dd9..38dc4da 100644 --- a/arch/arm/kernel/perf_event_v6.c +++ b/arch/arm/kernel/perf_event_v6.c @@ -479,7 +479,7 @@ armv6pmu_handle_irq(int irq_num, if (!armpmu_event_set_period(event, hwc, idx)) continue; - if (perf_event_overflow(event, 0, &data, regs)) + if (perf_event_overflow(event, &data, regs)) armpmu->disable(hwc, idx); } diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c index 4960686..6e5f875 100644 --- a/arch/arm/kernel/perf_event_v7.c +++ b/arch/arm/kernel/perf_event_v7.c @@ -787,7 +787,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev) if (!armpmu_event_set_period(event, hwc, idx)) continue; - if (perf_event_overflow(event, 0, &data, regs)) + if (perf_event_overflow(event, &data, regs)) armpmu->disable(hwc, idx); } diff --git a/arch/arm/kernel/perf_event_xscale.c b/arch/arm/kernel/perf_event_xscale.c index 39affbe..99b6b85 100644 --- a/arch/arm/kernel/perf_event_xscale.c +++ b/arch/arm/kernel/perf_event_xscale.c @@ -251,7 +251,7 @@ xscale1pmu_handle_irq(int irq_num, void *dev) if (!armpmu_event_set_period(event, hwc, idx)) continue; - if (perf_event_overflow(event, 0, &data, regs)) + if (perf_event_overflow(event, &data, regs)) armpmu->disable(hwc, idx); } @@ -583,7 +583,7 @@ xscale2pmu_handle_irq(int irq_num, void *dev) if (!armpmu_event_set_period(event, hwc, idx)) continue; - if (perf_event_overflow(event, 0, &data, regs)) + if (perf_event_overflow(event, &data, regs)) armpmu->disable(hwc, idx); } diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 9726006..0c9b105 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -396,7 +396,7 @@ static long ptrace_hbp_idx_to_num(int idx) /* * Handle hitting a HW-breakpoint. */ -static void ptrace_hbptriggered(struct perf_event *bp, int unused, +static void ptrace_hbptriggered(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs) { diff --git a/arch/arm/kernel/swp_emulate.c b/arch/arm/kernel/swp_emulate.c index 40ee7e5..5f452f8 100644 --- a/arch/arm/kernel/swp_emulate.c +++ b/arch/arm/kernel/swp_emulate.c @@ -183,7 +183,7 @@ static int swp_handler(struct pt_regs *regs, unsigned int instr) unsigned int address, destreg, data, type; unsigned int res = 0; - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, regs->ARM_pc); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, regs->ARM_pc); if (current->pid != previous_pid) { pr_debug("\"%s\" (%ld) uses deprecated SWP{B} instruction\n", diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index bc0e1d8..9ea4f7d 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -318,11 +318,11 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) fault = __do_page_fault(mm, addr, fsr, tsk); up_read(&mm->mmap_sem); - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, addr); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); if (fault & VM_FAULT_MAJOR) - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, addr); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, addr); else if (fault & VM_FAULT_MINOR) - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, addr); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, addr); /* * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c index a824485..d0deaab 100644 --- a/arch/mips/kernel/perf_event.c +++ b/arch/mips/kernel/perf_event.c @@ -527,7 +527,7 @@ handle_associated_event(struct cpu_hw_events *cpuc, if (!mipspmu_event_set_period(event, hwc, idx)) return; - if (perf_event_overflow(event, 0, data, regs)) + if (perf_event_overflow(event, data, regs)) mipspmu->disable_event(idx); } diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index e9b3af2..b7517e3 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -578,12 +578,12 @@ static int simulate_llsc(struct pt_regs *regs, unsigned int opcode) { if ((opcode & OPCODE) == LL) { perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, - 1, 0, regs, 0); + 1, regs, 0); return simulate_ll(regs, opcode); } if ((opcode & OPCODE) == SC) { perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, - 1, 0, regs, 0); + 1, regs, 0); return simulate_sc(regs, opcode); } @@ -602,7 +602,7 @@ static int simulate_rdhwr(struct pt_regs *regs, unsigned int opcode) int rd = (opcode & RD) >> 11; int rt = (opcode & RT) >> 16; perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, - 1, 0, regs, 0); + 1, regs, 0); switch (rd) { case 0: /* CPU number */ regs->regs[rt] = smp_processor_id(); @@ -640,7 +640,7 @@ static int simulate_sync(struct pt_regs *regs, unsigned int opcode) { if ((opcode & OPCODE) == SPEC0 && (opcode & FUNC) == SYNC) { perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, - 1, 0, regs, 0); + 1, regs, 0); return 0; } diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c index cfea1ad..eb319b5 100644 --- a/arch/mips/kernel/unaligned.c +++ b/arch/mips/kernel/unaligned.c @@ -111,8 +111,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, unsigned long value; unsigned int res; - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, - 1, 0, regs, 0); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0); /* * This load never faults. @@ -517,7 +516,7 @@ asmlinkage void do_ade(struct pt_regs *regs) mm_segment_t seg; perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, - 1, 0, regs, regs->cp0_badvaddr); + 1, regs, regs->cp0_badvaddr); /* * Did we catch a fault trying to load an instruction? * Or are we running in MIPS16 mode? diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c index d32cb05..dbf2f93 100644 --- a/arch/mips/math-emu/cp1emu.c +++ b/arch/mips/math-emu/cp1emu.c @@ -272,8 +272,7 @@ static int cop1Emulate(struct pt_regs *xcp, struct mips_fpu_struct *ctx, } emul: - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, - 1, 0, xcp, 0); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, xcp, 0); MIPS_FPU_EMU_INC_STATS(emulated); switch (MIPSInst_OPCODE(ir)) { case ldc1_op:{ diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 137ee76..937cf33 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -145,7 +145,7 @@ good_area: * the fault. */ fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; @@ -154,12 +154,10 @@ good_area: BUG(); } if (fault & VM_FAULT_MAJOR) { - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, - 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); tsk->maj_flt++; } else { - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, - 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); tsk->min_flt++; } diff --git a/arch/powerpc/include/asm/emulated_ops.h b/arch/powerpc/include/asm/emulated_ops.h index 4592167..2cc41c7 100644 --- a/arch/powerpc/include/asm/emulated_ops.h +++ b/arch/powerpc/include/asm/emulated_ops.h @@ -78,14 +78,14 @@ extern void ppc_warn_emulated_print(const char *type); #define PPC_WARN_EMULATED(type, regs) \ do { \ perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, \ - 1, 0, regs, 0); \ + 1, regs, 0); \ __PPC_WARN_EMULATED(type); \ } while (0) #define PPC_WARN_ALIGNMENT(type, regs) \ do { \ perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, \ - 1, 0, regs, regs->dar); \ + 1, regs, regs->dar); \ __PPC_WARN_EMULATED(type); \ } while (0) diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index 822f630..14967de 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -1207,7 +1207,7 @@ struct pmu power_pmu = { * here so there is no possibility of being interrupted. */ static void record_and_restart(struct perf_event *event, unsigned long val, - struct pt_regs *regs, int nmi) + struct pt_regs *regs) { u64 period = event->hw.sample_period; s64 prev, delta, left; @@ -1258,7 +1258,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, if (event->attr.sample_type & PERF_SAMPLE_ADDR) perf_get_data_addr(regs, &data.addr); - if (perf_event_overflow(event, nmi, &data, regs)) + if (perf_event_overflow(event, &data, regs)) power_pmu_stop(event, 0); } } @@ -1346,7 +1346,7 @@ static void perf_event_interrupt(struct pt_regs *regs) if ((int)val < 0) { /* event has overflowed */ found = 1; - record_and_restart(event, val, regs, nmi); + record_and_restart(event, val, regs); } } diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c index b0dc8f7..0a6d2a9 100644 --- a/arch/powerpc/kernel/perf_event_fsl_emb.c +++ b/arch/powerpc/kernel/perf_event_fsl_emb.c @@ -568,7 +568,7 @@ static struct pmu fsl_emb_pmu = { * here so there is no possibility of being interrupted. */ static void record_and_restart(struct perf_event *event, unsigned long val, - struct pt_regs *regs, int nmi) + struct pt_regs *regs) { u64 period = event->hw.sample_period; s64 prev, delta, left; @@ -616,7 +616,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, perf_sample_data_init(&data, 0); data.period = event->hw.last_period; - if (perf_event_overflow(event, nmi, &data, regs)) + if (perf_event_overflow(event, &data, regs)) fsl_emb_pmu_stop(event, 0); } } @@ -644,7 +644,7 @@ static void perf_event_interrupt(struct pt_regs *regs) if (event) { /* event has overflowed */ found = 1; - record_and_restart(event, val, regs, nmi); + record_and_restart(event, val, regs); } else { /* * Disabled counter is negative, diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index cb22024..3177617 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -882,7 +882,7 @@ void user_disable_single_step(struct task_struct *task) } #ifdef CONFIG_HAVE_HW_BREAKPOINT -void ptrace_triggered(struct perf_event *bp, int nmi, +void ptrace_triggered(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs) { struct perf_event_attr attr; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 54f4fb9..dbc4825 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -173,7 +173,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -319,7 +319,7 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { @@ -330,7 +330,7 @@ good_area: #endif } else { current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } up_read(&mm->mmap_sem); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index fe103e8..095f782 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -299,7 +299,7 @@ static inline int do_exception(struct pt_regs *regs, int access, goto out; address = trans_exc_code & __FAIL_ADDR_MASK; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); flags = FAULT_FLAG_ALLOW_RETRY; if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) flags |= FAULT_FLAG_WRITE; @@ -345,11 +345,11 @@ retry: if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } if (fault & VM_FAULT_RETRY) { diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index 3d7b209..8051976 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -63,7 +63,7 @@ static inline int put_stack_long(struct task_struct *task, int offset, return 0; } -void ptrace_triggered(struct perf_event *bp, int nmi, +void ptrace_triggered(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs) { struct perf_event_attr attr; diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c index b51a171..d9006f8 100644 --- a/arch/sh/kernel/traps_32.c +++ b/arch/sh/kernel/traps_32.c @@ -393,7 +393,7 @@ int handle_unaligned_access(insn_size_t instruction, struct pt_regs *regs, */ if (!expected) { unaligned_fixups_notify(current, instruction, regs); - perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, address); } diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c index 6713ca9..67110be 100644 --- a/arch/sh/kernel/traps_64.c +++ b/arch/sh/kernel/traps_64.c @@ -434,7 +434,7 @@ static int misaligned_load(struct pt_regs *regs, return error; } - perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, address); destreg = (opcode >> 4) & 0x3f; if (user_mode(regs)) { @@ -512,7 +512,7 @@ static int misaligned_store(struct pt_regs *regs, return error; } - perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, address); srcreg = (opcode >> 4) & 0x3f; if (user_mode(regs)) { @@ -588,7 +588,7 @@ static int misaligned_fpu_load(struct pt_regs *regs, return error; } - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, address); destreg = (opcode >> 4) & 0x3f; if (user_mode(regs)) { @@ -665,7 +665,7 @@ static int misaligned_fpu_store(struct pt_regs *regs, return error; } - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, address); srcreg = (opcode >> 4) & 0x3f; if (user_mode(regs)) { diff --git a/arch/sh/math-emu/math.c b/arch/sh/math-emu/math.c index f76a509..9771952 100644 --- a/arch/sh/math-emu/math.c +++ b/arch/sh/math-emu/math.c @@ -620,7 +620,7 @@ int do_fpu_inst(unsigned short inst, struct pt_regs *regs) struct task_struct *tsk = current; struct sh_fpu_soft_struct *fpu = &(tsk->thread.xstate->softfpu); - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0); if (!(task_thread_info(tsk)->status & TS_USEDFPU)) { /* initialize once. */ diff --git a/arch/sh/mm/fault_32.c b/arch/sh/mm/fault_32.c index d4c34d7..7bebd04 100644 --- a/arch/sh/mm/fault_32.c +++ b/arch/sh/mm/fault_32.c @@ -160,7 +160,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, if ((regs->sr & SR_IMASK) != SR_IMASK) local_irq_enable(); - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -210,11 +210,11 @@ good_area: } if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } diff --git a/arch/sh/mm/tlbflush_64.c b/arch/sh/mm/tlbflush_64.c index 7f5810f..e3430e0 100644 --- a/arch/sh/mm/tlbflush_64.c +++ b/arch/sh/mm/tlbflush_64.c @@ -116,7 +116,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long writeaccess, /* Not an IO address, so reenable interrupts */ local_irq_enable(); - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* * If we're in an interrupt or have no user @@ -200,11 +200,11 @@ good_area: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 2cb0e1c..0b32f2e 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1277,7 +1277,7 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self, if (!sparc_perf_event_set_period(event, hwc, idx)) continue; - if (perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) sparc_pmu_stop(event, 0); } diff --git a/arch/sparc/kernel/unaligned_32.c b/arch/sparc/kernel/unaligned_32.c index 4491f4c..7efbb2f 100644 --- a/arch/sparc/kernel/unaligned_32.c +++ b/arch/sparc/kernel/unaligned_32.c @@ -247,7 +247,7 @@ asmlinkage void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn) unsigned long addr = compute_effective_address(regs, insn); int err; - perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, addr); + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr); switch (dir) { case load: err = do_int_load(fetch_reg_addr(((insn>>25)&0x1f), @@ -338,7 +338,7 @@ asmlinkage void user_unaligned_trap(struct pt_regs *regs, unsigned int insn) } addr = compute_effective_address(regs, insn); - perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, addr); + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr); switch(dir) { case load: err = do_int_load(fetch_reg_addr(((insn>>25)&0x1f), diff --git a/arch/sparc/kernel/unaligned_64.c b/arch/sparc/kernel/unaligned_64.c index b2b019e..35cff16 100644 --- a/arch/sparc/kernel/unaligned_64.c +++ b/arch/sparc/kernel/unaligned_64.c @@ -317,7 +317,7 @@ asmlinkage void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn) addr = compute_effective_address(regs, insn, ((insn >> 25) & 0x1f)); - perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, addr); + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr); switch (asi) { case ASI_NL: case ASI_AIUPL: @@ -384,7 +384,7 @@ int handle_popc(u32 insn, struct pt_regs *regs) int ret, i, rd = ((insn >> 25) & 0x1f); int from_kernel = (regs->tstate & TSTATE_PRIV) != 0; - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0); if (insn & 0x2000) { maybe_flush_windows(0, 0, rd, from_kernel); value = sign_extend_imm13(insn); @@ -431,7 +431,7 @@ int handle_ldf_stq(u32 insn, struct pt_regs *regs) int asi = decode_asi(insn, regs); int flag = (freg < 32) ? FPRS_DL : FPRS_DU; - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0); save_and_clear_fpu(); current_thread_info()->xfsr[0] &= ~0x1c000; @@ -554,7 +554,7 @@ void handle_ld_nf(u32 insn, struct pt_regs *regs) int from_kernel = (regs->tstate & TSTATE_PRIV) != 0; unsigned long *reg; - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0); maybe_flush_windows(0, 0, rd, from_kernel); reg = fetch_reg_addr(rd, regs); @@ -586,7 +586,7 @@ void handle_lddfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr if (tstate & TSTATE_PRIV) die_if_kernel("lddfmna from kernel", regs); - perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, sfar); + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, sfar); if (test_thread_flag(TIF_32BIT)) pc = (u32)pc; if (get_user(insn, (u32 __user *) pc) != -EFAULT) { @@ -647,7 +647,7 @@ void handle_stdfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr if (tstate & TSTATE_PRIV) die_if_kernel("stdfmna from kernel", regs); - perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, sfar); + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, sfar); if (test_thread_flag(TIF_32BIT)) pc = (u32)pc; if (get_user(insn, (u32 __user *) pc) != -EFAULT) { diff --git a/arch/sparc/kernel/visemul.c b/arch/sparc/kernel/visemul.c index 3635771..32b626c 100644 --- a/arch/sparc/kernel/visemul.c +++ b/arch/sparc/kernel/visemul.c @@ -802,7 +802,7 @@ int vis_emul(struct pt_regs *regs, unsigned int insn) BUG_ON(regs->tstate & TSTATE_PRIV); - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0); if (test_thread_flag(TIF_32BIT)) pc = (u32)pc; diff --git a/arch/sparc/math-emu/math_32.c b/arch/sparc/math-emu/math_32.c index a3fccde..aa4d55b 100644 --- a/arch/sparc/math-emu/math_32.c +++ b/arch/sparc/math-emu/math_32.c @@ -164,7 +164,7 @@ int do_mathemu(struct pt_regs *regs, struct task_struct *fpt) int retcode = 0; /* assume all succeed */ unsigned long insn; - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0); #ifdef DEBUG_MATHEMU printk("In do_mathemu()... pc is %08lx\n", regs->pc); diff --git a/arch/sparc/math-emu/math_64.c b/arch/sparc/math-emu/math_64.c index 56d2c44..e575bd2 100644 --- a/arch/sparc/math-emu/math_64.c +++ b/arch/sparc/math-emu/math_64.c @@ -184,7 +184,7 @@ int do_mathemu(struct pt_regs *regs, struct fpustate *f) if (tstate & TSTATE_PRIV) die_if_kernel("unfinished/unimplemented FPop from kernel", regs); - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0); if (test_thread_flag(TIF_32BIT)) pc = (u32)pc; if (get_user(insn, (u32 __user *) pc) != -EFAULT) { diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 7543ddb..aa1c1b1 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -251,7 +251,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, if (in_atomic() || !mm) goto no_context; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); down_read(&mm->mmap_sem); @@ -301,12 +301,10 @@ good_area: } if (fault & VM_FAULT_MAJOR) { current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, - regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, - regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } up_read(&mm->mmap_sem); return; diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index f92ce56..504c062 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -325,7 +325,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) if (in_atomic() || !mm) goto intr_or_no_mm; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); if (!down_read_trylock(&mm->mmap_sem)) { if ((regs->tstate & TSTATE_PRIV) && @@ -433,12 +433,10 @@ good_area: } if (fault & VM_FAULT_MAJOR) { current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, - regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, - regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } up_read(&mm->mmap_sem); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8a57f9a..5b86ec5 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1339,7 +1339,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; - if (perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 41178c8..d38b002 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1003,7 +1003,7 @@ again: data.period = event->hw.last_period; - if (perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index bab491b..0941f93 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void) */ perf_prepare_sample(&header, &data, event, ®s); - if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) + if (perf_output_begin(&handle, event, header.size * (top - at), 1)) return 1; for (; at < top; at++) { @@ -616,7 +616,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, else regs.flags &= ~PERF_EFLAGS_EXACT; - if (perf_event_overflow(event, 1, &data, ®s)) + if (perf_event_overflow(event, &data, ®s)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index f76fddf..d6e6a67 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -970,7 +970,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; - if (perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 5f9ecff..98da6a7 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -608,7 +608,7 @@ int kgdb_arch_init(void) return register_die_notifier(&kgdb_notifier); } -static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi, +static void kgdb_hw_overflow_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { struct task_struct *tsk = current; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 807c2a2..11db2e9 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -528,7 +528,7 @@ static int genregs_set(struct task_struct *target, return ret; } -static void ptrace_triggered(struct perf_event *bp, int nmi, +static void ptrace_triggered(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2dbf6bf..4d09df0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1059,7 +1059,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1161,11 +1161,11 @@ good_area: if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } if (fault & VM_FAULT_RETRY) { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2f7b5d4..0946a8b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -682,7 +682,7 @@ enum perf_event_active_state { struct file; struct perf_sample_data; -typedef void (*perf_overflow_handler_t)(struct perf_event *, int, +typedef void (*perf_overflow_handler_t)(struct perf_event *, struct perf_sample_data *, struct pt_regs *regs); @@ -925,7 +925,6 @@ struct perf_output_handle { unsigned long size; void *addr; int page; - int nmi; int sample; }; @@ -993,7 +992,7 @@ extern void perf_prepare_sample(struct perf_event_header *header, struct perf_event *event, struct pt_regs *regs); -extern int perf_event_overflow(struct perf_event *event, int nmi, +extern int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); @@ -1012,7 +1011,7 @@ static inline int is_software_event(struct perf_event *event) extern struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; -extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64); +extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); #ifndef perf_arch_fetch_caller_regs static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { } @@ -1034,7 +1033,7 @@ static inline void perf_fetch_caller_regs(struct pt_regs *regs) } static __always_inline void -perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) +perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct pt_regs hot_regs; @@ -1043,7 +1042,7 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) perf_fetch_caller_regs(&hot_regs); regs = &hot_regs; } - __perf_sw_event(event_id, nr, nmi, regs, addr); + __perf_sw_event(event_id, nr, regs, addr); } } @@ -1057,7 +1056,7 @@ static inline void perf_event_task_sched_in(struct task_struct *task) static inline void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) { - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0); __perf_event_task_sched_out(task, next); } @@ -1119,7 +1118,7 @@ extern void perf_bp_event(struct perf_event *event, void *data); extern int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, - int nmi, int sample); + int sample); extern void perf_output_end(struct perf_output_handle *handle); extern void perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); @@ -1143,8 +1142,7 @@ static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } static inline void -perf_sw_event(u32 event_id, u64 nr, int nmi, - struct pt_regs *regs, u64 addr) { } +perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void perf_bp_event(struct perf_event *event, void *data) { } diff --git a/kernel/events/core.c b/kernel/events/core.c index 270e32f..dbd1ca7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3972,7 +3972,7 @@ void perf_prepare_sample(struct perf_event_header *header, } } -static void perf_event_output(struct perf_event *event, int nmi, +static void perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { @@ -3984,7 +3984,7 @@ static void perf_event_output(struct perf_event *event, int nmi, perf_prepare_sample(&header, data, event, regs); - if (perf_output_begin(&handle, event, header.size, nmi, 1)) + if (perf_output_begin(&handle, event, header.size, 1)) goto exit; perf_output_sample(&handle, &header, data, event); @@ -4024,7 +4024,7 @@ perf_event_read_event(struct perf_event *event, int ret; perf_event_header__init_id(&read_event.header, &sample, event); - ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); + ret = perf_output_begin(&handle, event, read_event.header.size, 0); if (ret) return; @@ -4067,7 +4067,7 @@ static void perf_event_task_output(struct perf_event *event, perf_event_header__init_id(&task_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, - task_event->event_id.header.size, 0, 0); + task_event->event_id.header.size, 0); if (ret) goto out; @@ -4204,7 +4204,7 @@ static void perf_event_comm_output(struct perf_event *event, perf_event_header__init_id(&comm_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, - comm_event->event_id.header.size, 0, 0); + comm_event->event_id.header.size, 0); if (ret) goto out; @@ -4351,7 +4351,7 @@ static void perf_event_mmap_output(struct perf_event *event, perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, - mmap_event->event_id.header.size, 0, 0); + mmap_event->event_id.header.size, 0); if (ret) goto out; @@ -4546,7 +4546,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_event_header__init_id(&throttle_event.header, &sample, event); ret = perf_output_begin(&handle, event, - throttle_event.header.size, 1, 0); + throttle_event.header.size, 0); if (ret) return; @@ -4559,7 +4559,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) * Generic event overflow handling, sampling. */ -static int __perf_event_overflow(struct perf_event *event, int nmi, +static int __perf_event_overflow(struct perf_event *event, int throttle, struct perf_sample_data *data, struct pt_regs *regs) { @@ -4602,34 +4602,28 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - if (nmi) { - event->pending_disable = 1; - irq_work_queue(&event->pending); - } else - perf_event_disable(event); + event->pending_disable = 1; + irq_work_queue(&event->pending); } if (event->overflow_handler) - event->overflow_handler(event, nmi, data, regs); + event->overflow_handler(event, data, regs); else - perf_event_output(event, nmi, data, regs); + perf_event_output(event, data, regs); if (event->fasync && event->pending_kill) { - if (nmi) { - event->pending_wakeup = 1; - irq_work_queue(&event->pending); - } else - perf_event_wakeup(event); + event->pending_wakeup = 1; + irq_work_queue(&event->pending); } return ret; } -int perf_event_overflow(struct perf_event *event, int nmi, +int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { - return __perf_event_overflow(event, nmi, 1, data, regs); + return __perf_event_overflow(event, 1, data, regs); } /* @@ -4678,7 +4672,7 @@ again: } static void perf_swevent_overflow(struct perf_event *event, u64 overflow, - int nmi, struct perf_sample_data *data, + struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; @@ -4692,7 +4686,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, return; for (; overflow; overflow--) { - if (__perf_event_overflow(event, nmi, throttle, + if (__perf_event_overflow(event, throttle, data, regs)) { /* * We inhibit the overflow from happening when @@ -4705,7 +4699,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, } static void perf_swevent_event(struct perf_event *event, u64 nr, - int nmi, struct perf_sample_data *data, + struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; @@ -4719,12 +4713,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, return; if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) - return perf_swevent_overflow(event, 1, nmi, data, regs); + return perf_swevent_overflow(event, 1, data, regs); if (local64_add_negative(nr, &hwc->period_left)) return; - perf_swevent_overflow(event, 0, nmi, data, regs); + perf_swevent_overflow(event, 0, data, regs); } static int perf_exclude_event(struct perf_event *event, @@ -4812,7 +4806,7 @@ find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) } static void do_perf_sw_event(enum perf_type_id type, u32 event_id, - u64 nr, int nmi, + u64 nr, struct perf_sample_data *data, struct pt_regs *regs) { @@ -4828,7 +4822,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) - perf_swevent_event(event, nr, nmi, data, regs); + perf_swevent_event(event, nr, data, regs); } end: rcu_read_unlock(); @@ -4849,8 +4843,7 @@ inline void perf_swevent_put_recursion_context(int rctx) put_recursion_context(swhash->recursion, rctx); } -void __perf_sw_event(u32 event_id, u64 nr, int nmi, - struct pt_regs *regs, u64 addr) +void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct perf_sample_data data; int rctx; @@ -4862,7 +4855,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi, perf_sample_data_init(&data, addr); - do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); perf_swevent_put_recursion_context(rctx); preempt_enable_notrace(); @@ -5110,7 +5103,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) - perf_swevent_event(event, count, 1, &data, regs); + perf_swevent_event(event, count, &data, regs); } perf_swevent_put_recursion_context(rctx); @@ -5203,7 +5196,7 @@ void perf_bp_event(struct perf_event *bp, void *data) perf_sample_data_init(&sample, bp->attr.bp_addr); if (!bp->hw.state && !perf_exclude_event(bp, regs)) - perf_swevent_event(bp, 1, 1, &sample, regs); + perf_swevent_event(bp, 1, &sample, regs); } #endif @@ -5232,7 +5225,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && current->pid == 0)) - if (perf_event_overflow(event, 0, &data, regs)) + if (perf_event_overflow(event, &data, regs)) ret = HRTIMER_NORESTART; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 114f27f..09097dd 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -27,7 +27,6 @@ struct ring_buffer { void *data_pages[0]; }; - extern void rb_free(struct ring_buffer *rb); extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index fc2701c..8b3b736 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -38,11 +38,8 @@ static void perf_output_wakeup(struct perf_output_handle *handle) { atomic_set(&handle->rb->poll, POLL_IN); - if (handle->nmi) { - handle->event->pending_wakeup = 1; - irq_work_queue(&handle->event->pending); - } else - perf_event_wakeup(handle->event); + handle->event->pending_wakeup = 1; + irq_work_queue(&handle->event->pending); } /* @@ -102,7 +99,7 @@ out: int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, - int nmi, int sample) + int sample) { struct ring_buffer *rb; unsigned long tail, offset, head; @@ -127,7 +124,6 @@ int perf_output_begin(struct perf_output_handle *handle, handle->rb = rb; handle->event = event; - handle->nmi = nmi; handle->sample = sample; if (!rb->nr_pages) diff --git a/kernel/sched.c b/kernel/sched.c index 3f2e502..d08d110 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2220,7 +2220,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_cpu(p) != new_cpu) { p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); } __set_task_cpu(p, new_cpu); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 752b75b..a6708e67 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -211,7 +211,7 @@ static struct perf_event_attr wd_hw_attr = { }; /* Callback function for perf event subsystem */ -static void watchdog_overflow_callback(struct perf_event *event, int nmi, +static void watchdog_overflow_callback(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c index 0636539..7b164d3 100644 --- a/samples/hw_breakpoint/data_breakpoint.c +++ b/samples/hw_breakpoint/data_breakpoint.c @@ -41,7 +41,7 @@ module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO); MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any" " write operations on the kernel symbol"); -static void sample_hbp_handler(struct perf_event *bp, int nmi, +static void sample_hbp_handler(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs) { -- cgit v0.10.2 From a7ac67ea021b4603095d2aa458bc41641238f22c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Jun 2011 16:47:16 +0200 Subject: perf: Remove the perf_output_begin(.sample) argument Since only samples call perf_output_sample() its much saner (and more correct) to put the sample logic in there than in the perf_output_begin()/perf_output_end() pair. Saves a useless argument, reduces conditionals and shrinks struct perf_output_handle, win! Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-2crpvsx3cqu67q3zqjbnlpsc@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 0941f93..1b1ef3a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void) */ perf_prepare_sample(&header, &data, event, ®s); - if (perf_output_begin(&handle, event, header.size * (top - at), 1)) + if (perf_output_begin(&handle, event, header.size * (top - at))) return 1; for (; at < top; at++) { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0946a8b..771b0b2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -925,7 +925,6 @@ struct perf_output_handle { unsigned long size; void *addr; int page; - int sample; }; #ifdef CONFIG_PERF_EVENTS @@ -1117,8 +1116,7 @@ extern void perf_bp_event(struct perf_event *event, void *data); #endif extern int perf_output_begin(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size, - int sample); + struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); extern void perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); diff --git a/kernel/events/core.c b/kernel/events/core.c index dbd1ca7..81de28dc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3928,6 +3928,20 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_put(handle, raw); } } + + if (!event->attr.watermark) { + int wakeup_events = event->attr.wakeup_events; + + if (wakeup_events) { + struct ring_buffer *rb = handle->rb; + int events = local_inc_return(&rb->events); + + if (events >= wakeup_events) { + local_sub(wakeup_events, &rb->events); + local_inc(&rb->wakeup); + } + } + } } void perf_prepare_sample(struct perf_event_header *header, @@ -3984,7 +3998,7 @@ static void perf_event_output(struct perf_event *event, perf_prepare_sample(&header, data, event, regs); - if (perf_output_begin(&handle, event, header.size, 1)) + if (perf_output_begin(&handle, event, header.size)) goto exit; perf_output_sample(&handle, &header, data, event); @@ -4024,7 +4038,7 @@ perf_event_read_event(struct perf_event *event, int ret; perf_event_header__init_id(&read_event.header, &sample, event); - ret = perf_output_begin(&handle, event, read_event.header.size, 0); + ret = perf_output_begin(&handle, event, read_event.header.size); if (ret) return; @@ -4067,7 +4081,7 @@ static void perf_event_task_output(struct perf_event *event, perf_event_header__init_id(&task_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, - task_event->event_id.header.size, 0); + task_event->event_id.header.size); if (ret) goto out; @@ -4204,7 +4218,7 @@ static void perf_event_comm_output(struct perf_event *event, perf_event_header__init_id(&comm_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, - comm_event->event_id.header.size, 0); + comm_event->event_id.header.size); if (ret) goto out; @@ -4351,7 +4365,7 @@ static void perf_event_mmap_output(struct perf_event *event, perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, - mmap_event->event_id.header.size, 0); + mmap_event->event_id.header.size); if (ret) goto out; @@ -4546,7 +4560,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_event_header__init_id(&throttle_event.header, &sample, event); ret = perf_output_begin(&handle, event, - throttle_event.header.size, 0); + throttle_event.header.size); if (ret) return; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 8b3b736..a2a2920 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -98,8 +98,7 @@ out: } int perf_output_begin(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size, - int sample) + struct perf_event *event, unsigned int size) { struct ring_buffer *rb; unsigned long tail, offset, head; @@ -124,7 +123,6 @@ int perf_output_begin(struct perf_output_handle *handle, handle->rb = rb; handle->event = event; - handle->sample = sample; if (!rb->nr_pages) goto out; @@ -192,21 +190,6 @@ void perf_output_copy(struct perf_output_handle *handle, void perf_output_end(struct perf_output_handle *handle) { - struct perf_event *event = handle->event; - struct ring_buffer *rb = handle->rb; - - if (handle->sample && !event->attr.watermark) { - int wakeup_events = event->attr.wakeup_events; - - if (wakeup_events) { - int events = local_inc_return(&rb->events); - if (events >= wakeup_events) { - local_sub(wakeup_events, &rb->events); - local_inc(&rb->wakeup); - } - } - } - perf_output_put_handle(handle); rcu_read_unlock(); } -- cgit v0.10.2 From efc9f05df2dd171280dcb736a4d973ffefd5508e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 6 Jun 2011 16:57:03 +0200 Subject: perf_events: Update Intel extra regs shared constraints management This patch improves the code managing the extra shared registers used for offcore_response events on Intel Nehalem/Westmere. The idea is to use static allocation instead of dynamic allocation. This simplifies greatly the get and put constraint routines for those events. The patch also renames per_core to shared_regs because the same data structure gets used whether or not HT is on. When HT is off, those events still need to coordination because they use a extra MSR that has to be shared within an event group. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110606145703.GA7258@quad Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5b86ec5..019fda7 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -45,6 +45,29 @@ do { \ #endif /* + * | NHM/WSM | SNB | + * register ------------------------------- + * | HT | no HT | HT | no HT | + *----------------------------------------- + * offcore | core | core | cpu | core | + * lbr_sel | core | core | cpu | core | + * ld_lat | cpu | core | cpu | core | + *----------------------------------------- + * + * Given that there is a small number of shared regs, + * we can pre-allocate their slot in the per-cpu + * per-core reg tables. + */ +enum extra_reg_type { + EXTRA_REG_NONE = -1, /* not used */ + + EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ + EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ + + EXTRA_REG_MAX /* number of entries needed */ +}; + +/* * best effort, GUP based copy_from_user() that assumes IRQ or NMI context */ static unsigned long @@ -132,11 +155,10 @@ struct cpu_hw_events { struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; /* - * Intel percore register state. - * Coordinate shared resources between HT threads. + * manage shared (per-core, per-cpu) registers + * used on Intel NHM/WSM/SNB */ - int percore_used; /* Used by this CPU? */ - struct intel_percore *per_core; + struct intel_shared_regs *shared_regs; /* * AMD specific bits @@ -187,26 +209,45 @@ struct cpu_hw_events { for ((e) = (c); (e)->weight; (e)++) /* + * Per register state. + */ +struct er_account { + raw_spinlock_t lock; /* per-core: protect structure */ + u64 config; /* extra MSR config */ + u64 reg; /* extra MSR number */ + atomic_t ref; /* reference count */ +}; + +/* * Extra registers for specific events. + * * Some events need large masks and require external MSRs. - * Define a mapping to these extra registers. + * Those extra MSRs end up being shared for all events on + * a PMU and sometimes between PMU of sibling HT threads. + * In either case, the kernel needs to handle conflicting + * accesses to those extra, shared, regs. The data structure + * to manage those registers is stored in cpu_hw_event. */ struct extra_reg { unsigned int event; unsigned int msr; u64 config_mask; u64 valid_mask; + int idx; /* per_xxx->regs[] reg index */ }; -#define EVENT_EXTRA_REG(e, ms, m, vm) { \ +#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \ .event = (e), \ .msr = (ms), \ .config_mask = (m), \ .valid_mask = (vm), \ + .idx = EXTRA_REG_##i \ } -#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \ - EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm) -#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0) + +#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx) + +#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0) union perf_capabilities { struct { @@ -253,7 +294,6 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); struct event_constraint *event_constraints; - struct event_constraint *percore_constraints; void (*quirks)(void); int perfctr_second_write; @@ -400,10 +440,10 @@ static inline unsigned int x86_pmu_event_addr(int index) */ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) { + struct hw_perf_event_extra *reg; struct extra_reg *er; - event->hw.extra_reg = 0; - event->hw.extra_config = 0; + reg = &event->hw.extra_reg; if (!x86_pmu.extra_regs) return 0; @@ -413,8 +453,10 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) continue; if (event->attr.config1 & ~er->valid_mask) return -EINVAL; - event->hw.extra_reg = er->msr; - event->hw.extra_config = event->attr.config1; + + reg->idx = er->idx; + reg->config = event->attr.config1; + reg->reg = er->msr; break; } return 0; @@ -713,6 +755,9 @@ static int __x86_pmu_event_init(struct perf_event *event) event->hw.last_cpu = -1; event->hw.last_tag = ~0ULL; + /* mark unused */ + event->hw.extra_reg.idx = EXTRA_REG_NONE; + return x86_pmu.hw_config(event); } @@ -754,8 +799,8 @@ static void x86_pmu_disable(struct pmu *pmu) static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, u64 enable_mask) { - if (hwc->extra_reg) - wrmsrl(hwc->extra_reg, hwc->extra_config); + if (hwc->extra_reg.reg) + wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); wrmsrl(hwc->config_base, hwc->config | enable_mask); } @@ -1692,7 +1737,6 @@ static int validate_group(struct perf_event *event) fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); if (!fake_cpuc) goto out; - /* * the event is not yet connected with its * siblings therefore we must first collect diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index d38b002..6ad95ba 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1,25 +1,15 @@ #ifdef CONFIG_CPU_SUP_INTEL -#define MAX_EXTRA_REGS 2 - -/* - * Per register state. - */ -struct er_account { - int ref; /* reference count */ - unsigned int extra_reg; /* extra MSR number */ - u64 extra_config; /* extra MSR config */ -}; - /* - * Per core state - * This used to coordinate shared registers for HT threads. + * Per core/cpu state + * + * Used to coordinate shared registers between HT threads or + * among events on a single PMU. */ -struct intel_percore { - raw_spinlock_t lock; /* protect structure */ - struct er_account regs[MAX_EXTRA_REGS]; - int refcnt; /* number of threads */ - unsigned core_id; +struct intel_shared_regs { + struct er_account regs[EXTRA_REG_MAX]; + int refcnt; /* per-core: #HT threads */ + unsigned core_id; /* per-core: core id */ }; /* @@ -88,16 +78,10 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), EVENT_EXTRA_END }; -static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly = -{ - INTEL_EVENT_CONSTRAINT(0xb7, 0), - EVENT_CONSTRAINT_END -}; - static struct event_constraint intel_westmere_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ @@ -125,18 +109,11 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = static struct extra_reg intel_westmere_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), - INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff), + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), EVENT_EXTRA_END }; -static struct event_constraint intel_westmere_percore_constraints[] __read_mostly = -{ - INTEL_EVENT_CONSTRAINT(0xb7, 0), - INTEL_EVENT_CONSTRAINT(0xbb, 0), - EVENT_CONSTRAINT_END -}; - static struct event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ @@ -1037,65 +1014,89 @@ intel_bts_constraints(struct perf_event *event) return NULL; } +/* + * manage allocation of shared extra msr for certain events + * + * sharing can be: + * per-cpu: to be shared between the various events on a single PMU + * per-core: per-cpu + shared by HT threads + */ static struct event_constraint * -intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, + struct hw_perf_event_extra *reg) { - struct hw_perf_event *hwc = &event->hw; - unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT; - struct event_constraint *c; - struct intel_percore *pc; + struct event_constraint *c = &emptyconstraint; struct er_account *era; - int i; - int free_slot; - int found; - if (!x86_pmu.percore_constraints || hwc->extra_alloc) - return NULL; + /* already allocated shared msr */ + if (reg->alloc || !cpuc->shared_regs) + return &unconstrained; - for (c = x86_pmu.percore_constraints; c->cmask; c++) { - if (e != c->code) - continue; + era = &cpuc->shared_regs->regs[reg->idx]; + + raw_spin_lock(&era->lock); + + if (!atomic_read(&era->ref) || era->config == reg->config) { + + /* lock in msr value */ + era->config = reg->config; + era->reg = reg->reg; + + /* one more user */ + atomic_inc(&era->ref); + + /* no need to reallocate during incremental event scheduling */ + reg->alloc = 1; /* - * Allocate resource per core. + * All events using extra_reg are unconstrained. + * Avoids calling x86_get_event_constraints() + * + * Must revisit if extra_reg controlling events + * ever have constraints. Worst case we go through + * the regular event constraint table. */ - pc = cpuc->per_core; - if (!pc) - break; - c = &emptyconstraint; - raw_spin_lock(&pc->lock); - free_slot = -1; - found = 0; - for (i = 0; i < MAX_EXTRA_REGS; i++) { - era = &pc->regs[i]; - if (era->ref > 0 && hwc->extra_reg == era->extra_reg) { - /* Allow sharing same config */ - if (hwc->extra_config == era->extra_config) { - era->ref++; - cpuc->percore_used = 1; - hwc->extra_alloc = 1; - c = NULL; - } - /* else conflict */ - found = 1; - break; - } else if (era->ref == 0 && free_slot == -1) - free_slot = i; - } - if (!found && free_slot != -1) { - era = &pc->regs[free_slot]; - era->ref = 1; - era->extra_reg = hwc->extra_reg; - era->extra_config = hwc->extra_config; - cpuc->percore_used = 1; - hwc->extra_alloc = 1; - c = NULL; - } - raw_spin_unlock(&pc->lock); - return c; + c = &unconstrained; } + raw_spin_unlock(&era->lock); - return NULL; + return c; +} + +static void +__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, + struct hw_perf_event_extra *reg) +{ + struct er_account *era; + + /* + * only put constraint if extra reg was actually + * allocated. Also takes care of event which do + * not use an extra shared reg + */ + if (!reg->alloc) + return; + + era = &cpuc->shared_regs->regs[reg->idx]; + + /* one fewer user */ + atomic_dec(&era->ref); + + /* allocate again next time */ + reg->alloc = 0; +} + +static struct event_constraint * +intel_shared_regs_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct event_constraint *c = NULL; + struct hw_perf_event_extra *xreg; + + xreg = &event->hw.extra_reg; + if (xreg->idx != EXTRA_REG_NONE) + c = __intel_shared_reg_get_constraints(cpuc, xreg); + return c; } static struct event_constraint * @@ -1111,49 +1112,28 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event if (c) return c; - c = intel_percore_constraints(cpuc, event); + c = intel_shared_regs_constraints(cpuc, event); if (c) return c; return x86_get_event_constraints(cpuc, event); } -static void intel_put_event_constraints(struct cpu_hw_events *cpuc, +static void +intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { - struct extra_reg *er; - struct intel_percore *pc; - struct er_account *era; - struct hw_perf_event *hwc = &event->hw; - int i, allref; - - if (!cpuc->percore_used) - return; + struct hw_perf_event_extra *reg; - for (er = x86_pmu.extra_regs; er->msr; er++) { - if (er->event != (hwc->config & er->config_mask)) - continue; + reg = &event->hw.extra_reg; + if (reg->idx != EXTRA_REG_NONE) + __intel_shared_reg_put_constraints(cpuc, reg); +} - pc = cpuc->per_core; - raw_spin_lock(&pc->lock); - for (i = 0; i < MAX_EXTRA_REGS; i++) { - era = &pc->regs[i]; - if (era->ref > 0 && - era->extra_config == hwc->extra_config && - era->extra_reg == er->msr) { - era->ref--; - hwc->extra_alloc = 0; - break; - } - } - allref = 0; - for (i = 0; i < MAX_EXTRA_REGS; i++) - allref += pc->regs[i].ref; - if (allref == 0) - cpuc->percore_used = 0; - raw_spin_unlock(&pc->lock); - break; - } +static void intel_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + intel_put_shared_regs_event_constraints(cpuc, event); } static int intel_pmu_hw_config(struct perf_event *event) @@ -1231,20 +1211,36 @@ static __initconst const struct x86_pmu core_pmu = { .event_constraints = intel_core_event_constraints, }; +static struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + struct intel_shared_regs *regs; + int i; + + regs = kzalloc_node(sizeof(struct intel_shared_regs), + GFP_KERNEL, cpu_to_node(cpu)); + if (regs) { + /* + * initialize the locks to keep lockdep happy + */ + for (i = 0; i < EXTRA_REG_MAX; i++) + raw_spin_lock_init(®s->regs[i].lock); + + regs->core_id = -1; + } + return regs; +} + static int intel_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - if (!cpu_has_ht_siblings()) + if (!x86_pmu.extra_regs) return NOTIFY_OK; - cpuc->per_core = kzalloc_node(sizeof(struct intel_percore), - GFP_KERNEL, cpu_to_node(cpu)); - if (!cpuc->per_core) + cpuc->shared_regs = allocate_shared_regs(cpu); + if (!cpuc->shared_regs) return NOTIFY_BAD; - raw_spin_lock_init(&cpuc->per_core->lock); - cpuc->per_core->core_id = -1; return NOTIFY_OK; } @@ -1260,32 +1256,34 @@ static void intel_pmu_cpu_starting(int cpu) */ intel_pmu_lbr_reset(); - if (!cpu_has_ht_siblings()) + if (!cpuc->shared_regs) return; for_each_cpu(i, topology_thread_cpumask(cpu)) { - struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core; + struct intel_shared_regs *pc; + pc = per_cpu(cpu_hw_events, i).shared_regs; if (pc && pc->core_id == core_id) { - kfree(cpuc->per_core); - cpuc->per_core = pc; + kfree(cpuc->shared_regs); + cpuc->shared_regs = pc; break; } } - cpuc->per_core->core_id = core_id; - cpuc->per_core->refcnt++; + cpuc->shared_regs->core_id = core_id; + cpuc->shared_regs->refcnt++; } static void intel_pmu_cpu_dying(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - struct intel_percore *pc = cpuc->per_core; + struct intel_shared_regs *pc; + pc = cpuc->shared_regs; if (pc) { if (pc->core_id == -1 || --pc->refcnt == 0) kfree(pc); - cpuc->per_core = NULL; + cpuc->shared_regs = NULL; } fini_debug_store_on_cpu(cpu); @@ -1436,7 +1434,6 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_nehalem_event_constraints; x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; - x86_pmu.percore_constraints = intel_nehalem_percore_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; @@ -1481,7 +1478,6 @@ static __init int intel_pmu_init(void) intel_pmu_lbr_init_nhm(); x86_pmu.event_constraints = intel_westmere_event_constraints; - x86_pmu.percore_constraints = intel_westmere_percore_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; x86_pmu.extra_regs = intel_westmere_extra_regs; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 771b0b2..069315e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -536,6 +536,16 @@ struct perf_branch_stack { struct task_struct; +/* + * extra PMU register associated with an event + */ +struct hw_perf_event_extra { + u64 config; /* register value */ + unsigned int reg; /* register address or index */ + int alloc; /* extra register already allocated */ + int idx; /* index in shared_regs->regs[] */ +}; + /** * struct hw_perf_event - performance event hardware details: */ @@ -549,9 +559,7 @@ struct hw_perf_event { unsigned long event_base; int idx; int last_cpu; - unsigned int extra_reg; - u64 extra_config; - int extra_alloc; + struct hw_perf_event_extra extra_reg; }; struct { /* software */ struct hrtimer hrtimer; -- cgit v0.10.2 From cd8a38d33e2528998998bae70a45ad27e442f114 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 6 Jun 2011 16:57:08 +0200 Subject: perf_events: Fix validation of events using an extra reg The validate_group() function needs to validate events with extra shared regs. Within an event group, only events with the same value for the extra reg can co-exist. This was not checked by validate_group() because it was missing the shared_regs logic. This patch changes the allocation of the fake cpuc used for validation to also point to a fake shared_regs structure such that group events be properly testing. It modifies __intel_shared_reg_get_constraints() to use spin_lock_irqsave() to avoid lockdep issues. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110606145708.GA7279@quad Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 019fda7..9a0f55c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1689,6 +1689,40 @@ static int x86_pmu_commit_txn(struct pmu *pmu) perf_pmu_enable(pmu); return 0; } +/* + * a fake_cpuc is used to validate event groups. Due to + * the extra reg logic, we need to also allocate a fake + * per_core and per_cpu structure. Otherwise, group events + * using extra reg may conflict without the kernel being + * able to catch this when the last event gets added to + * the group. + */ +static void free_fake_cpuc(struct cpu_hw_events *cpuc) +{ + kfree(cpuc->shared_regs); + kfree(cpuc); +} + +static struct cpu_hw_events *allocate_fake_cpuc(void) +{ + struct cpu_hw_events *cpuc; + int cpu = raw_smp_processor_id(); + + cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); + if (!cpuc) + return ERR_PTR(-ENOMEM); + + /* only needed, if we have extra_regs */ + if (x86_pmu.extra_regs) { + cpuc->shared_regs = allocate_shared_regs(cpu); + if (!cpuc->shared_regs) + goto error; + } + return cpuc; +error: + free_fake_cpuc(cpuc); + return ERR_PTR(-ENOMEM); +} /* * validate that we can schedule this event @@ -1699,9 +1733,9 @@ static int validate_event(struct perf_event *event) struct event_constraint *c; int ret = 0; - fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); - if (!fake_cpuc) - return -ENOMEM; + fake_cpuc = allocate_fake_cpuc(); + if (IS_ERR(fake_cpuc)) + return PTR_ERR(fake_cpuc); c = x86_pmu.get_event_constraints(fake_cpuc, event); @@ -1711,7 +1745,7 @@ static int validate_event(struct perf_event *event) if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(fake_cpuc, event); - kfree(fake_cpuc); + free_fake_cpuc(fake_cpuc); return ret; } @@ -1731,35 +1765,32 @@ static int validate_group(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct cpu_hw_events *fake_cpuc; - int ret, n; + int ret = -ENOSPC, n; - ret = -ENOMEM; - fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); - if (!fake_cpuc) - goto out; + fake_cpuc = allocate_fake_cpuc(); + if (IS_ERR(fake_cpuc)) + return PTR_ERR(fake_cpuc); /* * the event is not yet connected with its * siblings therefore we must first collect * existing siblings, then add the new event * before we can simulate the scheduling */ - ret = -ENOSPC; n = collect_events(fake_cpuc, leader, true); if (n < 0) - goto out_free; + goto out; fake_cpuc->n_events = n; n = collect_events(fake_cpuc, event, false); if (n < 0) - goto out_free; + goto out; fake_cpuc->n_events = n; ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); -out_free: - kfree(fake_cpuc); out: + free_fake_cpuc(fake_cpuc); return ret; } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 6ad95ba..ac02b83 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1027,14 +1027,18 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, { struct event_constraint *c = &emptyconstraint; struct er_account *era; + unsigned long flags; /* already allocated shared msr */ - if (reg->alloc || !cpuc->shared_regs) + if (reg->alloc) return &unconstrained; era = &cpuc->shared_regs->regs[reg->idx]; - - raw_spin_lock(&era->lock); + /* + * we use spin_lock_irqsave() to avoid lockdep issues when + * passing a fake cpuc + */ + raw_spin_lock_irqsave(&era->lock, flags); if (!atomic_read(&era->ref) || era->config == reg->config) { @@ -1058,7 +1062,7 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, */ c = &unconstrained; } - raw_spin_unlock(&era->lock); + raw_spin_unlock_irqrestore(&era->lock, flags); return c; } @@ -1524,4 +1528,8 @@ static int intel_pmu_init(void) return 0; } +static struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + return NULL; +} #endif /* CONFIG_CPU_SUP_INTEL */ -- cgit v0.10.2 From ee89cbc2d48150c7c0e9f2aaac00afde99af098c Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 6 Jun 2011 16:57:12 +0200 Subject: perf_events: Add Intel Sandy Bridge offcore_response low-level support This patch adds Intel Sandy Bridge offcore_response support by providing the low-level constraint table for those events. On Sandy Bridge, there are two offcore_response events. Each uses its own dedictated extra register. But those registers are NOT shared between sibling CPUs when HT is on unlike Nehalem/Westmere. They are always private to each CPU. But they still need to be controlled within an event group. All events within an event group must use the same value for the extra MSR. That's not controlled by the second patch in this series. Furthermore on Sandy Bridge, the offcore_response events have NO counter constraints contrary to what the official documentation indicates, so drop the events from the contraint table. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110606145712.GA7304@quad Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9a0f55c..583f311 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -327,6 +327,7 @@ struct x86_pmu { * Extra registers for events */ struct extra_reg *extra_regs; + bool regs_no_ht_sharing; }; static struct x86_pmu x86_pmu __read_mostly; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index ac02b83..a674ae4 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -100,8 +100,6 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */ - INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ EVENT_CONSTRAINT_END @@ -122,6 +120,12 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; +static struct extra_reg intel_snb_extra_regs[] __read_mostly = { + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), + INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), + EVENT_EXTRA_END +}; + static u64 intel_pmu_event_map(int hw_event) { return intel_perfmon_event_map[hw_event]; @@ -1260,7 +1264,7 @@ static void intel_pmu_cpu_starting(int cpu) */ intel_pmu_lbr_reset(); - if (!cpuc->shared_regs) + if (!cpuc->shared_regs || x86_pmu.regs_no_ht_sharing) return; for_each_cpu(i, topology_thread_cpumask(cpu)) { @@ -1502,6 +1506,9 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_events; + x86_pmu.extra_regs = intel_snb_extra_regs; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.regs_no_ht_sharing = true; /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; -- cgit v0.10.2 From b79e8941fb9af07d810da91b4e29da2bba331b6e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 May 2011 11:08:15 +0200 Subject: perf, intel: Try alternative OFFCORE encodings Since the OFFCORE registers are fully symmetric, try the other one when the specified one is already in use. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1306141897.18455.8.camel@twins Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 583f311..c53d433 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -327,9 +327,12 @@ struct x86_pmu { * Extra registers for events */ struct extra_reg *extra_regs; - bool regs_no_ht_sharing; + unsigned int er_flags; }; +#define ERF_NO_HT_SHARING 1 +#define ERF_HAS_RSP_1 2 + static struct x86_pmu x86_pmu __read_mostly; static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index a674ae4..5c44862 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1018,6 +1018,29 @@ intel_bts_constraints(struct perf_event *event) return NULL; } +static bool intel_try_alt_er(struct perf_event *event, int orig_idx) +{ + if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) + return false; + + if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= 0x01bb; + event->hw.extra_reg.idx = EXTRA_REG_RSP_1; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; + } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= 0x01b7; + event->hw.extra_reg.idx = EXTRA_REG_RSP_0; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; + } + + if (event->hw.extra_reg.idx == orig_idx) + return false; + + return true; +} + /* * manage allocation of shared extra msr for certain events * @@ -1027,16 +1050,19 @@ intel_bts_constraints(struct perf_event *event) */ static struct event_constraint * __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, - struct hw_perf_event_extra *reg) + struct perf_event *event) { struct event_constraint *c = &emptyconstraint; + struct hw_perf_event_extra *reg = &event->hw.extra_reg; struct er_account *era; unsigned long flags; + int orig_idx = reg->idx; /* already allocated shared msr */ if (reg->alloc) return &unconstrained; +again: era = &cpuc->shared_regs->regs[reg->idx]; /* * we use spin_lock_irqsave() to avoid lockdep issues when @@ -1065,6 +1091,9 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, * the regular event constraint table. */ c = &unconstrained; + } else if (intel_try_alt_er(event, orig_idx)) { + raw_spin_unlock(&era->lock); + goto again; } raw_spin_unlock_irqrestore(&era->lock, flags); @@ -1099,11 +1128,10 @@ intel_shared_regs_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { struct event_constraint *c = NULL; - struct hw_perf_event_extra *xreg; - xreg = &event->hw.extra_reg; - if (xreg->idx != EXTRA_REG_NONE) - c = __intel_shared_reg_get_constraints(cpuc, xreg); + if (event->hw.extra_reg.idx != EXTRA_REG_NONE) + c = __intel_shared_reg_get_constraints(cpuc, event); + return c; } @@ -1264,7 +1292,7 @@ static void intel_pmu_cpu_starting(int cpu) */ intel_pmu_lbr_reset(); - if (!cpuc->shared_regs || x86_pmu.regs_no_ht_sharing) + if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING)) return; for_each_cpu(i, topology_thread_cpumask(cpu)) { @@ -1489,6 +1517,7 @@ static __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; x86_pmu.extra_regs = intel_westmere_extra_regs; + x86_pmu.er_flags |= ERF_HAS_RSP_1; /* UOPS_ISSUED.STALLED_CYCLES */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; @@ -1508,7 +1537,8 @@ static __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_snb_pebs_events; x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ - x86_pmu.regs_no_ht_sharing = true; + x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.er_flags |= ERF_NO_HT_SHARING; /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; -- cgit v0.10.2 From 89d6c0b5bdbb1927775584dcf532d98b3efe1477 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Apr 2011 23:37:06 +0200 Subject: perf, arch: Add generic NODE cache events Add a NODE level to the generic cache events which is used to measure local vs remote memory accesses. Like all other cache events, an ACCESS is HIT+MISS, if there is no way to distinguish between reads and writes do reads only etc.. The below needs filling out for !x86 (which I filled out with unsupported events). I'm fairly sure ARM can leave it like that since it doesn't strike me as an architecture that even has NUMA support. SH might have something since it does appear to have some NUMA bits. Sparc64, PowerPC and MIPS certainly want a good look there since they clearly are NUMA capable. Signed-off-by: Peter Zijlstra Cc: David Miller Cc: Anton Blanchard Cc: David Daney Cc: Deng-Cheng Zhu Cc: Paul Mundt Cc: Will Deacon Cc: Robert Richter Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1303508226.4865.8.camel@laptop Signed-off-by: Ingo Molnar diff --git a/arch/arm/kernel/perf_event_v6.c b/arch/arm/kernel/perf_event_v6.c index 38dc4da..dd7f3b9 100644 --- a/arch/arm/kernel/perf_event_v6.c +++ b/arch/arm/kernel/perf_event_v6.c @@ -173,6 +173,20 @@ static const unsigned armv6_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, }, }, + [C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, }; enum armv6mpcore_perf_types { @@ -310,6 +324,20 @@ static const unsigned armv6mpcore_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, }, }, + [C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, }; static inline unsigned long diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c index 6e5f875..e20ca9c 100644 --- a/arch/arm/kernel/perf_event_v7.c +++ b/arch/arm/kernel/perf_event_v7.c @@ -255,6 +255,20 @@ static const unsigned armv7_a8_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, }, }, + [C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, }; /* @@ -371,6 +385,20 @@ static const unsigned armv7_a9_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, }, }, + [C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, }; /* diff --git a/arch/arm/kernel/perf_event_xscale.c b/arch/arm/kernel/perf_event_xscale.c index 99b6b85..3c43974 100644 --- a/arch/arm/kernel/perf_event_xscale.c +++ b/arch/arm/kernel/perf_event_xscale.c @@ -144,6 +144,20 @@ static const unsigned xscale_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, }, }, + [C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, }; #define XSCALE_PMU_ENABLE 0x001 diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index 75266ff..e5ad09a 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -377,6 +377,20 @@ static const struct mips_perf_event mipsxxcore_cache_map [C(RESULT_MISS)] = { UNSUPPORTED_PERF_EVENT_ID }, }, }, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = { UNSUPPORTED_PERF_EVENT_ID }, + [C(RESULT_MISS)] = { UNSUPPORTED_PERF_EVENT_ID }, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = { UNSUPPORTED_PERF_EVENT_ID }, + [C(RESULT_MISS)] = { UNSUPPORTED_PERF_EVENT_ID }, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = { UNSUPPORTED_PERF_EVENT_ID }, + [C(RESULT_MISS)] = { UNSUPPORTED_PERF_EVENT_ID }, + }, +}, }; /* 74K core has completely different cache event map. */ @@ -480,6 +494,20 @@ static const struct mips_perf_event mipsxx74Kcore_cache_map [C(RESULT_MISS)] = { UNSUPPORTED_PERF_EVENT_ID }, }, }, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = { UNSUPPORTED_PERF_EVENT_ID }, + [C(RESULT_MISS)] = { UNSUPPORTED_PERF_EVENT_ID }, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = { UNSUPPORTED_PERF_EVENT_ID }, + [C(RESULT_MISS)] = { UNSUPPORTED_PERF_EVENT_ID }, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = { UNSUPPORTED_PERF_EVENT_ID }, + [C(RESULT_MISS)] = { UNSUPPORTED_PERF_EVENT_ID }, + }, +}, }; #ifdef CONFIG_MIPS_MT_SMP diff --git a/arch/powerpc/kernel/e500-pmu.c b/arch/powerpc/kernel/e500-pmu.c index b150b51..cb2e294 100644 --- a/arch/powerpc/kernel/e500-pmu.c +++ b/arch/powerpc/kernel/e500-pmu.c @@ -75,6 +75,11 @@ static int e500_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { -1, -1 }, }, + [C(NODE)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { -1, -1 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, }; static int num_events = 128; diff --git a/arch/powerpc/kernel/mpc7450-pmu.c b/arch/powerpc/kernel/mpc7450-pmu.c index 2cc5e03..845a584 100644 --- a/arch/powerpc/kernel/mpc7450-pmu.c +++ b/arch/powerpc/kernel/mpc7450-pmu.c @@ -388,6 +388,11 @@ static int mpc7450_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { -1, -1 }, }, + [C(NODE)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { -1, -1 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, }; struct power_pmu mpc7450_pmu = { diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c index ead8b3c..e9dbc2d 100644 --- a/arch/powerpc/kernel/power4-pmu.c +++ b/arch/powerpc/kernel/power4-pmu.c @@ -587,6 +587,11 @@ static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { -1, -1 }, }, + [C(NODE)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { -1, -1 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, }; static struct power_pmu power4_pmu = { diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index eca0ac5..f58a2bd 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -653,6 +653,11 @@ static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { -1, -1 }, }, + [C(NODE)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { -1, -1 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, }; static struct power_pmu power5p_pmu = { diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index d5ff0f6..b1acab6 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -595,6 +595,11 @@ static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { -1, -1 }, }, + [C(NODE)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { -1, -1 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, }; static struct power_pmu power5_pmu = { diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index 3160392..b24a3a2 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -516,6 +516,11 @@ static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { -1, -1 }, }, + [C(NODE)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { -1, -1 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, }; static struct power_pmu power6_pmu = { diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c index 593740f..6d9dccb 100644 --- a/arch/powerpc/kernel/power7-pmu.c +++ b/arch/powerpc/kernel/power7-pmu.c @@ -342,6 +342,11 @@ static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { -1, -1 }, }, + [C(NODE)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { -1, -1 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, }; static struct power_pmu power7_pmu = { diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c index 9a6e093..b121de9 100644 --- a/arch/powerpc/kernel/ppc970-pmu.c +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -467,6 +467,11 @@ static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { -1, -1 }, }, + [C(NODE)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { -1, -1 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, }; static struct power_pmu ppc970_pmu = { diff --git a/arch/sh/kernel/cpu/sh4/perf_event.c b/arch/sh/kernel/cpu/sh4/perf_event.c index 748955d..fa4f724 100644 --- a/arch/sh/kernel/cpu/sh4/perf_event.c +++ b/arch/sh/kernel/cpu/sh4/perf_event.c @@ -180,6 +180,21 @@ static const int sh7750_cache_events [ C(RESULT_MISS) ] = -1, }, }, + + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; static int sh7750_event_map(int event) diff --git a/arch/sh/kernel/cpu/sh4a/perf_event.c b/arch/sh/kernel/cpu/sh4a/perf_event.c index 17e6beb..84a2c39 100644 --- a/arch/sh/kernel/cpu/sh4a/perf_event.c +++ b/arch/sh/kernel/cpu/sh4a/perf_event.c @@ -205,6 +205,21 @@ static const int sh4a_cache_events [ C(RESULT_MISS) ] = -1, }, }, + + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; static int sh4a_event_map(int event) diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 0b32f2e..62a0343 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -246,6 +246,20 @@ static const cache_map_t ultra3_cache_map = { [ C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, }, }, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED }, + [C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED }, + [ C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED }, + [ C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, +}, }; static const struct sparc_pmu ultra3_pmu = { @@ -361,6 +375,20 @@ static const cache_map_t niagara1_cache_map = { [ C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, }, }, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED }, + [C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED }, + [ C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED }, + [ C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, +}, }; static const struct sparc_pmu niagara1_pmu = { @@ -473,6 +501,20 @@ static const cache_map_t niagara2_cache_map = { [ C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, }, }, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED }, + [C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED }, + [ C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = { CACHE_OP_UNSUPPORTED }, + [ C(RESULT_MISS) ] = { CACHE_OP_UNSUPPORTED }, + }, +}, }; static const struct sparc_pmu niagara2_pmu = { diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index fe29c1d..941caa2 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -89,6 +89,20 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */ + [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; /* diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 5c44862..bf6f92f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -226,6 +226,21 @@ static __initconst const u64 snb_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + }; static __initconst const u64 westmere_hw_cache_event_ids @@ -327,6 +342,20 @@ static __initconst const u64 westmere_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, }; /* @@ -379,7 +408,21 @@ static __initconst const u64 nehalem_hw_cache_extra_regs [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, }, - } + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM, + [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE_DRAM, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM, + }, + }, }; static __initconst const u64 nehalem_hw_cache_event_ids @@ -481,6 +524,20 @@ static __initconst const u64 nehalem_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, }; static __initconst const u64 core2_hw_cache_event_ids diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index d6e6a67..fb901c5 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -554,6 +554,20 @@ static __initconst const u64 p4_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; static u64 p4_general_events[PERF_COUNT_HW_MAX] = { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 069315e..a5f54b9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -61,7 +61,7 @@ enum perf_hw_id { /* * Generalized hardware cache events: * - * { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x + * { L1-D, L1-I, LLC, ITLB, DTLB, BPU, NODE } x * { read, write, prefetch } x * { accesses, misses } */ @@ -72,6 +72,7 @@ enum perf_hw_cache_id { PERF_COUNT_HW_CACHE_DTLB = 3, PERF_COUNT_HW_CACHE_ITLB = 4, PERF_COUNT_HW_CACHE_BPU = 5, + PERF_COUNT_HW_CACHE_NODE = 6, PERF_COUNT_HW_CACHE_MAX, /* non-ABI */ }; -- cgit v0.10.2 From 4dc0da86967d5463708631d02a70cfed5b104884 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 29 Jun 2011 18:42:35 +0300 Subject: perf: Add context field to perf_event The perf_event overflow handler does not receive any caller-derived argument, so many callers need to resort to looking up the perf_event in their local data structure. This is ugly and doesn't scale if a single callback services many perf_events. Fix by adding a context parameter to perf_event_create_kernel_counter() (and derived hardware breakpoints APIs) and storing it in the perf_event. The field can be accessed from the callback as event->overflow_handler_context. All callers are updated. Signed-off-by: Avi Kivity Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1309362157-6596-2-git-send-email-avi@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 0c9b105..5c19961 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -479,7 +479,8 @@ static struct perf_event *ptrace_hbp_create(struct task_struct *tsk, int type) attr.bp_type = type; attr.disabled = 1; - return register_user_hw_breakpoint(&attr, ptrace_hbptriggered, tsk); + return register_user_hw_breakpoint(&attr, ptrace_hbptriggered, NULL, + tsk); } static int ptrace_gethbpregs(struct task_struct *tsk, long num, diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 3177617..05b7dd2 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -973,7 +973,7 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, &attr.bp_type); thread->ptrace_bps[0] = bp = register_user_hw_breakpoint(&attr, - ptrace_triggered, task); + ptrace_triggered, NULL, task); if (IS_ERR(bp)) { thread->ptrace_bps[0] = NULL; ptrace_put_breakpoints(task); diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index 8051976..92b3c27 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -91,7 +91,8 @@ static int set_single_step(struct task_struct *tsk, unsigned long addr) attr.bp_len = HW_BREAKPOINT_LEN_2; attr.bp_type = HW_BREAKPOINT_R; - bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); + bp = register_user_hw_breakpoint(&attr, ptrace_triggered, + NULL, tsk); if (IS_ERR(bp)) return PTR_ERR(bp); diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 98da6a7..00354d4 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -638,7 +638,7 @@ void kgdb_arch_late(void) for (i = 0; i < HBP_NUM; i++) { if (breakinfo[i].pev) continue; - breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); + breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL); if (IS_ERR((void * __force)breakinfo[i].pev)) { printk(KERN_ERR "kgdb: Could not allocate hw" "breakpoints\nDisabling the kernel debugger\n"); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 11db2e9..8252879 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -715,7 +715,8 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, attr.bp_type = HW_BREAKPOINT_W; attr.disabled = 1; - bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); + bp = register_user_hw_breakpoint(&attr, ptrace_triggered, + NULL, tsk); /* * CHECKME: the previous code returned -EIO if the addr wasn't diff --git a/drivers/oprofile/oprofile_perf.c b/drivers/oprofile/oprofile_perf.c index 9046f7b..59acf9e 100644 --- a/drivers/oprofile/oprofile_perf.c +++ b/drivers/oprofile/oprofile_perf.c @@ -79,7 +79,7 @@ static int op_create_counter(int cpu, int event) pevent = perf_event_create_kernel_counter(&counter_config[event].attr, cpu, NULL, - op_overflow_handler); + op_overflow_handler, NULL); if (IS_ERR(pevent)) return PTR_ERR(pevent); diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index d1e55fe..6ae9c63 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -73,6 +73,7 @@ static inline unsigned long hw_breakpoint_len(struct perf_event *bp) extern struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, + void *context, struct task_struct *tsk); /* FIXME: only change from the attr, and don't unregister */ @@ -85,11 +86,13 @@ modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr); extern struct perf_event * register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, perf_overflow_handler_t triggered, + void *context, int cpu); extern struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, - perf_overflow_handler_t triggered); + perf_overflow_handler_t triggered, + void *context); extern int register_perf_hw_breakpoint(struct perf_event *bp); extern int __register_perf_hw_breakpoint(struct perf_event *bp); @@ -115,6 +118,7 @@ static inline int __init init_hw_breakpoint(void) { return 0; } static inline struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, + void *context, struct task_struct *tsk) { return NULL; } static inline int modify_user_hw_breakpoint(struct perf_event *bp, @@ -122,10 +126,12 @@ modify_user_hw_breakpoint(struct perf_event *bp, static inline struct perf_event * register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, perf_overflow_handler_t triggered, + void *context, int cpu) { return NULL; } static inline struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, - perf_overflow_handler_t triggered) { return NULL; } + perf_overflow_handler_t triggered, + void *context) { return NULL; } static inline int register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; } static inline int diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a5f54b9..2a08cac 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -839,6 +839,7 @@ struct perf_event { u64 id; perf_overflow_handler_t overflow_handler; + void *overflow_handler_context; #ifdef CONFIG_EVENT_TRACING struct ftrace_event_call *tp_event; @@ -960,7 +961,8 @@ extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, - perf_overflow_handler_t callback); + perf_overflow_handler_t callback, + void *context); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); diff --git a/kernel/events/core.c b/kernel/events/core.c index 81de28dc..ba8e0f4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5745,7 +5745,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, struct task_struct *task, struct perf_event *group_leader, struct perf_event *parent_event, - perf_overflow_handler_t overflow_handler) + perf_overflow_handler_t overflow_handler, + void *context) { struct pmu *pmu; struct perf_event *event; @@ -5803,10 +5804,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, #endif } - if (!overflow_handler && parent_event) + if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; + context = parent_event->overflow_handler_context; + } event->overflow_handler = overflow_handler; + event->overflow_handler_context = context; if (attr->disabled) event->state = PERF_EVENT_STATE_OFF; @@ -6073,7 +6077,8 @@ SYSCALL_DEFINE5(perf_event_open, } } - event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, + NULL, NULL); if (IS_ERR(event)) { err = PTR_ERR(event); goto err_task; @@ -6258,7 +6263,8 @@ err_fd: struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, - perf_overflow_handler_t overflow_handler) + perf_overflow_handler_t overflow_handler, + void *context) { struct perf_event_context *ctx; struct perf_event *event; @@ -6268,7 +6274,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, * Get the target context (task or percpu): */ - event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); + event = perf_event_alloc(attr, cpu, task, NULL, NULL, + overflow_handler, context); if (IS_ERR(event)) { err = PTR_ERR(event); goto err; @@ -6552,7 +6559,7 @@ inherit_event(struct perf_event *parent_event, parent_event->cpu, child, group_leader, parent_event, - NULL); + NULL, NULL); if (IS_ERR(child_event)) return child_event; get_ctx(child_ctx); @@ -6579,6 +6586,8 @@ inherit_event(struct perf_event *parent_event, child_event->ctx = child_ctx; child_event->overflow_handler = parent_event->overflow_handler; + child_event->overflow_handler_context + = parent_event->overflow_handler_context; /* * Precalculate sample_data sizes diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 086adf2..b7971d6 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -431,9 +431,11 @@ int register_perf_hw_breakpoint(struct perf_event *bp) struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, + void *context, struct task_struct *tsk) { - return perf_event_create_kernel_counter(attr, -1, tsk, triggered); + return perf_event_create_kernel_counter(attr, -1, tsk, triggered, + context); } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); @@ -502,7 +504,8 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); */ struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, - perf_overflow_handler_t triggered) + perf_overflow_handler_t triggered, + void *context) { struct perf_event * __percpu *cpu_events, **pevent, *bp; long err; @@ -515,7 +518,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, get_online_cpus(); for_each_online_cpu(cpu) { pevent = per_cpu_ptr(cpu_events, cpu); - bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); + bp = perf_event_create_kernel_counter(attr, cpu, NULL, + triggered, context); *pevent = bp; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a6708e67..a933e3a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -375,7 +375,7 @@ static int watchdog_nmi_enable(int cpu) hw_nmi_watchdog_set_attr(wd_attr); /* Try to register using hardware perf events */ - event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); + event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); if (!IS_ERR(event)) { printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); goto out_save; diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c index 7b164d3..ef7f322 100644 --- a/samples/hw_breakpoint/data_breakpoint.c +++ b/samples/hw_breakpoint/data_breakpoint.c @@ -60,7 +60,7 @@ static int __init hw_break_module_init(void) attr.bp_len = HW_BREAKPOINT_LEN_4; attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; - sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler); + sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler, NULL); if (IS_ERR((void __force *)sample_hbp)) { ret = PTR_ERR((void __force *)sample_hbp); goto fail; -- cgit v0.10.2 From 0af3ac1fdb9d5c297b4b07c9e0172531d42b6716 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 29 Jun 2011 18:42:36 +0300 Subject: x86, perf: Add constraints for architectural PMU The v1 PMU does not have any fixed counters. Using the v2 constraints, which do have fixed counters, causes an additional choice to be present in the weight calculation, but not when actually scheduling the event, leading to an event being not scheduled at all. Signed-off-by: Avi Kivity Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1309362157-6596-3-git-send-email-avi@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index bf6f92f..45fbb8f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -112,6 +112,11 @@ static struct extra_reg intel_westmere_extra_regs[] __read_mostly = EVENT_EXTRA_END }; +static struct event_constraint intel_v1_event_constraints[] __read_mostly = +{ + EVENT_CONSTRAINT_END +}; + static struct event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ @@ -1606,11 +1611,19 @@ static __init int intel_pmu_init(void) break; default: - /* - * default constraints for v2 and up - */ - x86_pmu.event_constraints = intel_gen_event_constraints; - pr_cont("generic architected perfmon, "); + switch (x86_pmu.version) { + case 1: + x86_pmu.event_constraints = intel_v1_event_constraints; + pr_cont("generic architected perfmon v1, "); + break; + default: + /* + * default constraints for v2 and up + */ + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("generic architected perfmon, "); + break; + } } return 0; } -- cgit v0.10.2 From 26ca5c11fb45ae2b2ac7e3574b8db6b3a3c7d350 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 29 Jun 2011 18:42:37 +0300 Subject: perf: export perf_event_refresh() to modules KVM needs one-shot samples, since a PMC programmed to -X will fire after X events and then again after 2^40 events (i.e. variable period). Signed-off-by: Avi Kivity Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1309362157-6596-4-git-send-email-avi@redhat.com Signed-off-by: Ingo Molnar diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2a08cac..3f2711c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -955,6 +955,7 @@ extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); +extern int perf_event_refresh(struct perf_event *event, int refresh); extern void perf_event_update_userpage(struct perf_event *event); extern int perf_event_release_kernel(struct perf_event *event); extern struct perf_event * @@ -1149,6 +1150,10 @@ static inline void perf_event_delayed_put(struct task_struct *task) { } static inline void perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } +static inline int perf_event_refresh(struct perf_event *event, int refresh) +{ + return -EINVAL; +} static inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } diff --git a/kernel/events/core.c b/kernel/events/core.c index ba8e0f4..0567e32 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1764,7 +1764,7 @@ out: raw_spin_unlock_irq(&ctx->lock); } -static int perf_event_refresh(struct perf_event *event, int refresh) +int perf_event_refresh(struct perf_event *event, int refresh) { /* * not supported on inherited events @@ -1777,6 +1777,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh) return 0; } +EXPORT_SYMBOL_GPL(perf_event_refresh); static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, -- cgit v0.10.2 From 3ae9a34d747f9abf2bcc85dc0e77b951513ccdf2 Mon Sep 17 00:00:00 2001 From: Zhengyu He Date: Thu, 23 Jun 2011 13:45:42 -0700 Subject: perf stat: Add noise output for csv mode Previously, when you want perf-stat to output the statistics in csv mode, no information of the noise will be printed out. For example right now we output this --repeat information: ./perf stat -r3 -x, sleep 1 1.164789,task-clock 8,context-switches 0,CPU-migrations 219,page-faults 3337800,cycles With this patch, the output will be appended with an additional entry for the noise value: ./perf stat -r3 -x, sleep 1 1.164789,task-clock,3.75% 8,context-switches,75.00% 0,CPU-migrations,100.00% 219,page-faults,0.00% 3337800,cycles,3.36% Signed-off-by: Zhengyu He Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: Venkatesh Pallipadi Cc: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1308861942-4945-1-git-send-email-zhengyuh@google.com Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 784ed6d..1d08c80 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -517,7 +517,10 @@ static void print_noise_pct(double total, double avg) if (avg) pct = 100.0*total/avg; - fprintf(stderr, " ( +-%6.2f%% )", pct); + if (csv_output) + fprintf(stderr, "%s%.2f%%", csv_sep, pct); + else + fprintf(stderr, " ( +-%6.2f%% )", pct); } static void print_noise(struct perf_evsel *evsel, double avg) @@ -882,13 +885,13 @@ static void print_counter_aggr(struct perf_evsel *counter) else abs_printout(-1, counter, avg); + print_noise(counter, avg); + if (csv_output) { fputc('\n', stderr); return; } - print_noise(counter, avg); - if (scaled) { double avg_enabled, avg_running; -- cgit v0.10.2 From 29dfc4fd7e792854b30d771d4e299fbdd2adb763 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 Jul 2011 15:24:53 +0200 Subject: perf, powerpc: Fix build borkage The patch a8b0ca17b80e ("perf: Remove the nmi parameter from the swevent and overflow interface") missed a spot in the ppc hw_breakpoint code, fix this up so things compile again. Reported-by: Ingo Molnar Cc: Anton Blanchard Cc: Eric B Munson Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-09pfip95g88s70iwkxu6nnbt@git.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h index 1c33ec1..80fd4d2 100644 --- a/arch/powerpc/include/asm/hw_breakpoint.h +++ b/arch/powerpc/include/asm/hw_breakpoint.h @@ -57,7 +57,7 @@ void hw_breakpoint_pmu_read(struct perf_event *bp); extern void flush_ptrace_hw_breakpoint(struct task_struct *tsk); extern struct pmu perf_ops_bp; -extern void ptrace_triggered(struct perf_event *bp, int nmi, +extern void ptrace_triggered(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs); static inline void hw_breakpoint_disable(void) { -- cgit v0.10.2 From 9e46294dadedc0c04adcb8ce760bd2cd74f7332d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 2 Jul 2011 15:00:52 +0200 Subject: x86: Save stack pointer in perf live regs savings In order to prepare for fetching the stack pointer from the regs when possible in dump_trace() instead of taking the local one, save the current stack pointer in perf live regs saving. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index d9d4dae..094fb30 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -152,6 +152,11 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs); (regs)->bp = caller_frame_pointer(); \ (regs)->cs = __KERNEL_CS; \ regs->flags = 0; \ + asm volatile( \ + _ASM_MOV "%%"_ASM_SP ", %0\n" \ + : "=m" ((regs)->sp) \ + :: "memory" \ + ); \ } #else -- cgit v0.10.2 From 47ce11a2b6519f9c7843223ea8e561eb71ea5896 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 Jun 2011 19:04:56 +0200 Subject: x86: Fetch stack from regs when possible in dump_trace() When regs are passed to dump_stack(), we fetch the frame pointer from the regs but the stack pointer is taken from the current frame. Thus the frame and stack pointers may not come from the same context. For example this can result in the unwinder to think the context is in irq, due to the current value of the stack, but the frame pointer coming from the regs points to a frame from another place. It then tries to fix up the irq link but ends up dereferencing a random frame pointer that doesn't belong to the irq stack: [ 9131.706906] ------------[ cut here ]------------ [ 9131.707003] WARNING: at arch/x86/kernel/dumpstack_64.c:129 dump_trace+0x2aa/0x330() [ 9131.707003] Hardware name: AMD690VM-FMH [ 9131.707003] Perf: bad frame pointer = 0000000000000005 in callchain [ 9131.707003] Modules linked in: [ 9131.707003] Pid: 1050, comm: perf Not tainted 3.0.0-rc3+ #181 [ 9131.707003] Call Trace: [ 9131.707003] [] warn_slowpath_common+0x7a/0xb0 [ 9131.707003] [] warn_slowpath_fmt+0x41/0x50 [ 9131.707003] [] ? bad_to_user+0x6d/0x10be [ 9131.707003] [] dump_trace+0x2aa/0x330 [ 9131.707003] [] ? native_sched_clock+0x13/0x50 [ 9131.707003] [] perf_callchain_kernel+0x54/0x70 [ 9131.707003] [] perf_prepare_sample+0x19f/0x2a0 [ 9131.707003] [] __perf_event_overflow+0x16c/0x290 [ 9131.707003] [] ? __perf_event_overflow+0x130/0x290 [ 9131.707003] [] ? native_sched_clock+0x13/0x50 [ 9131.707003] [] ? sched_clock+0x9/0x10 [ 9131.707003] [] ? T.375+0x15/0x90 [ 9131.707003] [] ? trace_hardirqs_on_caller+0x64/0x180 [ 9131.707003] [] ? trace_hardirqs_off+0xd/0x10 [ 9131.707003] [] perf_event_overflow+0x14/0x20 [ 9131.707003] [] perf_swevent_hrtimer+0x11c/0x130 [ 9131.707003] [] ? error_exit+0x51/0xb0 [ 9131.707003] [] __run_hrtimer+0x83/0x1e0 [ 9131.707003] [] ? perf_event_overflow+0x20/0x20 [ 9131.707003] [] hrtimer_interrupt+0x106/0x250 [ 9131.707003] [] ? trace_hardirqs_off_thunk+0x3a/0x3c [ 9131.707003] [] smp_apic_timer_interrupt+0x53/0x90 [ 9131.707003] [] apic_timer_interrupt+0x13/0x20 [ 9131.707003] [] ? error_exit+0x51/0xb0 [ 9131.707003] [] ? error_exit+0x4c/0xb0 [ 9131.707003] ---[ end trace b2560d4876709347 ]--- Fix this by simply taking the stack pointer from regs->sp when regs are provided. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index e71c98d..788295c 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -155,9 +155,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, task = current; if (!stack) { - stack = &dummy; - if (task && task != current) + if (regs) + stack = (unsigned long *)regs->sp; + else if (task && task != current) stack = (unsigned long *)task->thread.sp; + else + stack = &dummy; } if (!bp) -- cgit v0.10.2 From 1871853f7abc3c727c4346539c5062cbeaf016a4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 1 Jul 2011 01:51:22 +0200 Subject: x86,64: Simplify save_regs() The save_regs function that saves the regs on low level irq entry is complicated because of the fact it changes its stack in the middle and also because it manipulates data allocated in the caller frame and accesses there are directly calculated from callee rsp value with the return address in the middle of the way. This complicates the static stack offsets calculation and require more dynamic ones. It also needs a save/restore of the function's return address. To simplify and optimize this, turn save_regs() into a macro. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jan Beulich diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8a445a0..b6b2e85 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -297,27 +297,22 @@ ENDPROC(native_usergs_sysret64) .endm /* save partial stack frame */ - .pushsection .kprobes.text, "ax" -ENTRY(save_args) - XCPT_FRAME + .macro SAVE_ARGS_IRQ cld - /* - * start from rbp in pt_regs and jump over - * return address. - */ - movq_cfi rdi, RDI+8-RBP - movq_cfi rsi, RSI+8-RBP - movq_cfi rdx, RDX+8-RBP - movq_cfi rcx, RCX+8-RBP - movq_cfi rax, RAX+8-RBP - movq_cfi r8, R8+8-RBP - movq_cfi r9, R9+8-RBP - movq_cfi r10, R10+8-RBP - movq_cfi r11, R11+8-RBP - - leaq -RBP+8(%rsp),%rdi /* arg1 for handler */ - movq_cfi rbp, 8 /* push %rbp */ - leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ + /* start from rbp in pt_regs and jump over */ + movq_cfi rdi, RDI-RBP + movq_cfi rsi, RSI-RBP + movq_cfi rdx, RDX-RBP + movq_cfi rcx, RCX-RBP + movq_cfi rax, RAX-RBP + movq_cfi r8, R8-RBP + movq_cfi r9, R9-RBP + movq_cfi r10, R10-RBP + movq_cfi r11, R11-RBP + + leaq -RBP(%rsp),%rdi /* arg1 for handler */ + movq_cfi rbp, 0 /* push %rbp */ + movq %rsp, %rbp testl $3, CS(%rdi) je 1f SWAPGS @@ -329,19 +324,14 @@ ENTRY(save_args) */ 1: incl PER_CPU_VAR(irq_count) jne 2f - popq_cfi %rax /* move return address... */ mov PER_CPU_VAR(irq_stack_ptr),%rsp EMPTY_FRAME 0 pushq_cfi %rbp /* backlink for unwinder */ - pushq_cfi %rax /* ... to the new stack */ /* * We entered an interrupt context - irqs are off: */ 2: TRACE_IRQS_OFF - ret - CFI_ENDPROC -END(save_args) - .popsection + .endm ENTRY(save_rest) PARTIAL_FRAME 1 REST_SKIP+8 @@ -791,7 +781,7 @@ END(interrupt) /* reserve pt_regs for scratch regs and rbp */ subq $ORIG_RAX-RBP, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP - call save_args + SAVE_ARGS_IRQ PARTIAL_FRAME 0 call \func .endm -- cgit v0.10.2 From 3b99a3ef55b292180473a221f3d6bc24455f0632 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 1 Jul 2011 02:25:17 +0200 Subject: x86,64: Separate arg1 from rbp handling in SAVE_REGS_IRQ Just for clarity in the code. Have a first block that handles the frame pointer and a separate one that handles pt_regs pointer and its use. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jan Beulich diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b6b2e85..20dc8e6 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -310,9 +310,10 @@ ENDPROC(native_usergs_sysret64) movq_cfi r10, R10-RBP movq_cfi r11, R11-RBP - leaq -RBP(%rsp),%rdi /* arg1 for handler */ movq_cfi rbp, 0 /* push %rbp */ movq %rsp, %rbp + + leaq -RBP(%rsp),%rdi /* arg1 for handler */ testl $3, CS(%rdi) je 1f SWAPGS -- cgit v0.10.2 From 48ffee7d9e6df51b4957bed64115b7beed671374 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 2 Jul 2011 15:03:44 +0200 Subject: x86: Remove useless unwinder backlink from irq regs saving The unwinder backlink in interrupt entry is very useless. It's actually not part of the stack frame chain and thus is never used. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jan Beulich diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 20dc8e6..6131432 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -327,7 +327,6 @@ ENDPROC(native_usergs_sysret64) jne 2f mov PER_CPU_VAR(irq_stack_ptr),%rsp EMPTY_FRAME 0 - pushq_cfi %rbp /* backlink for unwinder */ /* * We entered an interrupt context - irqs are off: */ -- cgit v0.10.2 From a2bbe75089d5eb9a3a46d50dd5c215e213790288 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 2 Jul 2011 16:52:45 +0200 Subject: x86: Don't use frame pointer to save old stack on irq entry rbp is used in SAVE_ARGS_IRQ to save the old stack pointer in order to restore it later in ret_from_intr. It is convenient because we save its value in the irq regs and it's easily restored using the leave instruction. However this is a kind of abuse of the frame pointer which role is to help unwinding the kernel by chaining frames together, each node following the return address to the previous frame. But although we are breaking the frame by changing the stack pointer, there is no preceding return address before the new frame. Hence using the frame pointer to link the two stacks breaks the stack unwinders that find a random value instead of a return address here. There is no workaround that can work in every case. We are using the fixup_bp_irq_link() function to dereference that abused frame pointer in the case of non nesting interrupt (which means stack changed). But that doesn't fix the case of interrupts that don't change the stack (but we still have the unconditional frame link), which is the case of hardirq interrupting softirq. We have no way to detect this transition so the frame irq link is considered as a real frame pointer and the return address is dereferenced but it is still a spurious one. There are two possible results of this: either the spurious return address, a random stack value, luckily belongs to the kernel text and then the unwinding can continue and we just have a weird entry in the stack trace. Or it doesn't belong to the kernel text and unwinding stops there. This is the reason why stacktraces (including perf callchains) on irqs that interrupted softirqs don't work very well. To solve this, we don't save the old stack pointer on rbp anymore but we save it to a scratch register that we push on the new stack and that we pop back later on irq return. This preserves the whole frame chain without spurious return addresses in the middle and drops the need for the horrid fixup_bp_irq_link() workaround. And finally irqs that interrupt softirq are sanely unwinded. Before: 99.81% perf [kernel.kallsyms] [k] perf_pending_event | --- perf_pending_event irq_work_run smp_irq_work_interrupt irq_work_interrupt | |--41.60%-- __read | | | |--99.90%-- create_worker | | bench_sched_messaging | | cmd_bench | | run_builtin | | main | | __libc_start_main | --0.10%-- [...] After: 1.64% swapper [kernel.kallsyms] [k] perf_pending_event | --- perf_pending_event irq_work_run smp_irq_work_interrupt irq_work_interrupt | |--95.00%-- arch_irq_work_raise | irq_work_queue | __perf_event_overflow | perf_swevent_overflow | perf_swevent_event | perf_tp_event | perf_trace_softirq | __do_softirq | call_softirq | do_softirq | irq_exit | | | |--73.68%-- smp_apic_timer_interrupt | | apic_timer_interrupt | | | | | |--96.43%-- amd_e400_idle | | | cpu_idle | | | start_secondary Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jan Beulich diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 788295c..19853ad 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -105,34 +105,6 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack, } /* - * We are returning from the irq stack and go to the previous one. - * If the previous stack is also in the irq stack, then bp in the first - * frame of the irq stack points to the previous, interrupted one. - * Otherwise we have another level of indirection: We first save - * the bp of the previous stack, then we switch the stack to the irq one - * and save a new bp that links to the previous one. - * (See save_args()) - */ -static inline unsigned long -fixup_bp_irq_link(unsigned long bp, unsigned long *stack, - unsigned long *irq_stack, unsigned long *irq_stack_end) -{ -#ifdef CONFIG_FRAME_POINTER - struct stack_frame *frame = (struct stack_frame *)bp; - unsigned long next; - - if (!in_irq_stack(stack, irq_stack, irq_stack_end)) { - if (!probe_kernel_address(&frame->next_frame, next)) - return next; - else - WARN_ONCE(1, "Perf: bad frame pointer = %p in " - "callchain\n", &frame->next_frame); - } -#endif - return bp; -} - -/* * x86-64 can have up to three kernel stacks: * process stack * interrupt stack @@ -208,8 +180,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, * pointer (index -1 to end) in the IRQ stack: */ stack = (unsigned long *) (irq_stack_end[-1]); - bp = fixup_bp_irq_link(bp, stack, irq_stack, - irq_stack_end); irq_stack_end = NULL; ops->stack(data, "EOI"); continue; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 6131432..d656f68 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -310,8 +310,11 @@ ENDPROC(native_usergs_sysret64) movq_cfi r10, R10-RBP movq_cfi r11, R11-RBP - movq_cfi rbp, 0 /* push %rbp */ - movq %rsp, %rbp + /* Save rbp so that we can unwind from get_irq_regs() */ + movq_cfi rbp, 0 + + /* Save previous stack value */ + movq %rsp, %rsi leaq -RBP(%rsp),%rdi /* arg1 for handler */ testl $3, CS(%rdi) @@ -327,10 +330,11 @@ ENDPROC(native_usergs_sysret64) jne 2f mov PER_CPU_VAR(irq_stack_ptr),%rsp EMPTY_FRAME 0 - /* - * We entered an interrupt context - irqs are off: - */ -2: TRACE_IRQS_OFF + +2: /* Store previous stack value */ + pushq %rsi + /* We entered an interrupt context - irqs are off: */ + TRACE_IRQS_OFF .endm ENTRY(save_rest) @@ -804,15 +808,14 @@ ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF decl PER_CPU_VAR(irq_count) - leaveq - CFI_RESTORE rbp + /* Restore saved previous stack */ + popq %rsi + leaq 16(%rsi), %rsp + CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET -8 + CFI_ADJUST_CFA_OFFSET -16 - /* we did not save rbx, restore only from ARGOFFSET */ - addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 exit_intr: GET_THREAD_INFO(%rcx) testl $3,CS-ARGOFFSET(%rsp) -- cgit v0.10.2 From 5d67be97f8903d05ce53597fb5f3bc25a45e8026 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 4 Jul 2011 21:57:50 +1000 Subject: perf report/annotate/script: Add option to specify a CPU range Add an option to perf report/annotate/script to specify which CPUs to operate on. This enables us to take a single system wide profile and analyse each CPU (or group of CPUs) in isolation. This was useful when profiling a multiprocess workload where the bottleneck was on one CPU but this was hidden in the overall profile. Per process and per thread breakdowns didn't help because multiple processes were running on each CPU and no single process consumed an entire CPU. The patch converts the list of CPUs returned by cpu_map__new into a bitmap for fast lookup. I wanted to use -C to be consistent with perf top/record/stat, but unfortunately perf report already uses -C . v2: Incorporate suggestions from David Ahern: - Added -c to perf script - Check that SAMPLE_CPU is set when -c is used - Update documentation v3: Create perf_session__cpu_bitmap() Signed-off-by: Anton Blanchard Acked-by: David Ahern Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Paul Mackerras Link: http://lkml.kernel.org/r/20110704215750.11647eb9@kryten Signed-off-by: Ingo Molnar diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt index 6f5a4986..85c5f02 100644 --- a/tools/perf/Documentation/perf-annotate.txt +++ b/tools/perf/Documentation/perf-annotate.txt @@ -66,6 +66,12 @@ OPTIONS used. This interfaces starts by centering on the line with more samples, TAB/UNTAB cycles through the lines with more samples. +-c:: +--cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can + be provided as a comma-separated list with no space: 0,1. Ranges of + CPUs are specified with -: 0-2. Default is to report samples on all + CPUs. + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-report[1] diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index cfa8e51..04253c0 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -128,6 +128,12 @@ OPTIONS --symfs=:: Look for files with symbols relative to this directory. +-c:: +--cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can + be provided as a comma-separated list with no space: 0,1. Ranges of + CPUs are specified with -: 0-2. Default is to report samples on all + CPUs. + SEE ALSO -------- linkperf:perf-stat[1] diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index c6068cb..db01786 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -182,6 +182,12 @@ OPTIONS --hide-call-graph:: When printing symbols do not display call chain. +-c:: +--cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can + be provided as a comma-separated list with no space: 0,1. Ranges of + CPUs are specified with -: 0-2. Default is to report samples on all + CPUs. + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-script-perl[1], diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 7b139e1..555aefd 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -28,6 +28,8 @@ #include "util/hist.h" #include "util/session.h" +#include + static char const *input_name = "perf.data"; static bool force, use_tui, use_stdio; @@ -38,6 +40,9 @@ static bool print_line; static const char *sym_hist_filter; +static const char *cpu_list; +static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); + static int perf_evlist__add_sample(struct perf_evlist *evlist, struct perf_sample *sample, struct perf_evsel *evsel, @@ -90,6 +95,9 @@ static int process_sample_event(union perf_event *event, return -1; } + if (cpu_list && !test_bit(sample->cpu, cpu_bitmap)) + return 0; + if (!al.filtered && perf_evlist__add_sample(session->evlist, sample, evsel, &al)) { pr_warning("problem incrementing symbol count, " @@ -177,6 +185,12 @@ static int __cmd_annotate(void) if (session == NULL) return -ENOMEM; + if (cpu_list) { + ret = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap); + if (ret) + goto out_delete; + } + ret = perf_session__process_events(session, &event_ops); if (ret) goto out_delete; @@ -252,6 +266,7 @@ static const struct option options[] = { "print matching source lines (may be slow)"), OPT_BOOLEAN('P', "full-paths", &full_paths, "Don't shorten the displayed pathnames"), + OPT_STRING('c', "cpu", &cpu_list, "cpu", "list of cpus to profile"), OPT_END() }; diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 5d43d01..f854efd 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -33,6 +33,8 @@ #include "util/sort.h" #include "util/hist.h" +#include + static char const *input_name = "perf.data"; static bool force, use_tui, use_stdio; @@ -49,6 +51,9 @@ static char callchain_default_opt[] = "fractal,0.5,callee"; static bool inverted_callchain; static symbol_filter_t annotate_init; +static const char *cpu_list; +static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); + static int perf_session__add_hist_entry(struct perf_session *session, struct addr_location *al, struct perf_sample *sample, @@ -117,6 +122,9 @@ static int process_sample_event(union perf_event *event, if (al.filtered || (hide_unresolved && al.sym == NULL)) return 0; + if (cpu_list && !test_bit(sample->cpu, cpu_bitmap)) + return 0; + if (al.map != NULL) al.map->dso->hit = 1; @@ -263,6 +271,12 @@ static int __cmd_report(void) if (session == NULL) return -ENOMEM; + if (cpu_list) { + ret = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap); + if (ret) + goto out_delete; + } + if (show_threads) perf_read_values_init(&show_threads_values); @@ -473,6 +487,7 @@ static const struct option options[] = { "Only display entries resolved to a symbol"), OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", "Look for files with symbols relative to this directory"), + OPT_STRING('c', "cpu", &cpu_list, "cpu", "list of cpus to profile"), OPT_END() }; diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 3056b45..09024ec 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -13,6 +13,7 @@ #include "util/util.h" #include "util/evlist.h" #include "util/evsel.h" +#include static char const *script_name; static char const *generate_script_lang; @@ -21,6 +22,8 @@ static u64 last_timestamp; static u64 nr_unordered; extern const struct option record_options[]; static bool no_callchain; +static const char *cpu_list; +static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); enum perf_output_field { PERF_OUTPUT_COMM = 1U << 0, @@ -453,6 +456,10 @@ static int process_sample_event(union perf_event *event, last_timestamp = sample->time; return 0; } + + if (cpu_list && !test_bit(sample->cpu, cpu_bitmap)) + return 0; + scripting_ops->process_event(event, sample, evsel, session, thread); session->hists.stats.total_period += sample->period; @@ -1075,6 +1082,7 @@ static const struct option options[] = { OPT_CALLBACK('f', "fields", NULL, "str", "comma separated output fields prepend with 'type:'. Valid types: hw,sw,trace,raw. Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,addr", parse_output_fields), + OPT_STRING('c', "cpu", &cpu_list, "cpu", "list of cpus to profile"), OPT_END() }; @@ -1255,6 +1263,11 @@ int cmd_script(int argc, const char **argv, const char *prefix __used) if (session == NULL) return -ENOMEM; + if (cpu_list) { + if (perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap)) + return -1; + } + if (!no_callchain) symbol_conf.use_callchain = true; else diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 558bcf9..080e533 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -12,6 +12,7 @@ #include "session.h" #include "sort.h" #include "util.h" +#include "cpumap.h" static int perf_session__open(struct perf_session *self, bool force) { @@ -1282,3 +1283,40 @@ void perf_session__print_ip(union perf_event *event, } } } + +int perf_session__cpu_bitmap(struct perf_session *session, + const char *cpu_list, unsigned long *cpu_bitmap) +{ + int i; + struct cpu_map *map; + + for (i = 0; i < PERF_TYPE_MAX; ++i) { + struct perf_evsel *evsel; + + evsel = perf_session__find_first_evtype(session, i); + if (!evsel) + continue; + + if (!(evsel->attr.sample_type & PERF_SAMPLE_CPU)) { + pr_err("File does not contain CPU events. " + "Remove -c option to proceed.\n"); + return -1; + } + } + + map = cpu_map__new(cpu_list); + + for (i = 0; i < map->nr; i++) { + int cpu = map->map[i]; + + if (cpu >= MAX_NR_CPUS) { + pr_err("Requested CPU %d too large. " + "Consider raising MAX_NR_CPUS\n", cpu); + return -1; + } + + set_bit(cpu, cpu_bitmap); + } + + return 0; +} diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index de4178d..5de754f 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -172,4 +172,7 @@ void perf_session__print_ip(union perf_event *event, struct perf_session *session, int print_sym, int print_dso); +int perf_session__cpu_bitmap(struct perf_session *session, + const char *cpu_list, unsigned long *cpu_bitmap); + #endif /* __PERF_SESSION_H */ -- cgit v0.10.2 From e9dbfae53eeb9fc3d4bb7da3df87fa9875f5da02 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Jul 2011 11:36:06 -0400 Subject: tracing: Fix bug when reading system filters on module removal The event system is freed when its nr_events is set to zero. This happens when a module created an event system and then later the module is removed. Modules may share systems, so the system is allocated when it is created and freed when the modules are unloaded and all the events under the system are removed (nr_events set to zero). The problem arises when a task opened the "filter" file for the system. If the module is unloaded and it removed the last event for that system, the system structure is freed. If the task that opened the filter file accesses the "filter" file after the system has been freed, the system will access an invalid pointer. By adding a ref_count, and using it to keep track of what is using the event system, we can free it after all users are finished with the event system. Cc: Reported-by: Johannes Berg Signed-off-by: Steven Rostedt diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 229f859..f807407 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -677,6 +677,7 @@ struct event_subsystem { struct dentry *entry; struct event_filter *filter; int nr_events; + int ref_count; }; #define FILTER_PRED_INVALID ((unsigned short)-1) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 686ec39..ffc5b28 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -244,6 +244,35 @@ static void ftrace_clear_events(void) mutex_unlock(&event_mutex); } +static void __put_system(struct event_subsystem *system) +{ + struct event_filter *filter = system->filter; + + WARN_ON_ONCE(system->ref_count == 0); + if (--system->ref_count) + return; + + if (filter) { + kfree(filter->filter_string); + kfree(filter); + } + kfree(system->name); + kfree(system); +} + +static void __get_system(struct event_subsystem *system) +{ + WARN_ON_ONCE(system->ref_count == 0); + system->ref_count++; +} + +static void put_system(struct event_subsystem *system) +{ + mutex_lock(&event_mutex); + __put_system(system); + mutex_unlock(&event_mutex); +} + /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. */ @@ -826,6 +855,47 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static LIST_HEAD(event_subsystems); + +static int subsystem_open(struct inode *inode, struct file *filp) +{ + struct event_subsystem *system = NULL; + int ret; + + /* Make sure the system still exists */ + mutex_lock(&event_mutex); + list_for_each_entry(system, &event_subsystems, list) { + if (system == inode->i_private) { + /* Don't open systems with no events */ + if (!system->nr_events) { + system = NULL; + break; + } + __get_system(system); + break; + } + } + mutex_unlock(&event_mutex); + + if (system != inode->i_private) + return -ENODEV; + + ret = tracing_open_generic(inode, filp); + if (ret < 0) + put_system(system); + + return ret; +} + +static int subsystem_release(struct inode *inode, struct file *file) +{ + struct event_subsystem *system = inode->i_private; + + put_system(system); + + return 0; +} + static ssize_t subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -963,10 +1033,11 @@ static const struct file_operations ftrace_event_filter_fops = { }; static const struct file_operations ftrace_subsystem_filter_fops = { - .open = tracing_open_generic, + .open = subsystem_open, .read = subsystem_filter_read, .write = subsystem_filter_write, .llseek = default_llseek, + .release = subsystem_release, }; static const struct file_operations ftrace_system_enable_fops = { @@ -1002,8 +1073,6 @@ static struct dentry *event_trace_events_dir(void) return d_events; } -static LIST_HEAD(event_subsystems); - static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { @@ -1013,6 +1082,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { if (strcmp(system->name, name) == 0) { + __get_system(system); system->nr_events++; return system->entry; } @@ -1035,6 +1105,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) } system->nr_events = 1; + system->ref_count = 1; system->name = kstrdup(name, GFP_KERNEL); if (!system->name) { debugfs_remove(system->entry); @@ -1184,16 +1255,9 @@ static void remove_subsystem_dir(const char *name) list_for_each_entry(system, &event_subsystems, list) { if (strcmp(system->name, name) == 0) { if (!--system->nr_events) { - struct event_filter *filter = system->filter; - debugfs_remove_recursive(system->entry); list_del(&system->list); - if (filter) { - kfree(filter->filter_string); - kfree(filter); - } - kfree(system->name); - kfree(system); + __put_system(system); } break; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8008ddc..256764ec 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1886,6 +1886,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system, mutex_lock(&event_mutex); + /* Make sure the system still has events */ + if (!system->nr_events) { + err = -ENODEV; + goto out_unlock; + } + if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(system); remove_filter_string(system->filter); -- cgit v0.10.2 From 40ee4dffff061399eb9358e0c8fcfbaf8de4c8fe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Jul 2011 14:32:51 -0400 Subject: tracing: Have "enable" file use refcounts like the "filter" file The "enable" file for the event system can be removed when a module is unloaded and the event system only has events from that module. As the event system nr_events count goes to zero, it may be freed if its ref_count is also set to zero. Like the "filter" file, the "enable" file may be opened by a task and referenced later, after a module has been unloaded and the events for that event system have been removed. Although the "filter" file referenced the event system structure, the "enable" file only references a pointer to the event system name. Since the name is freed when the event system is removed, it is possible that an access to the "enable" file may reference a freed pointer. Update the "enable" file to use the subsystem_open() routine that the "filter" file uses, to keep a reference to the event system structure while the "enable" file is opened. Cc: Reported-by: Johannes Berg Signed-off-by: Steven Rostedt diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ffc5b28..3e2a7c9 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -557,7 +557,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { const char set_to_char[4] = { '?', '0', '1', 'X' }; - const char *system = filp->private_data; + struct event_subsystem *system = filp->private_data; struct ftrace_event_call *call; char buf[2]; int set = 0; @@ -568,7 +568,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, if (!call->name || !call->class || !call->class->reg) continue; - if (system && strcmp(call->class->system, system) != 0) + if (system && strcmp(call->class->system, system->name) != 0) continue; /* @@ -598,7 +598,8 @@ static ssize_t system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - const char *system = filp->private_data; + struct event_subsystem *system = filp->private_data; + const char *name = NULL; unsigned long val; char buf[64]; ssize_t ret; @@ -622,7 +623,14 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (val != 0 && val != 1) return -EINVAL; - ret = __ftrace_set_clr_event(NULL, system, NULL, val); + /* + * Opening of "enable" adds a ref count to system, + * so the name is safe to use. + */ + if (system) + name = system->name; + + ret = __ftrace_set_clr_event(NULL, name, NULL, val); if (ret) goto out; @@ -862,6 +870,9 @@ static int subsystem_open(struct inode *inode, struct file *filp) struct event_subsystem *system = NULL; int ret; + if (!inode->i_private) + goto skip_search; + /* Make sure the system still exists */ mutex_lock(&event_mutex); list_for_each_entry(system, &event_subsystems, list) { @@ -880,8 +891,9 @@ static int subsystem_open(struct inode *inode, struct file *filp) if (system != inode->i_private) return -ENODEV; + skip_search: ret = tracing_open_generic(inode, filp); - if (ret < 0) + if (ret < 0 && system) put_system(system); return ret; @@ -891,7 +903,8 @@ static int subsystem_release(struct inode *inode, struct file *file) { struct event_subsystem *system = inode->i_private; - put_system(system); + if (system) + put_system(system); return 0; } @@ -1041,10 +1054,11 @@ static const struct file_operations ftrace_subsystem_filter_fops = { }; static const struct file_operations ftrace_system_enable_fops = { - .open = tracing_open_generic, + .open = subsystem_open, .read = system_enable_read, .write = system_enable_write, .llseek = default_llseek, + .release = subsystem_release, }; static const struct file_operations ftrace_show_header_fops = { @@ -1133,8 +1147,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) "'%s/filter' entry\n", name); } - trace_create_file("enable", 0644, system->entry, - (void *)system->name, + trace_create_file("enable", 0644, system->entry, system, &ftrace_system_enable_fops); return system->entry; -- cgit v0.10.2 From 43dd61c9a09bd413e837df829e6bfb42159be52a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 Jul 2011 11:09:22 -0400 Subject: ftrace: Fix regression of :mod:module function enabling The new code that allows different utilities to pick and choose what functions they trace broke the :mod: hook that allows users to trace only functions of a particular module. The reason is that the :mod: hook bypasses the hash that is setup to allow individual users to trace their own functions and uses the global hash directly. But if the global hash has not been set up, it will cause a bug: echo '*:mod:radeon' > /sys/kernel/debug/set_ftrace_filter produces: [drm:drm_mode_getfb] *ERROR* invalid framebuffer id [drm:radeon_crtc_page_flip] *ERROR* failed to reserve new rbo buffer before flip BUG: unable to handle kernel paging request at ffffffff8160ec90 IP: [] add_hash_entry+0x66/0xd0 PGD 1a05067 PUD 1a09063 PMD 80000000016001e1 Oops: 0003 [#1] SMP Jul 7 04:02:28 phyllis kernel: [55303.858604] CPU 1 Modules linked in: cryptd aes_x86_64 aes_generic binfmt_misc rfcomm bnep ip6table_filter hid radeon r8169 ahci libahci mii ttm drm_kms_helper drm video i2c_algo_bit intel_agp intel_gtt Pid: 10344, comm: bash Tainted: G WC 3.0.0-rc5 #1 Dell Inc. Inspiron N5010/0YXXJJ RIP: 0010:[] [] add_hash_entry+0x66/0xd0 RSP: 0018:ffff88003a96bda8 EFLAGS: 00010246 RAX: ffff8801301735c0 RBX: ffffffff8160ec80 RCX: 0000000000306ee0 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff880137c92940 RBP: ffff88003a96bdb8 R08: ffff880137c95680 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: ffffffff81c9df78 R13: ffff8801153d1000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f329c18a700(0000) GS:ffff880137c80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffff8160ec90 CR3: 000000003002b000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process bash (pid: 10344, threadinfo ffff88003a96a000, task ffff88012fcfc470) Stack: 0000000000000fd0 00000000000000fc ffff88003a96be38 ffffffff810d92f5 ffff88011c4c4e00 ffff880000000000 000000000b69f4d0 ffffffff8160ec80 ffff8800300e6f06 0000000081130295 0000000000000282 ffff8800300e6f00 Call Trace: [] match_records+0x155/0x1b0 [] ftrace_mod_callback+0xbc/0x100 [] ftrace_regex_write+0x16f/0x210 [] ftrace_filter_write+0xf/0x20 [] vfs_write+0xc8/0x190 [] sys_write+0x51/0x90 [] system_call_fastpath+0x16/0x1b Code: 48 8b 33 31 d2 48 85 f6 75 33 49 89 d4 4c 03 63 08 49 8b 14 24 48 85 d2 48 89 10 74 04 48 89 42 08 49 89 04 24 4c 89 60 08 31 d2 RIP [] add_hash_entry+0x66/0xd0 RSP CR2: ffffffff8160ec90 ---[ end trace a5d031828efdd88e ]--- Reported-by: Brian Marete Signed-off-by: Steven Rostedt diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9d88e1c..ed0eb52 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -123,7 +123,8 @@ stack_trace_sysctl(struct ctl_table *table, int write, struct ftrace_func_command { struct list_head list; char *name; - int (*func)(char *func, char *cmd, + int (*func)(struct ftrace_hash *hash, + char *func, char *cmd, char *params, int enable); }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 908038f..1c4c0b0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2407,10 +2407,9 @@ ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) */ static int -ftrace_mod_callback(char *func, char *cmd, char *param, int enable) +ftrace_mod_callback(struct ftrace_hash *hash, + char *func, char *cmd, char *param, int enable) { - struct ftrace_ops *ops = &global_ops; - struct ftrace_hash *hash; char *mod; int ret = -EINVAL; @@ -2430,11 +2429,6 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) if (!strlen(mod)) return ret; - if (enable) - hash = ops->filter_hash; - else - hash = ops->notrace_hash; - ret = ftrace_match_module_records(hash, func, mod); if (!ret) ret = -EINVAL; @@ -2760,7 +2754,7 @@ static int ftrace_process_regex(struct ftrace_hash *hash, mutex_lock(&ftrace_cmd_mutex); list_for_each_entry(p, &ftrace_commands, list) { if (strcmp(p->name, command) == 0) { - ret = p->func(func, command, next, enable); + ret = p->func(hash, func, command, next, enable); goto out_unlock; } } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 8d0e1cc..c7b0c6a 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -324,7 +324,8 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) } static int -ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) +ftrace_trace_onoff_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; void *count = (void *)-1; -- cgit v0.10.2 From e08fbb78f03fe2c4f88824faf6f51ce6af185e11 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 Jul 2011 23:04:36 -0400 Subject: tracing, x86/irq: Do not trace arch_local_{*,irq_*}() functions I triggered a triple fault with gcc 4.5.1 because it did not honor the inline annotation to arch_local_save_flags() function and that function was added to the pool of functions traced by the function tracer. When preempt_schedule() called arch_local_save_flags() (called by irqs_disabled()), it was traced, but the first thing the function tracer does is disable preemption. When it enables preemption, the NEED_RESCHED flag will not have been cleared and the preemption check will trigger the call to preempt_schedule() again. Although the dynamic function tracer crashed immediately, the static version of the function tracer (CONFIG_DYNAMIC_FTRACE is not set) actually was able to show where the problem was. swapper-1 3.N.. 103885us : arch_local_save_flags <-preempt_schedule swapper-1 3.N.. 103886us : arch_local_save_flags <-preempt_schedule swapper-1 3.N.. 103886us : arch_local_save_flags <-preempt_schedule swapper-1 3.N.. 103887us : arch_local_save_flags <-preempt_schedule swapper-1 3.N.. 103887us : arch_local_save_flags <-preempt_schedule swapper-1 3.N.. 103888us : arch_local_save_flags <-preempt_schedule swapper-1 3.N.. 103888us : arch_local_save_flags <-preempt_schedule It went on for a while before it triple faulted with a corrupted stack. The arch_local_save_flags and arch_local_irq_* functions should not be traced. Even though they are marked as inline, gcc may still make them a function and enable tracing of them. The simple solution is to just mark them as notrace. I had to add the for this file to include the notrace tag. Signed-off-by: Steven Rostedt Link: http://lkml.kernel.org/r/20110702033852.733414762@goodmis.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 5745ce8..bba3cf8 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -60,23 +60,24 @@ static inline void native_halt(void) #include #else #ifndef __ASSEMBLY__ +#include -static inline unsigned long arch_local_save_flags(void) +static inline notrace unsigned long arch_local_save_flags(void) { return native_save_fl(); } -static inline void arch_local_irq_restore(unsigned long flags) +static inline notrace void arch_local_irq_restore(unsigned long flags) { native_restore_fl(flags); } -static inline void arch_local_irq_disable(void) +static inline notrace void arch_local_irq_disable(void) { native_irq_disable(); } -static inline void arch_local_irq_enable(void) +static inline notrace void arch_local_irq_enable(void) { native_irq_enable(); } @@ -102,7 +103,7 @@ static inline void halt(void) /* * For spinlocks, etc: */ -static inline unsigned long arch_local_irq_save(void) +static inline notrace unsigned long arch_local_irq_save(void) { unsigned long flags = arch_local_save_flags(); arch_local_irq_disable(); -- cgit v0.10.2 From e4a3f541f0b67fdad98b326c851dfe7f4b6b6dad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Jun 2011 19:02:29 -0400 Subject: tracing: Still trace filtered irq functions when irq trace is disabled If a function is set to be traced by the set_graph_function, but the option funcgraph-irqs is zero, and the traced function happens to be called from a interrupt, it will not be traced. The point of funcgraph-irqs is to not trace interrupts when we are preempted by an irq, not to not trace functions we want to trace that happen to be *in* a irq. Luckily the current->trace_recursion element is perfect to add a flag to help us be able to trace functions within an interrupt even when we are not tracing interrupts that preempt the trace. Reported-by: Heiko Carstens Tested-by: Heiko Carstens Signed-off-by: Steven Rostedt diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a3e2db7..651f35b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -278,6 +278,29 @@ struct tracer { }; +/* Only current can touch trace_recursion */ +#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) +#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) + +/* Ring buffer has the 10 LSB bits to count */ +#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) + +/* for function tracing recursion */ +#define TRACE_INTERNAL_BIT (1<<11) +#define TRACE_GLOBAL_BIT (1<<12) +/* + * Abuse of the trace_recursion. + * As we need a way to maintain state if we are tracing the function + * graph in irq because we want to trace a particular function that + * was called in irq context but we have irq tracing off. Since this + * can only be modified by current, we can reuse trace_recursion. + */ +#define TRACE_IRQ_BIT (1<<13) + +#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) +#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) +#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) + #define TRACE_PIPE_ALL_CPU -1 int tracer_init(struct tracer *t, struct trace_array *tr); @@ -516,8 +539,18 @@ static inline int ftrace_graph_addr(unsigned long addr) return 1; for (i = 0; i < ftrace_graph_count; i++) { - if (addr == ftrace_graph_funcs[i]) + if (addr == ftrace_graph_funcs[i]) { + /* + * If no irqs are to be traced, but a set_graph_function + * is set, and called by an interrupt handler, we still + * want to trace it. + */ + if (in_irq()) + trace_recursion_set(TRACE_IRQ_BIT); + else + trace_recursion_clear(TRACE_IRQ_BIT); return 1; + } } return 0; @@ -794,19 +827,4 @@ extern const char *__stop___trace_bprintk_fmt[]; FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #include "trace_entries.h" -/* Only current can touch trace_recursion */ -#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) -#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) - -/* Ring buffer has the 10 LSB bits to count */ -#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) - -/* for function tracing recursion */ -#define TRACE_INTERNAL_BIT (1<<11) -#define TRACE_GLOBAL_BIT (1<<12) - -#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) -#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) -#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) - #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e8d6bb5..a7d2a4c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -227,7 +227,7 @@ int __trace_graph_entry(struct trace_array *tr, static inline int ftrace_graph_ignore_irqs(void) { - if (!ftrace_graph_skip_irqs) + if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT)) return 0; return in_irq(); -- cgit v0.10.2 From 4376cac66778b25e599be3f5d54f33f58ba8ead7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Jun 2011 23:28:13 -0400 Subject: ftrace: Do not disable interrupts for modules in mcount update When I mounted an NFS directory, it caused several modules to be loaded. At the time I was running the preemptirqsoff tracer, and it showed the following output: # tracer: preemptirqsoff # # preemptirqsoff latency trace v1.1.5 on 2.6.33.9-rt30-mrg-test # -------------------------------------------------------------------- # latency: 1177 us, #4/4, CPU#3 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:4) # ----------------- # | task: modprobe-19370 (uid:0 nice:0 policy:0 rt_prio:0) # ----------------- # => started at: ftrace_module_notify # => ended at: ftrace_module_notify # # # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| /_--=> lock-depth # |||||/ delay # cmd pid |||||| time | caller # \ / |||||| \ | / modprobe-19370 3d.... 0us!: ftrace_process_locs <-ftrace_module_notify modprobe-19370 3d.... 1176us : ftrace_process_locs <-ftrace_module_notify modprobe-19370 3d.... 1178us : trace_hardirqs_on <-ftrace_module_notify modprobe-19370 3d.... 1178us : => ftrace_process_locs => ftrace_module_notify => notifier_call_chain => __blocking_notifier_call_chain => blocking_notifier_call_chain => sys_init_module => system_call_fastpath That's over 1ms that interrupts are disabled on a Real-Time kernel! Looking at the cause (being the ftrace author helped), I found that the interrupts are disabled before the code modification of mcounts into nops. The interrupts only need to be disabled on start up around this code, not when modules are being loaded. Signed-off-by: Steven Rostedt diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c997f73..df93392 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3318,7 +3318,7 @@ static int ftrace_process_locs(struct module *mod, { unsigned long *p; unsigned long addr; - unsigned long flags; + unsigned long flags = 0; /* Shut up gcc */ mutex_lock(&ftrace_lock); p = start; @@ -3336,12 +3336,18 @@ static int ftrace_process_locs(struct module *mod, } /* - * Disable interrupts to prevent interrupts from executing - * code that is being modified. + * We only need to disable interrupts on start up + * because we are modifying code that an interrupt + * may execute, and the modification is not atomic. + * But for modules, nothing runs the code we modify + * until we are finished with it, and there's no + * reason to cause large interrupt latencies while we do it. */ - local_irq_save(flags); + if (!mod) + local_irq_save(flags); ftrace_update_code(mod); - local_irq_restore(flags); + if (!mod) + local_irq_restore(flags); mutex_unlock(&ftrace_lock); return 0; -- cgit v0.10.2 From 04da85b86188f224cc9b391b5bdd92a3ba20ffcf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 Jul 2011 10:12:59 -0400 Subject: ftrace: Fix warning when CONFIG_FUNCTION_TRACER is not defined The struct ftrace_hash was declared within CONFIG_FUNCTION_TRACER but was referenced outside of it. Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ed0eb52..f0c0e8a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -19,6 +19,8 @@ #include +struct ftrace_hash; + #ifdef CONFIG_FUNCTION_TRACER extern int ftrace_enabled; @@ -29,8 +31,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); -struct ftrace_hash; - enum { FTRACE_OPS_FL_ENABLED = 1 << 0, FTRACE_OPS_FL_GLOBAL = 1 << 1, -- cgit v0.10.2 From 41fb61c2d08107ce96a5dcb3a6289b2afd3e135c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Jul 2011 15:03:44 -0400 Subject: ftrace: Balance records when updating the hash Whenever the hash of the ftrace_ops is updated, the record counts must be balance. This requires disabling the records that are set in the original hash, and then enabling the records that are set in the updated hash. Moving the update into ftrace_hash_move() removes the bug where the hash was updated but the records were not, which results in ftrace triggering a warning and disabling itself because the ftrace_ops filter is updated while the ftrace_ops was registered, and then the failure happens when the ftrace_ops is unregistered. The current code will not trigger this bug, but new code will. Signed-off-by: Steven Rostedt diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index df93392..853f6f0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1170,8 +1170,14 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) return NULL; } +static void +ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); +static void +ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); + static int -ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) +ftrace_hash_move(struct ftrace_ops *ops, int enable, + struct ftrace_hash **dst, struct ftrace_hash *src) { struct ftrace_func_entry *entry; struct hlist_node *tp, *tn; @@ -1181,9 +1187,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) unsigned long key; int size = src->count; int bits = 0; + int ret; int i; /* + * Remove the current set, update the hash and add + * them back. + */ + ftrace_hash_rec_disable(ops, enable); + + /* * If the new source is empty, just free dst and assign it * the empty_hash. */ @@ -1203,9 +1216,10 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) if (bits > FTRACE_HASH_MAX_BITS) bits = FTRACE_HASH_MAX_BITS; + ret = -ENOMEM; new_hash = alloc_ftrace_hash(bits); if (!new_hash) - return -ENOMEM; + goto out; size = 1 << src->size_bits; for (i = 0; i < size; i++) { @@ -1224,7 +1238,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) rcu_assign_pointer(*dst, new_hash); free_ftrace_hash_rcu(old_hash); - return 0; + ret = 0; + out: + /* + * Enable regardless of ret: + * On success, we enable the new hash. + * On failure, we re-enable the original hash. + */ + ftrace_hash_rec_enable(ops, enable); + + return ret; } /* @@ -2845,7 +2868,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, ftrace_match_records(hash, buf, len); mutex_lock(&ftrace_lock); - ret = ftrace_hash_move(orig_hash, hash); + ret = ftrace_hash_move(ops, enable, orig_hash, hash); mutex_unlock(&ftrace_lock); mutex_unlock(&ftrace_regex_lock); @@ -3028,18 +3051,12 @@ ftrace_regex_release(struct inode *inode, struct file *file) orig_hash = &iter->ops->notrace_hash; mutex_lock(&ftrace_lock); - /* - * Remove the current set, update the hash and add - * them back. - */ - ftrace_hash_rec_disable(iter->ops, filter_hash); - ret = ftrace_hash_move(orig_hash, iter->hash); - if (!ret) { - ftrace_hash_rec_enable(iter->ops, filter_hash); - if (iter->ops->flags & FTRACE_OPS_FL_ENABLED - && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); - } + ret = ftrace_hash_move(iter->ops, filter_hash, + orig_hash, iter->hash); + if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) + && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftrace_lock); } free_ftrace_hash(iter->hash); -- cgit v0.10.2 From 072126f4529196f71a97960248bca54fd4554c2d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Jul 2011 15:08:31 -0400 Subject: ftrace: Update filter when tracing enabled in set_ftrace_filter() Currently, if set_ftrace_filter() is called when the ftrace_ops is active, the function filters will not be updated. They will only be updated when tracing is disabled and re-enabled. Update the functions immediately during set_ftrace_filter(). Signed-off-by: Steven Rostedt diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 853f6f0..a0dc0de 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2869,6 +2869,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_lock(&ftrace_lock); ret = ftrace_hash_move(ops, enable, orig_hash, hash); + if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED + && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftrace_lock); mutex_unlock(&ftrace_regex_lock); -- cgit v0.10.2 From 6331c28c962561aee59e5a493b7556a4bb585957 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Jul 2011 15:11:02 -0400 Subject: ftrace: Fix dynamic selftest failure on some archs Archs that do not implement CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST, will fail the dynamic ftrace selftest. The function tracer has a quick 'off' variable that will prevent the call back functions from being called. This variable is called function_trace_stop. In x86, this is implemented directly in the mcount assembly, but for other archs, an intermediate function is used called ftrace_test_stop_func(). In dynamic ftrace, the function pointer variable ftrace_trace_function is used to update the caller code in the mcount caller. But for archs that do not have CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST set, it only calls ftrace_test_stop_func() instead, which in turn calls __ftrace_trace_function. When more than one ftrace_ops is registered, the function it calls is ftrace_ops_list_func(), which will iterate over all registered ftrace_ops and call the callbacks that have their hash matching. The issue happens when two ftrace_ops are registered for different functions and one is then unregistered. The __ftrace_trace_function is then pointed to the remaining ftrace_ops callback function directly. This mean it will be called for all functions that were registered to trace by both ftrace_ops that were registered. This is not an issue for archs with CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST, because the update of ftrace_trace_function doesn't happen until after all functions have been updated, and then the mcount caller is updated. But for those archs that do use the ftrace_test_stop_func(), the update is immediate. The dynamic selftest fails because it hits this situation, and the ftrace_ops that it registers fails to only trace what it was suppose to and instead traces all other functions. The solution is to delay the setting of __ftrace_trace_function until after all the functions have been updated according to the registered ftrace_ops. Also, function_trace_stop is set during the update to prevent function tracing from calling code that is caused by the function tracer itself. Signed-off-by: Steven Rostedt diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a0dc0de..62e26d9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -88,6 +88,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; +static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; @@ -146,9 +147,11 @@ void clear_ftrace_function(void) { ftrace_trace_function = ftrace_stub; __ftrace_trace_function = ftrace_stub; + __ftrace_trace_function_delay = ftrace_stub; ftrace_pid_function = ftrace_stub; } +#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST /* * For those archs that do not test ftrace_trace_stop in their @@ -208,7 +211,12 @@ static void update_ftrace_function(void) #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST ftrace_trace_function = func; #else +#ifdef CONFIG_DYNAMIC_FTRACE + /* do not update till all functions have been modified */ + __ftrace_trace_function_delay = func; +#else __ftrace_trace_function = func; +#endif ftrace_trace_function = ftrace_test_stop_func; #endif } @@ -1607,6 +1615,12 @@ static int __ftrace_modify_code(void *data) { int *command = data; + /* + * Do not call function tracer while we update the code. + * We are in stop machine, no worrying about races. + */ + function_trace_stop++; + if (*command & FTRACE_ENABLE_CALLS) ftrace_replace_code(1); else if (*command & FTRACE_DISABLE_CALLS) @@ -1620,6 +1634,18 @@ static int __ftrace_modify_code(void *data) else if (*command & FTRACE_STOP_FUNC_RET) ftrace_disable_ftrace_graph_caller(); +#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + /* + * For archs that call ftrace_test_stop_func(), we must + * wait till after we update all the function callers + * before we update the callback. This keeps different + * ops that record different functions from corrupting + * each other. + */ + __ftrace_trace_function = __ftrace_trace_function_delay; +#endif + function_trace_stop--; + return 0; } -- cgit v0.10.2 From 259032bfe379281bf7cba512b7705bdb4ce41db5 Mon Sep 17 00:00:00 2001 From: Sonny Rao Date: Thu, 14 Jul 2011 13:34:43 +1000 Subject: perf: Robustify proc and debugfs file recording While attempting to create a timechart of boot up I found perf didn't tolerate modules being loaded/unloaded. This patch fixes this by reading the file once and then writing the size read at the correct point in the file. It also simplifies the code somewhat. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Signed-off-by: Sonny Rao Signed-off-by: Michael Neuling Link: http://lkml.kernel.org/r/10011.1310614483@neuling.org Signed-off-by: Steven Rostedt diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c index 35729f4..3403f81 100644 --- a/tools/perf/util/trace-event-info.c +++ b/tools/perf/util/trace-event-info.c @@ -183,106 +183,59 @@ int bigendian(void) return *ptr == 0x01020304; } -static unsigned long long copy_file_fd(int fd) +/* unfortunately, you can not stat debugfs or proc files for size */ +static void record_file(const char *file, size_t hdr_sz) { unsigned long long size = 0; - char buf[BUFSIZ]; - int r; - - do { - r = read(fd, buf, BUFSIZ); - if (r > 0) { - size += r; - write_or_die(buf, r); - } - } while (r > 0); - - return size; -} - -static unsigned long long copy_file(const char *file) -{ - unsigned long long size = 0; - int fd; + char buf[BUFSIZ], *sizep; + off_t hdr_pos = lseek(output_fd, 0, SEEK_CUR); + int r, fd; fd = open(file, O_RDONLY); if (fd < 0) die("Can't read '%s'", file); - size = copy_file_fd(fd); - close(fd); - return size; -} - -static unsigned long get_size_fd(int fd) -{ - unsigned long long size = 0; - char buf[BUFSIZ]; - int r; + /* put in zeros for file size, then fill true size later */ + write_or_die(&size, hdr_sz); do { r = read(fd, buf, BUFSIZ); - if (r > 0) + if (r > 0) { size += r; + write_or_die(buf, r); + } } while (r > 0); - - lseek(fd, 0, SEEK_SET); - - return size; -} - -static unsigned long get_size(const char *file) -{ - unsigned long long size = 0; - int fd; - - fd = open(file, O_RDONLY); - if (fd < 0) - die("Can't read '%s'", file); - size = get_size_fd(fd); close(fd); - return size; + /* ugh, handle big-endian hdr_size == 4 */ + sizep = (char*)&size; + if (bigendian()) + sizep += sizeof(u64) - hdr_sz; + + if (pwrite(output_fd, sizep, hdr_sz, hdr_pos) < 0) + die("writing to %s", output_file); } static void read_header_files(void) { - unsigned long long size, check_size; char *path; - int fd; + struct stat st; path = get_tracing_file("events/header_page"); - fd = open(path, O_RDONLY); - if (fd < 0) + if (stat(path, &st) < 0) die("can't read '%s'", path); - /* unfortunately, you can not stat debugfs files for size */ - size = get_size_fd(fd); - write_or_die("header_page", 12); - write_or_die(&size, 8); - check_size = copy_file_fd(fd); - close(fd); - - if (size != check_size) - die("wrong size for '%s' size=%lld read=%lld", - path, size, check_size); + record_file(path, 8); put_tracing_file(path); path = get_tracing_file("events/header_event"); - fd = open(path, O_RDONLY); - if (fd < 0) + if (stat(path, &st) < 0) die("can't read '%s'", path); - size = get_size_fd(fd); - write_or_die("header_event", 13); - write_or_die(&size, 8); - check_size = copy_file_fd(fd); - if (size != check_size) - die("wrong size for '%s'", path); + record_file(path, 8); put_tracing_file(path); - close(fd); } static bool name_in_tp_list(char *sys, struct tracepoint_path *tps) @@ -298,7 +251,6 @@ static bool name_in_tp_list(char *sys, struct tracepoint_path *tps) static void copy_event_system(const char *sys, struct tracepoint_path *tps) { - unsigned long long size, check_size; struct dirent *dent; struct stat st; char *format; @@ -338,14 +290,8 @@ static void copy_event_system(const char *sys, struct tracepoint_path *tps) sprintf(format, "%s/%s/format", sys, dent->d_name); ret = stat(format, &st); - if (ret >= 0) { - /* unfortunately, you can not stat debugfs files for size */ - size = get_size(format); - write_or_die(&size, 8); - check_size = copy_file(format); - if (size != check_size) - die("error in size of file '%s'", format); - } + if (ret >= 0) + record_file(format, 8); free(format); } @@ -426,7 +372,7 @@ static void read_event_files(struct tracepoint_path *tps) static void read_proc_kallsyms(void) { - unsigned int size, check_size; + unsigned int size; const char *path = "/proc/kallsyms"; struct stat st; int ret; @@ -438,17 +384,12 @@ static void read_proc_kallsyms(void) write_or_die(&size, 4); return; } - size = get_size(path); - write_or_die(&size, 4); - check_size = copy_file(path); - if (size != check_size) - die("error in size of file '%s'", path); - + record_file(path, 4); } static void read_ftrace_printk(void) { - unsigned int size, check_size; + unsigned int size; char *path; struct stat st; int ret; @@ -461,11 +402,8 @@ static void read_ftrace_printk(void) write_or_die(&size, 4); goto out; } - size = get_size(path); - write_or_die(&size, 4); - check_size = copy_file(path); - if (size != check_size) - die("error in size of file '%s'", path); + record_file(path, 4); + out: put_tracing_file(path); } -- cgit v0.10.2 From 4a9bd3f134decd6d16ead8d288342d57aad486be Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Jul 2011 16:36:53 -0400 Subject: tracing: Have dynamic size event stack traces Currently the stack trace per event in ftace is only 8 frames. This can be quite limiting and sometimes useless. Especially when the "ignore frames" is wrong and we also use up stack frames for the event processing itself. Change this to be dynamic by adding a percpu buffer that we can write a large stack frame into and then copy into the ring buffer. For interrupts and NMIs that come in while another event is being process, will only get to use the 8 frame stack. That should be enough as the task that it interrupted will have the full stack frame anyway. Requested-by: Thomas Gleixner Signed-off-by: Steven Rostedt diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index b1e69ee..96efa67 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -76,6 +76,7 @@ struct trace_iterator { struct trace_entry *ent; unsigned long lost_events; int leftover; + int ent_size; int cpu; u64 ts; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d9c1612..e5df02c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1248,6 +1248,15 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, } #ifdef CONFIG_STACKTRACE + +#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) +struct ftrace_stack { + unsigned long calls[FTRACE_STACK_MAX_ENTRIES]; +}; + +static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); +static DEFINE_PER_CPU(int, ftrace_stack_reserve); + static void __ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs) @@ -1256,25 +1265,77 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; + int use_stack; + int size = FTRACE_STACK_ENTRIES; + + trace.nr_entries = 0; + trace.skip = skip; + + /* + * Since events can happen in NMIs there's no safe way to + * use the per cpu ftrace_stacks. We reserve it and if an interrupt + * or NMI comes in, it will just have to use the default + * FTRACE_STACK_SIZE. + */ + preempt_disable_notrace(); + + use_stack = ++__get_cpu_var(ftrace_stack_reserve); + /* + * We don't need any atomic variables, just a barrier. + * If an interrupt comes in, we don't care, because it would + * have exited and put the counter back to what we want. + * We just need a barrier to keep gcc from moving things + * around. + */ + barrier(); + if (use_stack == 1) { + trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; + trace.max_entries = FTRACE_STACK_MAX_ENTRIES; + + if (regs) + save_stack_trace_regs(regs, &trace); + else + save_stack_trace(&trace); + + if (trace.nr_entries > size) + size = trace.nr_entries; + } else + /* From now on, use_stack is a boolean */ + use_stack = 0; + + size *= sizeof(unsigned long); event = trace_buffer_lock_reserve(buffer, TRACE_STACK, - sizeof(*entry), flags, pc); + sizeof(*entry) + size, flags, pc); if (!event) - return; - entry = ring_buffer_event_data(event); - memset(&entry->caller, 0, sizeof(entry->caller)); + goto out; + entry = ring_buffer_event_data(event); - trace.nr_entries = 0; - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.skip = skip; - trace.entries = entry->caller; + memset(&entry->caller, 0, size); + + if (use_stack) + memcpy(&entry->caller, trace.entries, + trace.nr_entries * sizeof(unsigned long)); + else { + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.entries = entry->caller; + if (regs) + save_stack_trace_regs(regs, &trace); + else + save_stack_trace(&trace); + } + + entry->size = trace.nr_entries; - if (regs) - save_stack_trace_regs(regs, &trace); - else - save_stack_trace(&trace); if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); + + out: + /* Again, don't let gcc optimize things here */ + barrier(); + __get_cpu_var(ftrace_stack_reserve)--; + preempt_enable_notrace(); + } void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, @@ -1562,7 +1623,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, ftrace_enable_cpu(); - return event ? ring_buffer_event_data(event) : NULL; + if (event) { + iter->ent_size = ring_buffer_event_length(event); + return ring_buffer_event_data(event); + } + iter->ent_size = 0; + return NULL; } static struct trace_entry * diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e32744c..9336590 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -161,7 +161,8 @@ FTRACE_ENTRY(kernel_stack, stack_entry, TRACE_STACK, F_STRUCT( - __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + __field( int, size ) + __dynamic_array(unsigned long, caller ) ), F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index e37de49..5199930 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1107,19 +1107,20 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, { struct stack_entry *field; struct trace_seq *s = &iter->seq; - int i; + unsigned long *p; + unsigned long *end; trace_assign_type(field, iter->ent); + end = (unsigned long *)((long)iter->ent + iter->ent_size); if (!trace_seq_puts(s, "\n")) goto partial; - for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) - break; + + for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { if (!trace_seq_puts(s, " => ")) goto partial; - if (!seq_print_ip_sym(s, field->caller[i], flags)) + if (!seq_print_ip_sym(s, *p, flags)) goto partial; if (!trace_seq_puts(s, "\n")) goto partial; -- cgit v0.10.2 From f91298709790b9a483752ca3c967845537df2af3 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 9 Jul 2011 00:17:12 +0400 Subject: perf, x86: P4 PMU - Introduce event alias feature Instead of hw_nmi_watchdog_set_attr() weak function and appropriate x86_pmu::hw_watchdog_set_attr() call we introduce even alias mechanism which allow us to drop this routines completely and isolate quirks of Netburst architecture inside P4 PMU code only. The main idea remains the same though -- to allow nmi-watchdog and perf top run simultaneously. Note the aliasing mechanism applies to generic PERF_COUNT_HW_CPU_CYCLES event only because arbitrary event (say passed as RAW initially) might have some additional bits set inside ESCR register changing the behaviour of event and we can't guarantee anymore that alias event will give the same result. P.S. Thanks a huge to Don and Steven for for testing and early review. Acked-by: Don Zickus Tested-by: Steven Rostedt Signed-off-by: Cyrill Gorcunov CC: Ingo Molnar CC: Peter Zijlstra CC: Stephane Eranian CC: Lin Ming CC: Arnaldo Carvalho de Melo CC: Frederic Weisbecker Link: http://lkml.kernel.org/r/20110708201712.GS23657@sun Signed-off-by: Steven Rostedt diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 56fd9e3..4d86c86 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -102,6 +102,14 @@ #define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) /* + * If an event has alias it should be marked + * with a special bit. (Don't forget to check + * P4_PEBS_CONFIG_MASK and related bits on + * modification.) + */ +#define P4_CONFIG_ALIASABLE (1 << 9) + +/* * The bits we allow to pass for RAW events */ #define P4_CONFIG_MASK_ESCR \ @@ -123,6 +131,31 @@ (p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \ (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR)) +/* + * In case of event aliasing we need to preserve some + * caller bits otherwise the mapping won't be complete. + */ +#define P4_CONFIG_EVENT_ALIAS_MASK \ + (p4_config_pack_escr(P4_CONFIG_MASK_ESCR) | \ + p4_config_pack_cccr(P4_CCCR_EDGE | \ + P4_CCCR_THRESHOLD_MASK | \ + P4_CCCR_COMPLEMENT | \ + P4_CCCR_COMPARE)) + +#define P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS \ + ((P4_CONFIG_HT) | \ + p4_config_pack_escr(P4_ESCR_T0_OS | \ + P4_ESCR_T0_USR | \ + P4_ESCR_T1_OS | \ + P4_ESCR_T1_USR) | \ + p4_config_pack_cccr(P4_CCCR_OVF | \ + P4_CCCR_CASCADE | \ + P4_CCCR_FORCE_OVF | \ + P4_CCCR_THREAD_ANY | \ + P4_CCCR_OVF_PMI_T0 | \ + P4_CCCR_OVF_PMI_T1 | \ + P4_CONFIG_ALIASABLE)) + static inline bool p4_is_event_cascaded(u64 config) { u32 cccr = p4_config_unpack_cccr(config); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c53d433..b7a010f 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -274,7 +274,6 @@ struct x86_pmu { void (*enable_all)(int added); void (*enable)(struct perf_event *); void (*disable)(struct perf_event *); - void (*hw_watchdog_set_attr)(struct perf_event_attr *attr); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; @@ -360,12 +359,6 @@ static u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; -void hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) -{ - if (x86_pmu.hw_watchdog_set_attr) - x86_pmu.hw_watchdog_set_attr(wd_attr); -} - /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index fb901c5..8b7a0c3 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -570,11 +570,92 @@ static __initconst const u64 p4_hw_cache_event_ids }, }; +/* + * Because of Netburst being quite restricted in now + * many same events can run simultaneously, we use + * event aliases, ie different events which have the + * same functionallity but use non-intersected resources + * (ESCR/CCCR/couter registers). This allow us to run + * two or more semi-same events together. It is done + * transparently to a user space. + * + * Never set any cusom internal bits such as P4_CONFIG_HT, + * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are + * either up-to-dated automatically either not appliable + * at all. + * + * And be really carefull choosing aliases! + */ +struct p4_event_alias { + u64 orig; + u64 alter; +} p4_event_aliases[] = { + { + /* + * Non-halted cycles can be substituted with + * non-sleeping cycles (see Intel SDM Vol3b for + * details). + */ + .orig = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), + .alter = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))| + p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | + P4_CCCR_COMPARE), + }, +}; + +static u64 p4_get_alias_event(u64 config) +{ + u64 config_match; + int i; + + /* + * Probably we're lucky and don't have to do + * matching over all config bits. + */ + if (!(config & P4_CONFIG_ALIASABLE)) + return 0; + + config_match = config & P4_CONFIG_EVENT_ALIAS_MASK; + + /* + * If an event was previously swapped to the alter config + * we should swap it back otherwise contnention on registers + * will return back. + */ + for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) { + if (config_match == p4_event_aliases[i].orig) { + config_match = p4_event_aliases[i].alter; + break; + } else if (config_match == p4_event_aliases[i].alter) { + config_match = p4_event_aliases[i].orig; + break; + } + } + + if (i >= ARRAY_SIZE(p4_event_aliases)) + return 0; + + return config_match | + (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); +} + static u64 p4_general_events[PERF_COUNT_HW_MAX] = { /* non-halted CPU clocks */ [PERF_COUNT_HW_CPU_CYCLES] = p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | - P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) | + P4_CONFIG_ALIASABLE, /* * retired instructions @@ -719,31 +800,6 @@ static int p4_validate_raw_event(struct perf_event *event) return 0; } -static void p4_hw_watchdog_set_attr(struct perf_event_attr *wd_attr) -{ - /* - * Watchdog ticks are special on Netburst, we use - * that named "non-sleeping" ticks as recommended - * by Intel SDM Vol3b. - */ - WARN_ON_ONCE(wd_attr->type != PERF_TYPE_HARDWARE || - wd_attr->config != PERF_COUNT_HW_CPU_CYCLES); - - wd_attr->type = PERF_TYPE_RAW; - wd_attr->config = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3)) | - p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | - P4_CCCR_COMPARE); -} - static int p4_hw_config(struct perf_event *event) { int cpu = get_cpu(); @@ -1159,6 +1215,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign struct p4_event_bind *bind; unsigned int i, thread, num; int cntr_idx, escr_idx; + u64 config_alias; + int pass; bitmap_zero(used_mask, X86_PMC_IDX_MAX); bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); @@ -1167,6 +1225,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign hwc = &cpuc->event_list[i]->hw; thread = p4_ht_thread(cpu); + pass = 0; + +again: + /* + * Aliases are swappable so we may hit circular + * lock if both original config and alias need + * resources (MSR registers) which already busy. + */ + if (pass > 2) + goto done; + bind = p4_config_get_bind(hwc->config); escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); if (unlikely(escr_idx == -1)) @@ -1180,8 +1249,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign } cntr_idx = p4_next_cntr(thread, used_mask, bind); - if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) - goto done; + if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) { + /* + * Probably an event alias is still available. + */ + config_alias = p4_get_alias_event(hwc->config); + if (!config_alias) + goto done; + hwc->config = config_alias; + pass++; + goto again; + } p4_pmu_swap_config_ts(hwc, cpu); if (assign) @@ -1218,7 +1296,6 @@ static __initconst const struct x86_pmu p4_pmu = { .cntval_bits = ARCH_P4_CNTRVAL_BITS, .cntval_mask = ARCH_P4_CNTRVAL_MASK, .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, - .hw_watchdog_set_attr = p4_hw_watchdog_set_attr, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, /* diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a933e3a..36491cd 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -200,7 +200,6 @@ static int is_softlockup(unsigned long touch_ts) } #ifdef CONFIG_HARDLOCKUP_DETECTOR -void __weak hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) { } static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, @@ -372,7 +371,6 @@ static int watchdog_nmi_enable(int cpu) wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); - hw_nmi_watchdog_set_attr(wd_attr); /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); -- cgit v0.10.2 From 7143f168e2aa4bc7f8a8bcfe46d8dc52f7be869a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:26:36 +0900 Subject: tracing/kprobes: Rename probe_* to trace_probe_* Rename probe_* to trace_probe_* for avoiding namespace confliction. This also fixes improper names of find_probe_event() and cleanup_all_probes() to find_trace_probe() and release_all_trace_probes() respectively. Signed-off-by: Masami Hiramatsu Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20110627072636.6528.60374.stgit@fedora15 Signed-off-by: Steven Rostedt diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7db7b68..14b88ed 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -555,12 +555,12 @@ struct trace_probe { (sizeof(struct probe_arg) * (n))) -static __kprobes int probe_is_return(struct trace_probe *tp) +static __kprobes int trace_probe_is_return(struct trace_probe *tp) { return tp->rp.handler != NULL; } -static __kprobes const char *probe_symbol(struct trace_probe *tp) +static __kprobes const char *trace_probe_symbol(struct trace_probe *tp) { return tp->symbol ? tp->symbol : "unknown"; } @@ -671,7 +671,7 @@ static void free_trace_probe(struct trace_probe *tp) kfree(tp); } -static struct trace_probe *find_probe_event(const char *event, +static struct trace_probe *find_trace_probe(const char *event, const char *group) { struct trace_probe *tp; @@ -686,7 +686,7 @@ static struct trace_probe *find_probe_event(const char *event, /* Unregister a trace_probe and probe_event: call with locking probe_lock */ static void unregister_trace_probe(struct trace_probe *tp) { - if (probe_is_return(tp)) + if (trace_probe_is_return(tp)) unregister_kretprobe(&tp->rp); else unregister_kprobe(&tp->rp.kp); @@ -703,7 +703,7 @@ static int register_trace_probe(struct trace_probe *tp) mutex_lock(&probe_lock); /* register as an event */ - old_tp = find_probe_event(tp->call.name, tp->call.class->system); + old_tp = find_trace_probe(tp->call.name, tp->call.class->system); if (old_tp) { /* delete old event */ unregister_trace_probe(old_tp); @@ -716,7 +716,7 @@ static int register_trace_probe(struct trace_probe *tp) } tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; - if (probe_is_return(tp)) + if (trace_probe_is_return(tp)) ret = register_kretprobe(&tp->rp); else ret = register_kprobe(&tp->rp.kp); @@ -1025,7 +1025,7 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } mutex_lock(&probe_lock); - tp = find_probe_event(event, group); + tp = find_trace_probe(event, group); if (!tp) { mutex_unlock(&probe_lock); pr_info("Event %s/%s doesn't exist.\n", group, event); @@ -1144,7 +1144,7 @@ error: return ret; } -static void cleanup_all_probes(void) +static void release_all_trace_probes(void) { struct trace_probe *tp; @@ -1181,15 +1181,16 @@ static int probes_seq_show(struct seq_file *m, void *v) struct trace_probe *tp = v; int i; - seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); + seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p'); seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); if (!tp->symbol) seq_printf(m, " 0x%p", tp->rp.kp.addr); else if (tp->rp.kp.offset) - seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); + seq_printf(m, " %s+%u", trace_probe_symbol(tp), + tp->rp.kp.offset); else - seq_printf(m, " %s", probe_symbol(tp)); + seq_printf(m, " %s", trace_probe_symbol(tp)); for (i = 0; i < tp->nr_args; i++) seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); @@ -1209,7 +1210,7 @@ static int probes_open(struct inode *inode, struct file *file) { if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - cleanup_all_probes(); + release_all_trace_probes(); return seq_open(file, &probes_seq_op); } @@ -1518,7 +1519,7 @@ static int probe_event_enable(struct ftrace_event_call *call) struct trace_probe *tp = (struct trace_probe *)call->data; tp->flags |= TP_FLAG_TRACE; - if (probe_is_return(tp)) + if (trace_probe_is_return(tp)) return enable_kretprobe(&tp->rp); else return enable_kprobe(&tp->rp.kp); @@ -1530,7 +1531,7 @@ static void probe_event_disable(struct ftrace_event_call *call) tp->flags &= ~TP_FLAG_TRACE; if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { - if (probe_is_return(tp)) + if (trace_probe_is_return(tp)) disable_kretprobe(&tp->rp); else disable_kprobe(&tp->rp.kp); @@ -1598,7 +1599,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) const char *fmt, *arg; - if (!probe_is_return(tp)) { + if (!trace_probe_is_return(tp)) { fmt = "(%lx)"; arg = "REC->" FIELD_STRING_IP; } else { @@ -1722,7 +1723,7 @@ static int probe_perf_enable(struct ftrace_event_call *call) tp->flags |= TP_FLAG_PROFILE; - if (probe_is_return(tp)) + if (trace_probe_is_return(tp)) return enable_kretprobe(&tp->rp); else return enable_kprobe(&tp->rp.kp); @@ -1735,7 +1736,7 @@ static void probe_perf_disable(struct ftrace_event_call *call) tp->flags &= ~TP_FLAG_PROFILE; if (!(tp->flags & TP_FLAG_TRACE)) { - if (probe_is_return(tp)) + if (trace_probe_is_return(tp)) disable_kretprobe(&tp->rp); else disable_kprobe(&tp->rp.kp); @@ -1807,7 +1808,7 @@ static int register_probe_event(struct trace_probe *tp) /* Initialize ftrace_event_call */ INIT_LIST_HEAD(&call->class->fields); - if (probe_is_return(tp)) { + if (trace_probe_is_return(tp)) { call->event.funcs = &kretprobe_funcs; call->class->define_fields = kretprobe_event_define_fields; } else { @@ -1899,7 +1900,7 @@ static __init int kprobe_trace_self_tests_init(void) warn++; } else { /* Enable trace point */ - tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM); + tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); if (WARN_ON_ONCE(tp == NULL)) { pr_warning("error on getting new probe.\n"); warn++; @@ -1914,7 +1915,7 @@ static __init int kprobe_trace_self_tests_init(void) warn++; } else { /* Enable trace point */ - tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM); + tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); if (WARN_ON_ONCE(tp == NULL)) { pr_warning("error on getting new probe.\n"); warn++; @@ -1940,7 +1941,7 @@ static __init int kprobe_trace_self_tests_init(void) } end: - cleanup_all_probes(); + release_all_trace_probes(); if (warn) pr_cont("NG: Some tests are failed. Please check them.\n"); else -- cgit v0.10.2 From f7bc8b61f65726ff98f52e286b28e294499d7a08 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Jul 2011 23:02:27 -0400 Subject: ftrace: Fix regression where ftrace breaks when modules are loaded Enabling function tracer to trace all functions, then load a module and then disable function tracing will cause ftrace to fail. This can also happen by enabling function tracing on the command line: ftrace=function and during boot up, modules are loaded, then you disable function tracing with 'echo nop > current_tracer' you will trigger a bug in ftrace that will shut itself down. The reason is, the new ftrace code keeps ref counts of all ftrace_ops that are registered for tracing. When one or more ftrace_ops are registered, all the records that represent the functions that the ftrace_ops will trace have a ref count incremented. If this ref count is not zero, when the code modification runs, that function will be enabled for tracing. If the ref count is zero, that function will be disabled from tracing. To make sure the accounting was working, FTRACE_WARN_ON()s were added to updating of the ref counts. If the ref count hits its max (> 2^30 ftrace_ops added), or if the ref count goes below zero, a FTRACE_WARN_ON() is triggered which disables all modification of code. Since it is common for ftrace_ops to trace all functions in the kernel, instead of creating > 20,000 hash items for the ftrace_ops, the hash count is just set to zero, and it represents that the ftrace_ops is to trace all functions. This is where the issues arrise. If you enable function tracing to trace all functions, and then add a module, the modules function records do not get the ref count updated. When the function tracer is disabled, all function records ref counts are subtracted. Since the modules never had their ref counts incremented, they go below zero and the FTRACE_WARN_ON() is triggered. The solution to this is rather simple. When modules are loaded, and their functions are added to the the ftrace pool, look to see if any ftrace_ops are registered that trace all functions. And for those, update the ref count for the module function records. Reported-by: Thomas Gleixner Signed-off-by: Steven Rostedt diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1c4c0b0..ef9271b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1744,10 +1744,36 @@ static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; +static int ops_traces_mod(struct ftrace_ops *ops) +{ + struct ftrace_hash *hash; + + hash = ops->filter_hash; + return !!(!hash || !hash->count); +} + static int ftrace_update_code(struct module *mod) { struct dyn_ftrace *p; cycle_t start, stop; + unsigned long ref = 0; + + /* + * When adding a module, we need to check if tracers are + * currently enabled and if they are set to trace all functions. + * If they are, we need to enable the module functions as well + * as update the reference counts for those function records. + */ + if (mod) { + struct ftrace_ops *ops; + + for (ops = ftrace_ops_list; + ops != &ftrace_list_end; ops = ops->next) { + if (ops->flags & FTRACE_OPS_FL_ENABLED && + ops_traces_mod(ops)) + ref++; + } + } start = ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; @@ -1760,7 +1786,7 @@ static int ftrace_update_code(struct module *mod) p = ftrace_new_addrs; ftrace_new_addrs = p->newlist; - p->flags = 0L; + p->flags = ref; /* * Do the initial record conversion from mcount jump @@ -1783,7 +1809,7 @@ static int ftrace_update_code(struct module *mod) * conversion puts the module to the correct state, thus * passing the ftrace_make_call check. */ - if (ftrace_start_up) { + if (ftrace_start_up && ref) { int failed = __ftrace_replace_code(p, 1); if (failed) { ftrace_bug(failed, p->ip); -- cgit v0.10.2 From 1538f888f1e793de04e0f90372352ac1b05833cf Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:26:44 +0900 Subject: tracing/kprobes: Merge trace probe enable/disable functions Merge redundant enable/disable functions into enable_trace_probe() and disable_trace_probe(). Signed-off-by: Masami Hiramatsu Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: yrl.pp-manager.tt@hitachi.com Cc: Frederic Weisbecker Cc: Ingo Molnar Link: http://lkml.kernel.org/r/20110627072644.6528.26910.stgit@fedora15 [ converted kprobe selftest to use enable_trace_probe ] Signed-off-by: Steven Rostedt diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 14b88ed..3be50c5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -683,6 +683,34 @@ static struct trace_probe *find_trace_probe(const char *event, return NULL; } +/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ +static int enable_trace_probe(struct trace_probe *tp, int flag) +{ + int ret = 0; + + tp->flags |= flag; + if (tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)) { + if (trace_probe_is_return(tp)) + ret = enable_kretprobe(&tp->rp); + else + ret = enable_kprobe(&tp->rp.kp); + } + + return ret; +} + +/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ +static void disable_trace_probe(struct trace_probe *tp, int flag) +{ + tp->flags &= ~flag; + if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { + if (trace_probe_is_return(tp)) + disable_kretprobe(&tp->rp); + else + disable_kprobe(&tp->rp.kp); + } +} + /* Unregister a trace_probe and probe_event: call with locking probe_lock */ static void unregister_trace_probe(struct trace_probe *tp) { @@ -1514,30 +1542,6 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } -static int probe_event_enable(struct ftrace_event_call *call) -{ - struct trace_probe *tp = (struct trace_probe *)call->data; - - tp->flags |= TP_FLAG_TRACE; - if (trace_probe_is_return(tp)) - return enable_kretprobe(&tp->rp); - else - return enable_kprobe(&tp->rp.kp); -} - -static void probe_event_disable(struct ftrace_event_call *call) -{ - struct trace_probe *tp = (struct trace_probe *)call->data; - - tp->flags &= ~TP_FLAG_TRACE; - if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { - if (trace_probe_is_return(tp)) - disable_kretprobe(&tp->rp); - else - disable_kprobe(&tp->rp.kp); - } -} - #undef DEFINE_FIELD #define DEFINE_FIELD(type, item, name, is_signed) \ do { \ @@ -1716,49 +1720,25 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); } - -static int probe_perf_enable(struct ftrace_event_call *call) -{ - struct trace_probe *tp = (struct trace_probe *)call->data; - - tp->flags |= TP_FLAG_PROFILE; - - if (trace_probe_is_return(tp)) - return enable_kretprobe(&tp->rp); - else - return enable_kprobe(&tp->rp.kp); -} - -static void probe_perf_disable(struct ftrace_event_call *call) -{ - struct trace_probe *tp = (struct trace_probe *)call->data; - - tp->flags &= ~TP_FLAG_PROFILE; - - if (!(tp->flags & TP_FLAG_TRACE)) { - if (trace_probe_is_return(tp)) - disable_kretprobe(&tp->rp); - else - disable_kprobe(&tp->rp.kp); - } -} #endif /* CONFIG_PERF_EVENTS */ static __kprobes int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) { + struct trace_probe *tp = (struct trace_probe *)event->data; + switch (type) { case TRACE_REG_REGISTER: - return probe_event_enable(event); + return enable_trace_probe(tp, TP_FLAG_TRACE); case TRACE_REG_UNREGISTER: - probe_event_disable(event); + disable_trace_probe(tp, TP_FLAG_TRACE); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return probe_perf_enable(event); + return enable_trace_probe(tp, TP_FLAG_PROFILE); case TRACE_REG_PERF_UNREGISTER: - probe_perf_disable(event); + disable_trace_probe(tp, TP_FLAG_PROFILE); return 0; #endif } @@ -1905,7 +1885,7 @@ static __init int kprobe_trace_self_tests_init(void) pr_warning("error on getting new probe.\n"); warn++; } else - probe_event_enable(&tp->call); + enable_trace_probe(tp, TP_FLAG_TRACE); } ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " @@ -1920,7 +1900,7 @@ static __init int kprobe_trace_self_tests_init(void) pr_warning("error on getting new probe.\n"); warn++; } else - probe_event_enable(&tp->call); + enable_trace_probe(tp, TP_FLAG_TRACE); } if (warn) -- cgit v0.10.2 From bc81d48d13d8839fae6833c95794c403b2133f36 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:26:50 +0900 Subject: kprobes: Return -ENOENT if probe point doesn't exist Return -ENOENT if probe point doesn't exist, but still returns -EINVAL if both of kprobe->addr and kprobe->symbol_name are specified or both are not specified. Acked-by: Ananth N Mavinakayanahalli Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Anil S Keshavamurthy Cc: "David S. Miller" Link: http://lkml.kernel.org/r/20110627072650.6528.67329.stgit@fedora15 Signed-off-by: Steven Rostedt diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 7798181..b30fd54 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1255,19 +1255,29 @@ static int __kprobes in_kprobes_functions(unsigned long addr) /* * If we have a symbol_name argument, look it up and add the offset field * to it. This way, we can specify a relative address to a symbol. + * This returns encoded errors if it fails to look up symbol or invalid + * combination of parameters. */ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) { kprobe_opcode_t *addr = p->addr; + + if ((p->symbol_name && p->addr) || + (!p->symbol_name && !p->addr)) + goto invalid; + if (p->symbol_name) { - if (addr) - return NULL; kprobe_lookup_name(p->symbol_name, addr); + if (!addr) + return ERR_PTR(-ENOENT); } - if (!addr) - return NULL; - return (kprobe_opcode_t *)(((char *)addr) + p->offset); + addr = (kprobe_opcode_t *)(((char *)addr) + p->offset); + if (addr) + return addr; + +invalid: + return ERR_PTR(-EINVAL); } /* Check passed kprobe is valid and return kprobe in kprobe_table. */ @@ -1311,8 +1321,8 @@ int __kprobes register_kprobe(struct kprobe *p) kprobe_opcode_t *addr; addr = kprobe_addr(p); - if (!addr) - return -EINVAL; + if (IS_ERR(addr)) + return PTR_ERR(addr); p->addr = addr; ret = check_kprobe_rereg(p); @@ -1335,6 +1345,8 @@ int __kprobes register_kprobe(struct kprobe *p) */ probed_mod = __module_text_address((unsigned long) p->addr); if (probed_mod) { + /* Return -ENOENT if fail. */ + ret = -ENOENT; /* * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. @@ -1351,6 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p) module_put(probed_mod); goto fail_with_jump_label; } + /* ret will be updated by following code */ } preempt_enable(); jump_label_unlock(); @@ -1399,7 +1412,7 @@ out: fail_with_jump_label: preempt_enable(); jump_label_unlock(); - return -EINVAL; + return ret; } EXPORT_SYMBOL_GPL(register_kprobe); @@ -1686,8 +1699,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp) if (kretprobe_blacklist_size) { addr = kprobe_addr(&rp->kp); - if (!addr) - return -EINVAL; + if (IS_ERR(addr)) + return PTR_ERR(addr); for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { if (kretprobe_blacklist[i].addr == addr) -- cgit v0.10.2 From 614243181050436785f5a621749a7da2336a7916 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:26:56 +0900 Subject: tracing/kprobes: Support module init function probing To support probing module init functions, kprobe-tracer allows user to define a probe on non-existed function when it is given with a module name. This also enables user to set a probe on a function on a specific module, even if a same name (but different) function is locally defined in another module. The module name must be in the front of function name and separated by a ':'. e.g. btrfs:btrfs_init_sysfs Signed-off-by: Masami Hiramatsu Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20110627072656.6528.89970.stgit@fedora15 Signed-off-by: Steven Rostedt diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index c83bd6b..d0d0bb9 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -22,14 +22,15 @@ current_tracer. Instead of that, add probe points via Synopsis of kprobe_events ------------------------- - p[:[GRP/]EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS] : Set a probe - r[:[GRP/]EVENT] SYMBOL[+0] [FETCHARGS] : Set a return probe + p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe + r[:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe -:[GRP/]EVENT : Clear a probe GRP : Group name. If omitted, use "kprobes" for it. EVENT : Event name. If omitted, the event name is generated - based on SYMBOL+offs or MEMADDR. - SYMBOL[+offs] : Symbol+offset where the probe is inserted. + based on SYM+offs or MEMADDR. + MOD : Module name which has given SYM. + SYM[+offs] : Symbol+offset where the probe is inserted. MEMADDR : Address where the probe is inserted. FETCHARGS : Arguments. Each probe can have up to 128 args. diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3be50c5..acc6664 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -536,6 +536,7 @@ struct probe_arg { /* Flags for trace_probe */ #define TP_FLAG_TRACE 1 #define TP_FLAG_PROFILE 2 +#define TP_FLAG_REGISTERED 4 struct trace_probe { struct list_head list; @@ -565,6 +566,39 @@ static __kprobes const char *trace_probe_symbol(struct trace_probe *tp) return tp->symbol ? tp->symbol : "unknown"; } +static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp) +{ + return tp->rp.kp.offset; +} + +static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp) +{ + return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); +} + +static __kprobes bool trace_probe_is_registered(struct trace_probe *tp) +{ + return !!(tp->flags & TP_FLAG_REGISTERED); +} + +static __kprobes bool trace_probe_has_gone(struct trace_probe *tp) +{ + return !!(kprobe_gone(&tp->rp.kp)); +} + +static __kprobes bool trace_probe_within_module(struct trace_probe *tp, + struct module *mod) +{ + int len = strlen(mod->name); + const char *name = trace_probe_symbol(tp); + return strncmp(mod->name, name, len) == 0 && name[len] == ':'; +} + +static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp) +{ + return !!strchr(trace_probe_symbol(tp), ':'); +} + static int register_probe_event(struct trace_probe *tp); static void unregister_probe_event(struct trace_probe *tp); @@ -689,7 +723,8 @@ static int enable_trace_probe(struct trace_probe *tp, int flag) int ret = 0; tp->flags |= flag; - if (tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)) { + if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && + !trace_probe_has_gone(tp)) { if (trace_probe_is_return(tp)) ret = enable_kretprobe(&tp->rp); else @@ -703,7 +738,7 @@ static int enable_trace_probe(struct trace_probe *tp, int flag) static void disable_trace_probe(struct trace_probe *tp, int flag) { tp->flags &= ~flag; - if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { + if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { if (trace_probe_is_return(tp)) disable_kretprobe(&tp->rp); else @@ -711,13 +746,64 @@ static void disable_trace_probe(struct trace_probe *tp, int flag) } } -/* Unregister a trace_probe and probe_event: call with locking probe_lock */ -static void unregister_trace_probe(struct trace_probe *tp) +/* Internal register function - just handle k*probes and flags */ +static int __register_trace_probe(struct trace_probe *tp) { + int ret; + + if (trace_probe_is_registered(tp)) + return -EINVAL; + + /* Set/clear disabled flag according to tp->flag */ + if (trace_probe_is_enabled(tp)) + tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; + else + tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; + if (trace_probe_is_return(tp)) - unregister_kretprobe(&tp->rp); + ret = register_kretprobe(&tp->rp); else - unregister_kprobe(&tp->rp.kp); + ret = register_kprobe(&tp->rp.kp); + + if (ret == 0) + tp->flags |= TP_FLAG_REGISTERED; + else { + pr_warning("Could not insert probe at %s+%lu: %d\n", + trace_probe_symbol(tp), trace_probe_offset(tp), ret); + if (ret == -ENOENT && trace_probe_is_on_module(tp)) { + pr_warning("This probe might be able to register after" + "target module is loaded. Continue.\n"); + ret = 0; + } else if (ret == -EILSEQ) { + pr_warning("Probing address(0x%p) is not an " + "instruction boundary.\n", + tp->rp.kp.addr); + ret = -EINVAL; + } + } + + return ret; +} + +/* Internal unregister function - just handle k*probes and flags */ +static void __unregister_trace_probe(struct trace_probe *tp) +{ + if (trace_probe_is_registered(tp)) { + if (trace_probe_is_return(tp)) + unregister_kretprobe(&tp->rp); + else + unregister_kprobe(&tp->rp.kp); + tp->flags &= ~TP_FLAG_REGISTERED; + /* Cleanup kprobe for reuse */ + if (tp->rp.kp.symbol_name) + tp->rp.kp.addr = NULL; + } +} + +/* Unregister a trace_probe and probe_event: call with locking probe_lock */ +static void unregister_trace_probe(struct trace_probe *tp) +{ + __unregister_trace_probe(tp); list_del(&tp->list); unregister_probe_event(tp); } @@ -730,41 +816,65 @@ static int register_trace_probe(struct trace_probe *tp) mutex_lock(&probe_lock); - /* register as an event */ + /* Delete old (same name) event if exist */ old_tp = find_trace_probe(tp->call.name, tp->call.class->system); if (old_tp) { - /* delete old event */ unregister_trace_probe(old_tp); free_trace_probe(old_tp); } + + /* Register new event */ ret = register_probe_event(tp); if (ret) { pr_warning("Failed to register probe event(%d)\n", ret); goto end; } - tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; - if (trace_probe_is_return(tp)) - ret = register_kretprobe(&tp->rp); - else - ret = register_kprobe(&tp->rp.kp); - - if (ret) { - pr_warning("Could not insert probe(%d)\n", ret); - if (ret == -EILSEQ) { - pr_warning("Probing address(0x%p) is not an " - "instruction boundary.\n", - tp->rp.kp.addr); - ret = -EINVAL; - } + /* Register k*probe */ + ret = __register_trace_probe(tp); + if (ret < 0) unregister_probe_event(tp); - } else + else list_add_tail(&tp->list, &probe_list); + end: mutex_unlock(&probe_lock); return ret; } +/* Module notifier call back, checking event on the module */ +static int trace_probe_module_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + struct trace_probe *tp; + int ret; + + if (val != MODULE_STATE_COMING) + return NOTIFY_DONE; + + /* Update probes on coming module */ + mutex_lock(&probe_lock); + list_for_each_entry(tp, &probe_list, list) { + if (trace_probe_within_module(tp, mod)) { + __unregister_trace_probe(tp); + ret = __register_trace_probe(tp); + if (ret) + pr_warning("Failed to re-register probe %s on" + "%s: %d\n", + tp->call.name, mod->name, ret); + } + } + mutex_unlock(&probe_lock); + + return NOTIFY_DONE; +} + +static struct notifier_block trace_probe_module_nb = { + .notifier_call = trace_probe_module_callback, + .priority = 1 /* Invoked after kprobe module callback */ +}; + /* Split symbol and offset. */ static int split_symbol_offset(char *symbol, unsigned long *offset) { @@ -990,8 +1100,8 @@ static int create_trace_probe(int argc, char **argv) { /* * Argument syntax: - * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] - * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] + * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] + * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] * Fetch args: * $retval : fetch return value * $stack : fetch stack address @@ -1186,7 +1296,6 @@ static void release_all_trace_probes(void) mutex_unlock(&probe_lock); } - /* Probes listing interfaces */ static void *probes_seq_start(struct seq_file *m, loff_t *pos) { @@ -1827,6 +1936,9 @@ static __init int init_kprobe_trace(void) struct dentry *d_tracer; struct dentry *entry; + if (register_module_notifier(&trace_probe_module_nb)) + return -EINVAL; + d_tracer = tracing_init_dentry(); if (!d_tracer) return 0; -- cgit v0.10.2 From 7f6878a3d707b947603e09d95df0c3a98987e3a4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:27:03 +0900 Subject: tracing/kprobe: Update symbol reference when loading module Since the address of a module-local variable can only be solved after the target module is loaded, the symbol fetch-argument should be updated when loading target module. Signed-off-by: Masami Hiramatsu Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20110627072703.6528.75042.stgit@fedora15 Signed-off-by: Steven Rostedt diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index acc6664..5fb3697 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -343,6 +343,14 @@ DEFINE_BASIC_FETCH_FUNCS(deref) DEFINE_FETCH_deref(string) DEFINE_FETCH_deref(string_size) +static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) +{ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + update_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + update_symbol_cache(data->orig.data); +} + static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) { if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) @@ -377,6 +385,19 @@ DEFINE_BASIC_FETCH_FUNCS(bitfield) #define fetch_bitfield_string_size NULL static __kprobes void +update_bitfield_fetch_param(struct bitfield_fetch_param *data) +{ + /* + * Don't check the bitfield itself, because this must be the + * last fetch function. + */ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + update_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + update_symbol_cache(data->orig.data); +} + +static __kprobes void free_bitfield_fetch_param(struct bitfield_fetch_param *data) { /* @@ -389,6 +410,7 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data) free_symbol_cache(data->orig.data); kfree(data); } + /* Default (unsigned long) fetch type */ #define __DEFAULT_FETCH_TYPE(t) u##t #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) @@ -680,6 +702,16 @@ error: return ERR_PTR(ret); } +static void update_probe_arg(struct probe_arg *arg) +{ + if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) + update_bitfield_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) + update_deref_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) + update_symbol_cache(arg->fetch.data); +} + static void free_probe_arg(struct probe_arg *arg) { if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) @@ -749,11 +781,14 @@ static void disable_trace_probe(struct trace_probe *tp, int flag) /* Internal register function - just handle k*probes and flags */ static int __register_trace_probe(struct trace_probe *tp) { - int ret; + int i, ret; if (trace_probe_is_registered(tp)) return -EINVAL; + for (i = 0; i < tp->nr_args; i++) + update_probe_arg(&tp->args[i]); + /* Set/clear disabled flag according to tp->flag */ if (trace_probe_is_enabled(tp)) tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; -- cgit v0.10.2 From baad2d3e69ba154dae340904a47ae12414f1894f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:27:09 +0900 Subject: perf probe: Rename DIE_FIND_CB_FOUND to DIE_FIND_CB_END Since die_find/walk* callbacks use DIE_FIND_CB_FOUND for both of failed and found cases, it should be "END" instead "FOUND" for avoiding confusion. Signed-off-by: Masami Hiramatsu Reported-by: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Ingo Molnar Link: http://lkml.kernel.org/r/20110627072709.6528.45706.stgit@fedora15 Signed-off-by: Steven Rostedt diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 3b9d0b8..7b78904 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -435,7 +435,7 @@ static int die_get_data_member_location(Dwarf_Die *mb_die, Dwarf_Word *offs) /* Return values for die_find callbacks */ enum { - DIE_FIND_CB_FOUND = 0, /* End of Search */ + DIE_FIND_CB_END = 0, /* End of Search */ DIE_FIND_CB_CHILD = 1, /* Search only children */ DIE_FIND_CB_SIBLING = 2, /* Search only siblings */ DIE_FIND_CB_CONTINUE = 3, /* Search children and siblings */ @@ -455,7 +455,7 @@ static Dwarf_Die *die_find_child(Dwarf_Die *rt_die, do { ret = callback(die_mem, data); - if (ret == DIE_FIND_CB_FOUND) + if (ret == DIE_FIND_CB_END) return die_mem; if ((ret & DIE_FIND_CB_CHILD) && @@ -507,7 +507,7 @@ static int __die_find_inline_cb(Dwarf_Die *die_mem, void *data) if (dwarf_tag(die_mem) == DW_TAG_inlined_subroutine && dwarf_haspc(die_mem, *addr)) - return DIE_FIND_CB_FOUND; + return DIE_FIND_CB_END; return DIE_FIND_CB_CONTINUE; } @@ -555,7 +555,7 @@ static int __die_walk_funclines_cb(Dwarf_Die *in_die, void *data) lw->retval = lw->handler(lw->fname, lineno, addr, lw->data); if (lw->retval != 0) - return DIE_FIND_CB_FOUND; + return DIE_FIND_CB_END; } } return DIE_FIND_CB_SIBLING; @@ -691,7 +691,7 @@ static int __die_find_variable_cb(Dwarf_Die *die_mem, void *data) if ((tag == DW_TAG_formal_parameter || tag == DW_TAG_variable) && die_compare_name(die_mem, fvp->name)) - return DIE_FIND_CB_FOUND; + return DIE_FIND_CB_END; if (dwarf_haspc(die_mem, fvp->addr)) return DIE_FIND_CB_CONTINUE; @@ -715,7 +715,7 @@ static int __die_find_member_cb(Dwarf_Die *die_mem, void *data) if ((dwarf_tag(die_mem) == DW_TAG_member) && die_compare_name(die_mem, name)) - return DIE_FIND_CB_FOUND; + return DIE_FIND_CB_END; return DIE_FIND_CB_SIBLING; } -- cgit v0.10.2 From bad03ae476214d9d66bb96be02b630385936f788 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:27:15 +0900 Subject: perf probe: Move strtailcmp to string.c Since strtailcmp() is enough generic, it should be defined in string.c. Signed-off-by: Masami Hiramatsu Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20110627072715.6528.10677.stgit@fedora15 Signed-off-by: Steven Rostedt diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 7b78904..459ebe8 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -43,21 +43,6 @@ /* Kprobe tracer basic type is up to u64 */ #define MAX_BASIC_TYPE_BITS 64 -/* - * Compare the tail of two strings. - * Return 0 if whole of either string is same as another's tail part. - */ -static int strtailcmp(const char *s1, const char *s2) -{ - int i1 = strlen(s1); - int i2 = strlen(s2); - while (--i1 >= 0 && --i2 >= 0) { - if (s1[i1] != s2[i2]) - return s1[i1] - s2[i2]; - } - return 0; -} - /* Line number list operations */ /* Add a line to line number list */ diff --git a/tools/perf/util/string.c b/tools/perf/util/string.c index b9a985d..d583638 100644 --- a/tools/perf/util/string.c +++ b/tools/perf/util/string.c @@ -294,3 +294,22 @@ bool strlazymatch(const char *str, const char *pat) { return __match_glob(str, pat, true); } + +/** + * strtailcmp - Compare the tail of two strings + * @s1: 1st string to be compared + * @s2: 2nd string to be compared + * + * Return 0 if whole of either string is same as another's tail part. + */ +int strtailcmp(const char *s1, const char *s2) +{ + int i1 = strlen(s1); + int i2 = strlen(s2); + while (--i1 >= 0 && --i2 >= 0) { + if (s1[i1] != s2[i2]) + return s1[i1] - s2[i2]; + } + return 0; +} + diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index fc78428..0128906 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -238,6 +238,7 @@ char **argv_split(const char *str, int *argcp); void argv_free(char **argv); bool strglobmatch(const char *str, const char *pat); bool strlazymatch(const char *str, const char *pat); +int strtailcmp(const char *s1, const char *s2); unsigned long convert_unit(unsigned long value, char *unit); int readn(int fd, void *buf, size_t size); -- cgit v0.10.2 From bcfc082150c6b1e9443c1277bca8be80094150b5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:27:21 +0900 Subject: perf probe: Remove redundant dwarf functions Since there are dwarf_bitsize, dwarf_bitoffset and dwarf_bytesize defined in libdw, we don't need die_get_bit_size, die_get_bit_offset and die_get_byte_size anymore. Signed-off-by: Masami Hiramatsu Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20110627072721.6528.2747.stgit@fedora15 Signed-off-by: Steven Rostedt diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 459ebe8..d443b64 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -361,36 +361,6 @@ static bool die_is_signed_type(Dwarf_Die *tp_die) ret == DW_ATE_signed_fixed); } -static int die_get_byte_size(Dwarf_Die *tp_die) -{ - Dwarf_Word ret; - - if (die_get_attr_udata(tp_die, DW_AT_byte_size, &ret)) - return 0; - - return (int)ret; -} - -static int die_get_bit_size(Dwarf_Die *tp_die) -{ - Dwarf_Word ret; - - if (die_get_attr_udata(tp_die, DW_AT_bit_size, &ret)) - return 0; - - return (int)ret; -} - -static int die_get_bit_offset(Dwarf_Die *tp_die) -{ - Dwarf_Word ret; - - if (die_get_attr_udata(tp_die, DW_AT_bit_offset, &ret)) - return 0; - - return (int)ret; -} - /* Get data_member_location offset */ static int die_get_data_member_location(Dwarf_Die *mb_die, Dwarf_Word *offs) { @@ -882,6 +852,7 @@ static int convert_variable_type(Dwarf_Die *vr_die, struct probe_trace_arg_ref **ref_ptr = &tvar->ref; Dwarf_Die type; char buf[16]; + int bsize, boffs, total; int ret; /* TODO: check all types */ @@ -891,11 +862,15 @@ static int convert_variable_type(Dwarf_Die *vr_die, return (tvar->type == NULL) ? -ENOMEM : 0; } - if (die_get_bit_size(vr_die) != 0) { + bsize = dwarf_bitsize(vr_die); + if (bsize > 0) { /* This is a bitfield */ - ret = snprintf(buf, 16, "b%d@%d/%zd", die_get_bit_size(vr_die), - die_get_bit_offset(vr_die), - BYTES_TO_BITS(die_get_byte_size(vr_die))); + boffs = dwarf_bitoffset(vr_die); + total = dwarf_bytesize(vr_die); + if (boffs < 0 || total < 0) + return -ENOENT; + ret = snprintf(buf, 16, "b%d@%d/%zd", bsize, boffs, + BYTES_TO_BITS(total)); goto formatted; } @@ -943,10 +918,11 @@ static int convert_variable_type(Dwarf_Die *vr_die, return (tvar->type == NULL) ? -ENOMEM : 0; } - ret = BYTES_TO_BITS(die_get_byte_size(&type)); - if (!ret) + ret = dwarf_bytesize(&type); + if (ret <= 0) /* No size ... try to use default type */ return 0; + ret = BYTES_TO_BITS(ret); /* Check the bitwidth */ if (ret > MAX_BASIC_TYPE_BITS) { @@ -1010,7 +986,7 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname, else *ref_ptr = ref; } - ref->offset += die_get_byte_size(&type) * field->index; + ref->offset += dwarf_bytesize(&type) * field->index; if (!field->next) /* Save vr_die for converting types */ memcpy(die_mem, vr_die, sizeof(*die_mem)); -- cgit v0.10.2 From e0d153c69040bb37cbdf09deb52fee3013c07742 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:27:27 +0900 Subject: perf-probe: Move dwarf library routines to dwarf-aux.{c, h} Move dwarf library related routines to dwarf-aux.{c,h}. This includes several minor changes. - Add simple documents for each API. - Rename die_find_real_subprogram() to die_find_realfunc() - Rename line_walk_handler_t to line_walk_callback_t. - Minor cleanups. Signed-off-by: Masami Hiramatsu Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20110627072727.6528.57647.stgit@fedora15 Signed-off-by: Steven Rostedt diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 940257b..d0861bb 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -279,6 +279,7 @@ LIB_H += util/thread.h LIB_H += util/thread_map.h LIB_H += util/trace-event.h LIB_H += util/probe-finder.h +LIB_H += util/dwarf-aux.h LIB_H += util/probe-event.h LIB_H += util/pstack.h LIB_H += util/cpumap.h @@ -435,6 +436,7 @@ else BASIC_CFLAGS += -DDWARF_SUPPORT EXTLIBS += -lelf -ldw LIB_OBJS += $(OUTPUT)util/probe-finder.o + LIB_OBJS += $(OUTPUT)util/dwarf-aux.o endif # PERF_HAVE_DWARF_REGS endif # NO_DWARF diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c new file mode 100644 index 0000000..fddf40f --- /dev/null +++ b/tools/perf/util/dwarf-aux.c @@ -0,0 +1,663 @@ +/* + * dwarf-aux.c : libdw auxiliary interfaces + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ + +#include +#include "util.h" +#include "debug.h" +#include "dwarf-aux.h" + +/** + * cu_find_realpath - Find the realpath of the target file + * @cu_die: A DIE(dwarf information entry) of CU(compilation Unit) + * @fname: The tail filename of the target file + * + * Find the real(long) path of @fname in @cu_die. + */ +const char *cu_find_realpath(Dwarf_Die *cu_die, const char *fname) +{ + Dwarf_Files *files; + size_t nfiles, i; + const char *src = NULL; + int ret; + + if (!fname) + return NULL; + + ret = dwarf_getsrcfiles(cu_die, &files, &nfiles); + if (ret != 0) + return NULL; + + for (i = 0; i < nfiles; i++) { + src = dwarf_filesrc(files, i, NULL, NULL); + if (strtailcmp(src, fname) == 0) + break; + } + if (i == nfiles) + return NULL; + return src; +} + +/** + * cu_get_comp_dir - Get the path of compilation directory + * @cu_die: a CU DIE + * + * Get the path of compilation directory of given @cu_die. + * Since this depends on DW_AT_comp_dir, older gcc will not + * embedded it. In that case, this returns NULL. + */ +const char *cu_get_comp_dir(Dwarf_Die *cu_die) +{ + Dwarf_Attribute attr; + if (dwarf_attr(cu_die, DW_AT_comp_dir, &attr) == NULL) + return NULL; + return dwarf_formstring(&attr); +} + +/** + * cu_find_lineinfo - Get a line number and file name for given address + * @cu_die: a CU DIE + * @addr: An address + * @fname: a pointer which returns the file name string + * @lineno: a pointer which returns the line number + * + * Find a line number and file name for @addr in @cu_die. + */ +int cu_find_lineinfo(Dwarf_Die *cu_die, unsigned long addr, + const char **fname, int *lineno) +{ + Dwarf_Line *line; + Dwarf_Addr laddr; + + line = dwarf_getsrc_die(cu_die, (Dwarf_Addr)addr); + if (line && dwarf_lineaddr(line, &laddr) == 0 && + addr == (unsigned long)laddr && dwarf_lineno(line, lineno) == 0) { + *fname = dwarf_linesrc(line, NULL, NULL); + if (!*fname) + /* line number is useless without filename */ + *lineno = 0; + } + + return *lineno ?: -ENOENT; +} + +/** + * die_compare_name - Compare diename and tname + * @dw_die: a DIE + * @tname: a string of target name + * + * Compare the name of @dw_die and @tname. Return false if @dw_die has no name. + */ +bool die_compare_name(Dwarf_Die *dw_die, const char *tname) +{ + const char *name; + name = dwarf_diename(dw_die); + return name ? (strcmp(tname, name) == 0) : false; +} + +/** + * die_get_call_lineno - Get callsite line number of inline-function instance + * @in_die: a DIE of an inlined function instance + * + * Get call-site line number of @in_die. This means from where the inline + * function is called. + */ +int die_get_call_lineno(Dwarf_Die *in_die) +{ + Dwarf_Attribute attr; + Dwarf_Word ret; + + if (!dwarf_attr(in_die, DW_AT_call_line, &attr)) + return -ENOENT; + + dwarf_formudata(&attr, &ret); + return (int)ret; +} + +/** + * die_get_type - Get type DIE + * @vr_die: a DIE of a variable + * @die_mem: where to store a type DIE + * + * Get a DIE of the type of given variable (@vr_die), and store + * it to die_mem. Return NULL if fails to get a type DIE. + */ +Dwarf_Die *die_get_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem) +{ + Dwarf_Attribute attr; + + if (dwarf_attr_integrate(vr_die, DW_AT_type, &attr) && + dwarf_formref_die(&attr, die_mem)) + return die_mem; + else + return NULL; +} + +/* Get a type die, but skip qualifiers */ +static Dwarf_Die *__die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem) +{ + int tag; + + do { + vr_die = die_get_type(vr_die, die_mem); + if (!vr_die) + break; + tag = dwarf_tag(vr_die); + } while (tag == DW_TAG_const_type || + tag == DW_TAG_restrict_type || + tag == DW_TAG_volatile_type || + tag == DW_TAG_shared_type); + + return vr_die; +} + +/** + * die_get_real_type - Get a type die, but skip qualifiers and typedef + * @vr_die: a DIE of a variable + * @die_mem: where to store a type DIE + * + * Get a DIE of the type of given variable (@vr_die), and store + * it to die_mem. Return NULL if fails to get a type DIE. + * If the type is qualifiers (e.g. const) or typedef, this skips it + * and tries to find real type (structure or basic types, e.g. int). + */ +Dwarf_Die *die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem) +{ + do { + vr_die = __die_get_real_type(vr_die, die_mem); + } while (vr_die && dwarf_tag(vr_die) == DW_TAG_typedef); + + return vr_die; +} + +/* Get attribute and translate it as a udata */ +static int die_get_attr_udata(Dwarf_Die *tp_die, unsigned int attr_name, + Dwarf_Word *result) +{ + Dwarf_Attribute attr; + + if (dwarf_attr(tp_die, attr_name, &attr) == NULL || + dwarf_formudata(&attr, result) != 0) + return -ENOENT; + + return 0; +} + +/** + * die_is_signed_type - Check whether a type DIE is signed or not + * @tp_die: a DIE of a type + * + * Get the encoding of @tp_die and return true if the encoding + * is signed. + */ +bool die_is_signed_type(Dwarf_Die *tp_die) +{ + Dwarf_Word ret; + + if (die_get_attr_udata(tp_die, DW_AT_encoding, &ret)) + return false; + + return (ret == DW_ATE_signed_char || ret == DW_ATE_signed || + ret == DW_ATE_signed_fixed); +} + +/** + * die_get_data_member_location - Get the data-member offset + * @mb_die: a DIE of a member of a data structure + * @offs: The offset of the member in the data structure + * + * Get the offset of @mb_die in the data structure including @mb_die, and + * stores result offset to @offs. If any error occurs this returns errno. + */ +int die_get_data_member_location(Dwarf_Die *mb_die, Dwarf_Word *offs) +{ + Dwarf_Attribute attr; + Dwarf_Op *expr; + size_t nexpr; + int ret; + + if (dwarf_attr(mb_die, DW_AT_data_member_location, &attr) == NULL) + return -ENOENT; + + if (dwarf_formudata(&attr, offs) != 0) { + /* DW_AT_data_member_location should be DW_OP_plus_uconst */ + ret = dwarf_getlocation(&attr, &expr, &nexpr); + if (ret < 0 || nexpr == 0) + return -ENOENT; + + if (expr[0].atom != DW_OP_plus_uconst || nexpr != 1) { + pr_debug("Unable to get offset:Unexpected OP %x (%zd)\n", + expr[0].atom, nexpr); + return -ENOTSUP; + } + *offs = (Dwarf_Word)expr[0].number; + } + return 0; +} + +/** + * die_find_child - Generic DIE search function in DIE tree + * @rt_die: a root DIE + * @callback: a callback function + * @data: a user data passed to the callback function + * @die_mem: a buffer for result DIE + * + * Trace DIE tree from @rt_die and call @callback for each child DIE. + * If @callback returns DIE_FIND_CB_END, this stores the DIE into + * @die_mem and returns it. If @callback returns DIE_FIND_CB_CONTINUE, + * this continues to trace the tree. Optionally, @callback can return + * DIE_FIND_CB_CHILD and DIE_FIND_CB_SIBLING, those means trace only + * the children and trace only the siblings respectively. + * Returns NULL if @callback can't find any appropriate DIE. + */ +Dwarf_Die *die_find_child(Dwarf_Die *rt_die, + int (*callback)(Dwarf_Die *, void *), + void *data, Dwarf_Die *die_mem) +{ + Dwarf_Die child_die; + int ret; + + ret = dwarf_child(rt_die, die_mem); + if (ret != 0) + return NULL; + + do { + ret = callback(die_mem, data); + if (ret == DIE_FIND_CB_END) + return die_mem; + + if ((ret & DIE_FIND_CB_CHILD) && + die_find_child(die_mem, callback, data, &child_die)) { + memcpy(die_mem, &child_die, sizeof(Dwarf_Die)); + return die_mem; + } + } while ((ret & DIE_FIND_CB_SIBLING) && + dwarf_siblingof(die_mem, die_mem) == 0); + + return NULL; +} + +struct __addr_die_search_param { + Dwarf_Addr addr; + Dwarf_Die *die_mem; +}; + +/* die_find callback for non-inlined function search */ +static int __die_search_func_cb(Dwarf_Die *fn_die, void *data) +{ + struct __addr_die_search_param *ad = data; + + if (dwarf_tag(fn_die) == DW_TAG_subprogram && + dwarf_haspc(fn_die, ad->addr)) { + memcpy(ad->die_mem, fn_die, sizeof(Dwarf_Die)); + return DWARF_CB_ABORT; + } + return DWARF_CB_OK; +} + +/** + * die_find_realfunc - Search a non-inlined function at given address + * @cu_die: a CU DIE which including @addr + * @addr: target address + * @die_mem: a buffer for result DIE + * + * Search a non-inlined function DIE which includes @addr. Stores the + * DIE to @die_mem and returns it if found. Returns NULl if failed. + */ +Dwarf_Die *die_find_realfunc(Dwarf_Die *cu_die, Dwarf_Addr addr, + Dwarf_Die *die_mem) +{ + struct __addr_die_search_param ad; + ad.addr = addr; + ad.die_mem = die_mem; + /* dwarf_getscopes can't find subprogram. */ + if (!dwarf_getfuncs(cu_die, __die_search_func_cb, &ad, 0)) + return NULL; + else + return die_mem; +} + +/* die_find callback for inline function search */ +static int __die_find_inline_cb(Dwarf_Die *die_mem, void *data) +{ + Dwarf_Addr *addr = data; + + if (dwarf_tag(die_mem) == DW_TAG_inlined_subroutine && + dwarf_haspc(die_mem, *addr)) + return DIE_FIND_CB_END; + + return DIE_FIND_CB_CONTINUE; +} + +/** + * die_find_inlinefunc - Search an inlined function at given address + * @cu_die: a CU DIE which including @addr + * @addr: target address + * @die_mem: a buffer for result DIE + * + * Search an inlined function DIE which includes @addr. Stores the + * DIE to @die_mem and returns it if found. Returns NULl if failed. + * If several inlined functions are expanded recursively, this trace + * it and returns deepest one. + */ +Dwarf_Die *die_find_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr, + Dwarf_Die *die_mem) +{ + Dwarf_Die tmp_die; + + sp_die = die_find_child(sp_die, __die_find_inline_cb, &addr, &tmp_die); + if (!sp_die) + return NULL; + + /* Inlined function could be recursive. Trace it until fail */ + while (sp_die) { + memcpy(die_mem, sp_die, sizeof(Dwarf_Die)); + sp_die = die_find_child(sp_die, __die_find_inline_cb, &addr, + &tmp_die); + } + + return die_mem; +} + +/* Line walker internal parameters */ +struct __line_walk_param { + const char *fname; + line_walk_callback_t callback; + void *data; + int retval; +}; + +static int __die_walk_funclines_cb(Dwarf_Die *in_die, void *data) +{ + struct __line_walk_param *lw = data; + Dwarf_Addr addr; + int lineno; + + if (dwarf_tag(in_die) == DW_TAG_inlined_subroutine) { + lineno = die_get_call_lineno(in_die); + if (lineno > 0 && dwarf_entrypc(in_die, &addr) == 0) { + lw->retval = lw->callback(lw->fname, lineno, addr, + lw->data); + if (lw->retval != 0) + return DIE_FIND_CB_END; + } + } + return DIE_FIND_CB_SIBLING; +} + +/* Walk on lines of blocks included in given DIE */ +static int __die_walk_funclines(Dwarf_Die *sp_die, + line_walk_callback_t callback, void *data) +{ + struct __line_walk_param lw = { + .callback = callback, + .data = data, + .retval = 0, + }; + Dwarf_Die die_mem; + Dwarf_Addr addr; + int lineno; + + /* Handle function declaration line */ + lw.fname = dwarf_decl_file(sp_die); + if (lw.fname && dwarf_decl_line(sp_die, &lineno) == 0 && + dwarf_entrypc(sp_die, &addr) == 0) { + lw.retval = callback(lw.fname, lineno, addr, data); + if (lw.retval != 0) + goto done; + } + die_find_child(sp_die, __die_walk_funclines_cb, &lw, &die_mem); +done: + return lw.retval; +} + +static int __die_walk_culines_cb(Dwarf_Die *sp_die, void *data) +{ + struct __line_walk_param *lw = data; + + lw->retval = __die_walk_funclines(sp_die, lw->callback, lw->data); + if (lw->retval != 0) + return DWARF_CB_ABORT; + + return DWARF_CB_OK; +} + +/** + * die_walk_lines - Walk on lines inside given DIE + * @rt_die: a root DIE (CU or subprogram) + * @callback: callback routine + * @data: user data + * + * Walk on all lines inside given @rt_die and call @callback on each line. + * If the @rt_die is a function, walk only on the lines inside the function, + * otherwise @rt_die must be a CU DIE. + * Note that this walks not only dwarf line list, but also function entries + * and inline call-site. + */ +int die_walk_lines(Dwarf_Die *rt_die, line_walk_callback_t callback, void *data) +{ + Dwarf_Lines *lines; + Dwarf_Line *line; + Dwarf_Addr addr; + const char *fname; + int lineno, ret = 0; + Dwarf_Die die_mem, *cu_die; + size_t nlines, i; + + /* Get the CU die */ + if (dwarf_tag(rt_die) == DW_TAG_subprogram) + cu_die = dwarf_diecu(rt_die, &die_mem, NULL, NULL); + else + cu_die = rt_die; + if (!cu_die) { + pr_debug2("Failed to get CU from subprogram\n"); + return -EINVAL; + } + + /* Get lines list in the CU */ + if (dwarf_getsrclines(cu_die, &lines, &nlines) != 0) { + pr_debug2("Failed to get source lines on this CU.\n"); + return -ENOENT; + } + pr_debug2("Get %zd lines from this CU\n", nlines); + + /* Walk on the lines on lines list */ + for (i = 0; i < nlines; i++) { + line = dwarf_onesrcline(lines, i); + if (line == NULL || + dwarf_lineno(line, &lineno) != 0 || + dwarf_lineaddr(line, &addr) != 0) { + pr_debug2("Failed to get line info. " + "Possible error in debuginfo.\n"); + continue; + } + /* Filter lines based on address */ + if (rt_die != cu_die) + /* + * Address filtering + * The line is included in given function, and + * no inline block includes it. + */ + if (!dwarf_haspc(rt_die, addr) || + die_find_inlinefunc(rt_die, addr, &die_mem)) + continue; + /* Get source line */ + fname = dwarf_linesrc(line, NULL, NULL); + + ret = callback(fname, lineno, addr, data); + if (ret != 0) + return ret; + } + + /* + * Dwarf lines doesn't include function declarations and inlined + * subroutines. We have to check functions list or given function. + */ + if (rt_die != cu_die) + ret = __die_walk_funclines(rt_die, callback, data); + else { + struct __line_walk_param param = { + .callback = callback, + .data = data, + .retval = 0, + }; + dwarf_getfuncs(cu_die, __die_walk_culines_cb, ¶m, 0); + ret = param.retval; + } + + return ret; +} + +struct __find_variable_param { + const char *name; + Dwarf_Addr addr; +}; + +static int __die_find_variable_cb(Dwarf_Die *die_mem, void *data) +{ + struct __find_variable_param *fvp = data; + int tag; + + tag = dwarf_tag(die_mem); + if ((tag == DW_TAG_formal_parameter || + tag == DW_TAG_variable) && + die_compare_name(die_mem, fvp->name)) + return DIE_FIND_CB_END; + + if (dwarf_haspc(die_mem, fvp->addr)) + return DIE_FIND_CB_CONTINUE; + else + return DIE_FIND_CB_SIBLING; +} + +/** + * die_find_variable_at - Find a given name variable at given address + * @sp_die: a function DIE + * @name: variable name + * @addr: address + * @die_mem: a buffer for result DIE + * + * Find a variable DIE called @name at @addr in @sp_die. + */ +Dwarf_Die *die_find_variable_at(Dwarf_Die *sp_die, const char *name, + Dwarf_Addr addr, Dwarf_Die *die_mem) +{ + struct __find_variable_param fvp = { .name = name, .addr = addr}; + + return die_find_child(sp_die, __die_find_variable_cb, (void *)&fvp, + die_mem); +} + +static int __die_find_member_cb(Dwarf_Die *die_mem, void *data) +{ + const char *name = data; + + if ((dwarf_tag(die_mem) == DW_TAG_member) && + die_compare_name(die_mem, name)) + return DIE_FIND_CB_END; + + return DIE_FIND_CB_SIBLING; +} + +/** + * die_find_member - Find a given name member in a data structure + * @st_die: a data structure type DIE + * @name: member name + * @die_mem: a buffer for result DIE + * + * Find a member DIE called @name in @st_die. + */ +Dwarf_Die *die_find_member(Dwarf_Die *st_die, const char *name, + Dwarf_Die *die_mem) +{ + return die_find_child(st_die, __die_find_member_cb, (void *)name, + die_mem); +} + +/** + * die_get_typename - Get the name of given variable DIE + * @vr_die: a variable DIE + * @buf: a buffer for result type name + * @len: a max-length of @buf + * + * Get the name of @vr_die and stores it to @buf. Return the actual length + * of type name if succeeded. Return -E2BIG if @len is not enough long, and + * Return -ENOENT if failed to find type name. + * Note that the result will stores typedef name if possible, and stores + * "*(function_type)" if the type is a function pointer. + */ +int die_get_typename(Dwarf_Die *vr_die, char *buf, int len) +{ + Dwarf_Die type; + int tag, ret, ret2; + const char *tmp = ""; + + if (__die_get_real_type(vr_die, &type) == NULL) + return -ENOENT; + + tag = dwarf_tag(&type); + if (tag == DW_TAG_array_type || tag == DW_TAG_pointer_type) + tmp = "*"; + else if (tag == DW_TAG_subroutine_type) { + /* Function pointer */ + ret = snprintf(buf, len, "(function_type)"); + return (ret >= len) ? -E2BIG : ret; + } else { + if (!dwarf_diename(&type)) + return -ENOENT; + if (tag == DW_TAG_union_type) + tmp = "union "; + else if (tag == DW_TAG_structure_type) + tmp = "struct "; + /* Write a base name */ + ret = snprintf(buf, len, "%s%s", tmp, dwarf_diename(&type)); + return (ret >= len) ? -E2BIG : ret; + } + ret = die_get_typename(&type, buf, len); + if (ret > 0) { + ret2 = snprintf(buf + ret, len - ret, "%s", tmp); + ret = (ret2 >= len - ret) ? -E2BIG : ret2 + ret; + } + return ret; +} + +/** + * die_get_varname - Get the name and type of given variable DIE + * @vr_die: a variable DIE + * @buf: a buffer for type and variable name + * @len: the max-length of @buf + * + * Get the name and type of @vr_die and stores it in @buf as "type\tname". + */ +int die_get_varname(Dwarf_Die *vr_die, char *buf, int len) +{ + int ret, ret2; + + ret = die_get_typename(vr_die, buf, len); + if (ret < 0) { + pr_debug("Failed to get type, make it unknown.\n"); + ret = snprintf(buf, len, "(unknown_type)"); + } + if (ret > 0) { + ret2 = snprintf(buf + ret, len - ret, "\t%s", + dwarf_diename(vr_die)); + ret = (ret2 >= len - ret) ? -E2BIG : ret2 + ret; + } + return ret; +} + diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h new file mode 100644 index 0000000..bc3b211 --- /dev/null +++ b/tools/perf/util/dwarf-aux.h @@ -0,0 +1,100 @@ +#ifndef _DWARF_AUX_H +#define _DWARF_AUX_H +/* + * dwarf-aux.h : libdw auxiliary interfaces + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ + +#include +#include +#include +#include + +/* Find the realpath of the target file */ +extern const char *cu_find_realpath(Dwarf_Die *cu_die, const char *fname); + +/* Get DW_AT_comp_dir (should be NULL with older gcc) */ +extern const char *cu_get_comp_dir(Dwarf_Die *cu_die); + +/* Get a line number and file name for given address */ +extern int cu_find_lineinfo(Dwarf_Die *cudie, unsigned long addr, + const char **fname, int *lineno); + +/* Compare diename and tname */ +extern bool die_compare_name(Dwarf_Die *dw_die, const char *tname); + +/* Get callsite line number of inline-function instance */ +extern int die_get_call_lineno(Dwarf_Die *in_die); + +/* Get type die */ +extern Dwarf_Die *die_get_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem); + +/* Get a type die, but skip qualifiers and typedef */ +extern Dwarf_Die *die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem); + +/* Check whether the DIE is signed or not */ +extern bool die_is_signed_type(Dwarf_Die *tp_die); + +/* Get data_member_location offset */ +extern int die_get_data_member_location(Dwarf_Die *mb_die, Dwarf_Word *offs); + +/* Return values for die_find_child() callbacks */ +enum { + DIE_FIND_CB_END = 0, /* End of Search */ + DIE_FIND_CB_CHILD = 1, /* Search only children */ + DIE_FIND_CB_SIBLING = 2, /* Search only siblings */ + DIE_FIND_CB_CONTINUE = 3, /* Search children and siblings */ +}; + +/* Search child DIEs */ +extern Dwarf_Die *die_find_child(Dwarf_Die *rt_die, + int (*callback)(Dwarf_Die *, void *), + void *data, Dwarf_Die *die_mem); + +/* Search a non-inlined function including given address */ +extern Dwarf_Die *die_find_realfunc(Dwarf_Die *cu_die, Dwarf_Addr addr, + Dwarf_Die *die_mem); + +/* Search an inlined function including given address */ +extern Dwarf_Die *die_find_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr, + Dwarf_Die *die_mem); + +/* Walker on lines (Note: line number will not be sorted) */ +typedef int (* line_walk_callback_t) (const char *fname, int lineno, + Dwarf_Addr addr, void *data); + +/* + * Walk on lines inside given DIE. If the DIE is a subprogram, walk only on + * the lines inside the subprogram, otherwise the DIE must be a CU DIE. + */ +extern int die_walk_lines(Dwarf_Die *rt_die, line_walk_callback_t callback, + void *data); + +/* Find a variable called 'name' at given address */ +extern Dwarf_Die *die_find_variable_at(Dwarf_Die *sp_die, const char *name, + Dwarf_Addr addr, Dwarf_Die *die_mem); + +/* Find a member called 'name' */ +extern Dwarf_Die *die_find_member(Dwarf_Die *st_die, const char *name, + Dwarf_Die *die_mem); + +/* Get the name of given variable DIE */ +extern int die_get_typename(Dwarf_Die *vr_die, char *buf, int len); + +/* Get the name and type of given variable DIE, stored as "type\tname" */ +extern int die_get_varname(Dwarf_Die *vr_die, char *buf, int len); +#endif diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index d443b64..53d219b 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -222,521 +222,6 @@ static Dwarf *dwfl_init_live_kernel_dwarf(Dwarf_Addr addr __used, Dwfl **dwflp, } #endif -/* Dwarf wrappers */ - -/* Find the realpath of the target file. */ -static const char *cu_find_realpath(Dwarf_Die *cu_die, const char *fname) -{ - Dwarf_Files *files; - size_t nfiles, i; - const char *src = NULL; - int ret; - - if (!fname) - return NULL; - - ret = dwarf_getsrcfiles(cu_die, &files, &nfiles); - if (ret != 0) - return NULL; - - for (i = 0; i < nfiles; i++) { - src = dwarf_filesrc(files, i, NULL, NULL); - if (strtailcmp(src, fname) == 0) - break; - } - if (i == nfiles) - return NULL; - return src; -} - -/* Get DW_AT_comp_dir (should be NULL with older gcc) */ -static const char *cu_get_comp_dir(Dwarf_Die *cu_die) -{ - Dwarf_Attribute attr; - if (dwarf_attr(cu_die, DW_AT_comp_dir, &attr) == NULL) - return NULL; - return dwarf_formstring(&attr); -} - -/* Get a line number and file name for given address */ -static int cu_find_lineinfo(Dwarf_Die *cudie, unsigned long addr, - const char **fname, int *lineno) -{ - Dwarf_Line *line; - Dwarf_Addr laddr; - - line = dwarf_getsrc_die(cudie, (Dwarf_Addr)addr); - if (line && dwarf_lineaddr(line, &laddr) == 0 && - addr == (unsigned long)laddr && dwarf_lineno(line, lineno) == 0) { - *fname = dwarf_linesrc(line, NULL, NULL); - if (!*fname) - /* line number is useless without filename */ - *lineno = 0; - } - - return *lineno ?: -ENOENT; -} - -/* Compare diename and tname */ -static bool die_compare_name(Dwarf_Die *dw_die, const char *tname) -{ - const char *name; - name = dwarf_diename(dw_die); - return name ? (strcmp(tname, name) == 0) : false; -} - -/* Get callsite line number of inline-function instance */ -static int die_get_call_lineno(Dwarf_Die *in_die) -{ - Dwarf_Attribute attr; - Dwarf_Word ret; - - if (!dwarf_attr(in_die, DW_AT_call_line, &attr)) - return -ENOENT; - - dwarf_formudata(&attr, &ret); - return (int)ret; -} - -/* Get type die */ -static Dwarf_Die *die_get_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem) -{ - Dwarf_Attribute attr; - - if (dwarf_attr_integrate(vr_die, DW_AT_type, &attr) && - dwarf_formref_die(&attr, die_mem)) - return die_mem; - else - return NULL; -} - -/* Get a type die, but skip qualifiers */ -static Dwarf_Die *__die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem) -{ - int tag; - - do { - vr_die = die_get_type(vr_die, die_mem); - if (!vr_die) - break; - tag = dwarf_tag(vr_die); - } while (tag == DW_TAG_const_type || - tag == DW_TAG_restrict_type || - tag == DW_TAG_volatile_type || - tag == DW_TAG_shared_type); - - return vr_die; -} - -/* Get a type die, but skip qualifiers and typedef */ -static Dwarf_Die *die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem) -{ - do { - vr_die = __die_get_real_type(vr_die, die_mem); - } while (vr_die && dwarf_tag(vr_die) == DW_TAG_typedef); - - return vr_die; -} - -static int die_get_attr_udata(Dwarf_Die *tp_die, unsigned int attr_name, - Dwarf_Word *result) -{ - Dwarf_Attribute attr; - - if (dwarf_attr(tp_die, attr_name, &attr) == NULL || - dwarf_formudata(&attr, result) != 0) - return -ENOENT; - - return 0; -} - -static bool die_is_signed_type(Dwarf_Die *tp_die) -{ - Dwarf_Word ret; - - if (die_get_attr_udata(tp_die, DW_AT_encoding, &ret)) - return false; - - return (ret == DW_ATE_signed_char || ret == DW_ATE_signed || - ret == DW_ATE_signed_fixed); -} - -/* Get data_member_location offset */ -static int die_get_data_member_location(Dwarf_Die *mb_die, Dwarf_Word *offs) -{ - Dwarf_Attribute attr; - Dwarf_Op *expr; - size_t nexpr; - int ret; - - if (dwarf_attr(mb_die, DW_AT_data_member_location, &attr) == NULL) - return -ENOENT; - - if (dwarf_formudata(&attr, offs) != 0) { - /* DW_AT_data_member_location should be DW_OP_plus_uconst */ - ret = dwarf_getlocation(&attr, &expr, &nexpr); - if (ret < 0 || nexpr == 0) - return -ENOENT; - - if (expr[0].atom != DW_OP_plus_uconst || nexpr != 1) { - pr_debug("Unable to get offset:Unexpected OP %x (%zd)\n", - expr[0].atom, nexpr); - return -ENOTSUP; - } - *offs = (Dwarf_Word)expr[0].number; - } - return 0; -} - -/* Return values for die_find callbacks */ -enum { - DIE_FIND_CB_END = 0, /* End of Search */ - DIE_FIND_CB_CHILD = 1, /* Search only children */ - DIE_FIND_CB_SIBLING = 2, /* Search only siblings */ - DIE_FIND_CB_CONTINUE = 3, /* Search children and siblings */ -}; - -/* Search a child die */ -static Dwarf_Die *die_find_child(Dwarf_Die *rt_die, - int (*callback)(Dwarf_Die *, void *), - void *data, Dwarf_Die *die_mem) -{ - Dwarf_Die child_die; - int ret; - - ret = dwarf_child(rt_die, die_mem); - if (ret != 0) - return NULL; - - do { - ret = callback(die_mem, data); - if (ret == DIE_FIND_CB_END) - return die_mem; - - if ((ret & DIE_FIND_CB_CHILD) && - die_find_child(die_mem, callback, data, &child_die)) { - memcpy(die_mem, &child_die, sizeof(Dwarf_Die)); - return die_mem; - } - } while ((ret & DIE_FIND_CB_SIBLING) && - dwarf_siblingof(die_mem, die_mem) == 0); - - return NULL; -} - -struct __addr_die_search_param { - Dwarf_Addr addr; - Dwarf_Die *die_mem; -}; - -static int __die_search_func_cb(Dwarf_Die *fn_die, void *data) -{ - struct __addr_die_search_param *ad = data; - - if (dwarf_tag(fn_die) == DW_TAG_subprogram && - dwarf_haspc(fn_die, ad->addr)) { - memcpy(ad->die_mem, fn_die, sizeof(Dwarf_Die)); - return DWARF_CB_ABORT; - } - return DWARF_CB_OK; -} - -/* Search a real subprogram including this line, */ -static Dwarf_Die *die_find_real_subprogram(Dwarf_Die *cu_die, Dwarf_Addr addr, - Dwarf_Die *die_mem) -{ - struct __addr_die_search_param ad; - ad.addr = addr; - ad.die_mem = die_mem; - /* dwarf_getscopes can't find subprogram. */ - if (!dwarf_getfuncs(cu_die, __die_search_func_cb, &ad, 0)) - return NULL; - else - return die_mem; -} - -/* die_find callback for inline function search */ -static int __die_find_inline_cb(Dwarf_Die *die_mem, void *data) -{ - Dwarf_Addr *addr = data; - - if (dwarf_tag(die_mem) == DW_TAG_inlined_subroutine && - dwarf_haspc(die_mem, *addr)) - return DIE_FIND_CB_END; - - return DIE_FIND_CB_CONTINUE; -} - -/* Similar to dwarf_getfuncs, but returns inlined_subroutine if exists. */ -static Dwarf_Die *die_find_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr, - Dwarf_Die *die_mem) -{ - Dwarf_Die tmp_die; - - sp_die = die_find_child(sp_die, __die_find_inline_cb, &addr, &tmp_die); - if (!sp_die) - return NULL; - - /* Inlined function could be recursive. Trace it until fail */ - while (sp_die) { - memcpy(die_mem, sp_die, sizeof(Dwarf_Die)); - sp_die = die_find_child(sp_die, __die_find_inline_cb, &addr, - &tmp_die); - } - - return die_mem; -} - -/* Walker on lines (Note: line number will not be sorted) */ -typedef int (* line_walk_handler_t) (const char *fname, int lineno, - Dwarf_Addr addr, void *data); - -struct __line_walk_param { - const char *fname; - line_walk_handler_t handler; - void *data; - int retval; -}; - -static int __die_walk_funclines_cb(Dwarf_Die *in_die, void *data) -{ - struct __line_walk_param *lw = data; - Dwarf_Addr addr; - int lineno; - - if (dwarf_tag(in_die) == DW_TAG_inlined_subroutine) { - lineno = die_get_call_lineno(in_die); - if (lineno > 0 && dwarf_entrypc(in_die, &addr) == 0) { - lw->retval = lw->handler(lw->fname, lineno, addr, - lw->data); - if (lw->retval != 0) - return DIE_FIND_CB_END; - } - } - return DIE_FIND_CB_SIBLING; -} - -/* Walk on lines of blocks included in given DIE */ -static int __die_walk_funclines(Dwarf_Die *sp_die, - line_walk_handler_t handler, void *data) -{ - struct __line_walk_param lw = { - .handler = handler, - .data = data, - .retval = 0, - }; - Dwarf_Die die_mem; - Dwarf_Addr addr; - int lineno; - - /* Handle function declaration line */ - lw.fname = dwarf_decl_file(sp_die); - if (lw.fname && dwarf_decl_line(sp_die, &lineno) == 0 && - dwarf_entrypc(sp_die, &addr) == 0) { - lw.retval = handler(lw.fname, lineno, addr, data); - if (lw.retval != 0) - goto done; - } - die_find_child(sp_die, __die_walk_funclines_cb, &lw, &die_mem); -done: - return lw.retval; -} - -static int __die_walk_culines_cb(Dwarf_Die *sp_die, void *data) -{ - struct __line_walk_param *lw = data; - - lw->retval = __die_walk_funclines(sp_die, lw->handler, lw->data); - if (lw->retval != 0) - return DWARF_CB_ABORT; - - return DWARF_CB_OK; -} - -/* - * Walk on lines inside given PDIE. If the PDIE is subprogram, walk only on - * the lines inside the subprogram, otherwise PDIE must be a CU DIE. - */ -static int die_walk_lines(Dwarf_Die *pdie, line_walk_handler_t handler, - void *data) -{ - Dwarf_Lines *lines; - Dwarf_Line *line; - Dwarf_Addr addr; - const char *fname; - int lineno, ret = 0; - Dwarf_Die die_mem, *cu_die; - size_t nlines, i; - - /* Get the CU die */ - if (dwarf_tag(pdie) == DW_TAG_subprogram) - cu_die = dwarf_diecu(pdie, &die_mem, NULL, NULL); - else - cu_die = pdie; - if (!cu_die) { - pr_debug2("Failed to get CU from subprogram\n"); - return -EINVAL; - } - - /* Get lines list in the CU */ - if (dwarf_getsrclines(cu_die, &lines, &nlines) != 0) { - pr_debug2("Failed to get source lines on this CU.\n"); - return -ENOENT; - } - pr_debug2("Get %zd lines from this CU\n", nlines); - - /* Walk on the lines on lines list */ - for (i = 0; i < nlines; i++) { - line = dwarf_onesrcline(lines, i); - if (line == NULL || - dwarf_lineno(line, &lineno) != 0 || - dwarf_lineaddr(line, &addr) != 0) { - pr_debug2("Failed to get line info. " - "Possible error in debuginfo.\n"); - continue; - } - /* Filter lines based on address */ - if (pdie != cu_die) - /* - * Address filtering - * The line is included in given function, and - * no inline block includes it. - */ - if (!dwarf_haspc(pdie, addr) || - die_find_inlinefunc(pdie, addr, &die_mem)) - continue; - /* Get source line */ - fname = dwarf_linesrc(line, NULL, NULL); - - ret = handler(fname, lineno, addr, data); - if (ret != 0) - return ret; - } - - /* - * Dwarf lines doesn't include function declarations and inlined - * subroutines. We have to check functions list or given function. - */ - if (pdie != cu_die) - ret = __die_walk_funclines(pdie, handler, data); - else { - struct __line_walk_param param = { - .handler = handler, - .data = data, - .retval = 0, - }; - dwarf_getfuncs(cu_die, __die_walk_culines_cb, ¶m, 0); - ret = param.retval; - } - - return ret; -} - -struct __find_variable_param { - const char *name; - Dwarf_Addr addr; -}; - -static int __die_find_variable_cb(Dwarf_Die *die_mem, void *data) -{ - struct __find_variable_param *fvp = data; - int tag; - - tag = dwarf_tag(die_mem); - if ((tag == DW_TAG_formal_parameter || - tag == DW_TAG_variable) && - die_compare_name(die_mem, fvp->name)) - return DIE_FIND_CB_END; - - if (dwarf_haspc(die_mem, fvp->addr)) - return DIE_FIND_CB_CONTINUE; - else - return DIE_FIND_CB_SIBLING; -} - -/* Find a variable called 'name' at given address */ -static Dwarf_Die *die_find_variable_at(Dwarf_Die *sp_die, const char *name, - Dwarf_Addr addr, Dwarf_Die *die_mem) -{ - struct __find_variable_param fvp = { .name = name, .addr = addr}; - - return die_find_child(sp_die, __die_find_variable_cb, (void *)&fvp, - die_mem); -} - -static int __die_find_member_cb(Dwarf_Die *die_mem, void *data) -{ - const char *name = data; - - if ((dwarf_tag(die_mem) == DW_TAG_member) && - die_compare_name(die_mem, name)) - return DIE_FIND_CB_END; - - return DIE_FIND_CB_SIBLING; -} - -/* Find a member called 'name' */ -static Dwarf_Die *die_find_member(Dwarf_Die *st_die, const char *name, - Dwarf_Die *die_mem) -{ - return die_find_child(st_die, __die_find_member_cb, (void *)name, - die_mem); -} - -/* Get the name of given variable DIE */ -static int die_get_typename(Dwarf_Die *vr_die, char *buf, int len) -{ - Dwarf_Die type; - int tag, ret, ret2; - const char *tmp = ""; - - if (__die_get_real_type(vr_die, &type) == NULL) - return -ENOENT; - - tag = dwarf_tag(&type); - if (tag == DW_TAG_array_type || tag == DW_TAG_pointer_type) - tmp = "*"; - else if (tag == DW_TAG_subroutine_type) { - /* Function pointer */ - ret = snprintf(buf, len, "(function_type)"); - return (ret >= len) ? -E2BIG : ret; - } else { - if (!dwarf_diename(&type)) - return -ENOENT; - if (tag == DW_TAG_union_type) - tmp = "union "; - else if (tag == DW_TAG_structure_type) - tmp = "struct "; - /* Write a base name */ - ret = snprintf(buf, len, "%s%s", tmp, dwarf_diename(&type)); - return (ret >= len) ? -E2BIG : ret; - } - ret = die_get_typename(&type, buf, len); - if (ret > 0) { - ret2 = snprintf(buf + ret, len - ret, "%s", tmp); - ret = (ret2 >= len - ret) ? -E2BIG : ret2 + ret; - } - return ret; -} - -/* Get the name and type of given variable DIE, stored as "type\tname" */ -static int die_get_varname(Dwarf_Die *vr_die, char *buf, int len) -{ - int ret, ret2; - - ret = die_get_typename(vr_die, buf, len); - if (ret < 0) { - pr_debug("Failed to get type, make it unknown.\n"); - ret = snprintf(buf, len, "(unknown_type)"); - } - if (ret > 0) { - ret2 = snprintf(buf + ret, len - ret, "\t%s", - dwarf_diename(vr_die)); - ret = (ret2 >= len - ret) ? -E2BIG : ret2 + ret; - } - return ret; -} - /* * Probe finder related functions */ @@ -1206,8 +691,7 @@ static int call_probe_finder(Dwarf_Die *sp_die, struct probe_finder *pf) /* If no real subprogram, find a real one */ if (!sp_die || dwarf_tag(sp_die) != DW_TAG_subprogram) { - sp_die = die_find_real_subprogram(&pf->cu_die, - pf->addr, &die_mem); + sp_die = die_find_realfunc(&pf->cu_die, pf->addr, &die_mem); if (!sp_die) { pr_warning("Failed to find probe point in any " "functions.\n"); @@ -1768,7 +1252,7 @@ int find_perf_probe_point(unsigned long addr, struct perf_probe_point *ppt) /* Don't care whether it failed or not */ /* Find a corresponding function (name, baseline and baseaddr) */ - if (die_find_real_subprogram(&cudie, (Dwarf_Addr)addr, &spdie)) { + if (die_find_realfunc(&cudie, (Dwarf_Addr)addr, &spdie)) { /* Get function entry information */ tmp = dwarf_diename(&spdie); if (!tmp || diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h index 605730a..0f1ed3d 100644 --- a/tools/perf/util/probe-finder.h +++ b/tools/perf/util/probe-finder.h @@ -32,11 +32,7 @@ extern int find_line_range(int fd, struct line_range *lr); extern int find_available_vars_at(int fd, struct perf_probe_event *pev, struct variable_list **vls, int max_points, bool externs); - -#include -#include -#include -#include +#include "dwarf-aux.h" struct probe_finder { struct perf_probe_event *pev; /* Target probe event */ -- cgit v0.10.2 From ff741783506c340035659a71be68ddb4068760d1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:27:39 +0900 Subject: perf probe: Introduce debuginfo to encapsulate dwarf information Introduce debuginfo to encapsulate dwarf information. This new object allows us to reuse and expand debuginfo easily. Signed-off-by: Masami Hiramatsu Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20110627072739.6528.12438.stgit@fedora15 Signed-off-by: Steven Rostedt diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index f022316..920c195 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -170,16 +170,17 @@ const char *kernel_get_module_path(const char *module) } #ifdef DWARF_SUPPORT -static int open_vmlinux(const char *module) +/* Open new debuginfo of given module */ +static struct debuginfo *open_debuginfo(const char *module) { const char *path = kernel_get_module_path(module); + if (!path) { pr_err("Failed to find path of %s module.\n", module ?: "kernel"); - return -ENOENT; + return NULL; } - pr_debug("Try to open %s\n", path); - return open(path, O_RDONLY); + return debuginfo__new(path); } /* @@ -193,13 +194,24 @@ static int kprobe_convert_to_perf_probe(struct probe_trace_point *tp, struct map *map; u64 addr; int ret = -ENOENT; + struct debuginfo *dinfo; sym = __find_kernel_function_by_name(tp->symbol, &map); if (sym) { addr = map->unmap_ip(map, sym->start + tp->offset); pr_debug("try to find %s+%ld@%" PRIx64 "\n", tp->symbol, tp->offset, addr); - ret = find_perf_probe_point((unsigned long)addr, pp); + + dinfo = debuginfo__new_online_kernel(addr); + if (dinfo) { + ret = debuginfo__find_probe_point(dinfo, + (unsigned long)addr, pp); + debuginfo__delete(dinfo); + } else { + pr_debug("Failed to open debuginfo at 0x%" PRIx64 "\n", + addr); + ret = -ENOENT; + } } if (ret <= 0) { pr_debug("Failed to find corresponding probes from " @@ -220,20 +232,22 @@ static int try_to_find_probe_trace_events(struct perf_probe_event *pev, int max_tevs, const char *module) { bool need_dwarf = perf_probe_event_need_dwarf(pev); - int fd, ntevs; + struct debuginfo *dinfo = open_debuginfo(module); + int ntevs; - fd = open_vmlinux(module); - if (fd < 0) { + if (!dinfo) { if (need_dwarf) { pr_warning("Failed to open debuginfo file.\n"); - return fd; + return -ENOENT; } - pr_debug("Could not open vmlinux. Try to use symbols.\n"); + pr_debug("Could not open debuginfo. Try to use symbols.\n"); return 0; } - /* Searching trace events corresponding to probe event */ - ntevs = find_probe_trace_events(fd, pev, tevs, max_tevs); + /* Searching trace events corresponding to a probe event */ + ntevs = debuginfo__find_trace_events(dinfo, pev, tevs, max_tevs); + + debuginfo__delete(dinfo); if (ntevs > 0) { /* Succeeded to find trace events */ pr_debug("find %d probe_trace_events.\n", ntevs); @@ -371,8 +385,9 @@ int show_line_range(struct line_range *lr, const char *module) { int l = 1; struct line_node *ln; + struct debuginfo *dinfo; FILE *fp; - int fd, ret; + int ret; char *tmp; /* Search a line range */ @@ -380,13 +395,14 @@ int show_line_range(struct line_range *lr, const char *module) if (ret < 0) return ret; - fd = open_vmlinux(module); - if (fd < 0) { + dinfo = open_debuginfo(module); + if (!dinfo) { pr_warning("Failed to open debuginfo file.\n"); - return fd; + return -ENOENT; } - ret = find_line_range(fd, lr); + ret = debuginfo__find_line_range(dinfo, lr); + debuginfo__delete(dinfo); if (ret == 0) { pr_warning("Specified source line is not found.\n"); return -ENOENT; @@ -448,7 +464,8 @@ end: return ret; } -static int show_available_vars_at(int fd, struct perf_probe_event *pev, +static int show_available_vars_at(struct debuginfo *dinfo, + struct perf_probe_event *pev, int max_vls, struct strfilter *_filter, bool externs) { @@ -463,7 +480,8 @@ static int show_available_vars_at(int fd, struct perf_probe_event *pev, return -EINVAL; pr_debug("Searching variables at %s\n", buf); - ret = find_available_vars_at(fd, pev, &vls, max_vls, externs); + ret = debuginfo__find_available_vars_at(dinfo, pev, &vls, + max_vls, externs); if (ret <= 0) { pr_err("Failed to find variables at %s (%d)\n", buf, ret); goto end; @@ -504,24 +522,26 @@ int show_available_vars(struct perf_probe_event *pevs, int npevs, int max_vls, const char *module, struct strfilter *_filter, bool externs) { - int i, fd, ret = 0; + int i, ret = 0; + struct debuginfo *dinfo; ret = init_vmlinux(); if (ret < 0) return ret; + dinfo = open_debuginfo(module); + if (!dinfo) { + pr_warning("Failed to open debuginfo file.\n"); + return -ENOENT; + } + setup_pager(); - for (i = 0; i < npevs && ret >= 0; i++) { - fd = open_vmlinux(module); - if (fd < 0) { - pr_warning("Failed to open debug information file.\n"); - ret = fd; - break; - } - ret = show_available_vars_at(fd, &pevs[i], max_vls, _filter, + for (i = 0; i < npevs && ret >= 0; i++) + ret = show_available_vars_at(dinfo, &pevs[i], max_vls, _filter, externs); - } + + debuginfo__delete(dinfo); return ret; } diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 53d219b..3e44a3e 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -116,29 +116,37 @@ static const Dwfl_Callbacks offline_callbacks = { }; /* Get a Dwarf from offline image */ -static Dwarf *dwfl_init_offline_dwarf(int fd, Dwfl **dwflp, Dwarf_Addr *bias) +static int debuginfo__init_offline_dwarf(struct debuginfo *self, + const char *path) { Dwfl_Module *mod; - Dwarf *dbg = NULL; + int fd; - if (!dwflp) - return NULL; + fd = open(path, O_RDONLY); + if (fd < 0) + return fd; - *dwflp = dwfl_begin(&offline_callbacks); - if (!*dwflp) - return NULL; + self->dwfl = dwfl_begin(&offline_callbacks); + if (!self->dwfl) + goto error; - mod = dwfl_report_offline(*dwflp, "", "", fd); + mod = dwfl_report_offline(self->dwfl, "", "", fd); if (!mod) goto error; - dbg = dwfl_module_getdwarf(mod, bias); - if (!dbg) { + self->dbg = dwfl_module_getdwarf(mod, &self->bias); + if (!self->dbg) + goto error; + + return 0; error: - dwfl_end(*dwflp); - *dwflp = NULL; - } - return dbg; + if (self->dwfl) + dwfl_end(self->dwfl); + else + close(fd); + memset(self, 0, sizeof(*self)); + + return -ENOENT; } #if _ELFUTILS_PREREQ(0, 148) @@ -174,53 +182,82 @@ static const Dwfl_Callbacks kernel_callbacks = { }; /* Get a Dwarf from live kernel image */ -static Dwarf *dwfl_init_live_kernel_dwarf(Dwarf_Addr addr, Dwfl **dwflp, - Dwarf_Addr *bias) +static int debuginfo__init_online_kernel_dwarf(struct debuginfo *self, + Dwarf_Addr addr) { - Dwarf *dbg; - - if (!dwflp) - return NULL; - - *dwflp = dwfl_begin(&kernel_callbacks); - if (!*dwflp) - return NULL; + self->dwfl = dwfl_begin(&kernel_callbacks); + if (!self->dwfl) + return -EINVAL; /* Load the kernel dwarves: Don't care the result here */ - dwfl_linux_kernel_report_kernel(*dwflp); - dwfl_linux_kernel_report_modules(*dwflp); + dwfl_linux_kernel_report_kernel(self->dwfl); + dwfl_linux_kernel_report_modules(self->dwfl); - dbg = dwfl_addrdwarf(*dwflp, addr, bias); + self->dbg = dwfl_addrdwarf(self->dwfl, addr, &self->bias); /* Here, check whether we could get a real dwarf */ - if (!dbg) { + if (!self->dbg) { pr_debug("Failed to find kernel dwarf at %lx\n", (unsigned long)addr); - dwfl_end(*dwflp); - *dwflp = NULL; + dwfl_end(self->dwfl); + memset(self, 0, sizeof(*self)); + return -ENOENT; } - return dbg; + + return 0; } #else /* With older elfutils, this just support kernel module... */ -static Dwarf *dwfl_init_live_kernel_dwarf(Dwarf_Addr addr __used, Dwfl **dwflp, - Dwarf_Addr *bias) +static int debuginfo__init_online_kernel_dwarf(struct debuginfo *self, + Dwarf_Addr addr __used) { - int fd; const char *path = kernel_get_module_path("kernel"); if (!path) { pr_err("Failed to find vmlinux path\n"); - return NULL; + return -ENOENT; } pr_debug2("Use file %s for debuginfo\n", path); - fd = open(path, O_RDONLY); - if (fd < 0) + return debuginfo__init_offline_dwarf(self, path); +} +#endif + +struct debuginfo *debuginfo__new(const char *path) +{ + struct debuginfo *self = zalloc(sizeof(struct debuginfo)); + if (!self) return NULL; - return dwfl_init_offline_dwarf(fd, dwflp, bias); + if (debuginfo__init_offline_dwarf(self, path) < 0) { + free(self); + self = NULL; + } + + return self; +} + +struct debuginfo *debuginfo__new_online_kernel(unsigned long addr) +{ + struct debuginfo *self = zalloc(sizeof(struct debuginfo)); + if (!self) + return NULL; + + if (debuginfo__init_online_kernel_dwarf(self, (Dwarf_Addr)addr) < 0) { + free(self); + self = NULL; + } + + return self; +} + +void debuginfo__delete(struct debuginfo *self) +{ + if (self) { + if (self->dwfl) + dwfl_end(self->dwfl); + free(self); + } } -#endif /* * Probe finder related functions @@ -949,28 +986,18 @@ static int pubname_search_cb(Dwarf *dbg, Dwarf_Global *gl, void *data) } /* Find probe points from debuginfo */ -static int find_probes(int fd, struct probe_finder *pf) +static int debuginfo__find_probes(struct debuginfo *self, + struct probe_finder *pf) { struct perf_probe_point *pp = &pf->pev->point; Dwarf_Off off, noff; size_t cuhl; Dwarf_Die *diep; - Dwarf *dbg = NULL; - Dwfl *dwfl; - Dwarf_Addr bias; /* Currently ignored */ int ret = 0; - dbg = dwfl_init_offline_dwarf(fd, &dwfl, &bias); - if (!dbg) { - pr_warning("No debug information found in the vmlinux - " - "please rebuild with CONFIG_DEBUG_INFO=y.\n"); - close(fd); /* Without dwfl_end(), fd isn't closed. */ - return -EBADF; - } - #if _ELFUTILS_PREREQ(0, 142) /* Get the call frame information from this dwarf */ - pf->cfi = dwarf_getcfi(dbg); + pf->cfi = dwarf_getcfi(self->dbg); #endif off = 0; @@ -989,7 +1016,8 @@ static int find_probes(int fd, struct probe_finder *pf) .data = pf, }; - dwarf_getpubnames(dbg, pubname_search_cb, &pubname_param, 0); + dwarf_getpubnames(self->dbg, pubname_search_cb, + &pubname_param, 0); if (pubname_param.found) { ret = probe_point_search_cb(&pf->sp_die, &probe_param); if (ret) @@ -998,9 +1026,9 @@ static int find_probes(int fd, struct probe_finder *pf) } /* Loop on CUs (Compilation Unit) */ - while (!dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL)) { + while (!dwarf_nextcu(self->dbg, off, &noff, &cuhl, NULL, NULL, NULL)) { /* Get the DIE(Debugging Information Entry) of this CU */ - diep = dwarf_offdie(dbg, off + cuhl, &pf->cu_die); + diep = dwarf_offdie(self->dbg, off + cuhl, &pf->cu_die); if (!diep) continue; @@ -1027,8 +1055,6 @@ static int find_probes(int fd, struct probe_finder *pf) found: line_list__free(&pf->lcache); - if (dwfl) - dwfl_end(dwfl); return ret; } @@ -1074,8 +1100,9 @@ static int add_probe_trace_event(Dwarf_Die *sp_die, struct probe_finder *pf) } /* Find probe_trace_events specified by perf_probe_event from debuginfo */ -int find_probe_trace_events(int fd, struct perf_probe_event *pev, - struct probe_trace_event **tevs, int max_tevs) +int debuginfo__find_trace_events(struct debuginfo *self, + struct perf_probe_event *pev, + struct probe_trace_event **tevs, int max_tevs) { struct trace_event_finder tf = { .pf = {.pev = pev, .callback = add_probe_trace_event}, @@ -1090,7 +1117,7 @@ int find_probe_trace_events(int fd, struct perf_probe_event *pev, tf.tevs = *tevs; tf.ntevs = 0; - ret = find_probes(fd, &tf.pf); + ret = debuginfo__find_probes(self, &tf.pf); if (ret < 0) { free(*tevs); *tevs = NULL; @@ -1184,9 +1211,10 @@ out: } /* Find available variables at given probe point */ -int find_available_vars_at(int fd, struct perf_probe_event *pev, - struct variable_list **vls, int max_vls, - bool externs) +int debuginfo__find_available_vars_at(struct debuginfo *self, + struct perf_probe_event *pev, + struct variable_list **vls, + int max_vls, bool externs) { struct available_var_finder af = { .pf = {.pev = pev, .callback = add_available_vars}, @@ -1201,7 +1229,7 @@ int find_available_vars_at(int fd, struct perf_probe_event *pev, af.vls = *vls; af.nvls = 0; - ret = find_probes(fd, &af.pf); + ret = debuginfo__find_probes(self, &af.pf); if (ret < 0) { /* Free vlist for error */ while (af.nvls--) { @@ -1219,28 +1247,19 @@ int find_available_vars_at(int fd, struct perf_probe_event *pev, } /* Reverse search */ -int find_perf_probe_point(unsigned long addr, struct perf_probe_point *ppt) +int debuginfo__find_probe_point(struct debuginfo *self, unsigned long addr, + struct perf_probe_point *ppt) { Dwarf_Die cudie, spdie, indie; - Dwarf *dbg = NULL; - Dwfl *dwfl = NULL; - Dwarf_Addr _addr, baseaddr, bias = 0; + Dwarf_Addr _addr, baseaddr; const char *fname = NULL, *func = NULL, *tmp; int baseline = 0, lineno = 0, ret = 0; - /* Open the live linux kernel */ - dbg = dwfl_init_live_kernel_dwarf(addr, &dwfl, &bias); - if (!dbg) { - pr_warning("No debug information found in the vmlinux - " - "please rebuild with CONFIG_DEBUG_INFO=y.\n"); - ret = -EINVAL; - goto end; - } - /* Adjust address with bias */ - addr += bias; + addr += self->bias; + /* Find cu die */ - if (!dwarf_addrdie(dbg, (Dwarf_Addr)addr - bias, &cudie)) { + if (!dwarf_addrdie(self->dbg, (Dwarf_Addr)addr - self->bias, &cudie)) { pr_warning("Failed to find debug information for address %lx\n", addr); ret = -EINVAL; @@ -1316,8 +1335,6 @@ post: } } end: - if (dwfl) - dwfl_end(dwfl); if (ret == 0 && (fname || func)) ret = 1; /* Found a point */ return ret; @@ -1427,26 +1444,15 @@ static int find_line_range_by_func(struct line_finder *lf) return param.retval; } -int find_line_range(int fd, struct line_range *lr) +int debuginfo__find_line_range(struct debuginfo *self, struct line_range *lr) { struct line_finder lf = {.lr = lr, .found = 0}; int ret = 0; Dwarf_Off off = 0, noff; size_t cuhl; Dwarf_Die *diep; - Dwarf *dbg = NULL; - Dwfl *dwfl; - Dwarf_Addr bias; /* Currently ignored */ const char *comp_dir; - dbg = dwfl_init_offline_dwarf(fd, &dwfl, &bias); - if (!dbg) { - pr_warning("No debug information found in the vmlinux - " - "please rebuild with CONFIG_DEBUG_INFO=y.\n"); - close(fd); /* Without dwfl_end(), fd isn't closed. */ - return -EBADF; - } - /* Fastpath: lookup by function name from .debug_pubnames section */ if (lr->function) { struct pubname_callback_param pubname_param = { @@ -1455,7 +1461,8 @@ int find_line_range(int fd, struct line_range *lr) struct dwarf_callback_param line_range_param = { .data = (void *)&lf, .retval = 0}; - dwarf_getpubnames(dbg, pubname_search_cb, &pubname_param, 0); + dwarf_getpubnames(self->dbg, pubname_search_cb, + &pubname_param, 0); if (pubname_param.found) { line_range_search_cb(&lf.sp_die, &line_range_param); if (lf.found) @@ -1465,11 +1472,12 @@ int find_line_range(int fd, struct line_range *lr) /* Loop on CUs (Compilation Unit) */ while (!lf.found && ret >= 0) { - if (dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL) != 0) + if (dwarf_nextcu(self->dbg, off, &noff, &cuhl, + NULL, NULL, NULL) != 0) break; /* Get the DIE(Debugging Information Entry) of this CU */ - diep = dwarf_offdie(dbg, off + cuhl, &lf.cu_die); + diep = dwarf_offdie(self->dbg, off + cuhl, &lf.cu_die); if (!diep) continue; @@ -1503,7 +1511,6 @@ found: } pr_debug("path: %s\n", lr->path); - dwfl_end(dwfl); return (ret < 0) ? ret : lf.found; } diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h index 0f1ed3d..c478b42 100644 --- a/tools/perf/util/probe-finder.h +++ b/tools/perf/util/probe-finder.h @@ -16,23 +16,42 @@ static inline int is_c_varname(const char *name) } #ifdef DWARF_SUPPORT + +#include "dwarf-aux.h" + +/* TODO: export debuginfo data structure even if no dwarf support */ + +/* debug information structure */ +struct debuginfo { + Dwarf *dbg; + Dwfl *dwfl; + Dwarf_Addr bias; +}; + +extern struct debuginfo *debuginfo__new(const char *path); +extern struct debuginfo *debuginfo__new_online_kernel(unsigned long addr); +extern void debuginfo__delete(struct debuginfo *self); + /* Find probe_trace_events specified by perf_probe_event from debuginfo */ -extern int find_probe_trace_events(int fd, struct perf_probe_event *pev, - struct probe_trace_event **tevs, - int max_tevs); +extern int debuginfo__find_trace_events(struct debuginfo *self, + struct perf_probe_event *pev, + struct probe_trace_event **tevs, + int max_tevs); /* Find a perf_probe_point from debuginfo */ -extern int find_perf_probe_point(unsigned long addr, - struct perf_probe_point *ppt); +extern int debuginfo__find_probe_point(struct debuginfo *self, + unsigned long addr, + struct perf_probe_point *ppt); /* Find a line range */ -extern int find_line_range(int fd, struct line_range *lr); +extern int debuginfo__find_line_range(struct debuginfo *self, + struct line_range *lr); /* Find available variables */ -extern int find_available_vars_at(int fd, struct perf_probe_event *pev, - struct variable_list **vls, int max_points, - bool externs); -#include "dwarf-aux.h" +extern int debuginfo__find_available_vars_at(struct debuginfo *self, + struct perf_probe_event *pev, + struct variable_list **vls, + int max_points, bool externs); struct probe_finder { struct perf_probe_event *pev; /* Target probe event */ -- cgit v0.10.2 From 190b57fcb9c5fed5414935a174094f534fc510bc Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:27:45 +0900 Subject: perf probe: Add probed module in front of function Add probed module name and ":" in front of function name if -m module option is given. In the result, the symbol name passed to kprobe-tracer becomes MODULE:FUNCTION, so that kallsyms can solve it as a symbol in the module correctly. Signed-off-by: Masami Hiramatsu Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20110627072745.6528.26416.stgit@fedora15 Signed-off-by: Steven Rostedt diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 920c195..ee3f41e 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -226,14 +226,26 @@ static int kprobe_convert_to_perf_probe(struct probe_trace_point *tp, return 0; } +static int add_module_to_probe_trace_events(struct probe_trace_event *tevs, + int ntevs, const char *module) +{ + int i; + for (i = 0; i < ntevs; i++) { + tevs[i].point.module = strdup(module); + if (!tevs[i].point.module) + return -ENOMEM; + } + return 0; +} + /* Try to find perf_probe_event with debuginfo */ static int try_to_find_probe_trace_events(struct perf_probe_event *pev, - struct probe_trace_event **tevs, - int max_tevs, const char *module) + struct probe_trace_event **tevs, + int max_tevs, const char *module) { bool need_dwarf = perf_probe_event_need_dwarf(pev); struct debuginfo *dinfo = open_debuginfo(module); - int ntevs; + int ntevs, ret = 0; if (!dinfo) { if (need_dwarf) { @@ -251,7 +263,10 @@ static int try_to_find_probe_trace_events(struct perf_probe_event *pev, if (ntevs > 0) { /* Succeeded to find trace events */ pr_debug("find %d probe_trace_events.\n", ntevs); - return ntevs; + if (module) + ret = add_module_to_probe_trace_events(*tevs, ntevs, + module); + return ret < 0 ? ret : ntevs; } if (ntevs == 0) { /* No error but failed to find probe point. */ @@ -1010,7 +1025,7 @@ bool perf_probe_event_need_dwarf(struct perf_probe_event *pev) /* Parse probe_events event into struct probe_point */ static int parse_probe_trace_command(const char *cmd, - struct probe_trace_event *tev) + struct probe_trace_event *tev) { struct probe_trace_point *tp = &tev->point; char pr; @@ -1043,8 +1058,14 @@ static int parse_probe_trace_command(const char *cmd, tp->retprobe = (pr == 'r'); - /* Scan function name and offset */ - ret = sscanf(argv[1], "%a[^+]+%lu", (float *)(void *)&tp->symbol, + /* Scan module name(if there), function name and offset */ + p = strchr(argv[1], ':'); + if (p) { + tp->module = strndup(argv[1], p - argv[1]); + p++; + } else + p = argv[1]; + ret = sscanf(p, "%a[^+]+%lu", (float *)(void *)&tp->symbol, &tp->offset); if (ret == 1) tp->offset = 0; @@ -1289,9 +1310,10 @@ char *synthesize_probe_trace_command(struct probe_trace_event *tev) if (buf == NULL) return NULL; - len = e_snprintf(buf, MAX_CMDLEN, "%c:%s/%s %s+%lu", + len = e_snprintf(buf, MAX_CMDLEN, "%c:%s/%s %s%s%s+%lu", tp->retprobe ? 'r' : 'p', tev->group, tev->event, + tp->module ?: "", tp->module ? ":" : "", tp->symbol, tp->offset); if (len <= 0) goto error; @@ -1398,6 +1420,8 @@ static void clear_probe_trace_event(struct probe_trace_event *tev) free(tev->group); if (tev->point.symbol) free(tev->point.symbol); + if (tev->point.module) + free(tev->point.module); for (i = 0; i < tev->nargs; i++) { if (tev->args[i].name) free(tev->args[i].name); @@ -1749,7 +1773,7 @@ static int convert_to_probe_trace_events(struct perf_probe_event *pev, /* Convert perf_probe_event with debuginfo */ ret = try_to_find_probe_trace_events(pev, tevs, max_tevs, module); if (ret != 0) - return ret; + return ret; /* Found in debuginfo or got an error */ /* Allocate trace event buffer */ tev = *tevs = zalloc(sizeof(struct probe_trace_event)); @@ -1762,6 +1786,11 @@ static int convert_to_probe_trace_events(struct perf_probe_event *pev, ret = -ENOMEM; goto error; } + tev->point.module = strdup(module); + if (tev->point.module == NULL) { + ret = -ENOMEM; + goto error; + } tev->point.offset = pev->point.offset; tev->point.retprobe = pev->point.retprobe; tev->nargs = pev->nargs; diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h index 3434fc9..a7dee83 100644 --- a/tools/perf/util/probe-event.h +++ b/tools/perf/util/probe-event.h @@ -10,6 +10,7 @@ extern bool probe_event_dry_run; /* kprobe-tracer tracing point */ struct probe_trace_point { char *symbol; /* Base symbol */ + char *module; /* Module name */ unsigned long offset; /* Offset from symbol */ bool retprobe; /* Return probe flag */ }; -- cgit v0.10.2 From 14a8fd7ceea6915c613746203d6e9a2bf273f16c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:27:51 +0900 Subject: perf probe: Support adding probes on offline kernel modules Support adding probes on offline kernel modules. This enables perf-probe to trace kernel-module init functions via perf-probe. If user gives the path of module with -m option, perf-probe expects the module is offline. This feature works with --add, --funcs, and --vars. E.g) # perf probe -m /lib/modules/`uname -r`/kernel/fs/btrfs/btrfs.ko \ -a "extent_io_init:5 extent_state_cache" Add new events: probe:extent_io_init (on extent_io_init:5 with extent_state_cache) probe:extent_io_init_1 (on extent_io_init:5 with extent_state_cache) You can now use it on all perf tools, such as: perf record -e probe:extent_io_init_1 -aR sleep 1 Signed-off-by: Masami Hiramatsu Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20110627072751.6528.10230.stgit@fedora15 Signed-off-by: Steven Rostedt diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt index 02bafce..2780d9c 100644 --- a/tools/perf/Documentation/perf-probe.txt +++ b/tools/perf/Documentation/perf-probe.txt @@ -34,9 +34,11 @@ OPTIONS Specify vmlinux path which has debuginfo (Dwarf binary). -m:: ---module=MODNAME:: +--module=MODNAME|PATH:: Specify module name in which perf-probe searches probe points - or lines. + or lines. If a path of module file is passed, perf-probe + treat it as an offline module (this means you can add a probe on + a module which has not been loaded yet). -s:: --source=PATH:: diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index 2c0e64d..5f2a5c7 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -242,7 +242,8 @@ static const struct option options[] = { OPT_STRING('s', "source", &symbol_conf.source_prefix, "directory", "path to kernel source"), OPT_STRING('m', "module", ¶ms.target_module, - "modname", "target module name"), + "modname|path", + "target module name (for online) or path (for offline)"), #endif OPT__DRY_RUN(&probe_event_dry_run), OPT_INTEGER('\0', "max-probes", ¶ms.max_probe_points, diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index ee3f41e..b82d54f 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -117,6 +117,10 @@ static struct map *kernel_get_module_map(const char *module) struct rb_node *nd; struct map_groups *grp = &machine.kmaps; + /* A file path -- this is an offline module */ + if (module && strchr(module, '/')) + return machine__new_module(&machine, 0, module); + if (!module) module = "kernel"; @@ -173,12 +177,19 @@ const char *kernel_get_module_path(const char *module) /* Open new debuginfo of given module */ static struct debuginfo *open_debuginfo(const char *module) { - const char *path = kernel_get_module_path(module); + const char *path; - if (!path) { - pr_err("Failed to find path of %s module.\n", - module ?: "kernel"); - return NULL; + /* A file path -- this is an offline module */ + if (module && strchr(module, '/')) + path = module; + else { + path = kernel_get_module_path(module); + + if (!path) { + pr_err("Failed to find path of %s module.\n", + module ?: "kernel"); + return NULL; + } } return debuginfo__new(path); } @@ -229,13 +240,36 @@ static int kprobe_convert_to_perf_probe(struct probe_trace_point *tp, static int add_module_to_probe_trace_events(struct probe_trace_event *tevs, int ntevs, const char *module) { - int i; + int i, ret = 0; + char *tmp; + + if (!module) + return 0; + + tmp = strrchr(module, '/'); + if (tmp) { + /* This is a module path -- get the module name */ + module = strdup(tmp + 1); + if (!module) + return -ENOMEM; + tmp = strchr(module, '.'); + if (tmp) + *tmp = '\0'; + tmp = (char *)module; /* For free() */ + } + for (i = 0; i < ntevs; i++) { tevs[i].point.module = strdup(module); - if (!tevs[i].point.module) - return -ENOMEM; + if (!tevs[i].point.module) { + ret = -ENOMEM; + break; + } } - return 0; + + if (tmp) + free(tmp); + + return ret; } /* Try to find perf_probe_event with debuginfo */ -- cgit v0.10.2 From 0111919da268e1ced315e009ad0d0435a2fb32ac Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 13 Jul 2011 22:58:18 +0200 Subject: perf tools: Add missing 'node' alias to the hw_cache[] array Add "node" as a simple alias for NODE cache events. The addition of NODE cache events broke the parse_alias function, so any mismatched event caused the segfault, like: # ./perf stat -e krava ls The hw_cache/hw_cache_op/hw_cache_result arrays needs to follow PERF_COUNT_HW_CACHE_*MAX enums. Adding those MAXs to be size of those arrays, so possible ommision in future wil not lead to segfault. Adding read/write/prefetch as allowed operations for node cache event. Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: acme@redhat.com Link: http://lkml.kernel.org/r/20110713205818.GB7827@jolsa.brq.redhat.com Signed-off-by: Ingo Molnar diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 41982c3..c0e21ae 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -86,22 +86,24 @@ static const char *sw_event_names[PERF_COUNT_SW_MAX] = { #define MAX_ALIASES 8 -static const char *hw_cache[][MAX_ALIASES] = { +static const char *hw_cache[PERF_COUNT_HW_CACHE_MAX][MAX_ALIASES] = { { "L1-dcache", "l1-d", "l1d", "L1-data", }, { "L1-icache", "l1-i", "l1i", "L1-instruction", }, - { "LLC", "L2" }, + { "LLC", "L2", }, { "dTLB", "d-tlb", "Data-TLB", }, { "iTLB", "i-tlb", "Instruction-TLB", }, { "branch", "branches", "bpu", "btb", "bpc", }, + { "node", }, }; -static const char *hw_cache_op[][MAX_ALIASES] = { +static const char *hw_cache_op[PERF_COUNT_HW_CACHE_OP_MAX][MAX_ALIASES] = { { "load", "loads", "read", }, { "store", "stores", "write", }, { "prefetch", "prefetches", "speculative-read", "speculative-load", }, }; -static const char *hw_cache_result[][MAX_ALIASES] = { +static const char *hw_cache_result[PERF_COUNT_HW_CACHE_RESULT_MAX] + [MAX_ALIASES] = { { "refs", "Reference", "ops", "access", }, { "misses", "miss", }, }; @@ -124,6 +126,7 @@ static unsigned long hw_cache_stat[C(MAX)] = { [C(DTLB)] = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH), [C(ITLB)] = (CACHE_READ), [C(BPU)] = (CACHE_READ), + [C(NODE)] = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH), }; #define for_each_subsystem(sys_dir, sys_dirent, sys_next) \ @@ -393,7 +396,7 @@ parse_generic_hw_event(const char **str, struct perf_event_attr *attr) PERF_COUNT_HW_CACHE_OP_MAX); if (cache_op >= 0) { if (!is_cache_op_valid(cache_type, cache_op)) - return 0; + return EVT_FAILED; continue; } } -- cgit v0.10.2 From eda3913bb70ecebac13adccffe1e7f96e93cee02 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 15 Jul 2011 12:34:09 -0600 Subject: perf tools: Fix endian conversion reading event attr from file header The perf_event_attr struct has two __u32's at the top and they need to be swapped individually. With this change I was able to analyze a perf.data collected in a 32-bit PPC VM on an x86 system. I tested both 32-bit and 64-bit binaries for the Intel analysis side; both read the PPC perf.data file correctly. -v2: - changed the existing perf_event__attr_swap() to swap only elements of perf_event_attr and exported it for use in swapping the attributes in the file header - updated swap_ops used for processing events Signed-off-by: David Ahern Acked-by: Frederic Weisbecker Cc: acme@ghostprotocols.net Cc: peterz@infradead.org Cc: paulus@samba.org Cc: Link: http://lkml.kernel.org/r/1310754849-12474-1-git-send-email-dsahern@gmail.com Signed-off-by: Ingo Molnar diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index afb0849..cb2959a 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -877,9 +877,12 @@ int perf_session__read_header(struct perf_session *session, int fd) struct perf_evsel *evsel; off_t tmp; - if (perf_header__getbuffer64(header, fd, &f_attr, sizeof(f_attr))) + if (readn(fd, &f_attr, sizeof(f_attr)) <= 0) goto out_errno; + if (header->needs_swap) + perf_event__attr_swap(&f_attr.attr); + tmp = lseek(fd, 0, SEEK_CUR); evsel = perf_evsel__new(&f_attr.attr, i); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 080e533..a3c13bc 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -413,20 +413,26 @@ static void perf_event__read_swap(union perf_event *event) event->read.id = bswap_64(event->read.id); } -static void perf_event__attr_swap(union perf_event *event) +/* exported for swapping attributes in file header */ +void perf_event__attr_swap(struct perf_event_attr *attr) +{ + attr->type = bswap_32(attr->type); + attr->size = bswap_32(attr->size); + attr->config = bswap_64(attr->config); + attr->sample_period = bswap_64(attr->sample_period); + attr->sample_type = bswap_64(attr->sample_type); + attr->read_format = bswap_64(attr->read_format); + attr->wakeup_events = bswap_32(attr->wakeup_events); + attr->bp_type = bswap_32(attr->bp_type); + attr->bp_addr = bswap_64(attr->bp_addr); + attr->bp_len = bswap_64(attr->bp_len); +} + +static void perf_event__hdr_attr_swap(union perf_event *event) { size_t size; - event->attr.attr.type = bswap_32(event->attr.attr.type); - event->attr.attr.size = bswap_32(event->attr.attr.size); - event->attr.attr.config = bswap_64(event->attr.attr.config); - event->attr.attr.sample_period = bswap_64(event->attr.attr.sample_period); - event->attr.attr.sample_type = bswap_64(event->attr.attr.sample_type); - event->attr.attr.read_format = bswap_64(event->attr.attr.read_format); - event->attr.attr.wakeup_events = bswap_32(event->attr.attr.wakeup_events); - event->attr.attr.bp_type = bswap_32(event->attr.attr.bp_type); - event->attr.attr.bp_addr = bswap_64(event->attr.attr.bp_addr); - event->attr.attr.bp_len = bswap_64(event->attr.attr.bp_len); + perf_event__attr_swap(&event->attr.attr); size = event->header.size; size -= (void *)&event->attr.id - (void *)event; @@ -454,7 +460,7 @@ static perf_event__swap_op perf_event__swap_ops[] = { [PERF_RECORD_LOST] = perf_event__all64_swap, [PERF_RECORD_READ] = perf_event__read_swap, [PERF_RECORD_SAMPLE] = perf_event__all64_swap, - [PERF_RECORD_HEADER_ATTR] = perf_event__attr_swap, + [PERF_RECORD_HEADER_ATTR] = perf_event__hdr_attr_swap, [PERF_RECORD_HEADER_EVENT_TYPE] = perf_event__event_type_swap, [PERF_RECORD_HEADER_TRACING_DATA] = perf_event__tracing_data_swap, [PERF_RECORD_HEADER_BUILD_ID] = NULL, diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 5de754f..170601e 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -112,6 +112,7 @@ int perf_session__set_kallsyms_ref_reloc_sym(struct map **maps, u64 addr); void mem_bswap_64(void *src, int byte_size); +void perf_event__attr_swap(struct perf_event_attr *attr); int perf_session__create_kernel_maps(struct perf_session *self); -- cgit v0.10.2 From adc4bf9955856f8aa081ba613dbf56ffd664f0b7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 30 May 2011 09:16:27 -0600 Subject: perf script: Fix display of IP address for non-callchain path Non-callchain path is using al.addr which prints as: openssl 14564 17672.003587: 7862d _x86_64_AES_encrypt_compact This should be sample->ip to print as: openssl 14564 17672.003587: 3f7867862d _x86_64_AES_encrypt_compact Signed-off-by: David Ahern Acked-by: Frederic Weisbecker Cc: acme@ghostprotocols.net Cc: peterz@infradead.org Cc: paulus@samba.org Link: http://lkml.kernel.org/r/1306768587-15376-1-git-send-email-dsahern@gmail.com Signed-off-by: Ingo Molnar diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index a3c13bc..72458d9 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1269,7 +1269,7 @@ void perf_session__print_ip(union perf_event *event, } } else { - printf("%16" PRIx64, al.addr); + printf("%16" PRIx64, sample->ip); if (print_sym) { if (al.sym && al.sym->name) symname = al.sym->name; -- cgit v0.10.2 From f120f9d51be3a7db8991e7b78dc08bab5f8ab8f3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 14 Jul 2011 11:25:32 +0200 Subject: perf tools: De-opt the parse_events function Moving out the option parameter from parse_events function, and adding new parse_events_option function instead. The option parameter is used only to carry "struct perf_evlist" pointer for chaining new events. Putting it away, enable us to call parse_events from other places without using the option parameter. Signed-off-by: Jiri Olsa Cc: acme@redhat.com Cc: a.p.zijlstra@chello.nl Cc: paulus@samba.org Link: http://lkml.kernel.org/r/1310635534-4013-2-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 8e2c857..80dc5b7 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -740,7 +740,7 @@ static bool force, append_file; const struct option record_options[] = { OPT_CALLBACK('e', "event", &evsel_list, "event", "event selector. use 'perf list' to list available events", - parse_events), + parse_events_option), OPT_CALLBACK(0, "filter", &evsel_list, "filter", "event filter", parse_filter), OPT_INTEGER('p', "pid", &target_pid, diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 1d08c80..1ad04ce 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1032,7 +1032,7 @@ static int stat__set_big_num(const struct option *opt __used, static const struct option options[] = { OPT_CALLBACK('e', "event", &evsel_list, "event", "event selector. use 'perf list' to list available events", - parse_events), + parse_events_option), OPT_CALLBACK(0, "filter", &evsel_list, "filter", "event filter", parse_filter), OPT_BOOLEAN('i', "no-inherit", &no_inherit, diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index f2f3f49..a43433f 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -990,7 +990,7 @@ static const char * const top_usage[] = { static const struct option options[] = { OPT_CALLBACK('e', "event", &top.evlist, "event", "event selector. use 'perf list' to list available events", - parse_events), + parse_events_option), OPT_INTEGER('c', "count", &default_interval, "event period to sample"), OPT_INTEGER('p', "pid", &top.target_pid, diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index c0e21ae..4ea7e19 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -478,7 +478,7 @@ parse_single_tracepoint_event(char *sys_name, /* sys + ':' + event + ':' + flags*/ #define MAX_EVOPT_LEN (MAX_EVENT_LENGTH * 2 + 2 + 128) static enum event_result -parse_multiple_tracepoint_event(const struct option *opt, char *sys_name, +parse_multiple_tracepoint_event(struct perf_evlist *evlist, char *sys_name, const char *evt_exp, char *flags) { char evt_path[MAXPATHLEN]; @@ -512,7 +512,7 @@ parse_multiple_tracepoint_event(const struct option *opt, char *sys_name, if (len < 0) return EVT_FAILED; - if (parse_events(opt, event_opt, 0)) + if (parse_events(evlist, event_opt, 0)) return EVT_FAILED; } @@ -520,7 +520,7 @@ parse_multiple_tracepoint_event(const struct option *opt, char *sys_name, } static enum event_result -parse_tracepoint_event(const struct option *opt, const char **strp, +parse_tracepoint_event(struct perf_evlist *evlist, const char **strp, struct perf_event_attr *attr) { const char *evt_name; @@ -560,8 +560,8 @@ parse_tracepoint_event(const struct option *opt, const char **strp, return EVT_FAILED; if (strpbrk(evt_name, "*?")) { *strp += strlen(sys_name) + evt_length + 1; /* 1 == the ':' */ - return parse_multiple_tracepoint_event(opt, sys_name, evt_name, - flags); + return parse_multiple_tracepoint_event(evlist, sys_name, + evt_name, flags); } else { return parse_single_tracepoint_event(sys_name, evt_name, evt_length, attr, strp); @@ -781,12 +781,12 @@ parse_event_modifier(const char **strp, struct perf_event_attr *attr) * Symbolic names are (almost) exactly matched. */ static enum event_result -parse_event_symbols(const struct option *opt, const char **str, +parse_event_symbols(struct perf_evlist *evlist, const char **str, struct perf_event_attr *attr) { enum event_result ret; - ret = parse_tracepoint_event(opt, str, attr); + ret = parse_tracepoint_event(evlist, str, attr); if (ret != EVT_FAILED) goto modifier; @@ -825,9 +825,8 @@ modifier: return ret; } -int parse_events(const struct option *opt, const char *str, int unset __used) +int parse_events(struct perf_evlist *evlist , const char *str, int unset __used) { - struct perf_evlist *evlist = *(struct perf_evlist **)opt->value; struct perf_event_attr attr; enum event_result ret; const char *ostr; @@ -835,7 +834,7 @@ int parse_events(const struct option *opt, const char *str, int unset __used) for (;;) { ostr = str; memset(&attr, 0, sizeof(attr)); - ret = parse_event_symbols(opt, &str, &attr); + ret = parse_event_symbols(evlist, &str, &attr); if (ret == EVT_FAILED) return -1; @@ -866,6 +865,13 @@ int parse_events(const struct option *opt, const char *str, int unset __used) return 0; } +int parse_events_option(const struct option *opt, const char *str, + int unset __used) +{ + struct perf_evlist *evlist = *(struct perf_evlist **)opt->value; + return parse_events(evlist, str, unset); +} + int parse_filter(const struct option *opt, const char *str, int unset __used) { diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 746d3fc..2f8e375 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -8,6 +8,7 @@ struct list_head; struct perf_evsel; +struct perf_evlist; struct option; @@ -24,7 +25,10 @@ const char *event_type(int type); const char *event_name(struct perf_evsel *event); extern const char *__event_name(int type, u64 config); -extern int parse_events(const struct option *opt, const char *str, int unset); +extern int parse_events_option(const struct option *opt, const char *str, + int unset); +extern int parse_events(struct perf_evlist *evlist, const char *str, + int unset); extern int parse_filter(const struct option *opt, const char *str, int unset); #define EVENTS_HELP_MAX (128*1024) -- cgit v0.10.2 From 13b62567e909125145f90e91625b1062196d1258 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 14 Jul 2011 11:25:33 +0200 Subject: perf tools: Add automated tests for events parsing Adding builtin test for parse_events function, which is responsible for parsing/processing "-e" option for stat/top/record commands. This new test will run within the builtin test command suite (perf test). One or several tests were added for each type of event. More tests could be added easily if needed. Signed-off-by: Jiri Olsa Cc: acme@redhat.com Cc: a.p.zijlstra@chello.nl Cc: paulus@samba.org Link: http://lkml.kernel.org/r/1310635534-4013-3-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index 2da9162..b5cd5f5 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -12,6 +12,7 @@ #include "util/parse-events.h" #include "util/symbol.h" #include "util/thread_map.h" +#include "../../include/linux/hw_breakpoint.h" static long page_size; @@ -600,6 +601,246 @@ out_free_threads: #undef nsyscalls } +#define TEST_ASSERT_VAL(text, cond) \ +do { \ + if (!cond) { \ + pr_debug("FAILED %s:%d %s\n", __FILE__, __LINE__, text); \ + return -1; \ + } \ +} while (0) + +static int test__checkevent_tracepoint(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = list_entry(evlist->entries.next, + struct perf_evsel, node); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_TRACEPOINT == evsel->attr.type); + TEST_ASSERT_VAL("wrong sample_type", + (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME | PERF_SAMPLE_CPU) == + evsel->attr.sample_type); + TEST_ASSERT_VAL("wrong sample_period", 1 == evsel->attr.sample_period); + return 0; +} + +static int test__checkevent_tracepoint_multi(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel; + + TEST_ASSERT_VAL("wrong number of entries", evlist->nr_entries > 1); + + list_for_each_entry(evsel, &evlist->entries, node) { + TEST_ASSERT_VAL("wrong type", + PERF_TYPE_TRACEPOINT == evsel->attr.type); + TEST_ASSERT_VAL("wrong sample_type", + (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME | PERF_SAMPLE_CPU) + == evsel->attr.sample_type); + TEST_ASSERT_VAL("wrong sample_period", + 1 == evsel->attr.sample_period); + } + return 0; +} + +static int test__checkevent_raw(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = list_entry(evlist->entries.next, + struct perf_evsel, node); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->attr.type); + TEST_ASSERT_VAL("wrong config", 1 == evsel->attr.config); + return 0; +} + +static int test__checkevent_numeric(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = list_entry(evlist->entries.next, + struct perf_evsel, node); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries); + TEST_ASSERT_VAL("wrong type", 1 == evsel->attr.type); + TEST_ASSERT_VAL("wrong config", 1 == evsel->attr.config); + return 0; +} + +static int test__checkevent_symbolic_name(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = list_entry(evlist->entries.next, + struct perf_evsel, node); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type); + TEST_ASSERT_VAL("wrong config", + PERF_COUNT_HW_INSTRUCTIONS == evsel->attr.config); + return 0; +} + +static int test__checkevent_symbolic_alias(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = list_entry(evlist->entries.next, + struct perf_evsel, node); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->attr.type); + TEST_ASSERT_VAL("wrong config", + PERF_COUNT_SW_PAGE_FAULTS == evsel->attr.config); + return 0; +} + +static int test__checkevent_genhw(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = list_entry(evlist->entries.next, + struct perf_evsel, node); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HW_CACHE == evsel->attr.type); + TEST_ASSERT_VAL("wrong config", (1 << 16) == evsel->attr.config); + return 0; +} + +static int test__checkevent_breakpoint(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = list_entry(evlist->entries.next, + struct perf_evsel, node); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->attr.type); + TEST_ASSERT_VAL("wrong config", 0 == evsel->attr.config); + TEST_ASSERT_VAL("wrong bp_type", (HW_BREAKPOINT_R | HW_BREAKPOINT_W) == + evsel->attr.bp_type); + TEST_ASSERT_VAL("wrong bp_len", HW_BREAKPOINT_LEN_4 == + evsel->attr.bp_len); + return 0; +} + +static int test__checkevent_breakpoint_x(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = list_entry(evlist->entries.next, + struct perf_evsel, node); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->attr.type); + TEST_ASSERT_VAL("wrong config", 0 == evsel->attr.config); + TEST_ASSERT_VAL("wrong bp_type", + HW_BREAKPOINT_X == evsel->attr.bp_type); + TEST_ASSERT_VAL("wrong bp_len", sizeof(long) == evsel->attr.bp_len); + return 0; +} + +static int test__checkevent_breakpoint_r(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = list_entry(evlist->entries.next, + struct perf_evsel, node); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries); + TEST_ASSERT_VAL("wrong type", + PERF_TYPE_BREAKPOINT == evsel->attr.type); + TEST_ASSERT_VAL("wrong config", 0 == evsel->attr.config); + TEST_ASSERT_VAL("wrong bp_type", + HW_BREAKPOINT_R == evsel->attr.bp_type); + TEST_ASSERT_VAL("wrong bp_len", + HW_BREAKPOINT_LEN_4 == evsel->attr.bp_len); + return 0; +} + +static int test__checkevent_breakpoint_w(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = list_entry(evlist->entries.next, + struct perf_evsel, node); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries); + TEST_ASSERT_VAL("wrong type", + PERF_TYPE_BREAKPOINT == evsel->attr.type); + TEST_ASSERT_VAL("wrong config", 0 == evsel->attr.config); + TEST_ASSERT_VAL("wrong bp_type", + HW_BREAKPOINT_W == evsel->attr.bp_type); + TEST_ASSERT_VAL("wrong bp_len", + HW_BREAKPOINT_LEN_4 == evsel->attr.bp_len); + return 0; +} + +static struct test__event_st { + const char *name; + __u32 type; + int (*check)(struct perf_evlist *evlist); +} test__events[] = { + { + .name = "syscalls:sys_enter_open", + .check = test__checkevent_tracepoint, + }, + { + .name = "syscalls:*", + .check = test__checkevent_tracepoint_multi, + }, + { + .name = "r1", + .check = test__checkevent_raw, + }, + { + .name = "1:1", + .check = test__checkevent_numeric, + }, + { + .name = "instructions", + .check = test__checkevent_symbolic_name, + }, + { + .name = "faults", + .check = test__checkevent_symbolic_alias, + }, + { + .name = "L1-dcache-load-miss", + .check = test__checkevent_genhw, + }, + { + .name = "mem:0", + .check = test__checkevent_breakpoint, + }, + { + .name = "mem:0:x", + .check = test__checkevent_breakpoint_x, + }, + { + .name = "mem:0:r", + .check = test__checkevent_breakpoint_r, + }, + { + .name = "mem:0:w", + .check = test__checkevent_breakpoint_w, + }, +}; + +#define TEST__EVENTS_CNT (sizeof(test__events) / sizeof(struct test__event_st)) + +static int test__parse_events(void) +{ + struct perf_evlist *evlist; + u_int i; + int ret = 0; + + for (i = 0; i < TEST__EVENTS_CNT; i++) { + struct test__event_st *e = &test__events[i]; + + evlist = perf_evlist__new(NULL, NULL); + if (evlist == NULL) + break; + + ret = parse_events(evlist, e->name, 0); + if (ret) { + pr_debug("failed to parse event '%s', err %d\n", + e->name, ret); + break; + } + + ret = e->check(evlist); + if (ret) + break; + + perf_evlist__delete(evlist); + } + + return ret; +} static struct test { const char *desc; int (*func)(void); @@ -621,6 +862,10 @@ static struct test { .func = test__basic_mmap, }, { + .desc = "parse events tests", + .func = test__parse_events, + }, + { .func = NULL, }, }; -- cgit v0.10.2 From baf040a0d1ac6319725c0fe400503683ac016580 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 14 Jul 2011 11:25:34 +0200 Subject: perf tools: Make test use the preset debugfs path Use preset debugfs path instead of hardcoded one. Signed-off-by: Jiri Olsa Cc: acme@redhat.com Cc: a.p.zijlstra@chello.nl Cc: paulus@samba.org Link: http://lkml.kernel.org/r/1310635534-4013-4-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index b5cd5f5..55f4c76 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -246,8 +246,8 @@ static int trace_event__id(const char *evname) int err = -1, fd; if (asprintf(&filename, - "/sys/kernel/debug/tracing/events/syscalls/%s/id", - evname) < 0) + "%s/syscalls/%s/id", + debugfs_path, evname) < 0) return -1; fd = open(filename, O_RDONLY); -- cgit v0.10.2 From f53173e47dee5f7514d264796bec58d43ed0f67f Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 21 Jul 2011 20:06:25 +0400 Subject: x86, perf: P4 PMU - Fix typos in comments and style cleanup This patch: - fixes typos in comments and clarifies the text - renames obscure p4_event_alias::original and ::alter members to ::original and ::alternative as appropriate - drops parenthesis from the return of p4_get_alias_event() No functional changes. Reported-by: Ingo Molnar Signed-off-by: Cyrill Gorcunov Link: http://lkml.kernel.org/r/20110721160625.GX7492@sun Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 4d86c86..4f7e67e 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -133,7 +133,7 @@ /* * In case of event aliasing we need to preserve some - * caller bits otherwise the mapping won't be complete. + * caller bits, otherwise the mapping won't be complete. */ #define P4_CONFIG_EVENT_ALIAS_MASK \ (p4_config_pack_escr(P4_CONFIG_MASK_ESCR) | \ diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 8b7a0c3..7809d2b 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -571,35 +571,34 @@ static __initconst const u64 p4_hw_cache_event_ids }; /* - * Because of Netburst being quite restricted in now - * many same events can run simultaneously, we use - * event aliases, ie different events which have the - * same functionallity but use non-intersected resources - * (ESCR/CCCR/couter registers). This allow us to run - * two or more semi-same events together. It is done - * transparently to a user space. + * Because of Netburst being quite restricted in how many + * identical events may run simultaneously, we introduce event aliases, + * ie the different events which have the same functionality but + * utilize non-intersected resources (ESCR/CCCR/counter registers). * - * Never set any cusom internal bits such as P4_CONFIG_HT, - * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are - * either up-to-dated automatically either not appliable - * at all. + * This allow us to relax restrictions a bit and run two or more + * identical events together. * - * And be really carefull choosing aliases! + * Never set any custom internal bits such as P4_CONFIG_HT, + * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are + * either up to date automatically or not applicable at all. */ struct p4_event_alias { - u64 orig; - u64 alter; + u64 original; + u64 alternative; } p4_event_aliases[] = { { /* - * Non-halted cycles can be substituted with - * non-sleeping cycles (see Intel SDM Vol3b for - * details). + * Non-halted cycles can be substituted with non-sleeping cycles (see + * Intel SDM Vol3b for details). We need this alias to be able + * to run nmi-watchdog and 'perf top' (or any other user space tool + * which is interested in running PERF_COUNT_HW_CPU_CYCLES) + * simultaneously. */ - .orig = + .original = p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), - .alter = + .alternative = p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)| P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)| @@ -620,25 +619,21 @@ static u64 p4_get_alias_event(u64 config) int i; /* - * Probably we're lucky and don't have to do - * matching over all config bits. + * Only event with special mark is allowed, + * we're to be sure it didn't come as malformed + * RAW event. */ if (!(config & P4_CONFIG_ALIASABLE)) return 0; config_match = config & P4_CONFIG_EVENT_ALIAS_MASK; - /* - * If an event was previously swapped to the alter config - * we should swap it back otherwise contnention on registers - * will return back. - */ for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) { - if (config_match == p4_event_aliases[i].orig) { - config_match = p4_event_aliases[i].alter; + if (config_match == p4_event_aliases[i].original) { + config_match = p4_event_aliases[i].alternative; break; - } else if (config_match == p4_event_aliases[i].alter) { - config_match = p4_event_aliases[i].orig; + } else if (config_match == p4_event_aliases[i].alternative) { + config_match = p4_event_aliases[i].original; break; } } @@ -646,8 +641,7 @@ static u64 p4_get_alias_event(u64 config) if (i >= ARRAY_SIZE(p4_event_aliases)) return 0; - return config_match | - (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); + return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); } static u64 p4_general_events[PERF_COUNT_HW_MAX] = { @@ -1229,9 +1223,9 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign again: /* - * Aliases are swappable so we may hit circular - * lock if both original config and alias need - * resources (MSR registers) which already busy. + * It's possible to hit a circular lock + * between original and alternative events + * if both are scheduled already. */ if (pass > 2) goto done; @@ -1251,7 +1245,7 @@ again: cntr_idx = p4_next_cntr(thread, used_mask, bind); if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) { /* - * Probably an event alias is still available. + * Check whether an event alias is still available. */ config_alias = p4_get_alias_event(hwc->config); if (!config_alias) -- cgit v0.10.2 From 9985c20f9e4aee6857c08246b273a3695a52b929 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Thu, 30 Jun 2011 08:09:55 +0000 Subject: perf: Remove perf_event_attr::type check PMU type id can be allocated dynamically, so perf_event_attr::type check when copying attribute from userspace to kernel is not valid. Signed-off-by: Lin Ming Cc: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1309421396-17438-4-git-send-email-ming.m.lin@intel.com Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index 0567e32..b8785e2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5926,13 +5926,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (ret) return -EFAULT; - /* - * If the type exists, the corresponding creation will verify - * the attr->config. - */ - if (attr->type >= PERF_TYPE_MAX) - return -EINVAL; - if (attr->__reserved_1) return -EINVAL; -- cgit v0.10.2 From 1ac2e6ca44e13a087eb7438d284f887097ee7e84 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 7 Jun 2011 11:49:55 +0200 Subject: x86, perf: Make copy_from_user_nmi() a library function copy_from_user_nmi() is used in oprofile and perf. Moving it to other library functions like copy_from_user(). As this is x86 code for 32 and 64 bits, create a new file usercopy.c for unified code. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110607172413.GJ20052@erda.amd.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 99ddd14..36361bf 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -555,6 +555,9 @@ struct __large_struct { unsigned long buf[100]; }; #endif /* CONFIG_X86_WP_WORKS_OK */ +extern unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n); + /* * movsl can be slow when source and dest are not both 8-byte aligned */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b7a010f..4ee3abf 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -67,40 +66,6 @@ enum extra_reg_type { EXTRA_REG_MAX /* number of entries needed */ }; -/* - * best effort, GUP based copy_from_user() that assumes IRQ or NMI context - */ -static unsigned long -copy_from_user_nmi(void *to, const void __user *from, unsigned long n) -{ - unsigned long offset, addr = (unsigned long)from; - unsigned long size, len = 0; - struct page *page; - void *map; - int ret; - - do { - ret = __get_user_pages_fast(addr, 1, 0, &page); - if (!ret) - break; - - offset = addr & (PAGE_SIZE - 1); - size = min(PAGE_SIZE - offset, n - len); - - map = kmap_atomic(page); - memcpy(to, map+offset, size); - kunmap_atomic(map); - put_page(page); - - len += size; - to += size; - addr += size; - - } while (len < n); - - return len; -} - struct event_constraint { union { unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f2479f1..6ba4773 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -18,7 +18,7 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o lib-y := delay.o lib-y += thunk_$(BITS).o -lib-y += usercopy_$(BITS).o getuser.o putuser.o +lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c new file mode 100644 index 0000000..97be9cb --- /dev/null +++ b/arch/x86/lib/usercopy.c @@ -0,0 +1,43 @@ +/* + * User address space access functions. + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include + +/* + * best effort, GUP based copy_from_user() that is NMI-safe + */ +unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n) +{ + unsigned long offset, addr = (unsigned long)from; + unsigned long size, len = 0; + struct page *page; + void *map; + int ret; + + do { + ret = __get_user_pages_fast(addr, 1, 0, &page); + if (!ret) + break; + + offset = addr & (PAGE_SIZE - 1); + size = min(PAGE_SIZE - offset, n - len); + + map = kmap_atomic(page); + memcpy(to, map+offset, size); + kunmap_atomic(map); + put_page(page); + + len += size; + to += size; + addr += size; + + } while (len < n); + + return len; +} +EXPORT_SYMBOL_GPL(copy_from_user_nmi); diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 32f78eb..bff89df 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -12,10 +12,9 @@ #include #include #include -#include +#include #include -#include #include static int backtrace_stack(void *data, char *name) @@ -38,42 +37,6 @@ static struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack, }; -/* from arch/x86/kernel/cpu/perf_event.c: */ - -/* - * best effort, GUP based copy_from_user() that assumes IRQ or NMI context - */ -static unsigned long -copy_from_user_nmi(void *to, const void __user *from, unsigned long n) -{ - unsigned long offset, addr = (unsigned long)from; - unsigned long size, len = 0; - struct page *page; - void *map; - int ret; - - do { - ret = __get_user_pages_fast(addr, 1, 0, &page); - if (!ret) - break; - - offset = addr & (PAGE_SIZE - 1); - size = min(PAGE_SIZE - offset, n - len); - - map = kmap_atomic(page); - memcpy(to, map+offset, size); - kunmap_atomic(map); - put_page(page); - - len += size; - to += size; - addr += size; - - } while (len < n); - - return len; -} - #ifdef CONFIG_COMPAT static struct stack_frame_ia32 * dump_user_backtrace_32(struct stack_frame_ia32 *head) -- cgit v0.10.2 From 7fcfd1abd6480d3b9ef17f5759c175e036e835cf Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 8 Jul 2011 18:34:42 +0100 Subject: perf: Remove the nmi parameter from the oprofile_perf backend In commit a8b0ca17b80e ("perf: Remove the nmi parameter from the swevent and overflow interface") one site was overlooked. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110708173442.GB31972@e102144-lin.cambridge.arm.com Signed-off-by: Ingo Molnar diff --git a/drivers/oprofile/oprofile_perf.c b/drivers/oprofile/oprofile_perf.c index 59acf9e..94796f3 100644 --- a/drivers/oprofile/oprofile_perf.c +++ b/drivers/oprofile/oprofile_perf.c @@ -31,7 +31,7 @@ static int num_counters; /* * Overflow callback for oprofile. */ -static void op_overflow_handler(struct perf_event *event, int unused, +static void op_overflow_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { int id; -- cgit v0.10.2