From 11ba2b85f506306c8dfc9fe144aa4ddc43242855 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 24 Apr 2011 15:05:10 +0200 Subject: perf stat: Print stalled cycles percentage Print: 611,527 cycles 400,553 instructions # ( 0.71 instructions per cycle ) 77,809 stalled-cycles # ( 12.71% of all cycles ) 0.000610987 seconds time elapsed Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar Link: http://lkml.kernel.org/n/tip-fd6x8r1cpyb6zhlrc4ix8m45@git.kernel.org --- tools/perf/builtin-stat.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 03f0e45f147..3a29041f857 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -442,7 +442,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg / total; - fprintf(stderr, " # %10.3f IPC ", ratio); + fprintf(stderr, " # ( %4.2f instructions per cycle )", ratio); } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && runtime_branches_stats[cpu].n != 0) { total = avg_stats(&runtime_branches_stats[cpu]); @@ -450,7 +450,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg * 100 / total; - fprintf(stderr, " # %10.3f %% ", ratio); + fprintf(stderr, " # %10.3f %%", ratio); } else if (runtime_nsecs_stats[cpu].n != 0) { total = avg_stats(&runtime_nsecs_stats[cpu]); @@ -459,6 +459,13 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) ratio = 1000.0 * avg / total; fprintf(stderr, " # %10.3f M/sec", ratio); + } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES)) { + total = avg_stats(&runtime_cycles_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + fprintf(stderr, " # (%5.2f%% of all cycles )", ratio); } } -- cgit v1.2.3 From d58f4c82fed69fdd4a79fa54fe17fd14d98c27aa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 03:42:18 +0200 Subject: perf stat: Print cache misses as percentage Before: 113,393,041 cache-references # 83.636 M/sec 7,052,454 cache-misses # 5.202 M/sec After: 112,589,441 cache-references # 87.925 M/sec 6,556,354 cache-misses # 5.823 % misses/hits percentages are more expressive than absolute numbers or rates. (Also prettify the CPUs printout line to not have a trailing whitespace.) Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-axm28f43x439bl41zkvfzd63@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 3a29041f857..0de3a2002f4 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -157,6 +157,7 @@ static double stddev_stats(struct stats *stats) struct stats runtime_nsecs_stats[MAX_NR_CPUS]; struct stats runtime_cycles_stats[MAX_NR_CPUS]; struct stats runtime_branches_stats[MAX_NR_CPUS]; +struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; struct stats walltime_nsecs_stats; static int create_perf_stat_counter(struct perf_evsel *evsel) @@ -219,10 +220,12 @@ static int read_counter_aggr(struct perf_evsel *counter) */ if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) update_stats(&runtime_nsecs_stats[0], count[0]); - if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) + else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) update_stats(&runtime_cycles_stats[0], count[0]); - if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) + else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) update_stats(&runtime_branches_stats[0], count[0]); + else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) + update_stats(&runtime_cacherefs_stats[0], count[0]); return 0; } @@ -404,7 +407,7 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) return; if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) - fprintf(stderr, " # %10.3f CPUs ", + fprintf(stderr, " # %10.3f CPUs", avg / avg_stats(&walltime_nsecs_stats)); } @@ -452,6 +455,15 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %10.3f %%", ratio); + } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && + runtime_cacherefs_stats[cpu].n != 0) { + total = avg_stats(&runtime_cacherefs_stats[cpu]); + + if (total) + ratio = avg * 100 / total; + + fprintf(stderr, " # %10.3f %%", ratio); + } else if (runtime_nsecs_stats[cpu].n != 0) { total = avg_stats(&runtime_nsecs_stats[cpu]); -- cgit v1.2.3 From dcd9936a5a6d89512b5323c1145647f2dbe0236f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 04:36:37 +0200 Subject: perf stat: Factor our shadow stats Create update_shadow_stats() which is then used in both read_counter_aggr() and read_counter(). This not only simplifies the code but also fixes a bug: HW_CACHE_REFERENCES was not updated in read_counter(). Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-9uc55z3g88r47exde7zxjm6p@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 0de3a2002f4..e5e82f62c78 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -193,6 +193,23 @@ static inline int nsec_counter(struct perf_evsel *evsel) return 0; } +/* + * Update various tracking values we maintain to print + * more semantic information such as miss/hit ratios, + * instruction rates, etc: + */ +static void update_shadow_stats(struct perf_evsel *counter, u64 *count) +{ + if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) + update_stats(&runtime_nsecs_stats[0], count[0]); + else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) + update_stats(&runtime_cycles_stats[0], count[0]); + else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) + update_stats(&runtime_branches_stats[0], count[0]); + else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) + update_stats(&runtime_cacherefs_stats[0], count[0]); +} + /* * Read out the results of a single counter: * aggregate counts across CPUs in system-wide mode @@ -218,14 +235,7 @@ static int read_counter_aggr(struct perf_evsel *counter) /* * Save the full runtime - to allow normalization during printout: */ - if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) - update_stats(&runtime_nsecs_stats[0], count[0]); - else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) - update_stats(&runtime_cycles_stats[0], count[0]); - else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) - update_stats(&runtime_branches_stats[0], count[0]); - else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) - update_stats(&runtime_cacherefs_stats[0], count[0]); + update_shadow_stats(counter, count); return 0; } @@ -245,12 +255,7 @@ static int read_counter(struct perf_evsel *counter) count = counter->counts->cpu[cpu].values; - if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) - update_stats(&runtime_nsecs_stats[cpu], count[0]); - if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) - update_stats(&runtime_cycles_stats[cpu], count[0]); - if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) - update_stats(&runtime_branches_stats[cpu], count[0]); + update_shadow_stats(counter, count); } return 0; -- cgit v1.2.3 From 481f988a016f7a0327a5537bde4794349fc4625c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 04:34:16 +0200 Subject: perf stat: Add stalled cycles accounting, prettify the resulting output Add stalled cycles accounting and use it to print the "cycles stalled per instruction" value. Also change the unit of the cycles output from M/sec to GHz - this is more intuitive. Prettify the output to: Performance counter stats for './loop_1b_instructions': 239.775036 task-clock # 0.997 CPUs utilized 761,903,912 cycles # 3.178 GHz 356,620,620 stalled-cycles # 46.81% of all cycles are idle 1,001,578,351 instructions # 1.31 insns per cycle # 0.36 stalled cycles per insn 14,782 cache-references # 0.062 M/sec 5,694 cache-misses # 38.520 % of all cache refs 0.240493656 seconds time elapsed Also adjust the --repeat output to make the percentages align vertically: Performance counter stats for './loop_1b_instructions' (10 runs): 236.096793 task-clock # 0.997 CPUs utilized ( +- 0.011% ) 756,553,086 cycles # 3.204 GHz ( +- 0.002% ) 354,942,692 stalled-cycles # 46.92% of all cycles are idle ( +- 0.008% ) 1,001,389,700 instructions # 1.32 insns per cycle # 0.35 stalled cycles per insn ( +- 0.000% ) 10,166 cache-references # 0.043 M/sec ( +- 0.742% ) 468 cache-misses # 4.608 % of all cache refs ( +- 13.385% ) 0.236874136 seconds time elapsed ( +- 0.01% ) Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-uapziqny39601apdmmhoz7hk@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e5e82f62c78..e881c206138 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -156,6 +156,7 @@ static double stddev_stats(struct stats *stats) struct stats runtime_nsecs_stats[MAX_NR_CPUS]; struct stats runtime_cycles_stats[MAX_NR_CPUS]; +struct stats runtime_stalled_cycles_stats[MAX_NR_CPUS]; struct stats runtime_branches_stats[MAX_NR_CPUS]; struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; struct stats walltime_nsecs_stats; @@ -204,6 +205,8 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count) update_stats(&runtime_nsecs_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) update_stats(&runtime_cycles_stats[0], count[0]); + else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES)) + update_stats(&runtime_stalled_cycles_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) update_stats(&runtime_branches_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) @@ -412,8 +415,7 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) return; if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) - fprintf(stderr, " # %10.3f CPUs", - avg / avg_stats(&walltime_nsecs_stats)); + fprintf(stderr, " # %8.3f CPUs utilized ", avg / avg_stats(&walltime_nsecs_stats)); } static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) @@ -450,7 +452,15 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg / total; - fprintf(stderr, " # ( %4.2f instructions per cycle )", ratio); + fprintf(stderr, " # %4.2f insns per cycle", ratio); + + total = avg_stats(&runtime_stalled_cycles_stats[cpu]); + + if (total && avg) { + ratio = total / avg; + fprintf(stderr, "\n # %4.2f stalled cycles per insn", ratio); + } + } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && runtime_branches_stats[cpu].n != 0) { total = avg_stats(&runtime_branches_stats[cpu]); @@ -458,7 +468,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg * 100 / total; - fprintf(stderr, " # %10.3f %%", ratio); + fprintf(stderr, " # %8.3f %% of all branches", ratio); } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && runtime_cacherefs_stats[cpu].n != 0) { @@ -467,22 +477,29 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg * 100 / total; - fprintf(stderr, " # %10.3f %%", ratio); + fprintf(stderr, " # %8.3f %% of all cache refs ", ratio); - } else if (runtime_nsecs_stats[cpu].n != 0) { + } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES)) { + total = avg_stats(&runtime_cycles_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + fprintf(stderr, " # %5.2f%% of all cycles are idle ", ratio); + } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { total = avg_stats(&runtime_nsecs_stats[cpu]); if (total) - ratio = 1000.0 * avg / total; + ratio = 1.0 * avg / total; - fprintf(stderr, " # %10.3f M/sec", ratio); - } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES)) { - total = avg_stats(&runtime_cycles_stats[cpu]); + fprintf(stderr, " # %8.3f GHz ", ratio); + } else if (runtime_nsecs_stats[cpu].n != 0) { + total = avg_stats(&runtime_nsecs_stats[cpu]); if (total) - ratio = avg / total * 100.0; + ratio = 1000.0 * avg / total; - fprintf(stderr, " # (%5.2f%% of all cycles )", ratio); + fprintf(stderr, " # %8.3f M/sec ", ratio); } } @@ -619,7 +636,7 @@ static void print_stat(int argc, const char **argv) fprintf(stderr, " %18.9f seconds time elapsed", avg_stats(&walltime_nsecs_stats)/1e9); if (run_count > 1) { - fprintf(stderr, " ( +- %7.3f%% )", + fprintf(stderr, " ( +-%5.2f%% )", 100*stddev_stats(&walltime_nsecs_stats) / avg_stats(&walltime_nsecs_stats)); } -- cgit v1.2.3 From 1fc570ad89e55dc32dfa4dda1311948b38f26524 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 05:20:22 +0200 Subject: perf stat: Add stalled cycles to the default output The new default output looks like this: Performance counter stats for './loop_1b_instructions': 236.010686 task-clock # 0.996 CPUs utilized 0 context-switches # 0.000 M/sec 0 CPU-migrations # 0.000 M/sec 99 page-faults # 0.000 M/sec 756,487,646 cycles # 3.205 GHz 354,938,996 stalled-cycles # 46.92% of all cycles are idle 1,001,403,797 instructions # 1.32 insns per cycle # 0.35 stalled cycles per insn 100,279,773 branches # 424.895 M/sec 12,646 branch-misses # 0.013 % of all branches 0.236902540 seconds time elapsed We dropped cache-refs and cache-misses and added stalled-cycles - this is a more generic "how well utilized is the CPU" metric. If the stalled-cycles ratio is too high then more specific measurements can be taken to figure out the source of the inefficiency. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-pbpl2l4mn797s69bclfpwkwn@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e881c206138..924d18c407b 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -65,11 +65,10 @@ static struct perf_event_attr default_attrs[] = { { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES }, }; @@ -468,7 +467,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg * 100 / total; - fprintf(stderr, " # %8.3f %% of all branches", ratio); + fprintf(stderr, " # %5.2f %% of all branches ", ratio); } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && runtime_cacherefs_stats[cpu].n != 0) { -- cgit v1.2.3 From f99844cb76b7d347711c22cdcb94266b7214141f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 05:35:39 +0200 Subject: perf stat: Fix -nan% output in perf stat noise printouts Before: 0 CPU-migrations # 0.000 M/sec ( +- -nan% ) After: 0 CPU-migrations # 0.000 M/sec ( +- 0.00% ) Also factor out the noise printing function. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-z89h2v1bk1mikcbsf7e6v34q@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 924d18c407b..845ded8f15a 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -382,6 +382,16 @@ static int run_perf_stat(int argc __used, const char **argv) return WEXITSTATUS(status); } +static void print_noise_pct(double total, double avg) +{ + double pct = 0.0; + + if (avg) + pct = 100.0*total/avg; + + fprintf(stderr, " ( +-%6.2f%% )", pct); +} + static void print_noise(struct perf_evsel *evsel, double avg) { struct perf_stat *ps; @@ -390,8 +400,7 @@ static void print_noise(struct perf_evsel *evsel, double avg) return; ps = evsel->priv; - fprintf(stderr, " ( +- %7.3f%% )", - 100 * stddev_stats(&ps->res_stats[0]) / avg); + print_noise_pct(stddev_stats(&ps->res_stats[0]), avg); } static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) @@ -635,9 +644,8 @@ static void print_stat(int argc, const char **argv) fprintf(stderr, " %18.9f seconds time elapsed", avg_stats(&walltime_nsecs_stats)/1e9); if (run_count > 1) { - fprintf(stderr, " ( +-%5.2f%% )", - 100*stddev_stats(&walltime_nsecs_stats) / - avg_stats(&walltime_nsecs_stats)); + print_noise_pct(stddev_stats(&walltime_nsecs_stats), + avg_stats(&walltime_nsecs_stats)); } fprintf(stderr, "\n\n"); } -- cgit v1.2.3 From a5d243d04a150acbfa79d641154f49e5d920f64f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 05:39:24 +0200 Subject: perf stat: Print stalled cycles warning colors Print the stalled-cycles percentage with different warning level ASCII colors, as the percentage passes the 25%/50%/75% thresholds. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-e25zz44rcms7mu9az4fu5zp0@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 845ded8f15a..e7d91f9cf78 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -46,6 +46,7 @@ #include "util/evlist.h" #include "util/evsel.h" #include "util/debug.h" +#include "util/color.h" #include "util/header.h" #include "util/cpumap.h" #include "util/thread.h" @@ -426,6 +427,29 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %8.3f CPUs utilized ", avg / avg_stats(&walltime_nsecs_stats)); } +static void print_stalled_cycles(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_cycles_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 75.0) + color = PERF_COLOR_RED; + else if (ratio > 50.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 25.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " of all cycles are idle "); +} + static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) { double total, ratio = 0.0; @@ -488,12 +512,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %8.3f %% of all cache refs ", ratio); } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES)) { - total = avg_stats(&runtime_cycles_stats[cpu]); - - if (total) - ratio = avg / total * 100.0; - - fprintf(stderr, " # %5.2f%% of all cycles are idle ", ratio); + print_stalled_cycles(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { total = avg_stats(&runtime_nsecs_stats[cpu]); @@ -508,6 +527,8 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) ratio = 1000.0 * avg / total; fprintf(stderr, " # %8.3f M/sec ", ratio); + } else { + fprintf(stderr, " "); } } -- cgit v1.2.3 From c78df6c1d49b5d798f1579141e3a12be7c325d1e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 12:16:10 +0200 Subject: perf stat: Print branch misses warning colors Print the missed-branches percentage with different warning level ASCII colors, as the percentage passes the 5%/10%/20% thresholds. These thresholds are set to relatively low levels, because on most CPUs even a moderate percentage of branch-misses already shows up as a slowdown. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-ybqukg7p86leiup7gl03ecgk@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e7d91f9cf78..5d4e1b9b2d8 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -450,6 +450,29 @@ static void print_stalled_cycles(int cpu, struct perf_evsel *evsel __used, doubl fprintf(stderr, " of all cycles are idle "); } +static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_branches_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 20.0) + color = PERF_COLOR_RED; + else if (ratio > 10.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 5.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " of all branches "); +} + static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) { double total, ratio = 0.0; @@ -495,13 +518,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && runtime_branches_stats[cpu].n != 0) { - total = avg_stats(&runtime_branches_stats[cpu]); - - if (total) - ratio = avg * 100 / total; - - fprintf(stderr, " # %5.2f %% of all branches ", ratio); - + print_branch_misses(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && runtime_cacherefs_stats[cpu].n != 0) { total = avg_stats(&runtime_cacherefs_stats[cpu]); -- cgit v1.2.3 From 8bb6c79f24e66538f606076915e918242c02ec7c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 13:25:24 +0200 Subject: perf stat: Print out miss/hit ratio for L1 data-cache events Print out this kind of l1-dcache-misses percentage: Performance counter stats for './bw_tcp localhost': 29,956,262,201 cycles # 3.002 GHz (scaled from 85.14%) 8,255,209,558 stalled-cycles # 27.56% of all cycles are idle (scaled from 86.56%) 1,206,130,308 l1-dcache-misses # 40.49% of all L1-dcache hits (scaled from 86.30%) 2,978,756,779 l1-dcache-refs # 298.512 M/sec (scaled from 70.02%) 8,861,956,159 instructions # 0.30 insns per cycle # 0.93 stalled cycles per insn (scaled from 84.27%) 1,644,306,068 branches # 164.782 M/sec (scaled from 86.43%) 74,778,443 branch-misses # 4.55% of all branches (scaled from 70.69%) 9978.695711 task-clock # 0.693 CPUs utilized 14.404347983 seconds time elapsed And color the result depending on the severity of cache-trashing. Acked-by: Peter Zijlstra Acked-by Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-54gmz0zymaid84zcs7joq02p@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 5d4e1b9b2d8..03bac6aa014 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -159,6 +159,7 @@ struct stats runtime_cycles_stats[MAX_NR_CPUS]; struct stats runtime_stalled_cycles_stats[MAX_NR_CPUS]; struct stats runtime_branches_stats[MAX_NR_CPUS]; struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; +struct stats runtime_l1_dcache_stats[MAX_NR_CPUS]; struct stats walltime_nsecs_stats; static int create_perf_stat_counter(struct perf_evsel *evsel) @@ -211,6 +212,8 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count) update_stats(&runtime_branches_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) update_stats(&runtime_cacherefs_stats[0], count[0]); + else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D)) + update_stats(&runtime_l1_dcache_stats[0], count[0]); } /* @@ -473,6 +476,29 @@ static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double fprintf(stderr, " of all branches "); } +static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_l1_dcache_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 20.0) + color = PERF_COLOR_RED; + else if (ratio > 10.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 5.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " of all L1-dcache hits "); +} + static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) { double total, ratio = 0.0; @@ -519,6 +545,13 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && runtime_branches_stats[cpu].n != 0) { print_branch_misses(cpu, evsel, avg); + } else if ( + evsel->attr.type == PERF_TYPE_HW_CACHE && + evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D | + ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | + ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && + runtime_branches_stats[cpu].n != 0) { + print_l1_dcache_misses(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && runtime_cacherefs_stats[cpu].n != 0) { total = avg_stats(&runtime_cacherefs_stats[cpu]); -- cgit v1.2.3 From c6264deff7ea6125492b442edad885e5429679af Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 13:50:47 +0200 Subject: perf stat: Add -d/--detailed flag to run with a lot of events Add the new -d/--detailed flag, which generates a pretty detailed event list: Performance counter stats for './hackbench 10' (10 runs): 1514.287888 task-clock # 10.897 CPUs utilized ( +- 3.05% ) 39,698 context-switches # 0.026 M/sec ( +- 12.19% ) 8,147 CPU-migrations # 0.005 M/sec ( +- 16.55% ) 17,918 page-faults # 0.012 M/sec ( +- 0.37% ) 2,944,504,050 cycles # 1.944 GHz ( +- 3.89% ) (32.60%) 1,043,971,283 stalled-cycles # 35.45% of all cycles are idle ( +- 5.22% ) (44.48%) 1,655,906,768 instructions # 0.56 insns per cycle # 0.63 stalled cycles per insn ( +- 1.95% ) (55.09%) 338,832,373 branches # 223.757 M/sec ( +- 1.96% ) (64.47%) 3,892,416 branch-misses # 1.15% of all branches ( +- 5.49% ) (73.12%) 606,410,482 L1-dcache-loads # 400.459 M/sec ( +- 1.29% ) (71.21%) 31,204,395 L1-dcache-load-misses # 5.15% of all L1-dcache hits ( +- 3.04% ) (60.43%) 3,922,751 LLC-loads # 2.590 M/sec ( +- 6.80% ) (46.87%) 5,037,288 LLC-load-misses # 3.327 M/sec ( +- 3.56% ) (13.00%) 0.138966828 seconds time elapsed ( +- 4.11% ) This can be used "at a glance" for narrower analysis. -d can also be used in addition to other -e events, to further expand an event list. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-cxs98quixs3qyvdqx3goojc4@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 68 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 8 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 03bac6aa014..6959fdecb20 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -73,6 +73,47 @@ static struct perf_event_attr default_attrs[] = { }; +/* + * Detailed stats: + */ +static struct perf_event_attr detailed_attrs[] = { + + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, + + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_L1D << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_L1D << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_LL << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_LL << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, +}; + struct perf_evlist *evsel_list; static bool system_wide = false; @@ -86,6 +127,7 @@ static pid_t target_pid = -1; static pid_t target_tid = -1; static pid_t child_pid = -1; static bool null_run = false; +static bool detailed_run = false; static bool big_num = true; static int big_num_opt = -1; static const char *cpu_list; @@ -550,7 +592,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D | ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && - runtime_branches_stats[cpu].n != 0) { + runtime_l1_dcache_stats[cpu].n != 0) { print_l1_dcache_misses(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && runtime_cacherefs_stats[cpu].n != 0) { @@ -625,8 +667,7 @@ static void print_counter_aggr(struct perf_evsel *counter) avg_enabled = avg_stats(&ps->res_stats[1]); avg_running = avg_stats(&ps->res_stats[2]); - fprintf(stderr, " (scaled from %.2f%%)", - 100 * avg_running / avg_enabled); + fprintf(stderr, " (%.2f%%)", 100 * avg_running / avg_enabled); } fprintf(stderr, "\n"); } @@ -668,10 +709,8 @@ static void print_counter(struct perf_evsel *counter) if (!csv_output) { print_noise(counter, 1.0); - if (run != ena) { - fprintf(stderr, " (scaled from %.2f%%)", - 100.0 * run / ena); - } + if (run != ena) + fprintf(stderr, " (%.2f%%)", 100.0 * run / ena); } fputc('\n', stderr); } @@ -778,6 +817,8 @@ static const struct option options[] = { "repeat command and print average + stddev (max: 100)"), OPT_BOOLEAN('n', "null", &null_run, "null run - dont start any counters"), + OPT_BOOLEAN('d', "detailed", &detailed_run, + "detailed run - start a lot of events"), OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL, "print large numbers with thousands\' separators", stat__set_big_num), @@ -839,7 +880,18 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) } /* Set attrs and nr_counters if no event is selected and !null_run */ - if (!null_run && !evsel_list->nr_entries) { + if (detailed_run) { + size_t c; + + for (c = 0; c < ARRAY_SIZE(detailed_attrs); ++c) { + pos = perf_evsel__new(&detailed_attrs[c], c); + if (pos == NULL) + goto out; + perf_evlist__add(evsel_list, pos); + } + } + /* Set attrs and nr_counters if no event is selected and !null_run */ + if (!detailed_run && !null_run && !evsel_list->nr_entries) { size_t c; for (c = 0; c < ARRAY_SIZE(default_attrs); ++c) { -- cgit v1.2.3 From 9ceb1c3d1fe15c2f9b55eaa8978019ef0e0a06ac Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Apr 2011 02:57:53 +0200 Subject: perf stat: Fix printout vertical alignment Before: | | Performance counter stats for '/home/mingo/hackbench 20' (5 runs): | | 71,321,607 instructions:u # 0.42 insns per cycle ( +- 0.00% ) | 168,040,009 cycles:u # 0.000 GHz ( +- 0.81% ) | | 1.468002368 seconds time elapsed ( +- 1.33% ) | After: | | Performance counter stats for '/home/mingo/hackbench 20' (5 runs): | | 71,321,607 instructions:u # 0.42 insns per cycle ( +- 0.00% ) | 168,040,009 cycles:u # 0.000 GHz ( +- 0.81% ) | | 1.468002368 seconds time elapsed ( +- 1.33% ) | The last column (stddev noise) is properly aligned, vertically. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n1eqio7hjpn0dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 6959fdecb20..003caa857a4 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -575,7 +575,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg / total; - fprintf(stderr, " # %4.2f insns per cycle", ratio); + fprintf(stderr, " # %4.2f insns per cycle ", ratio); total = avg_stats(&runtime_stalled_cycles_stats[cpu]); -- cgit v1.2.3 From f9cef0a90c4e7637f1ec98474a1a099aec45eb65 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Apr 2011 18:17:11 +0200 Subject: perf stat: Add --sync/-S option --sync will tell perf stat to run sync() before starting a command. This allows IO-heavy tests to be used with --repeat, without one iteration impacting the other. Elapsed time will stabilize for example: before: 3.971525714 seconds time elapsed ( +- 8.56% ) after: 3.211098537 seconds time elapsed ( +- 1.52% ) So measurements will be more accurate. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n1eqio7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 003caa857a4..5658a770dbd 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -128,6 +128,7 @@ static pid_t target_tid = -1; static pid_t child_pid = -1; static bool null_run = false; static bool detailed_run = false; +static bool sync_run = false; static bool big_num = true; static int big_num_opt = -1; static const char *cpu_list; @@ -819,6 +820,8 @@ static const struct option options[] = { "null run - dont start any counters"), OPT_BOOLEAN('d', "detailed", &detailed_run, "detailed run - start a lot of events"), + OPT_BOOLEAN('S', "sync", &sync_run, + "call sync() before starting a run"), OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL, "print large numbers with thousands\' separators", stat__set_big_num), @@ -944,6 +947,10 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) for (run_idx = 0; run_idx < run_count; run_idx++) { if (run_count != 1 && verbose) fprintf(stderr, "[ perf stat: executing run #%d ... ]\n", run_idx + 1); + + if (sync_run) + sync(); + status = run_perf_stat(argc, argv); } -- cgit v1.2.3 From ede70290046043b2638204cab55e26ea1d0c6cd9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Apr 2011 08:48:42 +0200 Subject: perf stat: Fix compatibility behavior Instead of failing on an unknown event, when new perf stat is run on older kernels: $ ./perf stat true Error: open_counter returned with 22 (Invalid argument). /bin/dmesg may provide additional information. Fatal: Not all events could be opened. Just ignore EINVAL and ENOSYS, we'll print the results as not counted: Performance counter stats for 'true': 0.239483 task-clock # 0.493 CPUs utilized 0 context-switches # 0.000 M/sec 0 CPU-migrations # 0.000 M/sec 86 page-faults # 0.359 M/sec 704,766 cycles # 2.943 GHz stalled-cycles 381,961 instructions # 0.54 insns per cycle 69,626 branches # 290.735 M/sec 4,594 branch-misses # 6.60% of all branches 0.000485883 seconds time elapsed Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n1eqio5hjpn3dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 5658a770dbd..da77077450c 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -372,7 +372,10 @@ static int run_perf_stat(int argc __used, const char **argv) list_for_each_entry(counter, &evsel_list->entries, node) { if (create_perf_stat_counter(counter) < 0) { - if (errno == -EPERM || errno == -EACCES) { + if (errno == EINVAL || errno == ENOSYS) + continue; + + if (errno == EPERM || errno == EACCES) { error("You may not have permission to collect %sstats.\n" "\t Consider tweaking" " /proc/sys/kernel/perf_event_paranoid or running as root.", -- cgit v1.2.3 From 129c04cb8ce2e4bf3f17223f58ef16aa8a2cb3b8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 14:41:28 +0200 Subject: perf tools: Add front-end and back-end stalled cycles support Update perf tooling to deal with front-end and back-end stalled cycles events. Add both the default 'perf stat' output. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n002io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index da77077450c..6a4a8a399d9 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -66,7 +66,8 @@ static struct perf_event_attr default_attrs[] = { { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, @@ -84,7 +85,8 @@ static struct perf_event_attr detailed_attrs[] = { { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, @@ -249,7 +251,7 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count) update_stats(&runtime_nsecs_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) update_stats(&runtime_cycles_stats[0], count[0]); - else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES)) + else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) update_stats(&runtime_stalled_cycles_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) update_stats(&runtime_branches_stats[0], count[0]); @@ -607,7 +609,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %8.3f %% of all cache refs ", ratio); - } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES)) { + } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) { print_stalled_cycles(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { total = avg_stats(&runtime_nsecs_stats[cpu]); -- cgit v1.2.3 From d3d1e86da07b4565815e3dbcd082f53017d215f8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 13:49:08 +0200 Subject: perf stat: Analyze front-end and back-end stall counts Sample output: Performance counter stats for './loop_1b': 873.691065 task-clock # 1.000 CPUs utilized 1 context-switches # 0.000 M/sec 1 CPU-migrations # 0.000 M/sec 96 page-faults # 0.000 M/sec 2,012,637,222 cycles # 2.304 GHz (66.58%) 1,001,397,911 stalled-cycles-frontend # 49.76% frontend cycles idle (66.58%) 7,523,398 stalled-cycles-backend # 0.37% backend cycles idle (66.76%) 2,004,551,046 instructions # 1.00 insns per cycle # 0.50 stalled cycles per insn (66.80%) 1,001,304,992 branches # 1146.063 M/sec (66.76%) 39,453 branch-misses # 0.00% of all branches (66.64%) 0.874046121 seconds time elapsed Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n003io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 6a4a8a399d9..e45449938b8 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -201,7 +201,8 @@ static double stddev_stats(struct stats *stats) struct stats runtime_nsecs_stats[MAX_NR_CPUS]; struct stats runtime_cycles_stats[MAX_NR_CPUS]; -struct stats runtime_stalled_cycles_stats[MAX_NR_CPUS]; +struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS]; +struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS]; struct stats runtime_branches_stats[MAX_NR_CPUS]; struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; struct stats runtime_l1_dcache_stats[MAX_NR_CPUS]; @@ -251,8 +252,10 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count) update_stats(&runtime_nsecs_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) update_stats(&runtime_cycles_stats[0], count[0]); + else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) + update_stats(&runtime_stalled_cycles_front_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) - update_stats(&runtime_stalled_cycles_stats[0], count[0]); + update_stats(&runtime_stalled_cycles_back_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) update_stats(&runtime_branches_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) @@ -478,7 +481,30 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %8.3f CPUs utilized ", avg / avg_stats(&walltime_nsecs_stats)); } -static void print_stalled_cycles(int cpu, struct perf_evsel *evsel __used, double avg) +static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_cycles_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 75.0) + color = PERF_COLOR_RED; + else if (ratio > 50.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 20.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " frontend cycles idle "); +} + +static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __used, double avg) { double total, ratio = 0.0; const char *color; @@ -498,7 +524,7 @@ static void print_stalled_cycles(int cpu, struct perf_evsel *evsel __used, doubl fprintf(stderr, " # "); color_fprintf(stderr, color, "%5.2f%%", ratio); - fprintf(stderr, " of all cycles are idle "); + fprintf(stderr, " backend cycles idle "); } static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double avg) @@ -583,7 +609,8 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %4.2f insns per cycle ", ratio); - total = avg_stats(&runtime_stalled_cycles_stats[cpu]); + total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]); + total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu])); if (total && avg) { ratio = total / avg; @@ -609,8 +636,10 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %8.3f %% of all cache refs ", ratio); + } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) { + print_stalled_cycles_frontend(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) { - print_stalled_cycles(cpu, evsel, avg); + print_stalled_cycles_backend(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { total = avg_stats(&runtime_nsecs_stats[cpu]); -- cgit v1.2.3 From 2b427e14b77dbf3e05f1bd0785f1d07ea5fe924e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 14:16:18 +0200 Subject: perf stat: Adjust stall cycles warning percentages Adjust to color thresholds to better match the percentages seen in real workloads. Both are now a bit more sensitive. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n004io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e45449938b8..2492a0efa4d 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -492,11 +492,11 @@ static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __us ratio = avg / total * 100.0; color = PERF_COLOR_NORMAL; - if (ratio > 75.0) + if (ratio > 50.0) color = PERF_COLOR_RED; - else if (ratio > 50.0) + else if (ratio > 30.0) color = PERF_COLOR_MAGENTA; - else if (ratio > 20.0) + else if (ratio > 10.0) color = PERF_COLOR_YELLOW; fprintf(stderr, " # "); @@ -519,7 +519,7 @@ static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __use color = PERF_COLOR_RED; else if (ratio > 50.0) color = PERF_COLOR_MAGENTA; - else if (ratio > 25.0) + else if (ratio > 20.0) color = PERF_COLOR_YELLOW; fprintf(stderr, " # "); -- cgit v1.2.3 From fce3c786d3a49eff397583b4b62fa38df90db937 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 30 Apr 2011 09:03:15 +0200 Subject: perf stat: Leave more room for percentages Triple digit percentages do not fit otherwise. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n005io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 2492a0efa4d..9e596ab98d0 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -499,8 +499,8 @@ static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __us else if (ratio > 10.0) color = PERF_COLOR_YELLOW; - fprintf(stderr, " # "); - color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); fprintf(stderr, " frontend cycles idle "); } @@ -522,9 +522,9 @@ static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __use else if (ratio > 20.0) color = PERF_COLOR_YELLOW; - fprintf(stderr, " # "); - color_fprintf(stderr, color, "%5.2f%%", ratio); - fprintf(stderr, " backend cycles idle "); + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); + fprintf(stderr, " backend cycles idle "); } static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double avg) @@ -545,8 +545,8 @@ static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double else if (ratio > 5.0) color = PERF_COLOR_YELLOW; - fprintf(stderr, " # "); - color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); fprintf(stderr, " of all branches "); } @@ -568,8 +568,8 @@ static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, dou else if (ratio > 5.0) color = PERF_COLOR_YELLOW; - fprintf(stderr, " # "); - color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); fprintf(stderr, " of all L1-dcache hits "); } @@ -607,14 +607,14 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg / total; - fprintf(stderr, " # %4.2f insns per cycle ", ratio); + fprintf(stderr, " # %5.2f insns per cycle ", ratio); total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]); total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu])); if (total && avg) { ratio = total / avg; - fprintf(stderr, "\n # %4.2f stalled cycles per insn", ratio); + fprintf(stderr, "\n # %5.2f stalled cycles per insn", ratio); } } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && -- cgit v1.2.3 From 370faf1dd0461ad811852c8abbbcd3d73b1e4fc4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 16:11:03 +0200 Subject: perf stat: Fail softly on unsupported events David Ahern reported this perf stat failure: > # /tmp/build-perf/perf stat -- sleep 1 > Error: stalled-cycles-frontend event is not supported. > Fatal: Not all events could be opened. > > This is a Dell R410 with an E5620 processor. Fail in a softer fashion on unknown/unsupported events. Reported-by: David Ahern Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n006io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 9e596ab98d0..c8b535bc27b 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -377,7 +377,7 @@ static int run_perf_stat(int argc __used, const char **argv) list_for_each_entry(counter, &evsel_list->entries, node) { if (create_perf_stat_counter(counter) < 0) { - if (errno == EINVAL || errno == ENOSYS) + if (errno == EINVAL || errno == ENOSYS || errno == ENOENT) continue; if (errno == EPERM || errno == EACCES) { @@ -385,8 +385,6 @@ static int run_perf_stat(int argc __used, const char **argv) "\t Consider tweaking" " /proc/sys/kernel/perf_event_paranoid or running as root.", system_wide ? "system-wide " : ""); - } else if (errno == ENOENT) { - error("%s event is not supported. ", event_name(counter)); } else { error("open_counter returned with %d (%s). " "/bin/dmesg may provide additional information.\n", -- cgit v1.2.3 From c63ca0c01d73563d4e2ab174bb3dd1e5efb907e6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 Apr 2011 16:04:15 -0600 Subject: perf stat: Tell user about unsupported events in the list Similar to perf-record, tell user about unsupported events that will not be counted if invoked in verbose mode. e.g., $ perf stat -e dTLB-prefetch-misses -v -- sleep 1 dTLB-prefetch-misses event is not supported by the kernel. dTLB-prefetch-misses: 0 0 0 Performance counter stats for 'sleep 1': dTLB-prefetch-misses 1.001884783 seconds time elapsed Signed-off-by: David Ahern Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1304114655-10600-1-git-send-email-dsahern@gmail.com Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index c8b535bc27b..602c3c96fa1 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -377,8 +377,12 @@ static int run_perf_stat(int argc __used, const char **argv) list_for_each_entry(counter, &evsel_list->entries, node) { if (create_perf_stat_counter(counter) < 0) { - if (errno == EINVAL || errno == ENOSYS || errno == ENOENT) + if (errno == EINVAL || errno == ENOSYS || errno == ENOENT) { + if (verbose) + ui__warning("%s event is not supported by the kernel.\n", + event_name(counter)); continue; + } if (errno == EPERM || errno == EACCES) { error("You may not have permission to collect %sstats.\n" -- cgit v1.2.3 From 2cba3ffb9a9db3874304a1739002d053d53c738b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 19 May 2011 13:30:56 +0200 Subject: perf stat: Add -d -d and -d -d -d options to show more CPU events Print even more detailed statistics if requested via perf stat -d: -d: detailed events, L1 and LLC data cache -d -d: more detailed events, dTLB and iTLB events -d -d -d: very detailed events, adding prefetch events Full output looks like this now: Performance counter stats for '/home/mingo/hackbench 10' (5 runs): 1703.674707 task-clock # 8.709 CPUs utilized ( +- 4.19% ) 49,068 context-switches # 0.029 M/sec ( +- 16.66% ) 8,303 CPU-migrations # 0.005 M/sec ( +- 24.90% ) 17,397 page-faults # 0.010 M/sec ( +- 0.46% ) 2,345,389,239 cycles # 1.377 GHz ( +- 4.61% ) [55.90%] 1,884,503,527 stalled-cycles-frontend # 80.35% frontend cycles idle ( +- 5.67% ) [50.39%] 743,919,737 stalled-cycles-backend # 31.72% backend cycles idle ( +- 8.75% ) [49.91%] 1,314,416,379 instructions # 0.56 insns per cycle # 1.43 stalled cycles per insn ( +- 2.53% ) [60.87%] 272,592,567 branches # 160.003 M/sec ( +- 1.74% ) [56.56%] 3,794,846 branch-misses # 1.39% of all branches ( +- 6.59% ) [58.50%] 449,982,778 L1-dcache-loads # 264.125 M/sec ( +- 2.47% ) [49.88%] 22,404,961 L1-dcache-load-misses # 4.98% of all L1-dcache hits ( +- 6.08% ) [55.05%] 6,204,750 LLC-loads # 3.642 M/sec ( +- 8.91% ) [43.75%] 1,837,411 LLC-load-misses # 1.078 M/sec ( +- 7.27% ) [12.07%] 411,440,421 L1-icache-loads # 241.502 M/sec ( +- 5.60% ) [36.52%] 27,556,832 L1-icache-load-misses # 16.175 M/sec ( +- 7.46% ) [46.72%] 464,067,627 dTLB-loads # 272.392 M/sec ( +- 4.46% ) [54.17%] 10,765,648 dTLB-load-misses # 6.319 M/sec ( +- 3.18% ) [48.68%] 1,273,080,386 iTLB-loads # 747.256 M/sec ( +- 3.38% ) [47.53%] 117,481 iTLB-load-misses # 0.069 M/sec ( +- 14.99% ) [47.01%] 4,590,653 L1-dcache-prefetches # 2.695 M/sec ( +- 4.49% ) [46.19%] 1,712,660 L1-dcache-prefetch-misses # 1.005 M/sec ( +- 3.75% ) [44.82%] 0.195622057 seconds time elapsed ( +- 6.84% ) Also clean up the attribute construction code to be appending, and factor it out into add_default_attributes(). Tweak the coverage percentage printout a bit, so that it's easier to view it alongside the +- sttddev colum. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Steven Rostedt Link: http://lkml.kernel.org/n/tip-to3kgu04449s64062val8b62@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 209 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 154 insertions(+), 55 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 602c3c96fa1..a89fc083536 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -6,24 +6,28 @@ * * Sample output: - $ perf stat ~/hackbench 10 - Time: 0.104 + $ perf stat ./hackbench 10 - Performance counter stats for '/home/mingo/hackbench': + Time: 0.118 - 1255.538611 task clock ticks # 10.143 CPU utilization factor - 54011 context switches # 0.043 M/sec - 385 CPU migrations # 0.000 M/sec - 17755 pagefaults # 0.014 M/sec - 3808323185 CPU cycles # 3033.219 M/sec - 1575111190 instructions # 1254.530 M/sec - 17367895 cache references # 13.833 M/sec - 7674421 cache misses # 6.112 M/sec + Performance counter stats for './hackbench 10': - Wall-clock time elapsed: 123.786620 msecs + 1708.761321 task-clock # 11.037 CPUs utilized + 41,190 context-switches # 0.024 M/sec + 6,735 CPU-migrations # 0.004 M/sec + 17,318 page-faults # 0.010 M/sec + 5,205,202,243 cycles # 3.046 GHz + 3,856,436,920 stalled-cycles-frontend # 74.09% frontend cycles idle + 1,600,790,871 stalled-cycles-backend # 30.75% backend cycles idle + 2,603,501,247 instructions # 0.50 insns per cycle + # 1.48 stalled cycles per insn + 484,357,498 branches # 283.455 M/sec + 6,388,934 branch-misses # 1.32% of all branches + + 0.154822978 seconds time elapsed * - * Copyright (C) 2008, Red Hat Inc, Ingo Molnar + * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar * * Improvements and fixes by: * @@ -75,22 +79,10 @@ static struct perf_event_attr default_attrs[] = { }; /* - * Detailed stats: + * Detailed stats (-d), covering the L1 and last level data caches: */ static struct perf_event_attr detailed_attrs[] = { - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES }, - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS }, - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, - - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, - { .type = PERF_TYPE_HW_CACHE, .config = PERF_COUNT_HW_CACHE_L1D << 0 | @@ -116,6 +108,69 @@ static struct perf_event_attr detailed_attrs[] = { (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, }; +/* + * Very detailed stats (-d -d), covering the instruction cache and the TLB caches: + */ +static struct perf_event_attr very_detailed_attrs[] = { + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_L1I << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_L1I << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_DTLB << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_DTLB << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_ITLB << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_ITLB << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, + +}; + +/* + * Very, very detailed stats (-d -d -d), adding prefetch events: + */ +static struct perf_event_attr very_very_detailed_attrs[] = { + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_L1D << 0 | + (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | + (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_L1D << 0 | + (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, +}; + + + struct perf_evlist *evsel_list; static bool system_wide = false; @@ -129,7 +184,7 @@ static pid_t target_pid = -1; static pid_t target_tid = -1; static pid_t child_pid = -1; static bool null_run = false; -static bool detailed_run = false; +static int detailed_run = 0; static bool sync_run = false; static bool big_num = true; static int big_num_opt = -1; @@ -464,7 +519,7 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) { double msecs = avg / 1e6; char cpustr[16] = { '\0', }; - const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-24s"; + const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-25s"; if (no_aggr) sprintf(cpustr, "CPU%*d%s", @@ -584,9 +639,9 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (csv_output) fmt = "%s%.0f%s%s"; else if (big_num) - fmt = "%s%'18.0f%s%-24s"; + fmt = "%s%'18.0f%s%-25s"; else - fmt = "%s%18.0f%s%-24s"; + fmt = "%s%18.0f%s%-25s"; if (no_aggr) sprintf(cpustr, "CPU%*d%s", @@ -616,7 +671,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total && avg) { ratio = total / avg; - fprintf(stderr, "\n # %5.2f stalled cycles per insn", ratio); + fprintf(stderr, "\n # %5.2f stalled cycles per insn", ratio); } } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && @@ -704,7 +759,7 @@ static void print_counter_aggr(struct perf_evsel *counter) avg_enabled = avg_stats(&ps->res_stats[1]); avg_running = avg_stats(&ps->res_stats[2]); - fprintf(stderr, " (%.2f%%)", 100 * avg_running / avg_enabled); + fprintf(stderr, " [%5.2f%%]", 100 * avg_running / avg_enabled); } fprintf(stderr, "\n"); } @@ -854,7 +909,7 @@ static const struct option options[] = { "repeat command and print average + stddev (max: 100)"), OPT_BOOLEAN('n', "null", &null_run, "null run - dont start any counters"), - OPT_BOOLEAN('d', "detailed", &detailed_run, + OPT_INCR('d', "detailed", &detailed_run, "detailed run - start a lot of events"), OPT_BOOLEAN('S', "sync", &sync_run, "call sync() before starting a run"), @@ -873,6 +928,70 @@ static const struct option options[] = { OPT_END() }; +/* + * Add default attributes, if there were no attributes specified or + * if -d/--detailed, -d -d or -d -d -d is used: + */ +static int add_default_attributes(void) +{ + struct perf_evsel *pos; + size_t attr_nr = 0; + size_t c; + + /* Set attrs if no event is selected and !null_run: */ + if (null_run) + return 0; + + if (!evsel_list->nr_entries) { + for (c = 0; c < ARRAY_SIZE(default_attrs); c++) { + pos = perf_evsel__new(default_attrs + c, c + attr_nr); + if (pos == NULL) + return -1; + perf_evlist__add(evsel_list, pos); + } + attr_nr += c; + } + + /* Detailed events get appended to the event list: */ + + if (detailed_run < 1) + return 0; + + /* Append detailed run extra attributes: */ + for (c = 0; c < ARRAY_SIZE(detailed_attrs); c++) { + pos = perf_evsel__new(detailed_attrs + c, c + attr_nr); + if (pos == NULL) + return -1; + perf_evlist__add(evsel_list, pos); + } + attr_nr += c; + + if (detailed_run < 2) + return 0; + + /* Append very detailed run extra attributes: */ + for (c = 0; c < ARRAY_SIZE(very_detailed_attrs); c++) { + pos = perf_evsel__new(very_detailed_attrs + c, c + attr_nr); + if (pos == NULL) + return -1; + perf_evlist__add(evsel_list, pos); + } + + if (detailed_run < 3) + return 0; + + /* Append very, very detailed run extra attributes: */ + for (c = 0; c < ARRAY_SIZE(very_very_detailed_attrs); c++) { + pos = perf_evsel__new(very_very_detailed_attrs + c, c + attr_nr); + if (pos == NULL) + return -1; + perf_evlist__add(evsel_list, pos); + } + + + return 0; +} + int cmd_stat(int argc, const char **argv, const char *prefix __used) { struct perf_evsel *pos; @@ -918,28 +1037,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) usage_with_options(stat_usage, options); } - /* Set attrs and nr_counters if no event is selected and !null_run */ - if (detailed_run) { - size_t c; - - for (c = 0; c < ARRAY_SIZE(detailed_attrs); ++c) { - pos = perf_evsel__new(&detailed_attrs[c], c); - if (pos == NULL) - goto out; - perf_evlist__add(evsel_list, pos); - } - } - /* Set attrs and nr_counters if no event is selected and !null_run */ - if (!detailed_run && !null_run && !evsel_list->nr_entries) { - size_t c; - - for (c = 0; c < ARRAY_SIZE(default_attrs); ++c) { - pos = perf_evsel__new(&default_attrs[c], c); - if (pos == NULL) - goto out; - perf_evlist__add(evsel_list, pos); - } - } + if (add_default_attributes()) + goto out; if (target_pid != -1) target_tid = target_pid; -- cgit v1.2.3 From c3305257cd4df63e03e21e331a0140ae9c0faccc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 19 May 2011 14:01:42 +0200 Subject: perf stat: Add more cache-miss percentage printouts Print out the cache-miss percentage as well if the cache refs were collected, for all the generic cache event types. Before: 11,103,723,230 dTLB-loads # 622.471 M/sec ( +- 0.30% ) 87,065,337 dTLB-load-misses # 4.881 M/sec ( +- 0.90% ) After: 11,353,713,242 dTLB-loads # 626.020 M/sec ( +- 0.35% ) 113,393,472 dTLB-load-misses # 1.00% of all dTLB cache hits ( +- 0.49% ) Also ASCII color highlight too high percentages, them when it's executed on the console. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Steven Rostedt Link: http://lkml.kernel.org/n/tip-lkhwxsevdbd9a8nymx0vxc3y@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 138 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 136 insertions(+), 2 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a89fc083536..a9f06715e44 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -261,6 +261,10 @@ struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS]; struct stats runtime_branches_stats[MAX_NR_CPUS]; struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; struct stats runtime_l1_dcache_stats[MAX_NR_CPUS]; +struct stats runtime_l1_icache_stats[MAX_NR_CPUS]; +struct stats runtime_ll_cache_stats[MAX_NR_CPUS]; +struct stats runtime_itlb_cache_stats[MAX_NR_CPUS]; +struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS]; struct stats walltime_nsecs_stats; static int create_perf_stat_counter(struct perf_evsel *evsel) @@ -317,6 +321,14 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count) update_stats(&runtime_cacherefs_stats[0], count[0]); else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D)) update_stats(&runtime_l1_dcache_stats[0], count[0]); + else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I)) + update_stats(&runtime_l1_icache_stats[0], count[0]); + else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL)) + update_stats(&runtime_ll_cache_stats[0], count[0]); + else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB)) + update_stats(&runtime_dtlb_cache_stats[0], count[0]); + else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB)) + update_stats(&runtime_itlb_cache_stats[0], count[0]); } /* @@ -630,6 +642,98 @@ static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, dou fprintf(stderr, " of all L1-dcache hits "); } +static void print_l1_icache_misses(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_l1_icache_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 20.0) + color = PERF_COLOR_RED; + else if (ratio > 10.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 5.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); + fprintf(stderr, " of all L1-icache hits "); +} + +static void print_dtlb_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_dtlb_cache_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 20.0) + color = PERF_COLOR_RED; + else if (ratio > 10.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 5.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); + fprintf(stderr, " of all dTLB cache hits "); +} + +static void print_itlb_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_itlb_cache_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 20.0) + color = PERF_COLOR_RED; + else if (ratio > 10.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 5.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); + fprintf(stderr, " of all iTLB cache hits "); +} + +static void print_ll_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_ll_cache_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 20.0) + color = PERF_COLOR_RED; + else if (ratio > 10.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 5.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); + fprintf(stderr, " of all LL-cache hits "); +} + static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) { double total, ratio = 0.0; @@ -684,6 +788,34 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && runtime_l1_dcache_stats[cpu].n != 0) { print_l1_dcache_misses(cpu, evsel, avg); + } else if ( + evsel->attr.type == PERF_TYPE_HW_CACHE && + evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1I | + ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | + ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && + runtime_l1_icache_stats[cpu].n != 0) { + print_l1_icache_misses(cpu, evsel, avg); + } else if ( + evsel->attr.type == PERF_TYPE_HW_CACHE && + evsel->attr.config == ( PERF_COUNT_HW_CACHE_DTLB | + ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | + ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && + runtime_dtlb_cache_stats[cpu].n != 0) { + print_dtlb_cache_misses(cpu, evsel, avg); + } else if ( + evsel->attr.type == PERF_TYPE_HW_CACHE && + evsel->attr.config == ( PERF_COUNT_HW_CACHE_ITLB | + ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | + ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && + runtime_itlb_cache_stats[cpu].n != 0) { + print_itlb_cache_misses(cpu, evsel, avg); + } else if ( + evsel->attr.type == PERF_TYPE_HW_CACHE && + evsel->attr.config == ( PERF_COUNT_HW_CACHE_LL | + ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | + ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && + runtime_ll_cache_stats[cpu].n != 0) { + print_ll_cache_misses(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && runtime_cacherefs_stats[cpu].n != 0) { total = avg_stats(&runtime_cacherefs_stats[cpu]); @@ -842,10 +974,12 @@ static void print_stat(int argc, const char **argv) } if (!csv_output) { - fprintf(stderr, "\n"); - fprintf(stderr, " %18.9f seconds time elapsed", + if (!null_run) + fprintf(stderr, "\n"); + fprintf(stderr, " %17.9f seconds time elapsed", avg_stats(&walltime_nsecs_stats)/1e9); if (run_count > 1) { + fprintf(stderr, " "); print_noise_pct(stddev_stats(&walltime_nsecs_stats), avg_stats(&walltime_nsecs_stats)); } -- cgit v1.2.3