diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 04e150d83e7da6fefaa0af14151b26458f6a01a9..47469abdcc1c11e2256effc86845232f78785075 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -144,6 +144,10 @@ is a useful mode to detect imbalance between physical cores. To enable this mod use --per-core in addition to -a. (system-wide). The output includes the core number and the number of online logical processors on that physical processor. +--per-thread:: +Aggregate counts per monitored threads, when monitoring threads (-t option) +or processes (-p option). + -D msecs:: --delay msecs:: After starting the program, wait msecs before measuring. This is useful to diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 56dc8881cb054615cc0db484f970ae77845a3bba..37e301a32f437eb6004ba4f4ccc6b90c73b9aa5c 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -231,6 +231,7 @@ process_counter_values(struct perf_evsel *evsel, int cpu, int thread, count = &zero; switch (aggr_mode) { + case AGGR_THREAD: case AGGR_CORE: case AGGR_SOCKET: case AGGR_NONE: @@ -602,6 +603,14 @@ static void aggr_printout(struct perf_evsel *evsel, int id, int nr) csv_output ? 0 : -4, perf_evsel__cpus(evsel)->map[id], csv_sep); break; + case AGGR_THREAD: + fprintf(output, "%*s-%*d%s", + csv_output ? 0 : 16, + thread_map__comm(evsel->threads, id), + csv_output ? 0 : -8, + thread_map__pid(evsel->threads, id), + csv_sep); + break; case AGGR_GLOBAL: default: break; @@ -750,6 +759,40 @@ static void print_aggr(char *prefix) } } +static void print_aggr_thread(struct perf_evsel *counter, char *prefix) +{ + int nthreads = thread_map__nr(counter->threads); + int ncpus = cpu_map__nr(counter->cpus); + int cpu, thread; + double uval; + + for (thread = 0; thread < nthreads; thread++) { + u64 ena = 0, run = 0, val = 0; + + for (cpu = 0; cpu < ncpus; cpu++) { + val += perf_counts(counter->counts, cpu, thread)->val; + ena += perf_counts(counter->counts, cpu, thread)->ena; + run += perf_counts(counter->counts, cpu, thread)->run; + } + + if (prefix) + fprintf(output, "%s", prefix); + + uval = val * counter->scale; + + if (nsec_counter(counter)) + nsec_printout(thread, 0, counter, uval); + else + abs_printout(thread, 0, counter, uval); + + if (!csv_output) + print_noise(counter, 1.0); + + print_running(run, ena); + fputc('\n', output); + } +} + /* * Print out the results of a single counter: * aggregated counts in system-wide mode @@ -876,6 +919,9 @@ static void print_interval(char *prefix, struct timespec *ts) case AGGR_NONE: fprintf(output, "# time CPU counts %*s events\n", unit_width, "unit"); break; + case AGGR_THREAD: + fprintf(output, "# time comm-pid counts %*s events\n", unit_width, "unit"); + break; case AGGR_GLOBAL: default: fprintf(output, "# time counts %*s events\n", unit_width, "unit"); @@ -944,6 +990,10 @@ static void print_counters(struct timespec *ts, int argc, const char **argv) case AGGR_SOCKET: print_aggr(prefix); break; + case AGGR_THREAD: + evlist__for_each(evsel_list, counter) + print_aggr_thread(counter, prefix); + break; case AGGR_GLOBAL: evlist__for_each(evsel_list, counter) print_counter_aggr(counter, prefix); @@ -1031,6 +1081,7 @@ static int perf_stat_init_aggr_mode(void) break; case AGGR_NONE: case AGGR_GLOBAL: + case AGGR_THREAD: default: break; } @@ -1255,6 +1306,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) "aggregate counts per processor socket", AGGR_SOCKET), OPT_SET_UINT(0, "per-core", &aggr_mode, "aggregate counts per physical processor core", AGGR_CORE), + OPT_SET_UINT(0, "per-thread", &aggr_mode, + "aggregate counts per thread", AGGR_THREAD), OPT_UINTEGER('D', "delay", &initial_delay, "ms to wait before starting measurement after program start"), OPT_END() @@ -1346,8 +1399,19 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) run_count = 1; } - /* no_aggr, cgroup are for system-wide only */ - if ((aggr_mode != AGGR_GLOBAL || nr_cgroups) && + if ((aggr_mode == AGGR_THREAD) && !target__has_task(&target)) { + fprintf(stderr, "The --per-thread option is only available " + "when monitoring via -p -t options.\n"); + parse_options_usage(NULL, options, "p", 1); + parse_options_usage(NULL, options, "t", 1); + goto out; + } + + /* + * no_aggr, cgroup are for system-wide only + * --per-thread is aggregated per thread, we dont mix it with cpu mode + */ + if (((aggr_mode != AGGR_GLOBAL && aggr_mode != AGGR_THREAD) || nr_cgroups) && !target__has_cpu(&target)) { fprintf(stderr, "both cgroup and no-aggregation " "modes only available in system-wide mode\n"); @@ -1375,6 +1439,14 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) } goto out; } + + /* + * Initialize thread_map with comm names, + * so we could print it out on output. + */ + if (aggr_mode == AGGR_THREAD) + thread_map__read_comms(evsel_list->threads); + if (interval && interval < 100) { pr_err("print interval must be >= 100ms\n"); parse_options_usage(stat_usage, options, "I", 1); diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 9f05c571befea4202aa70396c274acda7a1a52a6..1cfbe0a980ac77a1a1af31db8a01141a449a6a4f 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -30,6 +30,7 @@ enum aggr_mode { AGGR_GLOBAL, AGGR_SOCKET, AGGR_CORE, + AGGR_THREAD, }; struct perf_counts_values {