提交 91e95617 编写于 作者: W Waiman Long 提交者: Arnaldo Carvalho de Melo

perf report: Add --max-stack option to limit callchain stack scan

When callgraph data is included in the perf data file, it may take a
long time to scan all of that data and merge it together, especially if
the stored callchains are long and the perf data file itself is large,
like a Gbyte or so.

The callchain stack is currently limited to PERF_MAX_STACK_DEPTH (127).
This is a large value. Usually the callgraph data that developers are
most interested in are the first few levels; the rest is usually not
looked at.

This patch adds a new --max-stack option to perf-report to limit the
depth of callchain stack data to look at, reducing the time it takes for
perf-report to finish its processing. It trades the presence of trailing
stack information for faster processing.

The following table shows the elapsed time of doing perf-report on a
perf.data file of size 985,531,828 bytes.

  --max-stack   Elapsed Time    Output data size
  -----------   ------------    ----------------
  not set        88.0s          124,422,651
  64             87.5s          116,303,213
  32             87.2s          112,023,804
  16             86.6s           94,326,380
  8              59.9s           33,697,248
  4              40.7s           10,116,637
  -g none        27.1s            2,555,810
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Acked-by: David Ahern <dsahern@gmail.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Scott J Norton <scott.norton@hp.com>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/1382107129-2010-4-git-send-email-Waiman.Long@hp.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
上级 cc9784bd
...@@ -141,6 +141,14 @@ OPTIONS ...@@ -141,6 +141,14 @@ OPTIONS
Default: fractal,0.5,callee,function. Default: fractal,0.5,callee,function.
--max-stack::
Set the stack depth limit when parsing the callchain, anything
beyond the specified depth will be ignored. This is a trade-off
between information loss and faster processing especially for
workloads that can have a very long callchain stack.
Default: 127
-G:: -G::
--inverted:: --inverted::
alias for inverted caller based call graph. alias for inverted caller based call graph.
......
...@@ -49,6 +49,7 @@ struct perf_report { ...@@ -49,6 +49,7 @@ struct perf_report {
bool show_threads; bool show_threads;
bool inverted_callchain; bool inverted_callchain;
bool mem_mode; bool mem_mode;
int max_stack;
struct perf_read_values show_threads_values; struct perf_read_values show_threads_values;
const char *pretty_printing_style; const char *pretty_printing_style;
const char *cpu_list; const char *cpu_list;
...@@ -90,7 +91,8 @@ static int perf_report__add_mem_hist_entry(struct perf_tool *tool, ...@@ -90,7 +91,8 @@ static int perf_report__add_mem_hist_entry(struct perf_tool *tool,
if ((sort__has_parent || symbol_conf.use_callchain) && if ((sort__has_parent || symbol_conf.use_callchain) &&
sample->callchain) { sample->callchain) {
err = machine__resolve_callchain(machine, evsel, al->thread, err = machine__resolve_callchain(machine, evsel, al->thread,
sample, &parent, al); sample, &parent, al,
rep->max_stack);
if (err) if (err)
return err; return err;
} }
...@@ -181,7 +183,8 @@ static int perf_report__add_branch_hist_entry(struct perf_tool *tool, ...@@ -181,7 +183,8 @@ static int perf_report__add_branch_hist_entry(struct perf_tool *tool,
if ((sort__has_parent || symbol_conf.use_callchain) if ((sort__has_parent || symbol_conf.use_callchain)
&& sample->callchain) { && sample->callchain) {
err = machine__resolve_callchain(machine, evsel, al->thread, err = machine__resolve_callchain(machine, evsel, al->thread,
sample, &parent, al); sample, &parent, al,
rep->max_stack);
if (err) if (err)
return err; return err;
} }
...@@ -244,18 +247,21 @@ static int perf_report__add_branch_hist_entry(struct perf_tool *tool, ...@@ -244,18 +247,21 @@ static int perf_report__add_branch_hist_entry(struct perf_tool *tool,
return err; return err;
} }
static int perf_evsel__add_hist_entry(struct perf_evsel *evsel, static int perf_evsel__add_hist_entry(struct perf_tool *tool,
struct perf_evsel *evsel,
struct addr_location *al, struct addr_location *al,
struct perf_sample *sample, struct perf_sample *sample,
struct machine *machine) struct machine *machine)
{ {
struct perf_report *rep = container_of(tool, struct perf_report, tool);
struct symbol *parent = NULL; struct symbol *parent = NULL;
int err = 0; int err = 0;
struct hist_entry *he; struct hist_entry *he;
if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) { if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) {
err = machine__resolve_callchain(machine, evsel, al->thread, err = machine__resolve_callchain(machine, evsel, al->thread,
sample, &parent, al); sample, &parent, al,
rep->max_stack);
if (err) if (err)
return err; return err;
} }
...@@ -332,7 +338,8 @@ static int process_sample_event(struct perf_tool *tool, ...@@ -332,7 +338,8 @@ static int process_sample_event(struct perf_tool *tool,
if (al.map != NULL) if (al.map != NULL)
al.map->dso->hit = 1; al.map->dso->hit = 1;
ret = perf_evsel__add_hist_entry(evsel, &al, sample, machine); ret = perf_evsel__add_hist_entry(tool, evsel, &al, sample,
machine);
if (ret < 0) if (ret < 0)
pr_debug("problem incrementing symbol period, skipping event\n"); pr_debug("problem incrementing symbol period, skipping event\n");
} }
...@@ -772,6 +779,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) ...@@ -772,6 +779,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
.ordered_samples = true, .ordered_samples = true,
.ordering_requires_timestamps = true, .ordering_requires_timestamps = true,
}, },
.max_stack = PERF_MAX_STACK_DEPTH,
.pretty_printing_style = "normal", .pretty_printing_style = "normal",
}; };
const struct option options[] = { const struct option options[] = {
...@@ -812,6 +820,10 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) ...@@ -812,6 +820,10 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order", OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order",
"Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit, callchain order, key (function or address). " "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit, callchain order, key (function or address). "
"Default: fractal,0.5,callee,function", &parse_callchain_opt, callchain_default_opt), "Default: fractal,0.5,callee,function", &parse_callchain_opt, callchain_default_opt),
OPT_INTEGER(0, "max-stack", &report.max_stack,
"Set the maximum stack depth when parsing the callchain, "
"anything beyond the specified depth will be ignored. "
"Default: " __stringify(PERF_MAX_STACK_DEPTH)),
OPT_BOOLEAN('G', "inverted", &report.inverted_callchain, OPT_BOOLEAN('G', "inverted", &report.inverted_callchain,
"alias for inverted call graph"), "alias for inverted call graph"),
OPT_CALLBACK(0, "ignore-callees", NULL, "regex", OPT_CALLBACK(0, "ignore-callees", NULL, "regex",
......
...@@ -770,7 +770,8 @@ static void perf_event__process_sample(struct perf_tool *tool, ...@@ -770,7 +770,8 @@ static void perf_event__process_sample(struct perf_tool *tool,
sample->callchain) { sample->callchain) {
err = machine__resolve_callchain(machine, evsel, err = machine__resolve_callchain(machine, evsel,
al.thread, sample, al.thread, sample,
&parent, &al); &parent, &al,
PERF_MAX_STACK_DEPTH);
if (err) if (err)
return; return;
} }
......
...@@ -1253,10 +1253,12 @@ static int machine__resolve_callchain_sample(struct machine *machine, ...@@ -1253,10 +1253,12 @@ static int machine__resolve_callchain_sample(struct machine *machine,
struct thread *thread, struct thread *thread,
struct ip_callchain *chain, struct ip_callchain *chain,
struct symbol **parent, struct symbol **parent,
struct addr_location *root_al) struct addr_location *root_al,
int max_stack)
{ {
u8 cpumode = PERF_RECORD_MISC_USER; u8 cpumode = PERF_RECORD_MISC_USER;
unsigned int i; int chain_nr = min(max_stack, (int)chain->nr);
int i;
int err; int err;
callchain_cursor_reset(&callchain_cursor); callchain_cursor_reset(&callchain_cursor);
...@@ -1266,7 +1268,7 @@ static int machine__resolve_callchain_sample(struct machine *machine, ...@@ -1266,7 +1268,7 @@ static int machine__resolve_callchain_sample(struct machine *machine,
return 0; return 0;
} }
for (i = 0; i < chain->nr; i++) { for (i = 0; i < chain_nr; i++) {
u64 ip; u64 ip;
struct addr_location al; struct addr_location al;
...@@ -1338,12 +1340,14 @@ int machine__resolve_callchain(struct machine *machine, ...@@ -1338,12 +1340,14 @@ int machine__resolve_callchain(struct machine *machine,
struct thread *thread, struct thread *thread,
struct perf_sample *sample, struct perf_sample *sample,
struct symbol **parent, struct symbol **parent,
struct addr_location *root_al) struct addr_location *root_al,
int max_stack)
{ {
int ret; int ret;
ret = machine__resolve_callchain_sample(machine, thread, ret = machine__resolve_callchain_sample(machine, thread,
sample->callchain, parent, root_al); sample->callchain, parent,
root_al, max_stack);
if (ret) if (ret)
return ret; return ret;
......
...@@ -92,7 +92,8 @@ int machine__resolve_callchain(struct machine *machine, ...@@ -92,7 +92,8 @@ int machine__resolve_callchain(struct machine *machine,
struct thread *thread, struct thread *thread,
struct perf_sample *sample, struct perf_sample *sample,
struct symbol **parent, struct symbol **parent,
struct addr_location *root_al); struct addr_location *root_al,
int max_stack);
/* /*
* Default guest kernel is defined by parameter --guestkallsyms * Default guest kernel is defined by parameter --guestkallsyms
......
...@@ -1512,7 +1512,8 @@ void perf_evsel__print_ip(struct perf_evsel *evsel, union perf_event *event, ...@@ -1512,7 +1512,8 @@ void perf_evsel__print_ip(struct perf_evsel *evsel, union perf_event *event,
if (symbol_conf.use_callchain && sample->callchain) { if (symbol_conf.use_callchain && sample->callchain) {
if (machine__resolve_callchain(machine, evsel, al.thread, if (machine__resolve_callchain(machine, evsel, al.thread,
sample, NULL, NULL) != 0) { sample, NULL, NULL,
PERF_MAX_STACK_DEPTH) != 0) {
if (verbose) if (verbose)
error("Failed to resolve callchain. Skipping\n"); error("Failed to resolve callchain. Skipping\n");
return; return;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册