/*
 * builtin-stat.c
 *
 * Builtin stat command: Give a precise performance counters summary
 * overview about any workload, CPU or specific PID.
 *
 * Sample output:

   $ perf stat ./hackbench 10

  Time: 0.118

  Performance counter stats for './hackbench 10':

       1708.761321 task-clock                #   11.037 CPUs utilized
            41,190 context-switches          #    0.024 M/sec
             6,735 CPU-migrations            #    0.004 M/sec
            17,318 page-faults               #    0.010 M/sec
     5,205,202,243 cycles                    #    3.046 GHz
     3,856,436,920 stalled-cycles-frontend   #   74.09% frontend cycles idle
     1,600,790,871 stalled-cycles-backend    #   30.75% backend  cycles idle
     2,603,501,247 instructions              #    0.50  insns per cycle
                                             #    1.48  stalled cycles per insn
       484,357,498 branches                  #  283.455 M/sec
         6,388,934 branch-misses             #    1.32% of all branches

        0.154822978  seconds time elapsed

 *
 * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
 *
 * Improvements and fixes by:
 *
 *   Arjan van de Ven <arjan@linux.intel.com>
 *   Yanmin Zhang <yanmin.zhang@intel.com>
 *   Wu Fengguang <fengguang.wu@intel.com>
 *   Mike Galbraith <efault@gmx.de>
 *   Paul Mackerras <paulus@samba.org>
 *   Jaswinder Singh Rajput <jaswinder@kernel.org>
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */

44
#include "perf.h"
45
#include "builtin.h"
46
#include "util/util.h"
47 48
#include "util/parse-options.h"
#include "util/parse-events.h"
49
#include "util/event.h"
50
#include "util/evlist.h"
51
#include "util/evsel.h"
52
#include "util/debug.h"
53
#include "util/color.h"
54
#include "util/header.h"
55
#include "util/cpumap.h"
56
#include "util/thread.h"
57
#include "util/thread_map.h"
58 59

#include <sys/prctl.h>
60
#include <math.h>
61
#include <locale.h>
62

S
Stephane Eranian 已提交
63 64
#define DEFAULT_SEPARATOR	" "

65
static struct perf_event_attr default_attrs[] = {
66

67 68 69 70
  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK		},
  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES	},
  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS		},
  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS		},
71

72
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES		},
73 74
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND	},
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND	},
75 76 77
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS		},
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS	},
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES		},
78

79
};
80

81
/*
 * Detailed stats (-d), covering the L1 and last level data caches.
 *
 * Each config packs the cache id in bits 0-7, the operation in bits
 * 8-15 and the result in bits 16-23.
 */
static struct perf_event_attr detailed_attrs[] = {
	/* L1 data cache: read accesses */
	{
		.type	= PERF_TYPE_HW_CACHE,
		.config	= (PERF_COUNT_HW_CACHE_L1D << 0) |
			  (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			  (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
	},
	/* L1 data cache: read misses */
	{
		.type	= PERF_TYPE_HW_CACHE,
		.config	= (PERF_COUNT_HW_CACHE_L1D << 0) |
			  (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			  (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
	},
	/* Last level cache: read accesses */
	{
		.type	= PERF_TYPE_HW_CACHE,
		.config	= (PERF_COUNT_HW_CACHE_LL << 0) |
			  (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			  (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
	},
	/* Last level cache: read misses */
	{
		.type	= PERF_TYPE_HW_CACHE,
		.config	= (PERF_COUNT_HW_CACHE_LL << 0) |
			  (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			  (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
	},
};

111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
/*
 * Very detailed stats (-d -d), covering the instruction cache and the
 * TLB caches. Every entry counts read operations; entries come in
 * access/miss pairs.
 */
static struct perf_event_attr very_detailed_attrs[] = {
	/* L1 instruction cache: read accesses */
	{
		.type	= PERF_TYPE_HW_CACHE,
		.config	= (PERF_COUNT_HW_CACHE_L1I << 0) |
			  (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			  (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
	},
	/* L1 instruction cache: read misses */
	{
		.type	= PERF_TYPE_HW_CACHE,
		.config	= (PERF_COUNT_HW_CACHE_L1I << 0) |
			  (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			  (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
	},
	/* Data TLB: read accesses */
	{
		.type	= PERF_TYPE_HW_CACHE,
		.config	= (PERF_COUNT_HW_CACHE_DTLB << 0) |
			  (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			  (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
	},
	/* Data TLB: read misses */
	{
		.type	= PERF_TYPE_HW_CACHE,
		.config	= (PERF_COUNT_HW_CACHE_DTLB << 0) |
			  (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			  (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
	},
	/* Instruction TLB: read accesses */
	{
		.type	= PERF_TYPE_HW_CACHE,
		.config	= (PERF_COUNT_HW_CACHE_ITLB << 0) |
			  (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			  (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
	},
	/* Instruction TLB: read misses */
	{
		.type	= PERF_TYPE_HW_CACHE,
		.config	= (PERF_COUNT_HW_CACHE_ITLB << 0) |
			  (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			  (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
	},
};

/*
 * Very, very detailed stats (-d -d -d), adding prefetch events for the
 * L1 data cache.
 */
static struct perf_event_attr very_very_detailed_attrs[] = {
	/* L1 data cache: prefetch accesses */
	{
		.type	= PERF_TYPE_HW_CACHE,
		.config	= (PERF_COUNT_HW_CACHE_L1D << 0) |
			  (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
			  (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
	},
	/* L1 data cache: prefetch misses */
	{
		.type	= PERF_TYPE_HW_CACHE,
		.config	= (PERF_COUNT_HW_CACHE_L1D << 0) |
			  (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
			  (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
	},
};



174 175
/* List of events being counted; created in cmd_stat(). */
struct perf_evlist		*evsel_list;

/* Command line state, see options[] for the matching switches: */
static bool			system_wide	= false;	/* -a */
static int			run_idx		= 0;		/* current run in cmd_stat() */
static int			run_count	= 1;		/* -r */
static bool			no_inherit	= false;	/* -i */
static bool			scale		= true;		/* -c */
static bool			no_aggr		= false;	/* -A */
static pid_t			target_pid	= -1;		/* -p */
static pid_t			target_tid	= -1;		/* -t */
static pid_t			child_pid	= -1;		/* forked workload, if any */
static bool			null_run	= false;	/* -n */
static int			detailed_run	= 0;		/* number of -d given */
static bool			sync_run	= false;	/* -S */
static bool			big_num		= true;		/* thousands' separators */
static int			big_num_opt	= -1;		/* -1: not given, 0: --no-big-num, 1: -B */
static const char		*cpu_list;			/* -C */
static const char		*csv_sep	= NULL;		/* -x */
static bool			csv_output	= false;	/* set when -x was given */

/* Set by skip_signal() to end the wait loop when no child was forked. */
static volatile int done = 0;

197 198
/*
 * Running statistics over a series of samples, maintained by
 * update_stats(): sample count, running mean and the running sum of
 * squared differences used by stddev_stats().
 */
struct stats {
	double n;
	double mean;
	double M2;
};
201

202 203 204 205
/*
 * Per-event private data, stored in evsel->priv.
 *
 * res_stats[] accumulates, across runs, the three aggregated values
 * read for the event by read_counter_aggr(): [0] is averaged and
 * printed as the count, [1] and [2] are averaged as the "enabled" and
 * "running" values when a scaled counter is printed.
 */
struct perf_stat {
	struct stats	  res_stats[3];
};

206
static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
207
{
208
	evsel->priv = zalloc(sizeof(struct perf_stat));
209 210 211 212 213 214 215 216 217
	return evsel->priv == NULL ? -ENOMEM : 0;
}

/* Release evsel->priv and leave the pointer cleared. */
static void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
{
	void *priv = evsel->priv;

	evsel->priv = NULL;
	free(priv);
}

218 219
/*
 * Fold one more sample into the running statistics: bump the sample
 * count, move the mean towards the sample and accumulate M2 from the
 * distance to the mean before and after the move.
 */
static void update_stats(struct stats *stats, u64 val)
{
	const double sample = val;
	double dist_before;

	stats->n += 1.0;
	dist_before = sample - stats->mean;
	stats->mean += dist_before / stats->n;
	stats->M2 += dist_before * (sample - stats->mean);
}

228 229
static double avg_stats(struct stats *stats)
{
230
	return stats->mean;
231
}
232

233
/*
 * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
 *
 *       (\Sum n_i^2) - ((\Sum n_i)^2)/n
 * s^2 = -------------------------------
 *                  n - 1
 *
 * http://en.wikipedia.org/wiki/Stddev
 *
 * The std dev of the mean is related to the std dev by:
 *
 *             s
 * s_mean = -------
 *          sqrt(n)
 *
 * Returns the standard deviation of the mean of the samples recorded
 * in @stats. With fewer than two samples the sample variance is not
 * defined (the divisor n - 1 would be zero or negative), so 0.0 is
 * returned instead of NaN/inf.
 */
static double stddev_stats(struct stats *stats)
{
	double variance, variance_mean;

	if (stats->n < 2)
		return 0.0;

	variance = stats->M2 / (stats->n - 1);
	variance_mean = variance / stats->n;

	return sqrt(variance_mean);
}
256

257 258
struct stats			runtime_nsecs_stats[MAX_NR_CPUS];
struct stats			runtime_cycles_stats[MAX_NR_CPUS];
259 260
struct stats			runtime_stalled_cycles_front_stats[MAX_NR_CPUS];
struct stats			runtime_stalled_cycles_back_stats[MAX_NR_CPUS];
261
struct stats			runtime_branches_stats[MAX_NR_CPUS];
262
struct stats			runtime_cacherefs_stats[MAX_NR_CPUS];
263
struct stats			runtime_l1_dcache_stats[MAX_NR_CPUS];
264
struct stats			walltime_nsecs_stats;
265

266
static int create_perf_stat_counter(struct perf_evsel *evsel)
267
{
268
	struct perf_event_attr *attr = &evsel->attr;
269

270
	if (scale)
271 272
		attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
				    PERF_FORMAT_TOTAL_TIME_RUNNING;
273

274 275
	attr->inherit = !no_inherit;

276
	if (system_wide)
277
		return perf_evsel__open_per_cpu(evsel, evsel_list->cpus, false);
278 279 280 281

	if (target_pid == -1 && target_tid == -1) {
		attr->disabled = 1;
		attr->enable_on_exec = 1;
282
	}
283

284
	return perf_evsel__open_per_thread(evsel, evsel_list->threads, false);
285 286
}

287 288 289
/*
 * Does the counter have nsecs as a unit?
 */
290
static inline int nsec_counter(struct perf_evsel *evsel)
291
{
292 293
	if (perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) ||
	    perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
294 295 296 297 298
		return 1;

	return 0;
}

I
Ingo Molnar 已提交
299 300 301 302 303 304 305 306 307 308 309
/*
 * Update various tracking values we maintain to print
 * more semantic information such as miss/hit ratios,
 * instruction rates, etc.
 *
 * Picks the shadow statistic that corresponds to @counter, if any, and
 * records count[0] in its first element. Counters without a matching
 * shadow statistic are ignored.
 */
static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
{
	struct stats *shadow = NULL;

	if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
		shadow = &runtime_nsecs_stats[0];
	else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
		shadow = &runtime_cycles_stats[0];
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
		shadow = &runtime_stalled_cycles_front_stats[0];
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
		shadow = &runtime_stalled_cycles_back_stats[0];
	else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
		shadow = &runtime_branches_stats[0];
	else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
		shadow = &runtime_cacherefs_stats[0];
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
		shadow = &runtime_l1_dcache_stats[0];

	if (shadow != NULL)
		update_stats(shadow, count[0]);
}

322
/*
323
 * Read out the results of a single counter:
324
 * aggregate counts across CPUs in system-wide mode
325
 */
326
static int read_counter_aggr(struct perf_evsel *counter)
327
{
328
	struct perf_stat *ps = counter->priv;
329 330
	u64 *count = counter->counts->aggr.values;
	int i;
331

332 333
	if (__perf_evsel__read(counter, evsel_list->cpus->nr,
			       evsel_list->threads->nr, scale) < 0)
334
		return -1;
335 336

	for (i = 0; i < 3; i++)
337
		update_stats(&ps->res_stats[i], count[i]);
338 339

	if (verbose) {
340 341
		fprintf(stderr, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
			event_name(counter), count[0], count[1], count[2]);
342 343
	}

344 345 346
	/*
	 * Save the full runtime - to allow normalization during printout:
	 */
I
Ingo Molnar 已提交
347
	update_shadow_stats(counter, count);
348 349

	return 0;
350 351 352 353 354 355
}

/*
 * Read out the results of a single counter:
 * do not aggregate counts across CPUs in system-wide mode
 */
356
static int read_counter(struct perf_evsel *counter)
357
{
358
	u64 *count;
359 360
	int cpu;

361
	for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) {
362 363
		if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0)
			return -1;
364

365
		count = counter->counts->cpu[cpu].values;
366

I
Ingo Molnar 已提交
367
		update_shadow_stats(counter, count);
368
	}
369 370

	return 0;
371 372
}

373
/*
 * Perform one measurement run.
 *
 * When a command was given (argc > 0) it is forked and held back on a
 * pipe until all counters are open; otherwise the function sleeps
 * until skip_signal() sets 'done'. The elapsed wall-clock time is
 * recorded, then every counter is read and its file descriptors are
 * closed.
 *
 * Failing to create the pipes or to fork terminates perf with exit
 * status 1; a counter that cannot be opened for a reason other than
 * "not supported" ends in die().
 *
 * Returns WEXITSTATUS() of the collected wait status (0 when nothing
 * was forked), or -1 if the event filters could not be set.
 */
static int run_perf_stat(int argc __used, const char **argv)
{
	unsigned long long t0, t1;
	struct perf_evsel *counter;
	int status = 0;
	int child_ready_pipe[2], go_pipe[2];
	const bool forks = (argc > 0);
	char buf;

	if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
		perror("failed to create pipes");
		exit(1);
	}

	if (forks) {
		child_pid = fork();
		if (child_pid < 0) {
			/*
			 * Without a child there is nothing to measure and
			 * child_pid (-1) must not end up in the thread map.
			 */
			perror("failed to fork");
			exit(1);
		}

		if (!child_pid) {
			close(child_ready_pipe[0]);
			close(go_pipe[1]);
			fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);

			/*
			 * Do a dummy execvp to get the PLT entry resolved,
			 * so we avoid the resolver overhead on the real
			 * execvp call.
			 */
			execvp("", (char **)argv);

			/*
			 * Tell the parent we're ready to go
			 */
			close(child_ready_pipe[1]);

			/*
			 * Wait until the parent tells us to go.
			 */
			if (read(go_pipe[0], &buf, 1) == -1)
				perror("unable to read pipe");

			execvp(argv[0], (char **)argv);

			perror(argv[0]);
			exit(-1);
		}

		if (target_tid == -1 && target_pid == -1 && !system_wide)
			evsel_list->threads->map[0] = child_pid;

		/*
		 * Wait for the child to be ready to exec.
		 */
		close(child_ready_pipe[1]);
		close(go_pipe[0]);
		if (read(child_ready_pipe[0], &buf, 1) == -1)
			perror("unable to read pipe");
		close(child_ready_pipe[0]);
	}

	list_for_each_entry(counter, &evsel_list->entries, node) {
		if (create_perf_stat_counter(counter) < 0) {
			/* Unsupported events are skipped, not fatal. */
			if (errno == EINVAL || errno == ENOSYS || errno == ENOENT) {
				if (verbose)
					ui__warning("%s event is not supported by the kernel.\n",
						    event_name(counter));
				continue;
			}

			if (errno == EPERM || errno == EACCES) {
				error("You may not have permission to collect %sstats.\n"
				      "\t Consider tweaking"
				      " /proc/sys/kernel/perf_event_paranoid or running as root.",
				      system_wide ? "system-wide " : "");
			} else {
				error("open_counter returned with %d (%s). "
				      "/bin/dmesg may provide additional information.\n",
				       errno, strerror(errno));
			}
			if (child_pid != -1)
				kill(child_pid, SIGTERM);
			die("Not all events could be opened.\n");
			return -1;
		}
	}

	if (perf_evlist__set_filters(evsel_list)) {
		error("failed to set filter with %d (%s)\n", errno,
			strerror(errno));
		return -1;
	}

	/*
	 * Enable counters and exec the command:
	 */
	t0 = rdclock();

	if (forks) {
		/* Closing the write end releases the child blocked in read(). */
		close(go_pipe[1]);
		wait(&status);
	} else {
		while(!done) sleep(1);
	}

	t1 = rdclock();

	update_stats(&walltime_nsecs_stats, t1 - t0);

	if (no_aggr) {
		list_for_each_entry(counter, &evsel_list->entries, node) {
			read_counter(counter);
			perf_evsel__close_fd(counter, evsel_list->cpus->nr, 1);
		}
	} else {
		list_for_each_entry(counter, &evsel_list->entries, node) {
			read_counter_aggr(counter);
			perf_evsel__close_fd(counter, evsel_list->cpus->nr,
					     evsel_list->threads->nr);
		}
	}

	return WEXITSTATUS(status);
}

497 498 499 500 501 502 503 504 505 506
/*
 * Print @total as a percentage of @avg in the "( +- x.xx% )" form.
 * A zero @avg prints 0.00% instead of dividing by zero.
 */
static void print_noise_pct(double total, double avg)
{
	const double pct = avg ? 100.0*total/avg : 0.0;

	fprintf(stderr, "  ( +-%6.2f%% )", pct);
}

507
static void print_noise(struct perf_evsel *evsel, double avg)
508
{
509 510
	struct perf_stat *ps;

511 512 513
	if (run_count == 1)
		return;

514
	ps = evsel->priv;
515
	print_noise_pct(stddev_stats(&ps->res_stats[0]), avg);
516 517
}

518
static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg)
I
Ingo Molnar 已提交
519
{
520
	double msecs = avg / 1e6;
S
Stephane Eranian 已提交
521
	char cpustr[16] = { '\0', };
522
	const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-25s";
I
Ingo Molnar 已提交
523

524
	if (no_aggr)
S
Stephane Eranian 已提交
525 526
		sprintf(cpustr, "CPU%*d%s",
			csv_output ? 0 : -4,
527
			evsel_list->cpus->map[cpu], csv_sep);
S
Stephane Eranian 已提交
528

529
	fprintf(stderr, fmt, cpustr, msecs, csv_sep, event_name(evsel));
S
Stephane Eranian 已提交
530

S
Stephane Eranian 已提交
531 532 533
	if (evsel->cgrp)
		fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name);

S
Stephane Eranian 已提交
534 535
	if (csv_output)
		return;
I
Ingo Molnar 已提交
536

537
	if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
538
		fprintf(stderr, " # %8.3f CPUs utilized          ", avg / avg_stats(&walltime_nsecs_stats));
I
Ingo Molnar 已提交
539 540
}

541 542 543 544 545 546 547 548 549 550 551
/*
 * Print @avg as a percentage of the average cycle count recorded for
 * @cpu, colored by severity (above 50% red, above 30% magenta, above
 * 10% yellow). The ratio is 0 when no cycles were recorded.
 */
static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __used, double avg)
{
	const double cycles = avg_stats(&runtime_cycles_stats[cpu]);
	const double pct = cycles ? avg / cycles * 100.0 : 0.0;
	const char *color;

	if (pct > 50.0)
		color = PERF_COLOR_RED;
	else if (pct > 30.0)
		color = PERF_COLOR_MAGENTA;
	else if (pct > 10.0)
		color = PERF_COLOR_YELLOW;
	else
		color = PERF_COLOR_NORMAL;

	fprintf(stderr, " #  ");
	color_fprintf(stderr, color, "%6.2f%%", pct);
	fprintf(stderr, " frontend cycles idle   ");
}

/*
 * Print @avg as a percentage of the average cycle count recorded for
 * @cpu, colored by severity (above 75% red, above 50% magenta, above
 * 20% yellow). The ratio is 0 when no cycles were recorded.
 */
static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __used, double avg)
{
	const double cycles = avg_stats(&runtime_cycles_stats[cpu]);
	const double pct = cycles ? avg / cycles * 100.0 : 0.0;
	const char *color;

	if (pct > 75.0)
		color = PERF_COLOR_RED;
	else if (pct > 50.0)
		color = PERF_COLOR_MAGENTA;
	else if (pct > 20.0)
		color = PERF_COLOR_YELLOW;
	else
		color = PERF_COLOR_NORMAL;

	fprintf(stderr, " #  ");
	color_fprintf(stderr, color, "%6.2f%%", pct);
	fprintf(stderr, " backend  cycles idle   ");
}

587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604
/*
 * Print @avg as a percentage of the average branch count recorded for
 * @cpu, colored by severity (above 20% red, above 10% magenta, above
 * 5% yellow). The ratio is 0 when no branches were recorded.
 */
static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double avg)
{
	const double branches = avg_stats(&runtime_branches_stats[cpu]);
	const double pct = branches ? avg / branches * 100.0 : 0.0;
	const char *color;

	if (pct > 20.0)
		color = PERF_COLOR_RED;
	else if (pct > 10.0)
		color = PERF_COLOR_MAGENTA;
	else if (pct > 5.0)
		color = PERF_COLOR_YELLOW;
	else
		color = PERF_COLOR_NORMAL;

	fprintf(stderr, " #  ");
	color_fprintf(stderr, color, "%6.2f%%", pct);
	fprintf(stderr, " of all branches        ");
}

610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627
/*
 * Print @avg as a percentage of the average L1 data cache count
 * recorded for @cpu, colored by severity (above 20% red, above 10%
 * magenta, above 5% yellow). The ratio is 0 when nothing was recorded.
 */
static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
{
	const double accesses = avg_stats(&runtime_l1_dcache_stats[cpu]);
	const double pct = accesses ? avg / accesses * 100.0 : 0.0;
	const char *color;

	if (pct > 20.0)
		color = PERF_COLOR_RED;
	else if (pct > 10.0)
		color = PERF_COLOR_MAGENTA;
	else if (pct > 5.0)
		color = PERF_COLOR_YELLOW;
	else
		color = PERF_COLOR_NORMAL;

	fprintf(stderr, " #  ");
	color_fprintf(stderr, color, "%6.2f%%", pct);
	fprintf(stderr, " of all L1-dcache hits  ");
}

633
static void abs_printout(int cpu, struct perf_evsel *evsel, double avg)
I
Ingo Molnar 已提交
634
{
635
	double total, ratio = 0.0;
636
	char cpustr[16] = { '\0', };
S
Stephane Eranian 已提交
637 638 639 640 641
	const char *fmt;

	if (csv_output)
		fmt = "%s%.0f%s%s";
	else if (big_num)
642
		fmt = "%s%'18.0f%s%-25s";
S
Stephane Eranian 已提交
643
	else
644
		fmt = "%s%18.0f%s%-25s";
645 646

	if (no_aggr)
S
Stephane Eranian 已提交
647 648
		sprintf(cpustr, "CPU%*d%s",
			csv_output ? 0 : -4,
649
			evsel_list->cpus->map[cpu], csv_sep);
650 651
	else
		cpu = 0;
652

653
	fprintf(stderr, fmt, cpustr, avg, csv_sep, event_name(evsel));
S
Stephane Eranian 已提交
654

S
Stephane Eranian 已提交
655 656 657
	if (evsel->cgrp)
		fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name);

S
Stephane Eranian 已提交
658 659
	if (csv_output)
		return;
I
Ingo Molnar 已提交
660

661
	if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
662
		total = avg_stats(&runtime_cycles_stats[cpu]);
663 664 665 666

		if (total)
			ratio = avg / total;

667
		fprintf(stderr, " #   %5.2f  insns per cycle        ", ratio);
668

669 670
		total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
		total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));
671 672 673

		if (total && avg) {
			ratio = total / avg;
674
			fprintf(stderr, "\n                                             #   %5.2f  stalled cycles per insn", ratio);
675 676
		}

677
	} else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
678
			runtime_branches_stats[cpu].n != 0) {
679
		print_branch_misses(cpu, evsel, avg);
680 681 682 683 684
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1D |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
685
			runtime_l1_dcache_stats[cpu].n != 0) {
686
		print_l1_dcache_misses(cpu, evsel, avg);
687 688 689 690 691 692 693
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
			runtime_cacherefs_stats[cpu].n != 0) {
		total = avg_stats(&runtime_cacherefs_stats[cpu]);

		if (total)
			ratio = avg * 100 / total;

694
		fprintf(stderr, " # %8.3f %% of all cache refs    ", ratio);
695

696 697
	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
		print_stalled_cycles_frontend(cpu, evsel, avg);
698
	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
699
		print_stalled_cycles_backend(cpu, evsel, avg);
700
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
701
		total = avg_stats(&runtime_nsecs_stats[cpu]);
702 703

		if (total)
704
			ratio = 1.0 * avg / total;
705

706 707 708
		fprintf(stderr, " # %8.3f GHz                    ", ratio);
	} else if (runtime_nsecs_stats[cpu].n != 0) {
		total = avg_stats(&runtime_nsecs_stats[cpu]);
709 710

		if (total)
711
			ratio = 1000.0 * avg / total;
712

713
		fprintf(stderr, " # %8.3f M/sec                  ", ratio);
714 715
	} else {
		fprintf(stderr, "                                   ");
I
Ingo Molnar 已提交
716 717 718
	}
}

719 720
/*
 * Print out the results of a single counter:
721
 * aggregated counts in system-wide mode
722
 */
723
static void print_counter_aggr(struct perf_evsel *counter)
724
{
725 726
	struct perf_stat *ps = counter->priv;
	double avg = avg_stats(&ps->res_stats[0]);
727
	int scaled = counter->counts->scaled;
728 729

	if (scaled == -1) {
S
Stephane Eranian 已提交
730
		fprintf(stderr, "%*s%s%*s",
S
Stephane Eranian 已提交
731
			csv_output ? 0 : 18,
S
Stephane Eranian 已提交
732 733 734 735 736 737 738 739 740
			"<not counted>",
			csv_sep,
			csv_output ? 0 : -24,
			event_name(counter));

		if (counter->cgrp)
			fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name);

		fputc('\n', stderr);
741 742
		return;
	}
743

I
Ingo Molnar 已提交
744
	if (nsec_counter(counter))
745
		nsec_printout(-1, counter, avg);
I
Ingo Molnar 已提交
746
	else
747
		abs_printout(-1, counter, avg);
748

S
Stephane Eranian 已提交
749 750 751 752 753
	if (csv_output) {
		fputc('\n', stderr);
		return;
	}

754
	print_noise(counter, avg);
755 756 757 758

	if (scaled) {
		double avg_enabled, avg_running;

759 760
		avg_enabled = avg_stats(&ps->res_stats[1]);
		avg_running = avg_stats(&ps->res_stats[2]);
761

762
		fprintf(stderr, " [%5.2f%%]", 100 * avg_running / avg_enabled);
763
	}
764 765 766
	fprintf(stderr, "\n");
}

767 768 769 770
/*
 * Print out the results of a single counter:
 * does not use aggregated count in system-wide
 */
771
static void print_counter(struct perf_evsel *counter)
772 773 774 775
{
	u64 ena, run, val;
	int cpu;

776
	for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) {
777 778 779
		val = counter->counts->cpu[cpu].val;
		ena = counter->counts->cpu[cpu].ena;
		run = counter->counts->cpu[cpu].run;
780
		if (run == 0 || ena == 0) {
S
Stephane Eranian 已提交
781
			fprintf(stderr, "CPU%*d%s%*s%s%*s",
S
Stephane Eranian 已提交
782
				csv_output ? 0 : -4,
783
				evsel_list->cpus->map[cpu], csv_sep,
S
Stephane Eranian 已提交
784 785
				csv_output ? 0 : 18,
				"<not counted>", csv_sep,
S
Stephane Eranian 已提交
786
				csv_output ? 0 : -24,
S
Stephane Eranian 已提交
787
				event_name(counter));
788

S
Stephane Eranian 已提交
789 790 791 792
			if (counter->cgrp)
				fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name);

			fputc('\n', stderr);
793 794 795 796 797 798 799 800
			continue;
		}

		if (nsec_counter(counter))
			nsec_printout(cpu, counter, val);
		else
			abs_printout(cpu, counter, val);

S
Stephane Eranian 已提交
801 802
		if (!csv_output) {
			print_noise(counter, 1.0);
803

804 805
			if (run != ena)
				fprintf(stderr, "  (%.2f%%)", 100.0 * run / ena);
806
		}
S
Stephane Eranian 已提交
807
		fputc('\n', stderr);
808 809 810
	}
}

811 812
static void print_stat(int argc, const char **argv)
{
813 814
	struct perf_evsel *counter;
	int i;
815

816 817
	fflush(stdout);

S
Stephane Eranian 已提交
818 819 820 821 822 823 824 825 826 827 828
	if (!csv_output) {
		fprintf(stderr, "\n");
		fprintf(stderr, " Performance counter stats for ");
		if(target_pid == -1 && target_tid == -1) {
			fprintf(stderr, "\'%s", argv[0]);
			for (i = 1; i < argc; i++)
				fprintf(stderr, " %s", argv[i]);
		} else if (target_pid != -1)
			fprintf(stderr, "process id \'%d", target_pid);
		else
			fprintf(stderr, "thread id \'%d", target_tid);
I
Ingo Molnar 已提交
829

S
Stephane Eranian 已提交
830 831 832 833 834
		fprintf(stderr, "\'");
		if (run_count > 1)
			fprintf(stderr, " (%d runs)", run_count);
		fprintf(stderr, ":\n\n");
	}
835

836
	if (no_aggr) {
837
		list_for_each_entry(counter, &evsel_list->entries, node)
838 839
			print_counter(counter);
	} else {
840
		list_for_each_entry(counter, &evsel_list->entries, node)
841 842
			print_counter_aggr(counter);
	}
843

S
Stephane Eranian 已提交
844 845 846 847 848
	if (!csv_output) {
		fprintf(stderr, "\n");
		fprintf(stderr, " %18.9f  seconds time elapsed",
				avg_stats(&walltime_nsecs_stats)/1e9);
		if (run_count > 1) {
849 850
			print_noise_pct(stddev_stats(&walltime_nsecs_stats),
					avg_stats(&walltime_nsecs_stats));
S
Stephane Eranian 已提交
851 852
		}
		fprintf(stderr, "\n\n");
I
Ingo Molnar 已提交
853
	}
854 855
}

856 857
static volatile int signr = -1;

858
static void skip_signal(int signo)
859
{
860
	if(child_pid == -1)
861 862
		done = 1;

863 864 865 866 867
	signr = signo;
}

static void sig_atexit(void)
{
868 869 870
	if (child_pid != -1)
		kill(child_pid, SIGTERM);

871 872 873 874 875
	if (signr == -1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
876 877 878
}

/* NULL-terminated usage text handed to the option parser. */
static const char * const stat_usage[] = {
	"perf stat [<options>] [<command>]",
	NULL,
};

S
Stephane Eranian 已提交
883 884 885 886 887 888 889
/*
 * Option callback for -B/--big-num: record whether the option was
 * given (1) or negated (0) in big_num_opt. Always returns 0.
 */
static int stat__set_big_num(const struct option *opt __used,
			     const char *s __used, int unset)
{
	if (unset)
		big_num_opt = 0;
	else
		big_num_opt = 1;

	return 0;
}

890
static const struct option options[] = {
891
	OPT_CALLBACK('e', "event", &evsel_list, "event",
892 893
		     "event selector. use 'perf list' to list available events",
		     parse_events),
894 895
	OPT_CALLBACK(0, "filter", &evsel_list, "filter",
		     "event filter", parse_filter),
896 897
	OPT_BOOLEAN('i', "no-inherit", &no_inherit,
		    "child tasks do not inherit counters"),
898
	OPT_INTEGER('p', "pid", &target_pid,
899 900 901
		    "stat events on existing process id"),
	OPT_INTEGER('t', "tid", &target_tid,
		    "stat events on existing thread id"),
902
	OPT_BOOLEAN('a', "all-cpus", &system_wide,
903
		    "system-wide collection from all CPUs"),
904
	OPT_BOOLEAN('c', "scale", &scale,
905
		    "scale/normalize counters"),
906
	OPT_INCR('v', "verbose", &verbose,
907
		    "be more verbose (show counter open errors, etc)"),
908 909
	OPT_INTEGER('r', "repeat", &run_count,
		    "repeat command and print average + stddev (max: 100)"),
910 911
	OPT_BOOLEAN('n', "null", &null_run,
		    "null run - dont start any counters"),
912
	OPT_INCR('d', "detailed", &detailed_run,
913
		    "detailed run - start a lot of events"),
I
Ingo Molnar 已提交
914 915
	OPT_BOOLEAN('S', "sync", &sync_run,
		    "call sync() before starting a run"),
S
Stephane Eranian 已提交
916 917 918
	OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL, 
			   "print large numbers with thousands\' separators",
			   stat__set_big_num),
919 920
	OPT_STRING('C', "cpu", &cpu_list, "cpu",
		    "list of cpus to monitor in system-wide"),
921 922
	OPT_BOOLEAN('A', "no-aggr", &no_aggr,
		    "disable CPU count aggregation"),
S
Stephane Eranian 已提交
923 924
	OPT_STRING('x', "field-separator", &csv_sep, "separator",
		   "print counts with custom separator"),
S
Stephane Eranian 已提交
925 926 927
	OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
928 929 930
	OPT_END()
};

931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994
/*
 * Add default attributes, if there were no attributes specified or
 * if -d/--detailed, -d -d or -d -d -d is used:
 */
static int add_default_attributes(void)
{
	struct perf_evsel *pos;
	size_t attr_nr = 0;
	size_t c;

	/* Set attrs if no event is selected and !null_run: */
	if (null_run)
		return 0;

	if (!evsel_list->nr_entries) {
		for (c = 0; c < ARRAY_SIZE(default_attrs); c++) {
			pos = perf_evsel__new(default_attrs + c, c + attr_nr);
			if (pos == NULL)
				return -1;
			perf_evlist__add(evsel_list, pos);
		}
		attr_nr += c;
	}

	/* Detailed events get appended to the event list: */

	if (detailed_run <  1)
		return 0;

	/* Append detailed run extra attributes: */
	for (c = 0; c < ARRAY_SIZE(detailed_attrs); c++) {
		pos = perf_evsel__new(detailed_attrs + c, c + attr_nr);
		if (pos == NULL)
			return -1;
		perf_evlist__add(evsel_list, pos);
	}
	attr_nr += c;

	if (detailed_run < 2)
		return 0;

	/* Append very detailed run extra attributes: */
	for (c = 0; c < ARRAY_SIZE(very_detailed_attrs); c++) {
		pos = perf_evsel__new(very_detailed_attrs + c, c + attr_nr);
		if (pos == NULL)
			return -1;
		perf_evlist__add(evsel_list, pos);
	}

	if (detailed_run < 3)
		return 0;

	/* Append very, very detailed run extra attributes: */
	for (c = 0; c < ARRAY_SIZE(very_very_detailed_attrs); c++) {
		pos = perf_evsel__new(very_very_detailed_attrs + c, c + attr_nr);
		if (pos == NULL)
			return -1;
		perf_evlist__add(evsel_list, pos);
	}


	return 0;
}

995
/*
 * Entry point of 'perf stat': parse the command line, set up the event
 * list with its thread and cpu maps, perform run_count measurement
 * runs and print the report.
 *
 * Returns the status of the last run, -ENOMEM when the event list or
 * the per-event data cannot be set up, or -1 if the cpu map is missing.
 */
int cmd_stat(int argc, const char **argv, const char *prefix __used)
{
	struct perf_evsel *evsel;
	int status = -ENOMEM;

	setlocale(LC_ALL, "");

	evsel_list = perf_evlist__new(NULL, NULL);
	if (!evsel_list)
		return -ENOMEM;

	argc = parse_options(argc, argv, options, stat_usage,
		PARSE_OPT_STOP_AT_NON_OPTION);

	/* A field separator given with -x selects CSV output. */
	if (csv_sep != NULL)
		csv_output = true;
	else
		csv_sep = DEFAULT_SEPARATOR;

	/*
	 * In CSV mode let the spreadsheet do the pretty-printing: an
	 * explicit -B is rejected, otherwise big number formatting is
	 * switched off. Without CSV only --no-big-num switches it off.
	 */
	if (csv_output && big_num_opt == 1) {
		fprintf(stderr, "-B option not supported with -x\n");
		usage_with_options(stat_usage, options);
	} else if (csv_output || big_num_opt == 0) {
		big_num = false;
	}

	if (!argc && target_pid == -1 && target_tid == -1)
		usage_with_options(stat_usage, options);
	if (run_count <= 0)
		usage_with_options(stat_usage, options);

	/* no_aggr, cgroup are for system-wide only */
	if (!system_wide && (no_aggr || nr_cgroups)) {
		fprintf(stderr, "both cgroup and no-aggregation "
			"modes only available in system-wide mode\n");

		usage_with_options(stat_usage, options);
	}

	if (add_default_attributes())
		goto out;

	if (target_pid != -1)
		target_tid = target_pid;

	evsel_list->threads = thread_map__new(target_pid, target_tid);
	if (!evsel_list->threads) {
		pr_err("Problems finding threads of monitor\n");
		usage_with_options(stat_usage, options);
	}

	evsel_list->cpus = system_wide ? cpu_map__new(cpu_list)
				       : cpu_map__dummy_new();

	if (!evsel_list->cpus) {
		perror("failed to parse CPUs map");
		usage_with_options(stat_usage, options);
		return -1;
	}

	/* Per-event private stats, count storage and fd storage. */
	list_for_each_entry(evsel, &evsel_list->entries, node) {
		if (perf_evsel__alloc_stat_priv(evsel) < 0)
			goto out_free_fd;
		if (perf_evsel__alloc_counts(evsel, evsel_list->cpus->nr) < 0)
			goto out_free_fd;
		if (perf_evsel__alloc_fd(evsel, evsel_list->cpus->nr,
					 evsel_list->threads->nr) < 0)
			goto out_free_fd;
	}

	/*
	 * We dont want to block the signals - that would cause
	 * child tasks to inherit that and Ctrl-C would not work.
	 * What we want is for Ctrl-C to work in the exec()-ed
	 * task, but being ignored by perf stat itself:
	 */
	atexit(sig_atexit);
	signal(SIGINT,  skip_signal);
	signal(SIGALRM, skip_signal);
	signal(SIGABRT, skip_signal);

	status = 0;
	run_idx = 0;
	while (run_idx < run_count) {
		if (run_count != 1 && verbose)
			fprintf(stderr, "[ perf stat: executing run #%d ... ]\n", run_idx + 1);

		if (sync_run)
			sync();

		status = run_perf_stat(argc, argv);
		run_idx++;
	}

	if (status != -1)
		print_stat(argc, argv);

out_free_fd:
	list_for_each_entry(evsel, &evsel_list->entries, node)
		perf_evsel__free_stat_priv(evsel);
	perf_evlist__delete_maps(evsel_list);
out:
	perf_evlist__delete(evsel_list);
	return status;
}