/*
 * builtin-top.c
 *
 * Builtin top command: Display a continuously updated profile of
 * any workload, CPU or specific PID.
 *
 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
 *
 * Improvements and fixes by:
 *
 *   Arjan van de Ven <arjan@linux.intel.com>
 *   Yanmin Zhang <yanmin.zhang@intel.com>
 *   Wu Fengguang <fengguang.wu@intel.com>
 *   Mike Galbraith <efault@gmx.de>
 *   Paul Mackerras <paulus@samba.org>
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */
#include "builtin.h"

#include "perf.h"

#include "util/annotate.h"
#include "util/cache.h"
#include "util/color.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/session.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/thread_map.h"
#include "util/top.h"
#include "util/util.h"
#include <linux/rbtree.h>
#include "util/parse-options.h"
#include "util/parse-events.h"
#include "util/cpumap.h"
#include "util/xyarray.h"

#include "util/debug.h"

#include <assert.h>
#include <fcntl.h>

#include <stdio.h>
#include <termios.h>
#include <unistd.h>
#include <inttypes.h>

#include <errno.h>
#include <time.h>
#include <sched.h>

#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/poll.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <sys/uio.h>
#include <sys/mman.h>

#include <linux/unistd.h>
#include <linux/types.h>

#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))

static struct perf_top top = {
	.count_filter		= 5,
	.delay_secs		= 2,
	.display_weighted	= -1,
	.target_pid		= -1,
	.target_tid		= -1,
	.active_symbols		= LIST_HEAD_INIT(top.active_symbols),
	.active_symbols_lock	= PTHREAD_MUTEX_INITIALIZER,
	.freq			= 1000, /* 1 KHz */
};

static bool			system_wide			=  false;

static bool			use_tui, use_stdio;

static int			default_interval		=      0;

static bool			inherit				=  false;
static int			realtime_prio			=      0;
static bool			group				=  false;
static unsigned int		page_size;
static unsigned int		mmap_pages			=    128;

static bool			dump_symtab                     =  false;

static struct winsize		winsize;

static const char		*sym_filter			=   NULL;
struct sym_entry		*sym_filter_entry		=   NULL;
struct sym_entry		*sym_filter_entry_sched		=   NULL;
static int			sym_pcnt_filter			=      5;

/*
 * Source functions
 */

void get_term_dimensions(struct winsize *ws)
{
	char *s = getenv("LINES");

	if (s != NULL) {
		ws->ws_row = atoi(s);
		s = getenv("COLUMNS");
		if (s != NULL) {
			ws->ws_col = atoi(s);
			if (ws->ws_row && ws->ws_col)
				return;
		}
	}
#ifdef TIOCGWINSZ
	if (ioctl(1, TIOCGWINSZ, ws) == 0 &&
	    ws->ws_row && ws->ws_col)
		return;
#endif
	ws->ws_row = 25;
	ws->ws_col = 80;
}

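/*
 * Size the symbol list to the terminal: keep nine rows free for the
 * header that print_sym_table() emits above the entries.
 */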
static void update_print_entries(struct winsize *ws)
{
	top.print_entries = ws->ws_row;

	if (top.print_entries > 9)
		top.print_entries -= 9;
}

static void sig_winch_handler(int sig __used)
{
	get_term_dimensions(&winsize);
	update_print_entries(&winsize);
}

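/*
 * Set up annotation state for @syme: allocate its source struct and
 * histograms, run symbol__annotate() and, on success, make it the current
 * sym_filter_entry so show_details() has something to print.
 */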
static int parse_source(struct sym_entry *syme)
{
	struct symbol *sym;
	struct sym_entry_source *source;
	struct map *map;
	int err = -1;

	if (!syme)
		return -1;

	sym = sym_entry__symbol(syme);
	map = syme->map;

	/*
	 * We can't annotate with just /proc/kallsyms
	 */
	if (map->dso->origin == DSO__ORIG_KERNEL)
		return -1;

	if (syme->src == NULL) {
		syme->src = zalloc(sizeof(*source));
		if (syme->src == NULL)
			return -1;
		pthread_mutex_init(&syme->src->lock, NULL);
		INIT_LIST_HEAD(&syme->src->head);
	}

	source = syme->src;

	if (symbol__annotation(sym)->histograms != NULL) {
		pthread_mutex_lock(&source->lock);
		goto out_assign;
	}

	pthread_mutex_lock(&source->lock);

	if (symbol__alloc_hist(sym, top.evlist->nr_entries) < 0) {
		pr_err("Not enough memory for annotating '%s' symbol!\n",
		       sym->name);
		goto out_unlock;
	}

	err = symbol__annotate(sym, syme->map, &source->head, 0);
	if (err == 0) {
out_assign:
		sym_filter_entry = syme;
	}
out_unlock:
	pthread_mutex_unlock(&source->lock);
	return err;
}

static void __zero_source_counters(struct sym_entry *syme)
{
	struct symbol *sym = sym_entry__symbol(syme);
	symbol__annotate_zero_histograms(sym);
}

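/*
 * Feed a sample into the annotation histogram, but only for the symbol
 * currently being detailed (sym_filter_entry).
 */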
static void record_precise_ip(struct sym_entry *syme, int counter, u64 ip)
{
	if (syme != sym_filter_entry)
		return;

	if (pthread_mutex_trylock(&syme->src->lock))
		return;

	ip = syme->map->map_ip(syme->map, ip);
	symbol__inc_addr_samples(sym_entry__symbol(syme), syme->map, counter, ip);

	pthread_mutex_unlock(&syme->src->lock);
}

static void show_details(struct sym_entry *syme)
{
	struct symbol *symbol;
	int more;

	if (!syme)
		return;

	symbol = sym_entry__symbol(syme);
	if (!syme->src || symbol__annotation(symbol)->histograms == NULL)
		return;

	printf("Showing %s for %s\n", event_name(top.sym_evsel), symbol->name);
	printf("  Events  Pcnt (>=%d%%)\n", sym_pcnt_filter);

	pthread_mutex_lock(&syme->src->lock);
	more = symbol__annotate_printf(symbol, syme->map, &syme->src->head,
				       top.sym_evsel->idx, 0, sym_pcnt_filter,
				       top.print_entries);
	if (top.zero)
		symbol__annotate_zero_histogram(symbol, top.sym_evsel->idx);
	else
		symbol__annotate_decay_histogram(symbol, &syme->src->head,
						 top.sym_evsel->idx);
	pthread_mutex_unlock(&syme->src->lock);
	if (more != 0)
		printf("%d lines not displayed, maybe increase display entries [e]\n", more);
}

static const char		CONSOLE_CLEAR[] = "\033[H\033[2J";

static void __list_insert_active_sym(struct sym_entry *syme)
{
	list_add(&syme->node, &top.active_symbols);
}

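/*
 * Redraw the stdio screen: decay the per-symbol counters, print the
 * header (plus a lost-event warning if needed) and then the hottest
 * symbols, or the annotation details if one symbol is selected.
 */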
static void print_sym_table(struct perf_session *session)
{
	char bf[160];
	int printed = 0;
	struct rb_node *nd;
	struct sym_entry *syme;
	struct rb_root tmp = RB_ROOT;
	const int win_width = winsize.ws_col - 1;
	int sym_width, dso_width, dso_short_width;
	float sum_ksamples = perf_top__decay_samples(&top, &tmp);

	puts(CONSOLE_CLEAR);

	perf_top__header_snprintf(&top, bf, sizeof(bf));
	printf("%s\n", bf);

	perf_top__reset_sample_counters(&top);

	printf("%-*.*s\n", win_width, win_width, graph_dotted_line);

	if (session->hists.stats.total_lost != 0) {
		color_fprintf(stdout, PERF_COLOR_RED, "WARNING:");
		printf(" LOST %" PRIu64 " events, Check IO/CPU overload\n",
		       session->hists.stats.total_lost);
	}

	if (sym_filter_entry) {
		show_details(sym_filter_entry);
		return;
	}

	perf_top__find_widths(&top, &tmp, &dso_width, &dso_short_width,
			      &sym_width);

	if (sym_width + dso_width > winsize.ws_col - 29) {
		dso_width = dso_short_width;
		if (sym_width + dso_width > winsize.ws_col - 29)
			sym_width = winsize.ws_col - dso_width - 29;
	}
	putchar('\n');
	if (top.evlist->nr_entries == 1)
		printf("             samples  pcnt");
	else
		printf("   weight    samples  pcnt");

	if (verbose)
		printf("         RIP       ");
	printf(" %-*.*s DSO\n", sym_width, sym_width, "function");
	printf("   %s    _______ _____",
	       top.evlist->nr_entries == 1 ? "      " : "______");
	if (verbose)
		printf(" ________________");
	printf(" %-*.*s", sym_width, sym_width, graph_line);
	printf(" %-*.*s", dso_width, dso_width, graph_line);
	puts("\n");

	for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
		struct symbol *sym;
		double pcnt;

		syme = rb_entry(nd, struct sym_entry, rb_node);
		sym = sym_entry__symbol(syme);
		if (++printed > top.print_entries ||
		    (int)syme->snap_count < top.count_filter)
			continue;

		pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) /
					 sum_ksamples));

		if (top.evlist->nr_entries == 1 || !top.display_weighted)
			printf("%20.2f ", syme->weight);
		else
			printf("%9.1f %10ld ", syme->weight, syme->snap_count);

		percent_color_fprintf(stdout, "%4.1f%%", pcnt);
		if (verbose)
			printf(" %016" PRIx64, sym->start);
		printf(" %-*.*s", sym_width, sym_width, sym->name);
		printf(" %-*.*s\n", dso_width, dso_width,
		       dso_width >= syme->map->dso->long_name_len ?
					syme->map->dso->long_name :
					syme->map->dso->short_name);
	}
}

static void prompt_integer(int *target, const char *msg)
{
	char *buf = malloc(0), *p;
	size_t dummy = 0;
	int tmp;

	fprintf(stdout, "\n%s: ", msg);
	if (getline(&buf, &dummy, stdin) < 0)
		return;

	p = strchr(buf, '\n');
	if (p)
		*p = 0;

	p = buf;
	while(*p) {
		if (!isdigit(*p))
			goto out_free;
		p++;
	}
	tmp = strtoul(buf, NULL, 10);
	*target = tmp;
out_free:
	free(buf);
}

static void prompt_percent(int *target, const char *msg)
{
	int tmp = 0;

	prompt_integer(&tmp, msg);
	if (tmp >= 0 && tmp <= 100)
		*target = tmp;
}

static void prompt_symbol(struct sym_entry **target, const char *msg)
{
	char *buf = malloc(0), *p;
	struct sym_entry *syme = *target, *n, *found = NULL;
	size_t dummy = 0;

	/* zero counters of active symbol */
	if (syme) {
		pthread_mutex_lock(&syme->src->lock);
		__zero_source_counters(syme);
		*target = NULL;
		pthread_mutex_unlock(&syme->src->lock);
	}

	fprintf(stdout, "\n%s: ", msg);
	if (getline(&buf, &dummy, stdin) < 0)
		goto out_free;

	p = strchr(buf, '\n');
	if (p)
		*p = 0;

	pthread_mutex_lock(&top.active_symbols_lock);
	syme = list_entry(top.active_symbols.next, struct sym_entry, node);
	pthread_mutex_unlock(&top.active_symbols_lock);

	list_for_each_entry_safe_from(syme, n, &top.active_symbols, node) {
		struct symbol *sym = sym_entry__symbol(syme);

		if (!strcmp(buf, sym->name)) {
			found = syme;
			break;
		}
	}

	if (!found) {
		fprintf(stderr, "Sorry, %s is not active.\n", buf);
		sleep(1);
		return;
	} else
		parse_source(found);

out_free:
	free(buf);
}

static void print_mapped_keys(void)
{
	char *name = NULL;

	if (sym_filter_entry) {
		struct symbol *sym = sym_entry__symbol(sym_filter_entry);
		name = sym->name;
	}

	fprintf(stdout, "\nMapped keys:\n");
	fprintf(stdout, "\t[d]     display refresh delay.             \t(%d)\n", top.delay_secs);
	fprintf(stdout, "\t[e]     display entries (lines).           \t(%d)\n", top.print_entries);

	if (top.evlist->nr_entries > 1)
		fprintf(stdout, "\t[E]     active event counter.              \t(%s)\n", event_name(top.sym_evsel));

	fprintf(stdout, "\t[f]     profile display filter (count).    \t(%d)\n", top.count_filter);

	fprintf(stdout, "\t[F]     annotate display filter (percent). \t(%d%%)\n", sym_pcnt_filter);
	fprintf(stdout, "\t[s]     annotate symbol.                   \t(%s)\n", name?: "NULL");
	fprintf(stdout, "\t[S]     stop annotation.\n");

	if (top.evlist->nr_entries > 1)
		fprintf(stdout, "\t[w]     toggle display weighted/count[E]r. \t(%d)\n", top.display_weighted ? 1 : 0);

	fprintf(stdout,
		"\t[K]     hide kernel_symbols symbols.     \t(%s)\n",
		top.hide_kernel_symbols ? "yes" : "no");
	fprintf(stdout,
		"\t[U]     hide user symbols.               \t(%s)\n",
		top.hide_user_symbols ? "yes" : "no");
	fprintf(stdout, "\t[z]     toggle sample zeroing.             \t(%d)\n", top.zero ? 1 : 0);
	fprintf(stdout, "\t[qQ]    quit.\n");
}

static int key_mapped(int c)
{
	switch (c) {
		case 'd':
		case 'e':
		case 'f':
		case 'z':
		case 'q':
		case 'Q':
		case 'K':
		case 'U':
		case 'F':
		case 's':
		case 'S':
			return 1;
		case 'E':
		case 'w':
			return top.evlist->nr_entries > 1 ? 1 : 0;
		default:
			break;
	}

	return 0;
}

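/*
 * Handle one interactive key for the stdio UI; unmapped keys bring up
 * the mapped-keys help and read a second key.
 */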
static void handle_keypress(struct perf_session *session, int c)
{
	if (!key_mapped(c)) {
		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
		struct termios tc, save;

		print_mapped_keys();
		fprintf(stdout, "\nEnter selection, or unmapped key to continue: ");
		fflush(stdout);

		tcgetattr(0, &save);
		tc = save;
		tc.c_lflag &= ~(ICANON | ECHO);
		tc.c_cc[VMIN] = 0;
		tc.c_cc[VTIME] = 0;
		tcsetattr(0, TCSANOW, &tc);

		poll(&stdin_poll, 1, -1);
		c = getc(stdin);

		tcsetattr(0, TCSAFLUSH, &save);
		if (!key_mapped(c))
			return;
	}

	switch (c) {
		case 'd':
			prompt_integer(&top.delay_secs, "Enter display delay");
			if (top.delay_secs < 1)
				top.delay_secs = 1;
			break;
		case 'e':
			prompt_integer(&top.print_entries, "Enter display entries (lines)");
			if (top.print_entries == 0) {
				sig_winch_handler(SIGWINCH);
				signal(SIGWINCH, sig_winch_handler);
			} else
				signal(SIGWINCH, SIG_DFL);
			break;
		case 'E':
			if (top.evlist->nr_entries > 1) {
				fprintf(stderr, "\nAvailable events:");

				list_for_each_entry(top.sym_evsel, &top.evlist->entries, node)
					fprintf(stderr, "\n\t%d %s", top.sym_evsel->idx, event_name(top.sym_evsel));

				prompt_integer(&top.sym_counter, "Enter details event counter");

				if (top.sym_counter >= top.evlist->nr_entries) {
					top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node);
					top.sym_counter = 0;
					fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(top.sym_evsel));
					sleep(1);
					break;
				}
				list_for_each_entry(top.sym_evsel, &top.evlist->entries, node)
					if (top.sym_evsel->idx == top.sym_counter)
						break;
			} else top.sym_counter = 0;
			break;
		case 'f':
			prompt_integer(&top.count_filter, "Enter display event count filter");
			break;
		case 'F':
			prompt_percent(&sym_pcnt_filter, "Enter details display event filter (percent)");
			break;
		case 'K':
			top.hide_kernel_symbols = !top.hide_kernel_symbols;
			break;
		case 'q':
		case 'Q':
			printf("exiting.\n");
			if (dump_symtab)
				perf_session__fprintf_dsos(session, stderr);
			exit(0);
		case 's':
			prompt_symbol(&sym_filter_entry, "Enter details symbol");
			break;
		case 'S':
			if (!sym_filter_entry)
				break;
			else {
				struct sym_entry *syme = sym_filter_entry;

				pthread_mutex_lock(&syme->src->lock);
				sym_filter_entry = NULL;
				__zero_source_counters(syme);
				pthread_mutex_unlock(&syme->src->lock);
			}
			break;
		case 'U':
			top.hide_user_symbols = !top.hide_user_symbols;
			break;
		case 'w':
			top.display_weighted = ~top.display_weighted;
			break;
		case 'z':
			top.zero = !top.zero;
			break;
		default:
			break;
	}
}

static void *display_thread_tui(void *arg __used)
{
	perf_top__tui_browser(&top);
	exit_browser(0);
	exit(0);
	return NULL;
}

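/* stdio display loop: refresh every delay_secs and process one keypress per pass. */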
static void *display_thread(void *arg __used)
{
	struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
	struct termios tc, save;
	int delay_msecs, c;
	struct perf_session *session = (struct perf_session *) arg;

	tcgetattr(0, &save);
	tc = save;
	tc.c_lflag &= ~(ICANON | ECHO);
	tc.c_cc[VMIN] = 0;
	tc.c_cc[VTIME] = 0;

repeat:
	delay_msecs = top.delay_secs * 1000;
	tcsetattr(0, TCSANOW, &tc);
	/* trash return*/
	getc(stdin);

	do {
		print_sym_table(session);
	} while (!poll(&stdin_poll, 1, delay_msecs) == 1);

	c = getc(stdin);
	tcsetattr(0, TCSAFLUSH, &save);

	handle_keypress(session, c);
	goto repeat;

	return NULL;
}

/* Tag samples to be skipped. */
static const char *skip_symbols[] = {
	"default_idle",
	"native_safe_halt",
	"cpu_idle",
	"enter_idle",
	"exit_idle",
	"mwait_idle",
	"mwait_idle_with_hints",
	"poll_idle",
	"ppc64_runlatch_off",
	"pseries_dedicated_idle_sleep",
	NULL
};

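/*
 * Per-symbol load hook: attach the private sym_entry, remember the symbol
 * requested via --sym-annotate for later and flag the idle routines listed
 * in skip_symbols[] so their samples are not accounted.
 */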
static int symbol_filter(struct map *map, struct symbol *sym)
{
	struct sym_entry *syme;
	const char *name = sym->name;
	int i;

	/*
	 * ppc64 uses function descriptors and appends a '.' to the
	 * start of every instruction address. Remove it.
	 */
	if (name[0] == '.')
		name++;

	if (!strcmp(name, "_text") ||
	    !strcmp(name, "_etext") ||
	    !strcmp(name, "_sinittext") ||
	    !strncmp("init_module", name, 11) ||
	    !strncmp("cleanup_module", name, 14) ||
	    strstr(name, "_text_start") ||
	    strstr(name, "_text_end"))
		return 1;

	syme = symbol__priv(sym);
	syme->map = map;
	syme->src = NULL;

	if (!sym_filter_entry && sym_filter && !strcmp(name, sym_filter)) {
		/* schedule initial sym_filter_entry setup */
		sym_filter_entry_sched = syme;
		sym_filter = NULL;
	}

	for (i = 0; skip_symbols[i]; i++) {
		if (!strcmp(skip_symbols[i], name)) {
			syme->skip = 1;
			break;
		}
	}

	return 0;
}

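/*
 * Account one PERF_RECORD_SAMPLE: classify it by CPU mode, resolve it to
 * a map/symbol and bump the per-event counters of that symbol.
 */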
static void perf_event__process_sample(const union perf_event *event,
				       struct perf_sample *sample,
				       struct perf_session *session)
{
	u64 ip = event->ip.ip;
	struct sym_entry *syme;
	struct addr_location al;
	struct machine *machine;
	u8 origin = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;

	++top.samples;

	switch (origin) {
	case PERF_RECORD_MISC_USER:
		++top.us_samples;
		if (top.hide_user_symbols)
			return;
		machine = perf_session__find_host_machine(session);
		break;
	case PERF_RECORD_MISC_KERNEL:
		++top.kernel_samples;
		if (top.hide_kernel_symbols)
			return;
		machine = perf_session__find_host_machine(session);
		break;
	case PERF_RECORD_MISC_GUEST_KERNEL:
		++top.guest_kernel_samples;
		machine = perf_session__find_machine(session, event->ip.pid);
		break;
	case PERF_RECORD_MISC_GUEST_USER:
		++top.guest_us_samples;
		/*
		 * TODO: we don't process guest user from host side
		 * except simple counting.
		 */
		return;
	default:
		return;
	}

	if (!machine && perf_guest) {
		pr_err("Can't find guest [%d]'s kernel information\n",
			event->ip.pid);
		return;
	}

	if (event->header.misc & PERF_RECORD_MISC_EXACT_IP)
		top.exact_samples++;

	if (perf_event__preprocess_sample(event, session, &al, sample,
					  symbol_filter) < 0 ||
	    al.filtered)
		return;

	if (al.sym == NULL) {
		/*
		 * As we do lazy loading of symtabs we only will know if the
		 * specified vmlinux file is invalid when we actually have a
		 * hit in kernel space and then try to load it. So if we get
		 * here and there are _no_ symbols in the DSO backing the
		 * kernel map, bail out.
		 *
		 * We may never get here, for instance, if we use -K/
		 * --hide-kernel-symbols, even if the user specifies an
		 * invalid --vmlinux ;-)
		 */
		if (al.map == machine->vmlinux_maps[MAP__FUNCTION] &&
		    RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION])) {
			pr_err("The %s file can't be used\n",
			       symbol_conf.vmlinux_name);
			exit(1);
		}

		return;
	}

	/* let's see whether we need to install the initial sym_filter_entry */
	if (sym_filter_entry_sched) {
		sym_filter_entry = sym_filter_entry_sched;
		sym_filter_entry_sched = NULL;
		if (parse_source(sym_filter_entry) < 0) {
			struct symbol *sym = sym_entry__symbol(sym_filter_entry);

			pr_err("Can't annotate %s", sym->name);
			if (sym_filter_entry->map->dso->origin == DSO__ORIG_KERNEL) {
				pr_err(": No vmlinux file was found in the path:\n");
				machine__fprintf_vmlinux_path(machine, stderr);
			} else
				pr_err(".\n");
			exit(1);
		}
	}

	syme = symbol__priv(al.sym);
	if (!syme->skip) {
		struct perf_evsel *evsel;

		syme->origin = origin;
		evsel = perf_evlist__id2evsel(top.evlist, sample->id);
		assert(evsel != NULL);
		syme->count[evsel->idx]++;
		record_precise_ip(syme, evsel->idx, ip);
		pthread_mutex_lock(&top.active_symbols_lock);
		if (list_empty(&syme->node) || !syme->node.next)
			__list_insert_active_sym(syme);
		pthread_mutex_unlock(&top.active_symbols_lock);
	}
}

static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu)
{
	struct perf_sample sample;
	union perf_event *event;

	while ((event = perf_evlist__read_on_cpu(top.evlist, cpu)) != NULL) {
		perf_session__parse_sample(self, event, &sample);

		if (event->header.type == PERF_RECORD_SAMPLE)
			perf_event__process_sample(event, &sample, self);
		else
			perf_event__process(event, &sample, self);
	}
}

static void perf_session__mmap_read(struct perf_session *self)
{
	int i;

	for (i = 0; i < top.evlist->cpus->nr; i++)
		perf_session__mmap_read_cpu(self, i);
}

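/*
 * Configure and open a counter per event: sample on IP/TID at top.freq,
 * fall back from hardware cycles to the software cpu-clock when no PMU is
 * available, then mmap the ring buffers.
 */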
static void start_counters(struct perf_evlist *evlist)
{
	struct perf_evsel *counter;

	list_for_each_entry(counter, &evlist->entries, node) {
		struct perf_event_attr *attr = &counter->attr;

		attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;

		if (top.freq) {
			attr->sample_type |= PERF_SAMPLE_PERIOD;
			attr->freq	  = 1;
			attr->sample_freq = top.freq;
		}

		if (evlist->nr_entries > 1) {
			attr->sample_type |= PERF_SAMPLE_ID;
			attr->read_format |= PERF_FORMAT_ID;
		}

		attr->mmap = 1;
try_again:
		if (perf_evsel__open(counter, top.evlist->cpus,
				     top.evlist->threads, group, inherit) < 0) {
			int err = errno;

			if (err == EPERM || err == EACCES)
				die("Permission error - are you root?\n"
					"\t Consider tweaking"
					" /proc/sys/kernel/perf_event_paranoid.\n");
			/*
			 * If it's cycles then fall back to hrtimer
			 * based cpu-clock-tick sw counter, which
			 * is always available even if no PMU support:
			 */
			if (attr->type == PERF_TYPE_HARDWARE &&
			    attr->config == PERF_COUNT_HW_CPU_CYCLES) {

				if (verbose)
					warning(" ... trying to fall back to cpu-clock-ticks\n");

				attr->type = PERF_TYPE_SOFTWARE;
				attr->config = PERF_COUNT_SW_CPU_CLOCK;
				goto try_again;
			}
			printf("\n");
			error("sys_perf_event_open() syscall returned with %d "
			      "(%s).  /bin/dmesg may provide additional information.\n",
			      err, strerror(err));
			die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
			exit(-1);
		}
	}

	if (perf_evlist__mmap(evlist, mmap_pages, false) < 0)
		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
}

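/* Main run loop: synthesize existing threads, start the counters and keep
 * draining the mmap buffers while the display thread redraws the screen. */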
static int __cmd_top(void)
{
	pthread_t thread;
	struct perf_evsel *first;
	int ret;
	/*
	 * FIXME: perf_session__new should allow passing a O_MMAP, so that all this
	 * mmap reading, etc is encapsulated in it. Use O_WRONLY for now.
	 */
	struct perf_session *session = perf_session__new(NULL, O_WRONLY, false, false, NULL);
	if (session == NULL)
		return -ENOMEM;

	if (top.target_tid != -1)
		perf_event__synthesize_thread(top.target_tid, perf_event__process,
					      session);
	else
		perf_event__synthesize_threads(perf_event__process, session);

	start_counters(top.evlist);
	first = list_entry(top.evlist->entries.next, struct perf_evsel, node);
	perf_session__set_sample_type(session, first->attr.sample_type);

	/* Wait for a minimal set of events before starting the snapshot */
	poll(top.evlist->pollfd, top.evlist->nr_fds, 100);

	perf_session__mmap_read(session);

	if (pthread_create(&thread, NULL, (use_browser > 0 ? display_thread_tui :
							     display_thread), session)) {
		printf("Could not create display thread.\n");
		exit(-1);
	}

	if (realtime_prio) {
		struct sched_param param;

		param.sched_priority = realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			printf("Could not set realtime priority.\n");
			exit(-1);
		}
	}

	while (1) {
		u64 hits = top.samples;

		perf_session__mmap_read(session);

		if (hits == top.samples)
			ret = poll(top.evlist->pollfd, top.evlist->nr_fds, 100);
	}

	return 0;
}

static const char * const top_usage[] = {
	"perf top [<options>]",
	NULL
};

static const struct option options[] = {
	OPT_CALLBACK('e', "event", &top.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events),
	OPT_INTEGER('c', "count", &default_interval,
		    "event period to sample"),
	OPT_INTEGER('p', "pid", &top.target_pid,
		    "profile events on existing process id"),
	OPT_INTEGER('t', "tid", &top.target_tid,
		    "profile events on existing thread id"),
	OPT_BOOLEAN('a', "all-cpus", &system_wide,
			    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &top.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
		   "file", "vmlinux pathname"),
	OPT_BOOLEAN('K', "hide_kernel_symbols", &top.hide_kernel_symbols,
		    "hide kernel symbols"),
	OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"),
	OPT_INTEGER('r', "realtime", &realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_INTEGER('d', "delay", &top.delay_secs,
		    "number of seconds to delay between refreshes"),
	OPT_BOOLEAN('D', "dump-symtab", &dump_symtab,
			    "dump the symbol table used for profiling"),
	OPT_INTEGER('f', "count-filter", &top.count_filter,
		    "only display functions with more events than this"),
	OPT_BOOLEAN('g', "group", &group,
			    "put the counters into a counter group"),
	OPT_BOOLEAN('i', "inherit", &inherit,
		    "child tasks inherit counters"),
	OPT_STRING('s', "sym-annotate", &sym_filter, "symbol name",
		    "symbol to annotate"),
	OPT_BOOLEAN('z', "zero", &top.zero,
		    "zero history across updates"),
	OPT_INTEGER('F', "freq", &top.freq,
		    "profile at this frequency"),
	OPT_INTEGER('E', "entries", &top.print_entries,
		    "display this many functions"),
	OPT_BOOLEAN('U', "hide_user_symbols", &top.hide_user_symbols,
		    "hide user symbols"),
	OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"),
	OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_END()
};

int cmd_top(int argc, const char **argv, const char *prefix __used)
{
	struct perf_evsel *pos;
	int status = -ENOMEM;

	top.evlist = perf_evlist__new(NULL, NULL);
	if (top.evlist == NULL)
		return -ENOMEM;

	page_size = sysconf(_SC_PAGE_SIZE);

	argc = parse_options(argc, argv, options, top_usage, 0);
	if (argc)
		usage_with_options(top_usage, options);

	/*
	 * XXX For now start disabled, only using TUI if explicitly asked for.
	 * Change that when handle_keys equivalent gets written, live annotation
	 * done, etc.
	 */
	use_browser = 0;

	if (use_stdio)
		use_browser = 0;
	else if (use_tui)
		use_browser = 1;

	setup_browser(false);

	/* CPU and PID are mutually exclusive */
	if (top.target_tid > 0 && top.cpu_list) {
		printf("WARNING: PID switch overriding CPU\n");
		sleep(1);
		top.cpu_list = NULL;
	}

	if (top.target_pid != -1)
		top.target_tid = top.target_pid;

	if (perf_evlist__create_maps(top.evlist, top.target_pid,
				     top.target_tid, top.cpu_list) < 0)
		usage_with_options(top_usage, options);

	if (!top.evlist->nr_entries &&
	    perf_evlist__add_default(top.evlist) < 0) {
		pr_err("Not enough memory for event selector list\n");
		return -ENOMEM;
	}

	if (top.delay_secs < 1)
		top.delay_secs = 1;

	/*
	 * User specified count overrides default frequency.
	 */
	if (default_interval)
		top.freq = 0;
	else if (top.freq) {
		default_interval = top.freq;
	} else {
		fprintf(stderr, "frequency and count are zero, aborting\n");
		exit(EXIT_FAILURE);
	}

	list_for_each_entry(pos, &top.evlist->entries, node) {
		if (perf_evsel__alloc_fd(pos, top.evlist->cpus->nr,
					 top.evlist->threads->nr) < 0)
			goto out_free_fd;
		/*
		 * Fill in the ones not specifically initialized via -c:
		 */
		if (pos->attr.sample_period)
			continue;

		pos->attr.sample_period = default_interval;
	}

	if (perf_evlist__alloc_pollfd(top.evlist) < 0 ||
	    perf_evlist__alloc_mmap(top.evlist) < 0)
		goto out_free_fd;

	top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node);

	symbol_conf.priv_size = (sizeof(struct sym_entry) + sizeof(struct annotation) +
				 (top.evlist->nr_entries + 1) * sizeof(unsigned long));

	symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
	if (symbol__init() < 0)
		return -1;

	get_term_dimensions(&winsize);
	if (top.print_entries == 0) {
		update_print_entries(&winsize);
		signal(SIGWINCH, sig_winch_handler);
	}

	status = __cmd_top();
out_free_fd:
	perf_evlist__delete(top.evlist);

	return status;
}