builtin-trace.c 31.1 KB
Newer Older
1
#include <traceevent/event-parse.h>
A
Arnaldo Carvalho de Melo 已提交
2
#include "builtin.h"
3
#include "util/color.h"
4
#include "util/debug.h"
A
Arnaldo Carvalho de Melo 已提交
5
#include "util/evlist.h"
6
#include "util/machine.h"
7
#include "util/session.h"
8
#include "util/thread.h"
A
Arnaldo Carvalho de Melo 已提交
9
#include "util/parse-options.h"
10
#include "util/strlist.h"
11
#include "util/intlist.h"
A
Arnaldo Carvalho de Melo 已提交
12 13 14 15
#include "util/thread_map.h"

#include <libaudit.h>
#include <stdlib.h>
16
#include <sys/mman.h>
17
#include <linux/futex.h>
A
Arnaldo Carvalho de Melo 已提交
18

19 20
static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 unsigned long arg, u8 *arg_mask __maybe_unused)
21 22 23 24
{
	return scnprintf(bf, size, "%#lx", arg);
}

25 26
#define SCA_HEX syscall_arg__scnprintf_hex

27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
static size_t syscall_arg__scnprintf_whence(char *bf, size_t size,
					    unsigned long arg, u8 *arg_mask __maybe_unused)
{
	int whence = arg;

	switch (whence) {
#define P_WHENCE(n) case SEEK_##n: return scnprintf(bf, size, #n)
	P_WHENCE(SET);
	P_WHENCE(CUR);
	P_WHENCE(END);
#ifdef SEEK_DATA
	P_WHENCE(DATA);
#endif
#ifdef SEEK_HOLE
	P_WHENCE(HOLE);
#endif
#undef P_WHENCE
	default: break;
	}

	return scnprintf(bf, size, "%#x", whence);
}

#define SCA_WHENCE syscall_arg__scnprintf_whence

52 53
static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
					       unsigned long arg, u8 *arg_mask __maybe_unused)
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
{
	int printed = 0, prot = arg;

	if (prot == PROT_NONE)
		return scnprintf(bf, size, "NONE");
#define	P_MMAP_PROT(n) \
	if (prot & PROT_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		prot &= ~PROT_##n; \
	}

	P_MMAP_PROT(EXEC);
	P_MMAP_PROT(READ);
	P_MMAP_PROT(WRITE);
#ifdef PROT_SEM
	P_MMAP_PROT(SEM);
#endif
	P_MMAP_PROT(GROWSDOWN);
	P_MMAP_PROT(GROWSUP);
#undef P_MMAP_PROT

	if (prot)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);

	return printed;
}

#define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot

83 84
static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
						unsigned long arg, u8 *arg_mask __maybe_unused)
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
{
	int printed = 0, flags = arg;

#define	P_MMAP_FLAG(n) \
	if (flags & MAP_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~MAP_##n; \
	}

	P_MMAP_FLAG(SHARED);
	P_MMAP_FLAG(PRIVATE);
	P_MMAP_FLAG(32BIT);
	P_MMAP_FLAG(ANONYMOUS);
	P_MMAP_FLAG(DENYWRITE);
	P_MMAP_FLAG(EXECUTABLE);
	P_MMAP_FLAG(FILE);
	P_MMAP_FLAG(FIXED);
	P_MMAP_FLAG(GROWSDOWN);
103
#ifdef MAP_HUGETLB
104
	P_MMAP_FLAG(HUGETLB);
105
#endif
106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
	P_MMAP_FLAG(LOCKED);
	P_MMAP_FLAG(NONBLOCK);
	P_MMAP_FLAG(NORESERVE);
	P_MMAP_FLAG(POPULATE);
	P_MMAP_FLAG(STACK);
#ifdef MAP_UNINITIALIZED
	P_MMAP_FLAG(UNINITIALIZED);
#endif
#undef P_MMAP_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags

124 125
static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
						      unsigned long arg, u8 *arg_mask __maybe_unused)
126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
{
	int behavior = arg;

	switch (behavior) {
#define	P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
	P_MADV_BHV(NORMAL);
	P_MADV_BHV(RANDOM);
	P_MADV_BHV(SEQUENTIAL);
	P_MADV_BHV(WILLNEED);
	P_MADV_BHV(DONTNEED);
	P_MADV_BHV(REMOVE);
	P_MADV_BHV(DONTFORK);
	P_MADV_BHV(DOFORK);
	P_MADV_BHV(HWPOISON);
#ifdef MADV_SOFT_OFFLINE
	P_MADV_BHV(SOFT_OFFLINE);
#endif
	P_MADV_BHV(MERGEABLE);
	P_MADV_BHV(UNMERGEABLE);
145
#ifdef MADV_HUGEPAGE
146
	P_MADV_BHV(HUGEPAGE);
147 148
#endif
#ifdef MADV_NOHUGEPAGE
149
	P_MADV_BHV(NOHUGEPAGE);
150
#endif
151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
#ifdef MADV_DONTDUMP
	P_MADV_BHV(DONTDUMP);
#endif
#ifdef MADV_DODUMP
	P_MADV_BHV(DODUMP);
#endif
#undef P_MADV_PHV
	default: break;
	}

	return scnprintf(bf, size, "%#x", behavior);
}

#define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior

166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208
static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, unsigned long arg, u8 *arg_mask)
{
	enum syscall_futex_args {
		SCF_UADDR   = (1 << 0),
		SCF_OP	    = (1 << 1),
		SCF_VAL	    = (1 << 2),
		SCF_TIMEOUT = (1 << 3),
		SCF_UADDR2  = (1 << 4),
		SCF_VAL3    = (1 << 5),
	};
	int op = arg;
	int cmd = op & FUTEX_CMD_MASK;
	size_t printed = 0;

	switch (cmd) {
#define	P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
	P_FUTEX_OP(WAIT);	    *arg_mask |= SCF_VAL3|SCF_UADDR2;		  break;
	P_FUTEX_OP(WAKE);	    *arg_mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
	P_FUTEX_OP(FD);		    *arg_mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
	P_FUTEX_OP(REQUEUE);	    *arg_mask |= SCF_VAL3|SCF_TIMEOUT;	          break;
	P_FUTEX_OP(CMP_REQUEUE);    *arg_mask |= SCF_TIMEOUT;			  break;
	P_FUTEX_OP(CMP_REQUEUE_PI); *arg_mask |= SCF_TIMEOUT;			  break;
	P_FUTEX_OP(WAKE_OP);							  break;
	P_FUTEX_OP(LOCK_PI);	    *arg_mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
	P_FUTEX_OP(UNLOCK_PI);	    *arg_mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
	P_FUTEX_OP(TRYLOCK_PI);	    *arg_mask |= SCF_VAL3|SCF_UADDR2;		  break;
	P_FUTEX_OP(WAIT_BITSET);    *arg_mask |= SCF_UADDR2;			  break;
	P_FUTEX_OP(WAKE_BITSET);    *arg_mask |= SCF_UADDR2;			  break;
	P_FUTEX_OP(WAIT_REQUEUE_PI);						  break;
	default: printed = scnprintf(bf, size, "%#x", cmd);			  break;
	}

	if (op & FUTEX_PRIVATE_FLAG)
		printed += scnprintf(bf + printed, size - printed, "|PRIV");

	if (op & FUTEX_CLOCK_REALTIME)
		printed += scnprintf(bf + printed, size - printed, "|CLKRT");

	return printed;
}

#define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op

A
Arnaldo Carvalho de Melo 已提交
209 210
static struct syscall_fmt {
	const char *name;
211
	const char *alias;
212
	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, unsigned long arg, u8 *arg_mask);
A
Arnaldo Carvalho de Melo 已提交
213 214
	bool	   errmsg;
	bool	   timeout;
215
	bool	   hexret;
A
Arnaldo Carvalho de Melo 已提交
216
} syscall_fmts[] = {
217
	{ .name	    = "access",	    .errmsg = true, },
218
	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
219 220
	{ .name	    = "brk",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
221
	{ .name	    = "mmap",	    .hexret = true, },
222
	{ .name	    = "connect",    .errmsg = true, },
223 224
	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat", },
	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat", },
225 226
	{ .name	    = "futex",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
227 228
	{ .name	    = "ioctl",	    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_HEX, /* arg */ }, },
229 230
	{ .name	    = "lseek",	    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_WHENCE, /* whence */ }, },
231
	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
232 233 234
	{ .name     = "madvise",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
			     [2] = SCA_MADV_BHV, /* behavior */ }, },
235
	{ .name	    = "mmap",	    .hexret = true,
236
	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
237 238
			     [2] = SCA_MMAP_PROT, /* prot */
			     [3] = SCA_MMAP_FLAGS, /* flags */ }, },
239
	{ .name	    = "mprotect",   .errmsg = true,
240 241 242 243 244
	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
			     [2] = SCA_MMAP_PROT, /* prot */ }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
			     [4] = SCA_HEX, /* new_addr */ }, },
245 246
	{ .name	    = "munmap",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
247
	{ .name	    = "open",	    .errmsg = true, },
248 249
	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
250 251
	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64", },
	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64", },
252 253 254
	{ .name	    = "read",	    .errmsg = true, },
	{ .name	    = "recvfrom",   .errmsg = true, },
	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
255
	{ .name	    = "socket",	    .errmsg = true, },
256
	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
257
	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
A
Arnaldo Carvalho de Melo 已提交
258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274
};

static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
	const struct syscall_fmt *fmt = fmtp;
	return strcmp(name, fmt->name);
}

static struct syscall_fmt *syscall_fmt__find(const char *name)
{
	const int nmemb = ARRAY_SIZE(syscall_fmts);
	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
}

struct syscall {
	struct event_format *tp_format;
	const char	    *name;
275
	bool		    filtered;
A
Arnaldo Carvalho de Melo 已提交
276
	struct syscall_fmt  *fmt;
277 278
	size_t		    (**arg_scnprintf)(char *bf, size_t size,
					      unsigned long arg, u8 *args_mask);
A
Arnaldo Carvalho de Melo 已提交
279 280
};

281 282 283 284 285 286 287 288 289 290 291
static size_t fprintf_duration(unsigned long t, FILE *fp)
{
	double duration = (double)t / NSEC_PER_MSEC;
	size_t printed = fprintf(fp, "(");

	if (duration >= 1.0)
		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
	else if (duration >= 0.01)
		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
	else
		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
292
	return printed + fprintf(fp, "): ");
293 294
}

295 296 297 298
struct thread_trace {
	u64		  entry_time;
	u64		  exit_time;
	bool		  entry_pending;
299
	unsigned long	  nr_events;
300
	char		  *entry_str;
301
	double		  runtime_ms;
302 303 304 305 306 307 308
};

static struct thread_trace *thread_trace__new(void)
{
	return zalloc(sizeof(struct thread_trace));
}

309
static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
310
{
311 312
	struct thread_trace *ttrace;

313 314 315 316 317
	if (thread == NULL)
		goto fail;

	if (thread->priv == NULL)
		thread->priv = thread_trace__new();
318
		
319 320 321
	if (thread->priv == NULL)
		goto fail;

322 323 324 325
	ttrace = thread->priv;
	++ttrace->nr_events;

	return ttrace;
326
fail:
327
	color_fprintf(fp, PERF_COLOR_RED,
328 329 330 331
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}

A
Arnaldo Carvalho de Melo 已提交
332
struct trace {
333
	struct perf_tool	tool;
A
Arnaldo Carvalho de Melo 已提交
334 335 336 337 338 339
	int			audit_machine;
	struct {
		int		max;
		struct syscall  *table;
	} syscalls;
	struct perf_record_opts opts;
340 341
	struct machine		host;
	u64			base_time;
342
	FILE			*output;
343
	unsigned long		nr_events;
344 345
	struct strlist		*ev_qualifier;
	bool			not_ev_qualifier;
346 347
	struct intlist		*tid_list;
	struct intlist		*pid_list;
348
	bool			sched;
349
	bool			multiple_threads;
350
	double			duration_filter;
351
	double			runtime_ms;
A
Arnaldo Carvalho de Melo 已提交
352 353
};

354 355 356 357 358
static bool trace__filter_duration(struct trace *trace, double t)
{
	return t < (trace->duration_filter * NSEC_PER_MSEC);
}

359 360 361 362
static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

363
	return fprintf(fp, "%10.3f ", ts);
364 365
}

366 367 368 369 370 371 372
static bool done = false;

static void sig_handler(int sig __maybe_unused)
{
	done = true;
}

373
static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
374
					u64 duration, u64 tstamp, FILE *fp)
375 376
{
	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
377
	printed += fprintf_duration(duration, fp);
378 379

	if (trace->multiple_threads)
380
		printed += fprintf(fp, "%d ", thread->tid);
381 382 383 384

	return printed;
}

385 386
static int trace__process_event(struct trace *trace, struct machine *machine,
				union perf_event *event)
387 388 389 390 391
{
	int ret = 0;

	switch (event->header.type) {
	case PERF_RECORD_LOST:
392
		color_fprintf(trace->output, PERF_COLOR_RED,
393 394 395 396 397 398 399 400 401 402
			      "LOST %" PRIu64 " events!\n", event->lost.lost);
		ret = machine__process_lost_event(machine, event);
	default:
		ret = machine__process_event(machine, event);
		break;
	}

	return ret;
}

403
static int trace__tool_process(struct perf_tool *tool,
404 405 406 407
			       union perf_event *event,
			       struct perf_sample *sample __maybe_unused,
			       struct machine *machine)
{
408 409
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event);
410 411 412 413 414 415 416 417 418 419 420 421 422
}

static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init();

	if (err)
		return err;

	machine__init(&trace->host, "", HOST_KERNEL_ID);
	machine__create_kernel_maps(&trace->host);

	if (perf_target__has_task(&trace->opts.target)) {
423
		err = perf_event__synthesize_thread_map(&trace->tool, evlist->threads,
424 425 426
							trace__tool_process,
							&trace->host);
	} else {
427
		err = perf_event__synthesize_threads(&trace->tool, trace__tool_process,
428 429 430 431 432 433 434 435 436
						     &trace->host);
	}

	if (err)
		symbol__exit();

	return err;
}

437 438 439 440 441 442 443 444 445 446
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0;

	sc->arg_scnprintf = calloc(sc->tp_format->format.nr_fields - 1, sizeof(void *));
	if (sc->arg_scnprintf == NULL)
		return -1;

	for (field = sc->tp_format->format.fields->next; field; field = field->next) {
447 448 449
		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
		else if (field->flags & FIELD_IS_POINTER)
450 451 452 453 454 455 456
			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
		++idx;
	}

	return 0;
}

A
Arnaldo Carvalho de Melo 已提交
457 458 459 460
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
461 462 463 464
	const char *name = audit_syscall_to_name(id, trace->audit_machine);

	if (name == NULL)
		return -1;
A
Arnaldo Carvalho de Melo 已提交
465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
484
	sc->name = name;
485

486 487 488 489 490 491 492 493 494 495 496
	if (trace->ev_qualifier) {
		bool in = strlist__find(trace->ev_qualifier, name) != NULL;

		if (!(in ^ trace->not_ev_qualifier)) {
			sc->filtered = true;
			/*
			 * No need to do read tracepoint information since this will be
			 * filtered out.
			 */
			return 0;
		}
497 498
	}

499
	sc->fmt  = syscall_fmt__find(sc->name);
A
Arnaldo Carvalho de Melo 已提交
500

501
	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
A
Arnaldo Carvalho de Melo 已提交
502
	sc->tp_format = event_format__new("syscalls", tp_name);
503 504 505 506 507

	if (sc->tp_format == NULL && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = event_format__new("syscalls", tp_name);
	}
A
Arnaldo Carvalho de Melo 已提交
508

509 510 511 512
	if (sc->tp_format == NULL)
		return -1;

	return syscall__set_arg_fmts(sc);
A
Arnaldo Carvalho de Melo 已提交
513 514
}

515 516
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned long *args)
A
Arnaldo Carvalho de Melo 已提交
517 518 519 520 521 522
{
	int i = 0;
	size_t printed = 0;

	if (sc->tp_format != NULL) {
		struct format_field *field;
523 524 525 526 527 528
		u8 mask = 0, bit = 1;

		for (field = sc->tp_format->format.fields->next; field;
		     field = field->next, ++i, bit <<= 1) {
			if (mask & bit)
				continue;
A
Arnaldo Carvalho de Melo 已提交
529

530
			printed += scnprintf(bf + printed, size - printed,
531 532
					     "%s%s: ", printed ? ", " : "", field->name);

533 534 535 536
			if (sc->arg_scnprintf && sc->arg_scnprintf[i]) {
				printed += sc->arg_scnprintf[i](bf + printed, size - printed,
								args[i], &mask);
			} else {
537 538
				printed += scnprintf(bf + printed, size - printed,
						     "%ld", args[i]);
539
			}
A
Arnaldo Carvalho de Melo 已提交
540 541 542
		}
	} else {
		while (i < 6) {
543 544 545
			printed += scnprintf(bf + printed, size - printed,
					     "%sarg%d: %ld",
					     printed ? ", " : "", i, args[i]);
A
Arnaldo Carvalho de Melo 已提交
546 547 548 549 550 551 552
			++i;
		}
	}

	return printed;
}

553 554 555 556 557 558 559 560 561 562
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  struct perf_sample *sample);

static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel,
					   struct perf_sample *sample)
{
	int id = perf_evsel__intval(evsel, sample, "id");

	if (id < 0) {
563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
 		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
579 580 581 582 583 584 585 586 587 588 589 590 591
		return NULL;
	}

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
592 593 594 595 596 597
	if (verbose) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
598 599 600 601 602 603
	return NULL;
}

static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    struct perf_sample *sample)
{
604
	char *msg;
605
	void *args;
606
	size_t printed = 0;
607
	struct thread *thread;
608
	struct syscall *sc = trace__syscall_info(trace, evsel, sample);
609 610 611 612
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;
613

614 615 616
	if (sc->filtered)
		return 0;

617 618
	thread = machine__findnew_thread(&trace->host, sample->pid,
					 sample->tid);
619
	ttrace = thread__trace(thread, trace->output);
620
	if (ttrace == NULL)
621 622 623 624
		return -1;

	args = perf_evsel__rawptr(evsel, sample, "args");
	if (args == NULL) {
625
		fprintf(trace->output, "Problems reading syscall arguments\n");
626 627 628
		return -1;
	}

629 630 631 632 633 634 635 636 637 638 639 640 641 642 643
	ttrace = thread->priv;

	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(1024);
		if (!ttrace->entry_str)
			return -1;
	}

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, 1024 - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, 1024 - printed,  args);

	if (!strcmp(sc->name, "exit_group") || !strcmp(sc->name, "exit")) {
644
		if (!trace->duration_filter) {
645 646
			trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
			fprintf(trace->output, "%-70s\n", ttrace->entry_str);
647
		}
648 649
	} else
		ttrace->entry_pending = true;
650 651 652 653 654 655 656 657

	return 0;
}

static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   struct perf_sample *sample)
{
	int ret;
658
	u64 duration = 0;
659
	struct thread *thread;
660
	struct syscall *sc = trace__syscall_info(trace, evsel, sample);
661 662 663 664
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;
665

666 667 668
	if (sc->filtered)
		return 0;

669 670
	thread = machine__findnew_thread(&trace->host, sample->pid,
					 sample->tid);
671
	ttrace = thread__trace(thread, trace->output);
672
	if (ttrace == NULL)
673 674 675 676
		return -1;

	ret = perf_evsel__intval(evsel, sample, "ret");

677 678 679 680
	ttrace = thread->priv;

	ttrace->exit_time = sample->time;

681
	if (ttrace->entry_time) {
682
		duration = sample->time - ttrace->entry_time;
683 684 685 686
		if (trace__filter_duration(trace, duration))
			goto out;
	} else if (trace->duration_filter)
		goto out;
687

688
	trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
689 690

	if (ttrace->entry_pending) {
691
		fprintf(trace->output, "%-70s", ttrace->entry_str);
692
	} else {
693 694 695
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
696 697
	}

698 699 700 701
	if (sc->fmt == NULL) {
signed_print:
		fprintf(trace->output, ") = %d", ret);
	} else if (ret < 0 && sc->fmt->errmsg) {
702 703 704 705
		char bf[256];
		const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

706
		fprintf(trace->output, ") = -1 %s %s", e, emsg);
707
	} else if (ret == 0 && sc->fmt->timeout)
708
		fprintf(trace->output, ") = 0 Timeout");
709 710
	else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#x", ret);
711
	else
712
		goto signed_print;
713

714
	fputc('\n', trace->output);
715
out:
716 717
	ttrace->entry_pending = false;

718 719 720
	return 0;
}

721 722 723 724 725
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     struct perf_sample *sample)
{
        u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
726 727 728
	struct thread *thread = machine__findnew_thread(&trace->host,
							sample->pid,
							sample->tid);
729
	struct thread_trace *ttrace = thread__trace(thread, trace->output);
730 731 732 733 734 735 736 737 738

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
	return 0;

out_dump:
739
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
740 741 742 743 744 745 746 747
	       evsel->name,
	       perf_evsel__strval(evsel, sample, "comm"),
	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
	       runtime,
	       perf_evsel__intval(evsel, sample, "vruntime"));
	return 0;
}

748 749 750 751 752 753 754 755 756 757 758 759
static bool skip_sample(struct trace *trace, struct perf_sample *sample)
{
	if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
	    (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
		return false;

	if (trace->pid_list || trace->tid_list)
		return true;

	return false;
}

760 761 762 763 764 765 766 767 768 769 770
static int trace__process_sample(struct perf_tool *tool,
				 union perf_event *event __maybe_unused,
				 struct perf_sample *sample,
				 struct perf_evsel *evsel,
				 struct machine *machine __maybe_unused)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	int err = 0;

	tracepoint_handler handler = evsel->handler.func;

771 772 773
	if (skip_sample(trace, sample))
		return 0;

774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792
	if (trace->base_time == 0)
		trace->base_time = sample->time;

	if (handler)
		handler(trace, evsel, sample);

	return err;
}

static bool
perf_session__has_tp(struct perf_session *session, const char *name)
{
	struct perf_evsel *evsel;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist, name);

	return evsel != NULL;
}

793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813
static int parse_target_str(struct trace *trace)
{
	if (trace->opts.target.pid) {
		trace->pid_list = intlist__new(trace->opts.target.pid);
		if (trace->pid_list == NULL) {
			pr_err("Error parsing process id string\n");
			return -EINVAL;
		}
	}

	if (trace->opts.target.tid) {
		trace->tid_list = intlist__new(trace->opts.target.tid);
		if (trace->tid_list == NULL) {
			pr_err("Error parsing thread id string\n");
			return -EINVAL;
		}
	}

	return 0;
}

814
static int trace__run(struct trace *trace, int argc, const char **argv)
A
Arnaldo Carvalho de Melo 已提交
815
{
816
	struct perf_evlist *evlist = perf_evlist__new();
817
	struct perf_evsel *evsel;
818 819
	int err = -1, i;
	unsigned long before;
820
	const bool forks = argc > 0;
A
Arnaldo Carvalho de Melo 已提交
821 822

	if (evlist == NULL) {
823
		fprintf(trace->output, "Not enough memory to run!\n");
A
Arnaldo Carvalho de Melo 已提交
824 825 826
		goto out;
	}

827 828
	if (perf_evlist__add_newtp(evlist, "raw_syscalls", "sys_enter", trace__sys_enter) ||
	    perf_evlist__add_newtp(evlist, "raw_syscalls", "sys_exit", trace__sys_exit)) {
829
		fprintf(trace->output, "Couldn't read the raw_syscalls tracepoints information!\n");
A
Arnaldo Carvalho de Melo 已提交
830 831 832
		goto out_delete_evlist;
	}

833 834 835
	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime)) {
836
		fprintf(trace->output, "Couldn't read the sched_stat_runtime tracepoint information!\n");
837 838 839
		goto out_delete_evlist;
	}

A
Arnaldo Carvalho de Melo 已提交
840 841
	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
842
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
A
Arnaldo Carvalho de Melo 已提交
843 844 845
		goto out_delete_evlist;
	}

846 847
	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
848
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
849
		goto out_delete_maps;
850 851
	}

852
	perf_evlist__config(evlist, &trace->opts);
A
Arnaldo Carvalho de Melo 已提交
853

854 855 856 857
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
858
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
859
						    argv, false, false);
860
		if (err < 0) {
861
			fprintf(trace->output, "Couldn't run the workload!\n");
862
			goto out_delete_maps;
863 864 865
		}
	}

A
Arnaldo Carvalho de Melo 已提交
866 867
	err = perf_evlist__open(evlist);
	if (err < 0) {
868
		fprintf(trace->output, "Couldn't create the events: %s\n", strerror(errno));
869
		goto out_delete_maps;
A
Arnaldo Carvalho de Melo 已提交
870 871 872 873
	}

	err = perf_evlist__mmap(evlist, UINT_MAX, false);
	if (err < 0) {
874
		fprintf(trace->output, "Couldn't mmap the events: %s\n", strerror(errno));
875
		goto out_close_evlist;
A
Arnaldo Carvalho de Melo 已提交
876 877 878
	}

	perf_evlist__enable(evlist);
879 880 881 882

	if (forks)
		perf_evlist__start_workload(evlist);

883
	trace->multiple_threads = evlist->threads->map[0] == -1 || evlist->threads->nr > 1;
A
Arnaldo Carvalho de Melo 已提交
884
again:
885
	before = trace->nr_events;
A
Arnaldo Carvalho de Melo 已提交
886 887 888 889 890 891

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			const u32 type = event->header.type;
892
			tracepoint_handler handler;
A
Arnaldo Carvalho de Melo 已提交
893 894
			struct perf_sample sample;

895
			++trace->nr_events;
A
Arnaldo Carvalho de Melo 已提交
896 897 898

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
899
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
A
Arnaldo Carvalho de Melo 已提交
900 901 902
				continue;
			}

903 904 905 906
			if (trace->base_time == 0)
				trace->base_time = sample.time;

			if (type != PERF_RECORD_SAMPLE) {
907
				trace__process_event(trace, &trace->host, event);
908 909 910
				continue;
			}

A
Arnaldo Carvalho de Melo 已提交
911 912
			evsel = perf_evlist__id2evsel(evlist, sample.id);
			if (evsel == NULL) {
913
				fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample.id);
A
Arnaldo Carvalho de Melo 已提交
914 915 916
				continue;
			}

917
			if (sample.raw_data == NULL) {
918
				fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
919 920 921 922 923
				       perf_evsel__name(evsel), sample.tid,
				       sample.cpu, sample.raw_size);
				continue;
			}

924 925
			handler = evsel->handler.func;
			handler(trace, evsel, &sample);
A
Arnaldo Carvalho de Melo 已提交
926 927 928
		}
	}

929
	if (trace->nr_events == before) {
930
		if (done)
931
			goto out_unmap_evlist;
932

A
Arnaldo Carvalho de Melo 已提交
933
		poll(evlist->pollfd, evlist->nr_fds, -1);
934 935 936 937
	}

	if (done)
		perf_evlist__disable(evlist);
A
Arnaldo Carvalho de Melo 已提交
938 939 940

	goto again;

941 942 943 944 945 946
out_unmap_evlist:
	perf_evlist__munmap(evlist);
out_close_evlist:
	perf_evlist__close(evlist);
out_delete_maps:
	perf_evlist__delete_maps(evlist);
A
Arnaldo Carvalho de Melo 已提交
947 948 949 950 951 952
out_delete_evlist:
	perf_evlist__delete(evlist);
out:
	return err;
}

953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "raw_syscalls:sys_enter",  trace__sys_enter, },
		{ "raw_syscalls:sys_exit",   trace__sys_exit, },
	};

	struct perf_session *session;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;

	trace->tool.ordered_samples = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	if (symbol__init() < 0)
		return -1;

	session = perf_session__new(input_name, O_RDONLY, 0, false,
				    &trace->tool);
	if (session == NULL)
		return -ENOMEM;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	if (!perf_session__has_tp(session, "raw_syscalls:sys_enter")) {
		pr_err("Data file does not have raw_syscalls:sys_enter events\n");
		goto out;
	}

	if (!perf_session__has_tp(session, "raw_syscalls:sys_exit")) {
		pr_err("Data file does not have raw_syscalls:sys_exit events\n");
		goto out;
	}

1000 1001 1002 1003
	err = parse_target_str(trace);
	if (err != 0)
		goto out;

1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015
	setup_pager();

	err = perf_session__process_events(session, &trace->tool);
	if (err)
		pr_err("Failed to process events, error %d", err);

out:
	perf_session__delete(session);

	return err;
}

1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052
static size_t trace__fprintf_threads_header(FILE *fp)
{
	size_t printed;

	printed  = fprintf(fp, "\n _____________________________________________________________________\n");
	printed += fprintf(fp," __)    Summary of events    (__\n\n");
	printed += fprintf(fp,"              [ task - pid ]     [ events ] [ ratio ]  [ runtime ]\n");
	printed += fprintf(fp," _____________________________________________________________________\n\n");

	return printed;
}

static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	for (nd = rb_first(&trace->host.threads); nd; nd = rb_next(nd)) {
		struct thread *thread = rb_entry(nd, struct thread, rb_node);
		struct thread_trace *ttrace = thread->priv;
		const char *color;
		double ratio;

		if (ttrace == NULL)
			continue;

		ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

		color = PERF_COLOR_NORMAL;
		if (ratio > 50.0)
			color = PERF_COLOR_RED;
		else if (ratio > 25.0)
			color = PERF_COLOR_GREEN;
		else if (ratio > 5.0)
			color = PERF_COLOR_YELLOW;

		printed += color_fprintf(fp, color, "%20s", thread->comm);
1053
		printed += fprintf(fp, " - %-5d :%11lu   [", thread->tid, ttrace->nr_events);
1054 1055 1056 1057 1058 1059 1060
		printed += color_fprintf(fp, color, "%5.1f%%", ratio);
		printed += fprintf(fp, " ] %10.3f ms\n", ttrace->runtime_ms);
	}

	return printed;
}

1061 1062 1063 1064 1065 1066 1067 1068 1069
static int trace__set_duration(const struct option *opt, const char *str,
			       int unset __maybe_unused)
{
	struct trace *trace = opt->value;

	trace->duration_filter = atof(str);
	return 0;
}

1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086
static int trace__open_output(struct trace *trace, const char *filename)
{
	struct stat st;

	if (!stat(filename, &st) && st.st_size) {
		char oldname[PATH_MAX];

		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
		unlink(oldname);
		rename(filename, oldname);
	}

	trace->output = fopen(filename, "w");

	return trace->output == NULL ? -errno : 0;
}

A
Arnaldo Carvalho de Melo 已提交
1087 1088 1089
int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
{
	const char * const trace_usage[] = {
1090 1091
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
A
Arnaldo Carvalho de Melo 已提交
1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108
		NULL
	};
	struct trace trace = {
		.audit_machine = audit_detect_machine(),
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_delay      = true,
			.mmap_pages    = 1024,
		},
1109
		.output = stdout,
A
Arnaldo Carvalho de Melo 已提交
1110
	};
1111
	const char *output_name = NULL;
1112
	const char *ev_qualifier_str = NULL;
A
Arnaldo Carvalho de Melo 已提交
1113
	const struct option trace_options[] = {
1114 1115
	OPT_STRING('e', "expr", &ev_qualifier_str, "expr",
		    "list of events to trace"),
1116
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
1117
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
A
Arnaldo Carvalho de Melo 已提交
1118 1119
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
1120
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
A
Arnaldo Carvalho de Melo 已提交
1121
		    "trace events on existing thread id"),
1122
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
A
Arnaldo Carvalho de Melo 已提交
1123
		    "system-wide collection from all CPUs"),
1124
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
A
Arnaldo Carvalho de Melo 已提交
1125
		    "list of cpus to monitor"),
1126
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
A
Arnaldo Carvalho de Melo 已提交
1127
		    "child tasks do not inherit counters"),
1128
	OPT_UINTEGER('m', "mmap-pages", &trace.opts.mmap_pages,
A
Arnaldo Carvalho de Melo 已提交
1129
		     "number of mmap data pages"),
1130
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
A
Arnaldo Carvalho de Melo 已提交
1131
		   "user to profile"),
1132 1133 1134
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
1135
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
1136
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
A
Arnaldo Carvalho de Melo 已提交
1137 1138 1139
	OPT_END()
	};
	int err;
1140
	char bf[BUFSIZ];
A
Arnaldo Carvalho de Melo 已提交
1141 1142 1143

	argc = parse_options(argc, argv, trace_options, trace_usage, 0);

1144 1145 1146 1147 1148 1149 1150 1151
	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

1152
	if (ev_qualifier_str != NULL) {
1153 1154 1155 1156 1157 1158
		const char *s = ev_qualifier_str;

		trace.not_ev_qualifier = *s == '!';
		if (trace.not_ev_qualifier)
			++s;
		trace.ev_qualifier = strlist__new(true, s);
1159
		if (trace.ev_qualifier == NULL) {
1160 1161 1162 1163
			fputs("Not enough memory to parse event qualifier",
			      trace.output);
			err = -ENOMEM;
			goto out_close;
1164 1165 1166
		}
	}

1167 1168 1169
	err = perf_target__validate(&trace.opts.target);
	if (err) {
		perf_target__strerror(&trace.opts.target, err, bf, sizeof(bf));
1170 1171
		fprintf(trace.output, "%s", bf);
		goto out_close;
1172 1173
	}

A
Arnaldo Carvalho de Melo 已提交
1174 1175 1176
	err = perf_target__parse_uid(&trace.opts.target);
	if (err) {
		perf_target__strerror(&trace.opts.target, err, bf, sizeof(bf));
1177 1178
		fprintf(trace.output, "%s", bf);
		goto out_close;
A
Arnaldo Carvalho de Melo 已提交
1179 1180
	}

1181
	if (!argc && perf_target__none(&trace.opts.target))
1182 1183
		trace.opts.target.system_wide = true;

1184 1185 1186 1187
	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);
1188 1189

	if (trace.sched && !err)
1190
		trace__fprintf_thread_summary(&trace, trace.output);
1191

1192 1193 1194 1195
out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
1196
	return err;
A
Arnaldo Carvalho de Melo 已提交
1197
}