builtin-trace.c 112.1 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc.  Default format is loosely strace like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */

19
#include <traceevent/event-parse.h>
20
#include <api/fs/tracing_path.h>
21
#include <bpf/bpf.h>
22
#include "util/bpf_map.h"
A
Arnaldo Carvalho de Melo 已提交
23
#include "builtin.h"
24
#include "util/cgroup.h"
25
#include "util/color.h"
26
#include "util/config.h"
27
#include "util/debug.h"
28
#include "util/env.h"
29
#include "util/event.h"
A
Arnaldo Carvalho de Melo 已提交
30
#include "util/evlist.h"
31
#include <subcmd/exec-cmd.h>
32
#include "util/machine.h"
33
#include "util/map.h"
34
#include "util/symbol.h"
35
#include "util/path.h"
36
#include "util/session.h"
37
#include "util/thread.h"
38
#include <subcmd/parse-options.h>
39
#include "util/strlist.h"
40
#include "util/intlist.h"
A
Arnaldo Carvalho de Melo 已提交
41
#include "util/thread_map.h"
42
#include "util/stat.h"
43
#include "trace/beauty/beauty.h"
44
#include "trace-event.h"
45
#include "util/parse-events.h"
46
#include "util/bpf-loader.h"
47
#include "callchain.h"
48
#include "print_binary.h"
49
#include "string2.h"
50
#include "syscalltbl.h"
51
#include "rb_resort.h"
A
Arnaldo Carvalho de Melo 已提交
52

53
#include <errno.h>
54
#include <inttypes.h>
55
#include <poll.h>
56
#include <signal.h>
A
Arnaldo Carvalho de Melo 已提交
57
#include <stdlib.h>
58
#include <string.h>
59
#include <linux/err.h>
60
#include <linux/filter.h>
61
#include <linux/kernel.h>
62
#include <linux/random.h>
63
#include <linux/stringify.h>
64
#include <linux/time64.h>
65
#include <fcntl.h>
66
#include <sys/sysmacros.h>
A
Arnaldo Carvalho de Melo 已提交
67

68 69
#include "sane_ctype.h"

70 71 72 73
#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

74 75 76 77
#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE	1024
#endif

78 79
struct trace {
	struct perf_tool	tool;
80
	struct syscalltbl	*sctbl;
81 82 83
	struct {
		int		max;
		struct syscall  *table;
84
		struct bpf_map  *map;
85 86
		struct {
			struct perf_evsel *sys_enter,
87 88
					  *sys_exit,
					  *augmented;
89 90
		}		events;
	} syscalls;
91 92 93
	struct {
		struct bpf_map *map;
	} dump;
94 95 96 97
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
98
	struct cgroup		*cgroup;
99 100 101
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
102 103
	unsigned long		nr_events_printed;
	unsigned long		max_events;
104 105 106 107 108 109 110 111
	struct strlist		*ev_qualifier;
	struct {
		size_t		nr;
		int		*entries;
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;
112
		struct bpf_map  *map;
113 114 115 116 117 118 119
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;
	} stats;
120
	unsigned int		max_stack;
121
	unsigned int		min_stack;
122
	int			raw_augmented_syscalls_args_size;
123
	bool			raw_augmented_syscalls;
124
	bool			sort_events;
125 126 127 128 129 130 131
	bool			not_ev_qualifier;
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
132
	bool			failure_only;
133
	bool			show_comm;
134
	bool			print_sample;
135 136
	bool			show_tool_stats;
	bool			trace_syscalls;
137
	bool			kernel_syscallchains;
138
	s16			args_alignment;
139
	bool			show_tstamp;
140
	bool			show_duration;
141
	bool			show_zeros;
142
	bool			show_arg_names;
143
	bool			show_string_prefix;
144 145 146
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
J
Jiri Olsa 已提交
147 148 149 150
	struct {
		struct ordered_events	data;
		u64			last;
	} oe;
151
};
152

153 154 155 156 157 158 159 160 161 162 163
/*
 * Accessor for one field inside a sample's raw data.
 *
 * @offset is added to sample->raw_data by the accessors in this file. The
 * union holds exactly one accessor: @integer (installed by
 * __tp_field__init_uint()) returns the field value widened to u64, @pointer
 * (installed by __tp_field__init_ptr()) returns the field's address.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
164 165 166
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
167 168 169 170 171 172 173 174 175 176
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
177 178
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
179 180 181 182 183 184 185
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);

186
static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
187
{
188
	field->offset = offset;
189

190
	switch (size) {
191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
	case 1:
		field->integer = tp_field__u8;
		break;
	case 2:
		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
		break;
	case 4:
		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
		break;
	case 8:
		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
		break;
	default:
		return -1;
	}

	return 0;
}

210
static int tp_field__init_uint(struct tp_field *field, struct tep_format_field *format_field, bool needs_swap)
211 212 213 214
{
	return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
}

215 216 217 218 219
/*
 * Pointer accessor for a field: yields the address field->offset past the
 * start of the sample's raw data; nothing is copied.
 */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	void *location = sample->raw_data + field->offset;

	return location;
}

220
static int __tp_field__init_ptr(struct tp_field *field, int offset)
221
{
222
	field->offset = offset;
223 224 225 226
	field->pointer = tp_field__ptr;
	return 0;
}

227
static int tp_field__init_ptr(struct tp_field *field, struct tep_format_field *format_field)
228 229 230 231
{
	return __tp_field__init_ptr(field, format_field->offset);
}

232 233 234 235 236 237 238 239 240 241 242
/*
 * Field accessors kept in evsel->priv for a syscall tracepoint.
 *
 * @id reads the syscall number. @args and @ret share storage through the
 * anonymous union, so a given instance can hold only one of them.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};

/*
 * Look up the tracepoint field called @name in @evsel and set up @field as
 * an unsigned integer accessor for it, honouring evsel->needs_swap.
 *
 * Returns 0 on success, -1 if the field does not exist or has an unsupported
 * size.
 */
static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
					  struct tp_field *field,
					  const char *name)
{
	struct tep_format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_uint(field, format_field, evsel->needs_swap);
}

/*
 * Initialize the integer accessor for member @name of the struct syscall_tp
 * stored in evsel->priv. @name is used twice: as the struct member and,
 * stringified, as the tracepoint field name to look up.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })

/*
 * Look up the tracepoint field called @name in @evsel and set up @field as a
 * pointer accessor for it.
 *
 * Returns 0 on success, -1 if the field does not exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct tep_format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_ptr(field, format_field);
}

/*
 * Initialize the pointer accessor for member @name of the struct syscall_tp
 * stored in evsel->priv. @name is used twice: as the struct member and,
 * stringified, as the tracepoint field name to look up.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })

static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
273
	zfree(&evsel->priv);
274 275 276
	perf_evsel__delete(evsel);
}

277 278 279 280 281
static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));

	if (evsel->priv != NULL) {
282 283
		if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") &&
		    perf_evsel__init_tp_uint_field(evsel, &sc->id, "nr"))
284 285 286 287 288 289 290 291 292 293
			goto out_delete;
		return 0;
	}

	return -ENOMEM;
out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}

294
static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel, struct perf_evsel *tp)
295 296 297
{
	struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));

298 299 300 301 302 303 304
	if (evsel->priv != NULL) {
		struct tep_format_field *syscall_id = perf_evsel__field(tp, "id");
		if (syscall_id == NULL)
			syscall_id = perf_evsel__field(tp, "__syscall_nr");
		if (syscall_id == NULL)
			goto out_delete;
		if (__tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap))
305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322
			goto out_delete;

		return 0;
	}

	return -ENOMEM;
out_delete:
	zfree(&evsel->priv);
	return -EINVAL;
}

/*
 * Point the ->args accessor of the syscall_tp in evsel->priv at the data
 * located sizeof(u64) bytes after the offset recorded for ->id.
 * Returns what __tp_field__init_ptr() returns.
 */
static int perf_evsel__init_augmented_syscall_tp_args(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv;
	const int args_offset = sc->id.offset + sizeof(u64);

	return __tp_field__init_ptr(&sc->args, args_offset);
}

323 324 325 326 327 328 329
/*
 * Set up the ->ret accessor of the syscall_tp in evsel->priv as a u64-sized
 * integer located sizeof(u64) bytes after the offset recorded for ->id,
 * honouring evsel->needs_swap. Returns what __tp_field__init_uint() returns.
 */
static int perf_evsel__init_augmented_syscall_tp_ret(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv;
	const int ret_offset = sc->id.offset + sizeof(u64);

	return __tp_field__init_uint(&sc->ret, sizeof(u64), ret_offset, evsel->needs_swap);
}

330
static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler)
331 332 333 334 335 336 337 338 339 340 341 342 343
{
	evsel->priv = malloc(sizeof(struct syscall_tp));
	if (evsel->priv != NULL) {
		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
			goto out_delete;

		evsel->handler = handler;
		return 0;
	}

	return -ENOMEM;

out_delete:
344
	zfree(&evsel->priv);
345 346 347
	return -ENOENT;
}

348
/*
 * Create the evsel for the "raw_syscalls:<direction>" tracepoint, falling
 * back to "syscalls:<direction>", and initialize it with
 * perf_evsel__init_raw_syscall_tp() using @handler.
 *
 * Returns the new evsel, or NULL if neither tracepoint could be created or
 * the initialization failed (in which case the evsel is deleted).
 */
static struct perf_evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_raw_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}

/*
 * Read member @name of the struct syscall_tp in evsel->priv from @sample
 * through its integer accessor; evaluates to the u64 the accessor returns.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/*
 * Same as above but through the pointer accessor; evaluates to the void *
 * the accessor returns.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })

377
size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
378 379
{
	int idx = val - sa->offset;
380

381 382 383 384 385 386
	if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
		size_t printed = scnprintf(bf, size, intfmt, val);
		if (show_prefix)
			printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
		return printed;
	}
387

388
	return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
389 390
}

391 392 393
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
					        struct syscall_arg *arg)
394
{
395
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->show_string_prefix, arg->val);
396 397
}

398 399 400 401 402 403
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

404 405
#define SCA_STRARRAY syscall_arg__scnprintf_strarray

406
size_t strarrays__scnprintf(struct strarrays *sas, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
407
{
408
	size_t printed;
409 410 411 412
	int i;

	for (i = 0; i < sas->nr_entries; ++i) {
		struct strarray *sa = sas->entries[i];
413
		int idx = val - sa->offset;
414 415 416 417

		if (idx >= 0 && idx < sa->nr_entries) {
			if (sa->entries[idx] == NULL)
				break;
418
			return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
419 420 421
		}
	}

422 423 424 425
	printed = scnprintf(bf, size, intfmt, val);
	if (show_prefix)
		printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sas->entries[0]->prefix);
	return printed;
426 427 428 429 430 431
}

/*
 * Syscall argument formatter backed by the strarrays in arg->parm; values
 * without a table entry are printed as "%d".
 */
size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
					struct syscall_arg *arg)
{
	return strarrays__scnprintf(arg->parm, bf, size, "%d", arg->show_string_prefix, arg->val);
}

434 435 436 437
#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif

438 439 440 441
static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
{
	int fd = arg->val;
442
	const char *prefix = "AT_FD";
443 444

	if (fd == AT_FDCWD)
445
		return scnprintf(bf, size, "%s%s", arg->show_string_prefix ? prefix : "", "CWD");
446 447 448 449 450 451 452 453 454 455 456

	return syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

457
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
458
{
459
	return scnprintf(bf, size, "%#lx", arg->val);
460 461
}

462 463 464 465 466 467 468
/*
 * Format a pointer argument: a zero value is printed as "NULL", everything
 * else goes through the hexadecimal formatter.
 */
size_t syscall_arg__scnprintf_ptr(char *bf, size_t size, struct syscall_arg *arg)
{
	if (arg->val != 0)
		return syscall_arg__scnprintf_hex(bf, size, arg);

	return scnprintf(bf, size, "NULL");
}

469
size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
470 471 472 473
{
	return scnprintf(bf, size, "%d", arg->val);
}

474 475 476 477 478
/*
 * Format arg->val as a signed decimal long. The cast makes the argument type
 * match "%ld" exactly whatever the signedness of arg->val is.
 */
size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", (long)arg->val);
}

479 480 481 482
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
483
static DEFINE_STRARRAY(bpf_cmd, "BPF_");
484

485
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
486
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, "EPOLL_CTL_", 1);
487

488
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
489
static DEFINE_STRARRAY(itimers, "ITIMER_");
490

491 492 493 494 495 496 497
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
498
static DEFINE_STRARRAY(keyctl_options, "KEYCTL_");
499

500 501 502 503 504 505 506 507
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
508
static DEFINE_STRARRAY(whences, "SEEK_");
509

510 511
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
512 513 514
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
515
};
516
static DEFINE_STRARRAY(fcntl_cmds, "F_");
517

518 519 520
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
521
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
522 523
};

524
static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, "F_", F_LINUX_SPECIFIC_BASE);
525 526 527 528 529 530 531 532

static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

533 534 535 536 537
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
538
static DEFINE_STRARRAY(rlimit_resources, "RLIMIT_");
539

540
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
541
static DEFINE_STRARRAY(sighow, "SIG_");
542

543 544
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
545 546
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
547
};
548
static DEFINE_STRARRAY(clockid, "CLOCK_");
549

550 551 552
/*
 * Format the mode argument of access(2).
 *
 * F_OK (0) is printed as "F". Otherwise each of R_OK, W_OK and X_OK that is
 * set is printed as its letter, in that order, and cleared from the mode.
 * When arg->show_string_prefix is set every name also gets its "_OK" suffix.
 * Bits left over are printed in hexadecimal; the "|" separator is emitted
 * only when a name was printed before them, matching the pipe2/getrandom
 * flag printers in this file.
 *
 * Returns the number of characters scnprintf() reports as written to @bf.
 */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	bool show_prefix = arg->show_string_prefix;
	const char *suffix = "_OK";
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F%s", show_prefix ? suffix : "");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", #n, show_prefix ? suffix : ""); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode

579 580 581 582 583
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

584 585 586
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
587 588
	bool show_prefix = arg->show_string_prefix;
	const char *prefix = "O_";
589 590 591 592
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
593
		printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
594 595 596 597 598 599 600 601 602 603 604 605 606 607 608
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags

609 610 611 612 613 614 615
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

616 617 618
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
619 620
	bool show_prefix = arg->show_string_prefix;
	const char *prefix = "GRND_";
621 622 623 624
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
625
		printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
626 627 628 629 630 631 632 633 634 635 636 637 638 639 640
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags

641 642 643
/*
 * Initializer for a syscall_arg_fmt that prints the argument through the
 * strarray__<array> table. @name is not used by the expansion; at the call
 * sites it carries the argument's name.
 */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
644

645
#include "trace/beauty/arch_errno_names.c"
646
#include "trace/beauty/eventfd.c"
647
#include "trace/beauty/futex_op.c"
648
#include "trace/beauty/futex_val3.c"
649
#include "trace/beauty/mmap.c"
650
#include "trace/beauty/mode_t.c"
651
#include "trace/beauty/msg_flags.c"
652
#include "trace/beauty/open_flags.c"
653
#include "trace/beauty/perf_event_open.c"
654
#include "trace/beauty/pid.c"
655
#include "trace/beauty/sched_policy.c"
656
#include "trace/beauty/seccomp.c"
657
#include "trace/beauty/signum.c"
658
#include "trace/beauty/socket_type.c"
659
#include "trace/beauty/waitid_options.c"
660

661 662
/*
 * How to print one syscall argument.
 *
 * @scnprintf: formatter writing the argument into a buffer.
 * @mask_val:  optional hook taking the argument and a value and returning a
 *             value (NOTE(review): its use is outside this file section).
 * @parm:      formatter-specific data, e.g. the strarray for SCA_STRARRAY.
 * @name:      argument name, set explicitly by some table entries.
 * @show_zero: presumably forces printing of a zero value - confirm against
 *             the code that consumes it.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	unsigned long (*mask_val)(struct syscall_arg *arg, unsigned long val);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};

A
Arnaldo Carvalho de Melo 已提交
669 670
static struct syscall_fmt {
	const char *name;
671
	const char *alias;
672
	struct syscall_arg_fmt arg[6];
673
	u8	   nr_args;
674
	bool	   errpid;
A
Arnaldo Carvalho de Melo 已提交
675
	bool	   timeout;
676
	bool	   hexret;
A
Arnaldo Carvalho de Melo 已提交
677
} syscall_fmts[] = {
678
	{ .name	    = "access",
679
	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
680 681 682
	{ .name	    = "arch_prctl",
	  .arg = { [0] = { .scnprintf = SCA_X86_ARCH_PRCTL_CODE, /* code */ },
		   [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, },
683 684
	{ .name	    = "bind",
	  .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* umyaddr */ }, }, },
685
	{ .name	    = "bpf",
686
	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
687
	{ .name	    = "brk",	    .hexret = true,
688
	  .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, },
689
	{ .name     = "clock_gettime",
690
	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
691 692 693 694 695 696
	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
697
	{ .name	    = "close",
698
	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
699 700
	{ .name	    = "connect",
	  .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* servaddr */ }, }, },
701
	{ .name	    = "epoll_ctl",
702
	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
703
	{ .name	    = "eventfd2",
704
	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
705
	{ .name	    = "fchmodat",
706
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
707
	{ .name	    = "fchownat",
708
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
709
	{ .name	    = "fcntl",
710
	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
711 712
			   .parm      = &strarrays__fcntl_cmds_arrays,
			   .show_zero = true, },
713
		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
714
	{ .name	    = "flock",
715
	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
716 717 718
	{ .name	    = "fstat", .alias = "newfstat", },
	{ .name	    = "fstatat", .alias = "newfstatat", },
	{ .name	    = "futex",
719 720
	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
721
	{ .name	    = "futimesat",
722
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
723
	{ .name	    = "getitimer",
724
	  .arg = { [0] = STRARRAY(which, itimers), }, },
725
	{ .name	    = "getpid",	    .errpid = true, },
726
	{ .name	    = "getpgid",    .errpid = true, },
727
	{ .name	    = "getppid",    .errpid = true, },
728
	{ .name	    = "getrandom",
729
	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
730
	{ .name	    = "getrlimit",
731
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
732
	{ .name	    = "gettid",	    .errpid = true, },
733
	{ .name	    = "ioctl",
734
	  .arg = {
735 736 737 738
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
739
		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
740
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
741
#else
742
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
743
#endif
744 745 746 747 748 749
	{ .name	    = "kcmp",	    .nr_args = 5,
	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
750
	{ .name	    = "keyctl",
751
	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
752
	{ .name	    = "kill",
753
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
754
	{ .name	    = "linkat",
755
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
756
	{ .name	    = "lseek",
757
	  .arg = { [2] = STRARRAY(whence, whences), }, },
758 759
	{ .name	    = "lstat", .alias = "newlstat", },
	{ .name     = "madvise",
760 761
	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
762
	{ .name	    = "mkdirat",
763
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
764
	{ .name	    = "mknodat",
765
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
766
	{ .name	    = "mmap",	    .hexret = true,
J
Jiri Olsa 已提交
767 768 769 770
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	.alias = "old_mmap",
#endif
771
	  .arg = { [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
772 773
		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ },
		   [5] = { .scnprintf = SCA_HEX,	/* offset */ }, }, },
774
	{ .name	    = "mount",
775 776
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ },
		   [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */
777
			   .mask_val  = SCAMV_MOUNT_FLAGS, /* flags */ }, }, },
778
	{ .name	    = "mprotect",
779 780
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
781
	{ .name	    = "mq_unlink",
782
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
783
	{ .name	    = "mremap",	    .hexret = true,
784
	  .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, },
785
	{ .name	    = "name_to_handle_at",
786
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
787
	{ .name	    = "newfstatat",
788
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
789
	{ .name	    = "open",
790
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
791
	{ .name	    = "open_by_handle_at",
792 793
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
794
	{ .name	    = "openat",
795 796
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
797
	{ .name	    = "perf_event_open",
798 799 800
	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
801
	{ .name	    = "pipe2",
802
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
803 804 805 806 807 808 809 810
	{ .name	    = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
	{ .name	    = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
	{ .name	    = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
811 812
	{ .name	    = "poll", .timeout = true, },
	{ .name	    = "ppoll", .timeout = true, },
813
	{ .name	    = "prctl",
814 815 816
	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
817 818 819
	{ .name	    = "pread", .alias = "pread64", },
	{ .name	    = "preadv", .alias = "pread", },
	{ .name	    = "prlimit64",
820
	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
821 822
	{ .name	    = "pwrite", .alias = "pwrite64", },
	{ .name	    = "readlinkat",
823
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
824
	{ .name	    = "recvfrom",
825
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
826
	{ .name	    = "recvmmsg",
827
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
828
	{ .name	    = "recvmsg",
829
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
830
	{ .name	    = "renameat",
831 832
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
		   [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, }, },
833 834
	{ .name	    = "renameat2",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
835 836
		   [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ },
		   [4] = { .scnprintf = SCA_RENAMEAT2_FLAGS, /* flags */ }, }, },
837
	{ .name	    = "rt_sigaction",
838
	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
839
	{ .name	    = "rt_sigprocmask",
840
	  .arg = { [0] = STRARRAY(how, sighow), }, },
841
	{ .name	    = "rt_sigqueueinfo",
842
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
843
	{ .name	    = "rt_tgsigqueueinfo",
844
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
845
	{ .name	    = "sched_setscheduler",
846
	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
847
	{ .name	    = "seccomp",
848 849
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
850 851
	{ .name	    = "select", .timeout = true, },
	{ .name	    = "sendmmsg",
852
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
853
	{ .name	    = "sendmsg",
854
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
855
	{ .name	    = "sendto",
856 857
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ },
		   [4] = { .scnprintf = SCA_SOCKADDR, /* addr */ }, }, },
858
	{ .name	    = "set_tid_address", .errpid = true, },
859
	{ .name	    = "setitimer",
860
	  .arg = { [0] = STRARRAY(which, itimers), }, },
861
	{ .name	    = "setrlimit",
862
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
863
	{ .name	    = "socket",
864
	  .arg = { [0] = STRARRAY(family, socket_families),
865 866
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
867
	{ .name	    = "socketpair",
868
	  .arg = { [0] = STRARRAY(family, socket_families),
869 870
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
871 872
	{ .name	    = "stat", .alias = "newstat", },
	{ .name	    = "statx",
873 874 875
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
876
	{ .name	    = "swapoff",
877
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
878
	{ .name	    = "swapon",
879
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
880
	{ .name	    = "symlinkat",
881
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
882
	{ .name	    = "tgkill",
883
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
884
	{ .name	    = "tkill",
885
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
886 887
	{ .name     = "umount2", .alias = "umount",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, },
888 889
	{ .name	    = "uname", .alias = "newuname", },
	{ .name	    = "unlinkat",
890
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
891
	{ .name	    = "utimensat",
892
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
893
	{ .name	    = "wait4",	    .errpid = true,
894
	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
895
	{ .name	    = "waitid",	    .errpid = true,
896
	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
A
Arnaldo Carvalho de Melo 已提交
897 898 899 900 901 902 903 904 905 906 907 908 909 910
};

/*
 * bsearch() comparator: 'name' is the key string being looked up, 'fmtp'
 * points to the struct syscall_fmt element it is compared against.
 */
static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
	const char *key = name;
	const char *entry_name = ((const struct syscall_fmt *)fmtp)->name;

	return strcmp(key, entry_name);
}

/*
 * Find the syscall_fmts entry whose ->name equals 'name' via bsearch(),
 * returning NULL when there is none.  bsearch() requires syscall_fmts to be
 * sorted consistently with syscall_fmt__cmp(), i.e. by ->name.
 */
static struct syscall_fmt *syscall_fmt__find(const char *name)
{
	return bsearch(name, syscall_fmts, ARRAY_SIZE(syscall_fmts),
		       sizeof(syscall_fmts[0]), syscall_fmt__cmp);
}

911 912 913 914 915 916 917 918 919 920 921 922
static struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias)
{
	int i, nmemb = ARRAY_SIZE(syscall_fmts);

	for (i = 0; i < nmemb; ++i) {
		if (syscall_fmts[i].alias && strcmp(syscall_fmts[i].alias, alias) == 0)
			return &syscall_fmts[i];
	}

	return NULL;
}

923 924 925
/*
 * Per-syscall information, stored in trace->syscalls.table indexed by the
 * syscall id.
 *
 * tp_format: result of trace_event__tp_format() for "syscalls:sys_enter_NAME"
 *            (or the alias); may be an ERR_PTR, check with IS_ERR().
 * nr_args: number of entries in arg_fmt / number of syscall arguments.
 * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc.
 * is_exit: is this "exit" or "exit_group"?
 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
 * args: tracepoint format fields describing the arguments, with the leading
 *       '__syscall_nr'/'nr' field dropped.
 * name: syscall name as returned by syscalltbl__name().
 * fmt: static formatter description from syscall_fmts, NULL if there is none.
 * arg_fmt: calloc'ed array of nr_args per-argument formatters.
 */
struct syscall {
	struct tep_event    *tp_format;
	int		    nr_args;
	int		    args_size;
	bool		    is_exit;
	bool		    is_open;
	struct tep_format_field *args;
	const char	    *name;
	struct syscall_fmt  *fmt;
	struct syscall_arg_fmt *arg_fmt;
};

940 941 942 943
/*
 * NOTE(review): presumably the per-syscall value stored in a BPF map to tell
 * the BPF side whether a syscall should be traced; users are outside this
 * chunk -- confirm.
 */
struct bpf_map_syscall_entry {
	bool	enabled;
};

944 945 946 947 948 949 950 951
/*
 * We need to have this 'calculated' boolean because in some cases we really
 * don't know what is the duration of a syscall, for instance, when we start
 * a session and some threads are waiting for a syscall to finish, say 'poll',
 * in which case all we can do is to print "( ? ) for duration and for the
 * start timestamp.
 */
static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
952 953 954 955
{
	double duration = (double)t / NSEC_PER_MSEC;
	size_t printed = fprintf(fp, "(");

956
	if (!calculated)
957
		printed += fprintf(fp, "         ");
958
	else if (duration >= 1.0)
959 960 961 962 963
		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
	else if (duration >= 0.01)
		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
	else
		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
964
	return printed + fprintf(fp, "): ");
965 966
}

967 968 969 970
/**
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
971 972
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
973
 */
974 975 976
struct thread_trace {
	u64		  entry_time;
	bool		  entry_pending;
977
	unsigned long	  nr_events;
978
	unsigned long	  pfmaj, pfmin;
979
	char		  *entry_str;
980
	double		  runtime_ms;
981
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
982 983
        struct {
		unsigned long ptr;
984 985 986 987
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
988
	} filename;
989
	struct {
990 991 992
		int	      max;
		struct file   *table;
	} files;
993 994

	struct intlist *syscall_stats;
995 996 997 998
};

static struct thread_trace *thread_trace__new(void)
{
999 1000 1001
	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));

	if (ttrace)
1002
		ttrace->files.max = -1;
1003

1004 1005
	ttrace->syscall_stats = intlist__new(NULL);

1006
	return ttrace;
1007 1008
}

1009
static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1010
{
1011 1012
	struct thread_trace *ttrace;

1013 1014 1015
	if (thread == NULL)
		goto fail;

1016 1017
	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());
1018

1019
	if (thread__priv(thread) == NULL)
1020 1021
		goto fail;

1022
	ttrace = thread__priv(thread);
1023 1024 1025
	++ttrace->nr_events;

	return ttrace;
1026
fail:
1027
	color_fprintf(fp, PERF_COLOR_RED,
1028 1029 1030 1031
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}

1032 1033

void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
1034
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
1035 1036 1037 1038 1039 1040
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}

1041 1042 1043
/*
 * Bit flags.  NOTE(review): presumably they select tracing of major and/or
 * minor page faults; the users are outside this chunk -- confirm.
 */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

1044 1045
/* Size, in bytes, of the per-thread entry_str buffer malloc'ed in trace__sys_enter(). */
static const size_t trace__entry_str_size = 2048;

1046
static struct file *thread_trace__files_entry(struct thread_trace *ttrace, int fd)
1047
{
1048 1049 1050
	if (fd < 0)
		return NULL;

1051 1052
	if (fd > ttrace->files.max) {
		struct file *nfiles = realloc(ttrace->files.table, (fd + 1) * sizeof(struct file));
1053

1054
		if (nfiles == NULL)
1055
			return NULL;
1056

1057 1058 1059
		if (ttrace->files.max != -1) {
			memset(nfiles + ttrace->files.max + 1, 0,
			       (fd - ttrace->files.max) * sizeof(struct file));
1060
		} else {
1061
			memset(nfiles, 0, (fd + 1) * sizeof(struct file));
1062 1063
		}

1064 1065
		ttrace->files.table = nfiles;
		ttrace->files.max   = fd;
1066 1067
	}

1068 1069 1070
	return ttrace->files.table + fd;
}

1071 1072 1073 1074 1075
/*
 * Non-static wrapper: look up (growing the table if needed) the struct file
 * slot for 'fd' in the thread_trace attached to 'thread'.
 */
struct file *thread__files_entry(struct thread *thread, int fd)
{
	struct thread_trace *ttrace = thread__priv(thread);

	return thread_trace__files_entry(ttrace, fd);
}

1076 1077 1078 1079 1080 1081
/*
 * Associate a copy of 'pathname' with 'fd' in the thread's fd table and, if
 * the path can be stat()ed, record the major number of its st_rdev.
 *
 * A pathname previously recorded for the same fd is released: the fd may
 * have been reused without us having seen (and handled) the close.
 *
 * Returns 0 on success, -1 if there is no slot for 'fd' or the copy failed,
 * in which case the slot is left without a pathname.
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);
	struct file *file = thread_trace__files_entry(ttrace, fd);
	struct stat st;
	char *name;

	if (file == NULL)
		return -1;

	if (stat(pathname, &st) == 0)
		file->dev_maj = major(st.st_rdev);

	/* Copy first, then drop the old string, so 'pathname' is never read after a free. */
	name = strdup(pathname);
	free(file->pathname);
	file->pathname = name;

	return name ? 0 : -1;
}

1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118
/*
 * Resolve 'fd' of 'thread' by reading the /proc/PID/fd/FD symlink (or
 * /proc/PID/task/TID/fd/FD for a non-leader thread) and record the result
 * with trace__set_fd_pathname().
 *
 * Returns what trace__set_fd_pathname() returns, or -1 if the link can't be
 * lstat()ed/read or its target doesn't fit the size lstat() reported.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ != thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	}

	if (lstat(linkname, &st) < 0)
		return -1;

	/* Room for the terminating '\0' is needed as well. */
	if (st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));
	if (ret < 0)
		return -1;
	if (ret > st.st_size)
		return -1;

	/* readlink() doesn't terminate the string. */
	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}

1119 1120
static const char *thread__fd_path(struct thread *thread, int fd,
				   struct trace *trace)
1121
{
1122
	struct thread_trace *ttrace = thread__priv(thread);
1123 1124 1125 1126 1127 1128 1129

	if (ttrace == NULL)
		return NULL;

	if (fd < 0)
		return NULL;

1130
	if ((fd > ttrace->files.max || ttrace->files.table[fd].pathname == NULL)) {
1131 1132 1133
		if (!trace->live)
			return NULL;
		++trace->stats.proc_getname;
1134
		if (thread__read_fd_path(thread, fd))
1135 1136
			return NULL;
	}
1137

1138
	return ttrace->files.table[fd].pathname;
1139 1140
}

1141
size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1142 1143 1144
{
	int fd = arg->val;
	size_t printed = scnprintf(bf, size, "%d", fd);
1145
	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1146 1147 1148 1149 1150 1151 1152

	if (path)
		printed += scnprintf(bf + printed, size - printed, "<%s>", path);

	return printed;
}

1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169
/*
 * Like syscall_arg__scnprintf_fd(), but for an fd that belongs to the thread
 * with the given 'pid' in trace->host, which may not be the one issuing the
 * syscall.  Only "FD" is printed if that thread isn't found.
 */
size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
{
	size_t printed = scnprintf(bf, size, "%d", fd);
	struct thread *thread = machine__find_thread(trace->host, pid, pid);
	const char *path;

	if (thread == NULL)
		return printed;

	path = thread__fd_path(thread, fd, trace);
	if (path != NULL)
		printed += scnprintf(bf + printed, size - printed, "<%s>", path);

	/* Drop the reference machine__find_thread() gave us. */
	thread__put(thread);

	return printed;
}

1170 1171 1172 1173 1174
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1175
	struct thread_trace *ttrace = thread__priv(arg->thread);
1176

1177 1178
	if (ttrace && fd >= 0 && fd <= ttrace->files.max)
		zfree(&ttrace->files.table[fd].pathname);
1179 1180 1181 1182

	return printed;
}

1183 1184 1185 1186 1187 1188 1189 1190 1191
/*
 * Remember, in the thread_trace of 'thread', the filename pointer 'ptr' and
 * the offset of 'bf' inside entry_str, i.e. where the resolved filename is to
 * be inserted later.
 */
static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *tt = thread__priv(thread);

	tt->filename.entry_str_pos = bf - tt->entry_str;
	tt->filename.ptr = ptr;
}

1192 1193 1194 1195
static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size)
{
	struct augmented_arg *augmented_arg = arg->augmented.args;

1196
	return scnprintf(bf, size, "\"%.*s\"", augmented_arg->size, augmented_arg->value);
1197 1198
}

1199 1200 1201 1202 1203
/*
 * Format a filename argument:
 *
 *  - if an augmented payload is available, print the string it carries;
 *  - else, if vfs_getname isn't in use, print the raw pointer in hex;
 *  - else print nothing now and just record where in entry_str the filename
 *    is to be inserted later (see thread__set_filename_pos()).
 */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	unsigned long ptr = arg->val;

	if (arg->augmented.args)
		return syscall_arg__scnprintf_augmented_string(arg, bf, size);

	/* 'ptr' is an unsigned long: it needs the 'l' length modifier. */
	if (!arg->trace->vfs_getname)
		return scnprintf(bf, size, "%#lx", ptr);

	thread__set_filename_pos(arg->thread, bf, ptr);
	return 0;
}

1214 1215 1216 1217 1218
/*
 * Returns true when 't' (nanoseconds) is below the trace->duration_filter
 * threshold, which is expressed in milliseconds.
 */
static bool trace__filter_duration(struct trace *trace, double t)
{
	const double threshold = trace->duration_filter * NSEC_PER_MSEC;

	return t < threshold;
}

1219
/*
 * Print 'tstamp' (nanoseconds) as milliseconds relative to trace->base_time,
 * right aligned in a 10 character wide column followed by a space.
 */
static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

	return fprintf(fp, "%10.3f ", ts);
}

1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239
/*
 * A tstamp of 0 means "unknown", e.g. when ttrace->entry_time is used for a
 * thread that got a sys_exit without a matching sys_enter having been seen
 * ("poll" issued before the tracing session started, sys_enter lost due to a
 * ring buffer overflow): print a '?' in the timestamp column for those.
 */
static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	if (tstamp == 0)
		return fprintf(fp, "         ? ");

	return __trace__fprintf_tstamp(trace, tstamp, fp);
}

1240
/*
 * Flags set asynchronously by sig_handler() and polled by the main code.
 * Objects written from a signal handler must be 'volatile sig_atomic_t',
 * anything else is undefined behaviour and lets the compiler cache the value
 * in the polling loop.
 *
 * done: a signal asked us to stop.
 * interrupted: the (last) signal received was SIGINT.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}

1249
static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp)
1250
{
1251
	size_t printed = 0;
1252

1253 1254
	if (trace->multiple_threads) {
		if (trace->show_comm)
1255
			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1256
		printed += fprintf(fp, "%d ", thread->tid);
1257
	}
1258 1259 1260 1261

	return printed;
}

1262 1263 1264
/*
 * Print the columns that precede a syscall entry: the timestamp (if
 * trace->show_tstamp), the duration (if trace->show_duration) and the
 * comm/tid of the thread.  Returns the number of characters printed.
 */
static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
{
	size_t printed = 0;

	if (trace->show_tstamp)
		printed = trace__fprintf_tstamp(trace, tstamp, fp);
	if (trace->show_duration)
		printed += fprintf_duration(duration, duration_calculated, fp);
	return printed + trace__fprintf_comm_tid(trace, thread, fp);
}

1274
static int trace__process_event(struct trace *trace, struct machine *machine,
1275
				union perf_event *event, struct perf_sample *sample)
1276 1277 1278 1279 1280
{
	int ret = 0;

	switch (event->header.type) {
	case PERF_RECORD_LOST:
1281
		color_fprintf(trace->output, PERF_COLOR_RED,
1282
			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1283
		ret = machine__process_lost_event(machine, event, sample);
1284
		break;
1285
	default:
1286
		ret = machine__process_event(machine, event, sample);
1287 1288 1289 1290 1291 1292
		break;
	}

	return ret;
}

1293
static int trace__tool_process(struct perf_tool *tool,
1294
			       union perf_event *event,
1295
			       struct perf_sample *sample,
1296 1297
			       struct machine *machine)
{
1298
	struct trace *trace = container_of(tool, struct trace, tool);
1299
	return trace__process_event(trace, machine, event, sample);
1300 1301
}

1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319
/*
 * Kernel address resolver registered with trace_event__register_resolver().
 *
 * When kernel addresses are restricted (symbol_conf.kptr_restrict) nothing
 * is resolved and NULL is returned; a warning is emitted the first time only,
 * which is tracked in machine->kptr_restrict_warned.
 */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	if (!machine->kptr_restrict_warned && !symbol_conf.kptr_restrict)
		return machine__resolve_kernel_addr(vmachine, addrp, modp);

	if (!machine->kptr_restrict_warned) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
	}

	return NULL;
}

1320 1321
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
1322
	int err = symbol__init(NULL);
1323 1324 1325 1326

	if (err)
		return err;

1327 1328 1329
	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;
1330

1331 1332 1333
	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
	if (err < 0)
		goto out;
1334

1335
	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1336
					    evlist->threads, trace__tool_process, false,
1337
					    1);
1338
out:
1339 1340 1341 1342 1343 1344
	if (err)
		symbol__exit();

	return err;
}

1345 1346 1347 1348 1349 1350 1351 1352
/*
 * Counterpart of trace__symbols_init(): tear down the host machine, clear
 * trace->host so that it isn't used afterwards, then shut the symbol
 * subsystem down.
 */
static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}

1353
static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1354
{
1355
	int idx;
1356

1357 1358 1359
	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
		nr_args = sc->fmt->nr_args;

1360
	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1361
	if (sc->arg_fmt == NULL)
1362 1363
		return -1;

1364 1365
	for (idx = 0; idx < nr_args; ++idx) {
		if (sc->fmt)
1366
			sc->arg_fmt[idx] = sc->fmt->arg[idx];
1367
	}
1368

1369 1370 1371 1372 1373 1374
	sc->nr_args = nr_args;
	return 0;
}

static int syscall__set_arg_fmts(struct syscall *sc)
{
1375
	struct tep_format_field *field, *last_field = NULL;
1376 1377 1378
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
1379 1380
		last_field = field;

1381 1382
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;
1383

1384
		if (strcmp(field->type, "const char *") == 0 &&
1385 1386 1387
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
1388
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1389 1390
		else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr"))
			sc->arg_fmt[idx].scnprintf = SCA_PTR;
1391
		else if (strcmp(field->type, "pid_t") == 0)
1392
			sc->arg_fmt[idx].scnprintf = SCA_PID;
1393
		else if (strcmp(field->type, "umode_t") == 0)
1394
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
1407
			sc->arg_fmt[idx].scnprintf = SCA_FD;
1408
		}
1409 1410
	}

1411 1412 1413
	if (last_field)
		sc->args_size = last_field->offset + last_field->size;

1414 1415 1416
	return 0;
}

A
Arnaldo Carvalho de Melo 已提交
1417 1418 1419 1420
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
1421
	const char *name = syscalltbl__name(trace->sctbl, id);
1422 1423 1424

	if (name == NULL)
		return -1;
A
Arnaldo Carvalho de Melo 已提交
1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
1444
	sc->name = name;
1445

1446
	sc->fmt  = syscall_fmt__find(sc->name);
A
Arnaldo Carvalho de Melo 已提交
1447

1448
	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1449
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1450

1451
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1452
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1453
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1454
	}
A
Arnaldo Carvalho de Melo 已提交
1455

1456 1457 1458
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

1459
	if (IS_ERR(sc->tp_format))
1460 1461
		return -1;

1462
	sc->args = sc->tp_format->format.fields;
1463 1464 1465 1466 1467 1468
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1469 1470 1471 1472
		sc->args = sc->args->next;
		--sc->nr_args;
	}

1473
	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1474
	sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
1475

1476
	return syscall__set_arg_fmts(sc);
A
Arnaldo Carvalho de Melo 已提交
1477 1478
}

1479 1480
static int trace__validate_ev_qualifier(struct trace *trace)
{
1481
	int err = 0, i;
1482
	size_t nr_allocated;
1483 1484
	struct str_node *pos;

1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495
	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

1496
	nr_allocated = trace->ev_qualifier_ids.nr;
1497 1498
	i = 0;

1499
	strlist__for_each_entry(pos, trace->ev_qualifier) {
1500
		const char *sc = pos->s;
1501
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
1502

1503
		if (id < 0) {
1504 1505 1506 1507
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

1508 1509 1510 1511 1512 1513 1514 1515 1516
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
1517
matches:
1518
		trace->ev_qualifier_ids.entries[i++] = id;
1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541
		if (match_next == -1)
			continue;

		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
1542 1543 1544 1545 1546
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
1547
out_free:
1548 1549
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
1550
	}
1551
out:
1552 1553 1554
	return err;
}

1555 1556 1557 1558 1559 1560 1561 1562
/*
 * args is to be interpreted as a series of longs but we need to handle
 * 8-byte unaligned accesses. args points to raw_data within the event
 * and raw_data is guaranteed to be 8-byte unaligned because it is
 * preceded by raw_size which is a u32. So we need to copy args to a temp
 * variable to read it. Most notably this avoids extended load instructions
 * on unaligned addresses
 */
1563
unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1564 1565
{
	unsigned long val;
1566
	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1567 1568 1569 1570 1571

	memcpy(&val, p, sizeof(val));
	return val;
}

1572 1573 1574 1575 1576 1577 1578 1579 1580
/*
 * Format the "NAME: " prefix for argument arg->idx, using the name in the
 * syscall's arg_fmt if it has one and the generic "argN" otherwise.
 */
static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
				      struct syscall_arg *arg)
{
	if (sc->arg_fmt == NULL || sc->arg_fmt[arg->idx].name == NULL)
		return scnprintf(bf, size, "arg%d: ", arg->idx);

	return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
}

1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593
/*
 * Give the argument's ->mask_val() hook, if any, a chance to mask out bits
 * of 'val' before it is checked for being zero, e.g. the mount 'flags'
 * argument has a magic flag that needs ignoring, see the comment in
 * tools/perf/trace/beauty/mount_flags.c.  Without a hook 'val' is returned
 * untouched.
 */
static unsigned long syscall__mask_val(struct syscall *sc, struct syscall_arg *arg, unsigned long val)
{
	if (sc->arg_fmt == NULL || sc->arg_fmt[arg->idx].mask_val == NULL)
		return val;

	return sc->arg_fmt[arg->idx].mask_val(arg, val);
}

1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605
static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
				     struct syscall_arg *arg, unsigned long val)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
		arg->val = val;
		if (sc->arg_fmt[arg->idx].parm)
			arg->parm = sc->arg_fmt[arg->idx].parm;
		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
	}
	return scnprintf(bf, size, "%ld", val);
}

1606
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1607 1608
				      unsigned char *args, void *augmented_args, int augmented_args_size,
				      struct trace *trace, struct thread *thread)
A
Arnaldo Carvalho de Melo 已提交
1609 1610
{
	size_t printed = 0;
1611
	unsigned long val;
1612 1613 1614
	u8 bit = 1;
	struct syscall_arg arg = {
		.args	= args,
1615 1616 1617 1618
		.augmented = {
			.size = augmented_args_size,
			.args = augmented_args,
		},
1619 1620 1621 1622
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
1623
		.show_string_prefix = trace->show_string_prefix,
1624
	};
1625 1626 1627 1628 1629 1630 1631 1632
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;
A
Arnaldo Carvalho de Melo 已提交
1633

1634
	if (sc->args != NULL) {
1635
		struct tep_format_field *field;
1636

1637
		for (field = sc->args; field;
1638 1639
		     field = field->next, ++arg.idx, bit <<= 1) {
			if (arg.mask & bit)
1640
				continue;
1641

1642
			val = syscall_arg__val(&arg, arg.idx);
1643 1644 1645 1646 1647
			/*
			 * Some syscall args need some mask, most don't and
			 * return val untouched.
			 */
			val = syscall__mask_val(sc, &arg, val);
1648

1649 1650 1651 1652 1653
			/*
 			 * Suppress this argument if its value is zero and
 			 * and we don't have a string associated in an
 			 * strarray for it.
 			 */
1654
			if (val == 0 &&
1655
			    !trace->show_zeros &&
1656
			    !(sc->arg_fmt &&
1657 1658
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1659 1660
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
1661 1662
				continue;

1663 1664 1665 1666 1667
			printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");

			if (trace->show_arg_names)
				printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);

1668
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
A
Arnaldo Carvalho de Melo 已提交
1669
		}
1670 1671 1672 1673 1674 1675
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
1676
		while (arg.idx < sc->nr_args) {
1677 1678 1679
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
1680 1681 1682
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1683 1684 1685 1686
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
A
Arnaldo Carvalho de Melo 已提交
1687 1688 1689 1690 1691 1692
		}
	}

	return printed;
}

1693
/* Signature of the functions that handle the samples of a tracepoint evsel. */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);

static struct syscall *trace__syscall_info(struct trace *trace,
1698
					   struct perf_evsel *evsel, int id)
1699 1700 1701
{

	if (id < 0) {
1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
 		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730
		return NULL;
	}

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
1731
	if (verbose > 0) {
1732 1733 1734 1735 1736
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
1737 1738 1739
	return NULL;
}

1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765
/*
 * Account, in the per-thread syscall_stats, the duration of the syscall 'id'
 * that is finishing at sample->time.  The struct stats for the syscall is
 * allocated on first use.  When the entry time is unknown or not before the
 * sample time a duration of 0 is accounted.
 */
static void thread__update_stats(struct thread_trace *ttrace,
				 int id, struct perf_sample *sample)
{
	struct int_node *inode = intlist__findnew(ttrace->syscall_stats, id);
	struct stats *stats;
	u64 duration;

	if (inode == NULL)
		return;

	stats = inode->priv;
	if (stats == NULL) {
		stats = malloc(sizeof(*stats));
		if (stats == NULL)
			return;
		init_stats(stats);
		inode->priv = stats;
	}

	if (ttrace->entry_time != 0 && sample->time > ttrace->entry_time)
		duration = sample->time - ttrace->entry_time;
	else
		duration = 0;

	update_stats(stats, duration);
}

1766
static int trace__printf_interrupted_entry(struct trace *trace)
1767 1768 1769
{
	struct thread_trace *ttrace;
	size_t printed;
1770
	int len;
1771

1772
	if (trace->failure_only || trace->current == NULL)
1773 1774 1775 1776 1777 1778 1779
		return 0;

	ttrace = thread__priv(trace->current);

	if (!ttrace->entry_pending)
		return 0;

1780
	printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1781 1782 1783 1784
	printed += len = fprintf(trace->output, "%s)", ttrace->entry_str);

	if (len < trace->args_alignment - 4)
		printed += fprintf(trace->output, "%-*s", trace->args_alignment - 4 - len, " ");
1785

1786 1787 1788
	printed += fprintf(trace->output, " ...\n");

	ttrace->entry_pending = false;
1789 1790
	++trace->nr_events_printed;

1791 1792 1793
	return printed;
}

1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810
/*
 * When trace->print_sample is set, print a line describing the sample: event
 * name, timestamp in ms, comm, pid/tid and cpu.  Returns what fprintf()
 * returned, or 0 if nothing was to be printed.
 */
static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
				 struct perf_sample *sample, struct thread *thread)
{
	double ts;

	if (!trace->print_sample)
		return 0;

	ts = (double)sample->time / NSEC_PER_MSEC;

	return fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
		       perf_evsel__name(evsel), ts,
		       thread__comm_str(thread),
		       sample->pid, sample->tid, sample->cpu);
}

1811
static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, int raw_augmented_args_size)
1812 1813
{
	void *augmented_args = NULL;
1814 1815
	/*
	 * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter
1816 1817 1818 1819 1820
	 * and there we get all 6 syscall args plus the tracepoint common fields
	 * that gets calculated at the start and the syscall_nr (another long).
	 * So we check if that is the case and if so don't look after the
	 * sc->args_size but always after the full raw_syscalls:sys_enter payload,
	 * which is fixed.
1821 1822 1823 1824 1825 1826 1827
	 *
	 * We'll revisit this later to pass s->args_size to the BPF augmenter
	 * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it
	 * copies only what we need for each syscall, like what happens when we
	 * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace
	 * traffic to just what is needed for each syscall.
	 */
1828
	int args_size = raw_augmented_args_size ?: sc->args_size;
1829

1830
	*augmented_args_size = sample->raw_size - args_size;
1831
	if (*augmented_args_size > 0)
1832
		augmented_args = sample->raw_data + args_size;
1833 1834 1835 1836

	return augmented_args;
}

1837
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1838
			    union perf_event *event __maybe_unused,
1839 1840
			    struct perf_sample *sample)
{
1841
	char *msg;
1842
	void *args;
1843
	int printed = 0;
1844
	struct thread *thread;
1845
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1846 1847
	int augmented_args_size = 0;
	void *augmented_args = NULL;
1848
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1849 1850 1851 1852
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;
1853

1854
	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1855
	ttrace = thread__trace(thread, trace->output);
1856
	if (ttrace == NULL)
1857
		goto out_put;
1858

1859 1860
	trace__fprintf_sample(trace, evsel, sample, thread);

1861
	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1862 1863

	if (ttrace->entry_str == NULL) {
1864
		ttrace->entry_str = malloc(trace__entry_str_size);
1865
		if (!ttrace->entry_str)
1866
			goto out_put;
1867 1868
	}

1869
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1870
		trace__printf_interrupted_entry(trace);
1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881
	/*
	 * If this is raw_syscalls.sys_enter, then it always comes with the 6 possible
	 * arguments, even if the syscall being handled, say "openat", uses only 4 arguments
	 * this breaks syscall__augmented_args() check for augmented args, as we calculate
	 * syscall->args_size using each syscalls:sys_enter_NAME tracefs format file,
	 * so when handling, say the openat syscall, we end up getting 6 args for the
	 * raw_syscalls:sys_enter event, when we expected just 4, we end up mistakenly
	 * thinking that the extra 2 u64 args are the augmented filename, so just check
	 * here and avoid using augmented syscalls when the evsel is the raw_syscalls one.
	 */
	if (evsel != trace->syscalls.events.sys_enter)
1882
		augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
1883 1884
	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
1885
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1886

1887
	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1888
					   args, augmented_args, augmented_args_size, trace, thread);
1889

1890
	if (sc->is_exit) {
1891
		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
1892 1893
			int alignment = 0;

1894
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1895 1896 1897 1898
			printed = fprintf(trace->output, "%s)", ttrace->entry_str);
			if (trace->args_alignment > printed)
				alignment = trace->args_alignment - printed;
			fprintf(trace->output, "%*s= ?\n", alignment, " ");
1899
		}
1900
	} else {
1901
		ttrace->entry_pending = true;
1902 1903 1904
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}
1905

1906 1907 1908 1909
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
1910 1911 1912 1913
	err = 0;
out_put:
	thread__put(thread);
	return err;
1914 1915
}

1916 1917 1918 1919 1920
/*
 * Format the arguments of a syscall entry sample and print them straight to
 * trace->output, without touching the per thread pending entry buffer.
 *
 * Returns 0 on success, -1 when the syscall info or the per thread state
 * can't be obtained (nothing is printed in that case).
 */
static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel,
				    struct perf_sample *sample)
{
	struct thread_trace *ttrace;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	char msg[1024];
	void *args, *augmented_args = NULL;
	int augmented_args_size;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	/*
	 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
	 * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
	 */
	if (ttrace == NULL)
		goto out_put;

	/*
	 * Start with an empty string so that we never hand uninitialized
	 * stack contents to fprintf() should the formatter emit nothing
	 * (e.g. for a syscall without arguments).
	 */
	msg[0] = '\0';

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
	augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
	syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
	fprintf(trace->output, "%s", msg);
	err = 0;
out_put:
	thread__put(thread);
	return err;
}

1949 1950 1951
static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
				    struct perf_sample *sample,
				    struct callchain_cursor *cursor)
1952 1953
{
	struct addr_location al;
1954 1955 1956
	int max_stack = evsel->attr.sample_max_stack ?
			evsel->attr.sample_max_stack :
			trace->max_stack;
1957
	int err;
1958

1959
	if (machine__resolve(trace->host, &al, sample) < 0)
1960 1961
		return -1;

1962 1963 1964
	err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack);
	addr_location__put(&al);
	return err;
1965 1966 1967 1968
}

static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
{
1969
	/* TODO: user-configurable print_opts */
1970 1971 1972
	const unsigned int print_opts = EVSEL__PRINT_SYM |
				        EVSEL__PRINT_DSO |
				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1973

1974
	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1975 1976
}

1977 1978 1979 1980 1981 1982 1983 1984
/*
 * Translate errno value @err into its symbolic name, using the architecture
 * of the environment associated with @evsel.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	const char *arch = perf_env__arch(perf_evsel__env(evsel));

	return arch_syscalls__strerrno(arch, err);
}

1985
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1986
			   union perf_event *event __maybe_unused,
1987 1988
			   struct perf_sample *sample)
{
1989
	long ret;
1990
	u64 duration = 0;
1991
	bool duration_calculated = false;
1992
	struct thread *thread;
1993 1994
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0, printed = 0;
	int alignment = trace->args_alignment;
1995
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1996 1997 1998 1999
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;
2000

2001
	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2002
	ttrace = thread__trace(thread, trace->output);
2003
	if (ttrace == NULL)
2004
		goto out_put;
2005

2006 2007
	trace__fprintf_sample(trace, evsel, sample, thread);

2008 2009 2010
	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

2011
	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
2012

2013
	if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
2014 2015
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
2016 2017 2018
		++trace->stats.vfs_getname;
	}

2019
	if (ttrace->entry_time) {
2020
		duration = sample->time - ttrace->entry_time;
2021 2022
		if (trace__filter_duration(trace, duration))
			goto out;
2023
		duration_calculated = true;
2024 2025
	} else if (trace->duration_filter)
		goto out;
2026

2027 2028 2029 2030 2031 2032 2033 2034 2035
	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

2036
	if (trace->summary_only || (ret >= 0 && trace->failure_only))
D
David Ahern 已提交
2037 2038
		goto out;

2039
	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
2040 2041

	if (ttrace->entry_pending) {
2042
		printed = fprintf(trace->output, "%s", ttrace->entry_str);
2043
	} else {
2044
		printed += fprintf(trace->output, " ... [");
2045
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
2046 2047
		printed += 9;
		printed += fprintf(trace->output, "]: %s()", sc->name);
2048 2049
	}

2050 2051 2052 2053 2054 2055 2056 2057 2058
	printed++; /* the closing ')' */

	if (alignment > printed)
		alignment -= printed;
	else
		alignment = 0;

	fprintf(trace->output, ")%*s= ", alignment, " ");

2059
	if (sc->fmt == NULL) {
2060 2061
		if (ret < 0)
			goto errno_print;
2062
signed_print:
2063
		fprintf(trace->output, "%ld", ret);
2064 2065
	} else if (ret < 0) {
errno_print: {
2066
		char bf[STRERR_BUFSIZE];
2067
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
2068
			   *e = errno_to_name(evsel, -ret);
2069

2070
		fprintf(trace->output, "-1 %s (%s)", e, emsg);
2071
	}
2072
	} else if (ret == 0 && sc->fmt->timeout)
2073
		fprintf(trace->output, "0 (Timeout)");
2074 2075
	else if (ttrace->ret_scnprintf) {
		char bf[1024];
2076 2077 2078 2079 2080 2081
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
2082
		ttrace->ret_scnprintf = NULL;
2083
		fprintf(trace->output, "%s", bf);
2084
	} else if (sc->fmt->hexret)
2085
		fprintf(trace->output, "%#lx", ret);
2086 2087 2088 2089
	else if (sc->fmt->errpid) {
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
2090
			fprintf(trace->output, "%ld", ret);
2091 2092 2093 2094 2095
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
2096
		goto signed_print;
2097

2098
	fputc('\n', trace->output);
2099

2100 2101 2102 2103 2104 2105 2106
	/*
	 * We only consider an 'event' for the sake of --max-events a non-filtered
	 * sys_enter + sys_exit and other tracepoint events.
	 */
	if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX)
		interrupted = true;

2107 2108 2109 2110
	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2111
out:
2112
	ttrace->entry_pending = false;
2113 2114 2115 2116
	err = 0;
out_put:
	thread__put(thread);
	return err;
2117 2118
}

2119
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
2120
			      union perf_event *event __maybe_unused,
2121 2122
			      struct perf_sample *sample)
{
2123 2124 2125 2126 2127
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
2128
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
2129 2130 2131 2132 2133 2134

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
2135
		goto out_put;
2136

2137
	filename_len = strlen(filename);
2138
	if (filename_len == 0)
2139
		goto out_put;
2140 2141 2142 2143 2144

	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
2145
			goto out_put;
2146 2147 2148 2149 2150 2151 2152 2153

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

2154
	if (!ttrace->filename.ptr)
2155
		goto out_put;
2156 2157 2158 2159

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
2160
		goto out_put;
2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173

	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
2174 2175
out_put:
	thread__put(thread);
2176
out:
2177 2178 2179
	return 0;
}

2180
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
2181
				     union perf_event *event __maybe_unused,
2182 2183 2184 2185
				     struct perf_sample *sample)
{
        u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2186
	struct thread *thread = machine__findnew_thread(trace->host,
2187 2188
							sample->pid,
							sample->tid);
2189
	struct thread_trace *ttrace = thread__trace(thread, trace->output);
2190 2191 2192 2193 2194 2195

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
2196
out_put:
2197
	thread__put(thread);
2198 2199 2200
	return 0;

out_dump:
2201
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2202 2203 2204 2205 2206
	       evsel->name,
	       perf_evsel__strval(evsel, sample, "comm"),
	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
	       runtime,
	       perf_evsel__intval(evsel, sample, "vruntime"));
2207
	goto out_put;
2208 2209
}

2210 2211
/*
 * binary__fprintf() callback used for bpf-output payloads: only the
 * character data is emitted, with non printable bytes shown as '.', every
 * other element (addresses, hex numbers, separators, padding, ...) is
 * suppressed.
 *
 * Returns the number of characters written.
 */
static int bpf_output__printer(enum binary_printer_ops op,
			       unsigned int val, void *extra __maybe_unused, FILE *fp)
{
	unsigned char ch = (unsigned char)val;

	if (op != BINARY_PRINT_CHAR_DATA)
		return 0;

	return fprintf(fp, "%c", isprint(ch) ? ch : '.');
}

static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
2237 2238
	binary__fprintf(sample->raw_data, sample->raw_size, 8,
			bpf_output__printer, NULL, trace->output);
2239
	++trace->nr_events_printed;
2240 2241
}

2242 2243 2244 2245
/*
 * Generic handler for non syscall events (tracepoints, bpf-output, ...).
 *
 * Prints the timestamp and comm/tid prefix followed by the event itself:
 * samples from the augmented syscalls evsel are rendered as "name(args)",
 * bpf-output payloads are dumped via bpf_output__fprintf() and tracepoints
 * are formatted using their tp_format. The callchain, when present and not
 * filtered out by min_stack, is printed afterwards.
 *
 * Always returns 0.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	struct thread *thread;
	int callchain_ret = 0;
	bool printed_as_syscall = false;

	/*
	 * Check if we called perf_evsel__disable(evsel) due to, for instance,
	 * this event's max_events having been hit and this is an entry coming
	 * from the ring buffer that we should discard, since the max events
	 * have already been considered/printed.
	 */
	if (evsel->disabled)
		return 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Keep columns aligned with the syscall lines, that show a duration */
	if (trace->trace_syscalls && trace->show_duration)
		fprintf(trace->output, "(         ): ");

	if (thread)
		trace__fprintf_comm_tid(trace, thread, trace->output);

	if (evsel == trace->syscalls.events.augmented) {
		int id = perf_evsel__sc_tp_uint(evsel, id, sample);
		struct syscall *sc = trace__syscall_info(trace, evsel, id);

		if (sc) {
			fprintf(trace->output, "%s(", sc->name);
			trace__fprintf_sys_enter(trace, evsel, sample);
			fputc(')', trace->output);
			printed_as_syscall = true;
		}

		/*
		 * XXX: Not having the associated syscall info or not finding/adding
		 * 	the thread should never happen, but if it does...
		 * 	fall thru and print it as a bpf_output event.
		 */
	}

	if (!printed_as_syscall) {
		fprintf(trace->output, "%s:", evsel->name);

		if (perf_evsel__is_bpf_output(evsel)) {
			bpf_output__fprintf(trace, sample);
		} else if (evsel->tp_format) {
			/*
			 * syscalls:sys_enter_NAME tracepoints get the strace like
			 * formatting, everything else (or a failure in doing that)
			 * uses the generic tracepoint pretty printer.
			 */
			if (strncmp(evsel->tp_format->name, "sys_enter_", 10) != 0 ||
			    trace__fprintf_sys_enter(trace, evsel, sample) != 0) {
				event_format__fprintf(evsel->tp_format, sample->cpu,
						      sample->raw_data, sample->raw_size,
						      trace->output);
				++trace->nr_events_printed;

				if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) {
					perf_evsel__disable(evsel);
					perf_evsel__close(evsel);
				}
			}
		}
	}

	fprintf(trace->output, "\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	thread__put(thread);
	return 0;
}

2326 2327 2328 2329 2330
static void print_location(FILE *f, struct perf_sample *sample,
			   struct addr_location *al,
			   bool print_dso, bool print_sym)
{

2331
	if ((verbose > 0 || print_dso) && al->map)
2332 2333
		fprintf(f, "%s@", al->map->dso->long_name);

2334
	if ((verbose > 0 || print_sym) && al->sym)
2335
		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2336 2337
			al->addr - al->sym->start);
	else if (al->map)
2338
		fprintf(f, "0x%" PRIx64, al->addr);
2339
	else
2340
		fprintf(f, "0x%" PRIx64, sample->addr);
2341 2342 2343 2344
}

static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
2345
			  union perf_event *event __maybe_unused,
2346 2347 2348 2349 2350
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';
2351
	struct thread_trace *ttrace;
2352
	int err = -1;
2353
	int callchain_ret = 0;
2354 2355

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2356 2357 2358 2359 2360 2361 2362 2363 2364 2365

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

2366 2367
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
2368
		goto out_put;
2369 2370 2371 2372 2373 2374 2375

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
2376
		goto out;
2377

2378
	thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2379

2380
	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2381 2382 2383 2384 2385 2386 2387 2388 2389

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

2390
	thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2391 2392

	if (!al.map) {
2393
		thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2394 2395 2396 2397 2398 2399 2400 2401 2402 2403

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2404

2405 2406 2407 2408
	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2409 2410

	++trace->nr_events_printed;
2411 2412 2413 2414 2415
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
2416 2417
}

2418
static void trace__set_base_time(struct trace *trace,
2419
				 struct perf_evsel *evsel,
2420 2421
				 struct perf_sample *sample)
{
2422 2423 2424 2425 2426 2427 2428 2429 2430 2431
	/*
	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
	 * and don't use sample->time unconditionally, we may end up having
	 * some other event in the future without PERF_SAMPLE_TIME for good
	 * reason, i.e. we may not be interested in its timestamps, just in
	 * it taking place, picking some piece of information when it
	 * appears in our event stream (vfs_getname comes to mind).
	 */
	if (trace->base_time == 0 && !trace->full_time &&
	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2432 2433 2434
		trace->base_time = sample->time;
}

2435
/*
 * perf_tool sample callback: dispatch @sample to the handler attached to
 * its @evsel, skipping samples from filtered threads.
 *
 * The handler's return value is not propagated, this always returns 0.
 */
static int trace__process_sample(struct perf_tool *tool,
				 union perf_event *event,
				 struct perf_sample *sample,
				 struct perf_evsel *evsel,
				 struct machine *machine __maybe_unused)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	tracepoint_handler handler = evsel->handler;
	struct thread *thread;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (thread == NULL || !thread__is_filtered(thread)) {
		trace__set_base_time(trace, evsel, sample);

		if (handler) {
			++trace->nr_events;
			handler(trace, evsel, event, sample);
		}
	}

	thread__put(thread);
	return 0;
}

2462
static int trace__record(struct trace *trace, int argc, const char **argv)
D
David Ahern 已提交
2463 2464 2465 2466 2467 2468 2469 2470 2471 2472
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-R",
		"-m", "1024",
		"-c", "1",
	};

2473 2474 2475 2476 2477 2478 2479
	const char * const sc_args[] = { "-e", };
	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
	const char * const majpf_args[] = { "-e", "major-faults" };
	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
	const char * const minpf_args[] = { "-e", "minor-faults" };
	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);

2480
	/* +1 is for the event string below */
2481 2482
	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
		majpf_args_nr + minpf_args_nr + argc;
D
David Ahern 已提交
2483 2484 2485 2486 2487
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

2488
	j = 0;
D
David Ahern 已提交
2489
	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2490 2491
		rec_argv[j++] = record_args[i];

2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502
	if (trace->trace_syscalls) {
		for (i = 0; i < sc_args_nr; i++)
			rec_argv[j++] = sc_args[i];

		/* event string may be different for older kernels - e.g., RHEL6 */
		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
		else if (is_valid_tracepoint("syscalls:sys_enter"))
			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
		else {
			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2503
			free(rec_argv);
2504 2505
			return -1;
		}
2506 2507
	}

2508 2509 2510 2511 2512 2513 2514 2515 2516 2517
	if (trace->trace_pgfaults & TRACE_PFMAJ)
		for (i = 0; i < majpf_args_nr; i++)
			rec_argv[j++] = majpf_args[i];

	if (trace->trace_pgfaults & TRACE_PFMIN)
		for (i = 0; i < minpf_args_nr; i++)
			rec_argv[j++] = minpf_args[i];

	for (i = 0; i < (unsigned int)argc; i++)
		rec_argv[j++] = argv[i];
D
David Ahern 已提交
2518

2519
	return cmd_record(j, rec_argv);
D
David Ahern 已提交
2520 2521
}

2522 2523
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);

2524
static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2525
{
2526 2527 2528 2529
	bool found = false;
	struct perf_evsel *evsel, *tmp;
	struct parse_events_error err = { .idx = 0, };
	int ret = parse_events(evlist, "probe:vfs_getname*", &err);
2530

2531
	if (ret)
2532
		return false;
2533

2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545
	evlist__for_each_entry_safe(evlist, evsel, tmp) {
		if (!strstarts(perf_evsel__name(evsel), "probe:vfs_getname"))
			continue;

		if (perf_evsel__field(evsel, "pathname")) {
			evsel->handler = trace__vfs_getname;
			found = true;
			continue;
		}

		list_del_init(&evsel->node);
		evsel->evlist = NULL;
2546 2547 2548
		perf_evsel__delete(evsel);
	}

2549
	return found;
2550 2551
}

2552
/*
 * Create a software event evsel for the page fault counter selected by
 * @config, sampling every occurrence and handled by trace__pgfault().
 *
 * Returns the new evsel or NULL if it can't be created.
 */
static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
{
	struct perf_event_attr attr = {
		.type	       = PERF_TYPE_SOFTWARE,
		.config	       = config,
		.sample_period = 1,
		.mmap_data     = 1,
	};
	struct perf_evsel *evsel;

	event_attr_init(&attr);

	evsel = perf_evsel__new(&attr);
	if (evsel == NULL)
		return NULL;

	evsel->handler = trace__pgfault;
	return evsel;
}

2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct perf_evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

2588 2589
	trace__set_base_time(trace, evsel, sample);

2590 2591 2592 2593 2594 2595 2596 2597 2598
	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
		       perf_evsel__name(evsel), sample->tid,
		       sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}
2599 2600 2601

	if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX)
		interrupted = true;
2602 2603
}

2604 2605 2606 2607 2608 2609
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

2610
	sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
2611 2612 2613 2614 2615 2616
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

2617
	sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
2618 2619 2620 2621 2622 2623
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

2624 2625 2626
	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);

2627 2628 2629
	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

2630
	if (callchain_param.enabled && !trace->kernel_syscallchains) {
2631 2632 2633 2634 2635 2636 2637 2638
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

2639 2640
	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;
2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}

2653
static int trace__set_ev_qualifier_tp_filter(struct trace *trace)
2654 2655
{
	int err = -1;
2656
	struct perf_evsel *sys_exit;
2657 2658 2659 2660 2661 2662 2663
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

2664 2665
	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
					  filter)) {
2666
		sys_exit = trace->syscalls.events.sys_exit;
2667
		err = perf_evsel__append_tp_filter(sys_exit, filter);
2668
	}
2669 2670 2671 2672 2673 2674 2675 2676

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}
2677

2678 2679 2680 2681
#ifdef HAVE_LIBBPF_SUPPORT
/*
 * Update the entry of each syscall id in the event qualifier in the
 * "syscalls" BPF map: enabled, or disabled when the qualifier is negated.
 *
 * Returns 0 on success or the first error from bpf_map_update_elem().
 */
static int trace__set_ev_qualifier_bpf_filter(struct trace *trace)
{
	struct bpf_map_syscall_entry value = {
		.enabled = !trace->not_ev_qualifier,
	};
	int fd = bpf_map__fd(trace->syscalls.map);
	size_t i;

	for (i = 0; i < trace->ev_qualifier_ids.nr; ++i) {
		int key = trace->ev_qualifier_ids.entries[i];
		int err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST);

		if (err)
			return err;
	}

	return 0;
}

/*
 * Set the 'enabled' state of every syscall known to the syscall table in
 * the "syscalls" BPF map.
 *
 * Returns 0 on success or the first error from bpf_map_update_elem().
 */
static int __trace__init_syscalls_bpf_map(struct trace *trace, bool enabled)
{
	struct bpf_map_syscall_entry value = {
		.enabled = enabled,
	};
	int fd = bpf_map__fd(trace->syscalls.map);
	int key;

	for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
		int err = bpf_map_update_elem(fd, &key, &value, BPF_ANY);

		if (err)
			return err;
	}

	return 0;
}

/*
 * Initialize the "syscalls" BPF map: everything enabled when there is no
 * event qualifier, otherwise the default state is that of a negated
 * qualifier, i.e. the listed syscalls get flipped later on.
 */
static int trace__init_syscalls_bpf_map(struct trace *trace)
{
	bool enabled = trace->ev_qualifier_ids.nr ? trace->not_ev_qualifier : true;

	return __trace__init_syscalls_bpf_map(trace, enabled);
}
#else
/* Without libbpf there is no map to update: nothing to do, report success */
static int trace__set_ev_qualifier_bpf_filter(struct trace *trace __maybe_unused)
{
	return 0;
}

/* Without libbpf there is no map to initialize: nothing to do, report success */
static int trace__init_syscalls_bpf_map(struct trace *trace __maybe_unused)
{
	return 0;
}
#endif // HAVE_LIBBPF_SUPPORT

2737 2738
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
2739 2740
	if (trace->syscalls.map)
		return trace__set_ev_qualifier_bpf_filter(trace);
2741 2742 2743
	if (trace->syscalls.events.sys_enter)
		return trace__set_ev_qualifier_tp_filter(trace);
	return 0;
2744 2745
}

2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763
/*
 * Insert each of the @npids entries in @pids as a key, with 'true' as the
 * value, into the BPF @map.
 *
 * Returns 0 on success or the first error from bpf_map_update_elem(). It is
 * a no-op returning 0 when built without libbpf support.
 */
static int bpf_map__set_filter_pids(struct bpf_map *map __maybe_unused,
				    size_t npids __maybe_unused, pid_t *pids __maybe_unused)
{
#ifdef HAVE_LIBBPF_SUPPORT
	int map_fd = bpf_map__fd(map);
	bool value = true;
	size_t i;

	for (i = 0; i < npids; ++i) {
		int err = bpf_map_update_elem(map_fd, &pids[i], &value, BPF_ANY);

		if (err)
			return err;
	}
#endif
	return 0;
}

2764 2765
static int trace__set_filter_loop_pids(struct trace *trace)
{
2766
	unsigned int nr = 1, err;
2767 2768 2769
	pid_t pids[32] = {
		getpid(),
	};
2770 2771 2772 2773 2774 2775 2776 2777
	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);

	while (thread && nr < ARRAY_SIZE(pids)) {
		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);

		if (parent == NULL)
			break;

2778 2779
		if (!strcmp(thread__comm_str(parent), "sshd") ||
		    strstarts(thread__comm_str(parent), "gnome-terminal")) {
2780 2781 2782 2783 2784
			pids[nr++] = parent->tid;
			break;
		}
		thread = parent;
	}
2785

2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813
	err = perf_evlist__set_tp_filter_pids(trace->evlist, nr, pids);
	if (!err && trace->filter_pids.map)
		err = bpf_map__set_filter_pids(trace->filter_pids.map, nr, pids);

	return err;
}

/*
 * Install the pid filters: the explicitly requested ones when there are
 * any, otherwise, when no specific thread is being targeted, the ones that
 * avoid feedback loops (see trace__set_filter_loop_pids()).
 *
 * Returns 0 on success or the error from setting up the filters.
 */
static int trace__set_filter_pids(struct trace *trace)
{
	int err;

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0) {
		err = perf_evlist__set_tp_filter_pids(trace->evlist, trace->filter_pids.nr,
						      trace->filter_pids.entries);
		if (err == 0 && trace->filter_pids.map != NULL)
			err = bpf_map__set_filter_pids(trace->filter_pids.map, trace->filter_pids.nr,
						       trace->filter_pids.entries);
		return err;
	}

	if (thread_map__pid(trace->evlist->threads, 0) == -1)
		return trace__set_filter_loop_pids(trace);

	return 0;
}

2816
static int __trace__deliver_event(struct trace *trace, union perf_event *event)
2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_sample sample;
	int err;

	err = perf_evlist__parse_sample(evlist, event, &sample);
	if (err)
		fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
	else
		trace__handle_event(trace, event, &sample);

	return 0;
}

2831
static int __trace__flush_events(struct trace *trace)
J
Jiri Olsa 已提交
2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842
{
	u64 first = ordered_events__first_time(&trace->oe.data);
	u64 flush = trace->oe.last - NSEC_PER_SEC;

	/* Is there some thing to flush.. */
	if (first && first < flush)
		return ordered_events__flush_time(&trace->oe.data, flush);

	return 0;
}

2843 2844 2845 2846 2847
/* Flush queued events, but only when event sorting was requested. */
static int trace__flush_events(struct trace *trace)
{
	if (trace->sort_events)
		return __trace__flush_events(trace);

	return 0;
}

2848
static int trace__deliver_event(struct trace *trace, union perf_event *event)
J
Jiri Olsa 已提交
2849 2850 2851
{
	int err;

2852 2853 2854 2855
	if (!trace->sort_events)
		return __trace__deliver_event(trace, event);

	err = perf_evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last);
J
Jiri Olsa 已提交
2856 2857 2858 2859 2860 2861 2862
	if (err && err != -1)
		return err;

	err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0);
	if (err)
		return err;

2863
	return trace__flush_events(trace);
J
Jiri Olsa 已提交
2864 2865 2866 2867 2868 2869 2870
}

static int ordered_events__deliver_event(struct ordered_events *oe,
					 struct ordered_event *event)
{
	struct trace *trace = container_of(oe, struct trace, oe.data);

2871
	return __trace__deliver_event(trace, event->event);
J
Jiri Olsa 已提交
2872 2873
}

2874
static int trace__run(struct trace *trace, int argc, const char **argv)
A
Arnaldo Carvalho de Melo 已提交
2875
{
2876
	struct perf_evlist *evlist = trace->evlist;
2877
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2878 2879
	int err = -1, i;
	unsigned long before;
2880
	const bool forks = argc > 0;
2881
	bool draining = false;
A
Arnaldo Carvalho de Melo 已提交
2882

2883 2884
	trace->live = true;

2885 2886 2887
	if (!trace->raw_augmented_syscalls) {
		if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
			goto out_error_raw_syscalls;
A
Arnaldo Carvalho de Melo 已提交
2888

2889 2890 2891
		if (trace->trace_syscalls)
			trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
	}
2892

2893 2894 2895 2896
	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
2897
		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2898
		perf_evlist__add(evlist, pgfault_maj);
2899
	}
2900

2901 2902 2903 2904
	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
2905
		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2906 2907
		perf_evlist__add(evlist, pgfault_min);
	}
2908

2909
	if (trace->sched &&
2910 2911 2912
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;
2913

2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941
	/*
	 * If a global cgroup was set, apply it to all the events without an
	 * explicit cgroup. I.e.:
	 *
	 * 	trace -G A -e sched:*switch
	 *
	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
	 * _and_ sched:sched_switch to the 'A' cgroup, while:
	 *
	 * trace -e sched:*switch -G A
	 *
	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
	 * a cgroup (on the root cgroup, sys wide, etc).
	 *
	 * Multiple cgroups:
	 *
	 * trace -G A -e sched:*switch -G B
	 *
	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
	 * to the 'B' cgroup.
	 *
	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
	 */
	if (trace->cgroup)
		evlist__set_default_cgroup(trace->evlist, trace->cgroup);

A
Arnaldo Carvalho de Melo 已提交
2942 2943
	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
2944
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
A
Arnaldo Carvalho de Melo 已提交
2945 2946 2947
		goto out_delete_evlist;
	}

2948 2949
	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
2950
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
2951
		goto out_delete_evlist;
2952 2953
	}

2954
	perf_evlist__config(evlist, &trace->opts, &callchain_param);
2955

2956 2957 2958 2959
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
2960
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2961
						    argv, false, NULL);
2962
		if (err < 0) {
2963
			fprintf(trace->output, "Couldn't run the workload!\n");
2964
			goto out_delete_evlist;
2965 2966 2967
		}
	}

A
Arnaldo Carvalho de Melo 已提交
2968
	err = perf_evlist__open(evlist);
2969 2970
	if (err < 0)
		goto out_error_open;
A
Arnaldo Carvalho de Melo 已提交
2971

2972 2973 2974 2975 2976 2977 2978 2979 2980 2981
	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

2982
	err = trace__set_filter_pids(trace);
2983 2984 2985
	if (err < 0)
		goto out_error_mem;

2986 2987 2988
	if (trace->syscalls.map)
		trace__init_syscalls_bpf_map(trace);

2989 2990 2991 2992 2993
	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

2994 2995 2996 2997
		if (trace->syscalls.events.sys_exit) {
			pr_debug("event qualifier tracepoint filter: %s\n",
				 trace->syscalls.events.sys_exit->filter);
		}
2998
	}
2999

3000 3001 3002
	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;
3003

3004 3005 3006
	if (trace->dump.map)
		bpf_map__fprintf(trace->dump.map, trace->output);

3007
	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
3008 3009
	if (err < 0)
		goto out_error_mmap;
A
Arnaldo Carvalho de Melo 已提交
3010

A
Alexis Berlemont 已提交
3011
	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
3012 3013
		perf_evlist__enable(evlist);

3014 3015 3016
	if (forks)
		perf_evlist__start_workload(evlist);

A
Alexis Berlemont 已提交
3017 3018 3019 3020 3021
	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

3022
	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
3023 3024
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
3025 3026 3027 3028 3029

	/*
	 * Now that we already used evsel->attr to ask the kernel to setup the
	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
3030
	 * to override an explicitly set --max-stack global setting.
3031 3032
	 */
	evlist__for_each_entry(evlist, evsel) {
3033
		if (evsel__has_callchain(evsel) &&
3034 3035 3036
		    evsel->attr.sample_max_stack == 0)
			evsel->attr.sample_max_stack = trace->max_stack;
	}
A
Arnaldo Carvalho de Melo 已提交
3037
again:
3038
	before = trace->nr_events;
A
Arnaldo Carvalho de Melo 已提交
3039 3040 3041

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;
3042
		struct perf_mmap *md;
A
Arnaldo Carvalho de Melo 已提交
3043

3044
		md = &evlist->mmap[i];
3045
		if (perf_mmap__read_init(md) < 0)
3046 3047
			continue;

3048
		while ((event = perf_mmap__read_event(md)) != NULL) {
3049
			++trace->nr_events;
A
Arnaldo Carvalho de Melo 已提交
3050

3051
			err = trace__deliver_event(trace, event);
J
Jiri Olsa 已提交
3052 3053
			if (err)
				goto out_disable;
A
Arnaldo Carvalho de Melo 已提交
3054

3055
			perf_mmap__consume(md);
3056

3057 3058
			if (interrupted)
				goto out_disable;
3059 3060 3061 3062 3063

			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
A
Arnaldo Carvalho de Melo 已提交
3064
		}
3065
		perf_mmap__read_done(md);
A
Arnaldo Carvalho de Melo 已提交
3066 3067
	}

3068
	if (trace->nr_events == before) {
3069
		int timeout = done ? 100 : -1;
3070

3071
		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
3072
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0)
3073 3074
				draining = true;

3075
			goto again;
J
Jiri Olsa 已提交
3076
		} else {
3077
			if (trace__flush_events(trace))
J
Jiri Olsa 已提交
3078
				goto out_disable;
3079
		}
3080 3081
	} else {
		goto again;
3082 3083
	}

3084
out_disable:
3085 3086
	thread__zput(trace->current);

3087
	perf_evlist__disable(evlist);
A
Arnaldo Carvalho de Melo 已提交
3088

3089 3090
	if (trace->sort_events)
		ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
J
Jiri Olsa 已提交
3091

3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103
	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}
3104

A
Arnaldo Carvalho de Melo 已提交
3105
out_delete_evlist:
3106 3107
	trace__symbols__exit(trace);

A
Arnaldo Carvalho de Melo 已提交
3108
	perf_evlist__delete(evlist);
3109
	cgroup__put(trace->cgroup);
3110
	trace->evlist = NULL;
3111
	trace->live = false;
A
Arnaldo Carvalho de Melo 已提交
3112
	return err;
3113 3114
{
	char errbuf[BUFSIZ];
3115

3116
out_error_sched_stat_runtime:
3117
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
3118 3119
	goto out_error;

3120
out_error_raw_syscalls:
3121
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
3122 3123
	goto out_error;

3124 3125 3126 3127
out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

3128 3129 3130 3131
out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
3132
	fprintf(trace->output, "%s\n", errbuf);
3133
	goto out_delete_evlist;
3134 3135 3136 3137 3138

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
3139
		str_error_r(errno, errbuf, sizeof(errbuf)));
3140
	goto out_delete_evlist;
A
Arnaldo Carvalho de Melo 已提交
3141
}
3142 3143 3144
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;
3145 3146 3147 3148

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
3149
}
A
Arnaldo Carvalho de Melo 已提交
3150

3151 3152 3153
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
3154
		{ "probe:vfs_getname",	     trace__vfs_getname, },
3155
	};
3156
	struct perf_data data = {
J
Jiri Olsa 已提交
3157 3158 3159 3160 3161
		.file      = {
			.path = input_name,
		},
		.mode      = PERF_DATA_MODE_READ,
		.force     = trace->force,
3162
	};
3163
	struct perf_session *session;
3164
	struct perf_evsel *evsel;
3165 3166 3167 3168
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
D
David Ahern 已提交
3169
	trace->tool.mmap2	  = perf_event__process_mmap2;
3170 3171 3172 3173
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
3174
	trace->tool.tracing_data  = perf_event__process_tracing_data;
3175
	trace->tool.build_id	  = perf_event__process_build_id;
3176
	trace->tool.namespaces	  = perf_event__process_namespaces;
3177

3178
	trace->tool.ordered_events = true;
3179 3180 3181 3182 3183
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

3184
	session = perf_session__new(&data, false, &trace->tool);
3185
	if (session == NULL)
3186
		return -1;
3187

3188 3189 3190 3191 3192 3193
	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

3194
	if (symbol__init(&session->header.env) < 0)
3195 3196
		goto out;

3197 3198
	trace->host = &session->machines.host;

3199 3200 3201 3202
	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

3203 3204
	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
3205 3206 3207 3208
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");
3209

3210
	if (evsel &&
3211
	    (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
3212
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
3213 3214 3215 3216 3217 3218
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
3219 3220 3221
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
3222
	if (evsel &&
3223
	    (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
3224
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
3225
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
3226 3227 3228
		goto out;
	}

3229
	evlist__for_each_entry(session->evlist, evsel) {
3230 3231 3232 3233 3234 3235 3236
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

3237 3238
	setup_pager();

3239
	err = perf_session__process_events(session);
3240 3241 3242
	if (err)
		pr_err("Failed to process events, error %d", err);

3243 3244 3245
	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

3246 3247 3248 3249 3250 3251
out:
	perf_session__delete(session);

	return err;
}

3252 3253 3254 3255
/* Print the heading of the per-thread summary; returns fprintf()'s result. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}

3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274
/*
 * Resorted view of per-syscall stats. The expression handed to
 * DEFINE_RESORT_RB compares two entries by their ->msecs field; each entry
 * carries the stats pointer, the computed msecs and the syscall id.
 * NOTE(review): the resulting order depends on how DEFINE_RESORT_RB uses the
 * comparison - confirm in rb_resort.h.
 *
 * The body fills one entry from the int_node found at 'nd'.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* Sample count times the average scaled by NSEC_PER_MSEC; 0 without stats. */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}

3275 3276 3277 3278 3279
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
3280 3281
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
3282

3283
	if (syscall_stats == NULL)
3284 3285 3286 3287
		return 0;

	printed += fprintf(fp, "\n");

3288 3289 3290
	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
3291

3292
	resort_rb__for_each_entry(nd, syscall_stats) {
3293
		struct stats *stats = syscall_stats_entry->stats;
3294 3295 3296 3297 3298 3299 3300 3301 3302 3303
		if (stats) {
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

3304
			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
3305
			printed += fprintf(fp, "   %-15s", sc->name);
3306
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
3307
					   n, syscall_stats_entry->msecs, min, avg);
P
Pekka Enberg 已提交
3308
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
3309 3310 3311
		}
	}

3312
	resort_rb__delete(syscall_stats);
3313
	printed += fprintf(fp, "\n\n");
3314 3315 3316 3317

	return printed;
}

3318
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
3319
{
3320
	size_t printed = 0;
3321
	struct thread_trace *ttrace = thread__priv(thread);
3322 3323 3324 3325 3326 3327 3328
	double ratio;

	if (ttrace == NULL)
		return 0;

	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

3329
	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
3330
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
3331
	printed += fprintf(fp, "%.1f%%", ratio);
3332 3333 3334 3335
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
3336 3337 3338 3339 3340
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)
		++printed;

3341
	printed += thread__dump_stats(ttrace, trace, fp);
3342

3343 3344
	return printed;
}
3345

3346 3347 3348 3349 3350 3351 3352 3353 3354 3355
/* Number of events accounted to a thread, 0 when it has no trace data. */
static unsigned long thread__nr_events(struct thread_trace *ttrace)
{
	if (ttrace == NULL)
		return 0;

	return ttrace->nr_events;
}

/*
 * Resorted view of threads, compared by their number of events; the body
 * stores in each entry the thread that embeds the rb_node at 'nd'.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	struct thread *owner = rb_entry(nd, struct thread, rb_node);

	entry->thread = owner;
}

3358 3359
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
3360 3361
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
3362
	int i;
3363

3364 3365
	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
3366

3367 3368 3369 3370
		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}
3371

3372 3373
		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
3374

3375 3376
		resort_rb__delete(threads);
	}
3377
	return printed;
3378 3379
}

3380 3381 3382 3383 3384 3385 3386 3387 3388
static int trace__set_duration(const struct option *opt, const char *str,
			       int unset __maybe_unused)
{
	struct trace *trace = opt->value;

	trace->duration_filter = atof(str);
	return 0;
}

3389 3390
/*
 * --filter-pids callback: turn the CSV list in 'str' into the array kept in
 * trace->filter_pids, with this process' own pid always in the first slot.
 *
 * trace->filter_pids is only updated once the new array is fully built, so
 * a failure leaves the previous state untouched. Returns 0 on success, -1
 * on failure.
 */
static int trace__set_filter_pids_from_option(const struct option *opt, const char *str,
					      int unset __maybe_unused)
{
	int ret = -1;
	size_t i, nr;
	pid_t *entries;
	struct trace *trace = opt->value;
	/*
	 * FIXME: introduce a intarray class, plain parse csv and create a
	 * { int nr, int entries[] } struct...
	 */
	struct intlist *list = intlist__new(str);

	if (list == NULL)
		return -1;

	/* One extra slot for our own pid. */
	nr = intlist__nr_entries(list) + 1;
	entries = calloc(nr, sizeof(*entries));
	if (entries == NULL)
		goto out;

	entries[0] = getpid();

	for (i = 1; i < nr; ++i)
		entries[i] = intlist__entry(list, i - 1)->i;

	/* Release the array of an earlier --filter-pids, if any. */
	free(trace->filter_pids.entries);
	trace->filter_pids.entries = entries;
	trace->filter_pids.nr = nr;
	ret = 0;
out:
	/* The list is only needed while building the array. */
	intlist__delete(list);
	return ret;
}

3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437
/*
 * Open 'filename' for writing as trace->output. An existing non-empty file
 * is first renamed to "<filename>.old", replacing any previous backup.
 * Returns 0 on success, -errno if the file can't be opened.
 */
static int trace__open_output(struct trace *trace, const char *filename)
{
	struct stat st;
	bool has_content = stat(filename, &st) == 0 && st.st_size != 0;

	if (has_content) {
		char backup[PATH_MAX];

		scnprintf(backup, sizeof(backup), "%s.old", filename);
		unlink(backup);
		rename(filename, backup);
	}

	trace->output = fopen(filename, "w");
	if (trace->output == NULL)
		return -errno;

	return 0;
}

3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454
static int parse_pagefaults(const struct option *opt, const char *str,
			    int unset __maybe_unused)
{
	int *trace_pgfaults = opt->value;

	if (strcmp(str, "all") == 0)
		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
	else if (strcmp(str, "maj") == 0)
		*trace_pgfaults |= TRACE_PFMAJ;
	else if (strcmp(str, "min") == 0)
		*trace_pgfaults |= TRACE_PFMIN;
	else
		return -1;

	return 0;
}

3455 3456 3457 3458
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

3459
	evlist__for_each_entry(evlist, evsel)
3460 3461 3462
		evsel->handler = handler;
}

3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492
/*
 * For each "syscalls" tracepoint in the list that has a format and no
 * private data yet, initialize its syscall_tp and then the 'args' pointer
 * field (sys_enter_*) or the 'ret' field (sys_exit_*), both located one u64
 * after the id field. Returns 0 on success, -1 on the first failure.
 */
static int evlist__set_syscall_tp_fields(struct perf_evlist *evlist)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel) {
		struct syscall_tp *sc;
		const char *tp_name;

		if (evsel->priv || !evsel->tp_format)
			continue;

		if (strcmp(evsel->tp_format->system, "syscalls") != 0)
			continue;

		if (perf_evsel__init_syscall_tp(evsel))
			return -1;

		sc = evsel->priv;
		tp_name = evsel->tp_format->name;

		if (strncmp(tp_name, "sys_enter_", 10) == 0) {
			if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
				return -1;
		} else if (strncmp(tp_name, "sys_exit_", 9) == 0) {
			if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
				return -1;
		}
	}

	return 0;
}

3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506
/*
 * XXX: Hackish, just splitting the combined -e/--event argument into syscalls
 * (raw_syscalls:sys_{enter,exit}) and events (tracepoints, HW, SW, etc) to use
 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
 *
 * It'd be better to introduce a parse_options() variant that would return a
 * list with the terms it didn't match to an event...
 */
/*
 * -e/--event/--expr callback: split the comma separated 'str' in two lists.
 * Terms that are syscall names or globs, syscall aliases, or names of
 * readable files in the strace groups directory go to lists[1] and become
 * trace->ev_qualifier; everything else goes to lists[0] and is handed to
 * parse_events_option(). A leading '!' sets trace->not_ev_qualifier.
 *
 * 'str' is temporarily modified while being split and is restored before
 * returning. Returns 0 on success, non-zero on failure.
 */
static int trace__parse_events_option(const struct option *opt, const char *str,
				      int unset __maybe_unused)
{
	struct trace *trace = (struct trace *)opt->value;
	const char *s = str;
	char *sep = NULL, *lists[2] = { NULL, NULL, };
	int err = -1, list, idx;
	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
	char group_name[PATH_MAX];
	struct syscall_fmt *fmt;
	size_t used, needed;
	char *grown;

	if (strace_groups_dir == NULL)
		return -1;

	if (*s == '!') {
		++s;
		trace->not_ev_qualifier = true;
	}

	while (1) {
		if ((sep = strchr(s, ',')) != NULL)
			*sep = '\0';

		list = 0;
		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
			list = 1;
			goto do_concat;
		}

		fmt = syscall_fmt__find_by_alias(s);
		if (fmt != NULL) {
			list = 1;
			s = fmt->name;
		} else {
			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
			if (access(group_name, R_OK) == 0)
				list = 1;
		}
do_concat:
		/*
		 * Grow the list for each term instead of sizing it from
		 * strlen(str): an alias may be replaced by a longer name, so
		 * the result can be longer than the input.
		 * 'used' counts the existing text plus the ',' separator.
		 */
		used = lists[list] ? strlen(lists[list]) + 1 : 0;
		needed = used + strlen(s) + 1;
		grown = realloc(lists[list], needed);
		if (grown == NULL)
			goto out;
		lists[list] = grown;
		if (used)
			grown[used - 1] = ',';
		strcpy(grown + used, s);

		if (!sep)
			break;

		*sep = ',';
		s = sep + 1;
	}

	if (lists[1] != NULL) {
		struct strlist_config slist_config = {
			.dirname = strace_groups_dir,
		};

		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
		if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier", trace->output);
			goto out;
		}

		if (trace__validate_ev_qualifier(trace))
			goto out;
		trace->trace_syscalls = true;
	}

	err = 0;

	if (lists[0]) {
		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
					       "event selector. use 'perf list' to list available events",
					       parse_events_option);
		err = parse_events_option(&o, lists[0], 0);
	}
out:
	/* The temporary strings are not needed past this point. */
	free(strace_groups_dir);
	free(lists[0]);
	free(lists[1]);

	if (sep)
		*sep = ',';

	return err;
}

3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599
/*
 * -G callback: with no events in the evlist yet, remember the cgroup in
 * trace->cgroup; otherwise defer to parse_cgroups().
 */
static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
{
	struct trace *trace = opt->value;

	if (list_empty(&trace->evlist->entries)) {
		trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
		return 0;
	}

	return parse_cgroups(opt, str, unset);
}

3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618
/*
 * Look for a map called 'name' in each BPF object visited by
 * bpf_object__for_each_safe(). Returns the first match or NULL.
 */
static struct bpf_map *bpf__find_map_by_name(const char *name)
{
	struct bpf_object *obj, *next;
	struct bpf_map *found = NULL;

	bpf_object__for_each_safe(obj, next) {
		found = bpf_object__find_map_by_name(obj, name);
		if (found != NULL)
			break;
	}

	return found;
}

/* Cache the "pids_filtered" BPF map (NULL when not found). */
static void trace__set_bpf_map_filtered_pids(struct trace *trace)
{
	struct bpf_map *pids_map = bpf__find_map_by_name("pids_filtered");

	trace->filter_pids.map = pids_map;
}

3619 3620 3621 3622 3623
/* Cache the "syscalls" BPF map (NULL when not found). */
static void trace__set_bpf_map_syscalls(struct trace *trace)
{
	struct bpf_map *syscalls_map = bpf__find_map_by_name("syscalls");

	trace->syscalls.map = syscalls_map;
}

3624 3625
static int trace__config(const char *var, const char *value, void *arg)
{
3626
	struct trace *trace = arg;
3627 3628 3629 3630 3631 3632 3633
	int err = 0;

	if (!strcmp(var, "trace.add_events")) {
		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
					       "event selector. use 'perf list' to list available events",
					       parse_events_option);
		err = parse_events_option(&o, value, 0);
3634 3635
	} else if (!strcmp(var, "trace.show_timestamp")) {
		trace->show_tstamp = perf_config_bool(var, value);
3636 3637
	} else if (!strcmp(var, "trace.show_duration")) {
		trace->show_duration = perf_config_bool(var, value);
3638 3639 3640 3641
	} else if (!strcmp(var, "trace.show_arg_names")) {
		trace->show_arg_names = perf_config_bool(var, value);
		if (!trace->show_arg_names)
			trace->show_zeros = true;
3642
	} else if (!strcmp(var, "trace.show_zeros")) {
3643 3644 3645 3646 3647 3648
		bool new_show_zeros = perf_config_bool(var, value);
		if (!trace->show_arg_names && !new_show_zeros) {
			pr_warning("trace.show_zeros has to be set when trace.show_arg_names=no\n");
			goto out;
		}
		trace->show_zeros = new_show_zeros;
3649 3650
	} else if (!strcmp(var, "trace.show_prefix")) {
		trace->show_string_prefix = perf_config_bool(var, value);
3651 3652
	} else if (!strcmp(var, "trace.no_inherit")) {
		trace->opts.no_inherit = perf_config_bool(var, value);
3653 3654 3655 3656
	} else if (!strcmp(var, "trace.args_alignment")) {
		int args_alignment = 0;
		if (perf_config_int(&args_alignment, var, value) == 0)
			trace->args_alignment = args_alignment;
3657
	}
3658
out:
3659 3660 3661
	return err;
}

3662
int cmd_trace(int argc, const char **argv)
A
Arnaldo Carvalho de Melo 已提交
3663
{
3664
	const char *trace_usage[] = {
3665 3666
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
D
David Ahern 已提交
3667 3668
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
A
Arnaldo Carvalho de Melo 已提交
3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681
		NULL
	};
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
3682
			.no_buffering  = true,
3683
			.mmap_pages    = UINT_MAX,
A
Arnaldo Carvalho de Melo 已提交
3684
		},
3685
		.output = stderr,
3686
		.show_comm = true,
3687
		.show_tstamp = true,
3688
		.show_duration = true,
3689
		.show_arg_names = true,
3690
		.args_alignment = 70,
3691
		.trace_syscalls = false,
3692
		.kernel_syscallchains = false,
3693
		.max_stack = UINT_MAX,
3694
		.max_events = ULONG_MAX,
A
Arnaldo Carvalho de Melo 已提交
3695
	};
3696
	const char *map_dump_str = NULL;
3697
	const char *output_name = NULL;
A
Arnaldo Carvalho de Melo 已提交
3698
	const struct option trace_options[] = {
3699 3700 3701
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
3702 3703
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
3704
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3705 3706
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
3707
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
3708
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
A
Arnaldo Carvalho de Melo 已提交
3709 3710
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
3711
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
A
Arnaldo Carvalho de Melo 已提交
3712
		    "trace events on existing thread id"),
3713
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3714
		     "pids to filter (by the kernel)", trace__set_filter_pids_from_option),
3715
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
A
Arnaldo Carvalho de Melo 已提交
3716
		    "system-wide collection from all CPUs"),
3717
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
A
Arnaldo Carvalho de Melo 已提交
3718
		    "list of cpus to monitor"),
3719
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
A
Arnaldo Carvalho de Melo 已提交
3720
		    "child tasks do not inherit counters"),
3721 3722 3723
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
3724
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
A
Arnaldo Carvalho de Melo 已提交
3725
		   "user to profile"),
3726 3727 3728
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
3729 3730 3731
#ifdef HAVE_LIBBPF_SUPPORT
	OPT_STRING(0, "map-dump", &map_dump_str, "BPF map", "BPF map to periodically dump"),
#endif
3732
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3733
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3734 3735
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
3736 3737
	OPT_BOOLEAN(0, "failure", &trace.failure_only,
		    "Show only syscalls that failed"),
D
David Ahern 已提交
3738 3739 3740 3741
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
3742 3743
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
3744
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3745
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3746 3747 3748
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
3749 3750
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
3751 3752
	OPT_ULONG(0, "max-events", &trace.max_events,
		"Set the maximum number of events to print, exit after that is reached. "),
3753 3754 3755
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
3756 3757 3758
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
3759
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3760 3761
	OPT_BOOLEAN(0, "sort-events", &trace.sort_events,
			"Sort batch of events before processing, use if getting out of order events"),
3762 3763
	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3764
	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3765
			"per thread proc mmap processing timeout in ms"),
3766 3767
	OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
		     trace__parse_cgroups),
A
Alexis Berlemont 已提交
3768 3769 3770
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
A
Arnaldo Carvalho de Melo 已提交
3771 3772
	OPT_END()
	};
3773
	bool __maybe_unused max_stack_user_set = true;
3774
	bool mmap_pages_user_set = true;
3775
	struct perf_evsel *evsel;
3776
	const char * const trace_subcommands[] = { "record", NULL };
3777
	int err = -1;
3778
	char bf[BUFSIZ];
A
Arnaldo Carvalho de Melo 已提交
3779

3780 3781 3782
	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

3783
	trace.evlist = perf_evlist__new();
3784
	trace.sctbl = syscalltbl__new();
3785

3786
	if (trace.evlist == NULL || trace.sctbl == NULL) {
3787
		pr_err("Not enough memory to run!\n");
3788
		err = -ENOMEM;
3789 3790 3791
		goto out;
	}

3792 3793 3794 3795
	err = perf_config(trace__config, &trace);
	if (err)
		goto out;

3796 3797
	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
D
David Ahern 已提交
3798

3799 3800 3801 3802 3803
	if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
		usage_with_options_msg(trace_usage, trace_options,
				       "cgroup monitoring only available in system-wide mode");
	}

3804 3805 3806
	evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__");
	if (IS_ERR(evsel)) {
		bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf));
3807 3808 3809 3810
		pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf);
		goto out;
	}

3811
	if (evsel) {
3812
		trace.syscalls.events.augmented = evsel;
3813
		trace__set_bpf_map_filtered_pids(&trace);
3814
		trace__set_bpf_map_syscalls(&trace);
3815
	}
3816

3817 3818 3819 3820 3821 3822 3823
	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

3824 3825
	err = -1;

3826 3827 3828 3829 3830 3831 3832 3833
	if (map_dump_str) {
		trace.dump.map = bpf__find_map_by_name(map_dump_str);
		if (trace.dump.map == NULL) {
			pr_err("ERROR: BPF map \"%s\" not found\n", map_dump_str);
			goto out;
		}
	}

3834 3835 3836 3837 3838
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

3839 3840 3841
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

3842
	if (trace.max_stack == UINT_MAX) {
3843
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
3844 3845 3846 3847
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
3848
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3849
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3850
	}
3851 3852
#endif

3853
	if (callchain_param.enabled) {
3854 3855 3856
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

3857
		symbol_conf.use_callchain = true;
3858
	}
3859

3860
	if (trace.evlist->nr_entries > 0) {
3861
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3862 3863 3864 3865 3866
		if (evlist__set_syscall_tp_fields(trace.evlist)) {
			perror("failed to set syscalls:* tracepoint fields");
			goto out;
		}
	}
3867

3868 3869 3870 3871
	if (trace.sort_events) {
		ordered_events__init(&trace.oe.data, ordered_events__deliver_event, &trace);
		ordered_events__set_copy_on_queue(&trace.oe.data, true);
	}
J
Jiri Olsa 已提交
3872

3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885
	/*
	 * If we are augmenting syscalls, then combine what we put in the
	 * __augmented_syscalls__ BPF map with what is in the
	 * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF,
	 * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit.
	 *
	 * We'll switch to look at two BPF maps, one for sys_enter and the
	 * other for sys_exit when we start augmenting the sys_exit paths with
	 * buffers that are being copied from kernel to userspace, think 'read'
	 * syscall.
	 */
	if (trace.syscalls.events.augmented) {
		evlist__for_each_entry(trace.evlist, evsel) {
3886 3887 3888 3889 3890 3891 3892
			bool raw_syscalls_sys_exit = strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_exit") == 0;

			if (raw_syscalls_sys_exit) {
				trace.raw_augmented_syscalls = true;
				goto init_augmented_syscall_tp;
			}

3893 3894
			if (trace.syscalls.events.augmented->priv == NULL &&
			    strstr(perf_evsel__name(evsel), "syscalls:sys_enter")) {
3895 3896 3897 3898 3899 3900 3901
				struct perf_evsel *augmented = trace.syscalls.events.augmented;
				if (perf_evsel__init_augmented_syscall_tp(augmented, evsel) ||
				    perf_evsel__init_augmented_syscall_tp_args(augmented))
					goto out;
				augmented->handler = trace__sys_enter;
			}

3902
			if (strstarts(perf_evsel__name(evsel), "syscalls:sys_exit_")) {
3903
				struct syscall_tp *sc;
3904
init_augmented_syscall_tp:
3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928
				if (perf_evsel__init_augmented_syscall_tp(evsel, evsel))
					goto out;
				sc = evsel->priv;
				/*
				 * For now with BPF raw_augmented we hook into
				 * raw_syscalls:sys_enter and there we get all
				 * 6 syscall args plus the tracepoint common
				 * fields and the syscall_nr (another long).
				 * So we check if that is the case and if so
				 * don't look after the sc->args_size but
				 * always after the full raw_syscalls:sys_enter
				 * payload, which is fixed.
				 *
				 * We'll revisit this later to pass
				 * s->args_size to the BPF augmenter (now
				 * tools/perf/examples/bpf/augmented_raw_syscalls.c),
				 * so that it copies only what we need for each
				 * syscall, like what happens when we use
				 * syscalls:sys_enter_NAME, so that we reduce
				 * the kernel/userspace traffic to just what is
				 * needed for each syscall.
				 */
				if (trace.raw_augmented_syscalls)
					trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset;
3929 3930 3931 3932 3933 3934
				perf_evsel__init_augmented_syscall_tp_ret(evsel);
				evsel->handler = trace__sys_exit;
			}
		}
	}

3935 3936 3937 3938 3939 3940 3941
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

3942 3943
	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
3944
		trace.trace_syscalls = true;
3945 3946
	}

3947 3948 3949 3950 3951 3952 3953 3954
	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

3955
	err = target__validate(&trace.opts.target);
3956
	if (err) {
3957
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3958 3959
		fprintf(trace.output, "%s", bf);
		goto out_close;
3960 3961
	}

3962
	err = target__parse_uid(&trace.opts.target);
A
Arnaldo Carvalho de Melo 已提交
3963
	if (err) {
3964
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3965 3966
		fprintf(trace.output, "%s", bf);
		goto out_close;
A
Arnaldo Carvalho de Melo 已提交
3967 3968
	}

3969
	if (!argc && target__none(&trace.opts.target))
3970 3971
		trace.opts.target.system_wide = true;

3972 3973 3974 3975
	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);
3976

3977 3978 3979 3980
out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
3981
	return err;
A
Arnaldo Carvalho de Melo 已提交
3982
}