// SPDX-License-Identifier: GPL-2.0
/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#include "builtin.h"

#include "util/build-id.h"
#include <subcmd/parse-options.h>
#include "util/parse-events.h"
#include "util/config.h"

#include "util/callchain.h"
#include "util/cgroup.h"
#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/mmap.h"
#include "util/target.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/symbol.h"
#include "util/record.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/data.h"
#include "util/perf_regs.h"
#include "util/auxtrace.h"
#include "util/tsc.h"
#include "util/parse-branch-options.h"
#include "util/parse-regs-options.h"
#include "util/perf_api_probe.h"
#include "util/llvm-utils.h"
#include "util/bpf-loader.h"
#include "util/trigger.h"
#include "util/perf-hooks.h"
#include "util/cpu-set-sched.h"
#include "util/synthetic-events.h"
#include "util/time-utils.h"
#include "util/units.h"
#include "util/bpf-event.h"
#include "util/util.h"
#include "asm/bug.h"
#include "perf.h"

#include <errno.h>
#include <inttypes.h>
#include <locale.h>
#include <poll.h>
#include <pthread.h>
#include <unistd.h>
#include <sched.h>
#include <signal.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <linux/err.h>
#include <linux/string.h>
#include <linux/time64.h>
#include <linux/zalloc.h>
#include <linux/bitmap.h>

struct switch_output {
	bool		 enabled;
	bool		 signal;
	unsigned long	 size;
	unsigned long	 time;
	const char	*str;
	bool		 set;
	char		 **filenames;
	int		 num_files;
	int		 cur_file;
};

struct record {
	struct perf_tool	tool;
	struct record_opts	opts;
	u64			bytes_written;
	struct perf_data	data;
	struct auxtrace_record	*itr;
	struct evlist	*evlist;
	struct perf_session	*session;
	struct evlist		*sb_evlist;
	pthread_t		thread_id;
	int			realtime_prio;
	bool			switch_output_event_set;
	bool			no_buildid;
	bool			no_buildid_set;
	bool			no_buildid_cache;
	bool			no_buildid_cache_set;
	bool			buildid_all;
	bool			timestamp_filename;
	bool			timestamp_boundary;
	struct switch_output	switch_output;
	unsigned long long	samples;
	struct mmap_cpu_mask	affinity_mask;
	unsigned long		output_max_size;	/* = 0: unlimited */
};

static volatile int done;

static volatile int auxtrace_record__snapshot_started;
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);

static const char *affinity_tags[PERF_AFFINITY_MAX] = {
	"SYS", "NODE", "CPU"
};

static bool switch_output_signal(struct record *rec)
{
	return rec->switch_output.signal &&
	       trigger_is_ready(&switch_output_trigger);
}

static bool switch_output_size(struct record *rec)
{
	return rec->switch_output.size &&
	       trigger_is_ready(&switch_output_trigger) &&
	       (rec->bytes_written >= rec->switch_output.size);
}

static bool switch_output_time(struct record *rec)
{
	return rec->switch_output.time &&
	       trigger_is_ready(&switch_output_trigger);
}

static bool record__output_max_size_exceeded(struct record *rec)
{
	return rec->output_max_size &&
	       (rec->bytes_written >= rec->output_max_size);
}

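/*
 * Synthesized side-band events and (in the default, non-AIO mode) the
 * payloads drained from the mmapped ring buffers go through record__write():
 * it accounts the bytes in rec->bytes_written, stops the session once the
 * --max-size limit is exceeded and arms the switch-output trigger when the
 * configured output size is reached.
 */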
static int record__write(struct record *rec, struct mmap *map __maybe_unused,
			 void *bf, size_t size)
{
	struct perf_data_file *file = &rec->session->data->file;

	if (perf_data_file__write(file, bf, size) < 0) {
		pr_err("failed to write perf data, error: %m\n");
		return -1;
	}

	rec->bytes_written += size;

	if (record__output_max_size_exceeded(rec) && !done) {
		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
				" stopping session ]\n",
				rec->bytes_written >> 10);
		done = 1;
	}

	if (switch_output_size(rec))
		trigger_hit(&switch_output_trigger);

	return 0;
}

static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size);

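/*
 * With --aio, data taken out of the kernel ring buffers is first copied
 * (or compressed) into per-mmap aio.data[] buffers and then queued to the
 * output file with POSIX aio_write(), so draining the ring buffers does
 * not have to wait for the disk I/O to complete.
 */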
#ifdef HAVE_AIO_SUPPORT
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	do {
		rc = aio_write(cblock);
		if (rc == 0) {
			break;
		} else if (errno != EAGAIN) {
			cblock->aio_fildes = -1;
			pr_err("failed to queue perf data, error: %m\n");
			break;
		}
	} while (1);

	return rc;
}

static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
{
	void *rem_buf;
	off_t rem_off;
	size_t rem_size;
	int rc, aio_errno;
	ssize_t aio_ret, written;

	aio_errno = aio_error(cblock);
	if (aio_errno == EINPROGRESS)
		return 0;

	written = aio_ret = aio_return(cblock);
	if (aio_ret < 0) {
		if (aio_errno != EINTR)
			pr_err("failed to write perf data, error: %m\n");
		written = 0;
	}

	rem_size = cblock->aio_nbytes - written;

	if (rem_size == 0) {
		cblock->aio_fildes = -1;
		/*
		 * md->refcount is incremented in record__aio_pushfn() for
		 * every aio write request started in record__aio_push() so
		 * decrement it because the request is now complete.
		 */
		perf_mmap__put(&md->core);
		rc = 1;
	} else {
		/*
		 * aio write request may require a restart with the
		 * remainder if the kernel didn't write the whole
		 * chunk at once.
		 */
		rem_off = cblock->aio_offset + written;
		rem_buf = (void *)(cblock->aio_buf + written);
		record__aio_write(cblock, cblock->aio_fildes,
				rem_buf, rem_size, rem_off);
		rc = 0;
	}

	return rc;
}

static int record__aio_sync(struct mmap *md, bool sync_all)
{
	struct aiocb **aiocb = md->aio.aiocb;
	struct aiocb *cblocks = md->aio.cblocks;
	struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
	int i, do_suspend;

	do {
		do_suspend = 0;
		for (i = 0; i < md->aio.nr_cblocks; ++i) {
			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
				if (sync_all)
					aiocb[i] = NULL;
				else
					return i;
			} else {
				/*
				 * Started aio write is not complete yet
				 * so it has to be waited for before the
				 * next allocation.
				 */
				aiocb[i] = &cblocks[i];
				do_suspend = 1;
			}
		}
		if (!do_suspend)
			return -1;

		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
			if (!(errno == EAGAIN || errno == EINTR))
				pr_err("failed to sync perf data, error: %m\n");
		}
	} while (1);
}

struct record_aio {
	struct record	*rec;
	void		*data;
	size_t		size;
};

static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
{
	struct record_aio *aio = to;

	/*
	 * map->core.base data pointed to by buf is copied into a free map->aio.data[]
	 * buffer to release space in the kernel buffer as fast as possible, calling
	 * perf_mmap__consume() from the perf_mmap__push() function.
	 *
	 * That lets the kernel proceed with storing more profiling data into
	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
	 *
	 * Copying can be done in two steps in case the chunk of profiling data
	 * crosses the upper bound of the kernel buffer. In this case we first move
	 * part of the data from map->start till the upper bound and then the remainder
	 * from the beginning of the kernel buffer till the end of the data chunk.
	 */

	if (record__comp_enabled(aio->rec)) {
		size = zstd_compress(aio->rec->session, aio->data + aio->size,
				     mmap__mmap_len(map) - aio->size,
				     buf, size);
	} else {
		memcpy(aio->data + aio->size, buf, size);
	}

	if (!aio->size) {
		/*
		 * Increment map->refcount to guard map->aio.data[] buffer
		 * from premature deallocation because map object can be
		 * released earlier than aio write request started on
		 * map->aio.data[] buffer is complete.
		 *
		 * perf_mmap__put() is done at record__aio_complete()
		 * after started aio request completion or at record__aio_push()
		 * if the request failed to start.
		 */
		perf_mmap__get(&map->core);
	}

	aio->size += size;

	return size;
}

static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
{
	int ret, idx;
	int trace_fd = rec->session->data->file.fd;
	struct record_aio aio = { .rec = rec, .size = 0 };

	/*
	 * Call record__aio_sync() to wait till map->aio.data[] buffer
	 * becomes available after the previous aio write operation.
	 */

	idx = record__aio_sync(map, false);
	aio.data = map->aio.data[idx];
	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
		return ret;

	rec->samples++;
	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
	if (!ret) {
		*off += aio.size;
		rec->bytes_written += aio.size;
		if (switch_output_size(rec))
			trigger_hit(&switch_output_trigger);
	} else {
		/*
		 * Decrement map->refcount incremented in record__aio_pushfn()
		 * back if record__aio_write() operation failed to start, otherwise
		 * map->refcount is decremented in record__aio_complete() after
		 * aio write operation finishes successfully.
		 */
		perf_mmap__put(&map->core);
	}

	return ret;
}

static off_t record__aio_get_pos(int trace_fd)
{
	return lseek(trace_fd, 0, SEEK_CUR);
}

static void record__aio_set_pos(int trace_fd, off_t pos)
{
	lseek(trace_fd, pos, SEEK_SET);
}

static void record__aio_mmap_read_sync(struct record *rec)
{
	int i;
	struct evlist *evlist = rec->evlist;
	struct mmap *maps = evlist->mmap;

	if (!record__aio_enabled(rec))
		return;

	for (i = 0; i < evlist->core.nr_mmaps; i++) {
		struct mmap *map = &maps[i];

		if (map->core.base)
			record__aio_sync(map, true);
	}
}

static int nr_cblocks_default = 1;
static int nr_cblocks_max = 4;

static int record__aio_parse(const struct option *opt,
			     const char *str,
			     int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;

	if (unset) {
		opts->nr_cblocks = 0;
	} else {
		if (str)
			opts->nr_cblocks = strtol(str, NULL, 0);
		if (!opts->nr_cblocks)
			opts->nr_cblocks = nr_cblocks_default;
	}

	return 0;
}
#else /* HAVE_AIO_SUPPORT */
static int nr_cblocks_max = 0;

static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
			    off_t *off __maybe_unused)
{
	return -1;
}

static off_t record__aio_get_pos(int trace_fd __maybe_unused)
{
	return -1;
}

static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
{
}

static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
{
}
#endif

static int record__aio_enabled(struct record *rec)
{
	return rec->opts.nr_cblocks > 0;
}

#define MMAP_FLUSH_DEFAULT 1
static int record__mmap_flush_parse(const struct option *opt,
				    const char *str,
				    int unset)
{
	int flush_max;
	struct record_opts *opts = (struct record_opts *)opt->value;
	static struct parse_tag tags[] = {
			{ .tag  = 'B', .mult = 1       },
			{ .tag  = 'K', .mult = 1 << 10 },
			{ .tag  = 'M', .mult = 1 << 20 },
			{ .tag  = 'G', .mult = 1 << 30 },
			{ .tag  = 0 },
	};

	if (unset)
		return 0;

	if (str) {
		opts->mmap_flush = parse_tag_value(str, tags);
		if (opts->mmap_flush == (int)-1)
			opts->mmap_flush = strtol(str, NULL, 0);
	}

	if (!opts->mmap_flush)
		opts->mmap_flush = MMAP_FLUSH_DEFAULT;

	flush_max = evlist__mmap_size(opts->mmap_pages);
	flush_max /= 4;
	if (opts->mmap_flush > flush_max)
		opts->mmap_flush = flush_max;

	return 0;
}

#ifdef HAVE_ZSTD_SUPPORT
static unsigned int comp_level_default = 1;

static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = opt->value;

	if (unset) {
		opts->comp_level = 0;
	} else {
		if (str)
			opts->comp_level = strtol(str, NULL, 0);
		if (!opts->comp_level)
			opts->comp_level = comp_level_default;
	}

	return 0;
}
#endif
static unsigned int comp_level_max = 22;

static int record__comp_enabled(struct record *rec)
{
	return rec->opts.comp_level > 0;
}

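/*
 * Synthesized (side-band) events are written straight to the output file
 * through process_synthesized_event().  process_locked_synthesized_event()
 * is the variant used when synthesis runs on several threads
 * (--num-thread-synthesize), serializing the writes with a mutex.
 */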
static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	struct record *rec = container_of(tool, struct record, tool);
	return record__write(rec, NULL, event, event->header.size);
}

static int process_locked_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
	int ret;

	pthread_mutex_lock(&synth_lock);
	ret = process_synthesized_event(tool, event, sample, machine);
	pthread_mutex_unlock(&synth_lock);
	return ret;
}

static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
{
	struct record *rec = to;

	if (record__comp_enabled(rec)) {
		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
		bf   = map->data;
	}

	rec->samples++;
	return record__write(rec, map, bf, size);
}

static volatile int signr = -1;
static volatile int child_finished;

static void sig_handler(int sig)
{
	if (sig == SIGCHLD)
		child_finished = 1;
	else
		signr = sig;

	done = 1;
}

static void sigsegv_handler(int sig)
{
	perf_hooks__recover();
	sighandler_dump_stack(sig);
}

static void record__sig_exit(void)
{
	if (signr == -1)
		return;

	signal(signr, SIG_DFL);
	raise(signr);
}

#ifdef HAVE_AUXTRACE_SUPPORT

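/*
 * AUX area data is handed over as up to two chunks (data1/len1 and
 * data2/len2) when it wraps around the end of the AUX buffer.  The chunks
 * are written back to back after the PERF_RECORD_AUXTRACE header and padded
 * to an 8-byte boundary, which event->auxtrace.size already accounts for.
 */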
static int record__process_auxtrace(struct perf_tool *tool,
				    struct mmap *map,
				    union perf_event *event, void *data1,
				    size_t len1, void *data2, size_t len2)
{
	struct record *rec = container_of(tool, struct record, tool);
	struct perf_data *data = &rec->data;
	size_t padding;
	u8 pad[8] = {0};

	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
		off_t file_offset;
		int fd = perf_data__fd(data);
		int err;

		file_offset = lseek(fd, 0, SEEK_CUR);
		if (file_offset == -1)
			return -1;
		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
						     event, file_offset);
		if (err)
			return err;
	}

	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
	padding = (len1 + len2) & 7;
	if (padding)
		padding = 8 - padding;

	record__write(rec, map, event, event->header.size);
	record__write(rec, map, data1, len1);
	if (len2)
		record__write(rec, map, data2, len2);
	record__write(rec, map, &pad, padding);

	return 0;
}

static int record__auxtrace_mmap_read(struct record *rec,
				      struct mmap *map)
{
	int ret;

	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
				  record__process_auxtrace);
	if (ret < 0)
		return ret;

	if (ret)
		rec->samples++;

	return 0;
}

static int record__auxtrace_mmap_read_snapshot(struct record *rec,
					       struct mmap *map)
{
	int ret;

	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
					   record__process_auxtrace,
					   rec->opts.auxtrace_snapshot_size);
	if (ret < 0)
		return ret;

	if (ret)
		rec->samples++;

	return 0;
}

static int record__auxtrace_read_snapshot_all(struct record *rec)
{
	int i;
	int rc = 0;

	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
		struct mmap *map = &rec->evlist->mmap[i];

		if (!map->auxtrace_mmap.base)
			continue;

		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}
out:
	return rc;
}

static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
{
	pr_debug("Recording AUX area tracing snapshot\n");
	if (record__auxtrace_read_snapshot_all(rec) < 0) {
		trigger_error(&auxtrace_snapshot_trigger);
	} else {
		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
			trigger_error(&auxtrace_snapshot_trigger);
		else
			trigger_ready(&auxtrace_snapshot_trigger);
	}
}

static int record__auxtrace_snapshot_exit(struct record *rec)
{
	if (trigger_is_error(&auxtrace_snapshot_trigger))
		return 0;

	if (!auxtrace_record__snapshot_started &&
	    auxtrace_record__snapshot_start(rec->itr))
		return -1;

	record__read_auxtrace_snapshot(rec, true);
	if (trigger_is_error(&auxtrace_snapshot_trigger))
		return -1;

	return 0;
}

static int record__auxtrace_init(struct record *rec)
{
	int err;

	if (!rec->itr) {
		rec->itr = auxtrace_record__init(rec->evlist, &err);
		if (err)
			return err;
	}

	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
					      rec->opts.auxtrace_snapshot_opts);
	if (err)
		return err;

	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
					    rec->opts.auxtrace_sample_opts);
	if (err)
		return err;

	return auxtrace_parse_filters(rec->evlist);
}

#else

static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct mmap *map __maybe_unused)
{
	return 0;
}

static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
				    bool on_exit __maybe_unused)
{
}

static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}

static inline
int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
{
	return 0;
}

static int record__auxtrace_init(struct record *rec __maybe_unused)
{
	return 0;
}

#endif

static bool record__kcore_readable(struct machine *machine)
{
	char kcore[PATH_MAX];
	int fd;

	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);

	fd = open(kcore, O_RDONLY);
	if (fd < 0)
		return false;

	close(fd);

	return true;
}

static int record__kcore_copy(struct machine *machine, struct perf_data *data)
{
	char from_dir[PATH_MAX];
	char kcore_dir[PATH_MAX];
	int ret;

	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);

	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
	if (ret)
		return ret;

	return kcore_copy(from_dir, kcore_dir);
}

static int record__mmap_evlist(struct record *rec,
			       struct evlist *evlist)
{
	struct record_opts *opts = &rec->opts;
	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
				  opts->auxtrace_sample_mode;
	char msg[512];

	if (opts->affinity != PERF_AFFINITY_SYS)
		cpu__setup_cpunode_map();

	if (evlist__mmap_ex(evlist, opts->mmap_pages,
				 opts->auxtrace_mmap_pages,
				 auxtrace_overwrite,
				 opts->nr_cblocks, opts->affinity,
				 opts->mmap_flush, opts->comp_level) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %u,%u)\n",
			       opts->mmap_pages, opts->auxtrace_mmap_pages);
			return -errno;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno,
				str_error_r(errno, msg, sizeof(msg)));
			if (errno)
				return -errno;
			else
				return -EINVAL;
		}
	}
	return 0;
}

static int record__mmap(struct record *rec)
{
	return record__mmap_evlist(rec, rec->evlist);
}

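/*
 * record__open() opens all the evsels of the session's evlist, falling back
 * to weaker event encodings or dropping weak group members when the kernel
 * rejects an event, then applies event filters and mmaps the ring buffers.
 */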
static int record__open(struct record *rec)
{
	char msg[BUFSIZ];
	struct evsel *pos;
	struct evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	int rc = 0;

	/*
	 * For initial_delay we need to add a dummy event so that we can track
	 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
	 * real events, the ones asked by the user.
	 */
	if (opts->initial_delay) {
		if (perf_evlist__add_dummy(evlist))
			return -ENOMEM;

		pos = evlist__first(evlist);
		pos->tracking = 0;
		pos = evlist__last(evlist);
		pos->tracking = 1;
		pos->core.attr.enable_on_exec = 1;
	}

	perf_evlist__config(evlist, opts, &callchain_param);

	evlist__for_each_entry(evlist, pos) {
try_again:
		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose > 0)
					ui__warning("%s\n", msg);
				goto try_again;
			}
			if ((errno == EINVAL || errno == EBADF) &&
			    pos->leader != pos &&
			    pos->weak_group) {
				pos = perf_evlist__reset_weak_group(evlist, pos, true);
				goto try_again;
			}
			rc = -errno;
			perf_evsel__open_strerror(pos, &opts->target,
						  errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}

		pos->supported = true;
	}

	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
	}

	if (perf_evlist__apply_filters(evlist, &pos)) {
		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter, perf_evsel__name(pos), errno,
			str_error_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	rc = record__mmap(rec);
	if (rc)
		goto out;

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}

static int process_sample_event(struct perf_tool *tool,
				union perf_event *event,
				struct perf_sample *sample,
				struct evsel *evsel,
				struct machine *machine)
{
	struct record *rec = container_of(tool, struct record, tool);

	if (rec->evlist->first_sample_time == 0)
		rec->evlist->first_sample_time = sample->time;

	rec->evlist->last_sample_time = sample->time;

	if (rec->buildid_all)
		return 0;

	rec->samples++;
	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
}

static int process_buildids(struct record *rec)
{
	struct perf_session *session = rec->session;

	if (perf_data__size(&rec->data) == 0)
		return 0;

	/*
	 * During this process, it'll load the kernel map and replace
	 * dso->long_name with the real pathname it found.  In this case
	 * we prefer the vmlinux path like
	 *   /lib/modules/3.16.4/build/vmlinux
	 *
	 * rather than the build-id path (in the debug directory).
	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
	 */
	symbol_conf.ignore_vmlinux_buildid = true;

	/*
	 * If --buildid-all is given, it marks all DSOs regardless of hits,
	 * so there is no need to process samples. But if timestamp_boundary
	 * is enabled, it still needs to walk all samples to get the
	 * timestamps of the first/last samples.
	 */
	if (rec->buildid_all && !rec->timestamp_boundary)
		rec->tool.sample = NULL;

	return perf_session__process_events(session);
}

static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;
	/*
	 * As for the guest kernel when processing subcommand record&report,
	 * we arrange module mmap prior to guest kernel mmap and trigger
	 * a preload dso because default guest module symbols are loaded
	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
	 * method is used to avoid missing symbols when the first addr is
	 * in a module instead of in the guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for the guest kernel because the guest kernel's
	 * /proc/kallsyms sometimes has no _text.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}

static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

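/*
 * With --affinity=node or --affinity=cpu, move the recording thread onto
 * the CPUs associated with the mmap that is about to be flushed, so reads
 * of the ring buffer stay local to the buffer's NUMA node.
 */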
static void record__adjust_affinity(struct record *rec, struct mmap *map)
{
	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
	    !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
			  rec->affinity_mask.nbits)) {
		bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
		bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
			  map->affinity_mask.bits, rec->affinity_mask.nbits);
		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
				  (cpu_set_t *)rec->affinity_mask.bits);
		if (verbose == 2)
			mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
	}
}

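/*
 * Compressed data is framed as PERF_RECORD_COMPRESSED records: the zstd
 * output is split into records no larger than PERF_SAMPLE_MAX_SIZE, each
 * prefixed with a perf_record_compressed header patched in by
 * process_comp_header(), so the report side can stream-decompress the file
 * record by record.
 */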
static size_t process_comp_header(void *record, size_t increment)
{
	struct perf_record_compressed *event = record;
	size_t size = sizeof(*event);

	if (increment) {
		event->header.size += increment;
		return increment;
	}

	event->header.type = PERF_RECORD_COMPRESSED;
	event->header.size = size;

	return size;
}

static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size)
{
	size_t compressed;
	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;

	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
						     max_record_size, process_comp_header);

	session->bytes_transferred += src_size;
	session->bytes_compressed  += compressed;

	return compressed;
}

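/*
 * Drain every mapped ring buffer of the evlist (either the regular or the
 * overwritable set), through the synchronous or the AIO write path, and
 * emit a PERF_RECORD_FINISHED_ROUND once anything was written so the
 * report side can flush its reordering queue.
 */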
static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
				    bool overwrite, bool synch)
{
	u64 bytes_written = rec->bytes_written;
	int i;
	int rc = 0;
	struct mmap *maps;
	int trace_fd = rec->data.file.fd;
	off_t off = 0;

	if (!evlist)
		return 0;

	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
	if (!maps)
		return 0;

	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
		return 0;

	if (record__aio_enabled(rec))
		off = record__aio_get_pos(trace_fd);

	for (i = 0; i < evlist->core.nr_mmaps; i++) {
		u64 flush = 0;
		struct mmap *map = &maps[i];

		if (map->core.base) {
			record__adjust_affinity(rec, map);
			if (synch) {
				flush = map->core.flush;
				map->core.flush = 1;
			}
			if (!record__aio_enabled(rec)) {
				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			} else {
				if (record__aio_push(rec, map, &off) < 0) {
					record__aio_set_pos(trace_fd, off);
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			}
			if (synch)
				map->core.flush = flush;
		}

		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
		    !rec->opts.auxtrace_sample_mode &&
		    record__auxtrace_mmap_read(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}

	if (record__aio_enabled(rec))
		record__aio_set_pos(trace_fd, off);

	/*
	 * Mark the round finished in case we wrote
	 * at least one event.
	 */
	if (bytes_written != rec->bytes_written)
		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));

	if (overwrite)
		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
out:
	return rc;
}

static int record__mmap_read_all(struct record *rec, bool synch)
{
	int err;

	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
	if (err)
		return err;

	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
}

static void record__init_features(struct record *rec)
{
	struct perf_session *session = rec->session;
	int feat;

	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&rec->evlist->core.entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->opts.full_auxtrace)
		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);

	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
		perf_header__clear_feat(&session->header, HEADER_CLOCKID);

	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
	if (!record__comp_enabled(rec))
		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);

	perf_header__clear_feat(&session->header, HEADER_STAT);
}

static void
record__finish_output(struct record *rec)
{
	struct perf_data *data = &rec->data;
	int fd = perf_data__fd(data);

	if (data->is_pipe)
		return;

	rec->session->header.data_size += rec->bytes_written;
	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);

	if (!rec->no_buildid) {
		process_buildids(rec);

		if (rec->buildid_all)
			dsos__hit_all(rec->session);
	}
	perf_session__write_header(rec->session, rec->evlist, fd, true);

	return;
}

static int record__synthesize_workload(struct record *rec, bool tail)
{
	int err;
	struct perf_thread_map *thread_map;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
	if (thread_map == NULL)
		return -1;

	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
						 process_synthesized_event,
						 &rec->session->machines.host,
						 rec->opts.sample_address);
	perf_thread_map__put(thread_map);
	return err;
}

static int record__synthesize(struct record *rec, bool tail);

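/*
 * Rotate the output file: flush what is pending, finalize the current
 * perf.data, switch perf_data to a new timestamped file and re-synthesize
 * the tracking events so the new file is usable on its own.  When a
 * maximum number of files is configured, old files are recycled in a ring
 * of switch_output.num_files names.
 */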
static int
record__switch_output(struct record *rec, bool at_exit)
{
	struct perf_data *data = &rec->data;
	int fd, err;
	char *new_filename;

	/* Same Size:      "2015122520103046"*/
	char timestamp[] = "InvalidTimestamp";

	record__aio_mmap_read_sync(rec);

	record__synthesize(rec, true);
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	rec->samples = 0;
	record__finish_output(rec);
	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
	if (err) {
		pr_err("Failed to get current timestamp\n");
		return -EINVAL;
	}

	fd = perf_data__switch(data, timestamp,
				    rec->session->header.data_offset,
				    at_exit, &new_filename);
	if (fd >= 0 && !at_exit) {
		rec->bytes_written = 0;
		rec->session->header.data_size = 0;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
			data->path, timestamp);

	if (rec->switch_output.num_files) {
		int n = rec->switch_output.cur_file + 1;

		if (n >= rec->switch_output.num_files)
			n = 0;
		rec->switch_output.cur_file = n;
		if (rec->switch_output.filenames[n]) {
			remove(rec->switch_output.filenames[n]);
			zfree(&rec->switch_output.filenames[n]);
		}
		rec->switch_output.filenames[n] = new_filename;
	} else {
		free(new_filename);
	}

	/* Output tracking events */
	if (!at_exit) {
		record__synthesize(rec, false);

		/*
		 * In 'perf record --switch-output' without -a,
		 * record__synthesize() in record__switch_output() won't
		 * generate tracking events because there's no thread_map
		 * in the evlist, so the newly created perf.data wouldn't
		 * contain map and comm information.
		 * Create a fake thread_map and directly call
		 * perf_event__synthesize_thread_map() for those events.
		 */
		if (target__none(&rec->opts.target))
			record__synthesize_workload(rec, false);
	}
	return fd;
}

static volatile int workload_exec_errno;

/*
 * perf_evlist__prepare_workload will send a SIGUSR1
 * if the fork fails, since we asked by setting its
 * want_signal to true.
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
	workload_exec_errno = info->si_value.sival_int;
	done = 1;
	child_finished = 1;
}

static void snapshot_sig_handler(int sig);
static void alarm_sig_handler(int sig);

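/*
 * Any mapped perf_event_mmap_page will do for reading the time conversion
 * parameters (TSC/clock) that perf_event__synth_time_conv() turns into a
 * PERF_RECORD_TIME_CONV event, so pick the first available one.
 */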
static const struct perf_event_mmap_page *
perf_evlist__pick_pc(struct evlist *evlist)
{
	if (evlist) {
		if (evlist->mmap && evlist->mmap[0].core.base)
			return evlist->mmap[0].core.base;
		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
			return evlist->overwrite_mmap[0].core.base;
	}
	return NULL;
}

static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
{
	const struct perf_event_mmap_page *pc;

	pc = perf_evlist__pick_pc(rec->evlist);
	if (pc)
		return pc;
	return NULL;
}

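/*
 * Synthesize the side-band metadata the report side needs: event attributes
 * and features (when piping), tracing data, time conversion, auxtrace info,
 * kernel and module maps, thread and CPU maps, BPF and cgroup events, and
 * the already-running threads of the target.
 */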
static int record__synthesize(struct record *rec, bool tail)
{
	struct perf_session *session = rec->session;
	struct machine *machine = &session->machines.host;
	struct perf_data *data = &rec->data;
	struct record_opts *opts = &rec->opts;
	struct perf_tool *tool = &rec->tool;
	int fd = perf_data__fd(data);
	int err = 0;
	event_op f = process_synthesized_event;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	if (data->is_pipe) {
		/*
		 * We need to synthesize events first, because some
		 * features work on top of them (on the report side).
		 */
		err = perf_event__synthesize_attrs(tool, rec->evlist,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out;
		}

		err = perf_event__synthesize_features(tool, session, rec->evlist,
						      process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize features.\n");
			return err;
		}

		if (have_tracepoints(&rec->evlist->core.entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so its not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
			err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out;
			}
			rec->bytes_written += err;
		}
	}

	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
					  process_synthesized_event, machine);
	if (err)
		goto out;

	/* Synthesize id_index before auxtrace_info */
	if (rec->opts.auxtrace_sample_mode) {
		err = perf_event__synthesize_id_index(tool,
						      process_synthesized_event,
						      session->evlist, machine);
		if (err)
			goto out;
	}

	if (rec->opts.full_auxtrace) {
		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
					session, process_synthesized_event);
		if (err)
			goto out;
	}

	if (!perf_evlist__exclude_kernel(rec->evlist)) {
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine);
		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/kallsyms permission or run as root.\n");

		err = perf_event__synthesize_modules(tool, process_synthesized_event,
						     machine);
		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/modules permission or run as root.\n");
	}

	if (perf_guest) {
		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);
	}

	err = perf_event__synthesize_extra_attr(&rec->tool,
						rec->evlist,
						process_synthesized_event,
						data->is_pipe);
	if (err)
		goto out;

	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
						 process_synthesized_event,
						NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize thread map.\n");
		return err;
	}

	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
					     process_synthesized_event, NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize cpu map.\n");
		return err;
	}

	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
						machine, opts);
	if (err < 0)
		pr_warning("Couldn't synthesize bpf events.\n");

	err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_warning("Couldn't synthesize cgroup events.\n");

	if (rec->opts.nr_threads_synthesize > 1) {
		perf_set_multithreaded();
		f = process_locked_synthesized_event;
	}

	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
					    f, opts->sample_address,
					    rec->opts.nr_threads_synthesize);

	if (rec->opts.nr_threads_synthesize > 1)
		perf_set_singlethreaded();

out:
	return err;
}

static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
{
	struct record *rec = data;
	pthread_kill(rec->thread_id, SIGUSR2);
	return 0;
}

static int record__setup_sb_evlist(struct record *rec)
{
	struct record_opts *opts = &rec->opts;

	if (rec->sb_evlist != NULL) {
		/*
		 * We get here if --switch-output-event populated the
		 * sb_evlist, so associate a callback that will send a SIGUSR2
		 * to the main thread.
		 */
		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
		rec->thread_id = pthread_self();
	}

	if (!opts->no_bpf_event) {
		if (rec->sb_evlist == NULL) {
			rec->sb_evlist = evlist__new();

			if (rec->sb_evlist == NULL) {
				pr_err("Couldn't create side band evlist.\n");
				return -1;
			}
		}

		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
			return -1;
		}
	}

	if (perf_evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
		opts->no_bpf_event = true;
	}

	return 0;
}

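/*
 * __cmd_record() drives a recording session: install the signal handlers,
 * create the perf session and (optionally) the workload, open and mmap the
 * events, write the file header and the synthesized metadata, then loop
 * draining the ring buffers until the workload exits or the user stops the
 * session, and finally finish (or switch) the output file.
 */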
static int __cmd_record(struct record *rec, int argc, const char **argv)
{
	int err;
	int status = 0;
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct perf_tool *tool = &rec->tool;
	struct record_opts *opts = &rec->opts;
	struct perf_data *data = &rec->data;
	struct perf_session *session;
	bool disabled = false, draining = false;
	int fd;
	float ratio = 0;

	atexit(record__sig_exit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);
	signal(SIGSEGV, sigsegv_handler);

	if (rec->opts.record_namespaces)
		tool->namespace_events = true;

	if (rec->opts.record_cgroup) {
#ifdef HAVE_FILE_HANDLE
		tool->cgroup_events = true;
#else
		pr_err("cgroup tracking is not supported\n");
		return -1;
#endif
	}

	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
		signal(SIGUSR2, snapshot_sig_handler);
		if (rec->opts.auxtrace_snapshot_mode)
			trigger_on(&auxtrace_snapshot_trigger);
		if (rec->switch_output.enabled)
			trigger_on(&switch_output_trigger);
	} else {
		signal(SIGUSR2, SIG_IGN);
	}

	session = perf_session__new(data, false, tool);
	if (IS_ERR(session)) {
		pr_err("Perf session creation failed.\n");
		return PTR_ERR(session);
	}

	fd = perf_data__fd(data);
	rec->session = session;

	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
		pr_err("Compression initialization failed.\n");
		return -1;
	}

	session->header.env.comp_type  = PERF_COMP_ZSTD;
	session->header.env.comp_level = rec->opts.comp_level;

	if (rec->opts.kcore &&
	    !record__kcore_readable(&session->machines.host)) {
		pr_err("ERROR: kcore is not readable.\n");
		return -1;
	}

	record__init_features(rec);

	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
		session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;

	if (forks) {
		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
						    argv, data->is_pipe,
						    workload_exec_failed_signal);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			status = err;
			goto out_delete_session;
		}
	}

	/*
	 * If we have just a single event and are sending data
	 * through a pipe, we need to force the ids allocation,
	 * because we synthesize the event name through the pipe
	 * and need the id for that.
	 */
	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
		rec->opts.sample_id = true;

	if (record__open(rec) != 0) {
		err = -1;
		goto out_child;
	}
	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;

	if (rec->opts.kcore) {
		err = record__kcore_copy(&session->machines.host, data);
		if (err) {
			pr_err("ERROR: Failed to copy kcore\n");
			goto out_child;
		}
	}

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_child;
	}

	/*
	 * Normally perf_session__new would do this, but it doesn't have the
	 * evlist.
	 */
	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
		rec->tool.ordered_events = false;
	}

	if (!rec->evlist->nr_groups)
		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);

	if (data->is_pipe) {
		err = perf_header__write_pipe(fd);
		if (err < 0)
			goto out_child;
	} else {
		err = perf_session__write_header(session, rec->evlist, fd, false);
		if (err < 0)
			goto out_child;
	}

	err = -1;
	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		goto out_child;
	}

	err = record__setup_sb_evlist(rec);
	if (err)
		goto out_child;

	err = record__synthesize(rec, false);
	if (err < 0)
		goto out_child;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_child;
		}
	}

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!target__none(&opts->target) && !opts->initial_delay)
		evlist__enable(rec->evlist);

	/*
	 * Let the child rip
	 */
	if (forks) {
		struct machine *machine = &session->machines.host;
		union perf_event *event;
		pid_t tgid;

		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Some H/W events are generated before COMM event
		 * which is emitted during exec(), so perf script
		 * cannot see a correct process name for those events.
		 * Synthesize COMM event to prevent it.
		 */
		tgid = perf_event__synthesize_comm(tool, event,
						   rec->evlist->workload.pid,
						   process_synthesized_event,
						   machine);
		free(event);

		if (tgid == -1)
			goto out_child;

		event = malloc(sizeof(event->namespaces) +
			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
			       machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Synthesize NAMESPACES event for the command specified.
		 */
		perf_event__synthesize_namespaces(tool, event,
						  rec->evlist->workload.pid,
						  tgid, process_synthesized_event,
						  machine);
		free(event);

		perf_evlist__start_workload(rec->evlist);
	}

	if (opts->initial_delay) {
		usleep(opts->initial_delay * USEC_PER_MSEC);
		evlist__enable(rec->evlist);
	}

	trigger_ready(&auxtrace_snapshot_trigger);
	trigger_ready(&switch_output_trigger);
	perf_hooks__invoke_record_start();
	for (;;) {
		unsigned long long hits = rec->samples;

		/*
		 * rec->evlist->bkw_mmap_state can be BKW_MMAP_EMPTY here:
		 * when done == true and hits != rec->samples in the
		 * previous round.
		 *
		 * perf_evlist__toggle_bkw_mmap ensures we never
		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
		 */
		if (trigger_is_hit(&switch_output_trigger) || done || draining)
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);

		if (record__mmap_read_all(rec, false) < 0) {
			trigger_error(&auxtrace_snapshot_trigger);
			trigger_error(&switch_output_trigger);
			err = -1;
			goto out_child;
		}

		if (auxtrace_record__snapshot_started) {
			auxtrace_record__snapshot_started = 0;
			if (!trigger_is_error(&auxtrace_snapshot_trigger))
				record__read_auxtrace_snapshot(rec, false);
			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
				pr_err("AUX area tracing snapshot failed\n");
				err = -1;
				goto out_child;
			}
		}

		if (trigger_is_hit(&switch_output_trigger)) {
			/*
			 * If switch_output_trigger is hit, the data in the
			 * overwritable ring buffer should have been collected,
			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
			 *
			 * If SIGUSR2 is raised after or during record__mmap_read_all(),
			 * record__mmap_read_all() didn't collect data from the
			 * overwritable ring buffer. Read again.
			 */
			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
				continue;
			trigger_ready(&switch_output_trigger);

			/*
			 * Reenable events in the overwrite ring buffer after
			 * record__mmap_read_all(): we should have collected
			 * data from it.
			 */
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);

			if (!quiet)
				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
					waking);
			waking = 0;
			fd = record__switch_output(rec, false);
			if (fd < 0) {
				pr_err("Failed to switch to new file\n");
				trigger_error(&switch_output_trigger);
				err = fd;
				goto out_child;
			}

			/* re-arm the alarm */
			if (rec->switch_output.time)
				alarm(rec->switch_output.time);
		}

		if (hits == rec->samples) {
			if (done || draining)
				break;
			err = evlist__poll(rec->evlist, -1);
			/*
			 * Propagate error, only if there's any. Ignore positive
			 * number of returned events and interrupt error.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			waking++;

			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
				draining = true;
		}

		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
		if (done && !disabled && !target__none(&opts->target)) {
			trigger_off(&auxtrace_snapshot_trigger);
			evlist__disable(rec->evlist);
			disabled = true;
		}
	}

1813
	trigger_off(&auxtrace_snapshot_trigger);
1814
	trigger_off(&switch_output_trigger);
1815

1816 1817 1818
	if (opts->auxtrace_snapshot_on_exit)
		record__auxtrace_snapshot_exit(rec);

1819
	if (forks && workload_exec_errno) {
1820
		char msg[STRERR_BUFSIZE];
1821
		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1822 1823
		pr_err("Workload failed: %s\n", emsg);
		err = -1;
1824
		goto out_child;
1825 1826
	}

1827
	if (!quiet)
1828
		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1829

1830 1831 1832
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

1833
out_child:
1834
	record__mmap_read_all(rec, true);
1835 1836
	record__aio_mmap_read_sync(rec);

1837 1838 1839 1840 1841
	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
		session->header.env.comp_ratio = ratio + 0.5;
	}

1842 1843
	if (forks) {
		int exit_status;
1844

1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858
		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&exit_status);

		if (err < 0)
			status = err;
		else if (WIFEXITED(exit_status))
			status = WEXITSTATUS(exit_status);
		else if (WIFSIGNALED(exit_status))
			signr = WTERMSIG(exit_status);
	} else
		status = err;

1859
	record__synthesize(rec, true);
1860 1861 1862
	/* this will be recalculated during process_buildids() */
	rec->samples = 0;

1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873
	if (!err) {
		if (!rec->timestamp_filename) {
			record__finish_output(rec);
		} else {
			fd = record__switch_output(rec, true);
			if (fd < 0) {
				status = fd;
				goto out_delete_session;
			}
		}
	}
1874

W
Wang Nan 已提交
1875 1876
	perf_hooks__invoke_record_end();

1877 1878
	if (!err && !quiet) {
		char samples[128];
1879 1880
		const char *postfix = rec->timestamp_filename ?
					".<timestamp>" : "";
1881

1882
		if (rec->samples && !rec->opts.full_auxtrace)
1883 1884 1885 1886 1887
			scnprintf(samples, sizeof(samples),
				  " (%" PRIu64 " samples)", rec->samples);
		else
			samples[0] = '\0';

1888
		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
1889
			perf_data__size(data) / 1024.0 / 1024.0,
J
Jiri Olsa 已提交
1890
			data->path, postfix, samples);
1891 1892 1893 1894 1895 1896
		if (ratio) {
			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
					rec->session->bytes_transferred / 1024.0 / 1024.0,
					ratio);
		}
		fprintf(stderr, " ]\n");
1897 1898
	}

1899
out_delete_session:
	zstd_fini(&session->zstd_data);
	perf_session__delete(session);

	if (!opts->no_bpf_event)
		perf_evlist__stop_sb_thread(rec->sb_evlist);
	return status;
}

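/* Log the chosen callchain record mode (and DWARF stack dump size) at debug verbosity. */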
static void callchain_debug(struct callchain_param *callchain)
{
	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };

	pr_debug("callchain: type %s\n", str[callchain->record_mode]);

	if (callchain->record_mode == CALLCHAIN_DWARF)
		pr_debug("callchain: stack dump size %d\n",
			 callchain->dump_size);
}

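/*
 * Parse a --call-graph argument into *callchain; DWARF unwinding also
 * needs the sampled data address, so turn that on as well.
 */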
int record_opts__parse_callchain(struct record_opts *record,
				 struct callchain_param *callchain,
				 const char *arg, bool unset)
{
	int ret;
	callchain->enabled = !unset;

	/* --no-call-graph */
	if (unset) {
		callchain->record_mode = CALLCHAIN_NONE;
		pr_debug("callchain: disabled\n");
		return 0;
	}

	ret = parse_callchain_record_opt(arg, callchain);
	if (!ret) {
		/* Enable data address sampling for DWARF unwind. */
		if (callchain->record_mode == CALLCHAIN_DWARF)
			record->sample_address = true;
		callchain_debug(callchain);
	}

	return ret;
}

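/* Option callback for --call-graph, operating on the global callchain_param. */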
int record_parse_callchain_opt(const struct option *opt,
			       const char *arg,
			       int unset)
{
	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
}

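/* Option callback for -g: enable callchain recording, defaulting to frame pointers. */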
int record_callchain_opt(const struct option *opt,
			 const char *arg __maybe_unused,
			 int unset __maybe_unused)
{
	struct callchain_param *callchain = opt->value;

	callchain->enabled = true;

	if (callchain->record_mode == CALLCHAIN_NONE)
		callchain->record_mode = CALLCHAIN_FP;

	callchain_debug(callchain);
	return 0;
}

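/*
 * Handle 'perf config' variables for record: record.build-id,
 * record.call-graph and, when AIO support is built in, record.aio.
 */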
static int perf_record_config(const char *var, const char *value, void *cb)
{
	struct record *rec = cb;

	if (!strcmp(var, "record.build-id")) {
		if (!strcmp(value, "cache"))
			rec->no_buildid_cache = false;
		else if (!strcmp(value, "no-cache"))
			rec->no_buildid_cache = true;
		else if (!strcmp(value, "skip"))
			rec->no_buildid = true;
		else
			return -1;
		return 0;
	}
	if (!strcmp(var, "record.call-graph")) {
		var = "call-graph.record-mode";
		return perf_default_config(var, value, cb);
	}
#ifdef HAVE_AIO_SUPPORT
	if (!strcmp(var, "record.aio")) {
		rec->opts.nr_cblocks = strtol(value, NULL, 0);
		if (!rec->opts.nr_cblocks)
			rec->opts.nr_cblocks = nr_cblocks_default;
	}
#endif

	return 0;
}

struct clockid_map {
	const char *name;
	int clockid;
};

#define CLOCKID_MAP(n, c)	\
	{ .name = n, .clockid = (c), }

#define CLOCKID_END	{ .name = NULL, }


/*
 * Add the missing ones, we need to build on many distros...
 */
#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4
#endif
#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7
#endif
#ifndef CLOCK_TAI
#define CLOCK_TAI 11
#endif

static const struct clockid_map clockids[] = {
	/* available for all events, NMI safe */
	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),

	/* available for some events */
	CLOCKID_MAP("realtime", CLOCK_REALTIME),
	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
	CLOCKID_MAP("tai", CLOCK_TAI),

	/* available for the lazy */
	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
	CLOCKID_MAP("real", CLOCK_REALTIME),
	CLOCKID_MAP("boot", CLOCK_BOOTTIME),

	CLOCKID_END,
};

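/* Query the resolution of the given clock in nanoseconds, leaving 0 if it cannot be read. */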
static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
{
	struct timespec res;

	*res_ns = 0;
	if (!clock_getres(clk_id, &res))
		*res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
	else
		pr_warning("WARNING: Failed to determine specified clock resolution.\n");

	return 0;
}

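/*
 * Parse -k/--clockid: accept either a raw clockid number or one of the
 * names in clockids[] (a "CLOCK_" prefix is allowed), then record the
 * clock's resolution for later use.
 */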
static int parse_clockid(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;
	const struct clockid_map *cm;
	const char *ostr = str;

	if (unset) {
		opts->use_clockid = 0;
		return 0;
	}

	/* no arg passed */
	if (!str)
		return 0;

	/* no setting it twice */
	if (opts->use_clockid)
		return -1;

	opts->use_clockid = true;

	/* if its a number, we're done */
	if (sscanf(str, "%d", &opts->clockid) == 1)
		return get_clockid_res(opts->clockid, &opts->clockid_res_ns);

	/* allow a "CLOCK_" prefix to the name */
	if (!strncasecmp(str, "CLOCK_", 6))
		str += 6;

	for (cm = clockids; cm->name; cm++) {
		if (!strcasecmp(str, cm->name)) {
			opts->clockid = cm->clockid;
			return get_clockid_res(opts->clockid,
					       &opts->clockid_res_ns);
		}
	}

	opts->use_clockid = false;
	ui__warning("unknown clockid %s, check man page\n", ostr);
	return -1;
}

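/* Parse --affinity=node|cpu for the trace reading thread; the default remains PERF_AFFINITY_SYS. */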
static int record__parse_affinity(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;

	if (unset || !str)
		return 0;

	if (!strcasecmp(str, "node"))
		opts->affinity = PERF_AFFINITY_NODE;
	else if (!strcasecmp(str, "cpu"))
		opts->affinity = PERF_AFFINITY_CPU;

	return 0;
}

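/* Parse --max-size with an optional B/K/M/G suffix into an output byte limit (0 = unlimited). */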
static int parse_output_max_size(const struct option *opt,
				 const char *str, int unset)
{
	unsigned long *s = (unsigned long *)opt->value;
	static struct parse_tag tags_size[] = {
		{ .tag  = 'B', .mult = 1       },
		{ .tag  = 'K', .mult = 1 << 10 },
		{ .tag  = 'M', .mult = 1 << 20 },
		{ .tag  = 'G', .mult = 1 << 30 },
		{ .tag  = 0 },
	};
	unsigned long val;

	if (unset) {
		*s = 0;
		return 0;
	}

	val = parse_tag_value(str, tags_size);
	if (val != (unsigned long) -1) {
		*s = val;
		return 0;
	}

	return -1;
}

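/*
 * Parse -m/--mmap-pages as "pages[,pages]": the first value sizes the data
 * mmaps, the optional second value the AUX area tracing mmaps.
 */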
static int record__parse_mmap_pages(const struct option *opt,
				    const char *str,
				    int unset __maybe_unused)
{
	struct record_opts *opts = opt->value;
	char *s, *p;
	unsigned int mmap_pages;
	int ret;

	if (!str)
		return -EINVAL;

	s = strdup(str);
	if (!s)
		return -ENOMEM;

	p = strchr(s, ',');
	if (p)
		*p = '\0';

	if (*s) {
		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
		if (ret)
			goto out_free;
		opts->mmap_pages = mmap_pages;
	}

	if (!p) {
		ret = 0;
		goto out_free;
	}

	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
	if (ret)
		goto out_free;

	opts->auxtrace_mmap_pages = mmap_pages;

out_free:
	free(s);
	return ret;
}

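/*
 * Warn if the --switch-output size threshold is below half of the mmap
 * wakeup size, as output files may then come out larger than requested.
 */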
static void switch_output_size_warn(struct record *rec)
{
	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
	struct switch_output *s = &rec->switch_output;

	wakeup_size /= 2;

	if (s->size < wakeup_size) {
		char buf[100];

		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
		pr_warning("WARNING: switch-output data size lower than "
			   "wakeup kernel buffer size (%s) "
			   "expect bigger perf.data sizes\n", buf);
	}
}

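/*
 * Configure --switch-output: "signal", a size[BKMG] threshold or a
 * time[smhd] threshold. Enabling it implies timestamped output file names.
 */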
static int switch_output_setup(struct record *rec)
{
	struct switch_output *s = &rec->switch_output;
	static struct parse_tag tags_size[] = {
		{ .tag  = 'B', .mult = 1       },
		{ .tag  = 'K', .mult = 1 << 10 },
		{ .tag  = 'M', .mult = 1 << 20 },
		{ .tag  = 'G', .mult = 1 << 30 },
		{ .tag  = 0 },
	};
	static struct parse_tag tags_time[] = {
		{ .tag  = 's', .mult = 1        },
		{ .tag  = 'm', .mult = 60       },
		{ .tag  = 'h', .mult = 60*60    },
		{ .tag  = 'd', .mult = 60*60*24 },
		{ .tag  = 0 },
	};
	unsigned long val;

	/*
	 * If we're using --switch-output-events, then we imply
	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
	 * thread to its parent.
	 */
	if (rec->switch_output_event_set)
		goto do_signal;

	if (!s->set)
		return 0;

	if (!strcmp(s->str, "signal")) {
do_signal:
		s->signal = true;
		pr_debug("switch-output with SIGUSR2 signal\n");
		goto enabled;
	}

	val = parse_tag_value(s->str, tags_size);
	if (val != (unsigned long) -1) {
		s->size = val;
		pr_debug("switch-output with %s size threshold\n", s->str);
		goto enabled;
	}

	val = parse_tag_value(s->str, tags_time);
	if (val != (unsigned long) -1) {
		s->time = val;
		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
			 s->str, s->time);
		goto enabled;
	}

	return -1;

enabled:
	rec->timestamp_filename = true;
	s->enabled              = true;

	if (s->size && !rec->opts.no_buffering)
		switch_output_size_warn(rec);

	return 0;
}

static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
const char * const *record_usage = __record_usage;

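/*
 * For build-id processing only user space mmap events are needed; the
 * kernel maps were already set up by perf_session__create_kernel_maps().
 */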
static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
				  struct perf_sample *sample, struct machine *machine)
{
	/*
	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
	 * no need to add them twice.
	 */
	if (!(event->header.misc & PERF_RECORD_MISC_USER))
		return 0;
	return perf_event__process_mmap(tool, event, sample, machine);
}

static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
				   struct perf_sample *sample, struct machine *machine)
{
	/*
	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
	 * no need to add them twice.
	 */
	if (!(event->header.misc & PERF_RECORD_MISC_USER))
		return 0;

	return perf_event__process_mmap2(tool, event, sample, machine);
}

/*
 * XXX Ideally would be local to cmd_record() and passed to a record__new
 * because we need to have access to it in record__exit, that is called
 * after cmd_record() exits, but since record_options need to be accessible to
 * builtin-script, leave it here.
 *
 * At least we don't ouch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct record record = {
	.opts = {
		.sample_time	     = true,
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		.mmap_flush          = MMAP_FLUSH_DEFAULT,
		.nr_threads_synthesize = 1,
	},
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.namespaces	= perf_event__process_namespaces,
		.mmap		= build_id__process_mmap,
		.mmap2		= build_id__process_mmap2,
		.ordered_events	= true,
	},
};

const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";

static bool dry_run;

/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use record_opts,
 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
 * using pipes, etc.
 */
static struct option __record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
			   NULL, "don't record events from perf itself",
			   exclude_perf),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
			    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.data.path, "file",
		    "output file name"),
	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
			&record.opts.no_inherit_set,
			"child tasks do not inherit counters"),
	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
		    "synthesize non-sample events at the end of output"),
	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "record bpf events"),
	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
		    "Fail if the specified frequency can't be used"),
	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
		     "profile at this frequency",
		      record__parse_freq),
	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
		     "number of mmap data pages and AUX area tracing mmap pages",
		     record__parse_mmap_pages),
	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
		     record__mmap_flush_parse),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
			   NULL, "enables call-graph recording" ,
			   &record_callchain_opt),
	OPT_CALLBACK(0, "call-graph", &record.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
		    "Record the sample physical addresses"),
	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
			&record.opts.sample_time_set,
			"Record the sample timestamps"),
	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
			"Record the sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
			&record.no_buildid_cache_set,
			"do not update the buildid cache"),
	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
			&record.no_buildid_set,
			"do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
		  "ms to wait before starting measurement after program start"),
	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),

	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
		    "sample by weight (on special events only)"),
	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
		    "sample transaction flags (special events only)"),
	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
		    "use per-thread mmaps"),
	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
		    "sample selected machine registers on interrupt,"
		    " use '-I?' to list register names", parse_intr_regs),
	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
		    "sample selected machine registers on interrupt,"
		    " use '--user-regs=?' to list register names", parse_user_regs),
	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
		    "Record running/enabled time of read (:S) events"),
	OPT_CALLBACK('k', "clockid", &record.opts,
	"clockid", "clockid to use for events, see clock_gettime()",
	parse_clockid),
	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
			  "opts", "AUX area tracing Snapshot Mode", ""),
	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
			  "opts", "sample AUX area", ""),
	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
		    "Record namespaces events"),
	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
		    "Record cgroup events"),
	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
		    "Record context switch events"),
	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
			 "Configure all used events to run in kernel space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
			 "Configure all used events to run in user space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
		    "collect kernel callchains"),
	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
		    "collect user callchains"),
	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
		   "clang binary to use for compiling BPF scriptlets"),
	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
		   "options passed to clang when compiling BPF scriptlets"),
	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
		   "file", "vmlinux pathname"),
	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
		    "Record build-id of all DSOs regardless of hits"),
	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
		    "append timestamp to output filename"),
	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
		    "Record timestamp boundary (time of first/last samples)"),
	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
			  "signal"),
	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
			 "switch output event selector. use 'perf list' to list available events",
			 parse_events_option_new_evlist),
	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
		   "Limit number of switch output generated files"),
	OPT_BOOLEAN(0, "dry-run", &dry_run,
		    "Parse options then exit"),
#ifdef HAVE_AIO_SUPPORT
	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
		     record__aio_parse),
#endif
	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
		     record__parse_affinity),
#ifdef HAVE_ZSTD_SUPPORT
	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
			    "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
			    record__parse_comp_level),
#endif
	OPT_CALLBACK(0, "max-size", &record.output_max_size,
		     "size", "Limit the maximum size of the output file", parse_output_max_size),
	OPT_UINTEGER(0, "num-thread-synthesize",
		     &record.opts.nr_threads_synthesize,
		     "number of threads to run for event synthesis"),
	OPT_END()
};

struct option *record_options = __record_options;

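/*
 * Entry point for 'perf record': parse and validate options, apply config
 * and defaults, then hand control to __cmd_record().
 */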
int cmd_record(int argc, const char **argv)
{
	int err;
	struct record *rec = &record;
	char errbuf[BUFSIZ];

	setlocale(LC_ALL, "");

#ifndef HAVE_LIBBPF_SUPPORT
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
	set_nobuild('\0', "clang-path", true);
	set_nobuild('\0', "clang-opt", true);
# undef set_nobuild
#endif

#ifndef HAVE_BPF_PROLOGUE
# if !defined (HAVE_DWARF_SUPPORT)
#  define REASON  "NO_DWARF=1"
# elif !defined (HAVE_LIBBPF_SUPPORT)
#  define REASON  "NO_LIBBPF=1"
# else
#  define REASON  "this architecture doesn't support BPF prologue"
# endif
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
	set_nobuild('\0', "vmlinux", true);
# undef set_nobuild
# undef REASON
#endif

	rec->opts.affinity = PERF_AFFINITY_SYS;

	rec->evlist = evlist__new();
	if (rec->evlist == NULL)
		return -ENOMEM;

	err = perf_config(perf_record_config, rec);
	if (err)
		return err;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	if (quiet)
		perf_quiet_option();

	/* Make system wide (-a) the default target. */
	if (!argc && target__none(&rec->opts.target))
		rec->opts.target.system_wide = true;

	if (nr_cgroups && !rec->opts.target.system_wide) {
		usage_with_options_msg(record_usage, record_options,
			"cgroup monitoring only available in system-wide mode");

	}

	if (rec->opts.kcore)
		rec->data.is_dir = true;

	if (rec->opts.comp_level != 0) {
		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
		rec->no_buildid = true;
	}

	if (rec->opts.record_switch_events &&
	    !perf_can_record_switch_events()) {
		ui__error("kernel does not support recording context switch events\n");
		parse_options_usage(record_usage, record_options, "switch-events", 0);
		return -EINVAL;
	}

	if (switch_output_setup(rec)) {
		parse_options_usage(record_usage, record_options, "switch-output", 0);
		return -EINVAL;
	}

	if (rec->switch_output.time) {
		signal(SIGALRM, alarm_sig_handler);
		alarm(rec->switch_output.time);
	}

	if (rec->switch_output.num_files) {
		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
						      sizeof(char *));
		if (!rec->switch_output.filenames)
			return -EINVAL;
	}

	/*
	 * Allow aliases to facilitate the lookup of symbols for address
	 * filters. Refer to auxtrace_parse_filters().
	 */
	symbol_conf.allow_aliases = true;

	symbol__init(NULL);

	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
		rec->affinity_mask.nbits = cpu__max_cpu();
		rec->affinity_mask.bits = bitmap_alloc(rec->affinity_mask.nbits);
		if (!rec->affinity_mask.bits) {
			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
			return -ENOMEM;
		}
		pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
	}

	err = record__auxtrace_init(rec);
	if (err)
		goto out;

	if (dry_run)
		goto out;

	err = bpf__setup_stdout(rec->evlist);
	if (err) {
		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n",
			 errbuf);
		goto out;
	}

	err = -ENOMEM;

	if (rec->no_buildid_cache || rec->no_buildid) {
		disable_buildid_cache();
	} else if (rec->switch_output.enabled) {
		/*
		 * In 'perf record --switch-output', disable buildid
		 * generation by default to reduce data file switching
		 * overhead. Still generate buildid if they are required
		 * explicitly using
		 *
		 *  perf record --switch-output --no-no-buildid \
		 *              --no-no-buildid-cache
		 *
		 * Following code equals to:
		 *
		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
		 *         disable_buildid_cache();
		 */
		bool disable = true;

		if (rec->no_buildid_set && !rec->no_buildid)
			disable = false;
		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
			disable = false;
		if (disable) {
			rec->no_buildid = true;
			rec->no_buildid_cache = true;
			disable_buildid_cache();
		}
	}

	if (record.opts.overwrite)
		record.opts.tail_synthesize = true;

	if (rec->evlist->core.nr_entries == 0 &&
	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out;
	}

	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
		rec->opts.no_inherit = true;

	err = target__validate(&rec->opts.target);
	if (err) {
		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s\n", errbuf);
	}

	err = target__parse_uid(&rec->opts.target);
	if (err) {
		int saved_errno = errno;

		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out;
	}

	/* Enable ignoring missing threads when -u/-p option is defined. */
	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;

	err = -ENOMEM;
	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
	if (err)
		goto out;

	/*
	 * We take all buildids when the file contains
	 * AUX area tracing data because we do not decode the
	 * trace because it would take too long.
	 */
	if (rec->opts.full_auxtrace)
		rec->buildid_all = true;

	if (record_opts__config(&rec->opts)) {
		err = -EINVAL;
		goto out;
	}

	if (rec->opts.nr_cblocks > nr_cblocks_max)
		rec->opts.nr_cblocks = nr_cblocks_max;
	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);

	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);

	if (rec->opts.comp_level > comp_level_max)
		rec->opts.comp_level = comp_level_max;
	pr_debug("comp level: %d\n", rec->opts.comp_level);

	err = __cmd_record(&record, argc, argv);
out:
	bitmap_free(rec->affinity_mask.bits);
	evlist__delete(rec->evlist);
	symbol__exit();
	auxtrace_record__free(rec->itr);
	return err;
}

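/* SIGUSR2 handler: kick off an AUX area snapshot and/or an output file switch. */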
static void snapshot_sig_handler(int sig __maybe_unused)
{
	struct record *rec = &record;

	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
		trigger_hit(&auxtrace_snapshot_trigger);
		auxtrace_record__snapshot_started = 1;
		if (auxtrace_record__snapshot_start(record.itr))
			trigger_error(&auxtrace_snapshot_trigger);
	}

	if (switch_output_signal(rec))
		trigger_hit(&switch_output_trigger);
}

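/* SIGALRM handler: fires when the --switch-output time threshold has elapsed. */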
static void alarm_sig_handler(int sig __maybe_unused)
{
	struct record *rec = &record;

	if (switch_output_time(rec))
		trigger_hit(&switch_output_trigger);
}