builtin-record.c 29.9 KB
Newer Older
I
Ingo Molnar 已提交
1
/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
8 9
#define _FILE_OFFSET_BITS 64

10
#include "builtin.h"
11 12 13

#include "perf.h"

14
#include "util/build-id.h"
15
#include "util/util.h"
16
#include "util/parse-options.h"
17
#include "util/parse-events.h"
18

19
#include "util/header.h"
20
#include "util/event.h"
21
#include "util/evlist.h"
22
#include "util/evsel.h"
23
#include "util/debug.h"
24
#include "util/session.h"
25
#include "util/tool.h"
26
#include "util/symbol.h"
27
#include "util/cpumap.h"
28
#include "util/thread_map.h"
29

30
#include <unistd.h>
31
#include <sched.h>
32
#include <sys/mman.h>
33

34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
#ifndef HAVE_ON_EXIT
/*
 * Fallback on_exit() for C libraries that do not provide one.
 *
 * Handlers are kept in a fixed-size table and run from a single atexit()
 * hook.  The exit status handed to them is captured by the exit() wrapper
 * macro below, so it is only meaningful when the process terminates through
 * exit(); otherwise handlers see 0.
 */
#ifndef ATEXIT_MAX
#define ATEXIT_MAX 32
#endif
static int __on_exit_count = 0;
typedef void (*on_exit_func_t) (int, void *);
static on_exit_func_t __on_exit_funcs[ATEXIT_MAX];
static void *__on_exit_args[ATEXIT_MAX];
static int __exitcode = 0;
static void __handle_on_exit_funcs(void);
static int on_exit(on_exit_func_t function, void *arg);
/* Remember the status so the handlers can receive it later. */
#define exit(x) (exit)(__exitcode = (x))

/*
 * Register @function to be called with the exit status and @arg when the
 * process exits.
 *
 * Returns 0 on success, -ENOMEM when the table is full or when the atexit()
 * hook could not be installed.
 */
static int on_exit(on_exit_func_t function, void *arg)
{
	if (__on_exit_count == ATEXIT_MAX)
		return -ENOMEM;

	/*
	 * Install the dispatcher when the first handler is registered.  If
	 * that fails nothing would ever be called, so report the failure
	 * instead of silently recording the handler.
	 */
	if (__on_exit_count == 0 && atexit(__handle_on_exit_funcs) != 0)
		return -ENOMEM;

	__on_exit_funcs[__on_exit_count] = function;
	__on_exit_args[__on_exit_count++] = arg;
	return 0;
}

/*
 * atexit() hook: run the registered handlers in reverse order of
 * registration, which is the order the real on_exit()/atexit() use.
 */
static void __handle_on_exit_funcs(void)
{
	int i;

	for (i = __on_exit_count - 1; i >= 0; i--)
		__on_exit_funcs[i] (__exitcode, __on_exit_args[i]);
}
#endif

66 67 68 69 70
/* How the output file is opened when a recording session starts. */
enum write_mode_t {
	WRITE_FORCE  = 0,	/* start a fresh file (opened with O_TRUNC) */
	WRITE_APPEND = 1,	/* keep the existing contents and add to them */
};

71
struct perf_record {
72
	struct perf_tool	tool;
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
	struct perf_record_opts	opts;
	u64			bytes_written;
	const char		*output_name;
	struct perf_evlist	*evlist;
	struct perf_session	*session;
	const char		*progname;
	int			output;
	unsigned int		page_size;
	int			realtime_prio;
	enum write_mode_t	write_mode;
	bool			no_buildid;
	bool			no_buildid_cache;
	bool			force;
	bool			file_new;
	bool			append_file;
	long			samples;
	off_t			post_processing_offset;
90
};
91

92
/* Add @size to the byte counter of @rec; nothing is written here. */
static void advance_output(struct perf_record *rec, size_t size)
{
	rec->bytes_written = rec->bytes_written + size;
}

97
static int write_output(struct perf_record *rec, void *buf, size_t size)
98 99
{
	while (size) {
100
		int ret = write(rec->output, buf, size);
101

102 103 104 105
		if (ret < 0) {
			pr_err("failed to write\n");
			return -1;
		}
106 107 108 109

		size -= ret;
		buf += ret;

110
		rec->bytes_written += ret;
111
	}
112 113

	return 0;
114 115
}

116
/*
 * Callback for synthesized events: write the event, as-is, to the output
 * of the perf_record that embeds @tool.
 *
 * Returns 0 on success, -1 if the write failed.
 */
static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	struct perf_record *rec = container_of(tool, struct perf_record, tool);
	int rc = write_output(rec, event, event->header.size);

	return rc < 0 ? -1 : 0;
}

128
static int perf_record__mmap_read(struct perf_record *rec,
129
				   struct perf_mmap *md)
130
{
131
	unsigned int head = perf_mmap__read_head(md);
132
	unsigned int old = md->prev;
133
	unsigned char *data = md->base + rec->page_size;
134 135
	unsigned long size;
	void *buf;
136
	int rc = 0;
137

138
	if (old == head)
139
		return 0;
140

141
	rec->samples++;
142 143 144 145 146 147 148

	size = head - old;

	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;
149

150 151 152 153
		if (write_output(rec, buf, size) < 0) {
			rc = -1;
			goto out;
		}
154 155 156 157 158
	}

	buf = &data[old & md->mask];
	size = head - old;
	old += size;
159

160 161 162 163
	if (write_output(rec, buf, size) < 0) {
		rc = -1;
		goto out;
	}
164 165

	md->prev = old;
166
	perf_mmap__write_tail(md, old);
167 168 169

out:
	return rc;
170 171 172
}

/*
 * Flags shared between the signal handler and the rest of the file.  They
 * are written from signal context, so they use volatile sig_atomic_t, the
 * only object type the C standard allows a handler to store to.
 */
static volatile sig_atomic_t done = 0;		 /* a handled signal arrived */
static volatile sig_atomic_t signr = -1;	 /* last signal seen, -1 if none */
static volatile sig_atomic_t child_finished = 0; /* SIGCHLD was received */

/*
 * Signal handler: note which signal arrived and flag that recording should
 * stop.  SIGCHLD additionally marks the child as finished.
 */
static void sig_handler(int sig)
{
	if (sig == SIGCHLD)
		child_finished = 1;

	done = 1;
	signr = sig;
}

185
/*
 * Exit handler (registered with on_exit()): reap the workload and, if we
 * are exiting because of a signal, re-raise it with the default action.
 *
 * @arg is the struct perf_record of the session.
 */
static void perf_record__sig_exit(int exit_status __maybe_unused, void *arg)
{
	struct perf_record *rec = arg;
	int status = 0;

	if (rec->evlist->workload.pid > 0) {
		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		/*
		 * Only look at 'status' when wait() actually reaped a child;
		 * on failure it is left unwritten.
		 */
		if (wait(&status) > 0 && WIFSIGNALED(status))
			psignal(WTERMSIG(status), rec->progname);
	}

	/* No signal, or SIGUSR1: nothing to re-raise. */
	if (signr == -1 || signr == SIGUSR1)
		return;

	/* Restore the default disposition and deliver the signal to ourselves. */
	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}

206 207 208 209 210 211 212 213
static bool perf_evlist__equal(struct perf_evlist *evlist,
			       struct perf_evlist *other)
{
	struct perf_evsel *pos, *pair;

	if (evlist->nr_entries != other->nr_entries)
		return false;

214
	pair = perf_evlist__first(other);
215 216 217 218

	list_for_each_entry(pos, &evlist->entries, node) {
		if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr) != 0))
			return false;
219
		pair = perf_evsel__next(pair);
220 221 222 223 224
	}

	return true;
}

225
static int perf_record__open(struct perf_record *rec)
226
{
227
	struct perf_evsel *pos;
228 229 230
	struct perf_evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct perf_record_opts *opts = &rec->opts;
231
	int rc = 0;
232

233 234 235 236
	/*
	 * Set the evsel leader links before we configure attributes,
	 * since some might depend on this info.
	 */
237
	if (opts->group)
238
		perf_evlist__set_leader(evlist);
239

240 241
	perf_evlist__config_attrs(evlist, opts);

242 243 244 245 246 247 248 249 250 251 252 253 254 255 256
	list_for_each_entry(pos, &evlist->entries, node) {
		struct perf_event_attr *attr = &pos->attr;
		/*
		 * Check if parse_single_tracepoint_event has already asked for
		 * PERF_SAMPLE_TIME.
		 *
		 * XXX this is kludgy but short term fix for problems introduced by
		 * eac23d1c that broke 'perf script' by having different sample_types
		 * when using multiple tracepoint events when we use a perf binary
		 * that tries to use sample_id_all on an older kernel.
		 *
		 * We need to move counter creation to perf_session, support
		 * different sample_types, etc.
		 */
		bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
257

258 259 260
fallback_missing_features:
		if (opts->exclude_guest_missing)
			attr->exclude_guest = attr->exclude_host = 0;
261
retry_sample_id:
262
		attr->sample_id_all = opts->sample_id_all_missing ? 0 : 1;
263
try_again:
264
		if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) {
265 266
			int err = errno;

267
			if (err == EPERM || err == EACCES) {
268
				ui__error_paranoid();
269 270
				rc = -err;
				goto out;
271
			} else if (err ==  ENODEV && opts->target.cpu_list) {
272 273 274 275
				pr_err("No such device - did you specify"
				       " an out-of-range profile CPU?\n");
				rc = -err;
				goto out;
276 277 278 279 280 281 282
			} else if (err == EINVAL) {
				if (!opts->exclude_guest_missing &&
				    (attr->exclude_guest || attr->exclude_host)) {
					pr_debug("Old kernel, cannot exclude "
						 "guest or host samples.\n");
					opts->exclude_guest_missing = true;
					goto fallback_missing_features;
283
				} else if (!opts->sample_id_all_missing) {
284 285 286
					/*
					 * Old kernel, no attr->sample_id_type_all field
					 */
287
					opts->sample_id_all_missing = true;
288
					if (!opts->sample_time && !opts->raw_samples && !time_needed)
289
						perf_evsel__reset_sample_bit(pos, TIME);
290 291 292

					goto retry_sample_id;
				}
293
			}
294

295 296 297
			/*
			 * If it's cycles then fall back to hrtimer
			 * based cpu-clock-tick sw counter, which
298 299 300 301
			 * is always available even if no PMU support.
			 *
			 * PPC returns ENXIO until 2.6.37 (behavior changed
			 * with commit b0a873e).
302
			 */
303 304
			if ((err == ENOENT || err == ENXIO)
					&& attr->type == PERF_TYPE_HARDWARE
305 306 307
					&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {

				if (verbose)
308 309
					ui__warning("The cycles event is not supported, "
						    "trying to fall back to cpu-clock-ticks\n");
310 311
				attr->type = PERF_TYPE_SOFTWARE;
				attr->config = PERF_COUNT_SW_CPU_CLOCK;
312 313 314 315
				if (pos->name) {
					free(pos->name);
					pos->name = NULL;
				}
316 317
				goto try_again;
			}
318 319

			if (err == ENOENT) {
320
				ui__error("The %s event is not supported.\n",
321
					  perf_evsel__name(pos));
322 323
				rc = -err;
				goto out;
324 325 326 327 328
			} else if ((err == EOPNOTSUPP) && (attr->precise_ip)) {
				ui__error("\'precise\' request may not be supported. "
					  "Try removing 'p' modifier\n");
				rc = -err;
				goto out;
329 330
			}

331
			printf("\n");
332 333 334 335
			error("sys_perf_event_open() syscall returned with %d "
			      "(%s) for event %s. /bin/dmesg may provide "
			      "additional information.\n",
			      err, strerror(err), perf_evsel__name(pos));
336 337

#if defined(__i386__) || defined(__x86_64__)
338 339 340 341 342 343 344 345 346
			if (attr->type == PERF_TYPE_HARDWARE &&
			    err == EOPNOTSUPP) {
				pr_err("No hardware sampling interrupt available."
				       " No APIC? If so then you can boot the kernel"
				       " with the \"lapic\" boot parameter to"
				       " force-enable it.\n");
				rc = -err;
				goto out;
			}
347 348
#endif

349 350 351
			pr_err("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
			rc = -err;
			goto out;
L
Li Zefan 已提交
352 353
		}
	}
354

355
	if (perf_evlist__apply_filters(evlist)) {
356 357
		error("failed to set filter with %d (%s)\n", errno,
			strerror(errno));
358 359
		rc = -1;
		goto out;
360 361
	}

362
	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
363 364 365 366 367 368 369
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %d)\n", opts->mmap_pages);
			rc = -errno;
370 371
		} else if (!is_power_of_2(opts->mmap_pages) &&
			   (opts->mmap_pages != UINT_MAX)) {
372 373 374 375 376 377 378
			pr_err("--mmap_pages/-m value must be a power of two.");
			rc = -EINVAL;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno, strerror(errno));
			rc = -errno;
		}
		goto out;
379
	}
380

381
	if (rec->file_new)
382 383 384 385
		session->evlist = evlist;
	else {
		if (!perf_evlist__equal(session->evlist, evlist)) {
			fprintf(stderr, "incompatible append\n");
386 387
			rc = -1;
			goto out;
388 389 390
		}
 	}

391
	perf_session__set_id_hdr_size(session);
392 393
out:
	return rc;
394 395
}

396
static int process_buildids(struct perf_record *rec)
397
{
398
	u64 size = lseek(rec->output, 0, SEEK_CUR);
399

400 401 402
	if (size == 0)
		return 0;

403 404 405
	rec->session->fd = rec->output;
	return __perf_session__process_events(rec->session, rec->post_processing_offset,
					      size - rec->post_processing_offset,
406 407 408
					      size, &build_id__mark_dso_hit_ops);
}

409
static void perf_record__exit(int status, void *arg)
410
{
411 412
	struct perf_record *rec = arg;

413 414 415
	if (status != 0)
		return;

416 417 418 419 420 421 422 423 424
	if (!rec->opts.pipe_output) {
		rec->session->header.data_size += rec->bytes_written;

		if (!rec->no_buildid)
			process_buildids(rec);
		perf_session__write_header(rec->session, rec->evlist,
					   rec->output, true);
		perf_session__delete(rec->session);
		perf_evlist__delete(rec->evlist);
425
		symbol__exit();
426
	}
427 428
}

429
static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
430 431
{
	int err;
432
	struct perf_tool *tool = data;
433

434
	if (machine__is_host(machine))
435 436 437 438 439 440 441 442 443 444
		return;

	/*
	 *As for guest kernel when processing subcommand record&report,
	 *we arrange module mmap prior to guest kernel mmap and trigger
	 *a preload dso because default guest module symbols are loaded
	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
	 *method is used to avoid symbol missing when the first addr is
	 *in module instead of in guest kernel.
	 */
445
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
446
					     machine);
447 448
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
449
		       " relocation symbol.\n", machine->pid);
450 451 452 453 454

	/*
	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
	 * have no _text sometimes.
	 */
455
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
456
						 machine, "_text");
457
	if (err < 0)
458
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
459
							 machine, "_stext");
460 461
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
462
		       " relocation symbol.\n", machine->pid);
463 464
}

465 466 467 468 469
/* Header-only PERF_RECORD_FINISHED_ROUND event, written as-is to the output. */
static struct perf_event_header finished_round_event = {
	.type = PERF_RECORD_FINISHED_ROUND,
	.size = sizeof(struct perf_event_header),
};

470
static int perf_record__mmap_read_all(struct perf_record *rec)
471
{
472
	int i;
473
	int rc = 0;
474

475
	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
476 477 478 479 480 481
		if (rec->evlist->mmap[i].base) {
			if (perf_record__mmap_read(rec, &rec->evlist->mmap[i]) != 0) {
				rc = -1;
				goto out;
			}
		}
482 483
	}

484
	if (perf_header__has_feat(&rec->session->header, HEADER_TRACING_DATA))
485 486 487 488 489
		rc = write_output(rec, &finished_round_event,
				  sizeof(finished_round_event));

out:
	return rc;
490 491
}

492
static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
493
{
I
Ingo Molnar 已提交
494 495
	struct stat st;
	int flags;
496
	int err, output, feat;
497
	unsigned long waking = 0;
498
	const bool forks = argc > 0;
499
	struct machine *machine;
500
	struct perf_tool *tool = &rec->tool;
501 502 503 504
	struct perf_record_opts *opts = &rec->opts;
	struct perf_evlist *evsel_list = rec->evlist;
	const char *output_name = rec->output_name;
	struct perf_session *session;
505
	bool disabled = false;
506

507
	rec->progname = argv[0];
508

509
	rec->page_size = sysconf(_SC_PAGE_SIZE);
510

511
	on_exit(perf_record__sig_exit, rec);
512 513
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
514
	signal(SIGUSR1, sig_handler);
515

516 517
	if (!output_name) {
		if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
518
			opts->pipe_output = true;
519
		else
520
			rec->output_name = output_name = "perf.data";
521 522 523
	}
	if (output_name) {
		if (!strcmp(output_name, "-"))
524
			opts->pipe_output = true;
525
		else if (!stat(output_name, &st) && st.st_size) {
526
			if (rec->write_mode == WRITE_FORCE) {
527 528 529 530 531 532
				char oldname[PATH_MAX];
				snprintf(oldname, sizeof(oldname), "%s.old",
					 output_name);
				unlink(oldname);
				rename(output_name, oldname);
			}
533 534
		} else if (rec->write_mode == WRITE_APPEND) {
			rec->write_mode = WRITE_FORCE;
535
		}
536 537
	}

538
	flags = O_CREAT|O_RDWR;
539 540
	if (rec->write_mode == WRITE_APPEND)
		rec->file_new = 0;
I
Ingo Molnar 已提交
541 542 543
	else
		flags |= O_TRUNC;

544
	if (opts->pipe_output)
545 546 547
		output = STDOUT_FILENO;
	else
		output = open(output_name, flags, S_IRUSR | S_IWUSR);
548 549
	if (output < 0) {
		perror("failed to create output file");
550
		return -1;
551 552
	}

553 554
	rec->output = output;

555
	session = perf_session__new(output_name, O_WRONLY,
556
				    rec->write_mode == WRITE_FORCE, false, NULL);
557
	if (session == NULL) {
558 559 560 561
		pr_err("Not enough memory for reading perf file header\n");
		return -1;
	}

562 563
	rec->session = session;

564 565 566 567 568 569 570
	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&evsel_list->entries))
571
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
572

573 574 575
	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

576
	if (!rec->file_new) {
577
		err = perf_session__read_header(session, output);
578
		if (err < 0)
579
			goto out_delete_session;
580 581
	}

582
	if (forks) {
583
		err = perf_evlist__prepare_workload(evsel_list, opts, argv);
584 585 586
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			goto out_delete_session;
587 588 589
		}
	}

590 591 592 593
	if (perf_record__open(rec) != 0) {
		err = -1;
		goto out_delete_session;
	}
594

595
	/*
596
	 * perf_session__delete(session) will be called at perf_record__exit()
597
	 */
598
	on_exit(perf_record__exit, rec);
599

600
	if (opts->pipe_output) {
601 602
		err = perf_header__write_pipe(output);
		if (err < 0)
603
			goto out_delete_session;
604
	} else if (rec->file_new) {
605 606
		err = perf_session__write_header(session, evsel_list,
						 output, false);
607
		if (err < 0)
608
			goto out_delete_session;
609 610
	}

611
	if (!rec->no_buildid
612
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
613
		pr_err("Couldn't generate buildids. "
614
		       "Use --no-buildid to profile anyway.\n");
615 616
		err = -1;
		goto out_delete_session;
617 618
	}

619
	rec->post_processing_offset = lseek(output, 0, SEEK_CUR);
620

621 622 623
	machine = perf_session__find_host_machine(session);
	if (!machine) {
		pr_err("Couldn't find native kernel information.\n");
624 625
		err = -1;
		goto out_delete_session;
626 627
	}

628
	if (opts->pipe_output) {
629
		err = perf_event__synthesize_attrs(tool, session,
630
						   process_synthesized_event);
631 632
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
633
			goto out_delete_session;
634
		}
635

636
		err = perf_event__synthesize_event_types(tool, process_synthesized_event,
637
							 machine);
638 639
		if (err < 0) {
			pr_err("Couldn't synthesize event_types.\n");
640
			goto out_delete_session;
641
		}
642

643
		if (have_tracepoints(&evsel_list->entries)) {
644 645 646 647 648 649 650 651
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so its not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
652
			err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
653
								  process_synthesized_event);
654 655
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
656
				goto out_delete_session;
657
			}
658
			advance_output(rec, err);
659
		}
660 661
	}

662
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
663
						 machine, "_text");
664
	if (err < 0)
665
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
666
							 machine, "_stext");
667 668 669 670
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");
671

672
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
673
					     machine);
674 675 676 677 678
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

679
	if (perf_guest)
680
		perf_session__process_machines(session, tool,
681
					       perf_event__synthesize_guest_os);
682

683
	if (!opts->target.system_wide)
684
		err = perf_event__synthesize_thread_map(tool, evsel_list->threads,
685
						  process_synthesized_event,
686
						  machine);
687
	else
688
		err = perf_event__synthesize_threads(tool, process_synthesized_event,
689
					       machine);
690

691 692 693
	if (err != 0)
		goto out_delete_session;

694
	if (rec->realtime_prio) {
695 696
		struct sched_param param;

697
		param.sched_priority = rec->realtime_prio;
698
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
699
			pr_err("Could not set realtime priority.\n");
700 701
			err = -1;
			goto out_delete_session;
702 703 704
		}
	}

705 706 707 708 709 710 711
	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!perf_target__none(&opts->target))
		perf_evlist__enable(evsel_list);
712

713 714 715
	/*
	 * Let the child rip
	 */
716
	if (forks)
717
		perf_evlist__start_workload(evsel_list);
718

719
	for (;;) {
720
		int hits = rec->samples;
721

722 723 724 725
		if (perf_record__mmap_read_all(rec) < 0) {
			err = -1;
			goto out_delete_session;
		}
726

727
		if (hits == rec->samples) {
728 729
			if (done)
				break;
730
			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
731 732 733
			waking++;
		}

734 735 736 737 738
		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
739
		if (done && !disabled && !perf_target__none(&opts->target)) {
740
			perf_evlist__disable(evsel_list);
741 742
			disabled = true;
		}
743 744
	}

745
	if (quiet || signr == SIGUSR1)
746 747
		return 0;

748 749
	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

750 751 752 753
	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
754
		"[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
755
		(double)rec->bytes_written / 1024.0 / 1024.0,
756
		output_name,
757
		rec->bytes_written / 24);
758

759
	return 0;
760 761 762 763

out_delete_session:
	perf_session__delete(session);
	return err;
764
}
765

766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787
/* One accepted branch filter name and the sample-branch flag it selects. */
struct branch_mode {
	const char *name;
	int mode;
};

/* Table entry for filter name @n selecting flag(s) @m. */
#define BRANCH_OPT(n, m) \
	{ .name = n, .mode = (m) }

/* Terminating entry: a NULL name ends the table. */
#define BRANCH_END { .name = NULL }

/* Names accepted by the branch filter option, NULL-terminated. */
static const struct branch_mode branch_modes[] = {
	BRANCH_OPT("u",        PERF_SAMPLE_BRANCH_USER),
	BRANCH_OPT("k",        PERF_SAMPLE_BRANCH_KERNEL),
	BRANCH_OPT("hv",       PERF_SAMPLE_BRANCH_HV),
	BRANCH_OPT("any",      PERF_SAMPLE_BRANCH_ANY),
	BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
	BRANCH_OPT("any_ret",  PERF_SAMPLE_BRANCH_ANY_RETURN),
	BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
	BRANCH_END
};

static int
788
parse_branch_stack(const struct option *opt, const char *str, int unset)
789 790 791 792 793 794 795 796
{
#define ONLY_PLM \
	(PERF_SAMPLE_BRANCH_USER	|\
	 PERF_SAMPLE_BRANCH_KERNEL	|\
	 PERF_SAMPLE_BRANCH_HV)

	uint64_t *mode = (uint64_t *)opt->value;
	const struct branch_mode *br;
797
	char *s, *os = NULL, *p;
798 799
	int ret = -1;

800 801
	if (unset)
		return 0;
802

803 804 805 806
	/*
	 * cannot set it twice, -b + --branch-filter for instance
	 */
	if (*mode)
807 808
		return -1;

809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829
	/* str may be NULL in case no arg is passed to -b */
	if (str) {
		/* because str is read-only */
		s = os = strdup(str);
		if (!s)
			return -1;

		for (;;) {
			p = strchr(s, ',');
			if (p)
				*p = '\0';

			for (br = branch_modes; br->name; br++) {
				if (!strcasecmp(s, br->name))
					break;
			}
			if (!br->name) {
				ui__warning("unknown branch filter %s,"
					    " check man page\n", s);
				goto error;
			}
830

831
			*mode |= br->mode;
832

833 834
			if (!p)
				break;
835

836 837
			s = p + 1;
		}
838 839 840
	}
	ret = 0;

841
	/* default to any branch */
842
	if ((*mode & ~ONLY_PLM) == 0) {
843
		*mode = PERF_SAMPLE_BRANCH_ANY;
844 845 846 847 848 849
	}
error:
	free(os);
	return ret;
}

850
#ifdef LIBUNWIND_SUPPORT
851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875
/*
 * Parse the stack dump size given in @str.
 *
 * The value is rounded up to a multiple of sizeof(u64) and must then be
 * non-zero and no larger than USHRT_MAX rounded down to that multiple.
 *
 * Returns 0 and stores the size in *_size on success; prints an error and
 * returns -1 otherwise, leaving *_size untouched.
 */
static int get_stack_size(char *str, unsigned long *_size)
{
	char *endptr;
	unsigned long size;
	unsigned long max_size = round_down(USHRT_MAX, sizeof(u64));

	size = strtoul(str, &endptr, 0);

	/* Accept only input that was consumed entirely as a number. */
	if (*endptr == '\0') {
		size = round_up(size, sizeof(u64));
		if (size && size <= max_size) {
			*_size = size;
			return 0;
		}
	}

	/* max_size is unsigned long, hence %lu. */
	pr_err("callchain: Incorrect stack dump size (max %lu): %s\n",
	       max_size, str);
	return -1;
}
876
#endif /* LIBUNWIND_SUPPORT */
877 878

static int
879
parse_callchain_opt(const struct option *opt __maybe_unused, const char *arg,
880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914
		    int unset)
{
	struct perf_record *rec = (struct perf_record *)opt->value;
	char *tok, *name, *saveptr = NULL;
	char *buf;
	int ret = -1;

	/* --no-call-graph */
	if (unset)
		return 0;

	/* We specified default option if none is provided. */
	BUG_ON(!arg);

	/* We need buffer that we know we can write to. */
	buf = malloc(strlen(arg) + 1);
	if (!buf)
		return -ENOMEM;

	strcpy(buf, arg);

	tok = strtok_r((char *)buf, ",", &saveptr);
	name = tok ? : (char *)buf;

	do {
		/* Framepointer style */
		if (!strncmp(name, "fp", sizeof("fp"))) {
			if (!strtok_r(NULL, ",", &saveptr)) {
				rec->opts.call_graph = CALLCHAIN_FP;
				ret = 0;
			} else
				pr_err("callchain: No more arguments "
				       "needed for -g fp\n");
			break;

915
#ifdef LIBUNWIND_SUPPORT
916 917
		/* Dwarf style */
		} else if (!strncmp(name, "dwarf", sizeof("dwarf"))) {
918 919
			const unsigned long default_stack_dump_size = 8192;

920 921 922 923 924 925 926 927 928 929 930 931 932 933 934
			ret = 0;
			rec->opts.call_graph = CALLCHAIN_DWARF;
			rec->opts.stack_dump_size = default_stack_dump_size;

			tok = strtok_r(NULL, ",", &saveptr);
			if (tok) {
				unsigned long size = 0;

				ret = get_stack_size(tok, &size);
				rec->opts.stack_dump_size = size;
			}

			if (!ret)
				pr_debug("callchain: stack dump size %d\n",
					 rec->opts.stack_dump_size);
935
#endif /* LIBUNWIND_SUPPORT */
936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951
		} else {
			pr_err("callchain: Unknown -g option "
			       "value: %s\n", arg);
			break;
		}

	} while (0);

	free(buf);

	if (!ret)
		pr_debug("callchain: type %d\n", rec->opts.call_graph);

	return ret;
}

952
/* Usage lines for 'perf record'; the array is NULL-terminated. */
static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL,
};

958 959 960 961 962 963 964 965 966 967 968 969 970 971 972
/*
 * XXX Ideally would be local to cmd_record() and passed to a perf_record__new
 * because we need to have access to it in perf_record__exit, that is called
 * after cmd_record() exits, but since record_options need to be accessible to
 * builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
/*
 * The single perf_record instance; the option table below stores parsed
 * values straight into it.
 */
static struct perf_record record = {
	.opts = {
		/* UINT_MAX / ULLONG_MAX are the "not given by the user" values */
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		/* frequency used unless -F or -c says otherwise */
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
		},
	},
	.write_mode = WRITE_FORCE,
	.file_new   = true,
};
981

982 983 984 985 986 987 988 989
/* Help text of -g/--call-graph; the mode list depends on libunwind support. */
#define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: "

#ifdef LIBUNWIND_SUPPORT
#define CALLCHAIN_MODES "[fp] dwarf"
#else
#define CALLCHAIN_MODES "[fp]"
#endif

static const char callchain_help[] = CALLCHAIN_HELP CALLCHAIN_MODES;

990 991 992 993 994 995 996
/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use perf_record_opts,
 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
 * using pipes, etc.
 */
997
const struct option record_options[] = {
998
	OPT_CALLBACK('e', "event", &record.evlist, "event",
999
		     "event selector. use 'perf list' to list available events",
1000
		     parse_events_option),
1001
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
L
Li Zefan 已提交
1002
		     "event filter", parse_filter),
1003
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
1004
		    "record events on existing process id"),
1005
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
1006
		    "record events on existing thread id"),
1007
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
1008
		    "collect data with this RT SCHED_FIFO priority"),
1009
	OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
1010
		    "collect data without buffering"),
1011
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
1012
		    "collect raw sample records from all opened counters"),
1013
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
1014
			    "system-wide collection from all CPUs"),
1015
	OPT_BOOLEAN('A', "append", &record.append_file,
I
Ingo Molnar 已提交
1016
			    "append to the output file to do incremental profiling"),
1017
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
1018
		    "list of cpus to monitor"),
1019
	OPT_BOOLEAN('f', "force", &record.force,
1020
			"overwrite existing data file (deprecated)"),
1021 1022
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.output_name, "file",
I
Ingo Molnar 已提交
1023
		    "output file name"),
1024
	OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
1025
		    "child tasks do not inherit counters"),
1026 1027
	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
	OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
1028
		     "number of mmap data pages"),
1029
	OPT_BOOLEAN(0, "group", &record.opts.group,
1030
		    "put the counters into a counter group"),
1031 1032 1033
	OPT_CALLBACK_DEFAULT('g', "call-graph", &record, "mode[,dump_size]",
			     callchain_help, &parse_callchain_opt,
			     "fp"),
1034
	OPT_INCR('v', "verbose", &verbose,
1035
		    "be more verbose (show counter open errors, etc)"),
1036
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
1037
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
1038
		    "per thread counts"),
1039
	OPT_BOOLEAN('d', "data", &record.opts.sample_address,
1040
		    "Sample addresses"),
1041
	OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
1042
	OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
1043
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
1044
		    "don't sample"),
1045
	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
1046
		    "do not update the buildid cache"),
1047
	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
1048
		    "do not collect buildids in perf.data"),
1049
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
S
Stephane Eranian 已提交
1050 1051
		     "monitor event in cgroup name only",
		     parse_cgroups),
1052 1053
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),
1054 1055 1056 1057 1058 1059 1060

	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
1061
		     parse_branch_stack),
1062 1063 1064
	OPT_END()
};

1065
/*
 * Entry point of 'perf record': parse and validate the command line, set up
 * the event list and sampling parameters, then run __cmd_record().
 *
 * Returns the result of __cmd_record(), or a negative value when setup
 * fails.  Invalid option combinations go through usage_with_options().
 */
int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
	int err = -ENOMEM;
	struct perf_evsel *pos;
	struct perf_evlist *evsel_list;
	struct perf_record *rec = &record;
	char errbuf[BUFSIZ];

	evsel_list = perf_evlist__new(NULL, NULL);
	if (evsel_list == NULL)
		return -ENOMEM;

	rec->evlist = evsel_list;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	/* Neither a workload nor a target to attach to was given. */
	if (!argc && perf_target__none(&rec->opts.target))
		usage_with_options(record_usage, record_options);

	if (rec->force && rec->append_file) {
		/* Terminate the line so the usage text starts on its own line. */
		ui__error("Can't overwrite and append at the same time."
			  " You need to choose between -f and -A\n");
		usage_with_options(record_usage, record_options);
	} else if (rec->append_file) {
		rec->write_mode = WRITE_APPEND;
	} else {
		rec->write_mode = WRITE_FORCE;
	}

	if (nr_cgroups && !rec->opts.target.system_wide) {
		ui__error("cgroup monitoring only available in"
			  " system-wide mode\n");
		usage_with_options(record_usage, record_options);
	}

	symbol__init();

	if (symbol_conf.kptr_restrict)
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (rec->no_buildid_cache || rec->no_buildid)
		disable_buildid_cache();

	/* No -e given: fall back to the default event. */
	if (evsel_list->nr_entries == 0 &&
	    perf_evlist__add_default(evsel_list) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	}

	/* A validation problem is only warned about, not treated as fatal. */
	err = perf_target__validate(&rec->opts.target);
	if (err) {
		perf_target__strerror(&rec->opts.target, err, errbuf, sizeof(errbuf));
		ui__warning("%s", errbuf);
	}

	err = perf_target__parse_uid(&rec->opts.target);
	if (err) {
		/* Save errno before the reporting calls can change it. */
		int saved_errno = errno;

		perf_target__strerror(&rec->opts.target, err, errbuf, sizeof(errbuf));
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out_free_fd;
	}

	err = -ENOMEM;
	if (perf_evlist__create_maps(evsel_list, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_header__push_event(pos->attr.config, perf_evsel__name(pos)))
			goto out_free_fd;
	}

	/* Take over -c / -F only when the user actually supplied them. */
	if (rec->opts.user_interval != ULLONG_MAX)
		rec->opts.default_interval = rec->opts.user_interval;
	if (rec->opts.user_freq != UINT_MAX)
		rec->opts.freq = rec->opts.user_freq;

	/*
	 * User specified count overrides default frequency.
	 */
	if (rec->opts.default_interval)
		rec->opts.freq = 0;
	else if (rec->opts.freq) {
		rec->opts.default_interval = rec->opts.freq;
	} else {
		ui__error("frequency and count are zero, aborting\n");
		err = -EINVAL;
		goto out_free_fd;
	}

	err = __cmd_record(rec, argc, argv);
out_free_fd:
	perf_evlist__delete_maps(evsel_list);
out_symbol_exit:
	symbol__exit();
	return err;
}