/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#define _FILE_OFFSET_BITS 64

10
#include "builtin.h"
11 12 13

#include "perf.h"

14
#include "util/build-id.h"
15
#include "util/util.h"
16
#include "util/parse-options.h"
17
#include "util/parse-events.h"
18

19
#include "util/header.h"
20
#include "util/event.h"
21
#include "util/evlist.h"
22
#include "util/evsel.h"
23
#include "util/debug.h"
24
#include "util/session.h"
25
#include "util/tool.h"
26
#include "util/symbol.h"
27
#include "util/cpumap.h"
28
#include "util/thread_map.h"
29

30
#include <unistd.h>
31
#include <sched.h>
32
#include <sys/mman.h>
33

34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
#ifndef HAVE_ON_EXIT
/*
 * Minimal on_exit() replacement for C libraries that do not provide it.
 *
 * Handlers are stored in a fixed-size table and dispatched from a single
 * atexit() hook.  The exit status is captured by wrapping exit() in a macro,
 * so only exits that go through exit() in this file record their status.
 */
#ifndef ATEXIT_MAX
#define ATEXIT_MAX 32
#endif
static int __on_exit_count = 0;
typedef void (*on_exit_func_t) (int, void *);
static on_exit_func_t __on_exit_funcs[ATEXIT_MAX];
static void *__on_exit_args[ATEXIT_MAX];
static int __exitcode = 0;
static void __handle_on_exit_funcs(void);
static int on_exit(on_exit_func_t function, void *arg);
#define exit(x) (exit)(__exitcode = (x))

/*
 * Register @function to be called with the exit status and @arg at exit.
 *
 * Returns 0 on success, -ENOMEM when the table is full or the atexit() hook
 * could not be installed.
 */
static int on_exit(on_exit_func_t function, void *arg)
{
	if (__on_exit_count == ATEXIT_MAX)
		return -ENOMEM;

	/* The dispatcher is installed once, on the first registration. */
	if (__on_exit_count == 0 && atexit(__handle_on_exit_funcs) != 0)
		return -ENOMEM;

	__on_exit_funcs[__on_exit_count] = function;
	__on_exit_args[__on_exit_count++] = arg;
	return 0;
}

/*
 * Run the registered handlers in reverse order of registration, which is
 * the order the real on_exit()/atexit() use.  Callers rely on this: a
 * handler registered last must run before the ones registered earlier.
 */
static void __handle_on_exit_funcs(void)
{
	int i;

	for (i = __on_exit_count - 1; i >= 0; i--)
		__on_exit_funcs[i](__exitcode, __on_exit_args[i]);
}
#endif

66 67 68 69 70
/* How an already existing output file is treated. */
enum write_mode_t {
	WRITE_FORCE  = 0,	/* start a new file, truncating on open */
	WRITE_APPEND = 1	/* keep existing contents and append */
};

71
struct perf_record {
72
	struct perf_tool	tool;
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
	struct perf_record_opts	opts;
	u64			bytes_written;
	const char		*output_name;
	struct perf_evlist	*evlist;
	struct perf_session	*session;
	const char		*progname;
	int			output;
	unsigned int		page_size;
	int			realtime_prio;
	enum write_mode_t	write_mode;
	bool			no_buildid;
	bool			no_buildid_cache;
	bool			force;
	bool			file_new;
	bool			append_file;
	long			samples;
	off_t			post_processing_offset;
90
};
91

92
/*
 * Account for @size bytes that reached the output without going through
 * write_output().
 */
static void advance_output(struct perf_record *rec, size_t size)
{
	rec->bytes_written = rec->bytes_written + size;
}

97
static int write_output(struct perf_record *rec, void *buf, size_t size)
98 99
{
	while (size) {
100
		int ret = write(rec->output, buf, size);
101

102 103 104 105
		if (ret < 0) {
			pr_err("failed to write\n");
			return -1;
		}
106 107 108 109

		size -= ret;
		buf += ret;

110
		rec->bytes_written += ret;
111
	}
112 113

	return 0;
114 115
}

116
/*
 * Callback for the perf_event__synthesize_*() helpers: write the synthesized
 * @event to the output file.
 *
 * Returns 0 on success, -1 if the write failed.
 */
static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	struct perf_record *rec = container_of(tool, struct perf_record, tool);
	int written = write_output(rec, event, event->header.size);

	return written < 0 ? -1 : 0;
}

128
static int perf_record__mmap_read(struct perf_record *rec,
129
				   struct perf_mmap *md)
130
{
131
	unsigned int head = perf_mmap__read_head(md);
132
	unsigned int old = md->prev;
133
	unsigned char *data = md->base + rec->page_size;
134 135
	unsigned long size;
	void *buf;
136
	int rc = 0;
137

138
	if (old == head)
139
		return 0;
140

141
	rec->samples++;
142 143 144 145 146 147 148

	size = head - old;

	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;
149

150 151 152 153
		if (write_output(rec, buf, size) < 0) {
			rc = -1;
			goto out;
		}
154 155 156 157 158
	}

	buf = &data[old & md->mask];
	size = head - old;
	old += size;
159

160 161 162 163
	if (write_output(rec, buf, size) < 0) {
		rc = -1;
		goto out;
	}
164 165

	md->prev = old;
166
	perf_mmap__write_tail(md, old);
167 168 169

out:
	return rc;
170 171 172
}

/* Set by sig_handler(): ask the record loop to stop. */
static volatile int done = 0;
/* Last signal seen by sig_handler(), -1 if none. */
static volatile int signr = -1;
/* Set once SIGCHLD has been received. */
static volatile int child_finished = 0;

/*
 * Common handler for the signals installed by this file: remember which
 * signal arrived, note a SIGCHLD, and request termination of the main loop.
 */
static void sig_handler(int sig)
{
	signr = sig;

	if (sig == SIGCHLD)
		child_finished = 1;

	done = 1;
}

185
/*
 * Exit handler: reap the workload and, if we are exiting because of a
 * signal, die from that same signal.
 *
 * A workload that has not finished yet is sent SIGTERM first.  If it was
 * killed by a signal, that is reported via psignal().  Finally, unless no
 * signal was seen or it was SIGUSR1, the default disposition is restored
 * and the signal is re-raised on ourselves.
 */
static void perf_record__sig_exit(int exit_status __maybe_unused, void *arg)
{
	struct perf_record *rec = arg;
	int status;

	if (rec->evlist->workload.pid > 0) {
		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		/* status is only meaningful when wait() reaped a child. */
		if (wait(&status) > 0 && WIFSIGNALED(status))
			psignal(WTERMSIG(status), rec->progname);
	}

	if (signr == -1 || signr == SIGUSR1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}

206 207 208 209 210 211 212 213
static bool perf_evlist__equal(struct perf_evlist *evlist,
			       struct perf_evlist *other)
{
	struct perf_evsel *pos, *pair;

	if (evlist->nr_entries != other->nr_entries)
		return false;

214
	pair = perf_evlist__first(other);
215 216 217 218

	list_for_each_entry(pos, &evlist->entries, node) {
		if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr) != 0))
			return false;
219
		pair = perf_evsel__next(pair);
220 221 222 223 224
	}

	return true;
}

225
static int perf_record__open(struct perf_record *rec)
226
{
227
	struct perf_evsel *pos;
228 229 230
	struct perf_evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct perf_record_opts *opts = &rec->opts;
231
	int rc = 0;
232

233
	perf_evlist__config(evlist, opts);
234

235 236 237 238 239 240 241 242 243 244 245 246 247 248 249
	list_for_each_entry(pos, &evlist->entries, node) {
		struct perf_event_attr *attr = &pos->attr;
		/*
		 * Check if parse_single_tracepoint_event has already asked for
		 * PERF_SAMPLE_TIME.
		 *
		 * XXX this is kludgy but short term fix for problems introduced by
		 * eac23d1c that broke 'perf script' by having different sample_types
		 * when using multiple tracepoint events when we use a perf binary
		 * that tries to use sample_id_all on an older kernel.
		 *
		 * We need to move counter creation to perf_session, support
		 * different sample_types, etc.
		 */
		bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
250

251 252 253
fallback_missing_features:
		if (opts->exclude_guest_missing)
			attr->exclude_guest = attr->exclude_host = 0;
254
retry_sample_id:
255
		attr->sample_id_all = opts->sample_id_all_missing ? 0 : 1;
256
try_again:
257
		if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) {
258 259
			int err = errno;

260
			if (err == EPERM || err == EACCES) {
261
				ui__error_paranoid();
262 263
				rc = -err;
				goto out;
264
			} else if (err ==  ENODEV && opts->target.cpu_list) {
265 266 267 268
				pr_err("No such device - did you specify"
				       " an out-of-range profile CPU?\n");
				rc = -err;
				goto out;
269 270 271 272 273 274 275
			} else if (err == EINVAL) {
				if (!opts->exclude_guest_missing &&
				    (attr->exclude_guest || attr->exclude_host)) {
					pr_debug("Old kernel, cannot exclude "
						 "guest or host samples.\n");
					opts->exclude_guest_missing = true;
					goto fallback_missing_features;
276
				} else if (!opts->sample_id_all_missing) {
277 278 279
					/*
					 * Old kernel, no attr->sample_id_type_all field
					 */
280
					opts->sample_id_all_missing = true;
281
					if (!opts->sample_time && !opts->raw_samples && !time_needed)
282
						perf_evsel__reset_sample_bit(pos, TIME);
283 284 285

					goto retry_sample_id;
				}
286
			}
287

288 289 290
			/*
			 * If it's cycles then fall back to hrtimer
			 * based cpu-clock-tick sw counter, which
291 292 293 294
			 * is always available even if no PMU support.
			 *
			 * PPC returns ENXIO until 2.6.37 (behavior changed
			 * with commit b0a873e).
295
			 */
296 297
			if ((err == ENOENT || err == ENXIO)
					&& attr->type == PERF_TYPE_HARDWARE
298 299 300
					&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {

				if (verbose)
301 302
					ui__warning("The cycles event is not supported, "
						    "trying to fall back to cpu-clock-ticks\n");
303 304
				attr->type = PERF_TYPE_SOFTWARE;
				attr->config = PERF_COUNT_SW_CPU_CLOCK;
305 306 307 308
				if (pos->name) {
					free(pos->name);
					pos->name = NULL;
				}
309 310
				goto try_again;
			}
311 312

			if (err == ENOENT) {
313
				ui__error("The %s event is not supported.\n",
314
					  perf_evsel__name(pos));
315 316
				rc = -err;
				goto out;
317 318 319 320 321
			} else if ((err == EOPNOTSUPP) && (attr->precise_ip)) {
				ui__error("\'precise\' request may not be supported. "
					  "Try removing 'p' modifier\n");
				rc = -err;
				goto out;
322 323
			}

324
			printf("\n");
325 326 327 328
			error("sys_perf_event_open() syscall returned with %d "
			      "(%s) for event %s. /bin/dmesg may provide "
			      "additional information.\n",
			      err, strerror(err), perf_evsel__name(pos));
329 330

#if defined(__i386__) || defined(__x86_64__)
331 332 333 334 335 336 337 338 339
			if (attr->type == PERF_TYPE_HARDWARE &&
			    err == EOPNOTSUPP) {
				pr_err("No hardware sampling interrupt available."
				       " No APIC? If so then you can boot the kernel"
				       " with the \"lapic\" boot parameter to"
				       " force-enable it.\n");
				rc = -err;
				goto out;
			}
340 341
#endif

342 343 344
			pr_err("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
			rc = -err;
			goto out;
L
Li Zefan 已提交
345 346
		}
	}
347

348
	if (perf_evlist__apply_filters(evlist)) {
349 350
		error("failed to set filter with %d (%s)\n", errno,
			strerror(errno));
351 352
		rc = -1;
		goto out;
353 354
	}

355
	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
356 357 358 359 360 361 362
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %d)\n", opts->mmap_pages);
			rc = -errno;
363 364
		} else if (!is_power_of_2(opts->mmap_pages) &&
			   (opts->mmap_pages != UINT_MAX)) {
365 366 367 368 369 370 371
			pr_err("--mmap_pages/-m value must be a power of two.");
			rc = -EINVAL;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno, strerror(errno));
			rc = -errno;
		}
		goto out;
372
	}
373

374
	if (rec->file_new)
375 376 377 378
		session->evlist = evlist;
	else {
		if (!perf_evlist__equal(session->evlist, evlist)) {
			fprintf(stderr, "incompatible append\n");
379 380
			rc = -1;
			goto out;
381 382 383
		}
 	}

384
	perf_session__set_id_hdr_size(session);
385 386
out:
	return rc;
387 388
}

389
static int process_buildids(struct perf_record *rec)
390
{
391
	u64 size = lseek(rec->output, 0, SEEK_CUR);
392

393 394 395
	if (size == 0)
		return 0;

396 397 398
	rec->session->fd = rec->output;
	return __perf_session__process_events(rec->session, rec->post_processing_offset,
					      size - rec->post_processing_offset,
399 400 401
					      size, &build_id__mark_dso_hit_ops);
}

402
static void perf_record__exit(int status, void *arg)
403
{
404 405
	struct perf_record *rec = arg;

406 407 408
	if (status != 0)
		return;

409 410 411 412 413 414 415 416 417
	if (!rec->opts.pipe_output) {
		rec->session->header.data_size += rec->bytes_written;

		if (!rec->no_buildid)
			process_buildids(rec);
		perf_session__write_header(rec->session, rec->evlist,
					   rec->output, true);
		perf_session__delete(rec->session);
		perf_evlist__delete(rec->evlist);
418
		symbol__exit();
419
	}
420 421
}

422
static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
423 424
{
	int err;
425
	struct perf_tool *tool = data;
426

427
	if (machine__is_host(machine))
428 429 430 431 432 433 434 435 436 437
		return;

	/*
	 *As for guest kernel when processing subcommand record&report,
	 *we arrange module mmap prior to guest kernel mmap and trigger
	 *a preload dso because default guest module symbols are loaded
	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
	 *method is used to avoid symbol missing when the first addr is
	 *in module instead of in guest kernel.
	 */
438
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
439
					     machine);
440 441
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
442
		       " relocation symbol.\n", machine->pid);
443 444 445 446 447

	/*
	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
	 * have no _text sometimes.
	 */
448
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
449
						 machine, "_text");
450
	if (err < 0)
451
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
452
							 machine, "_stext");
453 454
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
455
		       " relocation symbol.\n", machine->pid);
456 457
}

458 459 460 461 462
/*
 * Header-only PERF_RECORD_FINISHED_ROUND event, written by
 * perf_record__mmap_read_all() after all ring buffers have been read.
 */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

463
static int perf_record__mmap_read_all(struct perf_record *rec)
464
{
465
	int i;
466
	int rc = 0;
467

468
	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
469 470 471 472 473 474
		if (rec->evlist->mmap[i].base) {
			if (perf_record__mmap_read(rec, &rec->evlist->mmap[i]) != 0) {
				rc = -1;
				goto out;
			}
		}
475 476
	}

477
	if (perf_header__has_feat(&rec->session->header, HEADER_TRACING_DATA))
478 479 480 481 482
		rc = write_output(rec, &finished_round_event,
				  sizeof(finished_round_event));

out:
	return rc;
483 484
}

485
static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
486
{
I
Ingo Molnar 已提交
487 488
	struct stat st;
	int flags;
489
	int err, output, feat;
490
	unsigned long waking = 0;
491
	const bool forks = argc > 0;
492
	struct machine *machine;
493
	struct perf_tool *tool = &rec->tool;
494 495 496 497
	struct perf_record_opts *opts = &rec->opts;
	struct perf_evlist *evsel_list = rec->evlist;
	const char *output_name = rec->output_name;
	struct perf_session *session;
498
	bool disabled = false;
499

500
	rec->progname = argv[0];
501

502
	rec->page_size = sysconf(_SC_PAGE_SIZE);
503

504
	on_exit(perf_record__sig_exit, rec);
505 506
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
507
	signal(SIGUSR1, sig_handler);
508

509 510
	if (!output_name) {
		if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
511
			opts->pipe_output = true;
512
		else
513
			rec->output_name = output_name = "perf.data";
514 515 516
	}
	if (output_name) {
		if (!strcmp(output_name, "-"))
517
			opts->pipe_output = true;
518
		else if (!stat(output_name, &st) && st.st_size) {
519
			if (rec->write_mode == WRITE_FORCE) {
520 521 522 523 524 525
				char oldname[PATH_MAX];
				snprintf(oldname, sizeof(oldname), "%s.old",
					 output_name);
				unlink(oldname);
				rename(output_name, oldname);
			}
526 527
		} else if (rec->write_mode == WRITE_APPEND) {
			rec->write_mode = WRITE_FORCE;
528
		}
529 530
	}

531
	flags = O_CREAT|O_RDWR;
532 533
	if (rec->write_mode == WRITE_APPEND)
		rec->file_new = 0;
I
Ingo Molnar 已提交
534 535 536
	else
		flags |= O_TRUNC;

537
	if (opts->pipe_output)
538 539 540
		output = STDOUT_FILENO;
	else
		output = open(output_name, flags, S_IRUSR | S_IWUSR);
541 542
	if (output < 0) {
		perror("failed to create output file");
543
		return -1;
544 545
	}

546 547
	rec->output = output;

548
	session = perf_session__new(output_name, O_WRONLY,
549
				    rec->write_mode == WRITE_FORCE, false, NULL);
550
	if (session == NULL) {
551 552 553 554
		pr_err("Not enough memory for reading perf file header\n");
		return -1;
	}

555 556
	rec->session = session;

557 558 559 560 561 562 563
	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&evsel_list->entries))
564
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
565

566 567 568
	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

569
	if (!rec->file_new) {
570
		err = perf_session__read_header(session, output);
571
		if (err < 0)
572
			goto out_delete_session;
573 574
	}

575
	if (forks) {
576
		err = perf_evlist__prepare_workload(evsel_list, opts, argv);
577 578 579
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			goto out_delete_session;
580 581 582
		}
	}

583 584 585 586
	if (perf_record__open(rec) != 0) {
		err = -1;
		goto out_delete_session;
	}
587

588
	/*
589
	 * perf_session__delete(session) will be called at perf_record__exit()
590
	 */
591
	on_exit(perf_record__exit, rec);
592

593
	if (opts->pipe_output) {
594 595
		err = perf_header__write_pipe(output);
		if (err < 0)
596
			goto out_delete_session;
597
	} else if (rec->file_new) {
598 599
		err = perf_session__write_header(session, evsel_list,
						 output, false);
600
		if (err < 0)
601
			goto out_delete_session;
602 603
	}

604
	if (!rec->no_buildid
605
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
606
		pr_err("Couldn't generate buildids. "
607
		       "Use --no-buildid to profile anyway.\n");
608 609
		err = -1;
		goto out_delete_session;
610 611
	}

612
	rec->post_processing_offset = lseek(output, 0, SEEK_CUR);
613

614 615 616
	machine = perf_session__find_host_machine(session);
	if (!machine) {
		pr_err("Couldn't find native kernel information.\n");
617 618
		err = -1;
		goto out_delete_session;
619 620
	}

621
	if (opts->pipe_output) {
622
		err = perf_event__synthesize_attrs(tool, session,
623
						   process_synthesized_event);
624 625
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
626
			goto out_delete_session;
627
		}
628

629
		err = perf_event__synthesize_event_types(tool, process_synthesized_event,
630
							 machine);
631 632
		if (err < 0) {
			pr_err("Couldn't synthesize event_types.\n");
633
			goto out_delete_session;
634
		}
635

636
		if (have_tracepoints(&evsel_list->entries)) {
637 638 639 640 641 642 643 644
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so its not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
645
			err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
646
								  process_synthesized_event);
647 648
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
649
				goto out_delete_session;
650
			}
651
			advance_output(rec, err);
652
		}
653 654
	}

655
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
656
						 machine, "_text");
657
	if (err < 0)
658
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
659
							 machine, "_stext");
660 661 662 663
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");
664

665
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
666
					     machine);
667 668 669 670 671
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

672
	if (perf_guest)
673
		perf_session__process_machines(session, tool,
674
					       perf_event__synthesize_guest_os);
675

676
	if (!opts->target.system_wide)
677
		err = perf_event__synthesize_thread_map(tool, evsel_list->threads,
678
						  process_synthesized_event,
679
						  machine);
680
	else
681
		err = perf_event__synthesize_threads(tool, process_synthesized_event,
682
					       machine);
683

684 685 686
	if (err != 0)
		goto out_delete_session;

687
	if (rec->realtime_prio) {
688 689
		struct sched_param param;

690
		param.sched_priority = rec->realtime_prio;
691
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
692
			pr_err("Could not set realtime priority.\n");
693 694
			err = -1;
			goto out_delete_session;
695 696 697
		}
	}

698 699 700 701 702 703 704
	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!perf_target__none(&opts->target))
		perf_evlist__enable(evsel_list);
705

706 707 708
	/*
	 * Let the child rip
	 */
709
	if (forks)
710
		perf_evlist__start_workload(evsel_list);
711

712
	for (;;) {
713
		int hits = rec->samples;
714

715 716 717 718
		if (perf_record__mmap_read_all(rec) < 0) {
			err = -1;
			goto out_delete_session;
		}
719

720
		if (hits == rec->samples) {
721 722
			if (done)
				break;
723
			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
724 725 726
			waking++;
		}

727 728 729 730 731
		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
732
		if (done && !disabled && !perf_target__none(&opts->target)) {
733
			perf_evlist__disable(evsel_list);
734 735
			disabled = true;
		}
736 737
	}

738
	if (quiet || signr == SIGUSR1)
739 740
		return 0;

741 742
	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

743 744 745 746
	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
747
		"[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
748
		(double)rec->bytes_written / 1024.0 / 1024.0,
749
		output_name,
750
		rec->bytes_written / 24);
751

752
	return 0;
753 754 755 756

out_delete_session:
	perf_session__delete(session);
	return err;
757
}
758

759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780
/* One entry of the branch filter name table. */
#define BRANCH_OPT(n, m)	\
	{			\
		.name = n,	\
		.mode = (m)	\
	}

/* Table terminator: an entry without a name. */
#define BRANCH_END { .name = NULL }

/* Maps a branch filter keyword to its PERF_SAMPLE_BRANCH_* bit. */
struct branch_mode {
	const char *name;
	int mode;
};

/* Keywords accepted by parse_branch_stack(); ends with BRANCH_END. */
static const struct branch_mode branch_modes[] = {
	/* privilege level selectors */
	BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
	BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
	BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
	/* branch type selectors */
	BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
	BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
	BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
	BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
	BRANCH_END
};

static int
781
parse_branch_stack(const struct option *opt, const char *str, int unset)
782 783 784 785 786 787 788 789
{
#define ONLY_PLM \
	(PERF_SAMPLE_BRANCH_USER	|\
	 PERF_SAMPLE_BRANCH_KERNEL	|\
	 PERF_SAMPLE_BRANCH_HV)

	uint64_t *mode = (uint64_t *)opt->value;
	const struct branch_mode *br;
790
	char *s, *os = NULL, *p;
791 792
	int ret = -1;

793 794
	if (unset)
		return 0;
795

796 797 798 799
	/*
	 * cannot set it twice, -b + --branch-filter for instance
	 */
	if (*mode)
800 801
		return -1;

802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822
	/* str may be NULL in case no arg is passed to -b */
	if (str) {
		/* because str is read-only */
		s = os = strdup(str);
		if (!s)
			return -1;

		for (;;) {
			p = strchr(s, ',');
			if (p)
				*p = '\0';

			for (br = branch_modes; br->name; br++) {
				if (!strcasecmp(s, br->name))
					break;
			}
			if (!br->name) {
				ui__warning("unknown branch filter %s,"
					    " check man page\n", s);
				goto error;
			}
823

824
			*mode |= br->mode;
825

826 827
			if (!p)
				break;
828

829 830
			s = p + 1;
		}
831 832 833
	}
	ret = 0;

834
	/* default to any branch */
835
	if ((*mode & ~ONLY_PLM) == 0) {
836
		*mode = PERF_SAMPLE_BRANCH_ANY;
837 838 839 840 841 842
	}
error:
	free(os);
	return ret;
}

843
#ifdef LIBUNWIND_SUPPORT
844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868
/*
 * Parse the stack dump size given to '-g dwarf,<size>'.
 *
 * The value is rounded up to a multiple of sizeof(u64).  It must be
 * non-zero and no larger than USHRT_MAX rounded down to the same multiple.
 *
 * Returns 0 and stores the size in *_size on success, -1 on bad input.
 */
static int get_stack_size(char *str, unsigned long *_size)
{
	char *endptr;
	unsigned long size;
	unsigned long max_size = round_down(USHRT_MAX, sizeof(u64));

	size = strtoul(str, &endptr, 0);

	/* Only accept input that was consumed completely. */
	if (*endptr == '\0') {
		size = round_up(size, sizeof(u64));
		if (size && size <= max_size) {
			*_size = size;
			return 0;
		}
	}

	/* max_size is unsigned long, hence %lu. */
	pr_err("callchain: Incorrect stack dump size (max %lu): %s\n",
	       max_size, str);
	return -1;
}
869
#endif /* LIBUNWIND_SUPPORT */
870 871

static int
872
parse_callchain_opt(const struct option *opt __maybe_unused, const char *arg,
873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907
		    int unset)
{
	struct perf_record *rec = (struct perf_record *)opt->value;
	char *tok, *name, *saveptr = NULL;
	char *buf;
	int ret = -1;

	/* --no-call-graph */
	if (unset)
		return 0;

	/* We specified default option if none is provided. */
	BUG_ON(!arg);

	/* We need buffer that we know we can write to. */
	buf = malloc(strlen(arg) + 1);
	if (!buf)
		return -ENOMEM;

	strcpy(buf, arg);

	tok = strtok_r((char *)buf, ",", &saveptr);
	name = tok ? : (char *)buf;

	do {
		/* Framepointer style */
		if (!strncmp(name, "fp", sizeof("fp"))) {
			if (!strtok_r(NULL, ",", &saveptr)) {
				rec->opts.call_graph = CALLCHAIN_FP;
				ret = 0;
			} else
				pr_err("callchain: No more arguments "
				       "needed for -g fp\n");
			break;

908
#ifdef LIBUNWIND_SUPPORT
909 910
		/* Dwarf style */
		} else if (!strncmp(name, "dwarf", sizeof("dwarf"))) {
911 912
			const unsigned long default_stack_dump_size = 8192;

913 914 915 916 917 918 919 920 921 922 923 924 925 926 927
			ret = 0;
			rec->opts.call_graph = CALLCHAIN_DWARF;
			rec->opts.stack_dump_size = default_stack_dump_size;

			tok = strtok_r(NULL, ",", &saveptr);
			if (tok) {
				unsigned long size = 0;

				ret = get_stack_size(tok, &size);
				rec->opts.stack_dump_size = size;
			}

			if (!ret)
				pr_debug("callchain: stack dump size %d\n",
					 rec->opts.stack_dump_size);
928
#endif /* LIBUNWIND_SUPPORT */
929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944
		} else {
			pr_err("callchain: Unknown -g option "
			       "value: %s\n", arg);
			break;
		}

	} while (0);

	free(buf);

	if (!ret)
		pr_debug("callchain: type %d\n", rec->opts.call_graph);

	return ret;
}

945
/* Usage lines passed to the option parser; NULL terminated. */
static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};

951 952 953 954 955 956 957 958 959 960 961 962 963 964 965
/*
 * XXX Ideally would be local to cmd_record() and passed to a perf_record__new
 * because we need to have access to it in perf_record__exit, that is called
 * after cmd_record() exits, but since record_options need to be accessible to
 * builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
/*
 * The single perf_record instance with its defaults.  The *_MAX values are
 * "not set by the user" markers checked in cmd_record() and
 * perf_record__open().
 */
static struct perf_record record = {
	.opts.mmap_pages	= UINT_MAX,
	.opts.user_freq		= UINT_MAX,
	.opts.user_interval	= ULLONG_MAX,
	.opts.freq		= 4000,
	.opts.target.uses_mmap	= true,
	.write_mode		= WRITE_FORCE,
	.file_new		= true,
};
974

975 976 977 978 979 980 981 982
/*
 * Help text for the -g/--call-graph option.  The list of modes appended to
 * the common prefix depends on whether LIBUNWIND_SUPPORT is defined.
 */
#define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: "

#ifdef LIBUNWIND_SUPPORT
static const char callchain_help[] = CALLCHAIN_HELP "[fp] dwarf";
#else
static const char callchain_help[] = CALLCHAIN_HELP "[fp]";
#endif

983 984 985 986 987 988 989
/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use perf_record_opts,
 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
 * using pipes, etc.
 */
990
const struct option record_options[] = {
991
	OPT_CALLBACK('e', "event", &record.evlist, "event",
992
		     "event selector. use 'perf list' to list available events",
993
		     parse_events_option),
994
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
L
Li Zefan 已提交
995
		     "event filter", parse_filter),
996
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
997
		    "record events on existing process id"),
998
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
999
		    "record events on existing thread id"),
1000
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
1001
		    "collect data with this RT SCHED_FIFO priority"),
1002
	OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
1003
		    "collect data without buffering"),
1004
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
1005
		    "collect raw sample records from all opened counters"),
1006
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
1007
			    "system-wide collection from all CPUs"),
1008
	OPT_BOOLEAN('A', "append", &record.append_file,
I
Ingo Molnar 已提交
1009
			    "append to the output file to do incremental profiling"),
1010
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
1011
		    "list of cpus to monitor"),
1012
	OPT_BOOLEAN('f', "force", &record.force,
1013
			"overwrite existing data file (deprecated)"),
1014 1015
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.output_name, "file",
I
Ingo Molnar 已提交
1016
		    "output file name"),
1017
	OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
1018
		    "child tasks do not inherit counters"),
1019 1020
	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
	OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
1021
		     "number of mmap data pages"),
1022
	OPT_BOOLEAN(0, "group", &record.opts.group,
1023
		    "put the counters into a counter group"),
1024 1025 1026
	OPT_CALLBACK_DEFAULT('g', "call-graph", &record, "mode[,dump_size]",
			     callchain_help, &parse_callchain_opt,
			     "fp"),
1027
	OPT_INCR('v', "verbose", &verbose,
1028
		    "be more verbose (show counter open errors, etc)"),
1029
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
1030
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
1031
		    "per thread counts"),
1032
	OPT_BOOLEAN('d', "data", &record.opts.sample_address,
1033
		    "Sample addresses"),
1034
	OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
1035
	OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
1036
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
1037
		    "don't sample"),
1038
	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
1039
		    "do not update the buildid cache"),
1040
	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
1041
		    "do not collect buildids in perf.data"),
1042
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
S
Stephane Eranian 已提交
1043 1044
		     "monitor event in cgroup name only",
		     parse_cgroups),
1045 1046
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),
1047 1048 1049 1050 1051 1052 1053

	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
1054
		     parse_branch_stack),
1055 1056 1057
	OPT_END()
};

1058
/*
 * Entry point of 'perf record'.
 *
 * Parses the command line into the global 'record', validates the option
 * combination and the target, creates the cpu/thread maps, resolves the
 * sampling period/frequency and finally runs __cmd_record().
 *
 * Returns 0 on success or a negative error code.  Invalid usage ends in
 * usage_with_options().
 */
int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
	int err = -ENOMEM;
	struct perf_evsel *pos;
	struct perf_evlist *evsel_list;
	struct perf_record *rec = &record;
	char errbuf[BUFSIZ];

	evsel_list = perf_evlist__new(NULL, NULL);
	if (evsel_list == NULL)
		return -ENOMEM;

	rec->evlist = evsel_list;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	/* Neither a workload nor an existing target to profile. */
	if (!argc && perf_target__none(&rec->opts.target))
		usage_with_options(record_usage, record_options);

	if (rec->force && rec->append_file) {
		ui__error("Can't overwrite and append at the same time."
			  " You need to choose between -f and -A\n");
		usage_with_options(record_usage, record_options);
	} else if (rec->append_file) {
		rec->write_mode = WRITE_APPEND;
	} else {
		rec->write_mode = WRITE_FORCE;
	}

	if (nr_cgroups && !rec->opts.target.system_wide) {
		ui__error("cgroup monitoring only available in"
			  " system-wide mode\n");
		usage_with_options(record_usage, record_options);
	}

	symbol__init();

	if (symbol_conf.kptr_restrict)
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (rec->no_buildid_cache || rec->no_buildid)
		disable_buildid_cache();

	/* No -e given: record the default event. */
	if (evsel_list->nr_entries == 0 &&
	    perf_evlist__add_default(evsel_list) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	}

	/* A validation problem is only warned about. */
	err = perf_target__validate(&rec->opts.target);
	if (err) {
		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s", errbuf);
	}

	err = perf_target__parse_uid(&rec->opts.target);
	if (err) {
		/* Save errno before the reporting calls can change it. */
		int saved_errno = errno;

		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out_free_fd;
	}

	err = -ENOMEM;
	if (perf_evlist__create_maps(evsel_list, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_header__push_event(pos->attr.config, perf_evsel__name(pos)))
			goto out_free_fd;
	}

	/* ULLONG_MAX / UINT_MAX mean "not given on the command line". */
	if (rec->opts.user_interval != ULLONG_MAX)
		rec->opts.default_interval = rec->opts.user_interval;
	if (rec->opts.user_freq != UINT_MAX)
		rec->opts.freq = rec->opts.user_freq;

	/*
	 * User specified count overrides default frequency.
	 */
	if (rec->opts.default_interval)
		rec->opts.freq = 0;
	else if (rec->opts.freq) {
		rec->opts.default_interval = rec->opts.freq;
	} else {
		ui__error("frequency and count are zero, aborting\n");
		err = -EINVAL;
		goto out_free_fd;
	}

	err = __cmd_record(rec, argc, argv);
out_free_fd:
	perf_evlist__delete_maps(evsel_list);
out_symbol_exit:
	symbol__exit();
	return err;
}