/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
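
/* Ensure 64-bit file offsets so perf.data can grow past 2 GB on 32-bit hosts. */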
#define _FILE_OFFSET_BITS 64

#include "builtin.h"

#include "perf.h"

#include "util/build-id.h"
#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"

#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/symbol.h"
#include "util/cpumap.h"
#include "util/thread_map.h"

#include <unistd.h>
#include <sched.h>
#include <sys/mman.h>

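/*
 * Fallback for C libraries that lack the GNU on_exit() extension: emulate
 * it on top of atexit() by remembering the registered handlers and wrapping
 * exit() in a macro that captures the status code for them.
 */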
#ifndef HAVE_ON_EXIT
#ifndef ATEXIT_MAX
#define ATEXIT_MAX 32
#endif
static int __on_exit_count = 0;
typedef void (*on_exit_func_t) (int, void *);
static on_exit_func_t __on_exit_funcs[ATEXIT_MAX];
static void *__on_exit_args[ATEXIT_MAX];
static int __exitcode = 0;
static void __handle_on_exit_funcs(void);
static int on_exit(on_exit_func_t function, void *arg);
#define exit(x) (exit)(__exitcode = (x))

static int on_exit(on_exit_func_t function, void *arg)
{
	if (__on_exit_count == ATEXIT_MAX)
		return -ENOMEM;
	else if (__on_exit_count == 0)
		atexit(__handle_on_exit_funcs);
	__on_exit_funcs[__on_exit_count] = function;
	__on_exit_args[__on_exit_count++] = arg;
	return 0;
}

static void __handle_on_exit_funcs(void)
{
	int i;
	for (i = 0; i < __on_exit_count; i++)
		__on_exit_funcs[i] (__exitcode, __on_exit_args[i]);
}
#endif

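/* How to treat an already existing output file: overwrite it or append to it. */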
enum write_mode_t {
	WRITE_FORCE,
	WRITE_APPEND
};

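/*
 * Per-invocation state for 'perf record': the tool callbacks, the parsed
 * recording options, output-file bookkeeping and the event list.
 */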
struct perf_record {
	struct perf_tool	tool;
	struct perf_record_opts	opts;
	u64			bytes_written;
	const char		*output_name;
	struct perf_evlist	*evlist;
	struct perf_session	*session;
	const char		*progname;
	int			output;
	unsigned int		page_size;
	int			realtime_prio;
	enum write_mode_t	write_mode;
	bool			no_buildid;
	bool			no_buildid_cache;
	bool			force;
	bool			file_new;
	bool			append_file;
	long			samples;
	off_t			post_processing_offset;
};

static void advance_output(struct perf_record *rec, size_t size)
{
	rec->bytes_written += size;
}

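/*
 * write(2) may store fewer bytes than requested, so loop until the whole
 * buffer has been flushed to the output file.
 */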
static int write_output(struct perf_record *rec, void *buf, size_t size)
{
	while (size) {
		int ret = write(rec->output, buf, size);

		if (ret < 0) {
			pr_err("failed to write\n");
			return -1;
		}

		size -= ret;
		buf += ret;

		rec->bytes_written += ret;
	}

	return 0;
}

static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	struct perf_record *rec = container_of(tool, struct perf_record, tool);
	if (write_output(rec, event, event->header.size) < 0)
		return -1;

	return 0;
}

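/*
 * Drain whatever the kernel appended to one mmap ring since the last read.
 * The region between 'old' and 'head' may wrap past the end of the ring
 * buffer, in which case it is written out in two chunks.
 */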
static int perf_record__mmap_read(struct perf_record *rec,
				   struct perf_mmap *md)
{
	unsigned int head = perf_mmap__read_head(md);
	unsigned int old = md->prev;
	unsigned char *data = md->base + rec->page_size;
	unsigned long size;
	void *buf;
	int rc = 0;

	if (old == head)
		return 0;

	rec->samples++;

	size = head - old;

	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;

		if (write_output(rec, buf, size) < 0) {
			rc = -1;
			goto out;
		}
	}

	buf = &data[old & md->mask];
	size = head - old;
	old += size;

	if (write_output(rec, buf, size) < 0) {
		rc = -1;
		goto out;
	}

	md->prev = old;
	perf_mmap__write_tail(md, old);

out:
	return rc;
}

static volatile int done = 0;
static volatile int signr = -1;
static volatile int child_finished = 0;

static void sig_handler(int sig)
{
	if (sig == SIGCHLD)
		child_finished = 1;

	done = 1;
	signr = sig;
}

static void perf_record__sig_exit(int exit_status __maybe_unused, void *arg)
{
	struct perf_record *rec = arg;
	int status;

	if (rec->evlist->workload.pid > 0) {
		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&status);
		if (WIFSIGNALED(status))
			psignal(WTERMSIG(status), rec->progname);
	}

	if (signr == -1 || signr == SIGUSR1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}

static bool perf_evlist__equal(struct perf_evlist *evlist,
			       struct perf_evlist *other)
{
	struct perf_evsel *pos, *pair;

	if (evlist->nr_entries != other->nr_entries)
		return false;

	pair = perf_evlist__first(other);

	list_for_each_entry(pos, &evlist->entries, node) {
		if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr)) != 0)
			return false;
		pair = perf_evsel__next(pair);
	}

	return true;
}

static int perf_record__open(struct perf_record *rec)
{
	struct perf_evsel *pos;
	struct perf_evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct perf_record_opts *opts = &rec->opts;
	int rc = 0;

	/*
	 * Set the evsel leader links before we configure attributes,
	 * since some might depend on this info.
	 */
	if (opts->group)
		perf_evlist__set_leader(evlist);

	perf_evlist__config_attrs(evlist, opts);

	list_for_each_entry(pos, &evlist->entries, node) {
		struct perf_event_attr *attr = &pos->attr;
		/*
		 * Check if parse_single_tracepoint_event has already asked for
		 * PERF_SAMPLE_TIME.
		 *
		 * XXX this is kludgy but a short-term fix for problems introduced by
		 * eac23d1c, which broke 'perf script' by producing different
		 * sample_types when multiple tracepoint events are used with a perf
		 * binary that tries to use sample_id_all on an older kernel.
		 *
		 * We need to move counter creation to perf_session, support
		 * different sample_types, etc.
		 */
		bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;

fallback_missing_features:
		if (opts->exclude_guest_missing)
			attr->exclude_guest = attr->exclude_host = 0;
retry_sample_id:
		attr->sample_id_all = opts->sample_id_all_missing ? 0 : 1;
try_again:
		if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) {
			int err = errno;

			if (err == EPERM || err == EACCES) {
				ui__error_paranoid();
				rc = -err;
				goto out;
			} else if (err ==  ENODEV && opts->target.cpu_list) {
				pr_err("No such device - did you specify"
				       " an out-of-range profile CPU?\n");
				rc = -err;
				goto out;
			} else if (err == EINVAL) {
				if (!opts->exclude_guest_missing &&
				    (attr->exclude_guest || attr->exclude_host)) {
					pr_debug("Old kernel, cannot exclude "
						 "guest or host samples.\n");
					opts->exclude_guest_missing = true;
					goto fallback_missing_features;
				} else if (!opts->sample_id_all_missing) {
					/*
					 * Old kernel, no attr->sample_id_all field
					 */
					opts->sample_id_all_missing = true;
					if (!opts->sample_time && !opts->raw_samples && !time_needed)
						attr->sample_type &= ~PERF_SAMPLE_TIME;

					goto retry_sample_id;
				}
			}

			/*
			 * If it's cycles, then fall back to the hrtimer-based
			 * cpu-clock-tick sw counter, which is always available
			 * even when there is no PMU support.
			 *
			 * PPC returns ENXIO until 2.6.37 (behavior changed
			 * with commit b0a873e).
			 */
			if ((err == ENOENT || err == ENXIO)
					&& attr->type == PERF_TYPE_HARDWARE
					&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {

				if (verbose)
					ui__warning("The cycles event is not supported, "
						    "trying to fall back to cpu-clock-ticks\n");
				attr->type = PERF_TYPE_SOFTWARE;
				attr->config = PERF_COUNT_SW_CPU_CLOCK;
				if (pos->name) {
					free(pos->name);
					pos->name = NULL;
				}
				goto try_again;
			}

			if (err == ENOENT) {
				ui__error("The %s event is not supported.\n",
					  perf_evsel__name(pos));
				rc = -err;
				goto out;
			} else if ((err == EOPNOTSUPP) && (attr->precise_ip)) {
				ui__error("\'precise\' request may not be supported. "
					  "Try removing 'p' modifier\n");
				rc = -err;
				goto out;
			}

			printf("\n");
			error("sys_perf_event_open() syscall returned with %d "
			      "(%s) for event %s. /bin/dmesg may provide "
			      "additional information.\n",
			      err, strerror(err), perf_evsel__name(pos));

#if defined(__i386__) || defined(__x86_64__)
			if (attr->type == PERF_TYPE_HARDWARE &&
			    err == EOPNOTSUPP) {
				pr_err("No hardware sampling interrupt available."
				       " No APIC? If so then you can boot the kernel"
				       " with the \"lapic\" boot parameter to"
				       " force-enable it.\n");
				rc = -err;
				goto out;
			}
#endif

			pr_err("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
			rc = -err;
			goto out;
		}
	}

	if (perf_evlist__apply_filters(evlist)) {
		error("failed to set filter with %d (%s)\n", errno,
			strerror(errno));
		rc = -1;
		goto out;
	}

	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %d)\n", opts->mmap_pages);
			rc = -errno;
		} else if (!is_power_of_2(opts->mmap_pages) &&
			   (opts->mmap_pages != UINT_MAX)) {
			pr_err("--mmap_pages/-m value must be a power of two.\n");
			rc = -EINVAL;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno, strerror(errno));
			rc = -errno;
		}
		goto out;
	}

	if (rec->file_new)
		session->evlist = evlist;
	else {
		if (!perf_evlist__equal(session->evlist, evlist)) {
			fprintf(stderr, "incompatible append\n");
			rc = -1;
			goto out;
		}
	}

	perf_session__set_id_hdr_size(session);
out:
	return rc;
}

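/*
 * Re-read the events just recorded so that DSOs with samples in them get
 * marked, letting the header carry build-ids only for binaries actually hit.
 */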
static int process_buildids(struct perf_record *rec)
{
	u64 size = lseek(rec->output, 0, SEEK_CUR);

	if (size == 0)
		return 0;

	rec->session->fd = rec->output;
	return __perf_session__process_events(rec->session, rec->post_processing_offset,
					      size - rec->post_processing_offset,
					      size, &build_id__mark_dso_hit_ops);
}

static void perf_record__exit(int status, void *arg)
{
	struct perf_record *rec = arg;

	if (status != 0)
		return;

	if (!rec->opts.pipe_output) {
		rec->session->header.data_size += rec->bytes_written;

		if (!rec->no_buildid)
			process_buildids(rec);
		perf_session__write_header(rec->session, rec->evlist,
					   rec->output, true);
		perf_session__delete(rec->session);
		perf_evlist__delete(rec->evlist);
		symbol__exit();
	}
}

static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;

	if (machine__is_host(machine))
		return;

	/*
	 * As for the guest kernel, when processing the record & report
	 * subcommands we arrange the module mmap events prior to the guest
	 * kernel mmap and trigger a DSO preload, because by default guest
	 * module symbols are loaded from guest kallsyms instead of
	 * /lib/modules/XXX/XXX.  This avoids missing symbols when the first
	 * address is in a module instead of in the guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * Try _text first and fall back to _stext, because the guest kernel's
	 * /proc/kallsyms sometimes has no _text symbol.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}

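/*
 * PERF_RECORD_FINISHED_ROUND marks points where every mmap ring has been
 * drained once, giving the report side safe spots at which to sort and
 * flush its queue of buffered events.
 */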
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

static int perf_record__mmap_read_all(struct perf_record *rec)
{
	int i;
	int rc = 0;

	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
		if (rec->evlist->mmap[i].base) {
			if (perf_record__mmap_read(rec, &rec->evlist->mmap[i]) != 0) {
				rc = -1;
				goto out;
			}
		}
	}

	if (perf_header__has_feat(&rec->session->header, HEADER_TRACING_DATA))
		rc = write_output(rec, &finished_round_event,
				  sizeof(finished_round_event));

out:
	return rc;
}

static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
{
	struct stat st;
	int flags;
	int err, output, feat;
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct machine *machine;
	struct perf_tool *tool = &rec->tool;
	struct perf_record_opts *opts = &rec->opts;
	struct perf_evlist *evsel_list = rec->evlist;
	const char *output_name = rec->output_name;
	struct perf_session *session;

	rec->progname = argv[0];

	rec->page_size = sysconf(_SC_PAGE_SIZE);

	on_exit(perf_record__sig_exit, rec);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGUSR1, sig_handler);

	if (!output_name) {
		if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
			opts->pipe_output = true;
		else
			rec->output_name = output_name = "perf.data";
	}
	if (output_name) {
		if (!strcmp(output_name, "-"))
			opts->pipe_output = true;
		else if (!stat(output_name, &st) && st.st_size) {
			if (rec->write_mode == WRITE_FORCE) {
				char oldname[PATH_MAX];
				snprintf(oldname, sizeof(oldname), "%s.old",
					 output_name);
				unlink(oldname);
				rename(output_name, oldname);
			}
		} else if (rec->write_mode == WRITE_APPEND) {
			rec->write_mode = WRITE_FORCE;
		}
	}

	flags = O_CREAT|O_RDWR;
	if (rec->write_mode == WRITE_APPEND)
		rec->file_new = 0;
	else
		flags |= O_TRUNC;

	if (opts->pipe_output)
		output = STDOUT_FILENO;
	else
		output = open(output_name, flags, S_IRUSR | S_IWUSR);
	if (output < 0) {
		perror("failed to create output file");
		return -1;
	}

	rec->output = output;

	session = perf_session__new(output_name, O_WRONLY,
				    rec->write_mode == WRITE_FORCE, false, NULL);
	if (session == NULL) {
		pr_err("Not enough memory for reading perf file header\n");
		return -1;
	}

	rec->session = session;

	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&evsel_list->entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->file_new) {
		err = perf_session__read_header(session, output);
		if (err < 0)
			goto out_delete_session;
	}

	if (forks) {
		err = perf_evlist__prepare_workload(evsel_list, opts, argv);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			goto out_delete_session;
		}
	}

	if (perf_record__open(rec) != 0) {
		err = -1;
		goto out_delete_session;
	}

	/*
	 * perf_session__delete(session) will be called at perf_record__exit()
	 */
	on_exit(perf_record__exit, rec);

	if (opts->pipe_output) {
		err = perf_header__write_pipe(output);
		if (err < 0)
			goto out_delete_session;
	} else if (rec->file_new) {
		err = perf_session__write_header(session, evsel_list,
						 output, false);
		if (err < 0)
			goto out_delete_session;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		err = -1;
		goto out_delete_session;
	}

	rec->post_processing_offset = lseek(output, 0, SEEK_CUR);

	machine = perf_session__find_host_machine(session);
	if (!machine) {
		pr_err("Couldn't find native kernel information.\n");
		err = -1;
		goto out_delete_session;
	}

	if (opts->pipe_output) {
		err = perf_event__synthesize_attrs(tool, session,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out_delete_session;
		}

		err = perf_event__synthesize_event_types(tool, process_synthesized_event,
							 machine);
		if (err < 0) {
			pr_err("Couldn't synthesize event_types.\n");
			goto out_delete_session;
		}

		if (have_tracepoints(&evsel_list->entries)) {
			/*
			 * FIXME: err <= 0 here actually means that there were
			 * no tracepoints, so it's not really an error, just
			 * that we don't need to synthesize anything.  We
			 * really have to return this more properly and also
			 * propagate the errors that now call die().
			 */
			err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out_delete_session;
			}
			advance_output(rec, err);
		}
	}

	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");

	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

	if (perf_guest)
		perf_session__process_machines(session, tool,
					       perf_event__synthesize_guest_os);

	if (!opts->target.system_wide)
		err = perf_event__synthesize_thread_map(tool, evsel_list->threads,
						  process_synthesized_event,
						  machine);
	else
		err = perf_event__synthesize_threads(tool, process_synthesized_event,
					       machine);

	if (err != 0)
		goto out_delete_session;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_delete_session;
		}
	}

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!perf_target__none(&opts->target))
		perf_evlist__enable(evsel_list);

	/*
	 * Let the child rip
	 */
	if (forks)
		perf_evlist__start_workload(evsel_list);

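	/*
	 * Main capture loop: drain all the mmap rings, then block in poll()
	 * until the kernel has new data; a caught signal sets 'done' and
	 * terminates the loop.
	 */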
	for (;;) {
		int hits = rec->samples;

		if (perf_record__mmap_read_all(rec) < 0) {
			err = -1;
			goto out_delete_session;
		}

		if (hits == rec->samples) {
			if (done)
				break;
			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
			waking++;
		}

		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
		if (done && !perf_target__none(&opts->target))
			perf_evlist__disable(evsel_list);
	}

	if (quiet || signr == SIGUSR1)
		return 0;

	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
		"[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
		(double)rec->bytes_written / 1024.0 / 1024.0,
		output_name,
		rec->bytes_written / 24);

	return 0;

out_delete_session:
	perf_session__delete(session);
	return err;
}

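/*
 * Map -b/-j (--branch-filter) tokens onto PERF_SAMPLE_BRANCH_* bits.
 * Tokens are comma-separated and case-insensitive, e.g.
 * 'perf record -j any_call,u' samples user-level call branches.
 */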
#define BRANCH_OPT(n, m) \
	{ .name = n, .mode = (m) }

#define BRANCH_END { .name = NULL }

struct branch_mode {
	const char *name;
	int mode;
};

static const struct branch_mode branch_modes[] = {
	BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
	BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
	BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
	BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
	BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
	BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
	BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
	BRANCH_END
};

static int
parse_branch_stack(const struct option *opt, const char *str, int unset)
{
#define ONLY_PLM \
	(PERF_SAMPLE_BRANCH_USER	|\
	 PERF_SAMPLE_BRANCH_KERNEL	|\
	 PERF_SAMPLE_BRANCH_HV)

	uint64_t *mode = (uint64_t *)opt->value;
	const struct branch_mode *br;
	char *s, *os = NULL, *p;
	int ret = -1;

	if (unset)
		return 0;

	/*
	 * cannot set it twice, -b + --branch-filter for instance
	 */
	if (*mode)
		return -1;

	/* str may be NULL in case no arg is passed to -b */
	if (str) {
		/* because str is read-only */
		s = os = strdup(str);
		if (!s)
			return -1;

		for (;;) {
			p = strchr(s, ',');
			if (p)
				*p = '\0';

			for (br = branch_modes; br->name; br++) {
				if (!strcasecmp(s, br->name))
					break;
			}
			if (!br->name) {
				ui__warning("unknown branch filter %s,"
					    " check man page\n", s);
				goto error;
			}

			*mode |= br->mode;

			if (!p)
				break;

			s = p + 1;
		}
	}
	ret = 0;

	/* default to any branch */
	if ((*mode & ~ONLY_PLM) == 0) {
		*mode = PERF_SAMPLE_BRANCH_ANY;
	}
error:
	free(os);
	return ret;
}

#ifdef LIBUNWIND_SUPPORT
static int get_stack_size(char *str, unsigned long *_size)
{
	char *endptr;
	unsigned long size;
	unsigned long max_size = round_down(USHRT_MAX, sizeof(u64));

	size = strtoul(str, &endptr, 0);

	do {
		if (*endptr)
			break;

		size = round_up(size, sizeof(u64));
		if (!size || size > max_size)
			break;

		*_size = size;
		return 0;

	} while (0);

	pr_err("callchain: Incorrect stack dump size (max %ld): %s\n",
	       max_size, str);
	return -1;
}
#endif /* LIBUNWIND_SUPPORT */

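/*
 * -g/--call-graph takes "fp" (frame-pointer unwinding, the default) or,
 * when built with libunwind, "dwarf[,dump_size]", which records a user
 * stack dump for DWARF-based unwinding, e.g. 'perf record -g dwarf,8192'.
 */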
static int
parse_callchain_opt(const struct option *opt __maybe_unused, const char *arg,
		    int unset)
{
	struct perf_record *rec = (struct perf_record *)opt->value;
	char *tok, *name, *saveptr = NULL;
	char *buf;
	int ret = -1;

	/* --no-call-graph */
	if (unset)
		return 0;

	/* We specified default option if none is provided. */
	BUG_ON(!arg);

	/* We need buffer that we know we can write to. */
	buf = malloc(strlen(arg) + 1);
	if (!buf)
		return -ENOMEM;

	strcpy(buf, arg);

	tok = strtok_r((char *)buf, ",", &saveptr);
	name = tok ? : (char *)buf;

	do {
		/* Framepointer style */
		if (!strncmp(name, "fp", sizeof("fp"))) {
			if (!strtok_r(NULL, ",", &saveptr)) {
				rec->opts.call_graph = CALLCHAIN_FP;
				ret = 0;
			} else
				pr_err("callchain: No more arguments "
				       "needed for -g fp\n");
			break;

#ifdef LIBUNWIND_SUPPORT
		/* Dwarf style */
		} else if (!strncmp(name, "dwarf", sizeof("dwarf"))) {
			const unsigned long default_stack_dump_size = 8192;

			ret = 0;
			rec->opts.call_graph = CALLCHAIN_DWARF;
			rec->opts.stack_dump_size = default_stack_dump_size;

			tok = strtok_r(NULL, ",", &saveptr);
			if (tok) {
				unsigned long size = 0;

				ret = get_stack_size(tok, &size);
				rec->opts.stack_dump_size = size;
			}

			if (!ret)
				pr_debug("callchain: stack dump size %d\n",
					 rec->opts.stack_dump_size);
#endif /* LIBUNWIND_SUPPORT */
		} else {
			pr_err("callchain: Unknown -g option "
			       "value: %s\n", arg);
			break;
		}

	} while (0);

	free(buf);

	if (!ret)
		pr_debug("callchain: type %d\n", rec->opts.call_graph);

	return ret;
}

static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};

/*
 * XXX Ideally this would be local to cmd_record() and passed to a
 * perf_record__new(), because we need access to it in perf_record__exit(),
 * which is called after cmd_record() exits; but since record_options needs
 * to be accessible to builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct perf_record record = {
	.opts = {
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
		},
	},
	.write_mode = WRITE_FORCE,
	.file_new   = true,
};

#define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: "

#ifdef LIBUNWIND_SUPPORT
static const char callchain_help[] = CALLCHAIN_HELP "[fp] dwarf";
#else
static const char callchain_help[] = CALLCHAIN_HELP "[fp]";
#endif

/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use perf_record_opts,
 * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
 * using pipes, etc.
 */
const struct option record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
			    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('A', "append", &record.append_file,
			    "append to the output file to do incremental profiling"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN('f', "force", &record.force,
			"overwrite existing data file (deprecated)"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.output_name, "file",
		    "output file name"),
	OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
	OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
		     "number of mmap data pages"),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_CALLBACK_DEFAULT('g', "call-graph", &record, "mode[,dump_size]",
			     callchain_help, &parse_callchain_opt,
			     "fp"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address,
		    "Sample addresses"),
	OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
	OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
		    "do not update the buildid cache"),
	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
		    "do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),

	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_END()
};

int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
	int err = -ENOMEM;
	struct perf_evsel *pos;
	struct perf_evlist *evsel_list;
	struct perf_record *rec = &record;
	char errbuf[BUFSIZ];

	evsel_list = perf_evlist__new(NULL, NULL);
	if (evsel_list == NULL)
		return -ENOMEM;

	rec->evlist = evsel_list;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc && perf_target__none(&rec->opts.target))
		usage_with_options(record_usage, record_options);

	if (rec->force && rec->append_file) {
		ui__error("Can't overwrite and append at the same time."
			  " You need to choose between -f and -A");
		usage_with_options(record_usage, record_options);
	} else if (rec->append_file) {
		rec->write_mode = WRITE_APPEND;
	} else {
		rec->write_mode = WRITE_FORCE;
	}

	if (nr_cgroups && !rec->opts.target.system_wide) {
		ui__error("cgroup monitoring only available in"
			  " system-wide mode\n");
		usage_with_options(record_usage, record_options);
	}

	symbol__init();

	if (symbol_conf.kptr_restrict)
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (rec->no_buildid_cache || rec->no_buildid)
		disable_buildid_cache();

	if (evsel_list->nr_entries == 0 &&
	    perf_evlist__add_default(evsel_list) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	}

	err = perf_target__validate(&rec->opts.target);
	if (err) {
		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s", errbuf);
	}

	err = perf_target__parse_uid(&rec->opts.target);
	if (err) {
		int saved_errno = errno;

		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out_free_fd;
	}

	err = -ENOMEM;
	if (perf_evlist__create_maps(evsel_list, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_header__push_event(pos->attr.config, perf_evsel__name(pos)))
			goto out_free_fd;
	}

	if (rec->opts.user_interval != ULLONG_MAX)
		rec->opts.default_interval = rec->opts.user_interval;
	if (rec->opts.user_freq != UINT_MAX)
		rec->opts.freq = rec->opts.user_freq;

	/*
	 * User specified count overrides default frequency.
	 */
	if (rec->opts.default_interval)
		rec->opts.freq = 0;
	else if (rec->opts.freq) {
		rec->opts.default_interval = rec->opts.freq;
	} else {
		ui__error("frequency and count are zero, aborting\n");
		err = -EINVAL;
		goto out_free_fd;
	}

	err = __cmd_record(&record, argc, argv);
out_free_fd:
	perf_evlist__delete_maps(evsel_list);
out_symbol_exit:
	symbol__exit();
	return err;
}