builtin-record.c 29.4 KB
Newer Older
I
Ingo Molnar 已提交
1
/*
2 3 4 5 6
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
I
Ingo Molnar 已提交
7
 */
8 9
#define _FILE_OFFSET_BITS 64

10
#include "builtin.h"
11 12 13

#include "perf.h"

14
#include "util/build-id.h"
15
#include "util/util.h"
16
#include "util/parse-options.h"
17
#include "util/parse-events.h"
18

19
#include "util/header.h"
20
#include "util/event.h"
21
#include "util/evlist.h"
22
#include "util/evsel.h"
23
#include "util/debug.h"
24
#include "util/session.h"
25
#include "util/tool.h"
26
#include "util/symbol.h"
27
#include "util/cpumap.h"
28
#include "util/thread_map.h"
29

30
#include <unistd.h>
31
#include <sched.h>
32
#include <sys/mman.h>
33

34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
#ifndef HAVE_ON_EXIT
/*
 * Fallback implementation of on_exit() for C libraries that lack it.
 * Handlers are dispatched from a single atexit() hook and receive the
 * exit status captured by the exit() wrapper macro below.
 */
#ifndef ATEXIT_MAX
#define ATEXIT_MAX 32
#endif
static int __on_exit_count = 0;			/* number of registered handlers */
typedef void (*on_exit_func_t) (int, void *);
static on_exit_func_t __on_exit_funcs[ATEXIT_MAX];
static void *__on_exit_args[ATEXIT_MAX];	/* per-handler opaque argument */
static int __exitcode = 0;			/* status captured by the exit() macro */
static void __handle_on_exit_funcs(void);
static int on_exit(on_exit_func_t function, void *arg);
/* Record the exit status for the handlers, then call the real exit(). */
#define exit(x) (exit)(__exitcode = (x))

/*
 * Register 'function' to be run at process exit with 'arg'.
 * Returns 0 on success, -ENOMEM when the handler table is full.
 */
static int on_exit(on_exit_func_t function, void *arg)
{
	if (__on_exit_count == ATEXIT_MAX)
		return -ENOMEM;
	else if (__on_exit_count == 0)
		atexit(__handle_on_exit_funcs);	/* install dispatcher on first use */
	__on_exit_funcs[__on_exit_count] = function;
	__on_exit_args[__on_exit_count++] = arg;
	return 0;
}

/* atexit() hook: run the handlers in registration order with the saved status. */
static void __handle_on_exit_funcs(void)
{
	int i;
	for (i = 0; i < __on_exit_count; i++)
		__on_exit_funcs[i] (__exitcode, __on_exit_args[i]);
}
#endif

66 67 68 69 70
/* How to treat an existing perf.data output file: overwrite or append. */
enum write_mode_t {
	WRITE_FORCE,
	WRITE_APPEND
};

71
/*
 * All state for one 'perf record' run: the tool callbacks handed to the
 * session/synthesis code, the user options, and the bookkeeping needed to
 * write the perf.data file.
 */
struct perf_record {
	struct perf_tool	tool;		/* callbacks; container_of()'d back to this struct */
	struct perf_record_opts	opts;		/* options parsed from the command line */
	u64			bytes_written;	/* event payload bytes written so far */
	const char		*output_name;	/* output path; NULL until resolved in __cmd_record() */
	struct perf_evlist	*evlist;	/* events being recorded */
	struct perf_session	*session;	/* session backing the output file */
	const char		*progname;	/* argv[0], for psignal() in the sig-exit handler */
	int			output;		/* output file descriptor */
	unsigned int		page_size;	/* system page size; mmap data starts one page in */
	int			realtime_prio;	/* nonzero: SCHED_FIFO priority to request */
	enum write_mode_t	write_mode;	/* overwrite vs append to an existing file */
	bool			no_buildid;	/* -B: don't collect build-ids in perf.data */
	bool			no_buildid_cache; /* -N: don't update the build-id cache */
	bool			force;		/* -f: overwrite existing data file */
	bool			file_new;	/* false when appending to an existing file */
	bool			append_file;	/* -A: incremental profiling */
	long			samples;	/* mmap reads that found data (progress metric) */
	off_t			post_processing_offset;	/* file offset where event data begins */
};
91

92
/*
 * Account for 'size' bytes that were already written to the output file
 * by a helper that bypassed write_output().
 */
static void advance_output(struct perf_record *rec, size_t size)
{
	rec->bytes_written = rec->bytes_written + size;
}

97
static int write_output(struct perf_record *rec, void *buf, size_t size)
98 99
{
	while (size) {
100
		int ret = write(rec->output, buf, size);
101

102 103 104 105
		if (ret < 0) {
			pr_err("failed to write\n");
			return -1;
		}
106 107 108 109

		size -= ret;
		buf += ret;

110
		rec->bytes_written += ret;
111
	}
112 113

	return 0;
114 115
}

116
/*
 * perf_tool callback: dump a synthesized event straight into the
 * output file. Returns 0 on success, -1 if the write failed.
 */
static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	struct perf_record *rec = container_of(tool, struct perf_record, tool);

	return write_output(rec, event, event->header.size) < 0 ? -1 : 0;
}

128
/*
 * Drain one event ring buffer into the output file.
 *
 * The kernel writes events into a circular buffer of md->mask + 1 bytes
 * starting one page past md->base (the first page is the control page
 * holding the head/tail pointers). When the unread region wraps around
 * the end of the buffer it is written out in two chunks.
 *
 * Returns 0 on success (including "nothing to read"), -1 on write error.
 * NOTE(review): on a write error md->prev and the tail pointer are not
 * advanced, so the same data would be retried on the next call.
 */
static int perf_record__mmap_read(struct perf_record *rec,
				   struct perf_mmap *md)
{
	unsigned int head = perf_mmap__read_head(md);
	unsigned int old = md->prev;	/* where we stopped reading last time */
	unsigned char *data = md->base + rec->page_size;
	unsigned long size;
	void *buf;
	int rc = 0;

	if (old == head)	/* nothing new */
		return 0;

	rec->samples++;

	size = head - old;

	/* Unread region wraps past the end of the buffer: flush the tail part first. */
	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;

		if (write_output(rec, buf, size) < 0) {
			rc = -1;
			goto out;
		}
	}

	/* Flush the (remaining) contiguous region up to 'head'. */
	buf = &data[old & md->mask];
	size = head - old;
	old += size;

	if (write_output(rec, buf, size) < 0) {
		rc = -1;
		goto out;
	}

	md->prev = old;
	perf_mmap__write_tail(md, old);	/* tell the kernel the space is free again */

out:
	return rc;
}

/* Flags shared between the signal handler and the main record loop. */
static volatile int done;		/* a terminating signal arrived */
static volatile int signr = -1;		/* which signal it was, -1 if none yet */
static volatile int child_finished;	/* SIGCHLD: the workload has exited */

/*
 * Handler for SIGCHLD/SIGINT/SIGUSR1: remember the signal and ask the
 * main loop to wind down; additionally note workload completion on SIGCHLD.
 */
static void sig_handler(int sig)
{
	signr = sig;
	done = 1;

	if (sig == SIGCHLD)
		child_finished = 1;
}

185
/*
 * on_exit() handler: reap (and if needed terminate) the forked workload,
 * then, when we are dying because of a signal, re-raise it with the
 * default disposition so the shell sees the true cause of death.
 */
static void perf_record__sig_exit(int exit_status __maybe_unused, void *arg)
{
	struct perf_record *rec = arg;
	int status;

	if (rec->evlist->workload.pid > 0) {
		if (!child_finished)	/* workload still running: ask it to stop */
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&status);
		if (WIFSIGNALED(status))
			psignal(WTERMSIG(status), rec->progname);
	}

	/* Normal exit, or SIGUSR1 ("stop but keep data"): nothing to re-raise. */
	if (signr == -1 || signr == SIGUSR1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}

206 207 208 209 210 211 212 213
static bool perf_evlist__equal(struct perf_evlist *evlist,
			       struct perf_evlist *other)
{
	struct perf_evsel *pos, *pair;

	if (evlist->nr_entries != other->nr_entries)
		return false;

214
	pair = perf_evlist__first(other);
215 216 217 218

	list_for_each_entry(pos, &evlist->entries, node) {
		if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr) != 0))
			return false;
219
		pair = perf_evsel__next(pair);
220 221 222 223 224
	}

	return true;
}

225
/*
 * Open a counter for every event in the evlist, mmap the ring buffers and
 * attach the evlist to the session.
 *
 * Contains a ladder of fallbacks for older kernels, driven by goto labels:
 *   - fallback_missing_features: retry without exclude_guest/exclude_host
 *   - retry_sample_id:           retry without sample_id_all
 *   - try_again:                 retry cycles as the cpu-clock sw event
 *
 * Returns 0 on success, a negative errno-style value on failure.
 */
static int perf_record__open(struct perf_record *rec)
{
	struct perf_evsel *pos;
	struct perf_evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct perf_record_opts *opts = &rec->opts;
	int rc = 0;

	/*
	 * Set the evsel leader links before we configure attributes,
	 * since some might depend on this info.
	 */
	if (opts->group)
		perf_evlist__set_leader(evlist);

	perf_evlist__config_attrs(evlist, opts);

	list_for_each_entry(pos, &evlist->entries, node) {
		struct perf_event_attr *attr = &pos->attr;
		/*
		 * Check if parse_single_tracepoint_event has already asked for
		 * PERF_SAMPLE_TIME.
		 *
		 * XXX this is kludgy but short term fix for problems introduced by
		 * eac23d1c that broke 'perf script' by having different sample_types
		 * when using multiple tracepoint events when we use a perf binary
		 * that tries to use sample_id_all on an older kernel.
		 *
		 * We need to move counter creation to perf_session, support
		 * different sample_types, etc.
		 */
		bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;

fallback_missing_features:
		if (opts->exclude_guest_missing)
			attr->exclude_guest = attr->exclude_host = 0;
retry_sample_id:
		attr->sample_id_all = opts->sample_id_all_missing ? 0 : 1;
try_again:
		if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) {
			int err = errno;

			if (err == EPERM || err == EACCES) {
				/* perf_event_paranoid forbids this; explain how to relax it */
				ui__error_paranoid();
				rc = -err;
				goto out;
			} else if (err ==  ENODEV && opts->target.cpu_list) {
				pr_err("No such device - did you specify"
				       " an out-of-range profile CPU?\n");
				rc = -err;
				goto out;
			} else if (err == EINVAL) {
				if (!opts->exclude_guest_missing &&
				    (attr->exclude_guest || attr->exclude_host)) {
					pr_debug("Old kernel, cannot exclude "
						 "guest or host samples.\n");
					opts->exclude_guest_missing = true;
					goto fallback_missing_features;
				} else if (!opts->sample_id_all_missing) {
					/*
					 * Old kernel, no attr->sample_id_type_all field
					 */
					opts->sample_id_all_missing = true;
					if (!opts->sample_time && !opts->raw_samples && !time_needed)
						attr->sample_type &= ~PERF_SAMPLE_TIME;

					goto retry_sample_id;
				}
			}

			/*
			 * If it's cycles then fall back to hrtimer
			 * based cpu-clock-tick sw counter, which
			 * is always available even if no PMU support.
			 *
			 * PPC returns ENXIO until 2.6.37 (behavior changed
			 * with commit b0a873e).
			 */
			if ((err == ENOENT || err == ENXIO)
					&& attr->type == PERF_TYPE_HARDWARE
					&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {

				if (verbose)
					ui__warning("The cycles event is not supported, "
						    "trying to fall back to cpu-clock-ticks\n");
				attr->type = PERF_TYPE_SOFTWARE;
				attr->config = PERF_COUNT_SW_CPU_CLOCK;
				/* drop the cached name so it is regenerated for the new event */
				if (pos->name) {
					free(pos->name);
					pos->name = NULL;
				}
				goto try_again;
			}

			if (err == ENOENT) {
				ui__error("The %s event is not supported.\n",
					  perf_evsel__name(pos));
				rc = -err;
				goto out;
			} else if ((err == EOPNOTSUPP) && (attr->precise_ip)) {
				ui__error("\'precise\' request may not be supported. "
					  "Try removing 'p' modifier\n");
				rc = -err;
				goto out;
			}

			printf("\n");
			error("sys_perf_event_open() syscall returned with %d "
			      "(%s) for event %s. /bin/dmesg may provide "
			      "additional information.\n",
			      err, strerror(err), perf_evsel__name(pos));

#if defined(__i386__) || defined(__x86_64__)
			if (attr->type == PERF_TYPE_HARDWARE &&
			    err == EOPNOTSUPP) {
				pr_err("No hardware sampling interrupt available."
				       " No APIC? If so then you can boot the kernel"
				       " with the \"lapic\" boot parameter to"
				       " force-enable it.\n");
				rc = -err;
				goto out;
			}
#endif

			pr_err("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
			rc = -err;
			goto out;
		}
	}

	if (perf_evlist__apply_filters(evlist)) {
		error("failed to set filter with %d (%s)\n", errno,
			strerror(errno));
		rc = -1;
		goto out;
	}

	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %d)\n", opts->mmap_pages);
			rc = -errno;
		} else if (!is_power_of_2(opts->mmap_pages) &&
			   (opts->mmap_pages != UINT_MAX)) {
			pr_err("--mmap_pages/-m value must be a power of two.");
			rc = -EINVAL;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno, strerror(errno));
			rc = -errno;
		}
		goto out;
	}

	/* When appending, the on-disk evlist must match what we are recording. */
	if (rec->file_new)
		session->evlist = evlist;
	else {
		if (!perf_evlist__equal(session->evlist, evlist)) {
			fprintf(stderr, "incompatible append\n");
			rc = -1;
			goto out;
		}
	}

	perf_session__set_id_hdr_size(session);
out:
	return rc;
}

396
static int process_buildids(struct perf_record *rec)
397
{
398
	u64 size = lseek(rec->output, 0, SEEK_CUR);
399

400 401 402
	if (size == 0)
		return 0;

403 404 405
	rec->session->fd = rec->output;
	return __perf_session__process_events(rec->session, rec->post_processing_offset,
					      size - rec->post_processing_offset,
406 407 408
					      size, &build_id__mark_dso_hit_ops);
}

409
/*
 * on_exit() handler installed once recording is set up: on a successful
 * exit, finalize the perf.data file (account the written bytes, process
 * build-ids, rewrite the header) and release the session/evlist.
 * With --pipe output there is no header to patch, so nothing to do.
 */
static void perf_record__exit(int status, void *arg)
{
	struct perf_record *rec = arg;

	if (status != 0)	/* failed run: leave the file as-is */
		return;

	if (!rec->opts.pipe_output) {
		rec->session->header.data_size += rec->bytes_written;

		if (!rec->no_buildid)
			process_buildids(rec);
		/* rewrite the header now that sizes and build-ids are known */
		perf_session__write_header(rec->session, rec->evlist,
					   rec->output, true);
		perf_session__delete(rec->session);
		perf_evlist__delete(rec->evlist);
		symbol__exit();
	}
}

429
/*
 * Per-machine callback: synthesize module and kernel mmap events for a
 * guest machine so its samples can be resolved later; host machines are
 * handled elsewhere and skipped here.
 */
static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;

	if (machine__is_host(machine))
		return;

	/*
	 * As for guest kernel when processing subcommand record&report,
	 * we arrange module mmap prior to guest kernel mmap and trigger
	 * a preload dso because default guest module symbols are loaded
	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
	 * method is used to avoid symbol missing when the first addr is
	 * in module instead of in guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
	 * have no _text sometimes.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}

465 466 467 468 469
/*
 * Marker event written after draining all ring buffers, telling 'perf
 * report' that events up to this point can safely be reordered/flushed.
 */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

470
static int perf_record__mmap_read_all(struct perf_record *rec)
471
{
472
	int i;
473
	int rc = 0;
474

475
	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
476 477 478 479 480 481
		if (rec->evlist->mmap[i].base) {
			if (perf_record__mmap_read(rec, &rec->evlist->mmap[i]) != 0) {
				rc = -1;
				goto out;
			}
		}
482 483
	}

484
	if (perf_header__has_feat(&rec->session->header, HEADER_TRACING_DATA))
485 486 487 488 489
		rc = write_output(rec, &finished_round_event,
				  sizeof(finished_round_event));

out:
	return rc;
490 491
}

492
/*
 * The actual record run: resolve the output file, create the session,
 * fork the workload (if any), open the counters, synthesize the initial
 * metadata events, then loop draining the ring buffers until a signal or
 * workload exit sets 'done'. Finalization happens in the on_exit handlers.
 *
 * Returns 0 on success, -1 or a negative errno-style value on failure.
 */
static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
{
	struct stat st;
	int flags;
	int err, output, feat;
	unsigned long waking = 0;		/* times poll() woke us, for the final stats line */
	const bool forks = argc > 0;		/* remaining argv is the workload command */
	struct machine *machine;
	struct perf_tool *tool = &rec->tool;
	struct perf_record_opts *opts = &rec->opts;
	struct perf_evlist *evsel_list = rec->evlist;
	const char *output_name = rec->output_name;
	struct perf_session *session;

	rec->progname = argv[0];

	rec->page_size = sysconf(_SC_PAGE_SIZE);

	/* Install cleanup + signal handlers before anything can fail. */
	on_exit(perf_record__sig_exit, rec);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGUSR1, sig_handler);

	/* Resolve the output target: explicit file, "-"/FIFO stdout pipe, or perf.data. */
	if (!output_name) {
		if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
			opts->pipe_output = true;
		else
			rec->output_name = output_name = "perf.data";
	}
	if (output_name) {
		if (!strcmp(output_name, "-"))
			opts->pipe_output = true;
		else if (!stat(output_name, &st) && st.st_size) {
			/* keep the previous data around as <name>.old before overwriting */
			if (rec->write_mode == WRITE_FORCE) {
				char oldname[PATH_MAX];
				snprintf(oldname, sizeof(oldname), "%s.old",
					 output_name);
				unlink(oldname);
				rename(output_name, oldname);
			}
		} else if (rec->write_mode == WRITE_APPEND) {
			/* nothing to append to: fall back to a fresh file */
			rec->write_mode = WRITE_FORCE;
		}
	}

	flags = O_CREAT|O_RDWR;
	if (rec->write_mode == WRITE_APPEND)
		rec->file_new = 0;
	else
		flags |= O_TRUNC;

	if (opts->pipe_output)
		output = STDOUT_FILENO;
	else
		output = open(output_name, flags, S_IRUSR | S_IWUSR);
	if (output < 0) {
		perror("failed to create output file");
		return -1;
	}

	rec->output = output;

	session = perf_session__new(output_name, O_WRONLY,
				    rec->write_mode == WRITE_FORCE, false, NULL);
	if (session == NULL) {
		pr_err("Not enough memory for reading perf file header\n");
		return -1;
	}

	rec->session = session;

	/* Enable all header features, then drop the ones that don't apply. */
	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&evsel_list->entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->file_new) {
		err = perf_session__read_header(session, output);
		if (err < 0)
			goto out_delete_session;
	}

	/* Fork the workload now (stopped) so counters can attach before it runs. */
	if (forks) {
		err = perf_evlist__prepare_workload(evsel_list, opts, argv);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			goto out_delete_session;
		}
	}

	if (perf_record__open(rec) != 0) {
		err = -1;
		goto out_delete_session;
	}

	/*
	 * perf_session__delete(session) will be called at perf_record__exit()
	 */
	on_exit(perf_record__exit, rec);

	if (opts->pipe_output) {
		err = perf_header__write_pipe(output);
		if (err < 0)
			goto out_delete_session;
	} else if (rec->file_new) {
		err = perf_session__write_header(session, evsel_list,
						 output, false);
		if (err < 0)
			goto out_delete_session;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		err = -1;
		goto out_delete_session;
	}

	/* Everything from here on is event data; remembered for process_buildids(). */
	rec->post_processing_offset = lseek(output, 0, SEEK_CUR);

	machine = perf_session__find_host_machine(session);
	if (!machine) {
		pr_err("Couldn't find native kernel information.\n");
		err = -1;
		goto out_delete_session;
	}

	/* Pipe mode has no header, so attrs/event types/tracing data go inline. */
	if (opts->pipe_output) {
		err = perf_event__synthesize_attrs(tool, session,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out_delete_session;
		}

		err = perf_event__synthesize_event_types(tool, process_synthesized_event,
							 machine);
		if (err < 0) {
			pr_err("Couldn't synthesize event_types.\n");
			goto out_delete_session;
		}

		if (have_tracepoints(&evsel_list->entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so its not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
			err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out_delete_session;
			}
			advance_output(rec, err);
		}
	}

	/* Synthesize kernel/module/thread mmap events for symbol resolution. */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");

	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

	if (perf_guest)
		perf_session__process_machines(session, tool,
					       perf_event__synthesize_guest_os);

	if (!opts->target.system_wide)
		err = perf_event__synthesize_thread_map(tool, evsel_list->threads,
						  process_synthesized_event,
						  machine);
	else
		err = perf_event__synthesize_threads(tool, process_synthesized_event,
					       machine);

	if (err != 0)
		goto out_delete_session;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_delete_session;
		}
	}

	perf_evlist__enable(evsel_list);

	/*
	 * Let the child rip
	 */
	if (forks)
		perf_evlist__start_workload(evsel_list);

	/* Main loop: drain buffers; sleep in poll() while no new samples arrive. */
	for (;;) {
		int hits = rec->samples;

		if (perf_record__mmap_read_all(rec) < 0) {
			err = -1;
			goto out_delete_session;
		}

		if (hits == rec->samples) {
			if (done)
				break;
			/* NOTE(review): poll()'s return value is stored but never checked */
			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
			waking++;
		}

		/* keep draining after 'done' so the final events are not lost */
		if (done)
			perf_evlist__disable(evsel_list);
	}

	if (quiet || signr == SIGUSR1)
		return 0;

	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
		"[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
		(double)rec->bytes_written / 1024.0 / 1024.0,
		output_name,
		rec->bytes_written / 24);

	return 0;

out_delete_session:
	perf_session__delete(session);
	return err;
}
751

752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773
/* Table mapping -j/--branch-filter keywords to PERF_SAMPLE_BRANCH_* bits. */
#define BRANCH_OPT(n, m) \
	{ .name = n, .mode = (m) }

#define BRANCH_END { .name = NULL }	/* sentinel terminating branch_modes[] */

struct branch_mode {
	const char *name;
	int mode;
};

static const struct branch_mode branch_modes[] = {
	BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
	BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
	BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
	BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
	BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
	BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
	BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
	BRANCH_END
};

/*
 * Option callback for -b/--branch-any and -j/--branch-filter: parse a
 * comma separated list of branch_modes[] keywords into the branch_stack
 * sample bits at opt->value. When only privilege-level bits (or nothing)
 * were selected, default the type to "any" branch.
 * Returns 0 on success, -1 on an unknown keyword, OOM or a repeated option.
 */
static int
parse_branch_stack(const struct option *opt, const char *str, int unset)
{
#define ONLY_PLM \
	(PERF_SAMPLE_BRANCH_USER	|\
	 PERF_SAMPLE_BRANCH_KERNEL	|\
	 PERF_SAMPLE_BRANCH_HV)

	uint64_t *mode = (uint64_t *)opt->value;
	const struct branch_mode *br;
	char *s, *os = NULL, *p;
	int ret = -1;

	if (unset)
		return 0;

	/*
	 * cannot set it twice, -b + --branch-filter for instance
	 */
	if (*mode)
		return -1;

	/* str may be NULL in case no arg is passed to -b */
	if (str) {
		/* because str is read-only */
		s = os = strdup(str);
		if (!s)
			return -1;

		for (;;) {
			/* split on ',' in place, one keyword per iteration */
			p = strchr(s, ',');
			if (p)
				*p = '\0';

			for (br = branch_modes; br->name; br++) {
				if (!strcasecmp(s, br->name))
					break;
			}
			if (!br->name) {
				ui__warning("unknown branch filter %s,"
					    " check man page\n", s);
				goto error;	/* ret stays -1 */
			}

			*mode |= br->mode;

			if (!p)
				break;

			s = p + 1;
		}
	}
	ret = 0;

	/* default to any branch */
	if ((*mode & ~ONLY_PLM) == 0) {
		*mode = PERF_SAMPLE_BRANCH_ANY;
	}
error:
	free(os);	/* free(NULL) is fine on the no-arg path */
	return ret;
}

836
#ifdef LIBUNWIND_SUPPORT
/*
 * Parse a user supplied stack dump size: round it up to a multiple of
 * sizeof(u64) and reject zero or anything above USHRT_MAX rounded down
 * to the same granularity. On success stores the value in *_size and
 * returns 0; on a bad value prints an error and returns -1.
 */
static int get_stack_size(char *str, unsigned long *_size)
{
	unsigned long max_size = round_down(USHRT_MAX, sizeof(u64));
	unsigned long parsed;
	char *endptr;

	parsed = strtoul(str, &endptr, 0);
	if (*endptr == '\0') {
		parsed = round_up(parsed, sizeof(u64));
		if (parsed && parsed <= max_size) {
			*_size = parsed;
			return 0;
		}
	}

	pr_err("callchain: Incorrect stack dump size (max %ld): %s\n",
	       max_size, str);
	return -1;
}
#endif /* LIBUNWIND_SUPPORT */
863 864

/*
 * Option callback for -g/--call-graph: parse "fp" or (with libunwind)
 * "dwarf[,dump_size]" into rec->opts.call_graph / stack_dump_size.
 * Returns 0 on success, -1 on a bad mode, -ENOMEM on allocation failure.
 *
 * NOTE(review): 'opt' is marked __maybe_unused but opt->value is
 * dereferenced below — the annotation looks stale.
 */
static int
parse_callchain_opt(const struct option *opt __maybe_unused, const char *arg,
		    int unset)
{
	struct perf_record *rec = (struct perf_record *)opt->value;
	char *tok, *name, *saveptr = NULL;
	char *buf;
	int ret = -1;

	/* --no-call-graph */
	if (unset)
		return 0;

	/* We specified default option if none is provided. */
	BUG_ON(!arg);

	/* We need buffer that we know we can write to. */
	buf = malloc(strlen(arg) + 1);
	if (!buf)
		return -ENOMEM;

	strcpy(buf, arg);

	tok = strtok_r((char *)buf, ",", &saveptr);
	name = tok ? : (char *)buf;

	do {
		/* Framepointer style */
		/* sizeof("fp") includes the NUL, so this is an exact-match compare */
		if (!strncmp(name, "fp", sizeof("fp"))) {
			if (!strtok_r(NULL, ",", &saveptr)) {
				rec->opts.call_graph = CALLCHAIN_FP;
				ret = 0;
			} else
				pr_err("callchain: No more arguments "
				       "needed for -g fp\n");
			break;

#ifdef LIBUNWIND_SUPPORT
		/* Dwarf style */
		} else if (!strncmp(name, "dwarf", sizeof("dwarf"))) {
			const unsigned long default_stack_dump_size = 8192;

			ret = 0;
			rec->opts.call_graph = CALLCHAIN_DWARF;
			rec->opts.stack_dump_size = default_stack_dump_size;

			/* optional second token overrides the dump size */
			tok = strtok_r(NULL, ",", &saveptr);
			if (tok) {
				unsigned long size = 0;

				ret = get_stack_size(tok, &size);
				rec->opts.stack_dump_size = size;
			}

			if (!ret)
				pr_debug("callchain: stack dump size %d\n",
					 rec->opts.stack_dump_size);
#endif /* LIBUNWIND_SUPPORT */
		} else {
			pr_err("callchain: Unknown -g option "
			       "value: %s\n", arg);
			break;
		}

	} while (0);

	free(buf);

	if (!ret)
		pr_debug("callchain: type %d\n", rec->opts.call_graph);

	return ret;
}

938
/* Usage lines printed by 'perf record -h' and usage_with_options(). */
static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};

944 945 946 947 948 949 950 951 952 953 954 955 956 957 958
/*
 * XXX Ideally would be local to cmd_record() and passed to a perf_record__new
 * because we need to have access to it in perf_record__exit, that is called
 * after cmd_record() exits, but since record_options need to be accessible to
 * builtin-script, leave it here.
 *
 * At least we don't ouch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct perf_record record = {
	.opts = {
		.mmap_pages	     = UINT_MAX,	/* "not set by user" sentinel */
		.user_freq	     = UINT_MAX,	/* "not set by user" sentinel */
		.user_interval	     = ULLONG_MAX,	/* "not set by user" sentinel */
		.freq		     = 4000,		/* default sampling frequency */
		.target		     = {
			.uses_mmap   = true,
		},
	},
	.write_mode = WRITE_FORCE,
	.file_new   = true,
};
967

968 969 970 971 972 973 974 975
#define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: "

#ifdef LIBUNWIND_SUPPORT
static const char callchain_help[] = CALLCHAIN_HELP "[fp] dwarf";
#else
static const char callchain_help[] = CALLCHAIN_HELP "[fp]";
#endif

976 977 978 979 980 981 982
/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use perf_record_opts,
 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
 * using pipes, etc.
 */
const struct option record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
			    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('A', "append", &record.append_file,
			    "append to the output file to do incremental profiling"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN('f', "force", &record.force,
			"overwrite existing data file (deprecated)"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.output_name, "file",
		    "output file name"),
	OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
	OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
		     "number of mmap data pages"),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_CALLBACK_DEFAULT('g', "call-graph", &record, "mode[,dump_size]",
			     callchain_help, &parse_callchain_opt,
			     "fp"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address,
		    "Sample addresses"),
	OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
	OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
		    "do not update the buildid cache"),
	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
		    "do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),

	/* -b is "any branch"; -j takes an explicit filter list — same parser */
	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_END()
};

1051
/*
 * Entry point for 'perf record': parse and validate the options, set up
 * the default event/target/interval handling, then hand off to
 * __cmd_record() for the actual run.
 * Returns 0 on success or a negative errno-style value on failure.
 */
int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
	int err = -ENOMEM;
	struct perf_evsel *pos;
	struct perf_evlist *evsel_list;
	struct perf_record *rec = &record;
	char errbuf[BUFSIZ];

	evsel_list = perf_evlist__new(NULL, NULL);
	if (evsel_list == NULL)
		return -ENOMEM;

	rec->evlist = evsel_list;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	/* no workload command and no pid/tid/cpu/uid target: nothing to record */
	if (!argc && perf_target__none(&rec->opts.target))
		usage_with_options(record_usage, record_options);

	/* -f and -A contradict each other; otherwise -A selects append mode. */
	if (rec->force && rec->append_file) {
		ui__error("Can't overwrite and append at the same time."
			  " You need to choose between -f and -A");
		usage_with_options(record_usage, record_options);
	} else if (rec->append_file) {
		rec->write_mode = WRITE_APPEND;
	} else {
		rec->write_mode = WRITE_FORCE;
	}

	if (nr_cgroups && !rec->opts.target.system_wide) {
		ui__error("cgroup monitoring only available in"
			  " system-wide mode\n");
		usage_with_options(record_usage, record_options);
	}

	symbol__init();

	if (symbol_conf.kptr_restrict)
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (rec->no_buildid_cache || rec->no_buildid)
		disable_buildid_cache();

	/* no -e given: fall back to the default event (cycles) */
	if (evsel_list->nr_entries == 0 &&
	    perf_evlist__add_default(evsel_list) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	}

	/* target inconsistencies are only warnings; a bad uid is fatal */
	err = perf_target__validate(&rec->opts.target);
	if (err) {
		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s", errbuf);
	}

	err = perf_target__parse_uid(&rec->opts.target);
	if (err) {
		int saved_errno = errno;

		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out_free_fd;
	}

	err = -ENOMEM;
	if (perf_evlist__create_maps(evsel_list, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_header__push_event(pos->attr.config, perf_evsel__name(pos)))
			goto out_free_fd;
	}

	/* -c / -F override the built-in defaults */
	if (rec->opts.user_interval != ULLONG_MAX)
		rec->opts.default_interval = rec->opts.user_interval;
	if (rec->opts.user_freq != UINT_MAX)
		rec->opts.freq = rec->opts.user_freq;

	/*
	 * User specified count overrides default frequency.
	 */
	if (rec->opts.default_interval)
		rec->opts.freq = 0;
	else if (rec->opts.freq) {
		rec->opts.default_interval = rec->opts.freq;
	} else {
		ui__error("frequency and count are zero, aborting\n");
		err = -EINVAL;
		goto out_free_fd;
	}

	err = __cmd_record(&record, argc, argv);
out_free_fd:
	perf_evlist__delete_maps(evsel_list);
out_symbol_exit:
	symbol__exit();
	return err;
}