/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#define _FILE_OFFSET_BITS 64

#include "builtin.h"

#include "perf.h"

#include "util/build-id.h"
#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"

#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/session.h"
#include "util/symbol.h"
#include "util/cpumap.h"

#include <unistd.h>
#include <sched.h>
#include <sys/mman.h>

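/*
 * Pick the file descriptor opened for a given (cpu, thread) pair out
 * of the evsel's fd xyarray.
 */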
#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))

enum write_mode_t {
	WRITE_FORCE,
	WRITE_APPEND
};

static u64			user_interval			= ULLONG_MAX;
static u64			default_interval		=      0;
static u64			sample_type;

static struct cpu_map		*cpus;
static unsigned int		page_size;
static unsigned int		mmap_pages			=    128;
static unsigned int		user_freq			= UINT_MAX;
static int			freq				=   1000;
static int			output;
static int			pipe_output			=      0;
static const char		*output_name			= "perf.data";
static int			group				=      0;
static int			realtime_prio			=      0;
static bool			nodelay				=  false;
static bool			raw_samples			=  false;
static bool			sample_id_all_avail		=   true;
static bool			system_wide			=  false;
static pid_t			target_pid			=     -1;
static pid_t			target_tid			=     -1;
static struct thread_map	*threads;
static pid_t			child_pid			=     -1;
static bool			no_inherit			=  false;
static enum write_mode_t	write_mode			= WRITE_FORCE;
static bool			call_graph			=  false;
static bool			inherit_stat			=  false;
static bool			no_samples			=  false;
static bool			sample_address			=  false;
static bool			sample_time			=  false;
static bool			no_buildid			=  false;
static bool			no_buildid_cache		=  false;
static struct perf_evlist	*evsel_list;

static long			samples				=      0;
static u64			bytes_written			=      0;

static int			file_new			=      1;
static off_t			post_processing_offset;

static struct perf_session	*session;
static const char		*cpu_list;

struct mmap_data {
	void			*base;
	unsigned int		mask;
	unsigned int		prev;
};

static struct mmap_data		mmap_array[MAX_NR_CPUS];

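/*
 * Read the kernel's write position (data_head) in an mmap'ed ring
 * buffer; the rmb() pairs with the kernel's barrier so the event data
 * behind data_head is visible before we read it.
 */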
static unsigned long mmap_read_head(struct mmap_data *md)
{
	struct perf_event_mmap_page *pc = md->base;
	long head;

	head = pc->data_head;
	rmb();

	return head;
}

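/*
 * Publish our read position (data_tail) back to the kernel so it knows
 * how much of the ring buffer can be reused for new events.
 */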
static void mmap_write_tail(struct mmap_data *md, unsigned long tail)
{
	struct perf_event_mmap_page *pc = md->base;

	/*
	 * ensure all reads are done before we write the tail out.
	 */
	/* mb(); */
	pc->data_tail = tail;
}

static void advance_output(size_t size)
{
	bytes_written += size;
}

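/*
 * Append a buffer to the output file, looping until every byte has
 * been written or write() fails.
 */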
static void write_output(void *buf, size_t size)
{
	while (size) {
		int ret = write(output, buf, size);

		if (ret < 0)
			die("failed to write");

		size -= ret;
		buf += ret;

		bytes_written += ret;
	}
}

static int process_synthesized_event(event_t *event,
				     struct sample_data *sample __used,
				     struct perf_session *self __used)
{
	write_output(event, event->header.size);
	return 0;
}

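/*
 * Drain all new events from one mmap ring buffer into the output file.
 * A read that crosses the end of the ring is split into two writes.
 */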
static void mmap_read(struct mmap_data *md)
{
	unsigned int head = mmap_read_head(md);
	unsigned int old = md->prev;
	unsigned char *data = md->base + page_size;
	unsigned long size;
	void *buf;
	int diff;

	/*
	 * If we're further behind than half the buffer, there's a chance
	 * the writer will bite our tail and mess up the samples under us.
	 *
	 * If we somehow ended up ahead of the head, we got messed up.
	 *
	 * In either case, truncate and restart at head.
	 */
	diff = head - old;
	if (diff < 0) {
		fprintf(stderr, "WARNING: failed to keep up with mmap data\n");
		/*
		 * head points to a known good entry, start there.
		 */
		old = head;
	}

	if (old != head)
		samples++;

	size = head - old;

	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;

		write_output(buf, size);
	}

	buf = &data[old & md->mask];
	size = head - old;
	old += size;

	write_output(buf, size);

	md->prev = old;
	mmap_write_tail(md, old);
}

static volatile int done = 0;
static volatile int signr = -1;

static void sig_handler(int sig)
{
	done = 1;
	signr = sig;
}

static void sig_atexit(void)
{
	if (child_pid > 0)
		kill(child_pid, SIGTERM);

	if (signr == -1 || signr == SIGUSR1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}

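/*
 * Return the file header attribute for event slot nr: reuse the slot
 * already read from an existing file (the append case), otherwise
 * allocate a new one and register it with the session header.
 */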
static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int nr)
{
	struct perf_header_attr *h_attr;

	if (nr < session->header.attrs) {
		h_attr = session->header.attr[nr];
	} else {
		h_attr = perf_header_attr__new(a);
		if (h_attr != NULL)
			if (perf_header__add_attr(&session->header, h_attr) < 0) {
				perf_header_attr__delete(h_attr);
				h_attr = NULL;
			}
	}

	return h_attr;
}

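/*
 * Per-cpu post-open setup for one event: record its kernel-assigned ID
 * in the file header, redirect every counter after the first into the
 * first counter's ring buffer, and apply any tracepoint filter.
 */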
static void create_counter(struct perf_evlist *evlist,
			   struct perf_evsel *evsel, int cpu)
{
	char *filter = evsel->filter;
	struct perf_event_attr *attr = &evsel->attr;
	struct perf_header_attr *h_attr;
	int thread_index;
	int ret;
	struct {
		u64 count;
		u64 time_enabled;
		u64 time_running;
		u64 id;
	} read_data;

	for (thread_index = 0; thread_index < threads->nr; thread_index++) {
		h_attr = get_header_attr(attr, evsel->idx);
		if (h_attr == NULL)
			die("nomem\n");

		if (!file_new) {
			if (memcmp(&h_attr->attr, attr, sizeof(*attr))) {
				fprintf(stderr, "incompatible append\n");
				exit(-1);
			}
		}

		if (read(FD(evsel, cpu, thread_index), &read_data, sizeof(read_data)) == -1) {
			perror("Unable to read perf file descriptor");
			exit(-1);
		}

		if (perf_header_attr__add_id(h_attr, read_data.id) < 0) {
			pr_warning("Not enough memory to add id\n");
			exit(-1);
		}

		assert(FD(evsel, cpu, thread_index) >= 0);
		fcntl(FD(evsel, cpu, thread_index), F_SETFL, O_NONBLOCK);

		if (evsel->idx || thread_index) {
			struct perf_evsel *first;
			first = list_entry(evlist->entries.next, struct perf_evsel, node);
			ret = ioctl(FD(evsel, cpu, thread_index),
				    PERF_EVENT_IOC_SET_OUTPUT,
				    FD(first, cpu, 0));
			if (ret) {
				error("failed to set output: %d (%s)\n", errno,
						strerror(errno));
				exit(-1);
			}
		} else {
			mmap_array[cpu].prev = 0;
			mmap_array[cpu].mask = mmap_pages*page_size - 1;
			mmap_array[cpu].base = mmap(NULL, (mmap_pages+1)*page_size,
				PROT_READ | PROT_WRITE, MAP_SHARED, FD(evsel, cpu, thread_index), 0);
			if (mmap_array[cpu].base == MAP_FAILED) {
				error("failed to mmap with %d (%s)\n", errno, strerror(errno));
				exit(-1);
			}

			evlist->pollfd[evlist->nr_fds].fd = FD(evsel, cpu, thread_index);
			evlist->pollfd[evlist->nr_fds].events = POLLIN;
			evlist->nr_fds++;
		}

		if (filter != NULL) {
			ret = ioctl(FD(evsel, cpu, thread_index),
				    PERF_EVENT_IOC_SET_FILTER, filter);
			if (ret) {
				error("failed to set filter with %d (%s)\n", errno,
						strerror(errno));
				exit(-1);
			}
		}
	}

	if (!sample_type)
		sample_type = attr->sample_type;
}

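/*
 * Translate the command line flags into perf_event_attr settings for
 * one event: read format, sample_type bits and the sampling
 * frequency/period defaults.
 */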
static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist)
{
	struct perf_event_attr *attr = &evsel->attr;
	int track = !evsel->idx; /* only the first counter needs these */

	attr->read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
				  PERF_FORMAT_TOTAL_TIME_RUNNING |
				  PERF_FORMAT_ID;

	attr->sample_type	|= PERF_SAMPLE_IP | PERF_SAMPLE_TID;

	if (evlist->nr_entries > 1)
		attr->sample_type |= PERF_SAMPLE_ID;

	/*
	 * We default some events to a 1 default interval. But keep
	 * it a weak assumption overridable by the user.
	 */
	if (!attr->sample_period || (user_freq != UINT_MAX &&
				     user_interval != ULLONG_MAX)) {
		if (freq) {
			attr->sample_type	|= PERF_SAMPLE_PERIOD;
			attr->freq		= 1;
			attr->sample_freq	= freq;
		} else {
			attr->sample_period = default_interval;
		}
	}

	if (no_samples)
		attr->sample_freq = 0;

	if (inherit_stat)
		attr->inherit_stat = 1;

	if (sample_address) {
		attr->sample_type	|= PERF_SAMPLE_ADDR;
		attr->mmap_data = track;
	}

	if (call_graph)
		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;

	if (system_wide)
		attr->sample_type	|= PERF_SAMPLE_CPU;

	if (sample_id_all_avail &&
	    (sample_time || system_wide || !no_inherit || cpu_list))
		attr->sample_type	|= PERF_SAMPLE_TIME;

	if (raw_samples) {
		attr->sample_type	|= PERF_SAMPLE_TIME;
		attr->sample_type	|= PERF_SAMPLE_RAW;
		attr->sample_type	|= PERF_SAMPLE_CPU;
	}

	if (nodelay) {
		attr->watermark = 0;
		attr->wakeup_events = 1;
	}

	attr->mmap		= track;
	attr->comm		= track;

	if (target_pid == -1 && target_tid == -1 && !system_wide) {
		attr->disabled = 1;
		attr->enable_on_exec = 1;
	}
}

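/*
 * Open all events on the selected CPUs and threads, degrading
 * gracefully on older kernels: retry without sample_id_all if the
 * kernel rejects it, and fall back from hardware cycles to the
 * software cpu-clock event when there is no PMU support.
 */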
static void open_counters(struct perf_evlist *evlist)
{
	struct perf_evsel *pos;
	int cpu;

	list_for_each_entry(pos, &evlist->entries, node) {
		struct perf_event_attr *attr = &pos->attr;
		/*
		 * Check if parse_single_tracepoint_event has already asked for
		 * PERF_SAMPLE_TIME.
		 *
		 * XXX this is kludgy but short term fix for problems introduced by
		 * eac23d1c that broke 'perf script' by having different sample_types
		 * when using multiple tracepoint events when we use a perf binary
		 * that tries to use sample_id_all on an older kernel.
		 *
		 * We need to move counter creation to perf_session, support
		 * different sample_types, etc.
		 */
		bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;

		config_attr(pos, evlist);
retry_sample_id:
		attr->sample_id_all = sample_id_all_avail ? 1 : 0;
try_again:
		if (perf_evsel__open(pos, cpus, threads, group, !no_inherit) < 0) {
			int err = errno;

			if (err == EPERM || err == EACCES)
				die("Permission error - are you root?\n"
					"\t Consider tweaking"
					" /proc/sys/kernel/perf_event_paranoid.\n");
			else if (err == ENODEV && cpu_list) {
				die("No such device - did you specify"
					" an out-of-range profile CPU?\n");
			} else if (err == EINVAL && sample_id_all_avail) {
				/*
				 * Old kernel, no attr->sample_id_type_all field
				 */
				sample_id_all_avail = false;
				if (!sample_time && !raw_samples && !time_needed)
					attr->sample_type &= ~PERF_SAMPLE_TIME;

				goto retry_sample_id;
			}

			/*
			 * If it's cycles then fall back to hrtimer
			 * based cpu-clock-tick sw counter, which
			 * is always available even if no PMU support:
			 */
			if (attr->type == PERF_TYPE_HARDWARE
					&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {

				if (verbose)
					warning(" ... trying to fall back to cpu-clock-ticks\n");
				attr->type = PERF_TYPE_SOFTWARE;
				attr->config = PERF_COUNT_SW_CPU_CLOCK;
				goto try_again;
			}
			printf("\n");
			error("sys_perf_event_open() syscall returned with %d (%s).  /bin/dmesg may provide additional information.\n",
			      err, strerror(err));

#if defined(__i386__) || defined(__x86_64__)
			if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
				die("No hardware sampling interrupt available."
				    " No APIC? If so then you can boot the kernel"
				    " with the \"lapic\" boot parameter to"
				    " force-enable it.\n");
#endif

			die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
		}
	}

	for (cpu = 0; cpu < cpus->nr; ++cpu) {
		list_for_each_entry(pos, &evlist->entries, node)
			create_counter(evlist, pos, cpu);
	}
}

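/*
 * Re-read the events recorded so far and mark the DSOs they hit, so
 * that only the build-ids that matter end up in the header.
 */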
static int process_buildids(void)
{
	u64 size = lseek(output, 0, SEEK_CUR);

	if (size == 0)
		return 0;

	session->fd = output;
	return __perf_session__process_events(session, post_processing_offset,
					      size - post_processing_offset,
					      size, &build_id__mark_dso_hit_ops);
}

static void atexit_header(void)
{
	if (!pipe_output) {
		session->header.data_size += bytes_written;

		if (!no_buildid)
			process_buildids();
		perf_header__write(&session->header, evsel_list, output, true);
		perf_session__delete(session);
		perf_evlist__delete(evsel_list);
		symbol__exit();
	}
}

static void event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_session *psession = data;

	if (machine__is_host(machine))
		return;

	/*
	 * As for the guest kernel, when processing the record & report
	 * subcommands we arrange the module mmaps prior to the guest
	 * kernel mmap and trigger a preload of the DSOs, because the
	 * default guest module symbols are loaded from guest kallsyms
	 * instead of /lib/modules/XXX/XXX.  This avoids missing symbols
	 * when the first sampled address is in a module rather than in
	 * the guest kernel.
	 */
	err = event__synthesize_modules(process_synthesized_event,
					psession, machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
	 * have no _text sometimes.
	 */
	err = event__synthesize_kernel_mmap(process_synthesized_event,
					    psession, machine, "_text");
	if (err < 0)
		err = event__synthesize_kernel_mmap(process_synthesized_event,
						    psession, machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}

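/*
 * Emitted after each pass over the mmap buffers when recording
 * tracepoints, marking a point up to which the events in the file can
 * safely be reordered and flushed by the consumer.
 */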
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

static void mmap_read_all(void)
{
	int i;

	for (i = 0; i < cpus->nr; i++) {
		if (mmap_array[i].base)
			mmap_read(&mmap_array[i]);
	}

	if (perf_header__has_feat(&session->header, HEADER_TRACE_INFO))
		write_output(&finished_round_event, sizeof(finished_round_event));
}

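/*
 * The record main loop: fork the workload (if any), open and mmap the
 * counters, synthesize the initial metadata events, then drain the
 * ring buffers until the workload exits or we are interrupted.
 */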
static int __cmd_record(int argc, const char **argv)
{
	int i;
	struct stat st;
	int flags;
	int err;
	unsigned long waking = 0;
	int child_ready_pipe[2], go_pipe[2];
	const bool forks = argc > 0;
	char buf;
	struct machine *machine;

	page_size = sysconf(_SC_PAGE_SIZE);

	atexit(sig_atexit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGUSR1, sig_handler);

	if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
		perror("failed to create pipes");
		exit(-1);
	}

	if (!strcmp(output_name, "-"))
		pipe_output = 1;
	else if (!stat(output_name, &st) && st.st_size) {
		if (write_mode == WRITE_FORCE) {
			char oldname[PATH_MAX];
			snprintf(oldname, sizeof(oldname), "%s.old",
				 output_name);
			unlink(oldname);
			rename(output_name, oldname);
		}
	} else if (write_mode == WRITE_APPEND) {
		write_mode = WRITE_FORCE;
	}

	flags = O_CREAT|O_RDWR;
	if (write_mode == WRITE_APPEND)
		file_new = 0;
	else
		flags |= O_TRUNC;

	if (pipe_output)
		output = STDOUT_FILENO;
	else
		output = open(output_name, flags, S_IRUSR | S_IWUSR);
	if (output < 0) {
		perror("failed to create output file");
		exit(-1);
	}

	session = perf_session__new(output_name, O_WRONLY,
				    write_mode == WRITE_FORCE, false, NULL);
	if (session == NULL) {
		pr_err("Not enough memory for reading perf file header\n");
		return -1;
	}

	if (!no_buildid)
		perf_header__set_feat(&session->header, HEADER_BUILD_ID);

	if (!file_new) {
		err = perf_header__read(session, output);
		if (err < 0)
			goto out_delete_session;
	}

	if (have_tracepoints(&evsel_list->entries))
		perf_header__set_feat(&session->header, HEADER_TRACE_INFO);

	/*
	 * perf_session__delete(session) will be called at atexit_header()
	 */
	atexit(atexit_header);

	if (forks) {
		child_pid = fork();
		if (child_pid < 0) {
			perror("failed to fork");
			exit(-1);
		}

		if (!child_pid) {
			if (pipe_output)
				dup2(2, 1);
			close(child_ready_pipe[0]);
			close(go_pipe[1]);
			fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);

			/*
			 * Do a dummy execvp to get the PLT entry resolved,
			 * so we avoid the resolver overhead on the real
			 * execvp call.
			 */
			execvp("", (char **)argv);

			/*
			 * Tell the parent we're ready to go
			 */
			close(child_ready_pipe[1]);

			/*
			 * Wait until the parent tells us to go.
			 */
			if (read(go_pipe[0], &buf, 1) == -1)
				perror("unable to read pipe");

			execvp(argv[0], (char **)argv);

			perror(argv[0]);
			kill(getppid(), SIGUSR1);
			exit(-1);
		}

		if (!system_wide && target_tid == -1 && target_pid == -1)
			threads->map[0] = child_pid;

		close(child_ready_pipe[1]);
		close(go_pipe[0]);
		/*
		 * wait for child to settle
		 */
		if (read(child_ready_pipe[0], &buf, 1) == -1) {
			perror("unable to read pipe");
			exit(-1);
		}
		close(child_ready_pipe[0]);
	}

	open_counters(evsel_list);

	perf_session__set_sample_type(session, sample_type);

	if (pipe_output) {
		err = perf_header__write_pipe(output);
		if (err < 0)
			return err;
	} else if (file_new) {
		err = perf_header__write(&session->header, evsel_list,
					 output, false);
		if (err < 0)
			return err;
	}

	post_processing_offset = lseek(output, 0, SEEK_CUR);

	perf_session__set_sample_id_all(session, sample_id_all_avail);

	if (pipe_output) {
		err = event__synthesize_attrs(&session->header,
					      process_synthesized_event,
					      session);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			return err;
		}

		err = event__synthesize_event_types(process_synthesized_event,
						    session);
		if (err < 0) {
			pr_err("Couldn't synthesize event_types.\n");
			return err;
		}

		if (have_tracepoints(&evsel_list->entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so it's not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
			err = event__synthesize_tracing_data(output, evsel_list,
							     process_synthesized_event,
							     session);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				return err;
			}
			advance_output(err);
		}
	}

	machine = perf_session__find_host_machine(session);
	if (!machine) {
		pr_err("Couldn't find native kernel information.\n");
		return -1;
	}

	err = event__synthesize_kernel_mmap(process_synthesized_event,
					    session, machine, "_text");
	if (err < 0)
		err = event__synthesize_kernel_mmap(process_synthesized_event,
						    session, machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");

	err = event__synthesize_modules(process_synthesized_event,
					session, machine);
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

	if (perf_guest)
		perf_session__process_machines(session, event__synthesize_guest_os);

	if (!system_wide)
		event__synthesize_thread(target_tid, process_synthesized_event,
					 session);
	else
		event__synthesize_threads(process_synthesized_event, session);

	if (realtime_prio) {
		struct sched_param param;

		param.sched_priority = realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			exit(-1);
		}
	}

	/*
	 * Let the child rip
	 */
	if (forks)
		close(go_pipe[1]);

	for (;;) {
		int hits = samples;
		int thread;

		mmap_read_all();

		if (hits == samples) {
			if (done)
				break;
			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
			waking++;
		}

		if (done) {
			for (i = 0; i < cpus->nr; i++) {
				struct perf_evsel *pos;

				list_for_each_entry(pos, &evsel_list->entries, node) {
					for (thread = 0;
						thread < threads->nr;
						thread++)
						ioctl(FD(pos, i, thread),
							PERF_EVENT_IOC_DISABLE);
				}
			}
		}
	}

	if (quiet || signr == SIGUSR1)
		return 0;

	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
		"[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
		(double)bytes_written / 1024.0 / 1024.0,
		output_name,
		bytes_written / 24);

	return 0;

out_delete_session:
	perf_session__delete(session);
	return err;
}

static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};

static bool force, append_file;

const struct option record_options[] = {
	OPT_CALLBACK('e', "event", &evsel_list, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events),
	OPT_CALLBACK(0, "filter", &evsel_list, "filter",
		     "event filter", parse_filter),
	OPT_INTEGER('p', "pid", &target_pid,
		    "record events on existing process id"),
	OPT_INTEGER('t', "tid", &target_tid,
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN('D', "no-delay", &nodelay,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &system_wide,
			    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('A', "append", &append_file,
			    "append to the output file to do incremental profiling"),
	OPT_STRING('C', "cpu", &cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN('f', "force", &force,
			"overwrite existing data file (deprecated)"),
	OPT_U64('c', "count", &user_interval, "event period to sample"),
	OPT_STRING('o', "output", &output_name, "file",
		    "output file name"),
	OPT_BOOLEAN('i', "no-inherit", &no_inherit,
		    "child tasks do not inherit counters"),
	OPT_UINTEGER('F', "freq", &user_freq, "profile at this frequency"),
	OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"),
	OPT_BOOLEAN('g', "call-graph", &call_graph,
		    "do call-graph (stack chain/backtrace) recording"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &sample_address,
		    "Sample addresses"),
	OPT_BOOLEAN('T', "timestamp", &sample_time, "Sample timestamps"),
	OPT_BOOLEAN('n', "no-samples", &no_samples,
		    "don't sample"),
	OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache,
		    "do not update the buildid cache"),
	OPT_BOOLEAN('B', "no-buildid", &no_buildid,
		    "do not collect buildids in perf.data"),
	OPT_END()
};

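/*
 * Entry point: parse the options, build the event/thread/CPU maps,
 * resolve the period vs. frequency defaults, then hand off to
 * __cmd_record().
 */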
int cmd_record(int argc, const char **argv, const char *prefix __used)
{
	int err = -ENOMEM;
	struct perf_evsel *pos;

	evsel_list = perf_evlist__new();
	if (evsel_list == NULL)
		return -ENOMEM;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc && target_pid == -1 && target_tid == -1 &&
		!system_wide && !cpu_list)
		usage_with_options(record_usage, record_options);

	if (force && append_file) {
		fprintf(stderr, "Can't overwrite and append at the same time."
				" You need to choose between -f and -A");
		usage_with_options(record_usage, record_options);
	} else if (append_file) {
		write_mode = WRITE_APPEND;
	} else {
		write_mode = WRITE_FORCE;
	}

	symbol__init();

	if (no_buildid_cache || no_buildid)
		disable_buildid_cache();

	if (evsel_list->nr_entries == 0 &&
	    perf_evlist__add_default(evsel_list) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	}

	if (target_pid != -1)
		target_tid = target_pid;

	threads = thread_map__new(target_pid, target_tid);
	if (threads == NULL) {
		pr_err("Problems finding threads of monitor\n");
		usage_with_options(record_usage, record_options);
	}

	if (target_tid != -1)
		cpus = cpu_map__dummy_new();
	else
		cpus = cpu_map__new(cpu_list);

	if (cpus == NULL)
		usage_with_options(record_usage, record_options);

	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
			goto out_free_fd;
		if (perf_header__push_event(pos->attr.config, event_name(pos)))
			goto out_free_fd;
	}

	if (perf_evlist__alloc_pollfd(evsel_list, cpus->nr, threads->nr) < 0)
		goto out_free_fd;

	if (user_interval != ULLONG_MAX)
		default_interval = user_interval;
	if (user_freq != UINT_MAX)
		freq = user_freq;

	/*
	 * User specified count overrides default frequency.
	 */
	if (default_interval)
		freq = 0;
	else if (freq) {
		default_interval = freq;
	} else {
		fprintf(stderr, "frequency and count are zero, aborting\n");
		err = -EINVAL;
		goto out_free_fd;
	}

	err = __cmd_record(argc, argv);

out_free_fd:
	thread_map__delete(threads);
	threads = NULL;
out_symbol_exit:
	symbol__exit();
	return err;
}