/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#include "builtin.h"

#include "perf.h"

#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"
#include "util/string.h"

#include <dirent.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>

/*
 * Round x up to the next multiple of a. The mask arithmetic is only
 * correct when a is a power of two. __typeof__ is used instead of the
 * bare GNU keyword so the macros also build in strict ISO modes.
 */
#define ALIGN(x, a)		__ALIGN_MASK(x, (__typeof__(x))(a) - 1)
#define __ALIGN_MASK(x, mask)	(((x) + (mask)) & ~(mask))

/* Counter file descriptors, indexed by [cpu slot][counter]. */
static int			fd[MAX_NR_CPUS][MAX_COUNTERS];

/*
 * Configuration and environment state. Most of these are filled in from
 * the options[] table; nr_cpus, page_size and output are set up in
 * __cmd_record().
 */
static long			default_interval		= 100000;

static int			nr_cpus				= 0;
static unsigned int		page_size;
static unsigned int		mmap_pages			= 128;
static int			freq				= 0;
static int			output;
static const char		*output_name			= "perf.data";
static int			group				= 0;
static unsigned int		realtime_prio			= 0;
static int			system_wide			= 0;
static pid_t			target_pid			= -1;
static int			inherit				= 1;
static int			force				= 0;
static int			append_file			= 0;
static int			call_graph			= 0;
static int			verbose				= 0;

/* Bumped by mmap_read() each time a buffer had new data to drain. */
static long			samples;
/* Timestamps of the previous and the current mmap_read() call. */
static struct timeval		last_read;
static struct timeval		this_read;

/* Bytes of ring-buffer data written to the output file so far. */
static __u64			bytes_written;

/* One poll entry per opened counter; nr_poll is the number in use. */
static struct pollfd		event_array[MAX_NR_CPUS * MAX_COUNTERS];

static int			nr_poll;
/* Number of cpu slots filled so far by open_counters(). */
static int			nr_cpu;

/*
 * Layout of a synthesized PERF_EVENT_MMAP record as it is written to the
 * output file. pid_synthesize_mmap_samples() shrinks header.size so that
 * only the used, u64-aligned prefix of filename[] is emitted.
 */
struct mmap_event {
	struct perf_event_header	header;
	__u32				pid;
	__u32				tid;
	__u64				start;	/* start address of the mapping */
	__u64				len;	/* length of the mapping (end - start) */
	__u64				pgoff;	/* not filled in by the synthesizer in this file */
	char				filename[PATH_MAX];
};

/*
 * Layout of a synthesized PERF_EVENT_COMM record as it is written to the
 * output file. pid_synthesize_comm_event() shrinks header.size so that
 * only the used, u64-aligned prefix of comm[] is emitted.
 */
struct comm_event {
	struct perf_event_header	header;
	__u32				pid;
	__u32				tid;
	char				comm[16];
};

71

72
struct mmap_data {
73 74 75 76
	int			counter;
	void			*base;
	unsigned int		mask;
	unsigned int		prev;
77 78
};

79 80
static struct mmap_data		mmap_array[MAX_NR_CPUS][MAX_COUNTERS];

81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
static unsigned int mmap_read_head(struct mmap_data *md)
{
	struct perf_counter_mmap_page *pc = md->base;
	int head;

	head = pc->data_head;
	rmb();

	return head;
}

static void mmap_read(struct mmap_data *md)
{
	unsigned int head = mmap_read_head(md);
	unsigned int old = md->prev;
	unsigned char *data = md->base + page_size;
	unsigned long size;
	void *buf;
	int diff;

	gettimeofday(&this_read, NULL);

	/*
	 * If we're further behind than half the buffer, there's a chance
105
	 * the writer will bite our tail and mess up the samples under us.
106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
	 *
	 * If we somehow ended up ahead of the head, we got messed up.
	 *
	 * In either case, truncate and restart at head.
	 */
	diff = head - old;
	if (diff > md->mask / 2 || diff < 0) {
		struct timeval iv;
		unsigned long msecs;

		timersub(&this_read, &last_read, &iv);
		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;

		fprintf(stderr, "WARNING: failed to keep up with mmap data."
				"  Last read %lu msecs ago.\n", msecs);

		/*
		 * head points to a known good entry, start there.
		 */
		old = head;
	}

	last_read = this_read;

	if (old != head)
131
		samples++;
132 133 134 135 136 137 138

	size = head - old;

	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;
139

140 141
		while (size) {
			int ret = write(output, buf, size);
142 143 144 145

			if (ret < 0)
				die("failed to write");

146 147
			size -= ret;
			buf += ret;
148 149

			bytes_written += ret;
150 151 152 153 154 155
		}
	}

	buf = &data[old & md->mask];
	size = head - old;
	old += size;
156

157 158
	while (size) {
		int ret = write(output, buf, size);
159 160 161 162

		if (ret < 0)
			die("failed to write");

163 164
		size -= ret;
		buf += ret;
165 166

		bytes_written += ret;
167 168 169 170 171 172
	}

	md->prev = old;
}

/*
 * Both flags are written from sig_handler(), so they use the one object
 * type the C standard allows a signal handler to store to.
 *
 * done  - becomes non-zero once a handled signal has arrived.
 * signr - number of the last handled signal, -1 while none was seen.
 */
static volatile sig_atomic_t done = 0;
static volatile sig_atomic_t signr = -1;

/* Signal handler: record which signal arrived and set the done flag. */
static void sig_handler(int sig)
{
	done = 1;
	signr = sig;
}

/*
 * atexit() hook: if a signal was recorded in signr, restore its default
 * disposition and send it to this process again. Does nothing when no
 * signal was seen (signr == -1).
 */
static void sig_atexit(void)
{
	if (signr == -1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}

190
static void pid_synthesize_comm_event(pid_t pid, int full)
191
{
192
	struct comm_event comm_ev;
193 194
	char filename[PATH_MAX];
	char bf[BUFSIZ];
195
	int fd, ret;
196
	size_t size;
197
	char *field, *sep;
198 199
	DIR *tasks;
	struct dirent dirent, *next;
200 201 202 203 204 205 206 207 208 209 210 211 212 213

	snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);

	fd = open(filename, O_RDONLY);
	if (fd < 0) {
		fprintf(stderr, "couldn't open %s\n", filename);
		exit(EXIT_FAILURE);
	}
	if (read(fd, bf, sizeof(bf)) < 0) {
		fprintf(stderr, "couldn't read %s\n", filename);
		exit(EXIT_FAILURE);
	}
	close(fd);

214
	/* 9027 (cat) R 6747 9027 6747 34816 9027 ... */
215
	memset(&comm_ev, 0, sizeof(comm_ev));
216 217 218 219 220 221 222 223
	field = strchr(bf, '(');
	if (field == NULL)
		goto out_failure;
	sep = strchr(++field, ')');
	if (sep == NULL)
		goto out_failure;
	size = sep - field;
	memcpy(comm_ev.comm, field, size++);
224 225

	comm_ev.pid = pid;
226
	comm_ev.header.type = PERF_EVENT_COMM;
227
	size = ALIGN(size, sizeof(__u64));
228
	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
229

230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256
	if (!full) {
		comm_ev.tid = pid;

		ret = write(output, &comm_ev, comm_ev.header.size);
		if (ret < 0) {
			perror("failed to write");
			exit(-1);
		}
		return;
	}

	snprintf(filename, sizeof(filename), "/proc/%d/task", pid);

	tasks = opendir(filename);
	while (!readdir_r(tasks, &dirent, &next) && next) {
		char *end;
		pid = strtol(dirent.d_name, &end, 10);
		if (*end)
			continue;

		comm_ev.tid = pid;

		ret = write(output, &comm_ev, comm_ev.header.size);
		if (ret < 0) {
			perror("failed to write");
			exit(-1);
		}
257
	}
258 259 260
	closedir(tasks);
	return;

261 262 263 264
out_failure:
	fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
		filename);
	exit(EXIT_FAILURE);
265 266
}

267
static void pid_synthesize_mmap_samples(pid_t pid)
268 269 270 271 272 273 274 275 276 277 278 279
{
	char filename[PATH_MAX];
	FILE *fp;

	snprintf(filename, sizeof(filename), "/proc/%d/maps", pid);

	fp = fopen(filename, "r");
	if (fp == NULL) {
		fprintf(stderr, "couldn't open %s\n", filename);
		exit(EXIT_FAILURE);
	}
	while (1) {
280
		char bf[BUFSIZ], *pbf = bf;
281 282 283
		struct mmap_event mmap_ev = {
			.header.type = PERF_EVENT_MMAP,
		};
284
		int n;
285 286 287 288 289
		size_t size;
		if (fgets(bf, sizeof(bf), fp) == NULL)
			break;

		/* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
290 291 292 293 294 295 296 297 298
		n = hex2u64(pbf, &mmap_ev.start);
		if (n < 0)
			continue;
		pbf += n + 1;
		n = hex2u64(pbf, &mmap_ev.len);
		if (n < 0)
			continue;
		pbf += n + 3;
		if (*pbf == 'x') { /* vm_exec */
299 300 301 302 303 304 305 306 307
			char *execname = strrchr(bf, ' ');

			if (execname == NULL || execname[1] != '/')
				continue;

			execname += 1;
			size = strlen(execname);
			execname[size - 1] = '\0'; /* Remove \n */
			memcpy(mmap_ev.filename, execname, size);
308
			size = ALIGN(size, sizeof(__u64));
309 310 311
			mmap_ev.len -= mmap_ev.start;
			mmap_ev.header.size = (sizeof(mmap_ev) -
					       (sizeof(mmap_ev.filename) - size));
312
			mmap_ev.pid = pid;
313 314 315 316 317 318 319 320 321 322 323 324
			mmap_ev.tid = pid;

			if (write(output, &mmap_ev, mmap_ev.header.size) < 0) {
				perror("failed to write");
				exit(-1);
			}
		}
	}

	fclose(fp);
}

/*
 * Walk /proc and synthesize COMM (for every thread) and MMAP records for
 * each entry whose name is purely numeric, i.e. each process directory.
 * Exits if /proc itself cannot be opened.
 */
static void synthesize_samples(void)
{
	DIR *proc;
	struct dirent *entry;

	proc = opendir("/proc");
	if (proc == NULL) {
		fprintf(stderr, "couldn't open %s\n", "/proc");
		exit(EXIT_FAILURE);
	}

	while ((entry = readdir(proc)) != NULL) {
		char *end;
		pid_t pid;

		pid = strtol(entry->d_name, &end, 10);
		if (*end) /* only interested in proper numerical dirents */
			continue;

		pid_synthesize_comm_event(pid, 1);
		pid_synthesize_mmap_samples(pid);
	}

	closedir(proc);
}

347 348 349
static int group_fd;

static void create_counter(int counter, int cpu, pid_t pid)
350
{
351
	struct perf_counter_attr *attr = attrs + counter;
352 353
	int track = 1;

354
	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
355

356
	if (freq) {
357
		attr->sample_type	|= PERF_SAMPLE_PERIOD;
358 359
		attr->freq		= 1;
		attr->sample_freq	= freq;
360
	}
361 362 363 364

	if (call_graph)
		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;

365 366 367
	attr->mmap		= track;
	attr->comm		= track;
	attr->inherit		= (cpu < 0) && inherit;
368
	attr->disabled		= 1;
369

370
	track = 0; /* only the first counter needs these */
371

372
try_again:
373
	fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0);
374

375 376
	if (fd[nr_cpu][counter] < 0) {
		int err = errno;
377

378
		if (err == EPERM)
379 380 381 382 383 384 385 386
			die("Permission error - are you root?\n");

		/*
		 * If it's cycles then fall back to hrtimer
		 * based cpu-clock-tick sw counter, which
		 * is always available even if no PMU support:
		 */
		if (attr->type == PERF_TYPE_HARDWARE
387
			&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {
388 389 390 391

			if (verbose)
				warning(" ... trying to fall back to cpu-clock-ticks\n");
			attr->type = PERF_TYPE_SOFTWARE;
392
			attr->config = PERF_COUNT_SW_CPU_CLOCK;
393 394
			goto try_again;
		}
395 396 397 398
		printf("\n");
		error("perfcounter syscall returned with %d (%s)\n",
			fd[nr_cpu][counter], strerror(err));
		die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
399 400
		exit(-1);
	}
401

402 403
	assert(fd[nr_cpu][counter] >= 0);
	fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
404

405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423
	/*
	 * First counter acts as the group leader:
	 */
	if (group && group_fd == -1)
		group_fd = fd[nr_cpu][counter];

	event_array[nr_poll].fd = fd[nr_cpu][counter];
	event_array[nr_poll].events = POLLIN;
	nr_poll++;

	mmap_array[nr_cpu][counter].counter = counter;
	mmap_array[nr_cpu][counter].prev = 0;
	mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
	mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
			PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
	if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
		error("failed to mmap with %d (%s)\n", errno, strerror(errno));
		exit(-1);
	}
424 425

	ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE);
426
}
427

428 429 430
static void open_counters(int cpu, pid_t pid)
{
	int counter;
431

432 433
	if (pid > 0) {
		pid_synthesize_comm_event(pid, 0);
434
		pid_synthesize_mmap_samples(pid);
435
	}
436 437 438 439 440

	group_fd = -1;
	for (counter = 0; counter < nr_counters; counter++)
		create_counter(counter, cpu, pid);

441 442 443
	nr_cpu++;
}

444
static int __cmd_record(int argc, const char **argv)
445 446
{
	int i, counter;
I
Ingo Molnar 已提交
447
	struct stat st;
448
	pid_t pid;
I
Ingo Molnar 已提交
449
	int flags;
450 451 452 453 454 455 456
	int ret;

	page_size = sysconf(_SC_PAGE_SIZE);
	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	assert(nr_cpus <= MAX_NR_CPUS);
	assert(nr_cpus >= 0);

I
Ingo Molnar 已提交
457 458
	if (!stat(output_name, &st) && !force && !append_file) {
		fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n",
459 460 461 462
				output_name);
		exit(-1);
	}

I
Ingo Molnar 已提交
463 464 465 466 467 468 469
	flags = O_CREAT|O_RDWR;
	if (append_file)
		flags |= O_APPEND;
	else
		flags |= O_TRUNC;

	output = open(output_name, flags, S_IRUSR|S_IWUSR);
470 471 472 473 474
	if (output < 0) {
		perror("failed to create output file");
		exit(-1);
	}

475
	if (!system_wide) {
476
		open_counters(-1, target_pid != -1 ? target_pid : getpid());
477 478
	} else for (i = 0; i < nr_cpus; i++)
		open_counters(i, target_pid);
479

480
	atexit(sig_atexit);
481 482
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
483

484
	if (target_pid == -1 && argc) {
485 486 487
		pid = fork();
		if (pid < 0)
			perror("failed to fork");
488

489
		if (!pid) {
490
			if (execvp(argv[0], (char **)argv)) {
491 492 493
				perror(argv[0]);
				exit(-1);
			}
494 495 496 497 498 499 500 501 502 503 504 505 506
		}
	}

	if (realtime_prio) {
		struct sched_param param;

		param.sched_priority = realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			printf("Could not set realtime priority.\n");
			exit(-1);
		}
	}

507
	if (system_wide)
508
		synthesize_samples();
509 510

	while (!done) {
511
		int hits = samples;
512

513
		for (i = 0; i < nr_cpu; i++) {
514 515 516 517
			for (counter = 0; counter < nr_counters; counter++)
				mmap_read(&mmap_array[i][counter]);
		}

518
		if (hits == samples)
519 520 521
			ret = poll(event_array, nr_poll, 100);
	}

522 523 524 525
	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
526
		"[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n",
527 528 529
		(double)bytes_written / 1024.0 / 1024.0,
		output_name,
		bytes_written / 24);
530

531 532
	return 0;
}
533 534

/* Usage lines for "perf record"; the list is NULL-terminated. */
static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};

540
static const struct option options[] = {
541
	OPT_CALLBACK('e', "event", NULL, "event",
542 543
		     "event selector. use 'perf list' to list available events",
		     parse_events),
544 545 546 547 548 549
	OPT_INTEGER('p', "pid", &target_pid,
		    "record events on existing pid"),
	OPT_INTEGER('r', "realtime", &realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN('a', "all-cpus", &system_wide,
			    "system-wide collection from all CPUs"),
I
Ingo Molnar 已提交
550 551
	OPT_BOOLEAN('A', "append", &append_file,
			    "append to the output file to do incremental profiling"),
552 553
	OPT_BOOLEAN('f', "force", &force,
			"overwrite existing data file"),
554
	OPT_LONG('c', "count", &default_interval,
I
Ingo Molnar 已提交
555 556 557 558 559
		    "event period to sample"),
	OPT_STRING('o', "output", &output_name, "file",
		    "output file name"),
	OPT_BOOLEAN('i', "inherit", &inherit,
		    "child tasks inherit counters"),
I
Ingo Molnar 已提交
560 561
	OPT_INTEGER('F', "freq", &freq,
		    "profile at this frequency"),
I
Ingo Molnar 已提交
562 563
	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
		    "number of mmap data pages"),
564 565
	OPT_BOOLEAN('g', "call-graph", &call_graph,
		    "do call-graph (stack chain/backtrace) recording"),
566 567
	OPT_BOOLEAN('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
568 569 570 571 572 573 574 575
	OPT_END()
};

int cmd_record(int argc, const char **argv, const char *prefix)
{
	int counter;

	argc = parse_options(argc, argv, options, record_usage, 0);
576
	if (!argc && target_pid == -1 && !system_wide)
577 578
		usage_with_options(record_usage, options);

579 580 581 582 583
	if (!nr_counters) {
		nr_counters	= 1;
		attrs[0].type	= PERF_TYPE_HARDWARE;
		attrs[0].config = PERF_COUNT_HW_CPU_CYCLES;
	}
584 585

	for (counter = 0; counter < nr_counters; counter++) {
586
		if (attrs[counter].sample_period)
587 588
			continue;

589
		attrs[counter].sample_period = default_interval;
590 591 592 593
	}

	return __cmd_record(argc, argv);
}