/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#include "builtin.h"
9 10 11

#include "perf.h"

12
#include "util/util.h"
13
#include "util/parse-options.h"
14
#include "util/parse-events.h"
15
#include "util/string.h"
16

17
#include <unistd.h>
18 19
#include <sched.h>

20 21
/*
 * ALIGN(x, a) rounds x up to the next multiple of a, computed in x's own
 * type.  The add-then-mask arithmetic is only a round-up when a is a power
 * of two.  __ALIGN_MASK() is the underlying helper taking the mask (a - 1)
 * directly.
 */
#define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
#define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
22

23
static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
24 25 26

static long			default_interval		= 100000;

27
static int			nr_cpus				= 0;
28
static unsigned int		page_size;
29
static unsigned int		mmap_pages			= 128;
I
Ingo Molnar 已提交
30
static int			freq				= 0;
31
static int			output;
32
static const char		*output_name			= "perf.data";
33
static int			group				= 0;
34 35
static unsigned int		realtime_prio			= 0;
static int			system_wide			= 0;
36
static pid_t			target_pid			= -1;
37
static int			inherit				= 1;
38
static int			force				= 0;
I
Ingo Molnar 已提交
39
static int			append_file			= 0;
40
static int			call_graph			= 0;
41
static int			verbose				= 0;
42

43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
static long			samples;
static struct timeval		last_read;
static struct timeval		this_read;

static __u64			bytes_written;

static struct pollfd		event_array[MAX_NR_CPUS * MAX_COUNTERS];

static int			nr_poll;
static int			nr_cpu;

/*
 * Synthesized PERF_EVENT_MMAP record, built from one line of
 * /proc/<pid>/maps by pid_synthesize_mmap_samples() and written to the
 * output file.  Only header.size bytes are written, so the unused tail of
 * filename[] never reaches the file.
 */
struct mmap_event {
	struct perf_event_header	header;
	__u32				pid;		/* process the mapping belongs to */
	__u32				tid;		/* set equal to pid by the synthesizer */
	__u64				start;		/* start address of the mapping */
	__u64				len;		/* end address minus start */
	__u64				pgoff;		/* not filled in by this file; stays zero */
	char				filename[PATH_MAX]; /* NUL-terminated path of the mapped file */
};

/*
 * Synthesized PERF_EVENT_COMM record, built from /proc/<pid>/stat by
 * pid_synthesize_comm_event() and written to the output file.  Only
 * header.size bytes are written, so the unused tail of comm[] is trimmed.
 */
struct comm_event {
	struct perf_event_header	header;
	__u32				pid;		/* process id the name was read for */
	__u32				tid;		/* thread id this record is emitted for */
	char				comm[16];	/* NUL-terminated command name */
};

71

72
/*
 * Per-counter state for one mmap()ed sample buffer; filled in by
 * create_counter() and consumed by mmap_read().
 */
struct mmap_data {
	int			counter;	/* index of the counter this buffer belongs to */
	void			*base;		/* mmap() address; data starts one page in */
	unsigned int		mask;		/* data area size in bytes, minus one */
	unsigned int		prev;		/* head position up to which data was consumed */
};

79 80
static struct mmap_data		mmap_array[MAX_NR_CPUS][MAX_COUNTERS];

81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
/*
 * Fetch the current data_head value from the control page at the start
 * of the mapping described by @md.
 *
 * The rmb() is issued after the load and before returning, so the caller's
 * subsequent reads of the buffer are ordered after the head was sampled.
 */
static unsigned int mmap_read_head(struct mmap_data *md)
{
	struct perf_counter_mmap_page *ctrl_page = md->base;
	int snapshot = ctrl_page->data_head;

	rmb();

	return snapshot;
}

static void mmap_read(struct mmap_data *md)
{
	unsigned int head = mmap_read_head(md);
	unsigned int old = md->prev;
	unsigned char *data = md->base + page_size;
	unsigned long size;
	void *buf;
	int diff;

	gettimeofday(&this_read, NULL);

	/*
	 * If we're further behind than half the buffer, there's a chance
105
	 * the writer will bite our tail and mess up the samples under us.
106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
	 *
	 * If we somehow ended up ahead of the head, we got messed up.
	 *
	 * In either case, truncate and restart at head.
	 */
	diff = head - old;
	if (diff > md->mask / 2 || diff < 0) {
		struct timeval iv;
		unsigned long msecs;

		timersub(&this_read, &last_read, &iv);
		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;

		fprintf(stderr, "WARNING: failed to keep up with mmap data."
				"  Last read %lu msecs ago.\n", msecs);

		/*
		 * head points to a known good entry, start there.
		 */
		old = head;
	}

	last_read = this_read;

	if (old != head)
131
		samples++;
132 133 134 135 136 137 138

	size = head - old;

	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;
139

140 141
		while (size) {
			int ret = write(output, buf, size);
142 143 144 145

			if (ret < 0)
				die("failed to write");

146 147
			size -= ret;
			buf += ret;
148 149

			bytes_written += ret;
150 151 152 153 154 155
		}
	}

	buf = &data[old & md->mask];
	size = head - old;
	old += size;
156

157 158
	while (size) {
		int ret = write(output, buf, size);
159 160 161 162

		if (ret < 0)
			die("failed to write");

163 164
		size -= ret;
		buf += ret;
165 166

		bytes_written += ret;
167 168 169 170 171 172
	}

	md->prev = old;
}

static volatile int done = 0;
173
static volatile int signr = -1;
174

175
static void sig_handler(int sig)
176
{
177
	done = 1;
178 179 180 181 182 183 184 185 186 187
	signr = sig;
}

/*
 * atexit() hook: if a signal was recorded by sig_handler(), restore its
 * default disposition and send it to ourselves again.  Does nothing when
 * no signal was recorded (signr still -1).
 */
static void sig_atexit(void)
{
	if (signr != -1) {
		signal(signr, SIG_DFL);
		kill(getpid(), signr);
	}
}

190
static void pid_synthesize_comm_event(pid_t pid, int full)
191
{
192
	struct comm_event comm_ev;
193 194
	char filename[PATH_MAX];
	char bf[BUFSIZ];
195
	int fd, ret;
196
	size_t size;
197
	char *field, *sep;
198 199
	DIR *tasks;
	struct dirent dirent, *next;
200 201 202 203 204

	snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);

	fd = open(filename, O_RDONLY);
	if (fd < 0) {
205 206 207 208 209 210
		/*
		 * We raced with a task exiting - just return:
		 */
		if (verbose)
			fprintf(stderr, "couldn't open %s\n", filename);
		return;
211 212 213 214 215 216 217
	}
	if (read(fd, bf, sizeof(bf)) < 0) {
		fprintf(stderr, "couldn't read %s\n", filename);
		exit(EXIT_FAILURE);
	}
	close(fd);

218
	/* 9027 (cat) R 6747 9027 6747 34816 9027 ... */
219
	memset(&comm_ev, 0, sizeof(comm_ev));
220 221 222 223 224 225 226 227
	field = strchr(bf, '(');
	if (field == NULL)
		goto out_failure;
	sep = strchr(++field, ')');
	if (sep == NULL)
		goto out_failure;
	size = sep - field;
	memcpy(comm_ev.comm, field, size++);
228 229

	comm_ev.pid = pid;
230
	comm_ev.header.type = PERF_EVENT_COMM;
231
	size = ALIGN(size, sizeof(__u64));
232
	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
233

234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
	if (!full) {
		comm_ev.tid = pid;

		ret = write(output, &comm_ev, comm_ev.header.size);
		if (ret < 0) {
			perror("failed to write");
			exit(-1);
		}
		return;
	}

	snprintf(filename, sizeof(filename), "/proc/%d/task", pid);

	tasks = opendir(filename);
	while (!readdir_r(tasks, &dirent, &next) && next) {
		char *end;
		pid = strtol(dirent.d_name, &end, 10);
		if (*end)
			continue;

		comm_ev.tid = pid;

		ret = write(output, &comm_ev, comm_ev.header.size);
		if (ret < 0) {
			perror("failed to write");
			exit(-1);
		}
261
	}
262 263 264
	closedir(tasks);
	return;

265 266 267 268
out_failure:
	fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
		filename);
	exit(EXIT_FAILURE);
269 270
}

271
static void pid_synthesize_mmap_samples(pid_t pid)
272 273 274 275 276 277 278 279
{
	char filename[PATH_MAX];
	FILE *fp;

	snprintf(filename, sizeof(filename), "/proc/%d/maps", pid);

	fp = fopen(filename, "r");
	if (fp == NULL) {
280 281 282 283 284 285
		/*
		 * We raced with a task exiting - just return:
		 */
		if (verbose)
			fprintf(stderr, "couldn't open %s\n", filename);
		return;
286 287
	}
	while (1) {
288
		char bf[BUFSIZ], *pbf = bf;
289 290 291
		struct mmap_event mmap_ev = {
			.header.type = PERF_EVENT_MMAP,
		};
292
		int n;
293 294 295 296 297
		size_t size;
		if (fgets(bf, sizeof(bf), fp) == NULL)
			break;

		/* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
298 299 300 301 302 303 304 305 306
		n = hex2u64(pbf, &mmap_ev.start);
		if (n < 0)
			continue;
		pbf += n + 1;
		n = hex2u64(pbf, &mmap_ev.len);
		if (n < 0)
			continue;
		pbf += n + 3;
		if (*pbf == 'x') { /* vm_exec */
307 308 309 310 311 312 313 314 315
			char *execname = strrchr(bf, ' ');

			if (execname == NULL || execname[1] != '/')
				continue;

			execname += 1;
			size = strlen(execname);
			execname[size - 1] = '\0'; /* Remove \n */
			memcpy(mmap_ev.filename, execname, size);
316
			size = ALIGN(size, sizeof(__u64));
317 318 319
			mmap_ev.len -= mmap_ev.start;
			mmap_ev.header.size = (sizeof(mmap_ev) -
					       (sizeof(mmap_ev.filename) - size));
320
			mmap_ev.pid = pid;
321 322 323 324 325 326 327 328 329 330 331 332
			mmap_ev.tid = pid;

			if (write(output, &mmap_ev, mmap_ev.header.size) < 0) {
				perror("failed to write");
				exit(-1);
			}
		}
	}

	fclose(fp);
}

333
/*
 * Walk /proc and synthesize COMM and MMAP records for every task that
 * already exists (every purely numeric directory entry).
 *
 * If /proc cannot be opened the failure is reported and no records are
 * synthesized.
 */
static void synthesize_samples(void)
{
	DIR *proc;
	struct dirent *entry;

	proc = opendir("/proc");
	if (proc == NULL) {
		perror("failed to open /proc");
		return;
	}

	while ((entry = readdir(proc)) != NULL) {
		char *end;
		pid_t pid;

		pid = strtol(entry->d_name, &end, 10);
		if (*end) /* only interested in proper numerical dirents */
			continue;

		pid_synthesize_comm_event(pid, 1);
		pid_synthesize_mmap_samples(pid);
	}

	closedir(proc);
}

355 356 357
static int group_fd;

static void create_counter(int counter, int cpu, pid_t pid)
358
{
359
	struct perf_counter_attr *attr = attrs + counter;
360 361
	int track = 1;

362
	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
363

364
	if (freq) {
365
		attr->sample_type	|= PERF_SAMPLE_PERIOD;
366 367
		attr->freq		= 1;
		attr->sample_freq	= freq;
368
	}
369 370 371 372

	if (call_graph)
		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;

373 374 375
	attr->mmap		= track;
	attr->comm		= track;
	attr->inherit		= (cpu < 0) && inherit;
376
	attr->disabled		= 1;
377

378
	track = 0; /* only the first counter needs these */
379

380
try_again:
381
	fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0);
382

383 384
	if (fd[nr_cpu][counter] < 0) {
		int err = errno;
385

386
		if (err == EPERM)
387 388 389 390 391 392 393 394
			die("Permission error - are you root?\n");

		/*
		 * If it's cycles then fall back to hrtimer
		 * based cpu-clock-tick sw counter, which
		 * is always available even if no PMU support:
		 */
		if (attr->type == PERF_TYPE_HARDWARE
395
			&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {
396 397 398 399

			if (verbose)
				warning(" ... trying to fall back to cpu-clock-ticks\n");
			attr->type = PERF_TYPE_SOFTWARE;
400
			attr->config = PERF_COUNT_SW_CPU_CLOCK;
401 402
			goto try_again;
		}
403 404 405 406
		printf("\n");
		error("perfcounter syscall returned with %d (%s)\n",
			fd[nr_cpu][counter], strerror(err));
		die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
407 408
		exit(-1);
	}
409

410 411
	assert(fd[nr_cpu][counter] >= 0);
	fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
412

413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431
	/*
	 * First counter acts as the group leader:
	 */
	if (group && group_fd == -1)
		group_fd = fd[nr_cpu][counter];

	event_array[nr_poll].fd = fd[nr_cpu][counter];
	event_array[nr_poll].events = POLLIN;
	nr_poll++;

	mmap_array[nr_cpu][counter].counter = counter;
	mmap_array[nr_cpu][counter].prev = 0;
	mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
	mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
			PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
	if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
		error("failed to mmap with %d (%s)\n", errno, strerror(errno));
		exit(-1);
	}
432 433

	ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE);
434
}
435

436 437 438
static void open_counters(int cpu, pid_t pid)
{
	int counter;
439

440 441
	if (pid > 0) {
		pid_synthesize_comm_event(pid, 0);
442
		pid_synthesize_mmap_samples(pid);
443
	}
444 445 446 447 448

	group_fd = -1;
	for (counter = 0; counter < nr_counters; counter++)
		create_counter(counter, cpu, pid);

449 450 451
	nr_cpu++;
}

452
static int __cmd_record(int argc, const char **argv)
453 454
{
	int i, counter;
I
Ingo Molnar 已提交
455
	struct stat st;
456
	pid_t pid;
I
Ingo Molnar 已提交
457
	int flags;
458 459 460 461 462 463 464
	int ret;

	page_size = sysconf(_SC_PAGE_SIZE);
	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	assert(nr_cpus <= MAX_NR_CPUS);
	assert(nr_cpus >= 0);

I
Ingo Molnar 已提交
465 466
	if (!stat(output_name, &st) && !force && !append_file) {
		fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n",
467 468 469 470
				output_name);
		exit(-1);
	}

I
Ingo Molnar 已提交
471 472 473 474 475 476 477
	flags = O_CREAT|O_RDWR;
	if (append_file)
		flags |= O_APPEND;
	else
		flags |= O_TRUNC;

	output = open(output_name, flags, S_IRUSR|S_IWUSR);
478 479 480 481 482
	if (output < 0) {
		perror("failed to create output file");
		exit(-1);
	}

483
	if (!system_wide) {
484
		open_counters(-1, target_pid != -1 ? target_pid : getpid());
485 486
	} else for (i = 0; i < nr_cpus; i++)
		open_counters(i, target_pid);
487

488
	atexit(sig_atexit);
489 490
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
491

492
	if (target_pid == -1 && argc) {
493 494 495
		pid = fork();
		if (pid < 0)
			perror("failed to fork");
496

497
		if (!pid) {
498
			if (execvp(argv[0], (char **)argv)) {
499 500 501
				perror(argv[0]);
				exit(-1);
			}
502 503 504 505 506 507 508 509 510 511 512 513 514
		}
	}

	if (realtime_prio) {
		struct sched_param param;

		param.sched_priority = realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			printf("Could not set realtime priority.\n");
			exit(-1);
		}
	}

515
	if (system_wide)
516
		synthesize_samples();
517 518

	while (!done) {
519
		int hits = samples;
520

521
		for (i = 0; i < nr_cpu; i++) {
522 523 524 525
			for (counter = 0; counter < nr_counters; counter++)
				mmap_read(&mmap_array[i][counter]);
		}

526
		if (hits == samples)
527 528 529
			ret = poll(event_array, nr_poll, 100);
	}

530 531 532 533
	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
534
		"[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n",
535 536 537
		(double)bytes_written / 1024.0 / 1024.0,
		output_name,
		bytes_written / 24);
538

539 540
	return 0;
}
541 542

/*
 * NULL-terminated usage strings, passed to parse_options() and
 * usage_with_options() by cmd_record().
 */
static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};

548
static const struct option options[] = {
549
	OPT_CALLBACK('e', "event", NULL, "event",
550 551
		     "event selector. use 'perf list' to list available events",
		     parse_events),
552 553 554 555 556 557
	OPT_INTEGER('p', "pid", &target_pid,
		    "record events on existing pid"),
	OPT_INTEGER('r', "realtime", &realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN('a', "all-cpus", &system_wide,
			    "system-wide collection from all CPUs"),
I
Ingo Molnar 已提交
558 559
	OPT_BOOLEAN('A', "append", &append_file,
			    "append to the output file to do incremental profiling"),
560 561
	OPT_BOOLEAN('f', "force", &force,
			"overwrite existing data file"),
562
	OPT_LONG('c', "count", &default_interval,
I
Ingo Molnar 已提交
563 564 565 566 567
		    "event period to sample"),
	OPT_STRING('o', "output", &output_name, "file",
		    "output file name"),
	OPT_BOOLEAN('i', "inherit", &inherit,
		    "child tasks inherit counters"),
I
Ingo Molnar 已提交
568 569
	OPT_INTEGER('F', "freq", &freq,
		    "profile at this frequency"),
I
Ingo Molnar 已提交
570 571
	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
		    "number of mmap data pages"),
572 573
	OPT_BOOLEAN('g', "call-graph", &call_graph,
		    "do call-graph (stack chain/backtrace) recording"),
574 575
	OPT_BOOLEAN('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
576 577 578 579 580 581 582 583
	OPT_END()
};

int cmd_record(int argc, const char **argv, const char *prefix)
{
	int counter;

	argc = parse_options(argc, argv, options, record_usage, 0);
584
	if (!argc && target_pid == -1 && !system_wide)
585 586
		usage_with_options(record_usage, options);

587 588 589 590 591
	if (!nr_counters) {
		nr_counters	= 1;
		attrs[0].type	= PERF_TYPE_HARDWARE;
		attrs[0].config = PERF_COUNT_HW_CPU_CYCLES;
	}
592 593

	for (counter = 0; counter < nr_counters; counter++) {
594
		if (attrs[counter].sample_period)
595 596
			continue;

597
		attrs[counter].sample_period = default_interval;
598 599 600 601
	}

	return __cmd_record(argc, argv);
}