builtin-sched.c 44.0 KB
Newer Older
I
Ingo Molnar 已提交
1
#include "builtin.h"
2
#include "perf.h"
I
Ingo Molnar 已提交
3 4

#include "util/util.h"
5
#include "util/evlist.h"
I
Ingo Molnar 已提交
6
#include "util/cache.h"
7
#include "util/evsel.h"
I
Ingo Molnar 已提交
8 9 10
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
11
#include "util/session.h"
12
#include "util/tool.h"
13
#include "util/cloexec.h"
I
Ingo Molnar 已提交
14 15

#include "util/parse-options.h"
16
#include "util/trace-event.h"
I
Ingo Molnar 已提交
17 18 19

#include "util/debug.h"

20
#include <sys/prctl.h>
21
#include <sys/resource.h>
I
Ingo Molnar 已提交
22

23 24 25
#include <semaphore.h>
#include <pthread.h>
#include <math.h>
26
#include <api/fs/fs.h>
27

28 29 30 31
#define PR_SET_NAME		15               /* Set process name */
#define MAX_CPUS		4096
#define COMM_LEN		20
#define SYM_LEN			129
32
#define MAX_PID			1024000
I
Ingo Molnar 已提交
33

34
struct sched_atom;
I
Ingo Molnar 已提交
35

36 37 38 39
struct task_desc {
	unsigned long		nr;
	unsigned long		pid;
	char			comm[COMM_LEN];
I
Ingo Molnar 已提交
40

41 42
	unsigned long		nr_events;
	unsigned long		curr_event;
43
	struct sched_atom	**atoms;
44 45 46

	pthread_t		thread;
	sem_t			sleep_sem;
I
Ingo Molnar 已提交
47

48 49 50 51 52 53 54 55 56 57
	sem_t			ready_for_work;
	sem_t			work_done_sem;

	u64			cpu_usage;
};

enum sched_event_type {
	SCHED_EVENT_RUN,
	SCHED_EVENT_SLEEP,
	SCHED_EVENT_WAKEUP,
58
	SCHED_EVENT_MIGRATION,
59 60
};

61
struct sched_atom {
62
	enum sched_event_type	type;
63
	int			specific_wait;
64 65 66 67 68 69 70
	u64			timestamp;
	u64			duration;
	unsigned long		nr;
	sem_t			*wait_sem;
	struct task_desc	*wakee;
};

71
#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
72 73 74 75 76 77 78 79 80 81 82

enum thread_state {
	THREAD_SLEEPING = 0,
	THREAD_WAIT_CPU,
	THREAD_SCHED_IN,
	THREAD_IGNORE
};

struct work_atom {
	struct list_head	list;
	enum thread_state	state;
83
	u64			sched_out_time;
84 85 86 87 88
	u64			wake_up_time;
	u64			sched_in_time;
	u64			runtime;
};

89 90
struct work_atoms {
	struct list_head	work_list;
91 92 93
	struct thread		*thread;
	struct rb_node		node;
	u64			max_lat;
94
	u64			max_lat_at;
95 96 97 98 99
	u64			total_lat;
	u64			nb_atoms;
	u64			total_runtime;
};

100
typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
101

102
struct perf_sched;
103

104 105 106
struct trace_sched_handler {
	int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);
107

108 109
	int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			     struct perf_sample *sample, struct machine *machine);
110

111 112
	int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);
113

114 115 116
	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
	int (*fork_event)(struct perf_sched *sched, union perf_event *event,
			  struct machine *machine);
117 118

	int (*migrate_task_event)(struct perf_sched *sched,
119 120 121
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);
122 123 124 125 126 127
};

struct perf_sched {
	struct perf_tool tool;
	const char	 *sort_order;
	unsigned long	 nr_tasks;
128
	struct task_desc **pid_to_task;
129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
	struct task_desc **tasks;
	const struct trace_sched_handler *tp_handler;
	pthread_mutex_t	 start_work_mutex;
	pthread_mutex_t	 work_done_wait_mutex;
	int		 profile_cpu;
/*
 * Track the current task - that way we can know whether there's any
 * weird events, such as a task being switched away that is not current.
 */
	int		 max_cpu;
	u32		 curr_pid[MAX_CPUS];
	struct thread	 *curr_thread[MAX_CPUS];
	char		 next_shortname1;
	char		 next_shortname2;
	unsigned int	 replay_repeat;
	unsigned long	 nr_run_events;
	unsigned long	 nr_sleep_events;
	unsigned long	 nr_wakeup_events;
	unsigned long	 nr_sleep_corrections;
	unsigned long	 nr_run_events_optimized;
	unsigned long	 targetless_wakeups;
	unsigned long	 multitarget_wakeups;
	unsigned long	 nr_runs;
	unsigned long	 nr_timestamps;
	unsigned long	 nr_unordered_timestamps;
	unsigned long	 nr_context_switch_bugs;
	unsigned long	 nr_events;
	unsigned long	 nr_lost_chunks;
	unsigned long	 nr_lost_events;
	u64		 run_measurement_overhead;
	u64		 sleep_measurement_overhead;
	u64		 start_time;
	u64		 cpu_usage;
	u64		 runavg_cpu_usage;
	u64		 parent_cpu_usage;
	u64		 runavg_parent_cpu_usage;
	u64		 sum_runtime;
	u64		 sum_fluct;
	u64		 run_avg;
	u64		 all_runtime;
	u64		 all_count;
	u64		 cpu_last_switched[MAX_CPUS];
	struct rb_root	 atom_root, sorted_atom_root;
	struct list_head sort_list, cmp_pid;
};
174 175

static u64 get_nsecs(void)
I
Ingo Molnar 已提交
176 177 178 179 180 181 182 183
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);

	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

184
/*
 * Busy-spin until roughly @nsecs of wall-clock time has elapsed,
 * compensating for the measured overhead of get_nsecs() itself.
 */
static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
{
	u64 start = get_nsecs();
	u64 now;

	do {
		now = get_nsecs();
	} while (now + sched->run_measurement_overhead < start + nsecs);
}

193
/*
 * Sleep for @nsecs nanoseconds via nanosleep().
 *
 * Fix: the original split the value with 999999999, one less than the
 * number of nanoseconds per second.  That miscomputes tv_sec for large
 * values and can produce tv_nsec == 999999999 spuriously; POSIX defines
 * tv_nsec as [0, 999999999] with exactly 1e9 ns per second.
 */
static void sleep_nsecs(u64 nsecs)
{
	struct timespec ts;

	ts.tv_nsec = nsecs % 1000000000ULL;
	ts.tv_sec  = nsecs / 1000000000ULL;

	nanosleep(&ts, NULL);
}

203
static void calibrate_run_measurement_overhead(struct perf_sched *sched)
I
Ingo Molnar 已提交
204
{
205
	u64 T0, T1, delta, min_delta = 1000000000ULL;
I
Ingo Molnar 已提交
206 207 208 209
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
210
		burn_nsecs(sched, 0);
I
Ingo Molnar 已提交
211 212 213 214
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
215
	sched->run_measurement_overhead = min_delta;
I
Ingo Molnar 已提交
216

217
	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
I
Ingo Molnar 已提交
218 219
}

220
static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
I
Ingo Molnar 已提交
221
{
222
	u64 T0, T1, delta, min_delta = 1000000000ULL;
I
Ingo Molnar 已提交
223 224 225 226 227 228 229 230 231 232
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		sleep_nsecs(10000);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	min_delta -= 10000;
233
	sched->sleep_measurement_overhead = min_delta;
I
Ingo Molnar 已提交
234

235
	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
I
Ingo Molnar 已提交
236 237
}

238
static struct sched_atom *
239
get_new_event(struct task_desc *task, u64 timestamp)
I
Ingo Molnar 已提交
240
{
241
	struct sched_atom *event = zalloc(sizeof(*event));
I
Ingo Molnar 已提交
242 243 244 245 246 247 248
	unsigned long idx = task->nr_events;
	size_t size;

	event->timestamp = timestamp;
	event->nr = idx;

	task->nr_events++;
249 250 251
	size = sizeof(struct sched_atom *) * task->nr_events;
	task->atoms = realloc(task->atoms, size);
	BUG_ON(!task->atoms);
I
Ingo Molnar 已提交
252

253
	task->atoms[idx] = event;
I
Ingo Molnar 已提交
254 255 256 257

	return event;
}

258
static struct sched_atom *last_event(struct task_desc *task)
I
Ingo Molnar 已提交
259 260 261 262
{
	if (!task->nr_events)
		return NULL;

263
	return task->atoms[task->nr_events - 1];
I
Ingo Molnar 已提交
264 265
}

266 267
/*
 * Record @duration nsecs of runtime for @task.  Consecutive RUN atoms
 * are coalesced into one rather than allocating a new atom each time.
 */
static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
				u64 timestamp, u64 duration)
{
	struct sched_atom *tail = last_event(task);
	struct sched_atom *event;

	/* Merge into an existing trailing RUN event when possible. */
	if (tail && tail->type == SCHED_EVENT_RUN) {
		sched->nr_run_events_optimized++;
		tail->duration += duration;
		return;
	}

	event = get_new_event(task, timestamp);
	event->type = SCHED_EVENT_RUN;
	event->duration = duration;

	sched->nr_run_events++;
}

289 290
/*
 * Record that @task woke @wakee.  If @wakee's last atom is a SLEEP with
 * no semaphore attached yet, pair the two via a fresh semaphore so the
 * replay threads synchronize exactly like the traced workload did.
 */
static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
				   u64 timestamp, struct task_desc *wakee)
{
	struct sched_atom *event = get_new_event(task, timestamp);
	struct sched_atom *wakee_event = last_event(wakee);

	event->type = SCHED_EVENT_WAKEUP;
	event->wakee = wakee;

	/* Nothing is sleeping on the wakee side: nobody to hand off to. */
	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
		sched->targetless_wakeups++;
		return;
	}
	/* The sleep is already paired with an earlier waker. */
	if (wakee_event->wait_sem) {
		sched->multitarget_wakeups++;
		return;
	}

	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
	sem_init(wakee_event->wait_sem, 0, 0);
	wakee_event->specific_wait = 1;
	event->wait_sem = wakee_event->wait_sem;

	sched->nr_wakeup_events++;
}

316 317
/* Append a SLEEP atom for @task; the state argument is currently unused. */
static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
				  u64 timestamp, u64 task_state __maybe_unused)
{
	struct sched_atom *event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_SLEEP;

	sched->nr_sleep_events++;
}

326 327
static struct task_desc *register_pid(struct perf_sched *sched,
				      unsigned long pid, const char *comm)
I
Ingo Molnar 已提交
328 329
{
	struct task_desc *task;
330
	static int pid_max;
I
Ingo Molnar 已提交
331

332 333 334 335 336 337
	if (sched->pid_to_task == NULL) {
		if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
			pid_max = MAX_PID;
		BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL);
	}
	BUG_ON(pid >= (unsigned long)pid_max);
I
Ingo Molnar 已提交
338

339
	task = sched->pid_to_task[pid];
I
Ingo Molnar 已提交
340 341 342 343

	if (task)
		return task;

344
	task = zalloc(sizeof(*task));
I
Ingo Molnar 已提交
345
	task->pid = pid;
346
	task->nr = sched->nr_tasks;
I
Ingo Molnar 已提交
347 348 349 350 351
	strcpy(task->comm, comm);
	/*
	 * every task starts in sleeping state - this gets ignored
	 * if there's no wakeup pointing to this sleep state:
	 */
352
	add_sched_event_sleep(sched, task, 0, 0);
I
Ingo Molnar 已提交
353

354 355
	sched->pid_to_task[pid] = task;
	sched->nr_tasks++;
356
	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
357 358
	BUG_ON(!sched->tasks);
	sched->tasks[task->nr] = task;
I
Ingo Molnar 已提交
359

I
Ingo Molnar 已提交
360
	if (verbose)
361
		printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
I
Ingo Molnar 已提交
362 363 364 365 366

	return task;
}


367
static void print_task_traces(struct perf_sched *sched)
I
Ingo Molnar 已提交
368 369 370 371
{
	struct task_desc *task;
	unsigned long i;

372 373
	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
I
Ingo Molnar 已提交
374
		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
I
Ingo Molnar 已提交
375 376 377 378
			task->nr, task->comm, task->pid, task->nr_events);
	}
}

379
static void add_cross_task_wakeups(struct perf_sched *sched)
I
Ingo Molnar 已提交
380 381 382 383
{
	struct task_desc *task1, *task2;
	unsigned long i, j;

384 385
	for (i = 0; i < sched->nr_tasks; i++) {
		task1 = sched->tasks[i];
I
Ingo Molnar 已提交
386
		j = i + 1;
387
		if (j == sched->nr_tasks)
I
Ingo Molnar 已提交
388
			j = 0;
389 390
		task2 = sched->tasks[j];
		add_sched_event_wakeup(sched, task1, 0, task2);
I
Ingo Molnar 已提交
391 392 393
	}
}

394 395
static void perf_sched__process_event(struct perf_sched *sched,
				      struct sched_atom *atom)
I
Ingo Molnar 已提交
396 397 398
{
	int ret = 0;

399
	switch (atom->type) {
I
Ingo Molnar 已提交
400
		case SCHED_EVENT_RUN:
401
			burn_nsecs(sched, atom->duration);
I
Ingo Molnar 已提交
402 403
			break;
		case SCHED_EVENT_SLEEP:
404 405
			if (atom->wait_sem)
				ret = sem_wait(atom->wait_sem);
I
Ingo Molnar 已提交
406 407 408
			BUG_ON(ret);
			break;
		case SCHED_EVENT_WAKEUP:
409 410
			if (atom->wait_sem)
				ret = sem_post(atom->wait_sem);
I
Ingo Molnar 已提交
411 412
			BUG_ON(ret);
			break;
413 414
		case SCHED_EVENT_MIGRATION:
			break;
I
Ingo Molnar 已提交
415 416 417 418 419
		default:
			BUG_ON(1);
	}
}

420
/*
 * Return this process's total user+system CPU time in nanoseconds,
 * from getrusage(RUSAGE_SELF) (which reports seconds + microseconds).
 *
 * Fix: the original multiplied by the double constants 1e9/1e3, forcing
 * the sums through floating point and silently rounding once the value
 * exceeds 2^53 ns; pure integer arithmetic is exact.
 */
static u64 get_cpu_usage_nsec_parent(void)
{
	struct rusage ru;
	u64 sum;
	int err;

	err = getrusage(RUSAGE_SELF, &ru);
	BUG_ON(err);

	sum  = ru.ru_utime.tv_sec * 1000000000ULL + ru.ru_utime.tv_usec * 1000ULL;
	sum += ru.ru_stime.tv_sec * 1000000000ULL + ru.ru_stime.tv_usec * 1000ULL;

	return sum;
}

435
static int self_open_counters(void)
I
Ingo Molnar 已提交
436
{
437
	struct perf_event_attr attr;
438
	char sbuf[STRERR_BUFSIZE];
439
	int fd;
I
Ingo Molnar 已提交
440

441
	memset(&attr, 0, sizeof(attr));
I
Ingo Molnar 已提交
442

443 444
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
I
Ingo Molnar 已提交
445

446 447
	fd = sys_perf_event_open(&attr, 0, -1, -1,
				 perf_event_open_cloexec_flag());
448 449

	if (fd < 0)
450
		pr_err("Error: sys_perf_event_open() syscall returned "
451 452
		       "with %d (%s)\n", fd,
		       strerror_r(errno, sbuf, sizeof(sbuf)));
453 454 455 456 457 458 459 460 461 462 463 464
	return fd;
}

/* Read the thread's task-clock counter (nanoseconds) from @fd. */
static u64 get_cpu_usage_nsec_self(int fd)
{
	u64 runtime;
	ssize_t ret = read(fd, &runtime, sizeof(runtime));

	BUG_ON(ret != sizeof(runtime));

	return runtime;
}

467 468 469 470 471
struct sched_thread_parms {
	struct task_desc  *task;
	struct perf_sched *sched;
};

I
Ingo Molnar 已提交
472 473
static void *thread_func(void *ctx)
{
474 475 476
	struct sched_thread_parms *parms = ctx;
	struct task_desc *this_task = parms->task;
	struct perf_sched *sched = parms->sched;
477
	u64 cpu_usage_0, cpu_usage_1;
I
Ingo Molnar 已提交
478 479
	unsigned long i, ret;
	char comm2[22];
480
	int fd;
I
Ingo Molnar 已提交
481

482
	zfree(&parms);
483

I
Ingo Molnar 已提交
484 485
	sprintf(comm2, ":%s", this_task->comm);
	prctl(PR_SET_NAME, comm2);
486
	fd = self_open_counters();
487 488
	if (fd < 0)
		return NULL;
I
Ingo Molnar 已提交
489 490 491
again:
	ret = sem_post(&this_task->ready_for_work);
	BUG_ON(ret);
492
	ret = pthread_mutex_lock(&sched->start_work_mutex);
I
Ingo Molnar 已提交
493
	BUG_ON(ret);
494
	ret = pthread_mutex_unlock(&sched->start_work_mutex);
I
Ingo Molnar 已提交
495 496
	BUG_ON(ret);

497
	cpu_usage_0 = get_cpu_usage_nsec_self(fd);
I
Ingo Molnar 已提交
498 499 500

	for (i = 0; i < this_task->nr_events; i++) {
		this_task->curr_event = i;
501
		perf_sched__process_event(sched, this_task->atoms[i]);
I
Ingo Molnar 已提交
502 503
	}

504
	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
I
Ingo Molnar 已提交
505 506 507 508
	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
	ret = sem_post(&this_task->work_done_sem);
	BUG_ON(ret);

509
	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
I
Ingo Molnar 已提交
510
	BUG_ON(ret);
511
	ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
I
Ingo Molnar 已提交
512 513 514 515 516
	BUG_ON(ret);

	goto again;
}

517
static void create_tasks(struct perf_sched *sched)
I
Ingo Molnar 已提交
518 519 520 521 522 523 524 525
{
	struct task_desc *task;
	pthread_attr_t attr;
	unsigned long i;
	int err;

	err = pthread_attr_init(&attr);
	BUG_ON(err);
526 527
	err = pthread_attr_setstacksize(&attr,
			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
I
Ingo Molnar 已提交
528
	BUG_ON(err);
529
	err = pthread_mutex_lock(&sched->start_work_mutex);
I
Ingo Molnar 已提交
530
	BUG_ON(err);
531
	err = pthread_mutex_lock(&sched->work_done_wait_mutex);
I
Ingo Molnar 已提交
532
	BUG_ON(err);
533 534 535 536 537
	for (i = 0; i < sched->nr_tasks; i++) {
		struct sched_thread_parms *parms = malloc(sizeof(*parms));
		BUG_ON(parms == NULL);
		parms->task = task = sched->tasks[i];
		parms->sched = sched;
I
Ingo Molnar 已提交
538 539 540 541
		sem_init(&task->sleep_sem, 0, 0);
		sem_init(&task->ready_for_work, 0, 0);
		sem_init(&task->work_done_sem, 0, 0);
		task->curr_event = 0;
542
		err = pthread_create(&task->thread, &attr, thread_func, parms);
I
Ingo Molnar 已提交
543 544 545 546
		BUG_ON(err);
	}
}

547
static void wait_for_tasks(struct perf_sched *sched)
I
Ingo Molnar 已提交
548
{
549
	u64 cpu_usage_0, cpu_usage_1;
I
Ingo Molnar 已提交
550 551 552
	struct task_desc *task;
	unsigned long i, ret;

553 554 555
	sched->start_time = get_nsecs();
	sched->cpu_usage = 0;
	pthread_mutex_unlock(&sched->work_done_wait_mutex);
I
Ingo Molnar 已提交
556

557 558
	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
I
Ingo Molnar 已提交
559 560 561 562
		ret = sem_wait(&task->ready_for_work);
		BUG_ON(ret);
		sem_init(&task->ready_for_work, 0, 0);
	}
563
	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
I
Ingo Molnar 已提交
564 565 566 567
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_parent();

568
	pthread_mutex_unlock(&sched->start_work_mutex);
I
Ingo Molnar 已提交
569

570 571
	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
I
Ingo Molnar 已提交
572 573 574
		ret = sem_wait(&task->work_done_sem);
		BUG_ON(ret);
		sem_init(&task->work_done_sem, 0, 0);
575
		sched->cpu_usage += task->cpu_usage;
I
Ingo Molnar 已提交
576 577 578 579
		task->cpu_usage = 0;
	}

	cpu_usage_1 = get_cpu_usage_nsec_parent();
580 581 582
	if (!sched->runavg_cpu_usage)
		sched->runavg_cpu_usage = sched->cpu_usage;
	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * 9 + sched->cpu_usage) / 10;
I
Ingo Molnar 已提交
583

584 585 586 587 588
	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
	if (!sched->runavg_parent_cpu_usage)
		sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * 9 +
					 sched->parent_cpu_usage)/10;
I
Ingo Molnar 已提交
589

590
	ret = pthread_mutex_lock(&sched->start_work_mutex);
I
Ingo Molnar 已提交
591 592
	BUG_ON(ret);

593 594
	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
I
Ingo Molnar 已提交
595 596 597 598 599
		sem_init(&task->sleep_sem, 0, 0);
		task->curr_event = 0;
	}
}

600
static void run_one_test(struct perf_sched *sched)
I
Ingo Molnar 已提交
601
{
K
Kyle McMartin 已提交
602
	u64 T0, T1, delta, avg_delta, fluct;
I
Ingo Molnar 已提交
603 604

	T0 = get_nsecs();
605
	wait_for_tasks(sched);
I
Ingo Molnar 已提交
606 607 608
	T1 = get_nsecs();

	delta = T1 - T0;
609 610
	sched->sum_runtime += delta;
	sched->nr_runs++;
I
Ingo Molnar 已提交
611

612
	avg_delta = sched->sum_runtime / sched->nr_runs;
I
Ingo Molnar 已提交
613 614 615 616
	if (delta < avg_delta)
		fluct = avg_delta - delta;
	else
		fluct = delta - avg_delta;
617 618 619 620
	sched->sum_fluct += fluct;
	if (!sched->run_avg)
		sched->run_avg = delta;
	sched->run_avg = (sched->run_avg * 9 + delta) / 10;
I
Ingo Molnar 已提交
621

622
	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);
I
Ingo Molnar 已提交
623

624
	printf("ravg: %0.2f, ", (double)sched->run_avg / 1e6);
I
Ingo Molnar 已提交
625

I
Ingo Molnar 已提交
626
	printf("cpu: %0.2f / %0.2f",
627
		(double)sched->cpu_usage / 1e6, (double)sched->runavg_cpu_usage / 1e6);
I
Ingo Molnar 已提交
628 629 630

#if 0
	/*
631
	 * rusage statistics done by the parent, these are less
632
	 * accurate than the sched->sum_exec_runtime based statistics:
633
	 */
I
Ingo Molnar 已提交
634
	printf(" [%0.2f / %0.2f]",
635 636
		(double)sched->parent_cpu_usage/1e6,
		(double)sched->runavg_parent_cpu_usage/1e6);
I
Ingo Molnar 已提交
637 638
#endif

I
Ingo Molnar 已提交
639
	printf("\n");
I
Ingo Molnar 已提交
640

641 642 643
	if (sched->nr_sleep_corrections)
		printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
	sched->nr_sleep_corrections = 0;
I
Ingo Molnar 已提交
644 645
}

646
static void test_calibrations(struct perf_sched *sched)
I
Ingo Molnar 已提交
647
{
648
	u64 T0, T1;
I
Ingo Molnar 已提交
649 650

	T0 = get_nsecs();
651
	burn_nsecs(sched, 1e6);
I
Ingo Molnar 已提交
652 653
	T1 = get_nsecs();

654
	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
I
Ingo Molnar 已提交
655 656 657 658 659

	T0 = get_nsecs();
	sleep_nsecs(1e6);
	T1 = get_nsecs();

660
	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
I
Ingo Molnar 已提交
661 662
}

663
static int
664
replay_wakeup_event(struct perf_sched *sched,
665 666
		    struct perf_evsel *evsel, struct perf_sample *sample,
		    struct machine *machine __maybe_unused)
667
{
668 669
	const char *comm = perf_evsel__strval(evsel, sample, "comm");
	const u32 pid	 = perf_evsel__intval(evsel, sample, "pid");
670
	struct task_desc *waker, *wakee;
671

I
Ingo Molnar 已提交
672
	if (verbose) {
673
		printf("sched_wakeup event %p\n", evsel);
674

675
		printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
I
Ingo Molnar 已提交
676
	}
677

678
	waker = register_pid(sched, sample->tid, "<unknown>");
679
	wakee = register_pid(sched, pid, comm);
680

681
	add_sched_event_wakeup(sched, waker, sample->time, wakee);
682
	return 0;
I
Ingo Molnar 已提交
683 684
}

685 686 687 688
/*
 * Replay-mode sched_switch handler: compute how long @prev ran on this
 * CPU since the last switch and record a RUN atom followed by a SLEEP
 * atom for it.  Returns -1 on a negative (unordered) delta.
 *
 * Fix: @delta is s64, but was printed with PRIu64/%u semantics; a
 * mismatched conversion specifier is undefined behavior per C11 fprintf
 * and prints a huge bogus value for the exact case being reported.
 */
static int replay_switch_event(struct perf_sched *sched,
			       struct perf_evsel *evsel,
			       struct perf_sample *sample,
			       struct machine *machine __maybe_unused)
{
	const char *prev_comm  = perf_evsel__strval(evsel, sample, "prev_comm"),
		   *next_comm  = perf_evsel__strval(evsel, sample, "next_comm");
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
	struct task_desc *prev, __maybe_unused *next;
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu;
	s64 delta;

	if (verbose)
		printf("sched_switch event %p\n", evsel);

	if (cpu >= MAX_CPUS || cpu < 0)
		return 0;

	timestamp0 = sched->cpu_last_switched[cpu];
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0) {
		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
		return -1;
	}

	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRId64 " nsecs]\n",
		 prev_comm, prev_pid, next_comm, next_pid, delta);

	prev = register_pid(sched, prev_pid, prev_comm);
	next = register_pid(sched, next_pid, next_comm);

	sched->cpu_last_switched[cpu] = timestamp;

	add_sched_event_run(sched, prev, timestamp, delta);
	add_sched_event_sleep(sched, prev, timestamp, prev_state);

	return 0;
}

731 732 733
static int replay_fork_event(struct perf_sched *sched,
			     union perf_event *event,
			     struct machine *machine)
734
{
735 736
	struct thread *child, *parent;

737 738 739 740
	child = machine__findnew_thread(machine, event->fork.pid,
					event->fork.tid);
	parent = machine__findnew_thread(machine, event->fork.ppid,
					 event->fork.ptid);
741 742 743 744 745 746

	if (child == NULL || parent == NULL) {
		pr_debug("thread does not exist on fork event: child %p, parent %p\n",
				 child, parent);
		return 0;
	}
747

748
	if (verbose) {
749
		printf("fork event\n");
750 751
		printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
		printf("...  child: %s/%d\n", thread__comm_str(child), child->tid);
752
	}
753

754 755
	register_pid(sched, parent->tid, thread__comm_str(parent));
	register_pid(sched, child->tid, thread__comm_str(child));
756
	return 0;
757
}
758

759 760
struct sort_dimension {
	const char		*name;
761
	sort_fn_t		cmp;
762 763 764
	struct list_head	list;
};

765
static int
766
thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
767 768 769 770
{
	struct sort_dimension *sort;
	int ret = 0;

771 772
	BUG_ON(list_empty(list));

773 774 775 776 777 778 779 780 781
	list_for_each_entry(sort, list, list) {
		ret = sort->cmp(l, r);
		if (ret)
			return ret;
	}

	return ret;
}

782
static struct work_atoms *
783 784 785 786
thread_atoms_search(struct rb_root *root, struct thread *thread,
			 struct list_head *sort_list)
{
	struct rb_node *node = root->rb_node;
787
	struct work_atoms key = { .thread = thread };
788 789

	while (node) {
790
		struct work_atoms *atoms;
791 792
		int cmp;

793
		atoms = container_of(node, struct work_atoms, node);
794 795 796 797 798 799 800 801 802 803 804 805 806 807

		cmp = thread_lat_cmp(sort_list, &key, atoms);
		if (cmp > 0)
			node = node->rb_left;
		else if (cmp < 0)
			node = node->rb_right;
		else {
			BUG_ON(thread != atoms->thread);
			return atoms;
		}
	}
	return NULL;
}

808
static void
809
__thread_latency_insert(struct rb_root *root, struct work_atoms *data,
810
			 struct list_head *sort_list)
811 812 813 814
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;

	while (*new) {
815
		struct work_atoms *this;
816
		int cmp;
817

818
		this = container_of(*new, struct work_atoms, node);
819
		parent = *new;
820 821 822 823

		cmp = thread_lat_cmp(sort_list, data, this);

		if (cmp > 0)
824 825
			new = &((*new)->rb_left);
		else
826
			new = &((*new)->rb_right);
827 828 829 830 831 832
	}

	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
}

833
static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
834
{
835
	struct work_atoms *atoms = zalloc(sizeof(*atoms));
836 837 838 839
	if (!atoms) {
		pr_err("No memory at %s\n", __func__);
		return -1;
	}
840

841
	atoms->thread = thread__get(thread);
842
	INIT_LIST_HEAD(&atoms->work_list);
843
	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
844
	return 0;
845 846
}

847
/*
 * Map a raw prev_state value from the trace to its one-letter task
 * state (R, S, D, ...).
 *
 * Fix: prev_state comes straight from trace data and can exceed the
 * states this table knows about; the original indexed the string
 * unchecked, reading out of bounds (undefined behavior).  Unknown
 * states now map to '?' (never 'R', so they are not treated as
 * runnable by add_sched_out_event()).
 */
static char sched_out_state(u64 prev_state)
{
	const char *str = TASK_STATE_TO_CHAR_STR;

	if (prev_state >= sizeof(TASK_STATE_TO_CHAR_STR) - 1)
		return '?';

	return str[prev_state];
}

854
static int
855 856 857
add_sched_out_event(struct work_atoms *atoms,
		    char run_state,
		    u64 timestamp)
858
{
859
	struct work_atom *atom = zalloc(sizeof(*atom));
860 861 862 863
	if (!atom) {
		pr_err("Non memory at %s", __func__);
		return -1;
	}
864

865 866
	atom->sched_out_time = timestamp;

867
	if (run_state == 'R') {
868
		atom->state = THREAD_WAIT_CPU;
869
		atom->wake_up_time = atom->sched_out_time;
870 871
	}

872
	list_add_tail(&atom->list, &atoms->work_list);
873
	return 0;
874 875 876
}

static void
877 878
add_runtime_event(struct work_atoms *atoms, u64 delta,
		  u64 timestamp __maybe_unused)
879 880 881 882 883 884 885 886 887 888 889 890 891
{
	struct work_atom *atom;

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	atom->runtime += delta;
	atoms->total_runtime += delta;
}

/*
 * Close the latest atom at schedule-in time: compute the wakeup-to-run
 * latency and fold it into the per-thread totals and maximum.  Atoms
 * not waiting for a CPU, or with an out-of-order wakeup timestamp, are
 * skipped (the latter marked THREAD_IGNORE).
 */
static void
add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
{
	struct work_atom *atom;
	u64 delta;

	if (list_empty(&atoms->work_list))
		return;

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	if (atom->state != THREAD_WAIT_CPU)
		return;

	if (timestamp < atom->wake_up_time) {
		atom->state = THREAD_IGNORE;
		return;
	}

	atom->state = THREAD_SCHED_IN;
	atom->sched_in_time = timestamp;

	delta = atom->sched_in_time - atom->wake_up_time;
	atoms->total_lat += delta;
	if (delta > atoms->max_lat) {
		atoms->max_lat = delta;
		atoms->max_lat_at = timestamp;
	}
	atoms->nb_atoms++;
}

921 922 923 924
static int latency_switch_event(struct perf_sched *sched,
				struct perf_evsel *evsel,
				struct perf_sample *sample,
				struct machine *machine)
925
{
926 927 928
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
929
	struct work_atoms *out_events, *in_events;
930
	struct thread *sched_out, *sched_in;
931 932
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu;
I
Ingo Molnar 已提交
933 934
	s64 delta;

935
	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
I
Ingo Molnar 已提交
936

937 938
	timestamp0 = sched->cpu_last_switched[cpu];
	sched->cpu_last_switched[cpu] = timestamp;
I
Ingo Molnar 已提交
939 940 941 942 943
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

944 945 946 947
	if (delta < 0) {
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
		return -1;
	}
948

949 950
	sched_out = machine__findnew_thread(machine, -1, prev_pid);
	sched_in = machine__findnew_thread(machine, -1, next_pid);
951

952
	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
953
	if (!out_events) {
954
		if (thread_atoms_insert(sched, sched_out))
955
			return -1;
956
		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
957 958 959 960
		if (!out_events) {
			pr_err("out-event: Internal tree error");
			return -1;
		}
961
	}
962
	if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
963
		return -1;
964

965
	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
966
	if (!in_events) {
967
		if (thread_atoms_insert(sched, sched_in))
968
			return -1;
969
		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
970 971 972 973
		if (!in_events) {
			pr_err("in-event: Internal tree error");
			return -1;
		}
974 975 976 977
		/*
		 * Take came in we have not heard about yet,
		 * add in an initial atom in runnable state:
		 */
978 979
		if (add_sched_out_event(in_events, 'R', timestamp))
			return -1;
980
	}
981
	add_sched_in_event(in_events, timestamp);
982 983

	return 0;
984
}
985

986 987 988 989
static int latency_runtime_event(struct perf_sched *sched,
				 struct perf_evsel *evsel,
				 struct perf_sample *sample,
				 struct machine *machine)
990
{
991 992
	const u32 pid	   = perf_evsel__intval(evsel, sample, "pid");
	const u64 runtime  = perf_evsel__intval(evsel, sample, "runtime");
993
	struct thread *thread = machine__findnew_thread(machine, -1, pid);
994
	struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
995 996
	u64 timestamp = sample->time;
	int cpu = sample->cpu;
997 998 999

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
	if (!atoms) {
1000
		if (thread_atoms_insert(sched, thread))
1001
			return -1;
1002
		atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
1003
		if (!atoms) {
1004
			pr_err("in-event: Internal tree error");
1005 1006 1007 1008
			return -1;
		}
		if (add_sched_out_event(atoms, 'R', timestamp))
			return -1;
1009 1010
	}

1011
	add_runtime_event(atoms, runtime, timestamp);
1012
	return 0;
1013 1014
}

1015 1016 1017 1018
static int latency_wakeup_event(struct perf_sched *sched,
				struct perf_evsel *evsel,
				struct perf_sample *sample,
				struct machine *machine)
1019
{
1020
	const u32 pid	  = perf_evsel__intval(evsel, sample, "pid");
1021
	struct work_atoms *atoms;
1022
	struct work_atom *atom;
1023
	struct thread *wakee;
1024
	u64 timestamp = sample->time;
1025

1026
	wakee = machine__findnew_thread(machine, -1, pid);
1027
	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1028
	if (!atoms) {
1029
		if (thread_atoms_insert(sched, wakee))
1030
			return -1;
1031
		atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1032
		if (!atoms) {
1033
			pr_err("wakeup-event: Internal tree error");
1034 1035 1036 1037
			return -1;
		}
		if (add_sched_out_event(atoms, 'S', timestamp))
			return -1;
1038 1039
	}

1040
	BUG_ON(list_empty(&atoms->work_list));
1041

1042
	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1043

1044
	/*
1045 1046 1047 1048 1049 1050
	 * As we do not guarantee the wakeup event happens when
	 * task is out of run queue, also may happen when task is
	 * on run queue and wakeup only change ->state to TASK_RUNNING,
	 * then we should not set the ->wake_up_time when wake up a
	 * task which is on run queue.
	 *
1051 1052
	 * You WILL be missing events if you've recorded only
	 * one CPU, or are only looking at only one, so don't
1053
	 * skip in this case.
1054
	 */
1055
	if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
1056
		return 0;
1057

1058
	sched->nr_timestamps++;
1059
	if (atom->sched_out_time > timestamp) {
1060
		sched->nr_unordered_timestamps++;
1061
		return 0;
1062
	}
1063

1064 1065
	atom->state = THREAD_WAIT_CPU;
	atom->wake_up_time = timestamp;
1066
	return 0;
1067 1068
}

1069 1070 1071 1072
static int latency_migrate_task_event(struct perf_sched *sched,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
1073
{
1074
	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
1075
	u64 timestamp = sample->time;
1076 1077 1078 1079 1080 1081 1082
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *migrant;

	/*
	 * Only need to worry about migration when profiling one CPU.
	 */
1083
	if (sched->profile_cpu == -1)
1084
		return 0;
1085

1086
	migrant = machine__findnew_thread(machine, -1, pid);
1087
	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1088
	if (!atoms) {
1089
		if (thread_atoms_insert(sched, migrant))
1090
			return -1;
1091
		register_pid(sched, migrant->tid, thread__comm_str(migrant));
1092
		atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1093
		if (!atoms) {
1094
			pr_err("migration-event: Internal tree error");
1095 1096 1097 1098
			return -1;
		}
		if (add_sched_out_event(atoms, 'R', timestamp))
			return -1;
1099 1100 1101 1102 1103 1104 1105
	}

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;

1106
	sched->nr_timestamps++;
1107 1108

	if (atom->sched_out_time > timestamp)
1109
		sched->nr_unordered_timestamps++;
1110 1111

	return 0;
1112 1113
}

1114
static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
1115 1116 1117
{
	int i;
	int ret;
1118
	u64 avg;
1119

1120
	if (!work_list->nb_atoms)
1121
		return;
1122 1123 1124
	/*
	 * Ignore idle threads:
	 */
1125
	if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
1126
		return;
1127

1128 1129
	sched->all_runtime += work_list->total_runtime;
	sched->all_count   += work_list->nb_atoms;
1130

1131
	ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
1132

M
mingo 已提交
1133
	for (i = 0; i < 24 - ret; i++)
1134 1135
		printf(" ");

1136
	avg = work_list->total_lat / work_list->nb_atoms;
1137

1138
	printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %13.6f s\n",
1139 1140
	      (double)work_list->total_runtime / 1e6,
		 work_list->nb_atoms, (double)avg / 1e6,
1141 1142
		 (double)work_list->max_lat / 1e6,
		 (double)work_list->max_lat_at / 1e9);
1143 1144
}

1145
static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
1146
{
1147
	if (l->thread->tid < r->thread->tid)
1148
		return -1;
1149
	if (l->thread->tid > r->thread->tid)
1150 1151 1152 1153 1154
		return 1;

	return 0;
}

1155
static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175
{
	u64 avgl, avgr;

	if (!l->nb_atoms)
		return -1;

	if (!r->nb_atoms)
		return 1;

	avgl = l->total_lat / l->nb_atoms;
	avgr = r->total_lat / r->nb_atoms;

	if (avgl < avgr)
		return -1;
	if (avgl > avgr)
		return 1;

	return 0;
}

1176
static int max_cmp(struct work_atoms *l, struct work_atoms *r)
1177 1178 1179 1180 1181 1182 1183 1184 1185
{
	if (l->max_lat < r->max_lat)
		return -1;
	if (l->max_lat > r->max_lat)
		return 1;

	return 0;
}

1186
static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
1187 1188 1189 1190 1191 1192 1193 1194 1195
{
	if (l->nb_atoms < r->nb_atoms)
		return -1;
	if (l->nb_atoms > r->nb_atoms)
		return 1;

	return 0;
}

1196
static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
1197 1198 1199 1200 1201 1202 1203 1204 1205
{
	if (l->total_runtime < r->total_runtime)
		return -1;
	if (l->total_runtime > r->total_runtime)
		return 1;

	return 0;
}

1206
static int sort_dimension__add(const char *tok, struct list_head *list)
1207
{
1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235
	size_t i;
	static struct sort_dimension avg_sort_dimension = {
		.name = "avg",
		.cmp  = avg_cmp,
	};
	static struct sort_dimension max_sort_dimension = {
		.name = "max",
		.cmp  = max_cmp,
	};
	static struct sort_dimension pid_sort_dimension = {
		.name = "pid",
		.cmp  = pid_cmp,
	};
	static struct sort_dimension runtime_sort_dimension = {
		.name = "runtime",
		.cmp  = runtime_cmp,
	};
	static struct sort_dimension switch_sort_dimension = {
		.name = "switch",
		.cmp  = switch_cmp,
	};
	struct sort_dimension *available_sorts[] = {
		&pid_sort_dimension,
		&avg_sort_dimension,
		&max_sort_dimension,
		&switch_sort_dimension,
		&runtime_sort_dimension,
	};
1236

1237
	for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
1238 1239 1240 1241 1242 1243 1244 1245 1246 1247
		if (!strcmp(available_sorts[i]->name, tok)) {
			list_add_tail(&available_sorts[i]->list, list);

			return 0;
		}
	}

	return -1;
}

1248
static void perf_sched__sort_lat(struct perf_sched *sched)
1249 1250 1251 1252
{
	struct rb_node *node;

	for (;;) {
1253
		struct work_atoms *data;
1254
		node = rb_first(&sched->atom_root);
1255 1256 1257
		if (!node)
			break;

1258
		rb_erase(node, &sched->atom_root);
1259
		data = rb_entry(node, struct work_atoms, node);
1260
		__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
1261 1262 1263
	}
}

1264
static int process_sched_wakeup_event(struct perf_tool *tool,
1265
				      struct perf_evsel *evsel,
1266
				      struct perf_sample *sample,
1267
				      struct machine *machine)
1268
{
1269
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1270

1271 1272
	if (sched->tp_handler->wakeup_event)
		return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
1273

1274
	return 0;
1275 1276
}

1277 1278
static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine)
1279
{
1280 1281
	const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	struct thread *sched_in;
1282
	int new_shortname;
1283
	u64 timestamp0, timestamp = sample->time;
1284
	s64 delta;
1285
	int cpu, this_cpu = sample->cpu;
1286 1287 1288

	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);

1289 1290
	if (this_cpu > sched->max_cpu)
		sched->max_cpu = this_cpu;
1291

1292 1293
	timestamp0 = sched->cpu_last_switched[this_cpu];
	sched->cpu_last_switched[this_cpu] = timestamp;
1294 1295 1296 1297 1298
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

1299
	if (delta < 0) {
1300
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1301 1302
		return -1;
	}
1303

1304
	sched_in = machine__findnew_thread(machine, -1, next_pid);
1305

1306
	sched->curr_thread[this_cpu] = sched_in;
1307 1308 1309 1310 1311

	printf("  ");

	new_shortname = 0;
	if (!sched_in->shortname[0]) {
1312 1313 1314 1315 1316 1317 1318
		if (!strcmp(thread__comm_str(sched_in), "swapper")) {
			/*
			 * Don't allocate a letter-number for swapper:0
			 * as a shortname. Instead, we use '.' for it.
			 */
			sched_in->shortname[0] = '.';
			sched_in->shortname[1] = ' ';
1319
		} else {
1320 1321 1322 1323 1324
			sched_in->shortname[0] = sched->next_shortname1;
			sched_in->shortname[1] = sched->next_shortname2;

			if (sched->next_shortname1 < 'Z') {
				sched->next_shortname1++;
1325
			} else {
1326 1327 1328 1329 1330
				sched->next_shortname1 = 'A';
				if (sched->next_shortname2 < '9')
					sched->next_shortname2++;
				else
					sched->next_shortname2 = '0';
1331 1332 1333 1334 1335
			}
		}
		new_shortname = 1;
	}

1336
	for (cpu = 0; cpu <= sched->max_cpu; cpu++) {
1337 1338 1339 1340 1341
		if (cpu != this_cpu)
			printf(" ");
		else
			printf("*");

1342 1343 1344
		if (sched->curr_thread[cpu])
			printf("%2s ", sched->curr_thread[cpu]->shortname);
		else
1345 1346 1347 1348 1349 1350
			printf("   ");
	}

	printf("  %12.6f secs ", (double)timestamp/1e9);
	if (new_shortname) {
		printf("%s => %s:%d\n",
1351
		       sched_in->shortname, thread__comm_str(sched_in), sched_in->tid);
1352 1353 1354
	} else {
		printf("\n");
	}
1355 1356

	return 0;
1357 1358
}

1359
static int process_sched_switch_event(struct perf_tool *tool,
1360
				      struct perf_evsel *evsel,
1361
				      struct perf_sample *sample,
1362
				      struct machine *machine)
1363
{
1364
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1365
	int this_cpu = sample->cpu, err = 0;
1366 1367
	u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
	    next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1368

1369
	if (sched->curr_pid[this_cpu] != (u32)-1) {
1370 1371 1372 1373
		/*
		 * Are we trying to switch away a PID that is
		 * not current?
		 */
1374
		if (sched->curr_pid[this_cpu] != prev_pid)
1375
			sched->nr_context_switch_bugs++;
1376 1377
	}

1378 1379
	if (sched->tp_handler->switch_event)
		err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
1380 1381

	sched->curr_pid[this_cpu] = next_pid;
1382
	return err;
1383 1384
}

1385
static int process_sched_runtime_event(struct perf_tool *tool,
1386
				       struct perf_evsel *evsel,
1387
				       struct perf_sample *sample,
1388
				       struct machine *machine)
1389
{
1390
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1391

1392 1393
	if (sched->tp_handler->runtime_event)
		return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
1394

1395
	return 0;
1396 1397
}

1398 1399 1400 1401
static int perf_sched__process_fork_event(struct perf_tool *tool,
					  union perf_event *event,
					  struct perf_sample *sample,
					  struct machine *machine)
1402
{
1403
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1404

1405 1406 1407 1408
	/* run the fork event through the perf machineruy */
	perf_event__process_fork(tool, event, sample, machine);

	/* and then run additional processing needed for this command */
1409
	if (sched->tp_handler->fork_event)
1410
		return sched->tp_handler->fork_event(sched, event, machine);
1411

1412
	return 0;
1413 1414
}

1415
static int process_sched_migrate_task_event(struct perf_tool *tool,
1416
					    struct perf_evsel *evsel,
1417
					    struct perf_sample *sample,
1418
					    struct machine *machine)
1419
{
1420
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1421

1422 1423
	if (sched->tp_handler->migrate_task_event)
		return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
1424

1425
	return 0;
1426 1427
}

1428
/*
 * Signature shared by all per-tracepoint dispatch callbacks installed via
 * perf_session__set_tracepoints_handlers() and invoked from
 * perf_sched__process_tracepoint_sample() through evsel->handler.
 */
typedef int (*tracepoint_handler)(struct perf_tool *tool,
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);
I
Ingo Molnar 已提交
1432

1433 1434
/*
 * Generic sample callback: dispatch a tracepoint sample to the per-event
 * handler stashed in evsel->handler (if any) and propagate its result.
 */
static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
						 union perf_event *event __maybe_unused,
						 struct perf_sample *sample,
						 struct perf_evsel *evsel,
						 struct machine *machine)
{
	tracepoint_handler f = evsel->handler;

	if (f == NULL)
		return 0;

	return f(tool, evsel, sample, machine);
}

1449
static int perf_sched__read_events(struct perf_sched *sched)
I
Ingo Molnar 已提交
1450
{
1451 1452 1453 1454 1455 1456 1457
	const struct perf_evsel_str_handler handlers[] = {
		{ "sched:sched_switch",	      process_sched_switch_event, },
		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
		{ "sched:sched_wakeup",	      process_sched_wakeup_event, },
		{ "sched:sched_wakeup_new",   process_sched_wakeup_event, },
		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
	};
1458
	struct perf_session *session;
1459 1460 1461 1462
	struct perf_data_file file = {
		.path = input_name,
		.mode = PERF_DATA_MODE_READ,
	};
1463
	int rc = -1;
1464

1465
	session = perf_session__new(&file, false, &sched->tool);
1466 1467 1468 1469
	if (session == NULL) {
		pr_debug("No Memory for session\n");
		return -1;
	}
1470

1471
	symbol__init(&session->header.env);
1472

1473 1474
	if (perf_session__set_tracepoints_handlers(session, handlers))
		goto out_delete;
1475

1476
	if (perf_session__has_traces(session, "record -R")) {
1477
		int err = perf_session__process_events(session);
1478 1479 1480 1481
		if (err) {
			pr_err("Failed to process events, error %d", err);
			goto out_delete;
		}
1482

1483 1484 1485
		sched->nr_events      = session->evlist->stats.nr_events[0];
		sched->nr_lost_events = session->evlist->stats.total_lost;
		sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
1486
	}
1487

1488
	rc = 0;
1489 1490
out_delete:
	perf_session__delete(session);
1491
	return rc;
I
Ingo Molnar 已提交
1492 1493
}

1494
static void print_bad_events(struct perf_sched *sched)
1495
{
1496
	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
1497
		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
1498 1499
			(double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
			sched->nr_unordered_timestamps, sched->nr_timestamps);
1500
	}
1501
	if (sched->nr_lost_events && sched->nr_events) {
1502
		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
1503 1504
			(double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
			sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
1505
	}
1506
	if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
1507
		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
1508 1509 1510
			(double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
			sched->nr_context_switch_bugs, sched->nr_timestamps);
		if (sched->nr_lost_events)
1511 1512 1513 1514 1515
			printf(" (due to lost events?)");
		printf("\n");
	}
}

1516
static int perf_sched__lat(struct perf_sched *sched)
1517 1518 1519 1520
{
	struct rb_node *next;

	setup_pager();
1521

1522
	if (perf_sched__read_events(sched))
1523
		return -1;
1524

1525
	perf_sched__sort_lat(sched);
1526

1527 1528 1529
	printf("\n -----------------------------------------------------------------------------------------------------------------\n");
	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at       |\n");
	printf(" -----------------------------------------------------------------------------------------------------------------\n");
1530

1531
	next = rb_first(&sched->sorted_atom_root);
1532 1533 1534 1535 1536

	while (next) {
		struct work_atoms *work_list;

		work_list = rb_entry(next, struct work_atoms, node);
1537
		output_lat_thread(sched, work_list);
1538
		next = rb_next(next);
1539
		thread__zput(work_list->thread);
1540 1541
	}

1542
	printf(" -----------------------------------------------------------------------------------------------------------------\n");
1543
	printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
1544
		(double)sched->all_runtime / 1e6, sched->all_count);
1545 1546 1547

	printf(" ---------------------------------------------------\n");

1548
	print_bad_events(sched);
1549 1550
	printf("\n");

1551
	return 0;
1552 1553
}

1554
static int perf_sched__map(struct perf_sched *sched)
1555
{
1556
	sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);
1557

1558
	setup_pager();
1559
	if (perf_sched__read_events(sched))
1560
		return -1;
1561
	print_bad_events(sched);
1562
	return 0;
1563 1564
}

1565
/*
 * 'perf sched replay' driver: calibrate measurement overheads, rebuild the
 * recorded workload from the trace, print its statistics, then spawn threads
 * and re-run the whole scheduling scenario replay_repeat times.
 */
static int perf_sched__replay(struct perf_sched *sched)
{
	unsigned long i;

	/* Measure the cost of our own timing primitives first. */
	calibrate_run_measurement_overhead(sched);
	calibrate_sleep_measurement_overhead(sched);

	test_calibrations(sched);

	/* Build per-task event lists (atoms) from the recorded trace. */
	if (perf_sched__read_events(sched))
		return -1;

	printf("nr_run_events:        %ld\n", sched->nr_run_events);
	printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
	printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);

	if (sched->targetless_wakeups)
		printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
	if (sched->multitarget_wakeups)
		printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
	if (sched->nr_run_events_optimized)
		printf("run atoms optimized: %ld\n",
			sched->nr_run_events_optimized);

	print_task_traces(sched);
	add_cross_task_wakeups(sched);

	/* Spawn one worker thread per recorded task, then run the replays. */
	create_tasks(sched);
	printf("------------------------------------------------------------\n");
	for (i = 0; i < sched->replay_repeat; i++)
		run_one_test(sched);

	return 0;
}

1600 1601
static void setup_sorting(struct perf_sched *sched, const struct option *options,
			  const char * const usage_msg[])
1602
{
1603
	char *tmp, *tok, *str = strdup(sched->sort_order);
1604 1605 1606

	for (tok = strtok_r(str, ", ", &tmp);
			tok; tok = strtok_r(NULL, ", ", &tmp)) {
1607
		if (sort_dimension__add(tok, &sched->sort_list) < 0) {
1608
			error("Unknown --sort key: `%s'", tok);
1609
			usage_with_options(usage_msg, options);
1610 1611 1612 1613 1614
		}
	}

	free(str);

1615
	sort_dimension__add("pid", &sched->cmp_pid);
1616 1617
}

1618 1619 1620 1621
/*
 * 'perf sched record': build an argv that runs 'perf record' system-wide
 * with all the sched tracepoints this tool consumes, appending any extra
 * user arguments, and hand it to cmd_record().
 *
 * Returns cmd_record()'s result, or -ENOMEM on allocation failure.
 */
static int __cmd_record(int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-a",
		"-R",
		"-m", "1024",
		"-c", "1",
		"-e", "sched:sched_switch",
		"-e", "sched:sched_stat_wait",
		"-e", "sched:sched_stat_sleep",
		"-e", "sched:sched_stat_iowait",
		"-e", "sched:sched_stat_runtime",
		"-e", "sched:sched_process_fork",
		"-e", "sched:sched_wakeup",
		"-e", "sched:sched_wakeup_new",
		"-e", "sched:sched_migrate_task",
	};

	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(record_args); i++) {
		rec_argv[i] = strdup(record_args[i]);
		if (rec_argv[i] == NULL) {
			/* Unchecked strdup() could hand NULLs to cmd_record(). */
			while (i--)
				free((void *)rec_argv[i]);
			free(rec_argv);
			return -ENOMEM;
		}
	}

	/* User-supplied options follow the fixed ones; argv[0] is skipped. */
	for (j = 1; j < (unsigned int)argc; j++, i++)
		rec_argv[i] = argv[j];

	BUG_ON(i != rec_argc);

	return cmd_record(i, rec_argv, NULL);
}

1656
/*
 * Entry point for 'perf sched'. Sets up the shared perf_sched state and the
 * per-subcommand option tables, then dispatches to record/latency/map/replay
 * (or aliases 'script' to 'perf script').
 */
int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
{
	const char default_sort_order[] = "avg, max, switch, runtime";
	/* Shared state: tool callbacks plus mode-independent defaults. */
	struct perf_sched sched = {
		.tool = {
			.sample		 = perf_sched__process_tracepoint_sample,
			.comm		 = perf_event__process_comm,
			.lost		 = perf_event__process_lost,
			.fork		 = perf_sched__process_fork_event,
			.ordered_events = true,
		},
		.cmp_pid	      = LIST_HEAD_INIT(sched.cmp_pid),
		.sort_list	      = LIST_HEAD_INIT(sched.sort_list),
		.start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
		.work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
		.sort_order	      = default_sort_order,
		.replay_repeat	      = 10,
		.profile_cpu	      = -1,	/* -1: profile all CPUs */
		.next_shortname1      = 'A',	/* first map shortname is "A0" */
		.next_shortname2      = '0',
	};
	const struct option latency_options[] = {
	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
		   "sort by key(s): runtime, switch, avg, max"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_INTEGER('C', "CPU", &sched.profile_cpu,
		    "CPU to profile on"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const struct option replay_options[] = {
	OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
		     "repeat the workload replay N times (-1: infinite)"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const struct option sched_options[] = {
	OPT_STRING('i', "input", &input_name, "file",
		    "input file name"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const char * const latency_usage[] = {
		"perf sched latency [<options>]",
		NULL
	};
	const char * const replay_usage[] = {
		"perf sched replay [<options>]",
		NULL
	};
	const char *const sched_subcommands[] = { "record", "latency", "map",
						  "replay", "script", NULL };
	const char *sched_usage[] = {
		NULL,
		NULL
	};
	/* Per-mode tracepoint handler vtables. */
	struct trace_sched_handler lat_ops  = {
		.wakeup_event	    = latency_wakeup_event,
		.switch_event	    = latency_switch_event,
		.runtime_event	    = latency_runtime_event,
		.migrate_task_event = latency_migrate_task_event,
	};
	struct trace_sched_handler map_ops  = {
		.switch_event	    = map_switch_event,
	};
	struct trace_sched_handler replay_ops  = {
		.wakeup_event	    = replay_wakeup_event,
		.switch_event	    = replay_switch_event,
		.fork_event	    = replay_fork_event,
	};
	unsigned int i;

	/* (u32)-1 == "current task on this CPU unknown". */
	for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
		sched.curr_pid[i] = -1;

	argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
					sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc)
		usage_with_options(sched_usage, sched_options);

	/*
	 * Aliased to 'perf script' for now:
	 */
	if (!strcmp(argv[0], "script"))
		return cmd_script(argc, argv, prefix);

	if (!strncmp(argv[0], "rec", 3)) {
		return __cmd_record(argc, argv);
	} else if (!strncmp(argv[0], "lat", 3)) {
		sched.tp_handler = &lat_ops;
		if (argc > 1) {
			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
			if (argc)
				usage_with_options(latency_usage, latency_options);
		}
		setup_sorting(&sched, latency_options, latency_usage);
		return perf_sched__lat(&sched);
	} else if (!strcmp(argv[0], "map")) {
		sched.tp_handler = &map_ops;
		/* map reuses the latency sort machinery for thread lookup. */
		setup_sorting(&sched, latency_options, latency_usage);
		return perf_sched__map(&sched);
	} else if (!strncmp(argv[0], "rep", 3)) {
		sched.tp_handler = &replay_ops;
		if (argc) {
			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
			if (argc)
				usage_with_options(replay_usage, replay_options);
		}
		return perf_sched__replay(&sched);
	} else {
		usage_with_options(sched_usage, sched_options);
	}

	return 0;
}