#include "builtin.h"
#include "perf.h"

#include "util/util.h"
#include "util/evlist.h"
#include "util/cache.h"
#include "util/evsel.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"
#include "util/tool.h"

#include "util/parse-options.h"
#include "util/trace-event.h"

#include "util/debug.h"

#include <sys/prctl.h>
#include <sys/resource.h>

#include <semaphore.h>
#include <pthread.h>
#include <math.h>

#define PR_SET_NAME		15               /* Set process name */
#define MAX_CPUS		4096
#define COMM_LEN		20
#define SYM_LEN			129
#define MAX_PID			65536

struct sched_atom;

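/*
 * A task to be replayed by 'perf sched replay': the atoms recorded for
 * one PID, plus the worker thread and semaphores used to re-execute them.
 */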
struct task_desc {
	unsigned long		nr;
	unsigned long		pid;
	char			comm[COMM_LEN];

	unsigned long		nr_events;
	unsigned long		curr_event;
	struct sched_atom	**atoms;

	pthread_t		thread;
	sem_t			sleep_sem;

	sem_t			ready_for_work;
	sem_t			work_done_sem;

	u64			cpu_usage;
};

enum sched_event_type {
	SCHED_EVENT_RUN,
	SCHED_EVENT_SLEEP,
	SCHED_EVENT_WAKEUP,
	SCHED_EVENT_MIGRATION,
};

struct sched_atom {
	enum sched_event_type	type;
	int			specific_wait;
	u64			timestamp;
	u64			duration;
	unsigned long		nr;
	sem_t			*wait_sem;
	struct task_desc	*wakee;
};

#define TASK_STATE_TO_CHAR_STR "RSDTtZX"

enum thread_state {
	THREAD_SLEEPING = 0,
	THREAD_WAIT_CPU,
	THREAD_SCHED_IN,
	THREAD_IGNORE
};

struct work_atom {
	struct list_head	list;
	enum thread_state	state;
	u64			sched_out_time;
	u64			wake_up_time;
	u64			sched_in_time;
	u64			runtime;
};

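/*
 * Per-thread list of work atoms, kept in an rb-tree together with the
 * accumulated latency statistics for that thread.
 */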
struct work_atoms {
	struct list_head	work_list;
	struct thread		*thread;
	struct rb_node		node;
	u64			max_lat;
	u64			max_lat_at;
	u64			total_lat;
	u64			nb_atoms;
	u64			total_runtime;
};

typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);

struct trace_switch_event {
	char *prev_comm;
	u32  prev_pid;
	u32  prev_prio;
	u64  prev_state;
	char *next_comm;
	u32  next_pid;
	u32  next_prio;
};

struct trace_runtime_event {
	char *comm;
	u32  pid;
	u64  runtime;
	u64  vruntime;
};

struct trace_wakeup_event {
	char *comm;
	u32  pid;
	u32  prio;
	u32  success;
	u32  cpu;
};

struct trace_fork_event {
	char *parent_comm;
	u32  parent_pid;
	char *child_comm;
	u32  child_pid;
};

struct trace_migrate_task_event {
	char *comm;
	u32  pid;
	u32  prio;
	u32  cpu;
};

struct perf_sched;

struct trace_sched_handler {
	int (*switch_event)(struct perf_sched *sched,
			    struct trace_switch_event *event,
			    struct machine *machine,
			    struct perf_evsel *evsel,
			    struct perf_sample *sample);

	int (*runtime_event)(struct perf_sched *sched,
			     struct trace_runtime_event *event,
			     struct machine *machine,
			     struct perf_sample *sample);

	int (*wakeup_event)(struct perf_sched *sched,
			    struct trace_wakeup_event *event,
			    struct machine *machine,
			    struct perf_evsel *evsel,
			    struct perf_sample *sample);

	int (*fork_event)(struct perf_sched *sched,
			  struct trace_fork_event *event,
			  struct perf_evsel *evsel);

	int (*migrate_task_event)(struct perf_sched *sched,
				  struct trace_migrate_task_event *event,
				  struct machine *machine,
				  struct perf_sample *sample);
};

struct perf_sched {
	struct perf_tool tool;
	const char	 *input_name;
	const char	 *sort_order;
	unsigned long	 nr_tasks;
	struct task_desc *pid_to_task[MAX_PID];
	struct task_desc **tasks;
	const struct trace_sched_handler *tp_handler;
	pthread_mutex_t	 start_work_mutex;
	pthread_mutex_t	 work_done_wait_mutex;
	int		 profile_cpu;
	/*
	 * Track the current task - that way we can know whether there are any
	 * weird events, such as a task being switched away that is not current.
	 */
	int		 max_cpu;
	u32		 curr_pid[MAX_CPUS];
	struct thread	 *curr_thread[MAX_CPUS];
	char		 next_shortname1;
	char		 next_shortname2;
	unsigned int	 replay_repeat;
	unsigned long	 nr_run_events;
	unsigned long	 nr_sleep_events;
	unsigned long	 nr_wakeup_events;
	unsigned long	 nr_sleep_corrections;
	unsigned long	 nr_run_events_optimized;
	unsigned long	 targetless_wakeups;
	unsigned long	 multitarget_wakeups;
	unsigned long	 nr_runs;
	unsigned long	 nr_timestamps;
	unsigned long	 nr_unordered_timestamps;
	unsigned long	 nr_state_machine_bugs;
	unsigned long	 nr_context_switch_bugs;
	unsigned long	 nr_events;
	unsigned long	 nr_lost_chunks;
	unsigned long	 nr_lost_events;
	u64		 run_measurement_overhead;
	u64		 sleep_measurement_overhead;
	u64		 start_time;
	u64		 cpu_usage;
	u64		 runavg_cpu_usage;
	u64		 parent_cpu_usage;
	u64		 runavg_parent_cpu_usage;
	u64		 sum_runtime;
	u64		 sum_fluct;
	u64		 run_avg;
	u64		 all_runtime;
	u64		 all_count;
	u64		 cpu_last_switched[MAX_CPUS];
	struct rb_root	 atom_root, sorted_atom_root;
	struct list_head sort_list, cmp_pid;
};

static u64 get_nsecs(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);

	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
{
	u64 T0 = get_nsecs(), T1;

	do {
		T1 = get_nsecs();
	} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
}

static void sleep_nsecs(u64 nsecs)
{
	struct timespec ts;

	/* tv_nsec must stay in [0, 999999999], so split on exactly 1e9 */
	ts.tv_nsec = nsecs % 1000000000;
	ts.tv_sec = nsecs / 1000000000;

	nanosleep(&ts, NULL);
}

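/*
 * Estimate the fixed cost of burn_nsecs()/sleep_nsecs() themselves by
 * taking the minimum over ten trials; replay subtracts these overheads
 * so replayed runtimes track the recorded ones.
 */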
static void calibrate_run_measurement_overhead(struct perf_sched *sched)
{
	u64 T0, T1, delta, min_delta = 1000000000ULL;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		burn_nsecs(sched, 0);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	sched->run_measurement_overhead = min_delta;

	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
{
	u64 T0, T1, delta, min_delta = 1000000000ULL;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		sleep_nsecs(10000);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	min_delta -= 10000;
	sched->sleep_measurement_overhead = min_delta;

	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static struct sched_atom *
get_new_event(struct task_desc *task, u64 timestamp)
{
	struct sched_atom *event = zalloc(sizeof(*event));
	unsigned long idx = task->nr_events;
	size_t size;

	event->timestamp = timestamp;
	event->nr = idx;

	task->nr_events++;
	size = sizeof(struct sched_atom *) * task->nr_events;
	task->atoms = realloc(task->atoms, size);
	BUG_ON(!task->atoms);

	task->atoms[idx] = event;

	return event;
}

static struct sched_atom *last_event(struct task_desc *task)
{
	if (!task->nr_events)
		return NULL;

	return task->atoms[task->nr_events - 1];
}

static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
				u64 timestamp, u64 duration)
{
	struct sched_atom *event, *curr_event = last_event(task);

	/*
	 * optimize an existing RUN event by merging this one
	 * to it:
	 */
	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
		sched->nr_run_events_optimized++;
		curr_event->duration += duration;
		return;
	}

	event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_RUN;
	event->duration = duration;

	sched->nr_run_events++;
}

static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
				   u64 timestamp, struct task_desc *wakee)
{
	struct sched_atom *event, *wakee_event;

	event = get_new_event(task, timestamp);
	event->type = SCHED_EVENT_WAKEUP;
	event->wakee = wakee;

	wakee_event = last_event(wakee);
	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
		sched->targetless_wakeups++;
		return;
	}
	if (wakee_event->wait_sem) {
		sched->multitarget_wakeups++;
		return;
	}

	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
	sem_init(wakee_event->wait_sem, 0, 0);
	wakee_event->specific_wait = 1;
	event->wait_sem = wakee_event->wait_sem;

	sched->nr_wakeup_events++;
}

static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
				  u64 timestamp, u64 task_state __maybe_unused)
{
	struct sched_atom *event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_SLEEP;

	sched->nr_sleep_events++;
}

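/*
 * Look up (or create) the synthetic task standing in for a recorded PID;
 * pid_to_task[] makes the lookup O(1) for pids below MAX_PID.
 */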
static struct task_desc *register_pid(struct perf_sched *sched,
				      unsigned long pid, const char *comm)
{
	struct task_desc *task;

	BUG_ON(pid >= MAX_PID);

	task = sched->pid_to_task[pid];

	if (task)
		return task;

	task = zalloc(sizeof(*task));
	task->pid = pid;
	task->nr = sched->nr_tasks;
	strcpy(task->comm, comm);
	/*
	 * every task starts in sleeping state - this gets ignored
	 * if there's no wakeup pointing to this sleep state:
	 */
	add_sched_event_sleep(sched, task, 0, 0);

	sched->pid_to_task[pid] = task;
	sched->nr_tasks++;
	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
	BUG_ON(!sched->tasks);
	sched->tasks[task->nr] = task;

	if (verbose)
		printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);

	return task;
}

static void print_task_traces(struct perf_sched *sched)
{
	struct task_desc *task;
	unsigned long i;

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
			task->nr, task->comm, task->pid, task->nr_events);
	}
}

static void add_cross_task_wakeups(struct perf_sched *sched)
{
	struct task_desc *task1, *task2;
	unsigned long i, j;

	for (i = 0; i < sched->nr_tasks; i++) {
		task1 = sched->tasks[i];
		j = i + 1;
		if (j == sched->nr_tasks)
			j = 0;
		task2 = sched->tasks[j];
		add_sched_event_wakeup(sched, task1, 0, task2);
	}
}

static void perf_sched__process_event(struct perf_sched *sched,
				      struct sched_atom *atom)
{
	int ret = 0;

	switch (atom->type) {
		case SCHED_EVENT_RUN:
			burn_nsecs(sched, atom->duration);
			break;
		case SCHED_EVENT_SLEEP:
			if (atom->wait_sem)
				ret = sem_wait(atom->wait_sem);
			BUG_ON(ret);
			break;
		case SCHED_EVENT_WAKEUP:
			if (atom->wait_sem)
				ret = sem_post(atom->wait_sem);
			BUG_ON(ret);
			break;
		case SCHED_EVENT_MIGRATION:
			break;
		default:
			BUG_ON(1);
	}
}

static u64 get_cpu_usage_nsec_parent(void)
{
	struct rusage ru;
	u64 sum;
	int err;

	err = getrusage(RUSAGE_SELF, &ru);
	BUG_ON(err);

	sum =  ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
	sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;

	return sum;
}

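/*
 * Each worker thread opens its own PERF_COUNT_SW_TASK_CLOCK counter so it
 * can read back exactly how many nanoseconds of CPU time it consumed.
 */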
static int self_open_counters(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));

	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;

	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);

	if (fd < 0)
		pr_debug("Error: sys_perf_event_open() syscall returned "
			 "with %d (%s)\n", fd, strerror(errno));
	return fd;
}

static u64 get_cpu_usage_nsec_self(int fd)
{
	u64 runtime;
	int ret;

	ret = read(fd, &runtime, sizeof(runtime));
	BUG_ON(ret != sizeof(runtime));

	return runtime;
}

struct sched_thread_parms {
	struct task_desc  *task;
	struct perf_sched *sched;
};

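/*
 * Worker body: signal ready_for_work, block on start_work_mutex until the
 * parent releases it, replay this task's atoms, post work_done_sem, then
 * wait for the next round.
 */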
static void *thread_func(void *ctx)
{
	struct sched_thread_parms *parms = ctx;
	struct task_desc *this_task = parms->task;
	struct perf_sched *sched = parms->sched;
	u64 cpu_usage_0, cpu_usage_1;
	unsigned long i, ret;
	char comm2[22];
	int fd;

	free(parms);

	sprintf(comm2, ":%s", this_task->comm);
	prctl(PR_SET_NAME, comm2);
	fd = self_open_counters();
	if (fd < 0)
		return NULL;
again:
	ret = sem_post(&this_task->ready_for_work);
	BUG_ON(ret);
	ret = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&sched->start_work_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_self(fd);

	for (i = 0; i < this_task->nr_events; i++) {
		this_task->curr_event = i;
		perf_sched__process_event(sched, this_task->atoms[i]);
	}

	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
	ret = sem_post(&this_task->work_done_sem);
	BUG_ON(ret);

	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
	BUG_ON(ret);

	goto again;
}

static void create_tasks(struct perf_sched *sched)
{
	struct task_desc *task;
	pthread_attr_t attr;
	unsigned long i;
	int err;

	err = pthread_attr_init(&attr);
	BUG_ON(err);
	err = pthread_attr_setstacksize(&attr,
			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
	BUG_ON(err);
	err = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(err);
	err = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(err);
	for (i = 0; i < sched->nr_tasks; i++) {
		struct sched_thread_parms *parms = malloc(sizeof(*parms));
		BUG_ON(parms == NULL);
		parms->task = task = sched->tasks[i];
		parms->sched = sched;
		sem_init(&task->sleep_sem, 0, 0);
		sem_init(&task->ready_for_work, 0, 0);
		sem_init(&task->work_done_sem, 0, 0);
		task->curr_event = 0;
		err = pthread_create(&task->thread, &attr, thread_func, parms);
		BUG_ON(err);
	}
}

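/*
 * Parent side of the handshake: wait for every worker to become ready,
 * release them via start_work_mutex, then collect per-task CPU usage once
 * they post work_done_sem.
 */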
static void wait_for_tasks(struct perf_sched *sched)
{
	u64 cpu_usage_0, cpu_usage_1;
	struct task_desc *task;
	unsigned long i, ret;

	sched->start_time = get_nsecs();
	sched->cpu_usage = 0;
	pthread_mutex_unlock(&sched->work_done_wait_mutex);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		ret = sem_wait(&task->ready_for_work);
		BUG_ON(ret);
		sem_init(&task->ready_for_work, 0, 0);
	}
	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_parent();

	pthread_mutex_unlock(&sched->start_work_mutex);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		ret = sem_wait(&task->work_done_sem);
		BUG_ON(ret);
		sem_init(&task->work_done_sem, 0, 0);
		sched->cpu_usage += task->cpu_usage;
		task->cpu_usage = 0;
	}

	cpu_usage_1 = get_cpu_usage_nsec_parent();
	if (!sched->runavg_cpu_usage)
		sched->runavg_cpu_usage = sched->cpu_usage;
	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * 9 + sched->cpu_usage) / 10;

	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
	if (!sched->runavg_parent_cpu_usage)
		sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * 9 +
					 sched->parent_cpu_usage)/10;

	ret = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(ret);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		sem_init(&task->sleep_sem, 0, 0);
		task->curr_event = 0;
	}
}

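/*
 * One replay iteration; the running averages decay exponentially,
 * e.g. run_avg = (run_avg * 9 + delta) / 10.
 */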
static void run_one_test(struct perf_sched *sched)
{
	u64 T0, T1, delta, avg_delta, fluct;

	T0 = get_nsecs();
	wait_for_tasks(sched);
	T1 = get_nsecs();

	delta = T1 - T0;
	sched->sum_runtime += delta;
	sched->nr_runs++;

	avg_delta = sched->sum_runtime / sched->nr_runs;
	if (delta < avg_delta)
		fluct = avg_delta - delta;
	else
		fluct = delta - avg_delta;
	sched->sum_fluct += fluct;
	if (!sched->run_avg)
		sched->run_avg = delta;
	sched->run_avg = (sched->run_avg * 9 + delta) / 10;

	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);

	printf("ravg: %0.2f, ", (double)sched->run_avg / 1e6);

	printf("cpu: %0.2f / %0.2f",
		(double)sched->cpu_usage / 1e6, (double)sched->runavg_cpu_usage / 1e6);

#if 0
	/*
	 * rusage statistics done by the parent, these are less
	 * accurate than the sched->sum_exec_runtime based statistics:
	 */
	printf(" [%0.2f / %0.2f]",
		(double)sched->parent_cpu_usage/1e6,
		(double)sched->runavg_parent_cpu_usage/1e6);
#endif

	printf("\n");

	if (sched->nr_sleep_corrections)
		printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
	sched->nr_sleep_corrections = 0;
}

static void test_calibrations(struct perf_sched *sched)
{
	u64 T0, T1;

	T0 = get_nsecs();
	burn_nsecs(sched, 1e6);
	T1 = get_nsecs();

	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);

	T0 = get_nsecs();
	sleep_nsecs(1e6);
	T1 = get_nsecs();

	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
}

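/*
 * Replay-mode tracepoint handlers: rebuild the recorded task graph as
 * run/sleep/wakeup atoms for the worker threads to act out.
 */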
static int
replay_wakeup_event(struct perf_sched *sched,
		    struct trace_wakeup_event *wakeup_event,
		    struct machine *machine __maybe_unused,
		    struct perf_evsel *evsel, struct perf_sample *sample)
{
	struct task_desc *waker, *wakee;

	if (verbose) {
		printf("sched_wakeup event %p\n", evsel);

		printf(" ... pid %d woke up %s/%d\n",
		       sample->tid, wakeup_event->comm, wakeup_event->pid);
	}

	waker = register_pid(sched, sample->tid, "<unknown>");
	wakee = register_pid(sched, wakeup_event->pid, wakeup_event->comm);

	add_sched_event_wakeup(sched, waker, sample->time, wakee);
	return 0;
}

static int
replay_switch_event(struct perf_sched *sched,
		    struct trace_switch_event *switch_event,
		    struct machine *machine __maybe_unused,
		    struct perf_evsel *evsel,
		    struct perf_sample *sample)
{
	struct task_desc *prev, __maybe_unused *next;
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu;
	s64 delta;

	if (verbose)
		printf("sched_switch event %p\n", evsel);

	if (cpu >= MAX_CPUS || cpu < 0)
		return 0;

	timestamp0 = sched->cpu_last_switched[cpu];
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0) {
		pr_debug("hm, delta: %" PRId64 " < 0 ?\n", delta);
		return -1;
	}

	if (verbose) {
		printf(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
			switch_event->prev_comm, switch_event->prev_pid,
			switch_event->next_comm, switch_event->next_pid,
			delta);
	}

	prev = register_pid(sched, switch_event->prev_pid, switch_event->prev_comm);
	next = register_pid(sched, switch_event->next_pid, switch_event->next_comm);

	sched->cpu_last_switched[cpu] = timestamp;

	add_sched_event_run(sched, prev, timestamp, delta);
	add_sched_event_sleep(sched, prev, timestamp, switch_event->prev_state);

	return 0;
}

static int
replay_fork_event(struct perf_sched *sched, struct trace_fork_event *fork_event,
		  struct perf_evsel *evsel)
{
	if (verbose) {
		printf("sched_fork event %p\n", evsel);
		printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
		printf("...  child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
	}
	register_pid(sched, fork_event->parent_pid, fork_event->parent_comm);
	register_pid(sched, fork_event->child_pid, fork_event->child_comm);
	return 0;
}

struct sort_dimension {
	const char		*name;
	sort_fn_t		cmp;
	struct list_head	list;
};

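/*
 * Chain the configured sort keys: the first comparator that sees a
 * difference decides the order.
 */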
static int
thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
{
	struct sort_dimension *sort;
	int ret = 0;

	BUG_ON(list_empty(list));

	list_for_each_entry(sort, list, list) {
		ret = sort->cmp(l, r);
		if (ret)
			return ret;
	}

	return ret;
}

static struct work_atoms *
thread_atoms_search(struct rb_root *root, struct thread *thread,
		    struct list_head *sort_list)
{
	struct rb_node *node = root->rb_node;
	struct work_atoms key = { .thread = thread };

	while (node) {
		struct work_atoms *atoms;
		int cmp;

		atoms = container_of(node, struct work_atoms, node);

		cmp = thread_lat_cmp(sort_list, &key, atoms);
		if (cmp > 0)
			node = node->rb_left;
		else if (cmp < 0)
			node = node->rb_right;
		else {
			BUG_ON(thread != atoms->thread);
			return atoms;
		}
	}
	return NULL;
}

static void
__thread_latency_insert(struct rb_root *root, struct work_atoms *data,
			struct list_head *sort_list)
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;

	while (*new) {
		struct work_atoms *this;
		int cmp;

		this = container_of(*new, struct work_atoms, node);
		parent = *new;

		cmp = thread_lat_cmp(sort_list, data, this);

		if (cmp > 0)
			new = &((*new)->rb_left);
		else
			new = &((*new)->rb_right);
	}

	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
}

static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
{
	struct work_atoms *atoms = zalloc(sizeof(*atoms));
	if (!atoms) {
		pr_err("No memory at %s\n", __func__);
		return -1;
	}

	atoms->thread = thread;
	INIT_LIST_HEAD(&atoms->work_list);
	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
	return 0;
}

static int latency_fork_event(struct perf_sched *sched __maybe_unused,
			      struct trace_fork_event *fork_event __maybe_unused,
			      struct perf_evsel *evsel __maybe_unused)
{
	/* should insert the newcomer */
	return 0;
}

static char sched_out_state(struct trace_switch_event *switch_event)
{
	const char *str = TASK_STATE_TO_CHAR_STR;

	return str[switch_event->prev_state];
}

static int
add_sched_out_event(struct work_atoms *atoms,
		    char run_state,
		    u64 timestamp)
{
	struct work_atom *atom = zalloc(sizeof(*atom));
	if (!atom) {
		pr_err("No memory at %s", __func__);
		return -1;
	}

	atom->sched_out_time = timestamp;

	if (run_state == 'R') {
		atom->state = THREAD_WAIT_CPU;
		atom->wake_up_time = atom->sched_out_time;
	}

	list_add_tail(&atom->list, &atoms->work_list);
	return 0;
}

static void
add_runtime_event(struct work_atoms *atoms, u64 delta,
		  u64 timestamp __maybe_unused)
{
	struct work_atom *atom;

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	atom->runtime += delta;
	atoms->total_runtime += delta;
}

static void
add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
{
	struct work_atom *atom;
	u64 delta;

	if (list_empty(&atoms->work_list))
		return;

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	if (atom->state != THREAD_WAIT_CPU)
		return;

	if (timestamp < atom->wake_up_time) {
		atom->state = THREAD_IGNORE;
		return;
	}

	atom->state = THREAD_SCHED_IN;
	atom->sched_in_time = timestamp;

	delta = atom->sched_in_time - atom->wake_up_time;
	atoms->total_lat += delta;
	if (delta > atoms->max_lat) {
		atoms->max_lat = delta;
		atoms->max_lat_at = timestamp;
	}
	atoms->nb_atoms++;
}

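/*
 * Latency-mode handlers: scheduling latency is measured from a task's
 * wakeup to the moment it is actually scheduled in.
 */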
static int
latency_switch_event(struct perf_sched *sched,
		     struct trace_switch_event *switch_event,
		     struct machine *machine,
		     struct perf_evsel *evsel __maybe_unused,
		     struct perf_sample *sample)
{
	struct work_atoms *out_events, *in_events;
	struct thread *sched_out, *sched_in;
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu;
	s64 delta;

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);

	timestamp0 = sched->cpu_last_switched[cpu];
	sched->cpu_last_switched[cpu] = timestamp;
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0) {
		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
		return -1;
	}

	sched_out = machine__findnew_thread(machine, switch_event->prev_pid);
	sched_in = machine__findnew_thread(machine, switch_event->next_pid);

	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
	if (!out_events) {
		if (thread_atoms_insert(sched, sched_out))
			return -1;
		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
		if (!out_events) {
			pr_err("out-event: Internal tree error");
			return -1;
		}
	}
	if (add_sched_out_event(out_events, sched_out_state(switch_event), timestamp))
		return -1;

	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
	if (!in_events) {
		if (thread_atoms_insert(sched, sched_in))
			return -1;
		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
		if (!in_events) {
			pr_err("in-event: Internal tree error");
			return -1;
		}
		/*
		 * Task came in that we have not heard about yet,
		 * add in an initial atom in runnable state:
		 */
		if (add_sched_out_event(in_events, 'R', timestamp))
			return -1;
	}
	add_sched_in_event(in_events, timestamp);

	return 0;
}

static int
latency_runtime_event(struct perf_sched *sched,
		      struct trace_runtime_event *runtime_event,
		      struct machine *machine, struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(machine, runtime_event->pid);
	struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
	u64 timestamp = sample->time;
	int cpu = sample->cpu;

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
	if (!atoms) {
		if (thread_atoms_insert(sched, thread))
			return -1;
		atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
		if (!atoms) {
			pr_debug("in-event: Internal tree error");
			return -1;
		}
		if (add_sched_out_event(atoms, 'R', timestamp))
			return -1;
	}

	add_runtime_event(atoms, runtime_event->runtime, timestamp);
	return 0;
}

static int
latency_wakeup_event(struct perf_sched *sched,
		     struct trace_wakeup_event *wakeup_event,
		     struct machine *machine,
		     struct perf_evsel *evsel __maybe_unused,
		     struct perf_sample *sample)
{
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *wakee;
	u64 timestamp = sample->time;

	/* Note for later, it may be interesting to observe the failing cases */
	if (!wakeup_event->success)
		return 0;

	wakee = machine__findnew_thread(machine, wakeup_event->pid);
	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
	if (!atoms) {
		if (thread_atoms_insert(sched, wakee))
			return -1;
		atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
		if (!atoms) {
			pr_debug("wakeup-event: Internal tree error");
			return -1;
		}
		if (add_sched_out_event(atoms, 'S', timestamp))
			return -1;
	}

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	/*
	 * You WILL be missing events if you've recorded only
	 * one CPU, or are looking at only one, so don't
	 * make useless noise.
	 */
	if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
		sched->nr_state_machine_bugs++;

	sched->nr_timestamps++;
	if (atom->sched_out_time > timestamp) {
		sched->nr_unordered_timestamps++;
		return 0;
	}

	atom->state = THREAD_WAIT_CPU;
	atom->wake_up_time = timestamp;
	return 0;
}

static int
latency_migrate_task_event(struct perf_sched *sched,
			   struct trace_migrate_task_event *migrate_task_event,
			   struct machine *machine, struct perf_sample *sample)
{
	u64 timestamp = sample->time;
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *migrant;

	/*
	 * Only need to worry about migration when profiling one CPU.
	 */
	if (sched->profile_cpu == -1)
		return 0;

	migrant = machine__findnew_thread(machine, migrate_task_event->pid);
	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
	if (!atoms) {
		if (thread_atoms_insert(sched, migrant))
			return -1;
		register_pid(sched, migrant->pid, migrant->comm);
		atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
		if (!atoms) {
			pr_debug("migration-event: Internal tree error");
			return -1;
		}
		if (add_sched_out_event(atoms, 'R', timestamp))
			return -1;
	}

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;

	sched->nr_timestamps++;

	if (atom->sched_out_time > timestamp)
		sched->nr_unordered_timestamps++;

	return 0;
}

static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
{
	int i;
	int ret;
	u64 avg;

	if (!work_list->nb_atoms)
		return;
	/*
	 * Ignore idle threads:
	 */
	if (!strcmp(work_list->thread->comm, "swapper"))
		return;

	sched->all_runtime += work_list->total_runtime;
	sched->all_count   += work_list->nb_atoms;

	ret = printf("  %s:%d ", work_list->thread->comm, work_list->thread->pid);

	for (i = 0; i < 24 - ret; i++)
		printf(" ");

	avg = work_list->total_lat / work_list->nb_atoms;

	printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n",
		 (double)work_list->total_runtime / 1e6,
		 work_list->nb_atoms, (double)avg / 1e6,
		 (double)work_list->max_lat / 1e6,
		 (double)work_list->max_lat_at / 1e9);
}

static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->thread->pid < r->thread->pid)
		return -1;
	if (l->thread->pid > r->thread->pid)
		return 1;

	return 0;
}

static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
{
	u64 avgl, avgr;

	if (!l->nb_atoms)
		return -1;

	if (!r->nb_atoms)
		return 1;

	avgl = l->total_lat / l->nb_atoms;
	avgr = r->total_lat / r->nb_atoms;

	if (avgl < avgr)
		return -1;
	if (avgl > avgr)
		return 1;

	return 0;
}

static int max_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->max_lat < r->max_lat)
		return -1;
	if (l->max_lat > r->max_lat)
		return 1;

	return 0;
}

static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->nb_atoms < r->nb_atoms)
		return -1;
	if (l->nb_atoms > r->nb_atoms)
		return 1;

	return 0;
}

static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->total_runtime < r->total_runtime)
		return -1;
	if (l->total_runtime > r->total_runtime)
		return 1;

	return 0;
}

static int sort_dimension__add(const char *tok, struct list_head *list)
{
	size_t i;
	static struct sort_dimension avg_sort_dimension = {
		.name = "avg",
		.cmp  = avg_cmp,
	};
	static struct sort_dimension max_sort_dimension = {
		.name = "max",
		.cmp  = max_cmp,
	};
	static struct sort_dimension pid_sort_dimension = {
		.name = "pid",
		.cmp  = pid_cmp,
	};
	static struct sort_dimension runtime_sort_dimension = {
		.name = "runtime",
		.cmp  = runtime_cmp,
	};
	static struct sort_dimension switch_sort_dimension = {
		.name = "switch",
		.cmp  = switch_cmp,
	};
	struct sort_dimension *available_sorts[] = {
		&pid_sort_dimension,
		&avg_sort_dimension,
		&max_sort_dimension,
		&switch_sort_dimension,
		&runtime_sort_dimension,
	};

	for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
		if (!strcmp(available_sorts[i]->name, tok)) {
			list_add_tail(&available_sorts[i]->list, list);

			return 0;
		}
	}

	return -1;
}

static void perf_sched__sort_lat(struct perf_sched *sched)
{
	struct rb_node *node;

	for (;;) {
		struct work_atoms *data;
		node = rb_first(&sched->atom_root);
		if (!node)
			break;

		rb_erase(node, &sched->atom_root);
		data = rb_entry(node, struct work_atoms, node);
		__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
	}
}

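/*
 * Thin tracepoint wrappers: decode the fields with perf_evsel__{str,int}val()
 * and forward to whichever handler set (replay, latency, map) is active.
 */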
static int process_sched_wakeup_event(struct perf_tool *tool,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	if (sched->tp_handler->wakeup_event) {
		struct trace_wakeup_event event = {
			.comm	 = perf_evsel__strval(evsel, sample, "comm"),
			.pid	 = perf_evsel__intval(evsel, sample, "pid"),
			.prio	 = perf_evsel__intval(evsel, sample, "prio"),
			.success = perf_evsel__intval(evsel, sample, "success"),
			.cpu	 = perf_evsel__intval(evsel, sample, "cpu"),
		};

		return sched->tp_handler->wakeup_event(sched, &event, machine, evsel, sample);
	}

	return 0;
}

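/*
 * Map mode: print one column per CPU, giving each task a two-character
 * shortname ('A0'..'Z9') the first time it is scheduled in.
 */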
static int
map_switch_event(struct perf_sched *sched,
		 struct trace_switch_event *switch_event,
		 struct machine *machine,
		 struct perf_evsel *evsel __maybe_unused,
		 struct perf_sample *sample)
{
	struct thread *sched_out __maybe_unused, *sched_in;
	int new_shortname;
	u64 timestamp0, timestamp = sample->time;
	s64 delta;
	int cpu, this_cpu = sample->cpu;

	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);

	if (this_cpu > sched->max_cpu)
		sched->max_cpu = this_cpu;

	timestamp0 = sched->cpu_last_switched[this_cpu];
	sched->cpu_last_switched[this_cpu] = timestamp;
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0) {
		pr_debug("hm, delta: %" PRId64 " < 0 ?\n", delta);
		return -1;
	}

	sched_out = machine__findnew_thread(machine, switch_event->prev_pid);
	sched_in = machine__findnew_thread(machine, switch_event->next_pid);

	sched->curr_thread[this_cpu] = sched_in;

	printf("  ");

	new_shortname = 0;
	if (!sched_in->shortname[0]) {
		sched_in->shortname[0] = sched->next_shortname1;
		sched_in->shortname[1] = sched->next_shortname2;

		if (sched->next_shortname1 < 'Z') {
			sched->next_shortname1++;
		} else {
			sched->next_shortname1 = 'A';
			if (sched->next_shortname2 < '9') {
				sched->next_shortname2++;
			} else {
				sched->next_shortname2 = '0';
			}
		}
		new_shortname = 1;
	}

	for (cpu = 0; cpu <= sched->max_cpu; cpu++) {
		if (cpu != this_cpu)
			printf(" ");
		else
			printf("*");

		if (sched->curr_thread[cpu]) {
			if (sched->curr_thread[cpu]->pid)
				printf("%2s ", sched->curr_thread[cpu]->shortname);
			else
				printf(".  ");
		} else
			printf("   ");
	}

	printf("  %12.6f secs ", (double)timestamp/1e9);
	if (new_shortname) {
		printf("%s => %s:%d\n",
			sched_in->shortname, sched_in->comm, sched_in->pid);
	} else {
		printf("\n");
	}

	return 0;
}

static int process_sched_switch_event(struct perf_tool *tool,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
	int this_cpu = sample->cpu, err = 0;
	u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
	    next_pid = perf_evsel__intval(evsel, sample, "next_pid");

	if (sched->curr_pid[this_cpu] != (u32)-1) {
		/*
		 * Are we trying to switch away a PID that is
		 * not current?
		 */
		if (sched->curr_pid[this_cpu] != prev_pid)
			sched->nr_context_switch_bugs++;
	}

	if (sched->tp_handler->switch_event) {
		struct trace_switch_event event = {
			.prev_comm  = perf_evsel__strval(evsel, sample, "prev_comm"),
			.prev_pid   = prev_pid,
			.prev_prio  = perf_evsel__intval(evsel, sample, "prev_prio"),
			.prev_state = perf_evsel__intval(evsel, sample, "prev_state"),
			.next_comm  = perf_evsel__strval(evsel, sample, "next_comm"),
			.next_pid   = next_pid,
			.next_prio  = perf_evsel__intval(evsel, sample, "next_prio"),
		};

		err = sched->tp_handler->switch_event(sched, &event, machine, evsel, sample);
	}

	sched->curr_pid[this_cpu] = next_pid;
	return err;
}

static int process_sched_runtime_event(struct perf_tool *tool,
				       struct perf_evsel *evsel,
				       struct perf_sample *sample,
				       struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	if (sched->tp_handler->runtime_event) {
		struct trace_runtime_event event = {
			.comm	  = perf_evsel__strval(evsel, sample, "comm"),
			.pid	  = perf_evsel__intval(evsel, sample, "pid"),
			.runtime  = perf_evsel__intval(evsel, sample, "runtime"),
			.vruntime = perf_evsel__intval(evsel, sample, "vruntime"),
		};
		return sched->tp_handler->runtime_event(sched, &event, machine, sample);
	}

	return 0;
}

static int process_sched_fork_event(struct perf_tool *tool,
				    struct perf_evsel *evsel,
				    struct perf_sample *sample,
				    struct machine *machine __maybe_unused)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	if (sched->tp_handler->fork_event) {
		struct trace_fork_event event = {
			.parent_comm = perf_evsel__strval(evsel, sample, "parent_comm"),
			.child_comm  = perf_evsel__strval(evsel, sample, "child_comm"),
			.parent_pid  = perf_evsel__intval(evsel, sample, "parent_pid"),
			.child_pid  = perf_evsel__intval(evsel, sample, "child_pid"),
		};
		return sched->tp_handler->fork_event(sched, &event, evsel);
	}

	return 0;
}

static int process_sched_exit_event(struct perf_tool *tool __maybe_unused,
				    struct perf_evsel *evsel,
				    struct perf_sample *sample __maybe_unused,
				    struct machine *machine __maybe_unused)
{
	pr_debug("sched_exit event %p\n", evsel);
	return 0;
}

static int process_sched_migrate_task_event(struct perf_tool *tool,
					    struct perf_evsel *evsel,
					    struct perf_sample *sample,
					    struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	if (sched->tp_handler->migrate_task_event) {
		struct trace_migrate_task_event event = {
			.comm = perf_evsel__strval(evsel, sample, "comm"),
			.pid  = perf_evsel__intval(evsel, sample, "pid"),
			.prio = perf_evsel__intval(evsel, sample, "prio"),
			.cpu  = perf_evsel__intval(evsel, sample, "cpu"),
		};
		return sched->tp_handler->migrate_task_event(sched, &event, machine, sample);
	}

	return 0;
}

typedef int (*tracepoint_handler)(struct perf_tool *tool,
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);

static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
						 union perf_event *event __maybe_unused,
						 struct perf_sample *sample,
						 struct perf_evsel *evsel,
						 struct machine *machine)
{
	struct thread *thread = machine__findnew_thread(machine, sample->pid);
	int err = 0;

	if (thread == NULL) {
		pr_debug("problem processing %s event, skipping it.\n",
			 perf_evsel__name(evsel));
		return -1;
	}

	evsel->hists.stats.total_period += sample->period;
	hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);

	if (evsel->handler.func != NULL) {
		tracepoint_handler f = evsel->handler.func;
		err = f(tool, evsel, sample, machine);
	}

	return err;
}

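/*
 * Open the perf.data file, wire up the tracepoint handlers above and run
 * the session; lost-event counts are kept for print_bad_events().
 */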
static int perf_sched__read_events(struct perf_sched *sched, bool destroy,
				   struct perf_session **psession)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "sched:sched_switch",	      process_sched_switch_event, },
		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
		{ "sched:sched_wakeup",	      process_sched_wakeup_event, },
		{ "sched:sched_wakeup_new",   process_sched_wakeup_event, },
		{ "sched:sched_process_fork", process_sched_fork_event, },
		{ "sched:sched_process_exit", process_sched_exit_event, },
		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
	};
	struct perf_session *session;

	session = perf_session__new(sched->input_name, O_RDONLY, 0, false, &sched->tool);
	if (session == NULL) {
		pr_debug("No Memory for session\n");
		return -1;
	}

	if (perf_session__set_tracepoints_handlers(session, handlers))
		goto out_delete;

	if (perf_session__has_traces(session, "record -R")) {
		int err = perf_session__process_events(session, &sched->tool);
		if (err) {
			pr_err("Failed to process events, error %d", err);
			goto out_delete;
		}

		sched->nr_events      = session->hists.stats.nr_events[0];
		sched->nr_lost_events = session->hists.stats.total_lost;
		sched->nr_lost_chunks = session->hists.stats.nr_events[PERF_RECORD_LOST];
	}

	if (destroy)
		perf_session__delete(session);

	if (psession)
		*psession = session;

	return 0;

out_delete:
	perf_session__delete(session);
	return -1;
}

static void print_bad_events(struct perf_sched *sched)
{
	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
			(double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
			sched->nr_unordered_timestamps, sched->nr_timestamps);
	}
	if (sched->nr_lost_events && sched->nr_events) {
		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
			(double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
			sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
	}
	if (sched->nr_state_machine_bugs && sched->nr_timestamps) {
		printf("  INFO: %.3f%% state machine bugs (%ld out of %ld)",
			(double)sched->nr_state_machine_bugs/(double)sched->nr_timestamps*100.0,
			sched->nr_state_machine_bugs, sched->nr_timestamps);
		if (sched->nr_lost_events)
			printf(" (due to lost events?)");
		printf("\n");
	}
	if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
			(double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
			sched->nr_context_switch_bugs, sched->nr_timestamps);
		if (sched->nr_lost_events)
1605 1606 1607 1608 1609
			printf(" (due to lost events?)");
		printf("\n");
	}
}

static int perf_sched__lat(struct perf_sched *sched)
{
	struct rb_node *next;
	struct perf_session *session;

	setup_pager();
	if (perf_sched__read_events(sched, false, &session))
		return -1;
	perf_sched__sort_lat(sched);

	printf("\n ---------------------------------------------------------------------------------------------------------------\n");
	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at     |\n");
	printf(" ---------------------------------------------------------------------------------------------------------------\n");

	next = rb_first(&sched->sorted_atom_root);

	while (next) {
		struct work_atoms *work_list;

		work_list = rb_entry(next, struct work_atoms, node);
		output_lat_thread(sched, work_list);
		next = rb_next(next);
	}

	printf(" -----------------------------------------------------------------------------------------\n");
	printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
		(double)sched->all_runtime / 1e6, sched->all_count);

	printf(" ---------------------------------------------------\n");

	print_bad_events(sched);
	printf("\n");

	perf_session__delete(session);
	return 0;
}

static int perf_sched__map(struct perf_sched *sched)
{
	sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);

	setup_pager();
	if (perf_sched__read_events(sched, true, NULL))
		return -1;
	print_bad_events(sched);
	return 0;
}

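/*
 * Replay: calibrate, rebuild the task graph from the trace, then re-execute
 * it sched->replay_repeat times with one pthread per recorded task.
 */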
static int perf_sched__replay(struct perf_sched *sched)
{
	unsigned long i;

	calibrate_run_measurement_overhead(sched);
	calibrate_sleep_measurement_overhead(sched);

	test_calibrations(sched);

	if (perf_sched__read_events(sched, true, NULL))
		return -1;

	printf("nr_run_events:        %ld\n", sched->nr_run_events);
	printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
	printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);

	if (sched->targetless_wakeups)
		printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
	if (sched->multitarget_wakeups)
		printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
	if (sched->nr_run_events_optimized)
		printf("run atoms optimized: %ld\n",
			sched->nr_run_events_optimized);

	print_task_traces(sched);
	add_cross_task_wakeups(sched);

	create_tasks(sched);
	printf("------------------------------------------------------------\n");
	for (i = 0; i < sched->replay_repeat; i++)
		run_one_test(sched);

	return 0;
}

static void setup_sorting(struct perf_sched *sched, const struct option *options,
			  const char * const usage_msg[])
{
	char *tmp, *tok, *str = strdup(sched->sort_order);

	for (tok = strtok_r(str, ", ", &tmp);
			tok; tok = strtok_r(NULL, ", ", &tmp)) {
		if (sort_dimension__add(tok, &sched->sort_list) < 0) {
			error("Unknown --sort key: `%s'", tok);
			usage_with_options(usage_msg, options);
		}
	}

	free(str);

	sort_dimension__add("pid", &sched->cmp_pid);
}

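/*
 * 'perf sched record' just forwards to 'perf record' with the tracepoints
 * below pre-selected; a typical (hypothetical) session would be:
 *
 *	perf sched record -- sleep 1
 *	perf sched latency --sort max
 */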
static int __cmd_record(int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-a",
		"-R",
		"-f",
		"-m", "1024",
		"-c", "1",
		"-e", "sched:sched_switch",
		"-e", "sched:sched_stat_wait",
		"-e", "sched:sched_stat_sleep",
		"-e", "sched:sched_stat_iowait",
		"-e", "sched:sched_stat_runtime",
		"-e", "sched:sched_process_exit",
		"-e", "sched:sched_process_fork",
		"-e", "sched:sched_wakeup",
		"-e", "sched:sched_migrate_task",
	};

	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[i] = strdup(record_args[i]);

	for (j = 1; j < (unsigned int)argc; j++, i++)
		rec_argv[i] = argv[j];

	BUG_ON(i != rec_argc);

	return cmd_record(i, rec_argv, NULL);
}

int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
{
	const char default_sort_order[] = "avg, max, switch, runtime";
	struct perf_sched sched = {
		.tool = {
			.sample		 = perf_sched__process_tracepoint_sample,
			.comm		 = perf_event__process_comm,
			.lost		 = perf_event__process_lost,
			.fork		 = perf_event__process_task,
			.ordered_samples = true,
		},
		.cmp_pid	      = LIST_HEAD_INIT(sched.cmp_pid),
		.sort_list	      = LIST_HEAD_INIT(sched.sort_list),
		.start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
		.work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
		.curr_pid	      = { [0 ... MAX_CPUS - 1] = -1 },
		.sort_order	      = default_sort_order,
		.replay_repeat	      = 10,
		.profile_cpu	      = -1,
		.next_shortname1      = 'A',
		.next_shortname2      = '0',
	};
	const struct option latency_options[] = {
	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
		   "sort by key(s): runtime, switch, avg, max"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_INTEGER('C', "CPU", &sched.profile_cpu,
		    "CPU to profile on"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const struct option replay_options[] = {
	OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
		     "repeat the workload replay N times (-1: infinite)"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const struct option sched_options[] = {
	OPT_STRING('i', "input", &sched.input_name, "file",
		    "input file name"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const char * const latency_usage[] = {
		"perf sched latency [<options>]",
		NULL
	};
	const char * const replay_usage[] = {
		"perf sched replay [<options>]",
		NULL
	};
	const char * const sched_usage[] = {
		"perf sched [<options>] {record|latency|map|replay|script}",
		NULL
	};
	struct trace_sched_handler lat_ops  = {
		.wakeup_event	    = latency_wakeup_event,
		.switch_event	    = latency_switch_event,
		.runtime_event	    = latency_runtime_event,
		.fork_event	    = latency_fork_event,
		.migrate_task_event = latency_migrate_task_event,
	};
	struct trace_sched_handler map_ops  = {
		.switch_event	    = map_switch_event,
	};
	struct trace_sched_handler replay_ops  = {
		.wakeup_event	    = replay_wakeup_event,
		.switch_event	    = replay_switch_event,
		.fork_event	    = replay_fork_event,
	};

	argc = parse_options(argc, argv, sched_options, sched_usage,
			     PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc)
		usage_with_options(sched_usage, sched_options);

	/*
	 * Aliased to 'perf script' for now:
	 */
	if (!strcmp(argv[0], "script"))
		return cmd_script(argc, argv, prefix);

	symbol__init();
	if (!strncmp(argv[0], "rec", 3)) {
		return __cmd_record(argc, argv);
	} else if (!strncmp(argv[0], "lat", 3)) {
		sched.tp_handler = &lat_ops;
		if (argc > 1) {
			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
			if (argc)
				usage_with_options(latency_usage, latency_options);
		}
		setup_sorting(&sched, latency_options, latency_usage);
		return perf_sched__lat(&sched);
	} else if (!strcmp(argv[0], "map")) {
		sched.tp_handler = &map_ops;
		setup_sorting(&sched, latency_options, latency_usage);
		return perf_sched__map(&sched);
	} else if (!strncmp(argv[0], "rep", 3)) {
		sched.tp_handler = &replay_ops;
		if (argc) {
			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
			if (argc)
				usage_with_options(replay_usage, replay_options);
		}
		return perf_sched__replay(&sched);
	} else {
		usage_with_options(sched_usage, sched_options);
	}

	return 0;
}