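/*
 * builtin-sched.c: the 'perf sched' tool - record scheduler tracepoints and
 * report latencies ('latency'), per-CPU context-switch maps ('map'), or
 * replay the recorded workload ('replay').
 */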
#include "builtin.h"
#include "perf.h"

#include "util/util.h"
#include "util/evlist.h"
#include "util/cache.h"
#include "util/evsel.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"
#include "util/tool.h"

#include "util/parse-options.h"
#include "util/trace-event.h"

#include "util/debug.h"

#include <sys/prctl.h>
#include <sys/resource.h>

#include <semaphore.h>
#include <pthread.h>
#include <math.h>

#define PR_SET_NAME		15               /* Set process name */
#define MAX_CPUS		4096
#define COMM_LEN		20
#define SYM_LEN			129
#define MAX_PID			65536

struct sched_atom;

struct task_desc {
	unsigned long		nr;
	unsigned long		pid;
	char			comm[COMM_LEN];

	unsigned long		nr_events;
	unsigned long		curr_event;
	struct sched_atom	**atoms;

	pthread_t		thread;
	sem_t			sleep_sem;

	sem_t			ready_for_work;
	sem_t			work_done_sem;

	u64			cpu_usage;
};

enum sched_event_type {
	SCHED_EVENT_RUN,
	SCHED_EVENT_SLEEP,
	SCHED_EVENT_WAKEUP,
	SCHED_EVENT_MIGRATION,
};

struct sched_atom {
	enum sched_event_type	type;
	int			specific_wait;
	u64			timestamp;
	u64			duration;
	unsigned long		nr;
	sem_t			*wait_sem;
	struct task_desc	*wakee;
};

#define TASK_STATE_TO_CHAR_STR "RSDTtZX"

enum thread_state {
	THREAD_SLEEPING = 0,
	THREAD_WAIT_CPU,
	THREAD_SCHED_IN,
	THREAD_IGNORE
};

struct work_atom {
	struct list_head	list;
	enum thread_state	state;
	u64			sched_out_time;
	u64			wake_up_time;
	u64			sched_in_time;
	u64			runtime;
};

struct work_atoms {
	struct list_head	work_list;
	struct thread		*thread;
	struct rb_node		node;
	u64			max_lat;
	u64			max_lat_at;
	u64			total_lat;
	u64			nb_atoms;
	u64			total_runtime;
};

typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);

struct perf_sched;

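/*
 * One set of tracepoint callbacks per sub-command; cmd_sched() points
 * sched.tp_handler at lat_ops, map_ops or replay_ops below.
 */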
struct trace_sched_handler {
	int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);

	int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			     struct perf_sample *sample, struct machine *machine);

	int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);

	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
	int (*fork_event)(struct perf_sched *sched, union perf_event *event,
			  struct machine *machine);

	int (*migrate_task_event)(struct perf_sched *sched,
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);
};

struct perf_sched {
	struct perf_tool tool;
	const char	 *sort_order;
	unsigned long	 nr_tasks;
	struct task_desc *pid_to_task[MAX_PID];
	struct task_desc **tasks;
	const struct trace_sched_handler *tp_handler;
	pthread_mutex_t	 start_work_mutex;
	pthread_mutex_t	 work_done_wait_mutex;
	int		 profile_cpu;
/*
 * Track the current task - that way we can know whether there's any
 * weird events, such as a task being switched away that is not current.
 */
	int		 max_cpu;
	u32		 curr_pid[MAX_CPUS];
	struct thread	 *curr_thread[MAX_CPUS];
	char		 next_shortname1;
	char		 next_shortname2;
	unsigned int	 replay_repeat;
	unsigned long	 nr_run_events;
	unsigned long	 nr_sleep_events;
	unsigned long	 nr_wakeup_events;
	unsigned long	 nr_sleep_corrections;
	unsigned long	 nr_run_events_optimized;
	unsigned long	 targetless_wakeups;
	unsigned long	 multitarget_wakeups;
	unsigned long	 nr_runs;
	unsigned long	 nr_timestamps;
	unsigned long	 nr_unordered_timestamps;
	unsigned long	 nr_state_machine_bugs;
	unsigned long	 nr_context_switch_bugs;
	unsigned long	 nr_events;
	unsigned long	 nr_lost_chunks;
	unsigned long	 nr_lost_events;
	u64		 run_measurement_overhead;
	u64		 sleep_measurement_overhead;
	u64		 start_time;
	u64		 cpu_usage;
	u64		 runavg_cpu_usage;
	u64		 parent_cpu_usage;
	u64		 runavg_parent_cpu_usage;
	u64		 sum_runtime;
	u64		 sum_fluct;
	u64		 run_avg;
	u64		 all_runtime;
	u64		 all_count;
	u64		 cpu_last_switched[MAX_CPUS];
	struct rb_root	 atom_root, sorted_atom_root;
	struct list_head sort_list, cmp_pid;
};

static u64 get_nsecs(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);

	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

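/*
 * Spin for roughly 'nsecs' nanoseconds, compensating for the measured
 * overhead of the timing loop itself (sched->run_measurement_overhead).
 */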
static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
{
	u64 T0 = get_nsecs(), T1;

	do {
		T1 = get_nsecs();
	} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
}

static void sleep_nsecs(u64 nsecs)
{
	struct timespec ts;

	ts.tv_nsec = nsecs % 1000000000;
	ts.tv_sec = nsecs / 1000000000;

	nanosleep(&ts, NULL);
}

static void calibrate_run_measurement_overhead(struct perf_sched *sched)
{
	u64 T0, T1, delta, min_delta = 1000000000ULL;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		burn_nsecs(sched, 0);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	sched->run_measurement_overhead = min_delta;

	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
{
	u64 T0, T1, delta, min_delta = 1000000000ULL;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		sleep_nsecs(10000);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	min_delta -= 10000;
	sched->sleep_measurement_overhead = min_delta;

	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static struct sched_atom *
get_new_event(struct task_desc *task, u64 timestamp)
{
	struct sched_atom *event = zalloc(sizeof(*event));
	unsigned long idx = task->nr_events;
	size_t size;

	event->timestamp = timestamp;
	event->nr = idx;

	task->nr_events++;
	size = sizeof(struct sched_atom *) * task->nr_events;
	task->atoms = realloc(task->atoms, size);
	BUG_ON(!task->atoms);

	task->atoms[idx] = event;

	return event;
}

static struct sched_atom *last_event(struct task_desc *task)
{
	if (!task->nr_events)
		return NULL;

	return task->atoms[task->nr_events - 1];
}

static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
				u64 timestamp, u64 duration)
{
	struct sched_atom *event, *curr_event = last_event(task);

	/*
	 * optimize an existing RUN event by merging this one
	 * to it:
	 */
	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
		sched->nr_run_events_optimized++;
		curr_event->duration += duration;
		return;
	}

	event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_RUN;
	event->duration = duration;

	sched->nr_run_events++;
}

static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
				   u64 timestamp, struct task_desc *wakee)
{
	struct sched_atom *event, *wakee_event;

	event = get_new_event(task, timestamp);
	event->type = SCHED_EVENT_WAKEUP;
	event->wakee = wakee;

	wakee_event = last_event(wakee);
	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
		sched->targetless_wakeups++;
		return;
	}
	if (wakee_event->wait_sem) {
		sched->multitarget_wakeups++;
		return;
	}

	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
	sem_init(wakee_event->wait_sem, 0, 0);
	wakee_event->specific_wait = 1;
	event->wait_sem = wakee_event->wait_sem;

	sched->nr_wakeup_events++;
}

static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
				  u64 timestamp, u64 task_state __maybe_unused)
{
	struct sched_atom *event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_SLEEP;

	sched->nr_sleep_events++;
}

static struct task_desc *register_pid(struct perf_sched *sched,
				      unsigned long pid, const char *comm)
{
	struct task_desc *task;

	BUG_ON(pid >= MAX_PID);

	task = sched->pid_to_task[pid];

	if (task)
		return task;

	task = zalloc(sizeof(*task));
	task->pid = pid;
	task->nr = sched->nr_tasks;
	strcpy(task->comm, comm);
	/*
	 * every task starts in sleeping state - this gets ignored
	 * if there's no wakeup pointing to this sleep state:
	 */
	add_sched_event_sleep(sched, task, 0, 0);

	sched->pid_to_task[pid] = task;
	sched->nr_tasks++;
	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
	BUG_ON(!sched->tasks);
	sched->tasks[task->nr] = task;

	if (verbose)
		printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);

	return task;
}


static void print_task_traces(struct perf_sched *sched)
{
	struct task_desc *task;
	unsigned long i;

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
			task->nr, task->comm, task->pid, task->nr_events);
	}
}

static void add_cross_task_wakeups(struct perf_sched *sched)
{
	struct task_desc *task1, *task2;
	unsigned long i, j;

	for (i = 0; i < sched->nr_tasks; i++) {
		task1 = sched->tasks[i];
		j = i + 1;
		if (j == sched->nr_tasks)
			j = 0;
		task2 = sched->tasks[j];
		add_sched_event_wakeup(sched, task1, 0, task2);
	}
}

static void perf_sched__process_event(struct perf_sched *sched,
				      struct sched_atom *atom)
{
	int ret = 0;

	switch (atom->type) {
		case SCHED_EVENT_RUN:
			burn_nsecs(sched, atom->duration);
			break;
		case SCHED_EVENT_SLEEP:
			if (atom->wait_sem)
				ret = sem_wait(atom->wait_sem);
			BUG_ON(ret);
			break;
		case SCHED_EVENT_WAKEUP:
			if (atom->wait_sem)
				ret = sem_post(atom->wait_sem);
			BUG_ON(ret);
			break;
		case SCHED_EVENT_MIGRATION:
			break;
		default:
			BUG_ON(1);
	}
}

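/*
 * The parent's CPU time comes from getrusage(); the replay threads measure
 * their own CPU time with the task-clock counters opened below.
 */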
static u64 get_cpu_usage_nsec_parent(void)
{
	struct rusage ru;
	u64 sum;
	int err;

	err = getrusage(RUSAGE_SELF, &ru);
	BUG_ON(err);

	sum =  ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
	sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;

	return sum;
}

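/*
 * Open a software task-clock counter on the calling thread, so that
 * get_cpu_usage_nsec_self() can read its CPU time in nanoseconds.
 */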
static int self_open_counters(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));

	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;

	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);

	if (fd < 0)
		pr_err("Error: sys_perf_event_open() syscall returned "
		       "with %d (%s)\n", fd, strerror(errno));
	return fd;
}

static u64 get_cpu_usage_nsec_self(int fd)
{
	u64 runtime;
	int ret;

	ret = read(fd, &runtime, sizeof(runtime));
	BUG_ON(ret != sizeof(runtime));

	return runtime;
}

struct sched_thread_parms {
	struct task_desc  *task;
	struct perf_sched *sched;
};

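/*
 * Body of one replay thread: wait for the start mutex, replay this task's
 * recorded atoms, report the consumed CPU time, then wait for the next run.
 */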
static void *thread_func(void *ctx)
{
	struct sched_thread_parms *parms = ctx;
	struct task_desc *this_task = parms->task;
	struct perf_sched *sched = parms->sched;
	u64 cpu_usage_0, cpu_usage_1;
	unsigned long i, ret;
	char comm2[22];
	int fd;

	free(parms);

	sprintf(comm2, ":%s", this_task->comm);
	prctl(PR_SET_NAME, comm2);
	fd = self_open_counters();
	if (fd < 0)
		return NULL;
again:
	ret = sem_post(&this_task->ready_for_work);
	BUG_ON(ret);
	ret = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&sched->start_work_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_self(fd);

	for (i = 0; i < this_task->nr_events; i++) {
		this_task->curr_event = i;
		perf_sched__process_event(sched, this_task->atoms[i]);
	}

	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
	ret = sem_post(&this_task->work_done_sem);
	BUG_ON(ret);

	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
	BUG_ON(ret);

	goto again;
}

static void create_tasks(struct perf_sched *sched)
{
	struct task_desc *task;
	pthread_attr_t attr;
	unsigned long i;
	int err;

	err = pthread_attr_init(&attr);
	BUG_ON(err);
	err = pthread_attr_setstacksize(&attr,
			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
	BUG_ON(err);
	err = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(err);
	err = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(err);
	for (i = 0; i < sched->nr_tasks; i++) {
		struct sched_thread_parms *parms = malloc(sizeof(*parms));
		BUG_ON(parms == NULL);
		parms->task = task = sched->tasks[i];
		parms->sched = sched;
		sem_init(&task->sleep_sem, 0, 0);
		sem_init(&task->ready_for_work, 0, 0);
		sem_init(&task->work_done_sem, 0, 0);
		task->curr_event = 0;
		err = pthread_create(&task->thread, &attr, thread_func, parms);
		BUG_ON(err);
	}
}

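/*
 * Run one replay iteration: release all threads, wait for them to finish,
 * and fold their CPU usage into the running averages printed per run.
 */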
static void wait_for_tasks(struct perf_sched *sched)
{
	u64 cpu_usage_0, cpu_usage_1;
	struct task_desc *task;
	unsigned long i, ret;

	sched->start_time = get_nsecs();
	sched->cpu_usage = 0;
	pthread_mutex_unlock(&sched->work_done_wait_mutex);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		ret = sem_wait(&task->ready_for_work);
		BUG_ON(ret);
		sem_init(&task->ready_for_work, 0, 0);
	}
	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_parent();

	pthread_mutex_unlock(&sched->start_work_mutex);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		ret = sem_wait(&task->work_done_sem);
		BUG_ON(ret);
		sem_init(&task->work_done_sem, 0, 0);
		sched->cpu_usage += task->cpu_usage;
		task->cpu_usage = 0;
	}

	cpu_usage_1 = get_cpu_usage_nsec_parent();
	if (!sched->runavg_cpu_usage)
		sched->runavg_cpu_usage = sched->cpu_usage;
	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * 9 + sched->cpu_usage) / 10;

	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
	if (!sched->runavg_parent_cpu_usage)
		sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * 9 +
					 sched->parent_cpu_usage)/10;

	ret = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(ret);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		sem_init(&task->sleep_sem, 0, 0);
		task->curr_event = 0;
	}
}

static void run_one_test(struct perf_sched *sched)
{
	u64 T0, T1, delta, avg_delta, fluct;

	T0 = get_nsecs();
	wait_for_tasks(sched);
	T1 = get_nsecs();

	delta = T1 - T0;
	sched->sum_runtime += delta;
	sched->nr_runs++;

	avg_delta = sched->sum_runtime / sched->nr_runs;
	if (delta < avg_delta)
		fluct = avg_delta - delta;
	else
		fluct = delta - avg_delta;
	sched->sum_fluct += fluct;
	if (!sched->run_avg)
		sched->run_avg = delta;
	sched->run_avg = (sched->run_avg * 9 + delta) / 10;

	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);

	printf("ravg: %0.2f, ", (double)sched->run_avg / 1e6);

	printf("cpu: %0.2f / %0.2f",
		(double)sched->cpu_usage / 1e6, (double)sched->runavg_cpu_usage / 1e6);

#if 0
	/*
	 * rusage statistics done by the parent, these are less
	 * accurate than the sched->sum_exec_runtime based statistics:
	 */
	printf(" [%0.2f / %0.2f]",
		(double)sched->parent_cpu_usage/1e6,
		(double)sched->runavg_parent_cpu_usage/1e6);
#endif

	printf("\n");

	if (sched->nr_sleep_corrections)
		printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
	sched->nr_sleep_corrections = 0;
}

static void test_calibrations(struct perf_sched *sched)
{
	u64 T0, T1;

	T0 = get_nsecs();
	burn_nsecs(sched, 1e6);
	T1 = get_nsecs();

	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);

	T0 = get_nsecs();
	sleep_nsecs(1e6);
	T1 = get_nsecs();

	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
}

static int
replay_wakeup_event(struct perf_sched *sched,
		    struct perf_evsel *evsel, struct perf_sample *sample,
		    struct machine *machine __maybe_unused)
{
	const char *comm = perf_evsel__strval(evsel, sample, "comm");
	const u32 pid	 = perf_evsel__intval(evsel, sample, "pid");
	struct task_desc *waker, *wakee;

	if (verbose) {
		printf("sched_wakeup event %p\n", evsel);

		printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
	}

	waker = register_pid(sched, sample->tid, "<unknown>");
	wakee = register_pid(sched, pid, comm);

	add_sched_event_wakeup(sched, waker, sample->time, wakee);
	return 0;
}

static int replay_switch_event(struct perf_sched *sched,
			       struct perf_evsel *evsel,
			       struct perf_sample *sample,
			       struct machine *machine __maybe_unused)
{
	const char *prev_comm  = perf_evsel__strval(evsel, sample, "prev_comm"),
		   *next_comm  = perf_evsel__strval(evsel, sample, "next_comm");
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
	struct task_desc *prev, __maybe_unused *next;
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu;
	s64 delta;

	if (verbose)
		printf("sched_switch event %p\n", evsel);

	if (cpu >= MAX_CPUS || cpu < 0)
		return 0;

	timestamp0 = sched->cpu_last_switched[cpu];
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0) {
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
		return -1;
	}

	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
		 prev_comm, prev_pid, next_comm, next_pid, delta);

	prev = register_pid(sched, prev_pid, prev_comm);
	next = register_pid(sched, next_pid, next_comm);

	sched->cpu_last_switched[cpu] = timestamp;

	add_sched_event_run(sched, prev, timestamp, delta);
	add_sched_event_sleep(sched, prev, timestamp, prev_state);

	return 0;
}

static int replay_fork_event(struct perf_sched *sched,
			     union perf_event *event,
			     struct machine *machine)
{
	struct thread *child, *parent;

	child = machine__findnew_thread(machine, event->fork.pid,
					event->fork.tid);
	parent = machine__findnew_thread(machine, event->fork.ppid,
					 event->fork.ptid);

	if (child == NULL || parent == NULL) {
		pr_debug("thread does not exist on fork event: child %p, parent %p\n",
				 child, parent);
		return 0;
	}

	if (verbose) {
		printf("fork event\n");
		printf("... parent: %s/%d\n", parent->comm, parent->tid);
		printf("...  child: %s/%d\n", child->comm, child->tid);
	}

	register_pid(sched, parent->tid, parent->comm);
	register_pid(sched, child->tid, child->comm);
	return 0;
}

struct sort_dimension {
	const char		*name;
	sort_fn_t		cmp;
	struct list_head	list;
};

static int
thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
{
	struct sort_dimension *sort;
	int ret = 0;

	BUG_ON(list_empty(list));

	list_for_each_entry(sort, list, list) {
		ret = sort->cmp(l, r);
		if (ret)
			return ret;
	}

	return ret;
}

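/*
 * Per-thread work_atoms live in an rb-tree keyed by the active sort list;
 * thread_atoms_search() and __thread_latency_insert() share thread_lat_cmp().
 */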
static struct work_atoms *
thread_atoms_search(struct rb_root *root, struct thread *thread,
			 struct list_head *sort_list)
{
	struct rb_node *node = root->rb_node;
	struct work_atoms key = { .thread = thread };

	while (node) {
		struct work_atoms *atoms;
		int cmp;

		atoms = container_of(node, struct work_atoms, node);

		cmp = thread_lat_cmp(sort_list, &key, atoms);
		if (cmp > 0)
			node = node->rb_left;
		else if (cmp < 0)
			node = node->rb_right;
		else {
			BUG_ON(thread != atoms->thread);
			return atoms;
		}
	}
	return NULL;
}

static void
__thread_latency_insert(struct rb_root *root, struct work_atoms *data,
			 struct list_head *sort_list)
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;

	while (*new) {
		struct work_atoms *this;
		int cmp;

		this = container_of(*new, struct work_atoms, node);
		parent = *new;

		cmp = thread_lat_cmp(sort_list, data, this);

		if (cmp > 0)
			new = &((*new)->rb_left);
		else
			new = &((*new)->rb_right);
	}

	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
}

static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
{
	struct work_atoms *atoms = zalloc(sizeof(*atoms));
	if (!atoms) {
		pr_err("No memory at %s\n", __func__);
		return -1;
	}

	atoms->thread = thread;
	INIT_LIST_HEAD(&atoms->work_list);
	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
	return 0;
}

static char sched_out_state(u64 prev_state)
{
	const char *str = TASK_STATE_TO_CHAR_STR;

	return str[prev_state];
}

static int
add_sched_out_event(struct work_atoms *atoms,
		    char run_state,
		    u64 timestamp)
{
	struct work_atom *atom = zalloc(sizeof(*atom));
	if (!atom) {
		pr_err("No memory at %s", __func__);
		return -1;
	}

	atom->sched_out_time = timestamp;

	if (run_state == 'R') {
		atom->state = THREAD_WAIT_CPU;
		atom->wake_up_time = atom->sched_out_time;
	}

	list_add_tail(&atom->list, &atoms->work_list);
	return 0;
}

static void
add_runtime_event(struct work_atoms *atoms, u64 delta,
		  u64 timestamp __maybe_unused)
{
	struct work_atom *atom;

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	atom->runtime += delta;
	atoms->total_runtime += delta;
}

static void
add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
{
	struct work_atom *atom;
	u64 delta;

	if (list_empty(&atoms->work_list))
		return;

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	if (atom->state != THREAD_WAIT_CPU)
		return;

	if (timestamp < atom->wake_up_time) {
		atom->state = THREAD_IGNORE;
		return;
	}

	atom->state = THREAD_SCHED_IN;
	atom->sched_in_time = timestamp;

	delta = atom->sched_in_time - atom->wake_up_time;
	atoms->total_lat += delta;
	if (delta > atoms->max_lat) {
		atoms->max_lat = delta;
		atoms->max_lat_at = timestamp;
	}
	atoms->nb_atoms++;
}

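/*
 * Latency accounting: each atom records sched-out, wake-up and sched-in
 * times; the delay reported by 'perf sched latency' is
 * sched_in_time - wake_up_time.
 */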
static int latency_switch_event(struct perf_sched *sched,
				struct perf_evsel *evsel,
				struct perf_sample *sample,
				struct machine *machine)
{
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
	struct work_atoms *out_events, *in_events;
	struct thread *sched_out, *sched_in;
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu;
	s64 delta;

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);

	timestamp0 = sched->cpu_last_switched[cpu];
	sched->cpu_last_switched[cpu] = timestamp;
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0) {
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
		return -1;
	}

	sched_out = machine__findnew_thread(machine, 0, prev_pid);
	sched_in = machine__findnew_thread(machine, 0, next_pid);

	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
	if (!out_events) {
		if (thread_atoms_insert(sched, sched_out))
			return -1;
		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
		if (!out_events) {
			pr_err("out-event: Internal tree error");
			return -1;
		}
	}
	if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
		return -1;

	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
	if (!in_events) {
		if (thread_atoms_insert(sched, sched_in))
			return -1;
		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
		if (!in_events) {
			pr_err("in-event: Internal tree error");
			return -1;
		}
		/*
		 * Task came in we have not heard about yet,
		 * add in an initial atom in runnable state:
		 */
		if (add_sched_out_event(in_events, 'R', timestamp))
			return -1;
	}
	add_sched_in_event(in_events, timestamp);

	return 0;
}

static int latency_runtime_event(struct perf_sched *sched,
				 struct perf_evsel *evsel,
				 struct perf_sample *sample,
				 struct machine *machine)
{
	const u32 pid	   = perf_evsel__intval(evsel, sample, "pid");
	const u64 runtime  = perf_evsel__intval(evsel, sample, "runtime");
	struct thread *thread = machine__findnew_thread(machine, 0, pid);
	struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
	u64 timestamp = sample->time;
	int cpu = sample->cpu;

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
	if (!atoms) {
		if (thread_atoms_insert(sched, thread))
			return -1;
		atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
		if (!atoms) {
			pr_err("in-event: Internal tree error");
			return -1;
		}
		if (add_sched_out_event(atoms, 'R', timestamp))
			return -1;
	}

	add_runtime_event(atoms, runtime, timestamp);
	return 0;
}

static int latency_wakeup_event(struct perf_sched *sched,
				struct perf_evsel *evsel,
				struct perf_sample *sample,
				struct machine *machine)
{
	const u32 pid	  = perf_evsel__intval(evsel, sample, "pid"),
		  success = perf_evsel__intval(evsel, sample, "success");
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *wakee;
	u64 timestamp = sample->time;

	/* Note for later, it may be interesting to observe the failing cases */
	if (!success)
		return 0;

	wakee = machine__findnew_thread(machine, 0, pid);
	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
	if (!atoms) {
		if (thread_atoms_insert(sched, wakee))
			return -1;
		atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
		if (!atoms) {
			pr_err("wakeup-event: Internal tree error");
			return -1;
		}
		if (add_sched_out_event(atoms, 'S', timestamp))
			return -1;
	}

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	/*
	 * You WILL be missing events if you've recorded only
	 * one CPU, or are only looking at only one, so don't
	 * make useless noise.
	 */
	if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
		sched->nr_state_machine_bugs++;

	sched->nr_timestamps++;
	if (atom->sched_out_time > timestamp) {
		sched->nr_unordered_timestamps++;
		return 0;
	}

	atom->state = THREAD_WAIT_CPU;
	atom->wake_up_time = timestamp;
	return 0;
}

static int latency_migrate_task_event(struct perf_sched *sched,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
{
	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
	u64 timestamp = sample->time;
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *migrant;

	/*
	 * Only need to worry about migration when profiling one CPU.
	 */
	if (sched->profile_cpu == -1)
		return 0;

	migrant = machine__findnew_thread(machine, 0, pid);
	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
	if (!atoms) {
		if (thread_atoms_insert(sched, migrant))
			return -1;
		register_pid(sched, migrant->tid, migrant->comm);
		atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
		if (!atoms) {
			pr_err("migration-event: Internal tree error");
			return -1;
		}
		if (add_sched_out_event(atoms, 'R', timestamp))
			return -1;
	}

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;

	sched->nr_timestamps++;

	if (atom->sched_out_time > timestamp)
		sched->nr_unordered_timestamps++;

	return 0;
}

static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
{
	int i;
	int ret;
	u64 avg;

	if (!work_list->nb_atoms)
		return;
	/*
	 * Ignore idle threads:
	 */
	if (!strcmp(work_list->thread->comm, "swapper"))
		return;

	sched->all_runtime += work_list->total_runtime;
	sched->all_count   += work_list->nb_atoms;

	ret = printf("  %s:%d ", work_list->thread->comm, work_list->thread->tid);

	for (i = 0; i < 24 - ret; i++)
		printf(" ");

	avg = work_list->total_lat / work_list->nb_atoms;

	printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n",
	      (double)work_list->total_runtime / 1e6,
		 work_list->nb_atoms, (double)avg / 1e6,
		 (double)work_list->max_lat / 1e6,
		 (double)work_list->max_lat_at / 1e9);
}

static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->thread->tid < r->thread->tid)
		return -1;
	if (l->thread->tid > r->thread->tid)
		return 1;

	return 0;
}

static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
{
	u64 avgl, avgr;

	if (!l->nb_atoms)
		return -1;

	if (!r->nb_atoms)
		return 1;

	avgl = l->total_lat / l->nb_atoms;
	avgr = r->total_lat / r->nb_atoms;

	if (avgl < avgr)
		return -1;
	if (avgl > avgr)
		return 1;

	return 0;
}

static int max_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->max_lat < r->max_lat)
		return -1;
	if (l->max_lat > r->max_lat)
		return 1;

	return 0;
}

static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->nb_atoms < r->nb_atoms)
		return -1;
	if (l->nb_atoms > r->nb_atoms)
		return 1;

	return 0;
}

static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->total_runtime < r->total_runtime)
		return -1;
	if (l->total_runtime > r->total_runtime)
		return 1;

	return 0;
}

static int sort_dimension__add(const char *tok, struct list_head *list)
{
	size_t i;
	static struct sort_dimension avg_sort_dimension = {
		.name = "avg",
		.cmp  = avg_cmp,
	};
	static struct sort_dimension max_sort_dimension = {
		.name = "max",
		.cmp  = max_cmp,
	};
	static struct sort_dimension pid_sort_dimension = {
		.name = "pid",
		.cmp  = pid_cmp,
	};
	static struct sort_dimension runtime_sort_dimension = {
		.name = "runtime",
		.cmp  = runtime_cmp,
	};
	static struct sort_dimension switch_sort_dimension = {
		.name = "switch",
		.cmp  = switch_cmp,
	};
	struct sort_dimension *available_sorts[] = {
		&pid_sort_dimension,
		&avg_sort_dimension,
		&max_sort_dimension,
		&switch_sort_dimension,
		&runtime_sort_dimension,
	};

	for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
		if (!strcmp(available_sorts[i]->name, tok)) {
			list_add_tail(&available_sorts[i]->list, list);

			return 0;
		}
	}

	return -1;
}

static void perf_sched__sort_lat(struct perf_sched *sched)
{
	struct rb_node *node;

	for (;;) {
		struct work_atoms *data;
		node = rb_first(&sched->atom_root);
		if (!node)
			break;

		rb_erase(node, &sched->atom_root);
		data = rb_entry(node, struct work_atoms, node);
		__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
	}
}

static int process_sched_wakeup_event(struct perf_tool *tool,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	if (sched->tp_handler->wakeup_event)
		return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);

	return 0;
}

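/*
 * 'perf sched map' prints one column per CPU, a two-letter shortname per
 * task, and marks the CPU where the switch happened with '*'.
 */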
static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine)
{
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	struct thread *sched_out __maybe_unused, *sched_in;
	int new_shortname;
	u64 timestamp0, timestamp = sample->time;
	s64 delta;
	int cpu, this_cpu = sample->cpu;

	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);

	if (this_cpu > sched->max_cpu)
		sched->max_cpu = this_cpu;

	timestamp0 = sched->cpu_last_switched[this_cpu];
	sched->cpu_last_switched[this_cpu] = timestamp;
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0) {
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
		return -1;
	}

	sched_out = machine__findnew_thread(machine, 0, prev_pid);
	sched_in = machine__findnew_thread(machine, 0, next_pid);

	sched->curr_thread[this_cpu] = sched_in;

	printf("  ");

	new_shortname = 0;
	if (!sched_in->shortname[0]) {
		sched_in->shortname[0] = sched->next_shortname1;
		sched_in->shortname[1] = sched->next_shortname2;

		if (sched->next_shortname1 < 'Z') {
			sched->next_shortname1++;
		} else {
			sched->next_shortname1='A';
			if (sched->next_shortname2 < '9') {
				sched->next_shortname2++;
			} else {
				sched->next_shortname2='0';
			}
		}
		new_shortname = 1;
	}

	for (cpu = 0; cpu <= sched->max_cpu; cpu++) {
		if (cpu != this_cpu)
			printf(" ");
		else
			printf("*");

		if (sched->curr_thread[cpu]) {
			if (sched->curr_thread[cpu]->tid)
				printf("%2s ", sched->curr_thread[cpu]->shortname);
			else
				printf(".  ");
		} else
			printf("   ");
	}

	printf("  %12.6f secs ", (double)timestamp/1e9);
	if (new_shortname) {
		printf("%s => %s:%d\n",
			sched_in->shortname, sched_in->comm, sched_in->tid);
	} else {
		printf("\n");
	}

	return 0;
}

static int process_sched_switch_event(struct perf_tool *tool,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
	int this_cpu = sample->cpu, err = 0;
	u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
	    next_pid = perf_evsel__intval(evsel, sample, "next_pid");

	if (sched->curr_pid[this_cpu] != (u32)-1) {
		/*
		 * Are we trying to switch away a PID that is
		 * not current?
		 */
		if (sched->curr_pid[this_cpu] != prev_pid)
			sched->nr_context_switch_bugs++;
	}

	if (sched->tp_handler->switch_event)
		err = sched->tp_handler->switch_event(sched, evsel, sample, machine);

	sched->curr_pid[this_cpu] = next_pid;
	return err;
}

static int process_sched_runtime_event(struct perf_tool *tool,
				       struct perf_evsel *evsel,
				       struct perf_sample *sample,
				       struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	if (sched->tp_handler->runtime_event)
		return sched->tp_handler->runtime_event(sched, evsel, sample, machine);

	return 0;
}

static int perf_sched__process_fork_event(struct perf_tool *tool,
					  union perf_event *event,
					  struct perf_sample *sample,
					  struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	/* run the fork event through the perf machinery */
	perf_event__process_fork(tool, event, sample, machine);

	/* and then run additional processing needed for this command */
	if (sched->tp_handler->fork_event)
		return sched->tp_handler->fork_event(sched, event, machine);

	return 0;
}

static int process_sched_migrate_task_event(struct perf_tool *tool,
					    struct perf_evsel *evsel,
					    struct perf_sample *sample,
					    struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	if (sched->tp_handler->migrate_task_event)
		return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);

	return 0;
}

typedef int (*tracepoint_handler)(struct perf_tool *tool,
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);

static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
						 union perf_event *event __maybe_unused,
						 struct perf_sample *sample,
						 struct perf_evsel *evsel,
						 struct machine *machine)
{
	int err = 0;

	evsel->hists.stats.total_period += sample->period;
	hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);

	if (evsel->handler.func != NULL) {
		tracepoint_handler f = evsel->handler.func;
		err = f(tool, evsel, sample, machine);
	}

	return err;
}

static int perf_sched__read_events(struct perf_sched *sched,
				   struct perf_session **psession)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "sched:sched_switch",	      process_sched_switch_event, },
		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
		{ "sched:sched_wakeup",	      process_sched_wakeup_event, },
		{ "sched:sched_wakeup_new",   process_sched_wakeup_event, },
		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
	};
	struct perf_session *session;

	session = perf_session__new(input_name, O_RDONLY, 0, false, &sched->tool);
	if (session == NULL) {
		pr_debug("No Memory for session\n");
		return -1;
	}

	if (perf_session__set_tracepoints_handlers(session, handlers))
		goto out_delete;

	if (perf_session__has_traces(session, "record -R")) {
		int err = perf_session__process_events(session, &sched->tool);
		if (err) {
			pr_err("Failed to process events, error %d", err);
			goto out_delete;
		}

		sched->nr_events      = session->stats.nr_events[0];
		sched->nr_lost_events = session->stats.total_lost;
		sched->nr_lost_chunks = session->stats.nr_events[PERF_RECORD_LOST];
	}

	if (psession)
		*psession = session;
	else
		perf_session__delete(session);

	return 0;

out_delete:
	perf_session__delete(session);
	return -1;
}

static void print_bad_events(struct perf_sched *sched)
{
	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
			(double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
			sched->nr_unordered_timestamps, sched->nr_timestamps);
	}
	if (sched->nr_lost_events && sched->nr_events) {
		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
			(double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
			sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
	}
	if (sched->nr_state_machine_bugs && sched->nr_timestamps) {
		printf("  INFO: %.3f%% state machine bugs (%ld out of %ld)",
			(double)sched->nr_state_machine_bugs/(double)sched->nr_timestamps*100.0,
			sched->nr_state_machine_bugs, sched->nr_timestamps);
		if (sched->nr_lost_events)
			printf(" (due to lost events?)");
		printf("\n");
	}
	if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
			(double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
			sched->nr_context_switch_bugs, sched->nr_timestamps);
		if (sched->nr_lost_events)
			printf(" (due to lost events?)");
		printf("\n");
	}
}

static int perf_sched__lat(struct perf_sched *sched)
{
	struct rb_node *next;
	struct perf_session *session;

	setup_pager();

	/* save session -- references to threads are held in work_list */
	if (perf_sched__read_events(sched, &session))
		return -1;

	perf_sched__sort_lat(sched);

	printf("\n ---------------------------------------------------------------------------------------------------------------\n");
	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at     |\n");
	printf(" ---------------------------------------------------------------------------------------------------------------\n");

	next = rb_first(&sched->sorted_atom_root);

	while (next) {
		struct work_atoms *work_list;

		work_list = rb_entry(next, struct work_atoms, node);
		output_lat_thread(sched, work_list);
		next = rb_next(next);
	}

	printf(" -----------------------------------------------------------------------------------------\n");
	printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
		(double)sched->all_runtime / 1e6, sched->all_count);

	printf(" ---------------------------------------------------\n");

	print_bad_events(sched);
	printf("\n");

	perf_session__delete(session);
	return 0;
}

static int perf_sched__map(struct perf_sched *sched)
{
	sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);

	setup_pager();
	if (perf_sched__read_events(sched, NULL))
		return -1;
	print_bad_events(sched);
	return 0;
}

static int perf_sched__replay(struct perf_sched *sched)
{
	unsigned long i;

	calibrate_run_measurement_overhead(sched);
	calibrate_sleep_measurement_overhead(sched);

	test_calibrations(sched);

	if (perf_sched__read_events(sched, NULL))
		return -1;

	printf("nr_run_events:        %ld\n", sched->nr_run_events);
	printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
	printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);

	if (sched->targetless_wakeups)
		printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
	if (sched->multitarget_wakeups)
		printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
	if (sched->nr_run_events_optimized)
		printf("run atoms optimized: %ld\n",
			sched->nr_run_events_optimized);

	print_task_traces(sched);
	add_cross_task_wakeups(sched);

	create_tasks(sched);
	printf("------------------------------------------------------------\n");
	for (i = 0; i < sched->replay_repeat; i++)
		run_one_test(sched);

	return 0;
}

static void setup_sorting(struct perf_sched *sched, const struct option *options,
			  const char * const usage_msg[])
{
	char *tmp, *tok, *str = strdup(sched->sort_order);

	for (tok = strtok_r(str, ", ", &tmp);
			tok; tok = strtok_r(NULL, ", ", &tmp)) {
		if (sort_dimension__add(tok, &sched->sort_list) < 0) {
			error("Unknown --sort key: `%s'", tok);
			usage_with_options(usage_msg, options);
		}
	}

	free(str);

	sort_dimension__add("pid", &sched->cmp_pid);
}

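/*
 * 'perf sched record' simply forwards to 'perf record' with the scheduler
 * tracepoints in record_args[] added to the command line.
 */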
static int __cmd_record(int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-a",
		"-R",
		"-m", "1024",
		"-c", "1",
		"-e", "sched:sched_switch",
		"-e", "sched:sched_stat_wait",
		"-e", "sched:sched_stat_sleep",
		"-e", "sched:sched_stat_iowait",
		"-e", "sched:sched_stat_runtime",
		"-e", "sched:sched_process_fork",
		"-e", "sched:sched_wakeup",
		"-e", "sched:sched_migrate_task",
	};

	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[i] = strdup(record_args[i]);

	for (j = 1; j < (unsigned int)argc; j++, i++)
		rec_argv[i] = argv[j];

	BUG_ON(i != rec_argc);

	return cmd_record(i, rec_argv, NULL);
}

static const char default_sort_order[] = "avg, max, switch, runtime";
static struct perf_sched sched = {
	.tool = {
		.sample		 = perf_sched__process_tracepoint_sample,
		.comm		 = perf_event__process_comm,
		.lost		 = perf_event__process_lost,
		.fork		 = perf_sched__process_fork_event,
		.ordered_samples = true,
	},
	.cmp_pid	      = LIST_HEAD_INIT(sched.cmp_pid),
	.sort_list	      = LIST_HEAD_INIT(sched.sort_list),
	.start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
	.work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
	.curr_pid	      = { [0 ... MAX_CPUS - 1] = -1 },
	.sort_order	      = default_sort_order,
	.replay_repeat	      = 10,
	.profile_cpu	      = -1,
	.next_shortname1      = 'A',
	.next_shortname2      = '0',
};

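/*
 * Typical usage of the sub-commands handled below:
 *
 *	perf sched record -- sleep 1
 *	perf sched latency --sort max
 *	perf sched map
 *	perf sched replay -r 10
 */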
int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
{
	const struct option latency_options[] = {
	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
		   "sort by key(s): runtime, switch, avg, max"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_INTEGER('C', "CPU", &sched.profile_cpu,
		    "CPU to profile on"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const struct option replay_options[] = {
	OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
		     "repeat the workload replay N times (-1: infinite)"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const struct option sched_options[] = {
	OPT_STRING('i', "input", &input_name, "file",
		    "input file name"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const char * const latency_usage[] = {
		"perf sched latency [<options>]",
		NULL
	};
	const char * const replay_usage[] = {
		"perf sched replay [<options>]",
		NULL
	};
	const char * const sched_usage[] = {
		"perf sched [<options>] {record|latency|map|replay|script}",
		NULL
	};
	struct trace_sched_handler lat_ops  = {
		.wakeup_event	    = latency_wakeup_event,
		.switch_event	    = latency_switch_event,
		.runtime_event	    = latency_runtime_event,
		.migrate_task_event = latency_migrate_task_event,
	};
	struct trace_sched_handler map_ops  = {
		.switch_event	    = map_switch_event,
	};
	struct trace_sched_handler replay_ops  = {
		.wakeup_event	    = replay_wakeup_event,
		.switch_event	    = replay_switch_event,
		.fork_event	    = replay_fork_event,
	};

	argc = parse_options(argc, argv, sched_options, sched_usage,
			     PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc)
		usage_with_options(sched_usage, sched_options);

	/*
	 * Aliased to 'perf script' for now:
	 */
	if (!strcmp(argv[0], "script"))
		return cmd_script(argc, argv, prefix);

	symbol__init();
	if (!strncmp(argv[0], "rec", 3)) {
		return __cmd_record(argc, argv);
	} else if (!strncmp(argv[0], "lat", 3)) {
		sched.tp_handler = &lat_ops;
		if (argc > 1) {
			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
			if (argc)
				usage_with_options(latency_usage, latency_options);
		}
		setup_sorting(&sched, latency_options, latency_usage);
		return perf_sched__lat(&sched);
	} else if (!strcmp(argv[0], "map")) {
		sched.tp_handler = &map_ops;
		setup_sorting(&sched, latency_options, latency_usage);
		return perf_sched__map(&sched);
	} else if (!strncmp(argv[0], "rep", 3)) {
		sched.tp_handler = &replay_ops;
		if (argc) {
			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
			if (argc)
				usage_with_options(replay_usage, replay_options);
		}
		return perf_sched__replay(&sched);
	} else {
		usage_with_options(sched_usage, sched_options);
	}

I
	return 0;
}