builtin-sched.c 43.8 KB
Newer Older
I
Ingo Molnar 已提交
1
#include "builtin.h"
2
#include "perf.h"
I
Ingo Molnar 已提交
3 4

#include "util/util.h"
5
#include "util/evlist.h"
I
Ingo Molnar 已提交
6
#include "util/cache.h"
7
#include "util/evsel.h"
I
Ingo Molnar 已提交
8 9 10
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
11
#include "util/session.h"
12
#include "util/tool.h"
13
#include "util/cloexec.h"
I
Ingo Molnar 已提交
14 15

#include "util/parse-options.h"
16
#include "util/trace-event.h"
I
Ingo Molnar 已提交
17 18 19

#include "util/debug.h"

20
#include <sys/prctl.h>
21
#include <sys/resource.h>
I
Ingo Molnar 已提交
22

23 24 25
#include <semaphore.h>
#include <pthread.h>
#include <math.h>
26

27 28 29 30
#define PR_SET_NAME		15               /* Set process name */
#define MAX_CPUS		4096
#define COMM_LEN		20
#define SYM_LEN			129
31
#define MAX_PID			1024000
I
Ingo Molnar 已提交
32

33
struct sched_atom;
I
Ingo Molnar 已提交
34

35 36 37 38
struct task_desc {
	unsigned long		nr;
	unsigned long		pid;
	char			comm[COMM_LEN];
I
Ingo Molnar 已提交
39

40 41
	unsigned long		nr_events;
	unsigned long		curr_event;
42
	struct sched_atom	**atoms;
43 44 45

	pthread_t		thread;
	sem_t			sleep_sem;
I
Ingo Molnar 已提交
46

47 48 49 50 51 52 53 54 55 56
	sem_t			ready_for_work;
	sem_t			work_done_sem;

	u64			cpu_usage;
};

enum sched_event_type {
	SCHED_EVENT_RUN,
	SCHED_EVENT_SLEEP,
	SCHED_EVENT_WAKEUP,
57
	SCHED_EVENT_MIGRATION,
58 59
};

60
struct sched_atom {
61
	enum sched_event_type	type;
62
	int			specific_wait;
63 64 65 66 67 68 69
	u64			timestamp;
	u64			duration;
	unsigned long		nr;
	sem_t			*wait_sem;
	struct task_desc	*wakee;
};

70
#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
71 72 73 74 75 76 77 78 79 80 81

enum thread_state {
	THREAD_SLEEPING = 0,
	THREAD_WAIT_CPU,
	THREAD_SCHED_IN,
	THREAD_IGNORE
};

struct work_atom {
	struct list_head	list;
	enum thread_state	state;
82
	u64			sched_out_time;
83 84 85 86 87
	u64			wake_up_time;
	u64			sched_in_time;
	u64			runtime;
};

88 89
struct work_atoms {
	struct list_head	work_list;
90 91 92
	struct thread		*thread;
	struct rb_node		node;
	u64			max_lat;
93
	u64			max_lat_at;
94 95 96 97 98
	u64			total_lat;
	u64			nb_atoms;
	u64			total_runtime;
};

99
typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
100

101
struct perf_sched;
102

103 104 105
struct trace_sched_handler {
	int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);
106

107 108
	int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			     struct perf_sample *sample, struct machine *machine);
109

110 111
	int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);
112

113 114 115
	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
	int (*fork_event)(struct perf_sched *sched, union perf_event *event,
			  struct machine *machine);
116 117

	int (*migrate_task_event)(struct perf_sched *sched,
118 119 120
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
};

struct perf_sched {
	struct perf_tool tool;
	const char	 *sort_order;
	unsigned long	 nr_tasks;
	struct task_desc *pid_to_task[MAX_PID];
	struct task_desc **tasks;
	const struct trace_sched_handler *tp_handler;
	pthread_mutex_t	 start_work_mutex;
	pthread_mutex_t	 work_done_wait_mutex;
	int		 profile_cpu;
/*
 * Track the current task - that way we can know whether there's any
 * weird events, such as a task being switched away that is not current.
 */
	int		 max_cpu;
	u32		 curr_pid[MAX_CPUS];
	struct thread	 *curr_thread[MAX_CPUS];
	char		 next_shortname1;
	char		 next_shortname2;
	unsigned int	 replay_repeat;
	unsigned long	 nr_run_events;
	unsigned long	 nr_sleep_events;
	unsigned long	 nr_wakeup_events;
	unsigned long	 nr_sleep_corrections;
	unsigned long	 nr_run_events_optimized;
	unsigned long	 targetless_wakeups;
	unsigned long	 multitarget_wakeups;
	unsigned long	 nr_runs;
	unsigned long	 nr_timestamps;
	unsigned long	 nr_unordered_timestamps;
	unsigned long	 nr_context_switch_bugs;
	unsigned long	 nr_events;
	unsigned long	 nr_lost_chunks;
	unsigned long	 nr_lost_events;
	u64		 run_measurement_overhead;
	u64		 sleep_measurement_overhead;
	u64		 start_time;
	u64		 cpu_usage;
	u64		 runavg_cpu_usage;
	u64		 parent_cpu_usage;
	u64		 runavg_parent_cpu_usage;
	u64		 sum_runtime;
	u64		 sum_fluct;
	u64		 run_avg;
	u64		 all_runtime;
	u64		 all_count;
	u64		 cpu_last_switched[MAX_CPUS];
	struct rb_root	 atom_root, sorted_atom_root;
	struct list_head sort_list, cmp_pid;
};
173 174

static u64 get_nsecs(void)
I
Ingo Molnar 已提交
175 176 177 178 179 180 181 182
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);

	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

183
static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
I
Ingo Molnar 已提交
184
{
185
	u64 T0 = get_nsecs(), T1;
I
Ingo Molnar 已提交
186 187 188

	do {
		T1 = get_nsecs();
189
	} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
I
Ingo Molnar 已提交
190 191
}

192
static void sleep_nsecs(u64 nsecs)
I
Ingo Molnar 已提交
193 194 195 196 197 198 199 200 201
{
	struct timespec ts;

	ts.tv_nsec = nsecs % 999999999;
	ts.tv_sec = nsecs / 999999999;

	nanosleep(&ts, NULL);
}

202
static void calibrate_run_measurement_overhead(struct perf_sched *sched)
I
Ingo Molnar 已提交
203
{
204
	u64 T0, T1, delta, min_delta = 1000000000ULL;
I
Ingo Molnar 已提交
205 206 207 208
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
209
		burn_nsecs(sched, 0);
I
Ingo Molnar 已提交
210 211 212 213
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
214
	sched->run_measurement_overhead = min_delta;
I
Ingo Molnar 已提交
215

216
	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
I
Ingo Molnar 已提交
217 218
}

219
static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
I
Ingo Molnar 已提交
220
{
221
	u64 T0, T1, delta, min_delta = 1000000000ULL;
I
Ingo Molnar 已提交
222 223 224 225 226 227 228 229 230 231
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		sleep_nsecs(10000);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	min_delta -= 10000;
232
	sched->sleep_measurement_overhead = min_delta;
I
Ingo Molnar 已提交
233

234
	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
I
Ingo Molnar 已提交
235 236
}

237
static struct sched_atom *
238
get_new_event(struct task_desc *task, u64 timestamp)
I
Ingo Molnar 已提交
239
{
240
	struct sched_atom *event = zalloc(sizeof(*event));
I
Ingo Molnar 已提交
241 242 243 244 245 246 247
	unsigned long idx = task->nr_events;
	size_t size;

	event->timestamp = timestamp;
	event->nr = idx;

	task->nr_events++;
248 249 250
	size = sizeof(struct sched_atom *) * task->nr_events;
	task->atoms = realloc(task->atoms, size);
	BUG_ON(!task->atoms);
I
Ingo Molnar 已提交
251

252
	task->atoms[idx] = event;
I
Ingo Molnar 已提交
253 254 255 256

	return event;
}

257
static struct sched_atom *last_event(struct task_desc *task)
I
Ingo Molnar 已提交
258 259 260 261
{
	if (!task->nr_events)
		return NULL;

262
	return task->atoms[task->nr_events - 1];
I
Ingo Molnar 已提交
263 264
}

265 266
static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
				u64 timestamp, u64 duration)
I
Ingo Molnar 已提交
267
{
268
	struct sched_atom *event, *curr_event = last_event(task);
I
Ingo Molnar 已提交
269 270

	/*
271 272 273
	 * optimize an existing RUN event by merging this one
	 * to it:
	 */
I
Ingo Molnar 已提交
274
	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
275
		sched->nr_run_events_optimized++;
I
Ingo Molnar 已提交
276 277 278 279 280 281 282 283 284
		curr_event->duration += duration;
		return;
	}

	event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_RUN;
	event->duration = duration;

285
	sched->nr_run_events++;
I
Ingo Molnar 已提交
286 287
}

288 289
static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
				   u64 timestamp, struct task_desc *wakee)
I
Ingo Molnar 已提交
290
{
291
	struct sched_atom *event, *wakee_event;
I
Ingo Molnar 已提交
292 293 294 295 296 297 298

	event = get_new_event(task, timestamp);
	event->type = SCHED_EVENT_WAKEUP;
	event->wakee = wakee;

	wakee_event = last_event(wakee);
	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
299
		sched->targetless_wakeups++;
I
Ingo Molnar 已提交
300 301 302
		return;
	}
	if (wakee_event->wait_sem) {
303
		sched->multitarget_wakeups++;
I
Ingo Molnar 已提交
304 305 306
		return;
	}

307
	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
I
Ingo Molnar 已提交
308 309 310 311
	sem_init(wakee_event->wait_sem, 0, 0);
	wakee_event->specific_wait = 1;
	event->wait_sem = wakee_event->wait_sem;

312
	sched->nr_wakeup_events++;
I
Ingo Molnar 已提交
313 314
}

315 316
static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
				  u64 timestamp, u64 task_state __maybe_unused)
I
Ingo Molnar 已提交
317
{
318
	struct sched_atom *event = get_new_event(task, timestamp);
I
Ingo Molnar 已提交
319 320 321

	event->type = SCHED_EVENT_SLEEP;

322
	sched->nr_sleep_events++;
I
Ingo Molnar 已提交
323 324
}

325 326
static struct task_desc *register_pid(struct perf_sched *sched,
				      unsigned long pid, const char *comm)
I
Ingo Molnar 已提交
327 328 329 330 331
{
	struct task_desc *task;

	BUG_ON(pid >= MAX_PID);

332
	task = sched->pid_to_task[pid];
I
Ingo Molnar 已提交
333 334 335 336

	if (task)
		return task;

337
	task = zalloc(sizeof(*task));
I
Ingo Molnar 已提交
338
	task->pid = pid;
339
	task->nr = sched->nr_tasks;
I
Ingo Molnar 已提交
340 341 342 343 344
	strcpy(task->comm, comm);
	/*
	 * every task starts in sleeping state - this gets ignored
	 * if there's no wakeup pointing to this sleep state:
	 */
345
	add_sched_event_sleep(sched, task, 0, 0);
I
Ingo Molnar 已提交
346

347 348
	sched->pid_to_task[pid] = task;
	sched->nr_tasks++;
349
	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
350 351
	BUG_ON(!sched->tasks);
	sched->tasks[task->nr] = task;
I
Ingo Molnar 已提交
352

I
Ingo Molnar 已提交
353
	if (verbose)
354
		printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
I
Ingo Molnar 已提交
355 356 357 358 359

	return task;
}


360
static void print_task_traces(struct perf_sched *sched)
I
Ingo Molnar 已提交
361 362 363 364
{
	struct task_desc *task;
	unsigned long i;

365 366
	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
I
Ingo Molnar 已提交
367
		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
I
Ingo Molnar 已提交
368 369 370 371
			task->nr, task->comm, task->pid, task->nr_events);
	}
}

372
static void add_cross_task_wakeups(struct perf_sched *sched)
I
Ingo Molnar 已提交
373 374 375 376
{
	struct task_desc *task1, *task2;
	unsigned long i, j;

377 378
	for (i = 0; i < sched->nr_tasks; i++) {
		task1 = sched->tasks[i];
I
Ingo Molnar 已提交
379
		j = i + 1;
380
		if (j == sched->nr_tasks)
I
Ingo Molnar 已提交
381
			j = 0;
382 383
		task2 = sched->tasks[j];
		add_sched_event_wakeup(sched, task1, 0, task2);
I
Ingo Molnar 已提交
384 385 386
	}
}

387 388
static void perf_sched__process_event(struct perf_sched *sched,
				      struct sched_atom *atom)
I
Ingo Molnar 已提交
389 390 391
{
	int ret = 0;

392
	switch (atom->type) {
I
Ingo Molnar 已提交
393
		case SCHED_EVENT_RUN:
394
			burn_nsecs(sched, atom->duration);
I
Ingo Molnar 已提交
395 396
			break;
		case SCHED_EVENT_SLEEP:
397 398
			if (atom->wait_sem)
				ret = sem_wait(atom->wait_sem);
I
Ingo Molnar 已提交
399 400 401
			BUG_ON(ret);
			break;
		case SCHED_EVENT_WAKEUP:
402 403
			if (atom->wait_sem)
				ret = sem_post(atom->wait_sem);
I
Ingo Molnar 已提交
404 405
			BUG_ON(ret);
			break;
406 407
		case SCHED_EVENT_MIGRATION:
			break;
I
Ingo Molnar 已提交
408 409 410 411 412
		default:
			BUG_ON(1);
	}
}

413
static u64 get_cpu_usage_nsec_parent(void)
I
Ingo Molnar 已提交
414 415
{
	struct rusage ru;
416
	u64 sum;
I
Ingo Molnar 已提交
417 418 419 420 421 422 423 424 425 426 427
	int err;

	err = getrusage(RUSAGE_SELF, &ru);
	BUG_ON(err);

	sum =  ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
	sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;

	return sum;
}

428
static int self_open_counters(void)
I
Ingo Molnar 已提交
429
{
430
	struct perf_event_attr attr;
431
	char sbuf[STRERR_BUFSIZE];
432
	int fd;
I
Ingo Molnar 已提交
433

434
	memset(&attr, 0, sizeof(attr));
I
Ingo Molnar 已提交
435

436 437
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
I
Ingo Molnar 已提交
438

439 440
	fd = sys_perf_event_open(&attr, 0, -1, -1,
				 perf_event_open_cloexec_flag());
441 442

	if (fd < 0)
443
		pr_err("Error: sys_perf_event_open() syscall returned "
444 445
		       "with %d (%s)\n", fd,
		       strerror_r(errno, sbuf, sizeof(sbuf)));
446 447 448 449 450 451 452 453 454 455 456 457
	return fd;
}

static u64 get_cpu_usage_nsec_self(int fd)
{
	u64 runtime;
	int ret;

	ret = read(fd, &runtime, sizeof(runtime));
	BUG_ON(ret != sizeof(runtime));

	return runtime;
I
Ingo Molnar 已提交
458 459
}

460 461 462 463 464
struct sched_thread_parms {
	struct task_desc  *task;
	struct perf_sched *sched;
};

I
Ingo Molnar 已提交
465 466
static void *thread_func(void *ctx)
{
467 468 469
	struct sched_thread_parms *parms = ctx;
	struct task_desc *this_task = parms->task;
	struct perf_sched *sched = parms->sched;
470
	u64 cpu_usage_0, cpu_usage_1;
I
Ingo Molnar 已提交
471 472
	unsigned long i, ret;
	char comm2[22];
473
	int fd;
I
Ingo Molnar 已提交
474

475
	zfree(&parms);
476

I
Ingo Molnar 已提交
477 478
	sprintf(comm2, ":%s", this_task->comm);
	prctl(PR_SET_NAME, comm2);
479
	fd = self_open_counters();
480 481
	if (fd < 0)
		return NULL;
I
Ingo Molnar 已提交
482 483 484
again:
	ret = sem_post(&this_task->ready_for_work);
	BUG_ON(ret);
485
	ret = pthread_mutex_lock(&sched->start_work_mutex);
I
Ingo Molnar 已提交
486
	BUG_ON(ret);
487
	ret = pthread_mutex_unlock(&sched->start_work_mutex);
I
Ingo Molnar 已提交
488 489
	BUG_ON(ret);

490
	cpu_usage_0 = get_cpu_usage_nsec_self(fd);
I
Ingo Molnar 已提交
491 492 493

	for (i = 0; i < this_task->nr_events; i++) {
		this_task->curr_event = i;
494
		perf_sched__process_event(sched, this_task->atoms[i]);
I
Ingo Molnar 已提交
495 496
	}

497
	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
I
Ingo Molnar 已提交
498 499 500 501
	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
	ret = sem_post(&this_task->work_done_sem);
	BUG_ON(ret);

502
	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
I
Ingo Molnar 已提交
503
	BUG_ON(ret);
504
	ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
I
Ingo Molnar 已提交
505 506 507 508 509
	BUG_ON(ret);

	goto again;
}

510
static void create_tasks(struct perf_sched *sched)
I
Ingo Molnar 已提交
511 512 513 514 515 516 517 518
{
	struct task_desc *task;
	pthread_attr_t attr;
	unsigned long i;
	int err;

	err = pthread_attr_init(&attr);
	BUG_ON(err);
519 520
	err = pthread_attr_setstacksize(&attr,
			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
I
Ingo Molnar 已提交
521
	BUG_ON(err);
522
	err = pthread_mutex_lock(&sched->start_work_mutex);
I
Ingo Molnar 已提交
523
	BUG_ON(err);
524
	err = pthread_mutex_lock(&sched->work_done_wait_mutex);
I
Ingo Molnar 已提交
525
	BUG_ON(err);
526 527 528 529 530
	for (i = 0; i < sched->nr_tasks; i++) {
		struct sched_thread_parms *parms = malloc(sizeof(*parms));
		BUG_ON(parms == NULL);
		parms->task = task = sched->tasks[i];
		parms->sched = sched;
I
Ingo Molnar 已提交
531 532 533 534
		sem_init(&task->sleep_sem, 0, 0);
		sem_init(&task->ready_for_work, 0, 0);
		sem_init(&task->work_done_sem, 0, 0);
		task->curr_event = 0;
535
		err = pthread_create(&task->thread, &attr, thread_func, parms);
I
Ingo Molnar 已提交
536 537 538 539
		BUG_ON(err);
	}
}

540
static void wait_for_tasks(struct perf_sched *sched)
I
Ingo Molnar 已提交
541
{
542
	u64 cpu_usage_0, cpu_usage_1;
I
Ingo Molnar 已提交
543 544 545
	struct task_desc *task;
	unsigned long i, ret;

546 547 548
	sched->start_time = get_nsecs();
	sched->cpu_usage = 0;
	pthread_mutex_unlock(&sched->work_done_wait_mutex);
I
Ingo Molnar 已提交
549

550 551
	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
I
Ingo Molnar 已提交
552 553 554 555
		ret = sem_wait(&task->ready_for_work);
		BUG_ON(ret);
		sem_init(&task->ready_for_work, 0, 0);
	}
556
	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
I
Ingo Molnar 已提交
557 558 559 560
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_parent();

561
	pthread_mutex_unlock(&sched->start_work_mutex);
I
Ingo Molnar 已提交
562

563 564
	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
I
Ingo Molnar 已提交
565 566 567
		ret = sem_wait(&task->work_done_sem);
		BUG_ON(ret);
		sem_init(&task->work_done_sem, 0, 0);
568
		sched->cpu_usage += task->cpu_usage;
I
Ingo Molnar 已提交
569 570 571 572
		task->cpu_usage = 0;
	}

	cpu_usage_1 = get_cpu_usage_nsec_parent();
573 574 575
	if (!sched->runavg_cpu_usage)
		sched->runavg_cpu_usage = sched->cpu_usage;
	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * 9 + sched->cpu_usage) / 10;
I
Ingo Molnar 已提交
576

577 578 579 580 581
	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
	if (!sched->runavg_parent_cpu_usage)
		sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * 9 +
					 sched->parent_cpu_usage)/10;
I
Ingo Molnar 已提交
582

583
	ret = pthread_mutex_lock(&sched->start_work_mutex);
I
Ingo Molnar 已提交
584 585
	BUG_ON(ret);

586 587
	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
I
Ingo Molnar 已提交
588 589 590 591 592
		sem_init(&task->sleep_sem, 0, 0);
		task->curr_event = 0;
	}
}

593
static void run_one_test(struct perf_sched *sched)
I
Ingo Molnar 已提交
594
{
K
Kyle McMartin 已提交
595
	u64 T0, T1, delta, avg_delta, fluct;
I
Ingo Molnar 已提交
596 597

	T0 = get_nsecs();
598
	wait_for_tasks(sched);
I
Ingo Molnar 已提交
599 600 601
	T1 = get_nsecs();

	delta = T1 - T0;
602 603
	sched->sum_runtime += delta;
	sched->nr_runs++;
I
Ingo Molnar 已提交
604

605
	avg_delta = sched->sum_runtime / sched->nr_runs;
I
Ingo Molnar 已提交
606 607 608 609
	if (delta < avg_delta)
		fluct = avg_delta - delta;
	else
		fluct = delta - avg_delta;
610 611 612 613
	sched->sum_fluct += fluct;
	if (!sched->run_avg)
		sched->run_avg = delta;
	sched->run_avg = (sched->run_avg * 9 + delta) / 10;
I
Ingo Molnar 已提交
614

615
	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);
I
Ingo Molnar 已提交
616

617
	printf("ravg: %0.2f, ", (double)sched->run_avg / 1e6);
I
Ingo Molnar 已提交
618

I
Ingo Molnar 已提交
619
	printf("cpu: %0.2f / %0.2f",
620
		(double)sched->cpu_usage / 1e6, (double)sched->runavg_cpu_usage / 1e6);
I
Ingo Molnar 已提交
621 622 623

#if 0
	/*
624
	 * rusage statistics done by the parent, these are less
625
	 * accurate than the sched->sum_exec_runtime based statistics:
626
	 */
I
Ingo Molnar 已提交
627
	printf(" [%0.2f / %0.2f]",
628 629
		(double)sched->parent_cpu_usage/1e6,
		(double)sched->runavg_parent_cpu_usage/1e6);
I
Ingo Molnar 已提交
630 631
#endif

I
Ingo Molnar 已提交
632
	printf("\n");
I
Ingo Molnar 已提交
633

634 635 636
	if (sched->nr_sleep_corrections)
		printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
	sched->nr_sleep_corrections = 0;
I
Ingo Molnar 已提交
637 638
}

639
static void test_calibrations(struct perf_sched *sched)
I
Ingo Molnar 已提交
640
{
641
	u64 T0, T1;
I
Ingo Molnar 已提交
642 643

	T0 = get_nsecs();
644
	burn_nsecs(sched, 1e6);
I
Ingo Molnar 已提交
645 646
	T1 = get_nsecs();

647
	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
I
Ingo Molnar 已提交
648 649 650 651 652

	T0 = get_nsecs();
	sleep_nsecs(1e6);
	T1 = get_nsecs();

653
	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
I
Ingo Molnar 已提交
654 655
}

656
static int
657
replay_wakeup_event(struct perf_sched *sched,
658 659
		    struct perf_evsel *evsel, struct perf_sample *sample,
		    struct machine *machine __maybe_unused)
660
{
661 662
	const char *comm = perf_evsel__strval(evsel, sample, "comm");
	const u32 pid	 = perf_evsel__intval(evsel, sample, "pid");
663
	struct task_desc *waker, *wakee;
664

I
Ingo Molnar 已提交
665
	if (verbose) {
666
		printf("sched_wakeup event %p\n", evsel);
667

668
		printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
I
Ingo Molnar 已提交
669
	}
670

671
	waker = register_pid(sched, sample->tid, "<unknown>");
672
	wakee = register_pid(sched, pid, comm);
673

674
	add_sched_event_wakeup(sched, waker, sample->time, wakee);
675
	return 0;
I
Ingo Molnar 已提交
676 677
}

678 679 680 681
static int replay_switch_event(struct perf_sched *sched,
			       struct perf_evsel *evsel,
			       struct perf_sample *sample,
			       struct machine *machine __maybe_unused)
I
Ingo Molnar 已提交
682
{
683 684 685 686 687
	const char *prev_comm  = perf_evsel__strval(evsel, sample, "prev_comm"),
		   *next_comm  = perf_evsel__strval(evsel, sample, "next_comm");
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
688
	struct task_desc *prev, __maybe_unused *next;
689 690
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu;
691 692
	s64 delta;

I
Ingo Molnar 已提交
693
	if (verbose)
694
		printf("sched_switch event %p\n", evsel);
I
Ingo Molnar 已提交
695

696
	if (cpu >= MAX_CPUS || cpu < 0)
697
		return 0;
698

699
	timestamp0 = sched->cpu_last_switched[cpu];
700 701 702 703 704
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

705
	if (delta < 0) {
706
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
707 708
		return -1;
	}
709

710 711
	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
		 prev_comm, prev_pid, next_comm, next_pid, delta);
712

713 714
	prev = register_pid(sched, prev_pid, prev_comm);
	next = register_pid(sched, next_pid, next_comm);
715

716
	sched->cpu_last_switched[cpu] = timestamp;
717

718
	add_sched_event_run(sched, prev, timestamp, delta);
719
	add_sched_event_sleep(sched, prev, timestamp, prev_state);
720 721

	return 0;
722 723
}

724 725 726
static int replay_fork_event(struct perf_sched *sched,
			     union perf_event *event,
			     struct machine *machine)
727
{
728 729
	struct thread *child, *parent;

730 731 732 733
	child = machine__findnew_thread(machine, event->fork.pid,
					event->fork.tid);
	parent = machine__findnew_thread(machine, event->fork.ppid,
					 event->fork.ptid);
734 735 736 737 738 739

	if (child == NULL || parent == NULL) {
		pr_debug("thread does not exist on fork event: child %p, parent %p\n",
				 child, parent);
		return 0;
	}
740

741
	if (verbose) {
742
		printf("fork event\n");
743 744
		printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
		printf("...  child: %s/%d\n", thread__comm_str(child), child->tid);
745
	}
746

747 748
	register_pid(sched, parent->tid, thread__comm_str(parent));
	register_pid(sched, child->tid, thread__comm_str(child));
749
	return 0;
750
}
751

752 753
struct sort_dimension {
	const char		*name;
754
	sort_fn_t		cmp;
755 756 757
	struct list_head	list;
};

758
static int
759
thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
760 761 762 763
{
	struct sort_dimension *sort;
	int ret = 0;

764 765
	BUG_ON(list_empty(list));

766 767 768 769 770 771 772 773 774
	list_for_each_entry(sort, list, list) {
		ret = sort->cmp(l, r);
		if (ret)
			return ret;
	}

	return ret;
}

775
static struct work_atoms *
776 777 778 779
thread_atoms_search(struct rb_root *root, struct thread *thread,
			 struct list_head *sort_list)
{
	struct rb_node *node = root->rb_node;
780
	struct work_atoms key = { .thread = thread };
781 782

	while (node) {
783
		struct work_atoms *atoms;
784 785
		int cmp;

786
		atoms = container_of(node, struct work_atoms, node);
787 788 789 790 791 792 793 794 795 796 797 798 799 800

		cmp = thread_lat_cmp(sort_list, &key, atoms);
		if (cmp > 0)
			node = node->rb_left;
		else if (cmp < 0)
			node = node->rb_right;
		else {
			BUG_ON(thread != atoms->thread);
			return atoms;
		}
	}
	return NULL;
}

801
static void
802
__thread_latency_insert(struct rb_root *root, struct work_atoms *data,
803
			 struct list_head *sort_list)
804 805 806 807
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;

	while (*new) {
808
		struct work_atoms *this;
809
		int cmp;
810

811
		this = container_of(*new, struct work_atoms, node);
812
		parent = *new;
813 814 815 816

		cmp = thread_lat_cmp(sort_list, data, this);

		if (cmp > 0)
817 818
			new = &((*new)->rb_left);
		else
819
			new = &((*new)->rb_right);
820 821 822 823 824 825
	}

	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
}

826
static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
827
{
828
	struct work_atoms *atoms = zalloc(sizeof(*atoms));
829 830 831 832
	if (!atoms) {
		pr_err("No memory at %s\n", __func__);
		return -1;
	}
833

834
	atoms->thread = thread__get(thread);
835
	INIT_LIST_HEAD(&atoms->work_list);
836
	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
837
	return 0;
838 839
}

840
static char sched_out_state(u64 prev_state)
841 842 843
{
	const char *str = TASK_STATE_TO_CHAR_STR;

844
	return str[prev_state];
845 846
}

847
static int
848 849 850
add_sched_out_event(struct work_atoms *atoms,
		    char run_state,
		    u64 timestamp)
851
{
852
	struct work_atom *atom = zalloc(sizeof(*atom));
853 854 855 856
	if (!atom) {
		pr_err("Non memory at %s", __func__);
		return -1;
	}
857

858 859
	atom->sched_out_time = timestamp;

860
	if (run_state == 'R') {
861
		atom->state = THREAD_WAIT_CPU;
862
		atom->wake_up_time = atom->sched_out_time;
863 864
	}

865
	list_add_tail(&atom->list, &atoms->work_list);
866
	return 0;
867 868 869
}

static void
870 871
add_runtime_event(struct work_atoms *atoms, u64 delta,
		  u64 timestamp __maybe_unused)
872 873 874 875 876 877 878 879 880 881 882 883 884
{
	struct work_atom *atom;

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	atom->runtime += delta;
	atoms->total_runtime += delta;
}

static void
add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
885
{
886
	struct work_atom *atom;
887
	u64 delta;
888

889
	if (list_empty(&atoms->work_list))
890 891
		return;

892
	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
893

894
	if (atom->state != THREAD_WAIT_CPU)
895 896
		return;

897 898
	if (timestamp < atom->wake_up_time) {
		atom->state = THREAD_IGNORE;
899 900 901
		return;
	}

902 903
	atom->state = THREAD_SCHED_IN;
	atom->sched_in_time = timestamp;
904

905
	delta = atom->sched_in_time - atom->wake_up_time;
906
	atoms->total_lat += delta;
907
	if (delta > atoms->max_lat) {
908
		atoms->max_lat = delta;
909 910
		atoms->max_lat_at = timestamp;
	}
911
	atoms->nb_atoms++;
912 913
}

914 915 916 917
static int latency_switch_event(struct perf_sched *sched,
				struct perf_evsel *evsel,
				struct perf_sample *sample,
				struct machine *machine)
918
{
919 920 921
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
922
	struct work_atoms *out_events, *in_events;
923
	struct thread *sched_out, *sched_in;
924 925
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu;
I
Ingo Molnar 已提交
926 927
	s64 delta;

928
	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
I
Ingo Molnar 已提交
929

930 931
	timestamp0 = sched->cpu_last_switched[cpu];
	sched->cpu_last_switched[cpu] = timestamp;
I
Ingo Molnar 已提交
932 933 934 935 936
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

937 938 939 940
	if (delta < 0) {
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
		return -1;
	}
941

942 943
	sched_out = machine__findnew_thread(machine, -1, prev_pid);
	sched_in = machine__findnew_thread(machine, -1, next_pid);
944

945
	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
946
	if (!out_events) {
947
		if (thread_atoms_insert(sched, sched_out))
948
			return -1;
949
		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
950 951 952 953
		if (!out_events) {
			pr_err("out-event: Internal tree error");
			return -1;
		}
954
	}
955
	if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
956
		return -1;
957

958
	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
959
	if (!in_events) {
960
		if (thread_atoms_insert(sched, sched_in))
961
			return -1;
962
		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
963 964 965 966
		if (!in_events) {
			pr_err("in-event: Internal tree error");
			return -1;
		}
967 968 969 970
		/*
		 * Take came in we have not heard about yet,
		 * add in an initial atom in runnable state:
		 */
971 972
		if (add_sched_out_event(in_events, 'R', timestamp))
			return -1;
973
	}
974
	add_sched_in_event(in_events, timestamp);
975 976

	return 0;
977
}
978

979 980 981 982
static int latency_runtime_event(struct perf_sched *sched,
				 struct perf_evsel *evsel,
				 struct perf_sample *sample,
				 struct machine *machine)
983
{
984 985
	const u32 pid	   = perf_evsel__intval(evsel, sample, "pid");
	const u64 runtime  = perf_evsel__intval(evsel, sample, "runtime");
986
	struct thread *thread = machine__findnew_thread(machine, -1, pid);
987
	struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
988 989
	u64 timestamp = sample->time;
	int cpu = sample->cpu;
990 991 992

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
	if (!atoms) {
993
		if (thread_atoms_insert(sched, thread))
994
			return -1;
995
		atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
996
		if (!atoms) {
997
			pr_err("in-event: Internal tree error");
998 999 1000 1001
			return -1;
		}
		if (add_sched_out_event(atoms, 'R', timestamp))
			return -1;
1002 1003
	}

1004
	add_runtime_event(atoms, runtime, timestamp);
1005
	return 0;
1006 1007
}

1008 1009 1010 1011
static int latency_wakeup_event(struct perf_sched *sched,
				struct perf_evsel *evsel,
				struct perf_sample *sample,
				struct machine *machine)
1012
{
1013
	const u32 pid	  = perf_evsel__intval(evsel, sample, "pid");
1014
	struct work_atoms *atoms;
1015
	struct work_atom *atom;
1016
	struct thread *wakee;
1017
	u64 timestamp = sample->time;
1018

1019
	wakee = machine__findnew_thread(machine, -1, pid);
1020
	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1021
	if (!atoms) {
1022
		if (thread_atoms_insert(sched, wakee))
1023
			return -1;
1024
		atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1025
		if (!atoms) {
1026
			pr_err("wakeup-event: Internal tree error");
1027 1028 1029 1030
			return -1;
		}
		if (add_sched_out_event(atoms, 'S', timestamp))
			return -1;
1031 1032
	}

1033
	BUG_ON(list_empty(&atoms->work_list));
1034

1035
	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1036

1037
	/*
1038 1039 1040 1041 1042 1043
	 * As we do not guarantee the wakeup event happens when
	 * task is out of run queue, also may happen when task is
	 * on run queue and wakeup only change ->state to TASK_RUNNING,
	 * then we should not set the ->wake_up_time when wake up a
	 * task which is on run queue.
	 *
1044 1045
	 * You WILL be missing events if you've recorded only
	 * one CPU, or are only looking at only one, so don't
1046
	 * skip in this case.
1047
	 */
1048
	if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
1049
		return 0;
1050

1051
	sched->nr_timestamps++;
1052
	if (atom->sched_out_time > timestamp) {
1053
		sched->nr_unordered_timestamps++;
1054
		return 0;
1055
	}
1056

1057 1058
	atom->state = THREAD_WAIT_CPU;
	atom->wake_up_time = timestamp;
1059
	return 0;
1060 1061
}

1062 1063 1064 1065
static int latency_migrate_task_event(struct perf_sched *sched,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
1066
{
1067
	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
1068
	u64 timestamp = sample->time;
1069 1070 1071 1072 1073 1074 1075
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *migrant;

	/*
	 * Only need to worry about migration when profiling one CPU.
	 */
1076
	if (sched->profile_cpu == -1)
1077
		return 0;
1078

1079
	migrant = machine__findnew_thread(machine, -1, pid);
1080
	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1081
	if (!atoms) {
1082
		if (thread_atoms_insert(sched, migrant))
1083
			return -1;
1084
		register_pid(sched, migrant->tid, thread__comm_str(migrant));
1085
		atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1086
		if (!atoms) {
1087
			pr_err("migration-event: Internal tree error");
1088 1089 1090 1091
			return -1;
		}
		if (add_sched_out_event(atoms, 'R', timestamp))
			return -1;
1092 1093 1094 1095 1096 1097 1098
	}

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;

1099
	sched->nr_timestamps++;
1100 1101

	if (atom->sched_out_time > timestamp)
1102
		sched->nr_unordered_timestamps++;
1103 1104

	return 0;
1105 1106
}

1107
static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
1108 1109 1110
{
	int i;
	int ret;
1111
	u64 avg;
1112

1113
	if (!work_list->nb_atoms)
1114
		return;
1115 1116 1117
	/*
	 * Ignore idle threads:
	 */
1118
	if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
1119
		return;
1120

1121 1122
	sched->all_runtime += work_list->total_runtime;
	sched->all_count   += work_list->nb_atoms;
1123

1124
	ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
1125

M
mingo 已提交
1126
	for (i = 0; i < 24 - ret; i++)
1127 1128
		printf(" ");

1129
	avg = work_list->total_lat / work_list->nb_atoms;
1130

1131
	printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %13.6f s\n",
1132 1133
	      (double)work_list->total_runtime / 1e6,
		 work_list->nb_atoms, (double)avg / 1e6,
1134 1135
		 (double)work_list->max_lat / 1e6,
		 (double)work_list->max_lat_at / 1e9);
1136 1137
}

1138
static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
1139
{
1140
	if (l->thread->tid < r->thread->tid)
1141
		return -1;
1142
	if (l->thread->tid > r->thread->tid)
1143 1144 1145 1146 1147
		return 1;

	return 0;
}

1148
static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168
{
	u64 avgl, avgr;

	if (!l->nb_atoms)
		return -1;

	if (!r->nb_atoms)
		return 1;

	avgl = l->total_lat / l->nb_atoms;
	avgr = r->total_lat / r->nb_atoms;

	if (avgl < avgr)
		return -1;
	if (avgl > avgr)
		return 1;

	return 0;
}

1169
static int max_cmp(struct work_atoms *l, struct work_atoms *r)
1170 1171 1172 1173 1174 1175 1176 1177 1178
{
	if (l->max_lat < r->max_lat)
		return -1;
	if (l->max_lat > r->max_lat)
		return 1;

	return 0;
}

1179
static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
1180 1181 1182 1183 1184 1185 1186 1187 1188
{
	if (l->nb_atoms < r->nb_atoms)
		return -1;
	if (l->nb_atoms > r->nb_atoms)
		return 1;

	return 0;
}

1189
static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
1190 1191 1192 1193 1194 1195 1196 1197 1198
{
	if (l->total_runtime < r->total_runtime)
		return -1;
	if (l->total_runtime > r->total_runtime)
		return 1;

	return 0;
}

1199
static int sort_dimension__add(const char *tok, struct list_head *list)
1200
{
1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228
	size_t i;
	static struct sort_dimension avg_sort_dimension = {
		.name = "avg",
		.cmp  = avg_cmp,
	};
	static struct sort_dimension max_sort_dimension = {
		.name = "max",
		.cmp  = max_cmp,
	};
	static struct sort_dimension pid_sort_dimension = {
		.name = "pid",
		.cmp  = pid_cmp,
	};
	static struct sort_dimension runtime_sort_dimension = {
		.name = "runtime",
		.cmp  = runtime_cmp,
	};
	static struct sort_dimension switch_sort_dimension = {
		.name = "switch",
		.cmp  = switch_cmp,
	};
	struct sort_dimension *available_sorts[] = {
		&pid_sort_dimension,
		&avg_sort_dimension,
		&max_sort_dimension,
		&switch_sort_dimension,
		&runtime_sort_dimension,
	};
1229

1230
	for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
1231 1232 1233 1234 1235 1236 1237 1238 1239 1240
		if (!strcmp(available_sorts[i]->name, tok)) {
			list_add_tail(&available_sorts[i]->list, list);

			return 0;
		}
	}

	return -1;
}

1241
static void perf_sched__sort_lat(struct perf_sched *sched)
1242 1243 1244 1245
{
	struct rb_node *node;

	for (;;) {
1246
		struct work_atoms *data;
1247
		node = rb_first(&sched->atom_root);
1248 1249 1250
		if (!node)
			break;

1251
		rb_erase(node, &sched->atom_root);
1252
		data = rb_entry(node, struct work_atoms, node);
1253
		__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
1254 1255 1256
	}
}

1257
static int process_sched_wakeup_event(struct perf_tool *tool,
1258
				      struct perf_evsel *evsel,
1259
				      struct perf_sample *sample,
1260
				      struct machine *machine)
1261
{
1262
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1263

1264 1265
	if (sched->tp_handler->wakeup_event)
		return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
1266

1267
	return 0;
1268 1269
}

1270 1271
static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine)
1272
{
1273 1274
	const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	struct thread *sched_in;
1275
	int new_shortname;
1276
	u64 timestamp0, timestamp = sample->time;
1277
	s64 delta;
1278
	int cpu, this_cpu = sample->cpu;
1279 1280 1281

	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);

1282 1283
	if (this_cpu > sched->max_cpu)
		sched->max_cpu = this_cpu;
1284

1285 1286
	timestamp0 = sched->cpu_last_switched[this_cpu];
	sched->cpu_last_switched[this_cpu] = timestamp;
1287 1288 1289 1290 1291
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

1292
	if (delta < 0) {
1293
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1294 1295
		return -1;
	}
1296

1297
	sched_in = machine__findnew_thread(machine, -1, next_pid);
1298

1299
	sched->curr_thread[this_cpu] = sched_in;
1300 1301 1302 1303 1304

	printf("  ");

	new_shortname = 0;
	if (!sched_in->shortname[0]) {
1305 1306 1307 1308 1309 1310 1311
		if (!strcmp(thread__comm_str(sched_in), "swapper")) {
			/*
			 * Don't allocate a letter-number for swapper:0
			 * as a shortname. Instead, we use '.' for it.
			 */
			sched_in->shortname[0] = '.';
			sched_in->shortname[1] = ' ';
1312
		} else {
1313 1314 1315 1316 1317
			sched_in->shortname[0] = sched->next_shortname1;
			sched_in->shortname[1] = sched->next_shortname2;

			if (sched->next_shortname1 < 'Z') {
				sched->next_shortname1++;
1318
			} else {
1319 1320 1321 1322 1323
				sched->next_shortname1 = 'A';
				if (sched->next_shortname2 < '9')
					sched->next_shortname2++;
				else
					sched->next_shortname2 = '0';
1324 1325 1326 1327 1328
			}
		}
		new_shortname = 1;
	}

1329
	for (cpu = 0; cpu <= sched->max_cpu; cpu++) {
1330 1331 1332 1333 1334
		if (cpu != this_cpu)
			printf(" ");
		else
			printf("*");

1335 1336 1337
		if (sched->curr_thread[cpu])
			printf("%2s ", sched->curr_thread[cpu]->shortname);
		else
1338 1339 1340 1341 1342 1343
			printf("   ");
	}

	printf("  %12.6f secs ", (double)timestamp/1e9);
	if (new_shortname) {
		printf("%s => %s:%d\n",
1344
		       sched_in->shortname, thread__comm_str(sched_in), sched_in->tid);
1345 1346 1347
	} else {
		printf("\n");
	}
1348 1349

	return 0;
1350 1351
}

1352
static int process_sched_switch_event(struct perf_tool *tool,
1353
				      struct perf_evsel *evsel,
1354
				      struct perf_sample *sample,
1355
				      struct machine *machine)
1356
{
1357
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1358
	int this_cpu = sample->cpu, err = 0;
1359 1360
	u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
	    next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1361

1362
	if (sched->curr_pid[this_cpu] != (u32)-1) {
1363 1364 1365 1366
		/*
		 * Are we trying to switch away a PID that is
		 * not current?
		 */
1367
		if (sched->curr_pid[this_cpu] != prev_pid)
1368
			sched->nr_context_switch_bugs++;
1369 1370
	}

1371 1372
	if (sched->tp_handler->switch_event)
		err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
1373 1374

	sched->curr_pid[this_cpu] = next_pid;
1375
	return err;
1376 1377
}

1378
static int process_sched_runtime_event(struct perf_tool *tool,
1379
				       struct perf_evsel *evsel,
1380
				       struct perf_sample *sample,
1381
				       struct machine *machine)
1382
{
1383
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1384

1385 1386
	if (sched->tp_handler->runtime_event)
		return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
1387

1388
	return 0;
1389 1390
}

1391 1392 1393 1394
static int perf_sched__process_fork_event(struct perf_tool *tool,
					  union perf_event *event,
					  struct perf_sample *sample,
					  struct machine *machine)
1395
{
1396
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1397

1398 1399 1400 1401
	/* run the fork event through the perf machineruy */
	perf_event__process_fork(tool, event, sample, machine);

	/* and then run additional processing needed for this command */
1402
	if (sched->tp_handler->fork_event)
1403
		return sched->tp_handler->fork_event(sched, event, machine);
1404

1405
	return 0;
1406 1407
}

1408
static int process_sched_migrate_task_event(struct perf_tool *tool,
1409
					    struct perf_evsel *evsel,
1410
					    struct perf_sample *sample,
1411
					    struct machine *machine)
1412
{
1413
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1414

1415 1416
	if (sched->tp_handler->migrate_task_event)
		return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
1417

1418
	return 0;
1419 1420
}

1421
typedef int (*tracepoint_handler)(struct perf_tool *tool,
1422
				  struct perf_evsel *evsel,
1423
				  struct perf_sample *sample,
1424
				  struct machine *machine);
I
Ingo Molnar 已提交
1425

1426 1427
static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
						 union perf_event *event __maybe_unused,
1428 1429 1430
						 struct perf_sample *sample,
						 struct perf_evsel *evsel,
						 struct machine *machine)
I
Ingo Molnar 已提交
1431
{
1432
	int err = 0;
I
Ingo Molnar 已提交
1433

1434 1435
	if (evsel->handler != NULL) {
		tracepoint_handler f = evsel->handler;
1436
		err = f(tool, evsel, sample, machine);
1437
	}
I
Ingo Molnar 已提交
1438

1439
	return err;
I
Ingo Molnar 已提交
1440 1441
}

1442
static int perf_sched__read_events(struct perf_sched *sched)
I
Ingo Molnar 已提交
1443
{
1444 1445 1446 1447 1448 1449 1450
	const struct perf_evsel_str_handler handlers[] = {
		{ "sched:sched_switch",	      process_sched_switch_event, },
		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
		{ "sched:sched_wakeup",	      process_sched_wakeup_event, },
		{ "sched:sched_wakeup_new",   process_sched_wakeup_event, },
		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
	};
1451
	struct perf_session *session;
1452 1453 1454 1455
	struct perf_data_file file = {
		.path = input_name,
		.mode = PERF_DATA_MODE_READ,
	};
1456
	int rc = -1;
1457

1458
	session = perf_session__new(&file, false, &sched->tool);
1459 1460 1461 1462
	if (session == NULL) {
		pr_debug("No Memory for session\n");
		return -1;
	}
1463

1464
	symbol__init(&session->header.env);
1465

1466 1467
	if (perf_session__set_tracepoints_handlers(session, handlers))
		goto out_delete;
1468

1469
	if (perf_session__has_traces(session, "record -R")) {
1470
		int err = perf_session__process_events(session);
1471 1472 1473 1474
		if (err) {
			pr_err("Failed to process events, error %d", err);
			goto out_delete;
		}
1475

1476 1477 1478
		sched->nr_events      = session->evlist->stats.nr_events[0];
		sched->nr_lost_events = session->evlist->stats.total_lost;
		sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
1479
	}
1480

1481
	rc = 0;
1482 1483
out_delete:
	perf_session__delete(session);
1484
	return rc;
I
Ingo Molnar 已提交
1485 1486
}

1487
static void print_bad_events(struct perf_sched *sched)
1488
{
1489
	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
1490
		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
1491 1492
			(double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
			sched->nr_unordered_timestamps, sched->nr_timestamps);
1493
	}
1494
	if (sched->nr_lost_events && sched->nr_events) {
1495
		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
1496 1497
			(double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
			sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
1498
	}
1499
	if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
1500
		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
1501 1502 1503
			(double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
			sched->nr_context_switch_bugs, sched->nr_timestamps);
		if (sched->nr_lost_events)
1504 1505 1506 1507 1508
			printf(" (due to lost events?)");
		printf("\n");
	}
}

1509
static int perf_sched__lat(struct perf_sched *sched)
1510 1511 1512 1513
{
	struct rb_node *next;

	setup_pager();
1514

1515
	if (perf_sched__read_events(sched))
1516
		return -1;
1517

1518
	perf_sched__sort_lat(sched);
1519

1520 1521 1522
	printf("\n -----------------------------------------------------------------------------------------------------------------\n");
	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at       |\n");
	printf(" -----------------------------------------------------------------------------------------------------------------\n");
1523

1524
	next = rb_first(&sched->sorted_atom_root);
1525 1526 1527 1528 1529

	while (next) {
		struct work_atoms *work_list;

		work_list = rb_entry(next, struct work_atoms, node);
1530
		output_lat_thread(sched, work_list);
1531
		next = rb_next(next);
1532
		thread__zput(work_list->thread);
1533 1534
	}

1535
	printf(" -----------------------------------------------------------------------------------------------------------------\n");
1536
	printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
1537
		(double)sched->all_runtime / 1e6, sched->all_count);
1538 1539 1540

	printf(" ---------------------------------------------------\n");

1541
	print_bad_events(sched);
1542 1543
	printf("\n");

1544
	return 0;
1545 1546
}

1547
static int perf_sched__map(struct perf_sched *sched)
1548
{
1549
	sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);
1550

1551
	setup_pager();
1552
	if (perf_sched__read_events(sched))
1553
		return -1;
1554
	print_bad_events(sched);
1555
	return 0;
1556 1557
}

1558
static int perf_sched__replay(struct perf_sched *sched)
1559 1560 1561
{
	unsigned long i;

1562 1563
	calibrate_run_measurement_overhead(sched);
	calibrate_sleep_measurement_overhead(sched);
1564

1565
	test_calibrations(sched);
1566

1567
	if (perf_sched__read_events(sched))
1568
		return -1;
1569

1570 1571 1572
	printf("nr_run_events:        %ld\n", sched->nr_run_events);
	printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
	printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);
1573

1574 1575 1576 1577 1578
	if (sched->targetless_wakeups)
		printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
	if (sched->multitarget_wakeups)
		printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
	if (sched->nr_run_events_optimized)
1579
		printf("run atoms optimized: %ld\n",
1580
			sched->nr_run_events_optimized);
1581

1582 1583
	print_task_traces(sched);
	add_cross_task_wakeups(sched);
1584

1585
	create_tasks(sched);
1586
	printf("------------------------------------------------------------\n");
1587 1588
	for (i = 0; i < sched->replay_repeat; i++)
		run_one_test(sched);
1589 1590

	return 0;
1591 1592
}

1593 1594
static void setup_sorting(struct perf_sched *sched, const struct option *options,
			  const char * const usage_msg[])
1595
{
1596
	char *tmp, *tok, *str = strdup(sched->sort_order);
1597 1598 1599

	for (tok = strtok_r(str, ", ", &tmp);
			tok; tok = strtok_r(NULL, ", ", &tmp)) {
1600
		if (sort_dimension__add(tok, &sched->sort_list) < 0) {
1601
			error("Unknown --sort key: `%s'", tok);
1602
			usage_with_options(usage_msg, options);
1603 1604 1605 1606 1607
		}
	}

	free(str);

1608
	sort_dimension__add("pid", &sched->cmp_pid);
1609 1610
}

1611 1612 1613 1614
static int __cmd_record(int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627
	const char * const record_args[] = {
		"record",
		"-a",
		"-R",
		"-m", "1024",
		"-c", "1",
		"-e", "sched:sched_switch",
		"-e", "sched:sched_stat_wait",
		"-e", "sched:sched_stat_sleep",
		"-e", "sched:sched_stat_iowait",
		"-e", "sched:sched_stat_runtime",
		"-e", "sched:sched_process_fork",
		"-e", "sched:sched_wakeup",
1628
		"-e", "sched:sched_wakeup_new",
1629 1630
		"-e", "sched:sched_migrate_task",
	};
1631 1632 1633 1634

	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

1635
	if (rec_argv == NULL)
1636 1637
		return -ENOMEM;

1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648
	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[i] = strdup(record_args[i]);

	for (j = 1; j < (unsigned int)argc; j++, i++)
		rec_argv[i] = argv[j];

	BUG_ON(i != rec_argc);

	return cmd_record(i, rec_argv, NULL);
}

1649
int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
I
Ingo Molnar 已提交
1650
{
1651 1652 1653 1654 1655 1656 1657
	const char default_sort_order[] = "avg, max, switch, runtime";
	struct perf_sched sched = {
		.tool = {
			.sample		 = perf_sched__process_tracepoint_sample,
			.comm		 = perf_event__process_comm,
			.lost		 = perf_event__process_lost,
			.fork		 = perf_sched__process_fork_event,
1658
			.ordered_events = true,
1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669
		},
		.cmp_pid	      = LIST_HEAD_INIT(sched.cmp_pid),
		.sort_list	      = LIST_HEAD_INIT(sched.sort_list),
		.start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
		.work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
		.sort_order	      = default_sort_order,
		.replay_repeat	      = 10,
		.profile_cpu	      = -1,
		.next_shortname1      = 'A',
		.next_shortname2      = '0',
	};
1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690
	const struct option latency_options[] = {
	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
		   "sort by key(s): runtime, switch, avg, max"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_INTEGER('C', "CPU", &sched.profile_cpu,
		    "CPU to profile on"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const struct option replay_options[] = {
	OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
		     "repeat the workload replay N times (-1: infinite)"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const struct option sched_options[] = {
1691
	OPT_STRING('i', "input", &input_name, "file",
1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706
		    "input file name"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const char * const latency_usage[] = {
		"perf sched latency [<options>]",
		NULL
	};
	const char * const replay_usage[] = {
		"perf sched replay [<options>]",
		NULL
	};
1707 1708 1709 1710
	const char *const sched_subcommands[] = { "record", "latency", "map",
						  "replay", "script", NULL };
	const char *sched_usage[] = {
		NULL,
1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726
		NULL
	};
	struct trace_sched_handler lat_ops  = {
		.wakeup_event	    = latency_wakeup_event,
		.switch_event	    = latency_switch_event,
		.runtime_event	    = latency_runtime_event,
		.migrate_task_event = latency_migrate_task_event,
	};
	struct trace_sched_handler map_ops  = {
		.switch_event	    = map_switch_event,
	};
	struct trace_sched_handler replay_ops  = {
		.wakeup_event	    = replay_wakeup_event,
		.switch_event	    = replay_switch_event,
		.fork_event	    = replay_fork_event,
	};
A
Adrian Hunter 已提交
1727 1728 1729 1730
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
		sched.curr_pid[i] = -1;
1731

1732 1733
	argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
					sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
1734 1735
	if (!argc)
		usage_with_options(sched_usage, sched_options);
I
Ingo Molnar 已提交
1736

1737
	/*
1738
	 * Aliased to 'perf script' for now:
1739
	 */
1740 1741
	if (!strcmp(argv[0], "script"))
		return cmd_script(argc, argv, prefix);
1742

1743 1744 1745
	if (!strncmp(argv[0], "rec", 3)) {
		return __cmd_record(argc, argv);
	} else if (!strncmp(argv[0], "lat", 3)) {
1746
		sched.tp_handler = &lat_ops;
1747 1748 1749 1750 1751
		if (argc > 1) {
			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
			if (argc)
				usage_with_options(latency_usage, latency_options);
		}
1752 1753
		setup_sorting(&sched, latency_options, latency_usage);
		return perf_sched__lat(&sched);
1754
	} else if (!strcmp(argv[0], "map")) {
1755 1756 1757
		sched.tp_handler = &map_ops;
		setup_sorting(&sched, latency_options, latency_usage);
		return perf_sched__map(&sched);
1758
	} else if (!strncmp(argv[0], "rep", 3)) {
1759
		sched.tp_handler = &replay_ops;
1760 1761 1762 1763 1764
		if (argc) {
			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
			if (argc)
				usage_with_options(replay_usage, replay_options);
		}
1765
		return perf_sched__replay(&sched);
1766 1767 1768 1769
	} else {
		usage_with_options(sched_usage, sched_options);
	}

I
Ingo Molnar 已提交
1770
	return 0;
I
Ingo Molnar 已提交
1771
}