#include "builtin.h"
#include "perf.h"

#include "util/util.h"
#include "util/evlist.h"
#include "util/cache.h"
#include "util/evsel.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/cloexec.h"
#include "util/thread_map.h"
#include "util/color.h"
#include "util/stat.h"
#include "util/callchain.h"

#include <subcmd/parse-options.h>
#include "util/trace-event.h"

#include "util/debug.h"

#include <linux/log2.h>
#include <sys/prctl.h>
#include <sys/resource.h>

#include <semaphore.h>
#include <pthread.h>
#include <math.h>
#include <api/fs/fs.h>
#include <linux/time64.h>

#define PR_SET_NAME		15               /* Set process name */
#define MAX_CPUS		4096
#define COMM_LEN		20
#define SYM_LEN			129
#define MAX_PID			1024000

struct sched_atom;

struct task_desc {
	unsigned long		nr;
	unsigned long		pid;
	char			comm[COMM_LEN];

	unsigned long		nr_events;
	unsigned long		curr_event;
	struct sched_atom	**atoms;

	pthread_t		thread;
	sem_t			sleep_sem;

	sem_t			ready_for_work;
	sem_t			work_done_sem;

	u64			cpu_usage;
};

enum sched_event_type {
	SCHED_EVENT_RUN,
	SCHED_EVENT_SLEEP,
	SCHED_EVENT_WAKEUP,
	SCHED_EVENT_MIGRATION,
};

struct sched_atom {
	enum sched_event_type	type;
	int			specific_wait;
	u64			timestamp;
	u64			duration;
	unsigned long		nr;
	sem_t			*wait_sem;
	struct task_desc	*wakee;
};

#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"

enum thread_state {
	THREAD_SLEEPING = 0,
	THREAD_WAIT_CPU,
	THREAD_SCHED_IN,
	THREAD_IGNORE
};

struct work_atom {
	struct list_head	list;
	enum thread_state	state;
	u64			sched_out_time;
	u64			wake_up_time;
	u64			sched_in_time;
	u64			runtime;
};

struct work_atoms {
	struct list_head	work_list;
	struct thread		*thread;
	struct rb_node		node;
	u64			max_lat;
	u64			max_lat_at;
	u64			total_lat;
	u64			nb_atoms;
	u64			total_runtime;
	int			num_merged;
};

typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);

struct perf_sched;

struct trace_sched_handler {
	int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);

	int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			     struct perf_sample *sample, struct machine *machine);

	int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);

	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
	int (*fork_event)(struct perf_sched *sched, union perf_event *event,
			  struct machine *machine);

	int (*migrate_task_event)(struct perf_sched *sched,
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);
};

#define COLOR_PIDS PERF_COLOR_BLUE
#define COLOR_CPUS PERF_COLOR_BG_RED

struct perf_sched_map {
	DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);
	int			*comp_cpus;
	bool			 comp;
	struct thread_map	*color_pids;
	const char		*color_pids_str;
	struct cpu_map		*color_cpus;
	const char		*color_cpus_str;
	struct cpu_map		*cpus;
	const char		*cpus_str;
};

struct perf_sched {
	struct perf_tool tool;
	const char	 *sort_order;
	unsigned long	 nr_tasks;
	struct task_desc **pid_to_task;
	struct task_desc **tasks;
	const struct trace_sched_handler *tp_handler;
	pthread_mutex_t	 start_work_mutex;
	pthread_mutex_t	 work_done_wait_mutex;
	int		 profile_cpu;
/*
 * Track the current task - that way we can know whether there's any
 * weird events, such as a task being switched away that is not current.
 */
	int		 max_cpu;
	u32		 curr_pid[MAX_CPUS];
	struct thread	 *curr_thread[MAX_CPUS];
	char		 next_shortname1;
	char		 next_shortname2;
	unsigned int	 replay_repeat;
	unsigned long	 nr_run_events;
	unsigned long	 nr_sleep_events;
	unsigned long	 nr_wakeup_events;
	unsigned long	 nr_sleep_corrections;
	unsigned long	 nr_run_events_optimized;
	unsigned long	 targetless_wakeups;
	unsigned long	 multitarget_wakeups;
	unsigned long	 nr_runs;
	unsigned long	 nr_timestamps;
	unsigned long	 nr_unordered_timestamps;
	unsigned long	 nr_context_switch_bugs;
	unsigned long	 nr_events;
	unsigned long	 nr_lost_chunks;
	unsigned long	 nr_lost_events;
	u64		 run_measurement_overhead;
	u64		 sleep_measurement_overhead;
	u64		 start_time;
	u64		 cpu_usage;
	u64		 runavg_cpu_usage;
	u64		 parent_cpu_usage;
	u64		 runavg_parent_cpu_usage;
	u64		 sum_runtime;
	u64		 sum_fluct;
	u64		 run_avg;
	u64		 all_runtime;
	u64		 all_count;
	u64		 cpu_last_switched[MAX_CPUS];
	struct rb_root	 atom_root, sorted_atom_root, merged_atom_root;
	struct list_head sort_list, cmp_pid;
	bool force;
	bool skip_merge;
	struct perf_sched_map map;

	/* options for timehist command */
	bool		summary;
	bool		summary_only;
	bool		show_callchain;
	unsigned int	max_stack;
	bool		show_cpu_visual;
	bool		show_wakeups;
	u64		skipped_samples;
};

/* per thread run time data */
struct thread_runtime {
	u64 last_time;      /* time of previous sched in/out event */
	u64 dt_run;         /* run time */
	u64 dt_wait;        /* time between CPU access (off cpu) */
	u64 dt_delay;       /* time between wakeup and sched-in */
	u64 ready_to_run;   /* time of wakeup */

	struct stats run_stats;
	u64 total_run_time;
};

/* per event run time data */
struct evsel_runtime {
	u64 *last_time; /* time this event was last seen per cpu */
	u32 ncpu;       /* highest cpu slot allocated */
};

/* track idle times per cpu */
static struct thread **idle_threads;
static int idle_max_cpu;
static char idle_comm[] = "<idle>";

static u64 get_nsecs(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);

	return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
}

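/*
 * Busy-loop for @nsecs, compensating for the calibrated overhead of
 * the timing loop itself so that short intervals are not over-burned.
 */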
static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
{
	u64 T0 = get_nsecs(), T1;

	do {
		T1 = get_nsecs();
	} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
}

static void sleep_nsecs(u64 nsecs)
{
	struct timespec ts;

	ts.tv_nsec = nsecs % 999999999;
	ts.tv_sec = nsecs / 999999999;

	nanosleep(&ts, NULL);
}

static void calibrate_run_measurement_overhead(struct perf_sched *sched)
{
	u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		burn_nsecs(sched, 0);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	sched->run_measurement_overhead = min_delta;

	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
{
	u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		sleep_nsecs(10000);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	min_delta -= 10000;
	sched->sleep_measurement_overhead = min_delta;

	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

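/*
 * Per-task event atoms live in a realloc-grown array; the helpers
 * below append one atom per run/sleep/wakeup seen in the trace.
 */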
static struct sched_atom *
get_new_event(struct task_desc *task, u64 timestamp)
{
	struct sched_atom *event = zalloc(sizeof(*event));
	unsigned long idx = task->nr_events;
	size_t size;

	event->timestamp = timestamp;
	event->nr = idx;

	task->nr_events++;
	size = sizeof(struct sched_atom *) * task->nr_events;
	task->atoms = realloc(task->atoms, size);
	BUG_ON(!task->atoms);

	task->atoms[idx] = event;

	return event;
}

static struct sched_atom *last_event(struct task_desc *task)
{
	if (!task->nr_events)
		return NULL;

	return task->atoms[task->nr_events - 1];
}

static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
				u64 timestamp, u64 duration)
{
	struct sched_atom *event, *curr_event = last_event(task);

	/*
	 * optimize an existing RUN event by merging this one
	 * to it:
	 */
	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
		sched->nr_run_events_optimized++;
		curr_event->duration += duration;
		return;
	}

	event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_RUN;
	event->duration = duration;

	sched->nr_run_events++;
}

static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
				   u64 timestamp, struct task_desc *wakee)
{
	struct sched_atom *event, *wakee_event;

	event = get_new_event(task, timestamp);
	event->type = SCHED_EVENT_WAKEUP;
	event->wakee = wakee;

	wakee_event = last_event(wakee);
	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
		sched->targetless_wakeups++;
		return;
	}
	if (wakee_event->wait_sem) {
		sched->multitarget_wakeups++;
		return;
	}

	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
	sem_init(wakee_event->wait_sem, 0, 0);
	wakee_event->specific_wait = 1;
	event->wait_sem = wakee_event->wait_sem;

	sched->nr_wakeup_events++;
}

static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
				  u64 timestamp, u64 task_state __maybe_unused)
{
	struct sched_atom *event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_SLEEP;

	sched->nr_sleep_events++;
}

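/*
 * Tasks are looked up via a pid-indexed table sized from
 * kernel.pid_max (falling back to MAX_PID) and grown on demand for
 * pids beyond that.
 */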
static struct task_desc *register_pid(struct perf_sched *sched,
				      unsigned long pid, const char *comm)
{
	struct task_desc *task;
	static int pid_max;

	if (sched->pid_to_task == NULL) {
		if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
			pid_max = MAX_PID;
		BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL);
	}
	if (pid >= (unsigned long)pid_max) {
		BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) *
			sizeof(struct task_desc *))) == NULL);
		while (pid >= (unsigned long)pid_max)
			sched->pid_to_task[pid_max++] = NULL;
	}

	task = sched->pid_to_task[pid];

	if (task)
		return task;

	task = zalloc(sizeof(*task));
	task->pid = pid;
	task->nr = sched->nr_tasks;
	strcpy(task->comm, comm);
	/*
	 * every task starts in sleeping state - this gets ignored
	 * if there's no wakeup pointing to this sleep state:
	 */
	add_sched_event_sleep(sched, task, 0, 0);

	sched->pid_to_task[pid] = task;
	sched->nr_tasks++;
	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
	BUG_ON(!sched->tasks);
	sched->tasks[task->nr] = task;

	if (verbose)
		printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);

	return task;
}


static void print_task_traces(struct perf_sched *sched)
{
	struct task_desc *task;
	unsigned long i;

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
			task->nr, task->comm, task->pid, task->nr_events);
	}
}

static void add_cross_task_wakeups(struct perf_sched *sched)
{
	struct task_desc *task1, *task2;
	unsigned long i, j;

	for (i = 0; i < sched->nr_tasks; i++) {
		task1 = sched->tasks[i];
		j = i + 1;
		if (j == sched->nr_tasks)
			j = 0;
		task2 = sched->tasks[j];
		add_sched_event_wakeup(sched, task1, 0, task2);
	}
}

static void perf_sched__process_event(struct perf_sched *sched,
				      struct sched_atom *atom)
{
	int ret = 0;

	switch (atom->type) {
		case SCHED_EVENT_RUN:
			burn_nsecs(sched, atom->duration);
			break;
		case SCHED_EVENT_SLEEP:
			if (atom->wait_sem)
				ret = sem_wait(atom->wait_sem);
			BUG_ON(ret);
			break;
		case SCHED_EVENT_WAKEUP:
			if (atom->wait_sem)
				ret = sem_post(atom->wait_sem);
			BUG_ON(ret);
			break;
		case SCHED_EVENT_MIGRATION:
			break;
		default:
			BUG_ON(1);
	}
}

static u64 get_cpu_usage_nsec_parent(void)
{
	struct rusage ru;
	u64 sum;
	int err;

	err = getrusage(RUSAGE_SELF, &ru);
	BUG_ON(err);

	sum =  ru.ru_utime.tv_sec * NSEC_PER_SEC + ru.ru_utime.tv_usec * NSEC_PER_USEC;
	sum += ru.ru_stime.tv_sec * NSEC_PER_SEC + ru.ru_stime.tv_usec * NSEC_PER_USEC;

	return sum;
}

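/*
 * Open a per-thread PERF_COUNT_SW_TASK_CLOCK counter so each replay
 * thread can measure its own CPU time.  On EMFILE, and only with -f,
 * try raising RLIMIT_NOFILE and retry.
 */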
static int self_open_counters(struct perf_sched *sched, unsigned long cur_task)
{
	struct perf_event_attr attr;
	char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE];
	int fd;
	struct rlimit limit;
	bool need_privilege = false;

	memset(&attr, 0, sizeof(attr));

	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;

force_again:
	fd = sys_perf_event_open(&attr, 0, -1, -1,
				 perf_event_open_cloexec_flag());

	if (fd < 0) {
		if (errno == EMFILE) {
			if (sched->force) {
				BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1);
				limit.rlim_cur += sched->nr_tasks - cur_task;
				if (limit.rlim_cur > limit.rlim_max) {
					limit.rlim_max = limit.rlim_cur;
					need_privilege = true;
				}
				if (setrlimit(RLIMIT_NOFILE, &limit) == -1) {
					if (need_privilege && errno == EPERM)
						strcpy(info, "Need privilege\n");
				} else
					goto force_again;
			} else
				strcpy(info, "Have a try with -f option\n");
		}
		pr_err("Error: sys_perf_event_open() syscall returned "
		       "with %d (%s)\n%s", fd,
		       str_error_r(errno, sbuf, sizeof(sbuf)), info);
		exit(EXIT_FAILURE);
	}
	return fd;
}

static u64 get_cpu_usage_nsec_self(int fd)
{
	u64 runtime;
	int ret;

	ret = read(fd, &runtime, sizeof(runtime));
	BUG_ON(ret != sizeof(runtime));

	return runtime;
}

struct sched_thread_parms {
	struct task_desc  *task;
	struct perf_sched *sched;
	int fd;
};

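/*
 * Body of each replay thread: signal readiness, wait for the parent
 * to release start_work_mutex, replay this task's atoms while
 * sampling its task clock, then report completion and block on
 * work_done_wait_mutex until the next iteration.
 */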
static void *thread_func(void *ctx)
{
	struct sched_thread_parms *parms = ctx;
	struct task_desc *this_task = parms->task;
	struct perf_sched *sched = parms->sched;
	u64 cpu_usage_0, cpu_usage_1;
	unsigned long i, ret;
	char comm2[22];
	int fd = parms->fd;

	zfree(&parms);

	sprintf(comm2, ":%s", this_task->comm);
	prctl(PR_SET_NAME, comm2);
	if (fd < 0)
		return NULL;
again:
	ret = sem_post(&this_task->ready_for_work);
	BUG_ON(ret);
	ret = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&sched->start_work_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_self(fd);

	for (i = 0; i < this_task->nr_events; i++) {
		this_task->curr_event = i;
		perf_sched__process_event(sched, this_task->atoms[i]);
	}

	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
	ret = sem_post(&this_task->work_done_sem);
	BUG_ON(ret);

	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
	BUG_ON(ret);

	goto again;
}

static void create_tasks(struct perf_sched *sched)
{
	struct task_desc *task;
	pthread_attr_t attr;
	unsigned long i;
	int err;

	err = pthread_attr_init(&attr);
	BUG_ON(err);
	err = pthread_attr_setstacksize(&attr,
			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
	BUG_ON(err);
	err = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(err);
	err = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(err);
	for (i = 0; i < sched->nr_tasks; i++) {
		struct sched_thread_parms *parms = malloc(sizeof(*parms));
		BUG_ON(parms == NULL);
		parms->task = task = sched->tasks[i];
		parms->sched = sched;
		parms->fd = self_open_counters(sched, i);
		sem_init(&task->sleep_sem, 0, 0);
		sem_init(&task->ready_for_work, 0, 0);
		sem_init(&task->work_done_sem, 0, 0);
		task->curr_event = 0;
		err = pthread_create(&task->thread, &attr, thread_func, parms);
		BUG_ON(err);
	}
}

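/*
 * The runavg_* fields keep a decaying average over replay_repeat
 * runs: avg' = (avg * (repeat - 1) + sample) / repeat.  E.g. with
 * repeat == 10, avg == 100 and sample == 120: (100 * 9 + 120) / 10 = 102.
 */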
static void wait_for_tasks(struct perf_sched *sched)
{
	u64 cpu_usage_0, cpu_usage_1;
	struct task_desc *task;
	unsigned long i, ret;

	sched->start_time = get_nsecs();
	sched->cpu_usage = 0;
	pthread_mutex_unlock(&sched->work_done_wait_mutex);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		ret = sem_wait(&task->ready_for_work);
		BUG_ON(ret);
		sem_init(&task->ready_for_work, 0, 0);
	}
	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_parent();

	pthread_mutex_unlock(&sched->start_work_mutex);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		ret = sem_wait(&task->work_done_sem);
		BUG_ON(ret);
		sem_init(&task->work_done_sem, 0, 0);
		sched->cpu_usage += task->cpu_usage;
		task->cpu_usage = 0;
	}

	cpu_usage_1 = get_cpu_usage_nsec_parent();
	if (!sched->runavg_cpu_usage)
		sched->runavg_cpu_usage = sched->cpu_usage;
	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat;

	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
	if (!sched->runavg_parent_cpu_usage)
		sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) +
					 sched->parent_cpu_usage)/sched->replay_repeat;

	ret = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(ret);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		sem_init(&task->sleep_sem, 0, 0);
		task->curr_event = 0;
	}
}

static void run_one_test(struct perf_sched *sched)
{
	u64 T0, T1, delta, avg_delta, fluct;

	T0 = get_nsecs();
	wait_for_tasks(sched);
	T1 = get_nsecs();

	delta = T1 - T0;
	sched->sum_runtime += delta;
	sched->nr_runs++;

	avg_delta = sched->sum_runtime / sched->nr_runs;
	if (delta < avg_delta)
		fluct = avg_delta - delta;
	else
		fluct = delta - avg_delta;
	sched->sum_fluct += fluct;
	if (!sched->run_avg)
		sched->run_avg = delta;
	sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat;

	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / NSEC_PER_MSEC);

	printf("ravg: %0.2f, ", (double)sched->run_avg / NSEC_PER_MSEC);

	printf("cpu: %0.2f / %0.2f",
		(double)sched->cpu_usage / NSEC_PER_MSEC, (double)sched->runavg_cpu_usage / NSEC_PER_MSEC);

#if 0
	/*
	 * rusage statistics done by the parent, these are less
	 * accurate than the sched->sum_exec_runtime based statistics:
	 */
	printf(" [%0.2f / %0.2f]",
		(double)sched->parent_cpu_usage / NSEC_PER_MSEC,
		(double)sched->runavg_parent_cpu_usage / NSEC_PER_MSEC);
#endif

	printf("\n");

	if (sched->nr_sleep_corrections)
		printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
	sched->nr_sleep_corrections = 0;
}

static void test_calibrations(struct perf_sched *sched)
{
	u64 T0, T1;

	T0 = get_nsecs();
	burn_nsecs(sched, NSEC_PER_MSEC);
	T1 = get_nsecs();

	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);

	T0 = get_nsecs();
	sleep_nsecs(NSEC_PER_MSEC);
	T1 = get_nsecs();

	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
}

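/*
 * Tracepoint handlers for 'perf sched replay': translate the recorded
 * wakeup/switch stream back into the atoms the worker threads execute.
 */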
static int
replay_wakeup_event(struct perf_sched *sched,
		    struct perf_evsel *evsel, struct perf_sample *sample,
		    struct machine *machine __maybe_unused)
{
	const char *comm = perf_evsel__strval(evsel, sample, "comm");
	const u32 pid	 = perf_evsel__intval(evsel, sample, "pid");
	struct task_desc *waker, *wakee;

	if (verbose) {
		printf("sched_wakeup event %p\n", evsel);

		printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
	}

	waker = register_pid(sched, sample->tid, "<unknown>");
	wakee = register_pid(sched, pid, comm);

	add_sched_event_wakeup(sched, waker, sample->time, wakee);
	return 0;
}

static int replay_switch_event(struct perf_sched *sched,
			       struct perf_evsel *evsel,
			       struct perf_sample *sample,
			       struct machine *machine __maybe_unused)
{
	const char *prev_comm  = perf_evsel__strval(evsel, sample, "prev_comm"),
		   *next_comm  = perf_evsel__strval(evsel, sample, "next_comm");
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
	struct task_desc *prev, __maybe_unused *next;
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu;
	s64 delta;

	if (verbose)
		printf("sched_switch event %p\n", evsel);

	if (cpu >= MAX_CPUS || cpu < 0)
		return 0;

	timestamp0 = sched->cpu_last_switched[cpu];
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0) {
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
		return -1;
	}

	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
		 prev_comm, prev_pid, next_comm, next_pid, delta);

	prev = register_pid(sched, prev_pid, prev_comm);
	next = register_pid(sched, next_pid, next_comm);

	sched->cpu_last_switched[cpu] = timestamp;

	add_sched_event_run(sched, prev, timestamp, delta);
	add_sched_event_sleep(sched, prev, timestamp, prev_state);

	return 0;
}

static int replay_fork_event(struct perf_sched *sched,
			     union perf_event *event,
			     struct machine *machine)
{
	struct thread *child, *parent;

	child = machine__findnew_thread(machine, event->fork.pid,
					event->fork.tid);
	parent = machine__findnew_thread(machine, event->fork.ppid,
					 event->fork.ptid);

	if (child == NULL || parent == NULL) {
		pr_debug("thread does not exist on fork event: child %p, parent %p\n",
				 child, parent);
		goto out_put;
	}

	if (verbose) {
		printf("fork event\n");
		printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
		printf("...  child: %s/%d\n", thread__comm_str(child), child->tid);
	}

	register_pid(sched, parent->tid, thread__comm_str(parent));
	register_pid(sched, child->tid, thread__comm_str(child));
out_put:
	thread__put(child);
	thread__put(parent);
	return 0;
}

struct sort_dimension {
	const char		*name;
	sort_fn_t		cmp;
	struct list_head	list;
};

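/*
 * Compare two threads with each selected sort key in order; the first
 * key that differs decides.  E.g. with the sort spec "max,pid",
 * max_cmp() is tried first and pid_cmp() breaks ties.
 */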
static int
thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
{
	struct sort_dimension *sort;
	int ret = 0;

	BUG_ON(list_empty(list));

	list_for_each_entry(sort, list, list) {
		ret = sort->cmp(l, r);
		if (ret)
			return ret;
	}

	return ret;
}

static struct work_atoms *
thread_atoms_search(struct rb_root *root, struct thread *thread,
			 struct list_head *sort_list)
{
	struct rb_node *node = root->rb_node;
	struct work_atoms key = { .thread = thread };

	while (node) {
		struct work_atoms *atoms;
		int cmp;

		atoms = container_of(node, struct work_atoms, node);

		cmp = thread_lat_cmp(sort_list, &key, atoms);
		if (cmp > 0)
			node = node->rb_left;
		else if (cmp < 0)
			node = node->rb_right;
		else {
			BUG_ON(thread != atoms->thread);
			return atoms;
		}
	}
	return NULL;
}

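/* rb-tree insert, ordered by the same comparator chain as the search */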
static void
__thread_latency_insert(struct rb_root *root, struct work_atoms *data,
			 struct list_head *sort_list)
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;

	while (*new) {
		struct work_atoms *this;
		int cmp;

		this = container_of(*new, struct work_atoms, node);
		parent = *new;

		cmp = thread_lat_cmp(sort_list, data, this);

		if (cmp > 0)
			new = &((*new)->rb_left);
		else
			new = &((*new)->rb_right);
	}

	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
}

static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
{
	struct work_atoms *atoms = zalloc(sizeof(*atoms));
	if (!atoms) {
		pr_err("No memory at %s\n", __func__);
		return -1;
	}

	atoms->thread = thread__get(thread);
	INIT_LIST_HEAD(&atoms->work_list);
	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
	return 0;
}

static char sched_out_state(u64 prev_state)
{
	const char *str = TASK_STATE_TO_CHAR_STR;

	return str[prev_state];
}

static int
add_sched_out_event(struct work_atoms *atoms,
		    char run_state,
		    u64 timestamp)
{
	struct work_atom *atom = zalloc(sizeof(*atom));
	if (!atom) {
		pr_err("No memory at %s", __func__);
		return -1;
	}

	atom->sched_out_time = timestamp;

	if (run_state == 'R') {
		atom->state = THREAD_WAIT_CPU;
		atom->wake_up_time = atom->sched_out_time;
	}

	list_add_tail(&atom->list, &atoms->work_list);
	return 0;
}

static void
add_runtime_event(struct work_atoms *atoms, u64 delta,
		  u64 timestamp __maybe_unused)
{
	struct work_atom *atom;

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	atom->runtime += delta;
	atoms->total_runtime += delta;
}

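/*
 * A wait period is complete once the task gets the CPU again:
 * latency = sched_in_time - wake_up_time.  Track the total, the
 * count and the maximum (plus its timestamp) per thread.
 */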
static void
add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
{
	struct work_atom *atom;
	u64 delta;

	if (list_empty(&atoms->work_list))
		return;

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	if (atom->state != THREAD_WAIT_CPU)
		return;

	if (timestamp < atom->wake_up_time) {
		atom->state = THREAD_IGNORE;
		return;
	}

	atom->state = THREAD_SCHED_IN;
	atom->sched_in_time = timestamp;

	delta = atom->sched_in_time - atom->wake_up_time;
	atoms->total_lat += delta;
	if (delta > atoms->max_lat) {
		atoms->max_lat = delta;
		atoms->max_lat_at = timestamp;
	}
	atoms->nb_atoms++;
}

static int latency_switch_event(struct perf_sched *sched,
				struct perf_evsel *evsel,
				struct perf_sample *sample,
				struct machine *machine)
{
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
	struct work_atoms *out_events, *in_events;
	struct thread *sched_out, *sched_in;
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu, err = -1;
	s64 delta;

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);

	timestamp0 = sched->cpu_last_switched[cpu];
	sched->cpu_last_switched[cpu] = timestamp;
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0) {
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
		return -1;
	}

	sched_out = machine__findnew_thread(machine, -1, prev_pid);
	sched_in = machine__findnew_thread(machine, -1, next_pid);
	if (sched_out == NULL || sched_in == NULL)
		goto out_put;

	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
	if (!out_events) {
		if (thread_atoms_insert(sched, sched_out))
			goto out_put;
		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
		if (!out_events) {
			pr_err("out-event: Internal tree error");
			goto out_put;
		}
	}
	if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
		return -1;

	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
	if (!in_events) {
		if (thread_atoms_insert(sched, sched_in))
			goto out_put;
		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
		if (!in_events) {
			pr_err("in-event: Internal tree error");
			goto out_put;
		}
		/*
		 * Task came in that we have not heard about yet,
		 * add in an initial atom in runnable state:
		 */
		if (add_sched_out_event(in_events, 'R', timestamp))
			goto out_put;
	}
	add_sched_in_event(in_events, timestamp);
	err = 0;
out_put:
	thread__put(sched_out);
	thread__put(sched_in);
	return err;
}

static int latency_runtime_event(struct perf_sched *sched,
				 struct perf_evsel *evsel,
				 struct perf_sample *sample,
				 struct machine *machine)
{
	const u32 pid	   = perf_evsel__intval(evsel, sample, "pid");
	const u64 runtime  = perf_evsel__intval(evsel, sample, "runtime");
	struct thread *thread = machine__findnew_thread(machine, -1, pid);
	struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
	u64 timestamp = sample->time;
	int cpu = sample->cpu, err = -1;

	if (thread == NULL)
		return -1;

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
	if (!atoms) {
		if (thread_atoms_insert(sched, thread))
			goto out_put;
		atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
		if (!atoms) {
			pr_err("in-event: Internal tree error");
			goto out_put;
		}
		if (add_sched_out_event(atoms, 'R', timestamp))
			goto out_put;
	}

	add_runtime_event(atoms, runtime, timestamp);
	err = 0;
out_put:
	thread__put(thread);
	return err;
}

static int latency_wakeup_event(struct perf_sched *sched,
				struct perf_evsel *evsel,
				struct perf_sample *sample,
				struct machine *machine)
{
	const u32 pid	  = perf_evsel__intval(evsel, sample, "pid");
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *wakee;
	u64 timestamp = sample->time;
	int err = -1;

	wakee = machine__findnew_thread(machine, -1, pid);
	if (wakee == NULL)
		return -1;
	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
	if (!atoms) {
		if (thread_atoms_insert(sched, wakee))
			goto out_put;
		atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
		if (!atoms) {
			pr_err("wakeup-event: Internal tree error");
			goto out_put;
		}
		if (add_sched_out_event(atoms, 'S', timestamp))
			goto out_put;
	}

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	/*
	 * A wakeup event is not guaranteed to arrive while the task is
	 * off the run queue: it can also fire for a task that is already
	 * runnable, merely setting ->state back to TASK_RUNNING.  Do not
	 * update ->wake_up_time in that case.
	 *
	 * You WILL be missing events if you've recorded only
	 * one CPU, or are only looking at only one, so don't
	 * skip in this case.
	 */
	if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
		goto out_ok;

	sched->nr_timestamps++;
	if (atom->sched_out_time > timestamp) {
		sched->nr_unordered_timestamps++;
		goto out_ok;
	}

	atom->state = THREAD_WAIT_CPU;
	atom->wake_up_time = timestamp;
out_ok:
	err = 0;
out_put:
	thread__put(wakee);
	return err;
}

static int latency_migrate_task_event(struct perf_sched *sched,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
{
	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
	u64 timestamp = sample->time;
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *migrant;
	int err = -1;

	/*
	 * Only need to worry about migration when profiling one CPU.
	 */
	if (sched->profile_cpu == -1)
		return 0;

	migrant = machine__findnew_thread(machine, -1, pid);
	if (migrant == NULL)
		return -1;
	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
	if (!atoms) {
		if (thread_atoms_insert(sched, migrant))
			goto out_put;
		register_pid(sched, migrant->tid, thread__comm_str(migrant));
		atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
		if (!atoms) {
			pr_err("migration-event: Internal tree error");
			goto out_put;
		}
		if (add_sched_out_event(atoms, 'R', timestamp))
			goto out_put;
	}

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;

	sched->nr_timestamps++;

	if (atom->sched_out_time > timestamp)
		sched->nr_unordered_timestamps++;
	err = 0;
out_put:
	thread__put(migrant);
	return err;
}

static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
{
	int i;
	int ret;
	u64 avg;
	char max_lat_at[32];

	if (!work_list->nb_atoms)
		return;
	/*
	 * Ignore idle threads:
	 */
	if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
		return;

	sched->all_runtime += work_list->total_runtime;
	sched->all_count   += work_list->nb_atoms;

	if (work_list->num_merged > 1)
		ret = printf("  %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
	else
		ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);

	for (i = 0; i < 24 - ret; i++)
		printf(" ");

	avg = work_list->total_lat / work_list->nb_atoms;
	timestamp__scnprintf_usec(work_list->max_lat_at, max_lat_at, sizeof(max_lat_at));

	printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %13s s\n",
	      (double)work_list->total_runtime / NSEC_PER_MSEC,
		 work_list->nb_atoms, (double)avg / NSEC_PER_MSEC,
		 (double)work_list->max_lat / NSEC_PER_MSEC,
		 max_lat_at);
}

static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->thread == r->thread)
		return 0;
	if (l->thread->tid < r->thread->tid)
		return -1;
	if (l->thread->tid > r->thread->tid)
		return 1;
	return (int)(l->thread - r->thread);
}

static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
{
	u64 avgl, avgr;

	if (!l->nb_atoms)
		return -1;

	if (!r->nb_atoms)
		return 1;

	avgl = l->total_lat / l->nb_atoms;
	avgr = r->total_lat / r->nb_atoms;

	if (avgl < avgr)
		return -1;
	if (avgl > avgr)
		return 1;

	return 0;
}

static int max_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->max_lat < r->max_lat)
		return -1;
	if (l->max_lat > r->max_lat)
		return 1;

	return 0;
}

static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->nb_atoms < r->nb_atoms)
		return -1;
	if (l->nb_atoms > r->nb_atoms)
		return 1;

	return 0;
}

static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
{
	if (l->total_runtime < r->total_runtime)
		return -1;
	if (l->total_runtime > r->total_runtime)
		return 1;

	return 0;
}

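/*
 * Match one token of the user-supplied sort specification against the
 * available dimensions, e.g. "runtime,max" queues runtime_cmp first
 * with max_cmp as the tie-breaker.
 */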
static int sort_dimension__add(const char *tok, struct list_head *list)
{
	size_t i;
	static struct sort_dimension avg_sort_dimension = {
		.name = "avg",
		.cmp  = avg_cmp,
	};
	static struct sort_dimension max_sort_dimension = {
		.name = "max",
		.cmp  = max_cmp,
	};
	static struct sort_dimension pid_sort_dimension = {
		.name = "pid",
		.cmp  = pid_cmp,
	};
	static struct sort_dimension runtime_sort_dimension = {
		.name = "runtime",
		.cmp  = runtime_cmp,
	};
	static struct sort_dimension switch_sort_dimension = {
		.name = "switch",
		.cmp  = switch_cmp,
	};
	struct sort_dimension *available_sorts[] = {
		&pid_sort_dimension,
		&avg_sort_dimension,
		&max_sort_dimension,
		&switch_sort_dimension,
		&runtime_sort_dimension,
	};

	for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
		if (!strcmp(available_sorts[i]->name, tok)) {
			list_add_tail(&available_sorts[i]->list, list);

			return 0;
		}
	}

	return -1;
}

static void perf_sched__sort_lat(struct perf_sched *sched)
{
	struct rb_node *node;
	struct rb_root *root = &sched->atom_root;
again:
	for (;;) {
		struct work_atoms *data;
		node = rb_first(root);
		if (!node)
			break;

		rb_erase(node, root);
		data = rb_entry(node, struct work_atoms, node);
		__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
	}
	if (root == &sched->atom_root) {
		root = &sched->merged_atom_root;
		goto again;
	}
}

static int process_sched_wakeup_event(struct perf_tool *tool,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	if (sched->tp_handler->wakeup_event)
		return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);

	return 0;
}

union map_priv {
	void	*ptr;
	bool	 color;
};

static bool thread__has_color(struct thread *thread)
{
	union map_priv priv = {
		.ptr = thread__priv(thread),
	};

	return priv.color;
}

static struct thread*
map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid)
{
	struct thread *thread = machine__findnew_thread(machine, pid, tid);
	union map_priv priv = {
		.color = false,
	};

	if (!sched->map.color_pids || !thread || thread__priv(thread))
		return thread;

	if (thread_map__has(sched->map.color_pids, tid))
		priv.color = true;

	thread__set_priv(thread, priv.ptr);
	return thread;
}

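/*
 * 'perf sched map' output: one column per CPU, a two-character
 * shortname per task, and '*' marking the CPU where this switch
 * event occurred.
 */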
static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine)
{
	const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	struct thread *sched_in;
	int new_shortname;
	u64 timestamp0, timestamp = sample->time;
	s64 delta;
	int i, this_cpu = sample->cpu;
	int cpus_nr;
	bool new_cpu = false;
	const char *color = PERF_COLOR_NORMAL;
	char stimestamp[32];

	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);

	if (this_cpu > sched->max_cpu)
		sched->max_cpu = this_cpu;

	if (sched->map.comp) {
		cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS);
		if (!test_and_set_bit(this_cpu, sched->map.comp_cpus_mask)) {
			sched->map.comp_cpus[cpus_nr++] = this_cpu;
			new_cpu = true;
		}
	} else
		cpus_nr = sched->max_cpu;

	timestamp0 = sched->cpu_last_switched[this_cpu];
	sched->cpu_last_switched[this_cpu] = timestamp;
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0) {
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
		return -1;
	}

	sched_in = map__findnew_thread(sched, machine, -1, next_pid);
	if (sched_in == NULL)
		return -1;

	sched->curr_thread[this_cpu] = thread__get(sched_in);

	printf("  ");

	new_shortname = 0;
	if (!sched_in->shortname[0]) {
		if (!strcmp(thread__comm_str(sched_in), "swapper")) {
			/*
			 * Don't allocate a letter-number for swapper:0
			 * as a shortname. Instead, we use '.' for it.
			 */
			sched_in->shortname[0] = '.';
			sched_in->shortname[1] = ' ';
		} else {
			sched_in->shortname[0] = sched->next_shortname1;
			sched_in->shortname[1] = sched->next_shortname2;

			if (sched->next_shortname1 < 'Z') {
				sched->next_shortname1++;
			} else {
				sched->next_shortname1 = 'A';
				if (sched->next_shortname2 < '9')
					sched->next_shortname2++;
				else
					sched->next_shortname2 = '0';
			}
		}
		new_shortname = 1;
	}

	for (i = 0; i < cpus_nr; i++) {
		int cpu = sched->map.comp ? sched->map.comp_cpus[i] : i;
		struct thread *curr_thread = sched->curr_thread[cpu];
		const char *pid_color = color;
		const char *cpu_color = color;

		if (curr_thread && thread__has_color(curr_thread))
			pid_color = COLOR_PIDS;

		if (sched->map.cpus && !cpu_map__has(sched->map.cpus, cpu))
			continue;

		if (sched->map.color_cpus && cpu_map__has(sched->map.color_cpus, cpu))
			cpu_color = COLOR_CPUS;

		if (cpu != this_cpu)
			color_fprintf(stdout, color, " ");
		else
			color_fprintf(stdout, cpu_color, "*");

		if (sched->curr_thread[cpu])
			color_fprintf(stdout, pid_color, "%2s ", sched->curr_thread[cpu]->shortname);
		else
			color_fprintf(stdout, color, "   ");
	}

	if (sched->map.cpus && !cpu_map__has(sched->map.cpus, this_cpu))
		goto out;

	timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp));
	color_fprintf(stdout, color, "  %12s secs ", stimestamp);
	if (new_shortname || (verbose && sched_in->tid)) {
		const char *pid_color = color;

		if (thread__has_color(sched_in))
			pid_color = COLOR_PIDS;

		color_fprintf(stdout, pid_color, "%s => %s:%d",
		       sched_in->shortname, thread__comm_str(sched_in), sched_in->tid);
	}

	if (sched->map.comp && new_cpu)
		color_fprintf(stdout, color, " (CPU %d)", this_cpu);

out:
	color_fprintf(stdout, color, "\n");

	thread__put(sched_in);

	return 0;
}

static int process_sched_switch_event(struct perf_tool *tool,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
	int this_cpu = sample->cpu, err = 0;
	u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
	    next_pid = perf_evsel__intval(evsel, sample, "next_pid");

	if (sched->curr_pid[this_cpu] != (u32)-1) {
		/*
		 * Are we trying to switch away a PID that is
		 * not current?
		 */
		if (sched->curr_pid[this_cpu] != prev_pid)
			sched->nr_context_switch_bugs++;
	}

	if (sched->tp_handler->switch_event)
		err = sched->tp_handler->switch_event(sched, evsel, sample, machine);

	sched->curr_pid[this_cpu] = next_pid;
	return err;
}

static int process_sched_runtime_event(struct perf_tool *tool,
				       struct perf_evsel *evsel,
				       struct perf_sample *sample,
				       struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	if (sched->tp_handler->runtime_event)
		return sched->tp_handler->runtime_event(sched, evsel, sample, machine);

	return 0;
}

static int perf_sched__process_fork_event(struct perf_tool *tool,
					  union perf_event *event,
					  struct perf_sample *sample,
					  struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	/* run the fork event through the perf machinery */
	perf_event__process_fork(tool, event, sample, machine);

	/* and then run additional processing needed for this command */
	if (sched->tp_handler->fork_event)
		return sched->tp_handler->fork_event(sched, event, machine);

	return 0;
}

static int process_sched_migrate_task_event(struct perf_tool *tool,
					    struct perf_evsel *evsel,
					    struct perf_sample *sample,
					    struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	if (sched->tp_handler->migrate_task_event)
		return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);

	return 0;
}

typedef int (*tracepoint_handler)(struct perf_tool *tool,
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);

static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
						 union perf_event *event __maybe_unused,
						 struct perf_sample *sample,
						 struct perf_evsel *evsel,
						 struct machine *machine)
{
	int err = 0;

	if (evsel->handler != NULL) {
		tracepoint_handler f = evsel->handler;
		err = f(tool, evsel, sample, machine);
	}

	return err;
}

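/*
 * Open the perf.data session, bind each sched tracepoint to its
 * handler and process all events, recording lost-event statistics.
 */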
static int perf_sched__read_events(struct perf_sched *sched)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "sched:sched_switch",	      process_sched_switch_event, },
		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
		{ "sched:sched_wakeup",	      process_sched_wakeup_event, },
		{ "sched:sched_wakeup_new",   process_sched_wakeup_event, },
		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
	};
	struct perf_session *session;
	struct perf_data_file file = {
		.path = input_name,
		.mode = PERF_DATA_MODE_READ,
		.force = sched->force,
	};
	int rc = -1;

	session = perf_session__new(&file, false, &sched->tool);
	if (session == NULL) {
		pr_debug("No Memory for session\n");
		return -1;
	}

	symbol__init(&session->header.env);

	if (perf_session__set_tracepoints_handlers(session, handlers))
		goto out_delete;

	if (perf_session__has_traces(session, "record -R")) {
		int err = perf_session__process_events(session);
		if (err) {
			pr_err("Failed to process events, error %d", err);
			goto out_delete;
		}

		sched->nr_events      = session->evlist->stats.nr_events[0];
		sched->nr_lost_events = session->evlist->stats.total_lost;
		sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
	}

	rc = 0;
out_delete:
	perf_session__delete(session);
	return rc;
}

/*
 * scheduling times are printed as msec.usec
 */
static inline void print_sched_time(unsigned long long nsecs, int width)
{
	unsigned long msecs;
	unsigned long usecs;

	msecs  = nsecs / NSEC_PER_MSEC;
	nsecs -= msecs * NSEC_PER_MSEC;
	usecs  = nsecs / NSEC_PER_USEC;
	printf("%*lu.%03lu ", width, msecs, usecs);
}

/*
 * returns runtime data for event, allocating memory for it the
 * first time it is used.
 */
static struct evsel_runtime *perf_evsel__get_runtime(struct perf_evsel *evsel)
{
	struct evsel_runtime *r = evsel->priv;

	if (r == NULL) {
		r = zalloc(sizeof(struct evsel_runtime));
		evsel->priv = r;
	}

	return r;
}

/*
 * save last time event was seen per cpu
 */
static void perf_evsel__save_time(struct perf_evsel *evsel,
				  u64 timestamp, u32 cpu)
{
	struct evsel_runtime *r = perf_evsel__get_runtime(evsel);

	if (r == NULL)
		return;

	if ((cpu >= r->ncpu) || (r->last_time == NULL)) {
		int i, n = __roundup_pow_of_two(cpu+1);
		void *p = r->last_time;

		p = realloc(r->last_time, n * sizeof(u64));
		if (!p)
			return;

		r->last_time = p;
		for (i = r->ncpu; i < n; ++i)
			r->last_time[i] = (u64) 0;

		r->ncpu = n;
	}

	r->last_time[cpu] = timestamp;
}

/* returns last time this event was seen on the given cpu */
static u64 perf_evsel__get_time(struct perf_evsel *evsel, u32 cpu)
{
	struct evsel_runtime *r = perf_evsel__get_runtime(evsel);

	if ((r == NULL) || (r->last_time == NULL) || (cpu >= r->ncpu))
		return 0;

	return r->last_time[cpu];
}

static int comm_width = 20;

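/* format "comm[tid/pid]", widening the comm column as needed */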
static char *timehist_get_commstr(struct thread *thread)
{
	static char str[32];
	const char *comm = thread__comm_str(thread);
	pid_t tid = thread->tid;
	pid_t pid = thread->pid_;
	int n;

	if (pid == 0)
		n = scnprintf(str, sizeof(str), "%s", comm);

	else if (tid != pid)
		n = scnprintf(str, sizeof(str), "%s[%d/%d]", comm, tid, pid);

	else
		n = scnprintf(str, sizeof(str), "%s[%d]", comm, tid);

	if (n > comm_width)
		comm_width = n;

	return str;
}

static void timehist_header(struct perf_sched *sched)
{
	u32 ncpus = sched->max_cpu + 1;
	u32 i, j;

	printf("%15s %6s ", "time", "cpu");

	if (sched->show_cpu_visual) {
		printf(" ");
		for (i = 0, j = 0; i < ncpus; ++i) {
			printf("%x", j++);
			if (j > 15)
				j = 0;
		}
		printf(" ");
	}

	printf(" %-20s  %9s  %9s  %9s",
		"task name", "wait time", "sch delay", "run time");

	printf("\n");

	/*
	 * units row
	 */
	printf("%15s %-6s ", "", "");

	if (sched->show_cpu_visual)
		printf(" %*s ", ncpus, "");

	printf(" %-20s  %9s  %9s  %9s\n", "[tid/pid]", "(msec)", "(msec)", "(msec)");

	/*
	 * separator
	 */
	printf("%.15s %.6s ", graph_dotted_line, graph_dotted_line);

	if (sched->show_cpu_visual)
		printf(" %.*s ", ncpus, graph_dotted_line);

	printf(" %.20s  %.9s  %.9s  %.9s",
		graph_dotted_line, graph_dotted_line, graph_dotted_line,
		graph_dotted_line);

	printf("\n");
}

static void timehist_print_sample(struct perf_sched *sched,
				  struct perf_sample *sample,
				  struct addr_location *al,
				  struct thread *thread)
{
	struct thread_runtime *tr = thread__priv(thread);
	u32 max_cpus = sched->max_cpu + 1;
	char tstr[64];

	timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
	printf("%15s [%04d] ", tstr, sample->cpu);

	if (sched->show_cpu_visual) {
		u32 i;
		char c;

		printf(" ");
		for (i = 0; i < max_cpus; ++i) {
			/* flag idle times with 'i'; others are sched events */
			if (i == sample->cpu)
				c = (thread->tid == 0) ? 'i' : 's';
			else
				c = ' ';
			printf("%c", c);
		}
		printf(" ");
	}

	printf(" %-*s ", comm_width, timehist_get_commstr(thread));

	print_sched_time(tr->dt_wait, 6);
	print_sched_time(tr->dt_delay, 6);
	print_sched_time(tr->dt_run, 6);

	if (sched->show_wakeups)
		printf("  %-*s", comm_width, "");

	if (thread->tid == 0)
		goto out;

	if (sched->show_callchain)
		printf("  ");

	sample__fprintf_sym(sample, al, 0,
			    EVSEL__PRINT_SYM | EVSEL__PRINT_ONELINE |
			    EVSEL__PRINT_CALLCHAIN_ARROW |
			    EVSEL__PRINT_SKIP_IGNORED,
			    &callchain_cursor, stdout);

out:
	printf("\n");
}

/*
 * Explanation of delta-time stats:
 *
 *            t = time of current schedule out event
 *        tprev = time of previous sched out event
 *                also time of schedule-in event for current task
 *    last_time = time of last sched change event for current task
 *                (i.e, time process was last scheduled out)
 * ready_to_run = time of wakeup for current task
 *
 * -----|------------|------------|------------|------
 *    last         ready        tprev          t
 *    time         to run
 *
 *      |-------- dt_wait --------|
 *                   |- dt_delay -|-- dt_run --|
 *
 *   dt_run = run time of current task
 *  dt_wait = time between last schedule out event for task and tprev
 *            represents time spent off the cpu
 * dt_delay = time between wakeup and schedule-in of task
 */

static void timehist_update_runtime_stats(struct thread_runtime *r,
					 u64 t, u64 tprev)
{
	r->dt_delay   = 0;
	r->dt_wait    = 0;
	r->dt_run     = 0;
	if (tprev) {
		r->dt_run = t - tprev;
		if (r->ready_to_run) {
			if (r->ready_to_run > tprev)
				pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
			else
				r->dt_delay = tprev - r->ready_to_run;
		}

		if (r->last_time > tprev)
			pr_debug("time travel: last sched out time for task > previous sched_switch event\n");
		else if (r->last_time)
			r->dt_wait = tprev - r->last_time;
	}

	update_stats(&r->run_stats, r->dt_run);
	r->total_run_time += r->dt_run;
}
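
/*
 * Worked example (times are assumed, units arbitrary): a task last
 * switched out at last_time = 100, woken at ready_to_run = 140,
 * scheduled in at tprev = 150 and switched out at t = 180 gives
 * dt_wait = 150 - 100 = 50, dt_delay = 150 - 140 = 10 and
 * dt_run = 180 - 150 = 30.
 */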

static bool is_idle_sample(struct perf_sched *sched,
			   struct perf_sample *sample,
			   struct perf_evsel *evsel,
			   struct machine *machine)
{
	struct thread *thread;
	struct callchain_cursor *cursor = &callchain_cursor;

	/* pid 0 == swapper == idle task */
	if (sample->pid == 0)
		return true;

	if (strcmp(perf_evsel__name(evsel), "sched:sched_switch") == 0) {
		if (perf_evsel__intval(evsel, sample, "prev_pid") == 0)
			return true;
	}

	/* want main thread for process - has maps */
	thread = machine__findnew_thread(machine, sample->pid, sample->pid);
	if (thread == NULL) {
		pr_debug("Failed to get thread for pid %d.\n", sample->pid);
		return false;
	}

	if (!symbol_conf.use_callchain || sample->callchain == NULL)
		return false;

	if (thread__resolve_callchain(thread, cursor, evsel, sample,
				      NULL, NULL, sched->max_stack) != 0) {
		if (verbose)
			error("Failed to resolve callchain. Skipping\n");

		return false;
	}
	callchain_cursor_commit(cursor);
	return false;
}
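
/*
 * Note: for non-idle samples the callchain is resolved and committed here
 * as a side effect, so later printing can walk callchain_cursor without a
 * second resolve pass; the false return still just means "not idle".
 */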

/*
 * Track idle stats per cpu by maintaining a local thread
 * struct for the idle task on each cpu.
 */
static int init_idle_threads(int ncpu)
{
	int i;

	idle_threads = zalloc(ncpu * sizeof(struct thread *));
	if (!idle_threads)
		return -ENOMEM;

	idle_max_cpu = ncpu - 1;

	/* allocate the actual thread struct if needed */
	for (i = 0; i < ncpu; ++i) {
		idle_threads[i] = thread__new(0, 0);
		if (idle_threads[i] == NULL)
			return -ENOMEM;

		thread__set_comm(idle_threads[i], idle_comm, 0);
	}

	return 0;
}

static void free_idle_threads(void)
{
	int i;

	if (idle_threads == NULL)
		return;

	for (i = 0; i <= idle_max_cpu; ++i) {
		if ((idle_threads[i]))
			thread__delete(idle_threads[i]);
	}

	free(idle_threads);
}

static struct thread *get_idle_thread(int cpu)
{
	/*
	 * expand/allocate array of pointers to local thread
	 * structs if needed
	 */
	if ((cpu >= idle_max_cpu) || (idle_threads == NULL)) {
		int i, j = __roundup_pow_of_two(cpu+1);
		void *p;

		p = realloc(idle_threads, j * sizeof(struct thread *));
		if (!p)
			return NULL;

		idle_threads = (struct thread **) p;
		i = idle_max_cpu ? idle_max_cpu + 1 : 0;
		for (; i < j; ++i)
			idle_threads[i] = NULL;

		idle_max_cpu = j;
	}

	/* allocate a new thread struct if needed */
	if (idle_threads[cpu] == NULL) {
		idle_threads[cpu] = thread__new(0, 0);

		if (idle_threads[cpu]) {
			idle_threads[cpu]->tid = 0;
			thread__set_comm(idle_threads[cpu], idle_comm, 0);
		}
	}

	return idle_threads[cpu];
}

/*
 * handle runtime stats saved per thread
 */
static struct thread_runtime *thread__init_runtime(struct thread *thread)
{
	struct thread_runtime *r;

	r = zalloc(sizeof(struct thread_runtime));
	if (!r)
		return NULL;

	init_stats(&r->run_stats);
	thread__set_priv(thread, r);

	return r;
}

static struct thread_runtime *thread__get_runtime(struct thread *thread)
{
	struct thread_runtime *tr;

	tr = thread__priv(thread);
	if (tr == NULL) {
		tr = thread__init_runtime(thread);
		if (tr == NULL)
			pr_debug("Failed to malloc memory for runtime data.\n");
	}

	return tr;
}

static struct thread *timehist_get_thread(struct perf_sched *sched,
					  struct perf_sample *sample,
					  struct machine *machine,
					  struct perf_evsel *evsel)
{
	struct thread *thread;

	if (is_idle_sample(sched, sample, evsel, machine)) {
		thread = get_idle_thread(sample->cpu);
		if (thread == NULL)
			pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu);

	} else {
		thread = machine__findnew_thread(machine, sample->pid, sample->tid);
		if (thread == NULL) {
			pr_debug("Failed to get thread for tid %d. skipping sample.\n",
				 sample->tid);
		}
	}

	return thread;
}

static bool timehist_skip_sample(struct perf_sched *sched,
				 struct thread *thread)
{
	bool rc = false;

	if (thread__is_filtered(thread)) {
		rc = true;
		sched->skipped_samples++;
	}

	return rc;
}

static void timehist_print_wakeup_event(struct perf_sched *sched,
					struct perf_sample *sample,
					struct machine *machine,
					struct thread *awakened)
{
	struct thread *thread;
	char tstr[64];

	thread = machine__findnew_thread(machine, sample->pid, sample->tid);
	if (thread == NULL)
		return;

	/* show wakeup unless both awakee and awaker are filtered */
	if (timehist_skip_sample(sched, thread) &&
	    timehist_skip_sample(sched, awakened)) {
		return;
	}

	timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
	printf("%15s [%04d] ", tstr, sample->cpu);
	if (sched->show_cpu_visual)
		printf(" %*s ", sched->max_cpu + 1, "");

	printf(" %-*s ", comm_width, timehist_get_commstr(thread));

	/* dt spacer */
	printf("  %9s  %9s  %9s ", "", "", "");

	printf("awakened: %s", timehist_get_commstr(awakened));

	printf("\n");
}

static int timehist_sched_wakeup_event(struct perf_tool *tool,
				       union perf_event *event __maybe_unused,
				       struct perf_evsel *evsel,
				       struct perf_sample *sample,
				       struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
	struct thread *thread;
	struct thread_runtime *tr = NULL;
	/* want pid of awakened task not pid in sample */
	const u32 pid = perf_evsel__intval(evsel, sample, "pid");

	thread = machine__findnew_thread(machine, 0, pid);
	if (thread == NULL)
		return -1;

	tr = thread__get_runtime(thread);
	if (tr == NULL)
		return -1;

	if (tr->ready_to_run == 0)
		tr->ready_to_run = sample->time;

	/* show wakeups if requested */
	if (sched->show_wakeups)
		timehist_print_wakeup_event(sched, sample, machine, thread);

	return 0;
}

static int timehist_sched_change_event(struct perf_tool *tool,
				       union perf_event *event,
				       struct perf_evsel *evsel,
				       struct perf_sample *sample,
				       struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
	struct addr_location al;
	struct thread *thread;
	struct thread_runtime *tr = NULL;
	u64 tprev;
	int rc = 0;

	if (machine__resolve(machine, &al, sample) < 0) {
		pr_err("problem processing %d event. skipping it\n",
		       event->header.type);
		rc = -1;
		goto out;
	}

	thread = timehist_get_thread(sched, sample, machine, evsel);
	if (thread == NULL) {
		rc = -1;
		goto out;
	}

	if (timehist_skip_sample(sched, thread))
		goto out;

	tr = thread__get_runtime(thread);
	if (tr == NULL) {
		rc = -1;
		goto out;
	}

	tprev = perf_evsel__get_time(evsel, sample->cpu);

	timehist_update_runtime_stats(tr, sample->time, tprev);
	if (!sched->summary_only)
		timehist_print_sample(sched, sample, &al, thread);

out:
	if (tr) {
		/* time of this sched_switch event becomes last time task seen */
		tr->last_time = sample->time;

		/* sched out event for task so reset ready to run time */
		tr->ready_to_run = 0;
	}

	perf_evsel__save_time(evsel, sample->time, sample->cpu);

	return rc;
}

static int timehist_sched_switch_event(struct perf_tool *tool,
			     union perf_event *event,
			     struct perf_evsel *evsel,
			     struct perf_sample *sample,
			     struct machine *machine __maybe_unused)
{
	return timehist_sched_change_event(tool, event, evsel, sample, machine);
}

static int process_lost(struct perf_tool *tool __maybe_unused,
			union perf_event *event,
			struct perf_sample *sample,
			struct machine *machine __maybe_unused)
{
	char tstr[64];

	timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
	printf("%15s ", tstr);
	printf("lost %" PRIu64 " events on cpu %d\n", event->lost.lost, sample->cpu);

	return 0;
}


static void print_thread_runtime(struct thread *t,
				 struct thread_runtime *r)
{
	double mean = avg_stats(&r->run_stats);
	float stddev;

	printf("%*s   %5d  %9" PRIu64 " ",
	       comm_width, timehist_get_commstr(t), t->ppid,
	       (u64) r->run_stats.n);

	print_sched_time(r->total_run_time, 8);
	stddev = rel_stddev_stats(stddev_stats(&r->run_stats), mean);
	print_sched_time(r->run_stats.min, 6);
	printf(" ");
	print_sched_time((u64) mean, 6);
	printf(" ");
	print_sched_time(r->run_stats.max, 6);
	printf("  ");
	printf("%5.2f", stddev);
	printf("\n");
}

struct total_run_stats {
	u64  sched_count;
	u64  task_count;
	u64  total_run_time;
};

static int __show_thread_runtime(struct thread *t, void *priv)
{
	struct total_run_stats *stats = priv;
	struct thread_runtime *r;

	if (thread__is_filtered(t))
		return 0;

	r = thread__priv(t);
	if (r && r->run_stats.n) {
		stats->task_count++;
		stats->sched_count += r->run_stats.n;
		stats->total_run_time += r->total_run_time;
		print_thread_runtime(t, r);
	}

	return 0;
}

static int show_thread_runtime(struct thread *t, void *priv)
{
	if (t->dead)
		return 0;

	return __show_thread_runtime(t, priv);
}

static int show_deadthread_runtime(struct thread *t, void *priv)
{
	if (!t->dead)
		return 0;

	return __show_thread_runtime(t, priv);
}

static void timehist_print_summary(struct perf_sched *sched,
				   struct perf_session *session)
{
	struct machine *m = &session->machines.host;
	struct total_run_stats totals;
	u64 task_count;
	struct thread *t;
	struct thread_runtime *r;
	int i;

	memset(&totals, 0, sizeof(totals));

	if (comm_width < 30)
		comm_width = 30;

	printf("\nRuntime summary\n");
	printf("%*s  parent   sched-in  ", comm_width, "comm");
	printf("   run-time    min-run     avg-run     max-run  stddev\n");
	printf("%*s            (count)  ", comm_width, "");
	printf("     (msec)     (msec)      (msec)      (msec)       %%\n");
	printf("%.105s\n", graph_dotted_line);

	machine__for_each_thread(m, show_thread_runtime, &totals);
	task_count = totals.task_count;
	if (!task_count)
		printf("<no still running tasks>\n");

	printf("\nTerminated tasks:\n");
	machine__for_each_thread(m, show_deadthread_runtime, &totals);
	if (task_count == totals.task_count)
		printf("<no terminated tasks>\n");

	/* CPU idle stats not tracked when samples were skipped */
	if (sched->skipped_samples)
		return;

	printf("\nIdle stats:\n");
	for (i = 0; i <= idle_max_cpu; ++i) {
		t = idle_threads[i];
		if (!t)
			continue;

		r = thread__priv(t);
		if (r && r->run_stats.n) {
			totals.sched_count += r->run_stats.n;
			printf("    CPU %2d idle for ", i);
			print_sched_time(r->total_run_time, 6);
			printf(" msec\n");
		} else
			printf("    CPU %2d idle entire time window\n", i);
	}

	printf("\n"
	       "    Total number of unique tasks: %" PRIu64 "\n"
	       "Total number of context switches: %" PRIu64 "\n"
	       "           Total run time (msec): ",
	       totals.task_count, totals.sched_count);

	print_sched_time(totals.total_run_time, 2);
	printf("\n");
}
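
/*
 * The summary ends with totals along these lines (values illustrative):
 *
 *     Total number of unique tasks: 42
 * Total number of context switches: 13107
 *            Total run time (msec): 3721.12
 */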

typedef int (*sched_handler)(struct perf_tool *tool,
			  union perf_event *event,
			  struct perf_evsel *evsel,
			  struct perf_sample *sample,
			  struct machine *machine);

static int perf_timehist__process_sample(struct perf_tool *tool,
					 union perf_event *event,
					 struct perf_sample *sample,
					 struct perf_evsel *evsel,
					 struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
	int err = 0;
	int this_cpu = sample->cpu;

	if (this_cpu > sched->max_cpu)
		sched->max_cpu = this_cpu;

	if (evsel->handler != NULL) {
		sched_handler f = evsel->handler;

		err = f(tool, event, evsel, sample, machine);
	}

	return err;
}

static int timehist_check_attr(struct perf_sched *sched,
			       struct perf_evlist *evlist)
{
	struct perf_evsel *evsel;
	struct evsel_runtime *er;

	list_for_each_entry(evsel, &evlist->entries, node) {
		er = perf_evsel__get_runtime(evsel);
		if (er == NULL) {
			pr_err("Failed to allocate memory for evsel runtime data\n");
			return -1;
		}

		if (sched->show_callchain &&
		    !(evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) {
			pr_info("Samples do not have callchains.\n");
			sched->show_callchain = 0;
			symbol_conf.use_callchain = 0;
		}
	}

	return 0;
}
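
/*
 * Callchain display degrades gracefully: if the recorded samples lack
 * PERF_SAMPLE_CALLCHAIN the option is turned off instead of failing the
 * whole session.
 */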

static int perf_sched__timehist(struct perf_sched *sched)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "sched:sched_switch",       timehist_sched_switch_event, },
		{ "sched:sched_wakeup",	      timehist_sched_wakeup_event, },
		{ "sched:sched_wakeup_new",   timehist_sched_wakeup_event, },
	};
	struct perf_data_file file = {
		.path = input_name,
		.mode = PERF_DATA_MODE_READ,
	};

	struct perf_session *session;
	struct perf_evlist *evlist;
	int err = -1;

	/*
	 * event handlers for timehist option
	 */
	sched->tool.sample	 = perf_timehist__process_sample;
	sched->tool.mmap	 = perf_event__process_mmap;
	sched->tool.comm	 = perf_event__process_comm;
	sched->tool.exit	 = perf_event__process_exit;
	sched->tool.fork	 = perf_event__process_fork;
	sched->tool.lost	 = process_lost;
	sched->tool.attr	 = perf_event__process_attr;
	sched->tool.tracing_data = perf_event__process_tracing_data;
	sched->tool.build_id	 = perf_event__process_build_id;

	sched->tool.ordered_events = true;
	sched->tool.ordering_requires_timestamps = true;

	symbol_conf.use_callchain = sched->show_callchain;

	session = perf_session__new(&file, false, &sched->tool);
	if (session == NULL)
		return -ENOMEM;

	evlist = session->evlist;

	symbol__init(&session->header.env);

	if (timehist_check_attr(sched, evlist) != 0)
		goto out;

	setup_pager();

	/* setup per-evsel handlers */
	if (perf_session__set_tracepoints_handlers(session, handlers))
		goto out;

	if (!perf_session__has_traces(session, "record -R"))
		goto out;

	/* pre-allocate struct for per-CPU idle stats */
	sched->max_cpu = session->header.env.nr_cpus_online;
	if (sched->max_cpu == 0)
		sched->max_cpu = 4;
	if (init_idle_threads(sched->max_cpu))
		goto out;

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (sched->summary_only)
		sched->summary = sched->summary_only;

	if (!sched->summary_only)
		timehist_header(sched);

	err = perf_session__process_events(session);
	if (err) {
		pr_err("Failed to process events, error %d", err);
		goto out;
	}

	sched->nr_events      = evlist->stats.nr_events[0];
	sched->nr_lost_events = evlist->stats.total_lost;
	sched->nr_lost_chunks = evlist->stats.nr_events[PERF_RECORD_LOST];

	if (sched->summary)
		timehist_print_summary(sched, session);

out:
	free_idle_threads();
	perf_session__delete(session);

	return err;
}


static void print_bad_events(struct perf_sched *sched)
{
	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
			(double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
			sched->nr_unordered_timestamps, sched->nr_timestamps);
	}
	if (sched->nr_lost_events && sched->nr_events) {
		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
			(double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
			sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
	}
	if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
			(double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
			sched->nr_context_switch_bugs, sched->nr_timestamps);
		if (sched->nr_lost_events)
			printf(" (due to lost events?)");
		printf("\n");
	}
}

static void __merge_work_atoms(struct rb_root *root, struct work_atoms *data)
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;
	struct work_atoms *this;
	const char *comm = thread__comm_str(data->thread), *this_comm;

	while (*new) {
		int cmp;

		this = container_of(*new, struct work_atoms, node);
		parent = *new;

		this_comm = thread__comm_str(this->thread);
		cmp = strcmp(comm, this_comm);
		if (cmp > 0) {
			new = &((*new)->rb_left);
		} else if (cmp < 0) {
			new = &((*new)->rb_right);
		} else {
			this->num_merged++;
			this->total_runtime += data->total_runtime;
			this->nb_atoms += data->nb_atoms;
			this->total_lat += data->total_lat;
			list_splice(&data->work_list, &this->work_list);
			if (this->max_lat < data->max_lat) {
				this->max_lat = data->max_lat;
				this->max_lat_at = data->max_lat_at;
			}
			zfree(&data);
			return;
		}
	}

	data->num_merged++;
	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
}
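
/*
 * The merged tree is keyed by comm string, so atoms of all threads that
 * share a command name fold into the first node inserted (num_merged
 * counts the merged entries); "perf sched latency" then reports one line
 * per comm unless -p/--pids keeps threads separate.
 */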

static void perf_sched__merge_lat(struct perf_sched *sched)
{
	struct work_atoms *data;
	struct rb_node *node;

	if (sched->skip_merge)
		return;

	while ((node = rb_first(&sched->atom_root))) {
		rb_erase(node, &sched->atom_root);
		data = rb_entry(node, struct work_atoms, node);
		__merge_work_atoms(&sched->merged_atom_root, data);
	}
}

static int perf_sched__lat(struct perf_sched *sched)
{
	struct rb_node *next;

	setup_pager();

	if (perf_sched__read_events(sched))
		return -1;

	perf_sched__merge_lat(sched);
	perf_sched__sort_lat(sched);

	printf("\n -----------------------------------------------------------------------------------------------------------------\n");
	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at       |\n");
	printf(" -----------------------------------------------------------------------------------------------------------------\n");

	next = rb_first(&sched->sorted_atom_root);

	while (next) {
		struct work_atoms *work_list;

		work_list = rb_entry(next, struct work_atoms, node);
		output_lat_thread(sched, work_list);
		next = rb_next(next);
		thread__zput(work_list->thread);
	}

	printf(" -----------------------------------------------------------------------------------------------------------------\n");
	printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
		(double)sched->all_runtime / NSEC_PER_MSEC, sched->all_count);

	printf(" ---------------------------------------------------\n");

	print_bad_events(sched);
	printf("\n");

	return 0;
}

static int setup_map_cpus(struct perf_sched *sched)
{
	struct cpu_map *map;

	sched->max_cpu  = sysconf(_SC_NPROCESSORS_CONF);

	if (sched->map.comp) {
		sched->map.comp_cpus = zalloc(sched->max_cpu * sizeof(int));
		if (!sched->map.comp_cpus)
			return -1;
	}

	if (!sched->map.cpus_str)
		return 0;

	map = cpu_map__new(sched->map.cpus_str);
	if (!map) {
		pr_err("failed to get cpus map from %s\n", sched->map.cpus_str);
		return -1;
	}

	sched->map.cpus = map;
	return 0;
}
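
/*
 * Usage sketch: "perf sched map --cpus 0,2-3" limits the map columns to
 * those CPUs; the list syntax is whatever cpu_map__new() accepts.
 */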

static int setup_color_pids(struct perf_sched *sched)
{
	struct thread_map *map;

	if (!sched->map.color_pids_str)
		return 0;

	map = thread_map__new_by_tid_str(sched->map.color_pids_str);
	if (!map) {
		pr_err("failed to get thread map from %s\n", sched->map.color_pids_str);
		return -1;
	}

	sched->map.color_pids = map;
	return 0;
}

static int setup_color_cpus(struct perf_sched *sched)
{
	struct cpu_map *map;

	if (!sched->map.color_cpus_str)
		return 0;

	map = cpu_map__new(sched->map.color_cpus_str);
	if (!map) {
		pr_err("failed to get thread map from %s\n", sched->map.color_cpus_str);
		return -1;
	}

	sched->map.color_cpus = map;
	return 0;
}

static int perf_sched__map(struct perf_sched *sched)
{
	if (setup_map_cpus(sched))
		return -1;

	if (setup_color_pids(sched))
		return -1;

	if (setup_color_cpus(sched))
		return -1;

	setup_pager();
	if (perf_sched__read_events(sched))
		return -1;
	print_bad_events(sched);
	return 0;
}

static int perf_sched__replay(struct perf_sched *sched)
{
	unsigned long i;

	calibrate_run_measurement_overhead(sched);
	calibrate_sleep_measurement_overhead(sched);

	test_calibrations(sched);

	if (perf_sched__read_events(sched))
		return -1;

	printf("nr_run_events:        %ld\n", sched->nr_run_events);
	printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
	printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);

	if (sched->targetless_wakeups)
		printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
	if (sched->multitarget_wakeups)
		printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
	if (sched->nr_run_events_optimized)
		printf("run atoms optimized: %ld\n",
			sched->nr_run_events_optimized);

	print_task_traces(sched);
	add_cross_task_wakeups(sched);

	create_tasks(sched);
	printf("------------------------------------------------------------\n");
	for (i = 0; i < sched->replay_repeat; i++)
		run_one_test(sched);

	return 0;
}

static void setup_sorting(struct perf_sched *sched, const struct option *options,
			  const char * const usage_msg[])
{
	char *tmp, *tok, *str = strdup(sched->sort_order);

	for (tok = strtok_r(str, ", ", &tmp);
			tok; tok = strtok_r(NULL, ", ", &tmp)) {
		if (sort_dimension__add(tok, &sched->sort_list) < 0) {
			usage_with_options_msg(usage_msg, options,
					"Unknown --sort key: `%s'", tok);
		}
	}

	free(str);

	sort_dimension__add("pid", &sched->cmp_pid);
}

static int __cmd_record(int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-a",
		"-R",
		"-m", "1024",
		"-c", "1",
		"-e", "sched:sched_switch",
		"-e", "sched:sched_stat_wait",
		"-e", "sched:sched_stat_sleep",
		"-e", "sched:sched_stat_iowait",
		"-e", "sched:sched_stat_runtime",
		"-e", "sched:sched_process_fork",
		"-e", "sched:sched_wakeup",
		"-e", "sched:sched_wakeup_new",
		"-e", "sched:sched_migrate_task",
	};

	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[i] = strdup(record_args[i]);

	for (j = 1; j < (unsigned int)argc; j++, i++)
		rec_argv[i] = argv[j];

	BUG_ON(i != rec_argc);

	return cmd_record(i, rec_argv, NULL);
}
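
/*
 * Expansion example (illustrative): "perf sched record sleep 1" becomes
 * "perf record -a -R -m 1024 -c 1 -e sched:sched_switch ... sleep 1",
 * with the full event list taken from record_args above.
 */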

int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
{
	const char default_sort_order[] = "avg, max, switch, runtime";
	struct perf_sched sched = {
		.tool = {
			.sample		 = perf_sched__process_tracepoint_sample,
			.comm		 = perf_event__process_comm,
			.lost		 = perf_event__process_lost,
			.fork		 = perf_sched__process_fork_event,
			.ordered_events = true,
		},
		.cmp_pid	      = LIST_HEAD_INIT(sched.cmp_pid),
		.sort_list	      = LIST_HEAD_INIT(sched.sort_list),
		.start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
		.work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
		.sort_order	      = default_sort_order,
		.replay_repeat	      = 10,
		.profile_cpu	      = -1,
		.next_shortname1      = 'A',
		.next_shortname2      = '0',
		.skip_merge           = 0,
		.show_callchain	      = 1,
		.max_stack            = 5,
	};
	const struct option sched_options[] = {
	OPT_STRING('i', "input", &input_name, "file",
		    "input file name"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const struct option latency_options[] = {
	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
		   "sort by key(s): runtime, switch, avg, max"),
	OPT_INTEGER('C', "CPU", &sched.profile_cpu,
		    "CPU to profile on"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_BOOLEAN('p', "pids", &sched.skip_merge,
		    "latency stats per pid instead of per comm"),
	OPT_PARENT(sched_options)
	};
	const struct option replay_options[] = {
	OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
		     "repeat the workload replay N times (-1: infinite)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
	OPT_PARENT(sched_options)
	};
	const struct option map_options[] = {
	OPT_BOOLEAN(0, "compact", &sched.map.comp,
		    "map output in compact mode"),
	OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids",
		   "highlight given pids in map"),
	OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus",
		   "highlight given CPUs in map"),
	OPT_STRING(0, "cpus", &sched.map.cpus_str, "cpus",
		   "display given CPUs in map"),
	OPT_PARENT(sched_options)
	};
	const struct option timehist_options[] = {
	OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
		   "file", "vmlinux pathname"),
	OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
		   "file", "kallsyms pathname"),
	OPT_BOOLEAN('g', "call-graph", &sched.show_callchain,
		    "Display call chains if present (default on)"),
	OPT_UINTEGER(0, "max-stack", &sched.max_stack,
		   "Maximum number of functions to display backtrace."),
	OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
		    "Look for files with symbols relative to this directory"),
	OPT_BOOLEAN('s', "summary", &sched.summary_only,
		    "Show only the summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &sched.summary,
		    "Show all events and the summary with statistics"),
	OPT_BOOLEAN('w', "wakeups", &sched.show_wakeups, "Show wakeup events"),
	OPT_BOOLEAN('V', "cpu-visual", &sched.show_cpu_visual, "Add CPU visual"),
	OPT_PARENT(sched_options)
	};

	const char * const latency_usage[] = {
		"perf sched latency [<options>]",
		NULL
	};
	const char * const replay_usage[] = {
		"perf sched replay [<options>]",
		NULL
	};
	const char * const map_usage[] = {
		"perf sched map [<options>]",
		NULL
	};
	const char * const timehist_usage[] = {
		"perf sched timehist [<options>]",
		NULL
	};
	const char *const sched_subcommands[] = { "record", "latency", "map",
						  "replay", "script",
						  "timehist", NULL };
	const char *sched_usage[] = {
		NULL,
		NULL
	};
	struct trace_sched_handler lat_ops  = {
		.wakeup_event	    = latency_wakeup_event,
		.switch_event	    = latency_switch_event,
		.runtime_event	    = latency_runtime_event,
		.migrate_task_event = latency_migrate_task_event,
	};
	struct trace_sched_handler map_ops  = {
		.switch_event	    = map_switch_event,
	};
	struct trace_sched_handler replay_ops  = {
		.wakeup_event	    = replay_wakeup_event,
		.switch_event	    = replay_switch_event,
		.fork_event	    = replay_fork_event,
	};
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
		sched.curr_pid[i] = -1;

	argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
					sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc)
		usage_with_options(sched_usage, sched_options);

	/*
	 * Aliased to 'perf script' for now:
	 */
	if (!strcmp(argv[0], "script"))
		return cmd_script(argc, argv, prefix);

	if (!strncmp(argv[0], "rec", 3)) {
		return __cmd_record(argc, argv);
	} else if (!strncmp(argv[0], "lat", 3)) {
		sched.tp_handler = &lat_ops;
		if (argc > 1) {
			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
			if (argc)
				usage_with_options(latency_usage, latency_options);
		}
		setup_sorting(&sched, latency_options, latency_usage);
		return perf_sched__lat(&sched);
	} else if (!strcmp(argv[0], "map")) {
		if (argc) {
			argc = parse_options(argc, argv, map_options, map_usage, 0);
			if (argc)
				usage_with_options(map_usage, map_options);
		}
		sched.tp_handler = &map_ops;
		setup_sorting(&sched, latency_options, latency_usage);
		return perf_sched__map(&sched);
	} else if (!strncmp(argv[0], "rep", 3)) {
		sched.tp_handler = &replay_ops;
		if (argc) {
			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
			if (argc)
				usage_with_options(replay_usage, replay_options);
		}
		return perf_sched__replay(&sched);
	} else if (!strcmp(argv[0], "timehist")) {
		if (argc) {
			argc = parse_options(argc, argv, timehist_options,
					     timehist_usage, 0);
			if (argc)
				usage_with_options(timehist_usage, timehist_options);
		}
		if (sched.show_wakeups && sched.summary_only) {
			pr_err(" Error: -s and -w are mutually exclusive.\n");
			parse_options_usage(timehist_usage, timehist_options, "s", true);
			parse_options_usage(NULL, timehist_options, "w", true);
			return -EINVAL;
		}

		return perf_sched__timehist(&sched);
	} else {
		usage_with_options(sched_usage, sched_options);
	}

	return 0;
}