builtin-sched.c 50.3 KB
Newer Older
I
Ingo Molnar 已提交
1
#include "builtin.h"
2
#include "perf.h"
I
Ingo Molnar 已提交
3 4

#include "util/util.h"
5
#include "util/evlist.h"
I
Ingo Molnar 已提交
6
#include "util/cache.h"
7
#include "util/evsel.h"
I
Ingo Molnar 已提交
8 9 10
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
11
#include "util/session.h"
12
#include "util/tool.h"
13
#include "util/cloexec.h"
J
Jiri Olsa 已提交
14
#include "util/thread_map.h"
15
#include "util/color.h"
I
Ingo Molnar 已提交
16

17
#include <subcmd/parse-options.h>
18
#include "util/trace-event.h"
I
Ingo Molnar 已提交
19 20 21

#include "util/debug.h"

22
#include <sys/prctl.h>
23
#include <sys/resource.h>
I
Ingo Molnar 已提交
24

25 26 27
#include <semaphore.h>
#include <pthread.h>
#include <math.h>
28
#include <api/fs/fs.h>
29

30 31 32 33
#define PR_SET_NAME		15               /* Set process name */
#define MAX_CPUS		4096
#define COMM_LEN		20
#define SYM_LEN			129
#define MAX_PID			1024000	/* fallback when kernel/pid_max can't be read */

struct sched_atom;

/*
 * One task reconstructed from the trace; during 'perf sched replay' each
 * task_desc is replayed by a dedicated pthread.
 */
struct task_desc {
	unsigned long		nr;		/* index into perf_sched->tasks[] */
	unsigned long		pid;
	char			comm[COMM_LEN];

	unsigned long		nr_events;	/* number of entries in atoms[] */
	unsigned long		curr_event;	/* atom currently being replayed */
	struct sched_atom	**atoms;	/* ordered event list, realloc'ed on append */

	pthread_t		thread;		/* replay thread for this task */
	sem_t			sleep_sem;

	sem_t			ready_for_work;	/* posted by the thread before each run */
	sem_t			work_done_sem;	/* posted by the thread after each run */

	u64			cpu_usage;	/* nsecs of CPU consumed in the last run */
};

enum sched_event_type {
	SCHED_EVENT_RUN,
	SCHED_EVENT_SLEEP,
	SCHED_EVENT_WAKEUP,
	SCHED_EVENT_MIGRATION,
};

/* A single scheduling event to be replayed. */
struct sched_atom {
	enum sched_event_type	type;
	int			specific_wait;	/* sleep has a dedicated wait_sem */
	u64			timestamp;
	u64			duration;	/* RUN only: nsecs to burn */
	unsigned long		nr;		/* index within the owning task */
	sem_t			*wait_sem;	/* shared between a SLEEP and its WAKEUP */
	struct task_desc	*wakee;		/* WAKEUP only: task being woken */
};

/* One character per task state value; indexed by the prev_state field. */
#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"

enum thread_state {
	THREAD_SLEEPING = 0,
	THREAD_WAIT_CPU,
	THREAD_SCHED_IN,
	THREAD_IGNORE
};

/* One sched-out .. sched-in interval, used for latency accounting. */
struct work_atom {
	struct list_head	list;		/* linked into work_atoms->work_list */
	enum thread_state	state;
	u64			sched_out_time;
	u64			wake_up_time;
	u64			sched_in_time;
	u64			runtime;
};

/* Per-thread collection of work atoms, kept in an rbtree (atom_root). */
struct work_atoms {
	struct list_head	work_list;
	struct thread		*thread;
	struct rb_node		node;
	u64			max_lat;	/* worst wakeup latency seen */
	u64			max_lat_at;	/* timestamp of that worst latency */
	u64			total_lat;
	u64			nb_atoms;
	u64			total_runtime;
	int			num_merged;	/* threads folded in when merging by comm */
};

103
/* Comparator for ordering work_atoms entries in the latency rbtree. */
typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);

struct perf_sched;

/*
 * Per-subcommand callbacks: each perf sched mode (latency, replay, map)
 * supplies its own set of tracepoint/event handlers.
 */
struct trace_sched_handler {
	int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);

	int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			     struct perf_sample *sample, struct machine *machine);

	int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);

	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
	int (*fork_event)(struct perf_sched *sched, union perf_event *event,
			  struct machine *machine);

	int (*migrate_task_event)(struct perf_sched *sched,
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);
};

#define COLOR_PIDS PERF_COLOR_BLUE

/* State for 'perf sched map' output. */
struct perf_sched_map {
	DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);
	int			*comp_cpus;
	bool			 comp;		/* compact: show only CPUs with activity */
	struct thread_map	*color_pids;	/* PIDs to highlight, or NULL */
	const char		*color_pids_str;
};

/* Top-level state shared by all perf sched subcommands. */
struct perf_sched {
	struct perf_tool tool;
	const char	 *sort_order;
	unsigned long	 nr_tasks;
	struct task_desc **pid_to_task;		/* pid -> task_desc, grown on demand */
	struct task_desc **tasks;		/* dense array of registered tasks */
	const struct trace_sched_handler *tp_handler;
	pthread_mutex_t	 start_work_mutex;	/* held to stall replay threads */
	pthread_mutex_t	 work_done_wait_mutex;	/* held to stall threads after work */
	int		 profile_cpu;
/*
 * Track the current task - that way we can know whether there's any
 * weird events, such as a task being switched away that is not current.
 */
	int		 max_cpu;
	u32		 curr_pid[MAX_CPUS];
	struct thread	 *curr_thread[MAX_CPUS];
	char		 next_shortname1;
	char		 next_shortname2;
	unsigned int	 replay_repeat;
	unsigned long	 nr_run_events;
	unsigned long	 nr_sleep_events;
	unsigned long	 nr_wakeup_events;
	unsigned long	 nr_sleep_corrections;
	unsigned long	 nr_run_events_optimized;	/* back-to-back RUNs merged */
	unsigned long	 targetless_wakeups;
	unsigned long	 multitarget_wakeups;
	unsigned long	 nr_runs;
	unsigned long	 nr_timestamps;
	unsigned long	 nr_unordered_timestamps;
	unsigned long	 nr_context_switch_bugs;
	unsigned long	 nr_events;
	unsigned long	 nr_lost_chunks;
	unsigned long	 nr_lost_events;
	u64		 run_measurement_overhead;	/* calibrated burn_nsecs() overhead */
	u64		 sleep_measurement_overhead;	/* calibrated sleep_nsecs() overhead */
	u64		 start_time;
	u64		 cpu_usage;
	u64		 runavg_cpu_usage;
	u64		 parent_cpu_usage;
	u64		 runavg_parent_cpu_usage;
	u64		 sum_runtime;
	u64		 sum_fluct;
	u64		 run_avg;
	u64		 all_runtime;
	u64		 all_count;
	u64		 cpu_last_switched[MAX_CPUS];	/* last switch timestamp per CPU */
	struct rb_root	 atom_root, sorted_atom_root, merged_atom_root;
	struct list_head sort_list, cmp_pid;
	bool force;
	bool skip_merge;
	struct perf_sched_map map;
};
190 191

static u64 get_nsecs(void)
I
Ingo Molnar 已提交
192 193 194 195 196 197 198 199
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);

	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

200
static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
I
Ingo Molnar 已提交
201
{
202
	u64 T0 = get_nsecs(), T1;
I
Ingo Molnar 已提交
203 204 205

	do {
		T1 = get_nsecs();
206
	} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
I
Ingo Molnar 已提交
207 208
}

209
static void sleep_nsecs(u64 nsecs)
I
Ingo Molnar 已提交
210 211 212 213 214 215 216 217 218
{
	struct timespec ts;

	ts.tv_nsec = nsecs % 999999999;
	ts.tv_sec = nsecs / 999999999;

	nanosleep(&ts, NULL);
}

219
static void calibrate_run_measurement_overhead(struct perf_sched *sched)
I
Ingo Molnar 已提交
220
{
221
	u64 T0, T1, delta, min_delta = 1000000000ULL;
I
Ingo Molnar 已提交
222 223 224 225
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
226
		burn_nsecs(sched, 0);
I
Ingo Molnar 已提交
227 228 229 230
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
231
	sched->run_measurement_overhead = min_delta;
I
Ingo Molnar 已提交
232

233
	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
I
Ingo Molnar 已提交
234 235
}

236
static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
I
Ingo Molnar 已提交
237
{
238
	u64 T0, T1, delta, min_delta = 1000000000ULL;
I
Ingo Molnar 已提交
239 240 241 242 243 244 245 246 247 248
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		sleep_nsecs(10000);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	min_delta -= 10000;
249
	sched->sleep_measurement_overhead = min_delta;
I
Ingo Molnar 已提交
250

251
	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
I
Ingo Molnar 已提交
252 253
}

254
static struct sched_atom *
255
get_new_event(struct task_desc *task, u64 timestamp)
I
Ingo Molnar 已提交
256
{
257
	struct sched_atom *event = zalloc(sizeof(*event));
I
Ingo Molnar 已提交
258 259 260 261 262 263 264
	unsigned long idx = task->nr_events;
	size_t size;

	event->timestamp = timestamp;
	event->nr = idx;

	task->nr_events++;
265 266 267
	size = sizeof(struct sched_atom *) * task->nr_events;
	task->atoms = realloc(task->atoms, size);
	BUG_ON(!task->atoms);
I
Ingo Molnar 已提交
268

269
	task->atoms[idx] = event;
I
Ingo Molnar 已提交
270 271 272 273

	return event;
}

274
static struct sched_atom *last_event(struct task_desc *task)
I
Ingo Molnar 已提交
275 276 277 278
{
	if (!task->nr_events)
		return NULL;

279
	return task->atoms[task->nr_events - 1];
I
Ingo Molnar 已提交
280 281
}

282 283
/*
 * Record that @task ran for @duration nsecs starting at @timestamp.
 * Consecutive RUN events are coalesced into one atom instead of
 * recording a new one.
 */
static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
				u64 timestamp, u64 duration)
{
	struct sched_atom *tail = last_event(task);
	struct sched_atom *event;

	if (tail != NULL && tail->type == SCHED_EVENT_RUN) {
		sched->nr_run_events_optimized++;
		tail->duration += duration;
		return;
	}

	event = get_new_event(task, timestamp);
	event->type = SCHED_EVENT_RUN;
	event->duration = duration;

	sched->nr_run_events++;
}

305 306
/*
 * Record that @task woke @wakee at @timestamp.  If the wakee's last atom is
 * a SLEEP without a waiter yet, pair the two through a shared semaphore so
 * the replay thread sleeps until this wakeup is replayed.
 */
static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
				   u64 timestamp, struct task_desc *wakee)
{
	struct sched_atom *event, *wakee_event;

	event = get_new_event(task, timestamp);
	event->type = SCHED_EVENT_WAKEUP;
	event->wakee = wakee;

	wakee_event = last_event(wakee);
	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
		/* nothing on the wakee side to pair this wakeup with */
		sched->targetless_wakeups++;
		return;
	}
	if (wakee_event->wait_sem) {
		/* the sleep already has a waker; only one pairing is kept */
		sched->multitarget_wakeups++;
		return;
	}

	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
	/* Fix: the zalloc() result was passed to sem_init() unchecked. */
	BUG_ON(!wakee_event->wait_sem);
	sem_init(wakee_event->wait_sem, 0, 0);
	wakee_event->specific_wait = 1;
	event->wait_sem = wakee_event->wait_sem;

	sched->nr_wakeup_events++;
}

332 333
/* Record that @task went to sleep at @timestamp. */
static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
				  u64 timestamp, u64 task_state __maybe_unused)
{
	struct sched_atom *atom;

	atom = get_new_event(task, timestamp);
	atom->type = SCHED_EVENT_SLEEP;

	sched->nr_sleep_events++;
}

342 343
static struct task_desc *register_pid(struct perf_sched *sched,
				      unsigned long pid, const char *comm)
I
Ingo Molnar 已提交
344 345
{
	struct task_desc *task;
346
	static int pid_max;
I
Ingo Molnar 已提交
347

348 349 350 351 352
	if (sched->pid_to_task == NULL) {
		if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
			pid_max = MAX_PID;
		BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL);
	}
353 354 355 356 357 358
	if (pid >= (unsigned long)pid_max) {
		BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) *
			sizeof(struct task_desc *))) == NULL);
		while (pid >= (unsigned long)pid_max)
			sched->pid_to_task[pid_max++] = NULL;
	}
I
Ingo Molnar 已提交
359

360
	task = sched->pid_to_task[pid];
I
Ingo Molnar 已提交
361 362 363 364

	if (task)
		return task;

365
	task = zalloc(sizeof(*task));
I
Ingo Molnar 已提交
366
	task->pid = pid;
367
	task->nr = sched->nr_tasks;
I
Ingo Molnar 已提交
368 369 370 371 372
	strcpy(task->comm, comm);
	/*
	 * every task starts in sleeping state - this gets ignored
	 * if there's no wakeup pointing to this sleep state:
	 */
373
	add_sched_event_sleep(sched, task, 0, 0);
I
Ingo Molnar 已提交
374

375 376
	sched->pid_to_task[pid] = task;
	sched->nr_tasks++;
377
	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
378 379
	BUG_ON(!sched->tasks);
	sched->tasks[task->nr] = task;
I
Ingo Molnar 已提交
380

I
Ingo Molnar 已提交
381
	if (verbose)
382
		printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
I
Ingo Molnar 已提交
383 384 385 386 387

	return task;
}


388
static void print_task_traces(struct perf_sched *sched)
I
Ingo Molnar 已提交
389 390 391 392
{
	struct task_desc *task;
	unsigned long i;

393 394
	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
I
Ingo Molnar 已提交
395
		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
I
Ingo Molnar 已提交
396 397 398 399
			task->nr, task->comm, task->pid, task->nr_events);
	}
}

400
static void add_cross_task_wakeups(struct perf_sched *sched)
I
Ingo Molnar 已提交
401 402 403 404
{
	struct task_desc *task1, *task2;
	unsigned long i, j;

405 406
	for (i = 0; i < sched->nr_tasks; i++) {
		task1 = sched->tasks[i];
I
Ingo Molnar 已提交
407
		j = i + 1;
408
		if (j == sched->nr_tasks)
I
Ingo Molnar 已提交
409
			j = 0;
410 411
		task2 = sched->tasks[j];
		add_sched_event_wakeup(sched, task1, 0, task2);
I
Ingo Molnar 已提交
412 413 414
	}
}

415 416
/*
 * Replay one recorded atom inside a replay thread: burn CPU for RUN,
 * block on the paired semaphore for SLEEP, post it for WAKEUP.
 * MIGRATION events are recorded but not replayed.
 */
static void perf_sched__process_event(struct perf_sched *sched,
				      struct sched_atom *atom)
{
	int ret = 0;

	switch (atom->type) {
		case SCHED_EVENT_RUN:
			burn_nsecs(sched, atom->duration);
			break;
		case SCHED_EVENT_SLEEP:
			/* wait_sem is only set when a wakeup was paired with this sleep */
			if (atom->wait_sem)
				ret = sem_wait(atom->wait_sem);
			BUG_ON(ret);
			break;
		case SCHED_EVENT_WAKEUP:
			if (atom->wait_sem)
				ret = sem_post(atom->wait_sem);
			BUG_ON(ret);
			break;
		case SCHED_EVENT_MIGRATION:
			break;
		default:
			BUG_ON(1);
	}
}

441
/*
 * Total CPU time (user + system) of the parent process, in nanoseconds,
 * read via getrusage().
 */
static u64 get_cpu_usage_nsec_parent(void)
{
	struct rusage ru;
	u64 sum;
	int err;

	err = getrusage(RUSAGE_SELF, &ru);
	BUG_ON(err);

	/*
	 * Fix: was computed with double constants (1e9/1e3), which silently
	 * rounds once the accumulated time exceeds 2^53 nsecs; use exact
	 * integer arithmetic instead.
	 */
	sum  = ru.ru_utime.tv_sec * 1000000000ULL + ru.ru_utime.tv_usec * 1000ULL;
	sum += ru.ru_stime.tv_sec * 1000000000ULL + ru.ru_stime.tv_usec * 1000ULL;

	return sum;
}

456
static int self_open_counters(struct perf_sched *sched, unsigned long cur_task)
I
Ingo Molnar 已提交
457
{
458
	struct perf_event_attr attr;
459
	char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE];
460
	int fd;
461 462
	struct rlimit limit;
	bool need_privilege = false;
I
Ingo Molnar 已提交
463

464
	memset(&attr, 0, sizeof(attr));
I
Ingo Molnar 已提交
465

466 467
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
I
Ingo Molnar 已提交
468

469
force_again:
470 471
	fd = sys_perf_event_open(&attr, 0, -1, -1,
				 perf_event_open_cloexec_flag());
472

473
	if (fd < 0) {
474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489
		if (errno == EMFILE) {
			if (sched->force) {
				BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1);
				limit.rlim_cur += sched->nr_tasks - cur_task;
				if (limit.rlim_cur > limit.rlim_max) {
					limit.rlim_max = limit.rlim_cur;
					need_privilege = true;
				}
				if (setrlimit(RLIMIT_NOFILE, &limit) == -1) {
					if (need_privilege && errno == EPERM)
						strcpy(info, "Need privilege\n");
				} else
					goto force_again;
			} else
				strcpy(info, "Have a try with -f option\n");
		}
490
		pr_err("Error: sys_perf_event_open() syscall returned "
491 492
		       "with %d (%s)\n%s", fd,
		       strerror_r(errno, sbuf, sizeof(sbuf)), info);
493 494
		exit(EXIT_FAILURE);
	}
495 496 497 498 499 500 501 502 503 504 505 506
	return fd;
}

/*
 * Read the calling thread's accumulated CPU time (nsecs) from its
 * TASK_CLOCK counter fd opened by self_open_counters().
 */
static u64 get_cpu_usage_nsec_self(int fd)
{
	u64 runtime;
	ssize_t nread;

	nread = read(fd, &runtime, sizeof(runtime));
	BUG_ON(nread != sizeof(runtime));

	return runtime;
}

509 510 511
/* Arguments handed to each replay thread; freed by the thread itself. */
struct sched_thread_parms {
	struct task_desc  *task;
	struct perf_sched *sched;
	int fd;		/* per-thread TASK_CLOCK counter fd, or < 0 */
};

I
Ingo Molnar 已提交
515 516
/*
 * Body of one replay thread.  Each iteration of the again-loop is one
 * replay run: signal readiness, wait for the parent to release
 * start_work_mutex, replay all atoms while measuring own CPU time, then
 * signal completion and park on work_done_wait_mutex until the parent
 * finishes its bookkeeping.
 */
static void *thread_func(void *ctx)
{
	struct sched_thread_parms *parms = ctx;
	struct task_desc *this_task = parms->task;
	struct perf_sched *sched = parms->sched;
	u64 cpu_usage_0, cpu_usage_1;
	unsigned long i, ret;
	char comm2[22];
	int fd = parms->fd;

	/* everything needed was copied out above; parms is ours to free */
	zfree(&parms);

	sprintf(comm2, ":%s", this_task->comm);
	prctl(PR_SET_NAME, comm2);
	/* counter could not be opened: this thread cannot measure, bail out */
	if (fd < 0)
		return NULL;
again:
	ret = sem_post(&this_task->ready_for_work);
	BUG_ON(ret);
	/* lock+unlock: block here until the parent drops start_work_mutex */
	ret = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&sched->start_work_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_self(fd);

	for (i = 0; i < this_task->nr_events; i++) {
		this_task->curr_event = i;
		perf_sched__process_event(sched, this_task->atoms[i]);
	}

	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
	ret = sem_post(&this_task->work_done_sem);
	BUG_ON(ret);

	/* same lock+unlock trick to wait for the parent's post-run phase */
	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
	BUG_ON(ret);

	goto again;
}

559
/*
 * Spawn one replay thread per registered task.  Both gate mutexes are
 * taken first so the new threads immediately block in thread_func()
 * until wait_for_tasks() releases them.
 */
static void create_tasks(struct perf_sched *sched)
{
	struct task_desc *task;
	pthread_attr_t attr;
	unsigned long i;
	int err;

	err = pthread_attr_init(&attr);
	BUG_ON(err);
	err = pthread_attr_setstacksize(&attr,
			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
	BUG_ON(err);
	err = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(err);
	err = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(err);
	for (i = 0; i < sched->nr_tasks; i++) {
		/* parms is freed by the thread itself (zfree in thread_func) */
		struct sched_thread_parms *parms = malloc(sizeof(*parms));
		BUG_ON(parms == NULL);
		parms->task = task = sched->tasks[i];
		parms->sched = sched;
		parms->fd = self_open_counters(sched, i);
		sem_init(&task->sleep_sem, 0, 0);
		sem_init(&task->ready_for_work, 0, 0);
		sem_init(&task->work_done_sem, 0, 0);
		task->curr_event = 0;
		err = pthread_create(&task->thread, &attr, thread_func, parms);
		BUG_ON(err);
	}
}

590
/*
 * Drive one replay run: re-arm the post-run gate, wait for every thread
 * to report ready, release the start gate, collect per-task CPU usage as
 * the threads finish, update the running averages, then re-take the start
 * gate and reset per-task state for the next run.
 */
static void wait_for_tasks(struct perf_sched *sched)
{
	u64 cpu_usage_0, cpu_usage_1;
	struct task_desc *task;
	unsigned long i, ret;

	sched->start_time = get_nsecs();
	sched->cpu_usage = 0;
	/* let the threads from the previous run pass the post-run gate */
	pthread_mutex_unlock(&sched->work_done_wait_mutex);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		ret = sem_wait(&task->ready_for_work);
		BUG_ON(ret);
		sem_init(&task->ready_for_work, 0, 0);
	}
	/* all threads are ready and parked; close the post-run gate again */
	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_parent();

	/* open the start gate: every thread begins replaying its atoms */
	pthread_mutex_unlock(&sched->start_work_mutex);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		ret = sem_wait(&task->work_done_sem);
		BUG_ON(ret);
		sem_init(&task->work_done_sem, 0, 0);
		sched->cpu_usage += task->cpu_usage;
		task->cpu_usage = 0;
	}

	cpu_usage_1 = get_cpu_usage_nsec_parent();
	/* exponential-style running average over replay_repeat runs */
	if (!sched->runavg_cpu_usage)
		sched->runavg_cpu_usage = sched->cpu_usage;
	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat;

	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
	if (!sched->runavg_parent_cpu_usage)
		sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) +
					 sched->parent_cpu_usage)/sched->replay_repeat;

	/* close the start gate for the next run */
	ret = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(ret);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		sem_init(&task->sleep_sem, 0, 0);
		task->curr_event = 0;
	}
}

643
/*
 * Execute one timed replay run and print its wall-clock time, the running
 * average, and measured vs. average CPU usage.
 */
static void run_one_test(struct perf_sched *sched)
{
	u64 T0, T1, delta, avg_delta, fluct;

	T0 = get_nsecs();
	wait_for_tasks(sched);
	T1 = get_nsecs();

	delta = T1 - T0;
	sched->sum_runtime += delta;
	sched->nr_runs++;

	avg_delta = sched->sum_runtime / sched->nr_runs;
	/* fluct = |delta - avg_delta|, accumulated across runs */
	if (delta < avg_delta)
		fluct = avg_delta - delta;
	else
		fluct = delta - avg_delta;
	sched->sum_fluct += fluct;
	if (!sched->run_avg)
		sched->run_avg = delta;
	sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat;

	/* times below are printed in milliseconds */
	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);

	printf("ravg: %0.2f, ", (double)sched->run_avg / 1e6);

	printf("cpu: %0.2f / %0.2f",
		(double)sched->cpu_usage / 1e6, (double)sched->runavg_cpu_usage / 1e6);

#if 0
	/*
	 * rusage statistics done by the parent, these are less
	 * accurate than the sched->sum_exec_runtime based statistics:
	 */
	printf(" [%0.2f / %0.2f]",
		(double)sched->parent_cpu_usage/1e6,
		(double)sched->runavg_parent_cpu_usage/1e6);
#endif

	printf("\n");

	if (sched->nr_sleep_corrections)
		printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
	sched->nr_sleep_corrections = 0;
}

689
static void test_calibrations(struct perf_sched *sched)
I
Ingo Molnar 已提交
690
{
691
	u64 T0, T1;
I
Ingo Molnar 已提交
692 693

	T0 = get_nsecs();
694
	burn_nsecs(sched, 1e6);
I
Ingo Molnar 已提交
695 696
	T1 = get_nsecs();

697
	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
I
Ingo Molnar 已提交
698 699 700 701 702

	T0 = get_nsecs();
	sleep_nsecs(1e6);
	T1 = get_nsecs();

703
	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
I
Ingo Molnar 已提交
704 705
}

706
/*
 * Replay-mode handler for sched_wakeup: register both the waking task
 * (known only by tid at this point) and the wakee, then record the
 * wakeup edge between them.
 */
static int
replay_wakeup_event(struct perf_sched *sched,
		    struct perf_evsel *evsel, struct perf_sample *sample,
		    struct machine *machine __maybe_unused)
{
	const char *comm = perf_evsel__strval(evsel, sample, "comm");
	const u32 pid	 = perf_evsel__intval(evsel, sample, "pid");
	struct task_desc *waker, *wakee;

	if (verbose) {
		printf("sched_wakeup event %p\n", evsel);

		printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
	}

	/* the waker's comm is unknown here; a later event may fill it in */
	waker = register_pid(sched, sample->tid, "<unknown>");
	wakee = register_pid(sched, pid, comm);

	add_sched_event_wakeup(sched, waker, sample->time, wakee);
	return 0;
}

728 729 730 731
/*
 * Replay-mode handler for sched_switch: credit the outgoing task with the
 * time since the last switch on this CPU as a RUN atom, then record its
 * transition to sleep.
 */
static int replay_switch_event(struct perf_sched *sched,
			       struct perf_evsel *evsel,
			       struct perf_sample *sample,
			       struct machine *machine __maybe_unused)
{
	const char *prev_comm  = perf_evsel__strval(evsel, sample, "prev_comm"),
		   *next_comm  = perf_evsel__strval(evsel, sample, "next_comm");
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
	struct task_desc *prev, __maybe_unused *next;
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu;
	s64 delta;

	if (verbose)
		printf("sched_switch event %p\n", evsel);

	if (cpu >= MAX_CPUS || cpu < 0)
		return 0;

	timestamp0 = sched->cpu_last_switched[cpu];
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;	/* first switch seen on this CPU */

	if (delta < 0) {
		/* Fix: delta is s64; was printed with PRIu64, showing a huge unsigned value. */
		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
		return -1;
	}

	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
		 prev_comm, prev_pid, next_comm, next_pid, delta);

	prev = register_pid(sched, prev_pid, prev_comm);
	next = register_pid(sched, next_pid, next_comm);

	sched->cpu_last_switched[cpu] = timestamp;

	add_sched_event_run(sched, prev, timestamp, delta);
	add_sched_event_sleep(sched, prev, timestamp, prev_state);

	return 0;
}

774 775 776
/*
 * Replay-mode handler for PERF_RECORD_FORK: register both parent and
 * child so the replay knows about tasks created mid-trace.
 */
static int replay_fork_event(struct perf_sched *sched,
			     union perf_event *event,
			     struct machine *machine)
{
	struct thread *child, *parent;

	child = machine__findnew_thread(machine, event->fork.pid,
					event->fork.tid);
	parent = machine__findnew_thread(machine, event->fork.ppid,
					 event->fork.ptid);

	if (child == NULL || parent == NULL) {
		pr_debug("thread does not exist on fork event: child %p, parent %p\n",
				 child, parent);
		/* drop whichever reference was obtained; thread__put(NULL) is tolerated */
		goto out_put;
	}

	if (verbose) {
		printf("fork event\n");
		printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
		printf("...  child: %s/%d\n", thread__comm_str(child), child->tid);
	}

	register_pid(sched, parent->tid, thread__comm_str(parent));
	register_pid(sched, child->tid, thread__comm_str(child));
out_put:
	thread__put(child);
	thread__put(parent);
	return 0;
}
804

805 806
/* One latency sort key (e.g. pid, runtime); chained into a sort list. */
struct sort_dimension {
	const char		*name;
	sort_fn_t		cmp;
	struct list_head	list;
};

811
static int
812
thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
813 814 815 816
{
	struct sort_dimension *sort;
	int ret = 0;

817 818
	BUG_ON(list_empty(list));

819 820 821 822 823 824 825 826 827
	list_for_each_entry(sort, list, list) {
		ret = sort->cmp(l, r);
		if (ret)
			return ret;
	}

	return ret;
}

828
static struct work_atoms *
829 830 831 832
thread_atoms_search(struct rb_root *root, struct thread *thread,
			 struct list_head *sort_list)
{
	struct rb_node *node = root->rb_node;
833
	struct work_atoms key = { .thread = thread };
834 835

	while (node) {
836
		struct work_atoms *atoms;
837 838
		int cmp;

839
		atoms = container_of(node, struct work_atoms, node);
840 841 842 843 844 845 846 847 848 849 850 851 852 853

		cmp = thread_lat_cmp(sort_list, &key, atoms);
		if (cmp > 0)
			node = node->rb_left;
		else if (cmp < 0)
			node = node->rb_right;
		else {
			BUG_ON(thread != atoms->thread);
			return atoms;
		}
	}
	return NULL;
}

854
static void
855
__thread_latency_insert(struct rb_root *root, struct work_atoms *data,
856
			 struct list_head *sort_list)
857 858 859 860
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;

	while (*new) {
861
		struct work_atoms *this;
862
		int cmp;
863

864
		this = container_of(*new, struct work_atoms, node);
865
		parent = *new;
866 867 868 869

		cmp = thread_lat_cmp(sort_list, data, this);

		if (cmp > 0)
870 871
			new = &((*new)->rb_left);
		else
872
			new = &((*new)->rb_right);
873 874 875 876 877 878
	}

	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
}

879
/*
 * Create an empty work_atoms entry for @thread (taking a reference on it)
 * and insert it into the latency rbtree keyed by pid.
 * Returns 0 on success, -1 on allocation failure.
 */
static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
{
	struct work_atoms *atoms = zalloc(sizeof(*atoms));
	if (!atoms) {
		pr_err("No memory at %s\n", __func__);
		return -1;
	}

	atoms->thread = thread__get(thread);
	INIT_LIST_HEAD(&atoms->work_list);
	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
	return 0;
}

893
/*
 * Map a sched_switch prev_state value to its one-letter representation
 * (R, S, D, ...).
 */
static char sched_out_state(u64 prev_state)
{
	static const char str[] = TASK_STATE_TO_CHAR_STR;

	/*
	 * Fix: the table was indexed without a bound check, an out-of-bounds
	 * read (UB) for state values newer kernels may report.
	 */
	if (prev_state >= sizeof(str) - 1)
		return '?';

	return str[prev_state];
}

900
static int
901 902 903
add_sched_out_event(struct work_atoms *atoms,
		    char run_state,
		    u64 timestamp)
904
{
905
	struct work_atom *atom = zalloc(sizeof(*atom));
906 907 908 909
	if (!atom) {
		pr_err("Non memory at %s", __func__);
		return -1;
	}
910

911 912
	atom->sched_out_time = timestamp;

913
	if (run_state == 'R') {
914
		atom->state = THREAD_WAIT_CPU;
915
		atom->wake_up_time = atom->sched_out_time;
916 917
	}

918
	list_add_tail(&atom->list, &atoms->work_list);
919
	return 0;
920 921 922
}

/*
 * Charge @delta nsecs of runtime to the thread's most recent work atom
 * and to the per-thread total.
 */
static void
add_runtime_event(struct work_atoms *atoms, u64 delta,
		  u64 timestamp __maybe_unused)
{
	struct work_atom *atom;

	BUG_ON(list_empty(&atoms->work_list));

	/* the last atom in the list is the one currently being accounted */
	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	atom->runtime += delta;
	atoms->total_runtime += delta;
}

/*
 * Complete the thread's latest work atom when it gets scheduled in at
 * @timestamp, and fold its wakeup latency into the per-thread totals.
 */
static void
add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
{
	struct work_atom *last;
	u64 latency;

	if (list_empty(&atoms->work_list))
		return;

	last = list_entry(atoms->work_list.prev, struct work_atom, list);

	/* only an atom still waiting for a CPU can be scheduled in */
	if (last->state != THREAD_WAIT_CPU)
		return;

	/* out-of-order timestamps: drop the atom from the statistics */
	if (timestamp < last->wake_up_time) {
		last->state = THREAD_IGNORE;
		return;
	}

	last->state = THREAD_SCHED_IN;
	last->sched_in_time = timestamp;

	latency = last->sched_in_time - last->wake_up_time;
	atoms->total_lat += latency;
	if (latency > atoms->max_lat) {
		atoms->max_lat = latency;
		atoms->max_lat_at = timestamp;
	}
	atoms->nb_atoms++;
}

967 968 969 970
static int latency_switch_event(struct perf_sched *sched,
				struct perf_evsel *evsel,
				struct perf_sample *sample,
				struct machine *machine)
971
{
972 973 974
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
975
	struct work_atoms *out_events, *in_events;
976
	struct thread *sched_out, *sched_in;
977
	u64 timestamp0, timestamp = sample->time;
978
	int cpu = sample->cpu, err = -1;
I
Ingo Molnar 已提交
979 980
	s64 delta;

981
	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
I
Ingo Molnar 已提交
982

983 984
	timestamp0 = sched->cpu_last_switched[cpu];
	sched->cpu_last_switched[cpu] = timestamp;
I
Ingo Molnar 已提交
985 986 987 988 989
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

990 991 992 993
	if (delta < 0) {
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
		return -1;
	}
994

995 996
	sched_out = machine__findnew_thread(machine, -1, prev_pid);
	sched_in = machine__findnew_thread(machine, -1, next_pid);
997 998
	if (sched_out == NULL || sched_in == NULL)
		goto out_put;
999

1000
	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
1001
	if (!out_events) {
1002
		if (thread_atoms_insert(sched, sched_out))
1003
			goto out_put;
1004
		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
1005 1006
		if (!out_events) {
			pr_err("out-event: Internal tree error");
1007
			goto out_put;
1008
		}
1009
	}
1010
	if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
1011
		return -1;
1012

1013
	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
1014
	if (!in_events) {
1015
		if (thread_atoms_insert(sched, sched_in))
1016
			goto out_put;
1017
		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
1018 1019
		if (!in_events) {
			pr_err("in-event: Internal tree error");
1020
			goto out_put;
1021
		}
1022 1023 1024 1025
		/*
		 * Take came in we have not heard about yet,
		 * add in an initial atom in runnable state:
		 */
1026
		if (add_sched_out_event(in_events, 'R', timestamp))
1027
			goto out_put;
1028
	}
1029
	add_sched_in_event(in_events, timestamp);
1030 1031 1032 1033 1034
	err = 0;
out_put:
	thread__put(sched_out);
	thread__put(sched_in);
	return err;
1035
}
1036

1037 1038 1039 1040
static int latency_runtime_event(struct perf_sched *sched,
				 struct perf_evsel *evsel,
				 struct perf_sample *sample,
				 struct machine *machine)
1041
{
1042 1043
	const u32 pid	   = perf_evsel__intval(evsel, sample, "pid");
	const u64 runtime  = perf_evsel__intval(evsel, sample, "runtime");
1044
	struct thread *thread = machine__findnew_thread(machine, -1, pid);
1045
	struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
1046
	u64 timestamp = sample->time;
1047 1048 1049 1050
	int cpu = sample->cpu, err = -1;

	if (thread == NULL)
		return -1;
1051 1052 1053

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
	if (!atoms) {
1054
		if (thread_atoms_insert(sched, thread))
1055
			goto out_put;
1056
		atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
1057
		if (!atoms) {
1058
			pr_err("in-event: Internal tree error");
1059
			goto out_put;
1060 1061
		}
		if (add_sched_out_event(atoms, 'R', timestamp))
1062
			goto out_put;
1063 1064
	}

1065
	add_runtime_event(atoms, runtime, timestamp);
1066 1067 1068 1069
	err = 0;
out_put:
	thread__put(thread);
	return err;
1070 1071
}

1072 1073 1074 1075
/*
 * Handle a sched:sched_wakeup / sched_wakeup_new sample: record the wakeup
 * time on the wakee's most recent work atom so that scheduling delay
 * (wakeup -> sched-in) can be computed later.
 *
 * Returns 0 on success, -1 on error.
 */
static int latency_wakeup_event(struct perf_sched *sched,
				struct perf_evsel *evsel,
				struct perf_sample *sample,
				struct machine *machine)
{
	const u32 pid	  = perf_evsel__intval(evsel, sample, "pid");
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *wakee;
	u64 timestamp = sample->time;
	int err = -1;

	wakee = machine__findnew_thread(machine, -1, pid);
	if (wakee == NULL)
		return -1;
	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
	if (!atoms) {
		/* First time we see this thread: create its atom list. */
		if (thread_atoms_insert(sched, wakee))
			goto out_put;
		atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
		if (!atoms) {
			pr_err("wakeup-event: Internal tree error");
			goto out_put;
		}
		/* Seed an initial atom in sleeping state ('S'). */
		if (add_sched_out_event(atoms, 'S', timestamp))
			goto out_put;
	}

	BUG_ON(list_empty(&atoms->work_list));

	/* The most recent atom is at the tail of the work list. */
	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	/*
	 * As we do not guarantee the wakeup event happens when
	 * task is out of run queue, also may happen when task is
	 * on run queue and wakeup only change ->state to TASK_RUNNING,
	 * then we should not set the ->wake_up_time when wake up a
	 * task which is on run queue.
	 *
	 * You WILL be missing events if you've recorded only
	 * one CPU, or are only looking at only one, so don't
	 * skip in this case.
	 */
	if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
		goto out_ok;

	sched->nr_timestamps++;
	/* Wakeup stamped before the sched-out it follows: ordering bug. */
	if (atom->sched_out_time > timestamp) {
		sched->nr_unordered_timestamps++;
		goto out_ok;
	}

	atom->state = THREAD_WAIT_CPU;
	atom->wake_up_time = timestamp;
out_ok:
	err = 0;
out_put:
	/* Drop the reference taken by machine__findnew_thread(). */
	thread__put(wakee);
	return err;
}

1133 1134 1135 1136
/*
 * Handle a sched:sched_migrate_task sample. When profiling a single CPU
 * (-C), a migrated task's prior history is invisible, so reset all of the
 * latest atom's timestamps to the migration time to avoid bogus latencies.
 *
 * Returns 0 on success (or when not profiling one CPU), -1 on error.
 */
static int latency_migrate_task_event(struct perf_sched *sched,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
{
	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
	u64 timestamp = sample->time;
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *migrant;
	int err = -1;

	/*
	 * Only need to worry about migration when profiling one CPU.
	 */
	if (sched->profile_cpu == -1)
		return 0;

	migrant = machine__findnew_thread(machine, -1, pid);
	if (migrant == NULL)
		return -1;
	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
	if (!atoms) {
		/* Unknown thread: create its atom list and register its pid. */
		if (thread_atoms_insert(sched, migrant))
			goto out_put;
		register_pid(sched, migrant->tid, thread__comm_str(migrant));
		atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
		if (!atoms) {
			pr_err("migration-event: Internal tree error");
			goto out_put;
		}
		/* Seed an initial atom in runnable state. */
		if (add_sched_out_event(atoms, 'R', timestamp))
			goto out_put;
	}

	BUG_ON(list_empty(&atoms->work_list));

	/* Reset the latest atom's times to the migration timestamp. */
	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;

	sched->nr_timestamps++;

	if (atom->sched_out_time > timestamp)
		sched->nr_unordered_timestamps++;
	err = 0;
out_put:
	/* Drop the reference taken by machine__findnew_thread(). */
	thread__put(migrant);
	return err;
}

1183
/*
 * Print one latency-report row for a thread (or a comm-merged group of
 * threads) and accumulate its runtime/switch counts into the totals.
 */
static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
{
	int i;
	int ret;
	u64 avg;

	if (!work_list->nb_atoms)
		return;
	/*
	 * Ignore idle threads:
	 */
	if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
		return;

	sched->all_runtime += work_list->total_runtime;
	sched->all_count   += work_list->nb_atoms;

	/* "comm:(n)" when threads were merged by comm, else "comm:tid". */
	if (work_list->num_merged > 1)
		ret = printf("  %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
	else
		ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);

	/* Pad the task column to 24 characters (printf returned the width). */
	for (i = 0; i < 24 - ret; i++)
		printf(" ");

	avg = work_list->total_lat / work_list->nb_atoms;

	/* Times are stored in ns: /1e6 -> ms, /1e9 -> s. */
	printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %13.6f s\n",
	      (double)work_list->total_runtime / 1e6,
		 work_list->nb_atoms, (double)avg / 1e6,
		 (double)work_list->max_lat / 1e6,
		 (double)work_list->max_lat_at / 1e9);
}

1217
static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
1218
{
1219 1220
	if (l->thread == r->thread)
		return 0;
1221
	if (l->thread->tid < r->thread->tid)
1222
		return -1;
1223
	if (l->thread->tid > r->thread->tid)
1224
		return 1;
1225
	return (int)(l->thread - r->thread);
1226 1227
}

1228
static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248
{
	u64 avgl, avgr;

	if (!l->nb_atoms)
		return -1;

	if (!r->nb_atoms)
		return 1;

	avgl = l->total_lat / l->nb_atoms;
	avgr = r->total_lat / r->nb_atoms;

	if (avgl < avgr)
		return -1;
	if (avgl > avgr)
		return 1;

	return 0;
}

1249
static int max_cmp(struct work_atoms *l, struct work_atoms *r)
1250 1251 1252 1253 1254 1255 1256 1257 1258
{
	if (l->max_lat < r->max_lat)
		return -1;
	if (l->max_lat > r->max_lat)
		return 1;

	return 0;
}

1259
static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
1260 1261 1262 1263 1264 1265 1266 1267 1268
{
	if (l->nb_atoms < r->nb_atoms)
		return -1;
	if (l->nb_atoms > r->nb_atoms)
		return 1;

	return 0;
}

1269
static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
1270 1271 1272 1273 1274 1275 1276 1277 1278
{
	if (l->total_runtime < r->total_runtime)
		return -1;
	if (l->total_runtime > r->total_runtime)
		return 1;

	return 0;
}

1279
/*
 * Resolve a --sort key name ("avg", "max", "pid", "runtime", "switch") to
 * its comparator and append it to 'list'. Returns 0 on success, -1 if the
 * key is unknown.
 */
static int sort_dimension__add(const char *tok, struct list_head *list)
{
	size_t i;
	/* One static descriptor per supported sort key. */
	static struct sort_dimension avg_sort_dimension = {
		.name = "avg",
		.cmp  = avg_cmp,
	};
	static struct sort_dimension max_sort_dimension = {
		.name = "max",
		.cmp  = max_cmp,
	};
	static struct sort_dimension pid_sort_dimension = {
		.name = "pid",
		.cmp  = pid_cmp,
	};
	static struct sort_dimension runtime_sort_dimension = {
		.name = "runtime",
		.cmp  = runtime_cmp,
	};
	static struct sort_dimension switch_sort_dimension = {
		.name = "switch",
		.cmp  = switch_cmp,
	};
	struct sort_dimension *available_sorts[] = {
		&pid_sort_dimension,
		&avg_sort_dimension,
		&max_sort_dimension,
		&switch_sort_dimension,
		&runtime_sort_dimension,
	};

	for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
		if (!strcmp(available_sorts[i]->name, tok)) {
			list_add_tail(&available_sorts[i]->list, list);

			return 0;
		}
	}

	return -1;
}

1321
/*
 * Move every work_atoms node from the (pid-keyed) atom trees into
 * sorted_atom_root, re-inserting each under the user-selected sort keys.
 * Drains sched->atom_root first, then sched->merged_atom_root.
 */
static void perf_sched__sort_lat(struct perf_sched *sched)
{
	struct rb_node *node;
	struct rb_root *root = &sched->atom_root;
again:
	for (;;) {
		struct work_atoms *data;
		node = rb_first(root);
		if (!node)
			break;

		/* Detach from the source tree, re-insert sorted. */
		rb_erase(node, root);
		data = rb_entry(node, struct work_atoms, node);
		__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
	}
	/* After draining atom_root, drain the comm-merged tree too. */
	if (root == &sched->atom_root) {
		root = &sched->merged_atom_root;
		goto again;
	}
}

1342
static int process_sched_wakeup_event(struct perf_tool *tool,
1343
				      struct perf_evsel *evsel,
1344
				      struct perf_sample *sample,
1345
				      struct machine *machine)
1346
{
1347
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1348

1349 1350
	if (sched->tp_handler->wakeup_event)
		return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
1351

1352
	return 0;
1353 1354
}

J
Jiri Olsa 已提交
1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386
/*
 * Overlay for thread__priv(): the per-thread private pointer doubles as a
 * boolean "highlight this thread in the map" flag.
 */
union map_priv {
	void	*ptr;
	bool	 color;
};

/* True when the thread's priv pointer carries the color flag. */
static bool thread__has_color(struct thread *thread)
{
	union map_priv priv;

	priv.ptr = thread__priv(thread);

	return priv.color;
}

/*
 * machine__findnew_thread() wrapper for 'perf sched map': on first sight of
 * a thread, stash a color flag in its priv pointer when its tid is in the
 * --color-pids set.
 */
static struct thread*
map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid)
{
	struct thread *thread = machine__findnew_thread(machine, pid, tid);
	union map_priv priv = {
		.color = false,
	};

	/* Only tag once, and only when pid coloring is enabled. */
	if (sched->map.color_pids && thread && !thread__priv(thread)) {
		if (thread_map__has(sched->map.color_pids, tid))
			priv.color = true;

		thread__set_priv(thread, priv.ptr);
	}

	return thread;
}

1387 1388
static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine)
1389
{
1390 1391
	const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	struct thread *sched_in;
1392
	int new_shortname;
1393
	u64 timestamp0, timestamp = sample->time;
1394
	s64 delta;
1395 1396 1397
	int i, this_cpu = sample->cpu;
	int cpus_nr;
	bool new_cpu = false;
1398
	const char *color = PERF_COLOR_NORMAL;
1399 1400 1401

	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);

1402 1403
	if (this_cpu > sched->max_cpu)
		sched->max_cpu = this_cpu;
1404

1405 1406 1407 1408 1409 1410 1411 1412 1413
	if (sched->map.comp) {
		cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS);
		if (!test_and_set_bit(this_cpu, sched->map.comp_cpus_mask)) {
			sched->map.comp_cpus[cpus_nr++] = this_cpu;
			new_cpu = true;
		}
	} else
		cpus_nr = sched->max_cpu;

1414 1415
	timestamp0 = sched->cpu_last_switched[this_cpu];
	sched->cpu_last_switched[this_cpu] = timestamp;
1416 1417 1418 1419 1420
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

1421
	if (delta < 0) {
1422
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1423 1424
		return -1;
	}
1425

J
Jiri Olsa 已提交
1426
	sched_in = map__findnew_thread(sched, machine, -1, next_pid);
1427 1428
	if (sched_in == NULL)
		return -1;
1429

1430
	sched->curr_thread[this_cpu] = thread__get(sched_in);
1431 1432 1433 1434 1435

	printf("  ");

	new_shortname = 0;
	if (!sched_in->shortname[0]) {
1436 1437 1438 1439 1440 1441 1442
		if (!strcmp(thread__comm_str(sched_in), "swapper")) {
			/*
			 * Don't allocate a letter-number for swapper:0
			 * as a shortname. Instead, we use '.' for it.
			 */
			sched_in->shortname[0] = '.';
			sched_in->shortname[1] = ' ';
1443
		} else {
1444 1445 1446 1447 1448
			sched_in->shortname[0] = sched->next_shortname1;
			sched_in->shortname[1] = sched->next_shortname2;

			if (sched->next_shortname1 < 'Z') {
				sched->next_shortname1++;
1449
			} else {
1450 1451 1452 1453 1454
				sched->next_shortname1 = 'A';
				if (sched->next_shortname2 < '9')
					sched->next_shortname2++;
				else
					sched->next_shortname2 = '0';
1455 1456 1457 1458 1459
			}
		}
		new_shortname = 1;
	}

1460 1461
	for (i = 0; i < cpus_nr; i++) {
		int cpu = sched->map.comp ? sched->map.comp_cpus[i] : i;
J
Jiri Olsa 已提交
1462 1463 1464 1465 1466
		struct thread *curr_thread = sched->curr_thread[cpu];
		const char *pid_color = color;

		if (curr_thread && thread__has_color(curr_thread))
			pid_color = COLOR_PIDS;
1467

1468
		if (cpu != this_cpu)
1469
			color_fprintf(stdout, color, " ");
1470
		else
1471
			color_fprintf(stdout, color, "*");
1472

1473
		if (sched->curr_thread[cpu])
J
Jiri Olsa 已提交
1474
			color_fprintf(stdout, pid_color, "%2s ", sched->curr_thread[cpu]->shortname);
1475
		else
1476
			color_fprintf(stdout, color, "   ");
1477 1478
	}

1479
	color_fprintf(stdout, color, "  %12.6f secs ", (double)timestamp/1e9);
1480
	if (new_shortname) {
J
Jiri Olsa 已提交
1481 1482 1483 1484 1485 1486
		const char *pid_color = color;

		if (thread__has_color(sched_in))
			pid_color = COLOR_PIDS;

		color_fprintf(stdout, pid_color, "%s => %s:%d",
1487
		       sched_in->shortname, thread__comm_str(sched_in), sched_in->tid);
1488
	}
1489

1490
	if (sched->map.comp && new_cpu)
1491
		color_fprintf(stdout, color, " (CPU %d)", this_cpu);
1492

1493
	color_fprintf(stdout, color, "\n");
1494

1495 1496
	thread__put(sched_in);

1497
	return 0;
1498 1499
}

1500
/*
 * Dispatch a sched:sched_switch sample to the active handler, tracking the
 * pid believed to be current on each CPU so inconsistent traces (switching
 * away a task that was not current) can be counted.
 */
static int process_sched_switch_event(struct perf_tool *tool,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
	int this_cpu = sample->cpu, err = 0;
	u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
	    next_pid = perf_evsel__intval(evsel, sample, "next_pid");

	/* (u32)-1 means we have not seen a switch on this CPU yet. */
	if (sched->curr_pid[this_cpu] != (u32)-1) {
		/*
		 * Are we trying to switch away a PID that is
		 * not current?
		 */
		if (sched->curr_pid[this_cpu] != prev_pid)
			sched->nr_context_switch_bugs++;
	}

	if (sched->tp_handler->switch_event)
		err = sched->tp_handler->switch_event(sched, evsel, sample, machine);

	sched->curr_pid[this_cpu] = next_pid;
	return err;
}

1526
static int process_sched_runtime_event(struct perf_tool *tool,
1527
				       struct perf_evsel *evsel,
1528
				       struct perf_sample *sample,
1529
				       struct machine *machine)
1530
{
1531
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1532

1533 1534
	if (sched->tp_handler->runtime_event)
		return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
1535

1536
	return 0;
1537 1538
}

1539 1540 1541 1542
/*
 * PERF_RECORD_FORK handler: first let the generic machinery register the
 * new thread, then give the active sub-command handler a chance to react.
 */
static int perf_sched__process_fork_event(struct perf_tool *tool,
					  union perf_event *event,
					  struct perf_sample *sample,
					  struct machine *machine)
{
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);

	/* run the fork event through the perf machinery */
	perf_event__process_fork(tool, event, sample, machine);

	/* and then run additional processing needed for this command */
	if (sched->tp_handler->fork_event)
		return sched->tp_handler->fork_event(sched, event, machine);

	return 0;
}

1556
static int process_sched_migrate_task_event(struct perf_tool *tool,
1557
					    struct perf_evsel *evsel,
1558
					    struct perf_sample *sample,
1559
					    struct machine *machine)
1560
{
1561
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1562

1563 1564
	if (sched->tp_handler->migrate_task_event)
		return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
1565

1566
	return 0;
1567 1568
}

1569
/* Signature shared by all per-tracepoint sample handlers (evsel->handler). */
typedef int (*tracepoint_handler)(struct perf_tool *tool,
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);
I
Ingo Molnar 已提交
1573

1574 1575
/*
 * Generic sample callback: forward the sample to the per-tracepoint handler
 * installed on the evsel, if one was registered.
 */
static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
						 union perf_event *event __maybe_unused,
						 struct perf_sample *sample,
						 struct perf_evsel *evsel,
						 struct machine *machine)
{
	tracepoint_handler handler = evsel->handler;

	if (handler == NULL)
		return 0;

	return handler(tool, evsel, sample, machine);
}

1590
/*
 * Open the perf.data file, install the sched tracepoint handlers, process
 * all events, and record event/lost-event statistics on 'sched'.
 *
 * Returns 0 on success, -1 on failure.
 */
static int perf_sched__read_events(struct perf_sched *sched)
{
	/* Map each sched tracepoint to its dispatch function. */
	const struct perf_evsel_str_handler handlers[] = {
		{ "sched:sched_switch",	      process_sched_switch_event, },
		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
		{ "sched:sched_wakeup",	      process_sched_wakeup_event, },
		{ "sched:sched_wakeup_new",   process_sched_wakeup_event, },
		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
	};
	struct perf_session *session;
	struct perf_data_file file = {
		.path = input_name,
		.mode = PERF_DATA_MODE_READ,
		.force = sched->force,
	};
	int rc = -1;

	session = perf_session__new(&file, false, &sched->tool);
	if (session == NULL) {
		pr_debug("No Memory for session\n");
		return -1;
	}

	symbol__init(&session->header.env);

	if (perf_session__set_tracepoints_handlers(session, handlers))
		goto out_delete;

	/* Only process files recorded with raw trace data (record -R). */
	if (perf_session__has_traces(session, "record -R")) {
		int err = perf_session__process_events(session);
		if (err) {
			pr_err("Failed to process events, error %d", err);
			goto out_delete;
		}

		/* Cache stats for print_bad_events(). */
		sched->nr_events      = session->evlist->stats.nr_events[0];
		sched->nr_lost_events = session->evlist->stats.total_lost;
		sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
	}

	rc = 0;
out_delete:
	perf_session__delete(session);
	return rc;
}

1636
/*
 * Print a summary of trace-quality problems observed while processing:
 * unordered timestamps, lost events, and apparent context-switch bugs.
 * Each category is printed only when it occurred.
 */
static void print_bad_events(struct perf_sched *sched)
{
	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
			(double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
			sched->nr_unordered_timestamps, sched->nr_timestamps);
	}
	if (sched->nr_lost_events && sched->nr_events) {
		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
			(double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
			sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
	}
	if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
			(double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
			sched->nr_context_switch_bugs, sched->nr_timestamps);
		/* Lost events are a plausible cause of apparent switch bugs. */
		if (sched->nr_lost_events)
			printf(" (due to lost events?)");
		printf("\n");
	}
}

1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710
/*
 * Insert 'data' into the comm-keyed rbtree 'root'. If a node with the same
 * comm already exists, fold data's statistics and work list into it and
 * free 'data'; otherwise link 'data' as a new node.
 */
static void __merge_work_atoms(struct rb_root *root, struct work_atoms *data)
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;
	struct work_atoms *this;
	const char *comm = thread__comm_str(data->thread), *this_comm;

	while (*new) {
		int cmp;

		this = container_of(*new, struct work_atoms, node);
		parent = *new;

		this_comm = thread__comm_str(this->thread);
		cmp = strcmp(comm, this_comm);
		if (cmp > 0) {
			new = &((*new)->rb_left);
		} else if (cmp < 0) {
			new = &((*new)->rb_right);
		} else {
			/* Same comm: merge counters and splice the atoms. */
			this->num_merged++;
			this->total_runtime += data->total_runtime;
			this->nb_atoms += data->nb_atoms;
			this->total_lat += data->total_lat;
			list_splice(&data->work_list, &this->work_list);
			if (this->max_lat < data->max_lat) {
				this->max_lat = data->max_lat;
				this->max_lat_at = data->max_lat_at;
			}
			zfree(&data);
			return;
		}
	}

	/* No comm match found: insert as a new (singleton) merged node. */
	data->num_merged++;
	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
}

static void perf_sched__merge_lat(struct perf_sched *sched)
{
	struct work_atoms *data;
	struct rb_node *node;

	if (sched->skip_merge)
		return;

	while ((node = rb_first(&sched->atom_root))) {
		rb_erase(node, &sched->atom_root);
		data = rb_entry(node, struct work_atoms, node);
		__merge_work_atoms(&sched->merged_atom_root, data);
	}
}

1711
/*
 * 'perf sched latency' entry point: read events, merge per-comm, sort by
 * the selected keys, and print the latency table plus a trace-quality
 * summary. Returns 0 on success, -1 on failure.
 */
static int perf_sched__lat(struct perf_sched *sched)
{
	struct rb_node *next;

	setup_pager();

	if (perf_sched__read_events(sched))
		return -1;

	perf_sched__merge_lat(sched);
	perf_sched__sort_lat(sched);

	printf("\n -----------------------------------------------------------------------------------------------------------------\n");
	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at       |\n");
	printf(" -----------------------------------------------------------------------------------------------------------------\n");

	next = rb_first(&sched->sorted_atom_root);

	/* One row per (merged) thread, in sort order. */
	while (next) {
		struct work_atoms *work_list;

		work_list = rb_entry(next, struct work_atoms, node);
		output_lat_thread(sched, work_list);
		next = rb_next(next);
		/* Drop the thread reference held by the work list. */
		thread__zput(work_list->thread);
	}

	printf(" -----------------------------------------------------------------------------------------------------------------\n");
	printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
		(double)sched->all_runtime / 1e6, sched->all_count);

	printf(" ---------------------------------------------------\n");

	print_bad_events(sched);
	printf("\n");

	return 0;
}

1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761
/*
 * Record the number of configured CPUs and, in compact mode, allocate the
 * CPU display-order array. Returns 0 on success, -1 on allocation failure.
 */
static int setup_map_cpus(struct perf_sched *sched)
{
	sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);

	if (!sched->map.comp)
		return 0;

	sched->map.comp_cpus = zalloc(sched->max_cpu * sizeof(int));

	return sched->map.comp_cpus == NULL ? -1 : 0;
}

J
Jiri Olsa 已提交
1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778
static int setup_color_pids(struct perf_sched *sched)
{
	struct thread_map *map;

	if (!sched->map.color_pids_str)
		return 0;

	map = thread_map__new_by_tid_str(sched->map.color_pids_str);
	if (!map) {
		pr_err("failed to get thread map from %s\n", sched->map.color_pids_str);
		return -1;
	}

	sched->map.color_pids = map;
	return 0;
}

1779
/*
 * 'perf sched map' entry point: prepare the CPU and color-pid state, then
 * replay the trace through map_switch_event() and report trace quality.
 */
static int perf_sched__map(struct perf_sched *sched)
{
	/* Short-circuit keeps the original setup order. */
	if (setup_map_cpus(sched) || setup_color_pids(sched))
		return -1;

	setup_pager();

	if (perf_sched__read_events(sched))
		return -1;

	print_bad_events(sched);

	return 0;
}

1794
/*
 * 'perf sched replay' entry point: calibrate measurement overheads, read
 * the recorded trace, print its statistics, then spawn threads that re-run
 * the recorded scheduling pattern sched->replay_repeat times.
 */
static int perf_sched__replay(struct perf_sched *sched)
{
	unsigned long i;

	/* Measure run/sleep measurement overheads before replaying. */
	calibrate_run_measurement_overhead(sched);
	calibrate_sleep_measurement_overhead(sched);

	test_calibrations(sched);

	if (perf_sched__read_events(sched))
		return -1;

	printf("nr_run_events:        %ld\n", sched->nr_run_events);
	printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
	printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);

	if (sched->targetless_wakeups)
		printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
	if (sched->multitarget_wakeups)
		printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
	if (sched->nr_run_events_optimized)
		printf("run atoms optimized: %ld\n",
			sched->nr_run_events_optimized);

	print_task_traces(sched);
	add_cross_task_wakeups(sched);

	create_tasks(sched);
	printf("------------------------------------------------------------\n");
	for (i = 0; i < sched->replay_repeat; i++)
		run_one_test(sched);

	return 0;
}

1829 1830
static void setup_sorting(struct perf_sched *sched, const struct option *options,
			  const char * const usage_msg[])
1831
{
1832
	char *tmp, *tok, *str = strdup(sched->sort_order);
1833 1834 1835

	for (tok = strtok_r(str, ", ", &tmp);
			tok; tok = strtok_r(NULL, ", ", &tmp)) {
1836
		if (sort_dimension__add(tok, &sched->sort_list) < 0) {
1837 1838
			usage_with_options_msg(usage_msg, options,
					"Unknown --sort key: `%s'", tok);
1839 1840 1841 1842 1843
		}
	}

	free(str);

1844
	sort_dimension__add("pid", &sched->cmp_pid);
1845 1846
}

1847 1848 1849 1850
/*
 * 'perf sched record': build an argv for 'perf record' with all the sched
 * tracepoints enabled, append the user's extra arguments, and invoke
 * cmd_record(). Returns its result, or -ENOMEM on allocation failure.
 */
static int __cmd_record(int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-a",
		"-R",
		"-m", "1024",
		"-c", "1",
		"-e", "sched:sched_switch",
		"-e", "sched:sched_stat_wait",
		"-e", "sched:sched_stat_sleep",
		"-e", "sched:sched_stat_iowait",
		"-e", "sched:sched_stat_runtime",
		"-e", "sched:sched_process_fork",
		"-e", "sched:sched_wakeup",
		"-e", "sched:sched_wakeup_new",
		"-e", "sched:sched_migrate_task",
	};

	/* Fixed args plus the caller's args (minus the "record" token). */
	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	/* NOTE(review): strdup() results are not NULL-checked here. */
	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[i] = strdup(record_args[i]);

	for (j = 1; j < (unsigned int)argc; j++, i++)
		rec_argv[i] = argv[j];

	BUG_ON(i != rec_argc);

	return cmd_record(i, rec_argv, NULL);
}

1885
/*
 * Entry point for 'perf sched': set up default state and option tables,
 * then dispatch to the requested sub-command (record / latency / map /
 * replay / script).
 */
int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
{
	const char default_sort_order[] = "avg, max, switch, runtime";
	struct perf_sched sched = {
		.tool = {
			.sample		 = perf_sched__process_tracepoint_sample,
			.comm		 = perf_event__process_comm,
			.lost		 = perf_event__process_lost,
			.fork		 = perf_sched__process_fork_event,
			.ordered_events = true,
		},
		.cmp_pid	      = LIST_HEAD_INIT(sched.cmp_pid),
		.sort_list	      = LIST_HEAD_INIT(sched.sort_list),
		.start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
		.work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
		.sort_order	      = default_sort_order,
		.replay_repeat	      = 10,
		.profile_cpu	      = -1,
		.next_shortname1      = 'A',
		.next_shortname2      = '0',
		.skip_merge           = 0,
	};
	/* Per-subcommand option tables. */
	const struct option latency_options[] = {
	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
		   "sort by key(s): runtime, switch, avg, max"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_INTEGER('C', "CPU", &sched.profile_cpu,
		    "CPU to profile on"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_BOOLEAN('p', "pids", &sched.skip_merge,
		    "latency stats per pid instead of per comm"),
	OPT_END()
	};
	const struct option replay_options[] = {
	OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
		     "repeat the workload replay N times (-1: infinite)"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
	OPT_END()
	};
	const struct option sched_options[] = {
	OPT_STRING('i', "input", &input_name, "file",
		    "input file name"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const struct option map_options[] = {
	OPT_BOOLEAN(0, "compact", &sched.map.comp,
		    "map output in compact mode"),
	OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids",
		   "highlight given pids in map"),
	OPT_END()
	};
	const char * const latency_usage[] = {
		"perf sched latency [<options>]",
		NULL
	};
	const char * const replay_usage[] = {
		"perf sched replay [<options>]",
		NULL
	};
	const char * const map_usage[] = {
		"perf sched map [<options>]",
		NULL
	};
	const char *const sched_subcommands[] = { "record", "latency", "map",
						  "replay", "script", NULL };
	const char *sched_usage[] = {
		NULL,
		NULL
	};
	/* Tracepoint handler sets, one per analysis mode. */
	struct trace_sched_handler lat_ops  = {
		.wakeup_event	    = latency_wakeup_event,
		.switch_event	    = latency_switch_event,
		.runtime_event	    = latency_runtime_event,
		.migrate_task_event = latency_migrate_task_event,
	};
	struct trace_sched_handler map_ops  = {
		.switch_event	    = map_switch_event,
	};
	struct trace_sched_handler replay_ops  = {
		.wakeup_event	    = replay_wakeup_event,
		.switch_event	    = replay_switch_event,
		.fork_event	    = replay_fork_event,
	};
	unsigned int i;

	/* (u32)-1 marks "current pid unknown" for every CPU. */
	for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
		sched.curr_pid[i] = -1;

	argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
					sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc)
		usage_with_options(sched_usage, sched_options);

	/*
	 * Aliased to 'perf script' for now:
	 */
	if (!strcmp(argv[0], "script"))
		return cmd_script(argc, argv, prefix);

	if (!strncmp(argv[0], "rec", 3)) {
		return __cmd_record(argc, argv);
	} else if (!strncmp(argv[0], "lat", 3)) {
		sched.tp_handler = &lat_ops;
		if (argc > 1) {
			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
			if (argc)
				usage_with_options(latency_usage, latency_options);
		}
		setup_sorting(&sched, latency_options, latency_usage);
		return perf_sched__lat(&sched);
	} else if (!strcmp(argv[0], "map")) {
		if (argc) {
			argc = parse_options(argc, argv, map_options, map_usage, 0);
			if (argc)
				usage_with_options(map_usage, map_options);
		}
		/* 'map' reuses the latency sorting setup. */
		sched.tp_handler = &map_ops;
		setup_sorting(&sched, latency_options, latency_usage);
		return perf_sched__map(&sched);
	} else if (!strncmp(argv[0], "rep", 3)) {
		sched.tp_handler = &replay_ops;
		if (argc) {
			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
			if (argc)
				usage_with_options(replay_usage, replay_options);
		}
		return perf_sched__replay(&sched);
	} else {
		usage_with_options(sched_usage, sched_options);
	}

	return 0;
}