#include <trace/syscall.h>
#include <trace/events/syscalls.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/ftrace.h>
#include <linux/perf_event.h>
#include <asm/syscall.h>

#include "trace_output.h"
#include "trace.h"

static DEFINE_MUTEX(syscall_trace_lock);
static int sys_refcount_enter;
static int sys_refcount_exit;
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);

static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type);
static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type);

static int syscall_enter_define_fields(struct ftrace_event_call *call);
static int syscall_exit_define_fields(struct ftrace_event_call *call);

static struct list_head *
syscall_get_enter_fields(struct ftrace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	return &entry->enter_fields;
}

struct trace_event_functions enter_syscall_print_funcs = {
	.trace		= print_syscall_enter,
};

struct trace_event_functions exit_syscall_print_funcs = {
	.trace		= print_syscall_exit,
};

struct ftrace_event_class event_class_syscall_enter = {
	.system		= "syscalls",
	.reg		= syscall_enter_register,
	.define_fields	= syscall_enter_define_fields,
	.get_fields	= syscall_get_enter_fields,
	.raw_init	= init_syscall_trace,
};

struct ftrace_event_class event_class_syscall_exit = {
	.system		= "syscalls",
	.reg		= syscall_exit_register,
	.define_fields	= syscall_exit_define_fields,
	.fields		= LIST_HEAD_INIT(event_class_syscall_exit.fields),
	.raw_init	= init_syscall_trace,
};

extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];

static struct syscall_metadata **syscalls_metadata;

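/*
 * Resolve a syscall handler address to its syscall_metadata entry by
 * looking up the symbol name via kallsyms and matching it against the
 * metadata collected in the __syscalls_metadata section.
 */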
static __init struct syscall_metadata *
find_syscall_meta(unsigned long syscall)
{
	struct syscall_metadata **start;
	struct syscall_metadata **stop;
	char str[KSYM_SYMBOL_LEN];

	start = __start_syscalls_metadata;
	stop = __stop_syscalls_metadata;
	kallsyms_lookup(syscall, NULL, NULL, NULL, str);

	for ( ; start < stop; start++) {
		/*
		 * Only compare after the "sys" prefix. Archs that use
		 * syscall wrappers may list their syscall symbols under a
		 * "SyS" alias instead of "sys", which would otherwise cause
		 * an unwanted mismatch.
		 */
		if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
			return *start;
	}
	return NULL;
}

static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
		return NULL;

	return syscalls_metadata[nr];
}

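/*
 * Trace output handler for syscall entry events: prints
 * "name(arg0: value, ..., argN: value)", with the argument types
 * included when the verbose trace flag is set.
 */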
enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
		    struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, ret, syscall;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry)
		goto end;

	if (entry->enter_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		goto end;
	}

	ret = trace_seq_printf(s, "%s(", entry->name);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	for (i = 0; i < entry->nb_args; i++) {
		/* parameter types */
		if (trace_flags & TRACE_ITER_VERBOSE) {
			ret = trace_seq_printf(s, "%s ", entry->types[i]);
			if (!ret)
				return TRACE_TYPE_PARTIAL_LINE;
		}
		/* parameter values */
		ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
				       trace->args[i],
				       i == entry->nb_args - 1 ? "" : ", ");
		if (!ret)
			return TRACE_TYPE_PARTIAL_LINE;
	}

	ret = trace_seq_putc(s, ')');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

end:
	ret = trace_seq_putc(s, '\n');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

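/*
 * Trace output handler for syscall exit events: prints
 * "name -> 0x<return value>".
 */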
enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags,
		   struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_exit *trace;
	int syscall;
	struct syscall_metadata *entry;
	int ret;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry) {
		trace_seq_printf(s, "\n");
		return TRACE_TYPE_HANDLED;
	}

	if (entry->exit_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		return TRACE_TYPE_UNHANDLED;
	}

	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
				trace->ret);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

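/*
 * SYSCALL_FIELD() expands to the type name, field name, offset, size and
 * signedness arguments expected by trace_define_field(). If the declared
 * type does not match the size of the field in the trace record, the call
 * to the never-defined __bad_type_size() is kept and the build fails at
 * link time.
 */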
extern char *__bad_type_size(void);

#define SYSCALL_FIELD(type, name)					\
	sizeof(type) != sizeof(trace.name) ?				\
		__bad_type_size() :					\
		#type, #name, offsetof(typeof(trace), name),		\
		sizeof(trace.name), is_signed_type(type)

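/*
 * Build the print_fmt string exported for a syscall entry event: a quoted
 * format with one "arg: 0x%0<width>lx" conversion per argument, followed
 * by the matching REC-> argument references. The first pass is done with
 * len == 0 just to size the buffer.
 */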
static
int  __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	int i;
	int pos = 0;

	/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
				entry->args[i], sizeof(unsigned long),
				i == entry->nb_args - 1 ? "" : ", ");
	}
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);
	}

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}

static int set_syscall_print_fmt(struct ftrace_event_call *call)
{
	char *print_fmt;
	int len;
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event != call) {
		call->print_fmt = "\"0x%lx\", REC->ret";
		return 0;
	}

	/* First: called with 0 length to calculate the needed length */
	len = __set_enter_print_fmt(entry, NULL, 0);

	print_fmt = kmalloc(len + 1, GFP_KERNEL);
	if (!print_fmt)
		return -ENOMEM;

	/* Second: actually write the @print_fmt */
	__set_enter_print_fmt(entry, print_fmt, len + 1);
	call->print_fmt = print_fmt;

	return 0;
}

static void free_syscall_print_fmt(struct ftrace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event == call)
		kfree(call->print_fmt);
}

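/*
 * Describe the layout of a syscall entry record to the filter code:
 * the syscall number followed by one unsigned long slot per argument.
 */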
static int syscall_enter_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_enter trace;
	struct syscall_metadata *meta = call->data;
	int ret;
	int i;
	int offset = offsetof(typeof(trace), args);

	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

	for (i = 0; i < meta->nb_args; i++) {
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
		offset += sizeof(unsigned long);
	}

	return ret;
}

static int syscall_exit_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_exit trace;
	int ret;

	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
				 FILTER_OTHER);

	return ret;
}

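/*
 * Probe attached to the raw sys_enter tracepoint: if tracing is enabled
 * for this syscall, reserve space in the ring buffer and record the
 * syscall number and its arguments.
 */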
void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int size;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->enter_event->event.type, size, 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);

	if (!filter_current_check_discard(buffer, sys_data->enter_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

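/*
 * Probe attached to the raw sys_exit tracepoint: if tracing is enabled
 * for this syscall, record the syscall number and its return value.
 */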
void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	if (!filter_current_check_discard(buffer, sys_data->exit_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

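/*
 * Enable ftrace recording of a single syscall's entry event. The
 * sys_enter tracepoint probe is registered only for the first enabled
 * syscall; sys_refcount_enter and the enabled bitmap are protected by
 * syscall_trace_lock.
 */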
int reg_event_syscall_enter(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_enter)
		ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
	if (!ret) {
		set_bit(num, enabled_enter_syscalls);
		sys_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void unreg_event_syscall_enter(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_enter--;
	clear_bit(num, enabled_enter_syscalls);
	if (!sys_refcount_enter)
		unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
	mutex_unlock(&syscall_trace_lock);
}

int reg_event_syscall_exit(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_exit)
		ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
	if (!ret) {
		set_bit(num, enabled_exit_syscalls);
		sys_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void unreg_event_syscall_exit(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_exit--;
	clear_bit(num, enabled_exit_syscalls);
	if (!sys_refcount_exit)
		unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
	mutex_unlock(&syscall_trace_lock);
}

int init_syscall_trace(struct ftrace_event_call *call)
{
	int id;

	if (set_syscall_print_fmt(call) < 0)
		return -ENOMEM;

	id = trace_event_raw_init(call);

	if (id < 0) {
		free_syscall_print_fmt(call);
		return id;
	}

	return id;
}

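/* Resolve a syscall number to the address of its handler in sys_call_table. */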
unsigned long __init arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}

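/*
 * Early initialization: allocate the syscalls_metadata lookup table and
 * fill it by matching each sys_call_table entry against the compiled-in
 * syscall metadata, so syscall_nr_to_meta() becomes a simple index.
 */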
int __init init_ftrace_syscalls(void)
{
	struct syscall_metadata *meta;
	unsigned long addr;
	int i;

	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
					NR_syscalls, GFP_KERNEL);
	if (!syscalls_metadata) {
		WARN_ON(1);
		return -ENOMEM;
	}

	for (i = 0; i < NR_syscalls; i++) {
		addr = arch_syscall_addr(i);
		meta = find_syscall_meta(addr);
		if (!meta)
			continue;

		meta->syscall_nr = i;
		syscalls_metadata[i] = meta;
	}

	return 0;
}
core_initcall(init_ftrace_syscalls);

#ifdef CONFIG_PERF_EVENTS

static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;

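/*
 * Perf probe for sys_enter: the record is padded so that, together with
 * the u32 size field added by the perf output code, it stays u64-aligned
 * before being submitted to the event's per-CPU hlist.
 */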
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_enter *rec;
	struct hlist_head *head;
	int syscall_nr;
	int rctx;
	int size;

	syscall_nr = syscall_get_nr(current, regs);
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* get the size after alignment with the u32 buffer size field */
	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		      "perf buffer not large enough"))
		return;

	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
				sys_data->enter_event->event.type, regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
			       (unsigned long *)&rec->args);

	head = this_cpu_ptr(sys_data->enter_event->perf_events);
	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
}

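/*
 * Enable/disable perf profiling of syscall entry: the shared
 * perf_syscall_enter probe is registered when the first event is enabled
 * and torn down when the last one goes away, mirroring the ftrace
 * variants above.
 */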
int perf_sysenter_enable(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	if (!sys_perf_refcount_enter)
		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
	if (ret) {
		pr_info("event trace: Could not activate"
				" syscall entry trace point");
	} else {
		set_bit(num, enabled_perf_enter_syscalls);
		sys_perf_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void perf_sysenter_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_perf_refcount_enter--;
	clear_bit(num, enabled_perf_enter_syscalls);
	if (!sys_perf_refcount_enter)
		unregister_trace_sys_enter(perf_syscall_enter, NULL);
	mutex_unlock(&syscall_trace_lock);
}

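/*
 * Perf probe for sys_exit: records the syscall number and return value;
 * the fixed record size is aligned the same way as the entry record.
 */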
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_exit *rec;
	struct hlist_head *head;
	int syscall_nr;
	int rctx;
	int size;

	syscall_nr = syscall_get_nr(current, regs);
	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* We can probably do that at build time */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	/*
	 * Impossible, but be paranoid with the future
	 * How to put this check outside runtime?
	 */
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		"exit event has grown above perf buffer size"))
		return;

	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
				sys_data->exit_event->event.type, regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	rec->ret = syscall_get_return_value(current, regs);

	head = this_cpu_ptr(sys_data->exit_event->perf_events);
	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
}

int perf_sysexit_enable(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	if (!sys_perf_refcount_exit)
		ret = register_trace_sys_exit(perf_syscall_exit, NULL);
	if (ret) {
		pr_info("event trace: Could not activate"
				" syscall exit trace point");
	} else {
		set_bit(num, enabled_perf_exit_syscalls);
		sys_perf_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void perf_sysexit_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_perf_refcount_exit--;
	clear_bit(num, enabled_perf_exit_syscalls);
	if (!sys_perf_refcount_exit)
		unregister_trace_sys_exit(perf_syscall_exit, NULL);
	mutex_unlock(&syscall_trace_lock);
}

#endif /* CONFIG_PERF_EVENTS */

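/*
 * ->reg() callbacks of the syscall event classes: dispatch the
 * TRACE_REG_* requests from the event core to the ftrace or perf
 * enable/disable helpers above.
 */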
static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type)
{
	switch (type) {
	case TRACE_REG_REGISTER:
		return reg_event_syscall_enter(event);
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_enter(event);
		return 0;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return perf_sysenter_enable(event);
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysenter_disable(event);
		return 0;
#endif
	}
	return 0;
}

static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type)
{
	switch (type) {
	case TRACE_REG_REGISTER:
		return reg_event_syscall_exit(event);
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_exit(event);
		return 0;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return perf_sysexit_enable(event);
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysexit_disable(event);
		return 0;
#endif
	}
	return 0;
}