trace_syscalls.c 15.7 KB
Newer Older
1
#include <trace/syscall.h>
2
#include <trace/events/syscalls.h>
3
#include <linux/slab.h>
4
#include <linux/kernel.h>
5
#include <linux/ftrace.h>
6
#include <linux/perf_event.h>
7 8 9 10 11
#include <asm/syscall.h>

#include "trace_output.h"
#include "trace.h"

12
/* Serializes syscall-trace (un)registration and guards the refcounts and
 * enable bitmaps below. */
static DEFINE_MUTEX(syscall_trace_lock);
13 14
/* Number of syscalls currently traced at entry/exit; both are read and
 * written only under syscall_trace_lock. */
static int sys_refcount_enter;
static int sys_refcount_exit;
15 16
/* Per-syscall enable bits consulted by the ftrace enter/exit probes. */
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17

18 19 20 21 22
/* Forward declarations: (un)registration callbacks used by the event
 * classes defined below. */
static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type);
static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type);

23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
/* Forward declarations: field-definition hooks for the event classes. */
static int syscall_enter_define_fields(struct ftrace_event_call *call);
static int syscall_exit_define_fields(struct ftrace_event_call *call);

/*
 * Return the list of trace fields attached to a syscall-entry event.
 * call->data holds the event's struct syscall_metadata.
 */
static struct list_head *
syscall_get_enter_fields(struct ftrace_event_call *call)
{
	struct syscall_metadata *meta = call->data;

	return &meta->enter_fields;
}

/*
 * Return the list of trace fields attached to a syscall-exit event.
 * call->data holds the event's struct syscall_metadata.
 */
static struct list_head *
syscall_get_exit_fields(struct ftrace_event_call *call)
{
	struct syscall_metadata *meta = call->data;

	return &meta->exit_fields;
}

42 43
struct ftrace_event_class event_class_syscall_enter = {
	.system			= "syscalls",
44 45 46
	.reg			= syscall_enter_register,
	.define_fields		= syscall_enter_define_fields,
	.get_fields		= syscall_get_enter_fields,
47
	.raw_init		= init_syscall_trace,
48 49 50 51
};

struct ftrace_event_class event_class_syscall_exit = {
	.system			= "syscalls",
52 53 54
	.reg			= syscall_exit_register,
	.define_fields		= syscall_exit_define_fields,
	.get_fields		= syscall_get_exit_fields,
55
	.raw_init		= init_syscall_trace,
56 57
};

58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
/* Bounds of the linker section holding every syscall's build-time metadata. */
extern unsigned long __start_syscalls_metadata[];
extern unsigned long __stop_syscalls_metadata[];

/* Indexed by syscall number; filled in by init_ftrace_syscalls(). */
static struct syscall_metadata **syscalls_metadata;

/*
 * Map a syscall handler address to its metadata entry.
 *
 * Resolves the symbol name of @syscall via kallsyms, then linearly scans
 * the __syscalls_metadata section for a matching name.  Returns NULL when
 * nothing matches (e.g. an unimplemented slot in the syscall table).
 */
static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
{
	struct syscall_metadata *start;
	struct syscall_metadata *stop;
	char str[KSYM_SYMBOL_LEN];


	start = (struct syscall_metadata *)__start_syscalls_metadata;
	stop = (struct syscall_metadata *)__stop_syscalls_metadata;
	kallsyms_lookup(syscall, NULL, NULL, NULL, str);

	for ( ; start < stop; start++) {
		/*
		 * Only compare after the "sys" prefix. Archs that use
		 * syscall wrappers may have syscalls symbols aliases prefixed
		 * with "SyS" instead of "sys", leading to an unwanted
		 * mismatch.
		 */
		/* NOTE(review): this assumes both names are at least three
		 * characters long — verify for arch-specific symbol naming. */
		if (start->name && !strcmp(start->name + 3, str + 3))
			return start;
	}
	return NULL;
}

/*
 * Translate a syscall number into its metadata entry, or NULL when the
 * number is out of range or the metadata table is not yet initialized.
 */
static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (nr < 0 || nr >= NR_syscalls || !syscalls_metadata)
		return NULL;

	return syscalls_metadata[nr];
}

95 96 97 98 99 100 101 102 103
enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, ret, syscall;

104
	trace = (typeof(trace))ent;
105 106
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);
107

108 109 110
	if (!entry)
		goto end;

111
	if (entry->enter_event->id != ent->type) {
112 113 114 115
		WARN_ON_ONCE(1);
		goto end;
	}

116 117 118 119 120 121
	ret = trace_seq_printf(s, "%s(", entry->name);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	for (i = 0; i < entry->nb_args; i++) {
		/* parameter types */
122
		if (trace_flags & TRACE_ITER_VERBOSE) {
123 124 125 126 127
			ret = trace_seq_printf(s, "%s ", entry->types[i]);
			if (!ret)
				return TRACE_TYPE_PARTIAL_LINE;
		}
		/* parameter values */
128
		ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
129
				       trace->args[i],
130
				       i == entry->nb_args - 1 ? "" : ", ");
131 132 133 134
		if (!ret)
			return TRACE_TYPE_PARTIAL_LINE;
	}

135 136 137 138
	ret = trace_seq_putc(s, ')');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

139
end:
140 141 142 143
	ret =  trace_seq_putc(s, '\n');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

144 145 146 147 148 149 150 151 152 153 154 155 156
	return TRACE_TYPE_HANDLED;
}

enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_exit *trace;
	int syscall;
	struct syscall_metadata *entry;
	int ret;

157
	trace = (typeof(trace))ent;
158 159
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);
160

161 162 163 164 165
	if (!entry) {
		trace_seq_printf(s, "\n");
		return TRACE_TYPE_HANDLED;
	}

166
	if (entry->exit_event->id != ent->type) {
167 168 169 170
		WARN_ON_ONCE(1);
		return TRACE_TYPE_UNHANDLED;
	}

171 172 173 174 175 176 177 178
	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
				trace->ret);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

179 180 181 182 183
/* Deliberately never defined: referencing it turns a size mismatch in
 * SYSCALL_FIELD() into a build failure (presumably at link time — there is
 * no definition in this file). */
extern char *__bad_type_size(void);

/*
 * Expand to the (type-string, name-string, offset, size, signedness)
 * argument pack expected by trace_define_field(), while checking that
 * sizeof(type) matches the size of the corresponding struct field.
 * Relies on a local variable named "trace" being in scope at the call site.
 */
#define SYSCALL_FIELD(type, name)					\
	sizeof(type) != sizeof(trace.name) ?				\
		__bad_type_size() :					\
		#type, #name, offsetof(typeof(trace), name),		\
		sizeof(trace.name), is_signed_type(type)
186

187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247
/*
 * Build the print_fmt string for a syscall-entry event, e.g.
 *   "\"arg0: 0x%016lx, arg1: 0x%016lx\", ((unsigned long)(REC->arg0)), ...
 * When @len is 0 nothing is written and only the needed length is computed.
 * Returns the length of the generated format (excluding the NUL).
 */
static int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf,
				 int len)
{
	int pos = 0;
	int i;

	/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)

	/* Quoted format: one "name: 0x%0<width>lx" per syscall argument. */
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++)
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
				entry->args[i], sizeof(unsigned long),
				i == entry->nb_args - 1 ? "" : ", ");
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	/* Argument list: one REC-> accessor per syscall argument. */
	for (i = 0; i < entry->nb_args; i++)
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}

/*
 * Install call->print_fmt.  Exit events all share one static format; entry
 * events get a kmalloc'd format built from their metadata.
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
static int set_syscall_print_fmt(struct ftrace_event_call *call)
{
	struct syscall_metadata *meta = call->data;
	char *fmt;
	int needed;

	/* Exit events only print the return value. */
	if (meta->enter_event != call) {
		call->print_fmt = "\"0x%lx\", REC->ret";
		return 0;
	}

	/* Pass 1: measure.  Pass 2: actually format into the buffer. */
	needed = __set_enter_print_fmt(meta, NULL, 0);

	fmt = kmalloc(needed + 1, GFP_KERNEL);
	if (!fmt)
		return -ENOMEM;

	__set_enter_print_fmt(meta, fmt, needed + 1);
	call->print_fmt = fmt;

	return 0;
}

/*
 * Release call->print_fmt.  Only entry events own a kmalloc'd format;
 * exit events point at a static literal (see set_syscall_print_fmt()).
 */
static void free_syscall_print_fmt(struct ftrace_event_call *call)
{
	struct syscall_metadata *meta = call->data;

	if (meta->enter_event == call)
		kfree(call->print_fmt);
}

248
static int syscall_enter_define_fields(struct ftrace_event_call *call)
249 250
{
	struct syscall_trace_enter trace;
251
	struct syscall_metadata *meta = call->data;
252 253 254 255
	int ret;
	int i;
	int offset = offsetof(typeof(trace), args);

256 257 258 259
	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

260
	for (i = 0; i < meta->nb_args; i++) {
261 262
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
263 264
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
265 266 267 268 269 270
		offset += sizeof(unsigned long);
	}

	return ret;
}

271
static int syscall_exit_define_fields(struct ftrace_event_call *call)
272 273 274 275
{
	struct syscall_trace_exit trace;
	int ret;

276 277 278 279
	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

280
	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
281
				 FILTER_OTHER);
282 283 284 285

	return ret;
}

286
void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
287
{
288 289 290
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
291
	struct ring_buffer *buffer;
292
	int size;
293 294 295
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
296 297
	if (syscall_nr < 0)
		return;
298 299
	if (!test_bit(syscall_nr, enabled_enter_syscalls))
		return;
300

301 302 303 304 305 306
	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

307 308
	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->enter_event->id, size, 0, 0);
309 310 311 312 313 314 315
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);

316 317 318
	if (!filter_current_check_discard(buffer, sys_data->enter_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
319 320
}

321
void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
322
{
323 324 325
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
326
	struct ring_buffer *buffer;
327 328 329
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
330 331
	if (syscall_nr < 0)
		return;
332 333
	if (!test_bit(syscall_nr, enabled_exit_syscalls))
		return;
334

335 336 337 338
	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

339 340
	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->exit_event->id, sizeof(*entry), 0, 0);
341 342 343 344 345 346 347
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

348 349 350
	if (!filter_current_check_discard(buffer, sys_data->exit_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
351 352
}

353
int reg_event_syscall_enter(struct ftrace_event_call *call)
354
{
355 356 357
	int ret = 0;
	int num;

358
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
359
	if (num < 0 || num >= NR_syscalls)
360 361 362
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_enter)
363
		ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
364
	if (!ret) {
365 366 367 368 369
		set_bit(num, enabled_enter_syscalls);
		sys_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
370 371
}

372
void unreg_event_syscall_enter(struct ftrace_event_call *call)
373
{
374
	int num;
375

376
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
377
	if (num < 0 || num >= NR_syscalls)
378 379 380 381 382
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_enter--;
	clear_bit(num, enabled_enter_syscalls);
	if (!sys_refcount_enter)
383
		unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
384 385
	mutex_unlock(&syscall_trace_lock);
}
386

387
int reg_event_syscall_exit(struct ftrace_event_call *call)
388
{
389 390 391
	int ret = 0;
	int num;

392
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
393
	if (num < 0 || num >= NR_syscalls)
394 395 396
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_exit)
397
		ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
398
	if (!ret) {
399 400
		set_bit(num, enabled_exit_syscalls);
		sys_refcount_exit++;
401
	}
402 403 404
	mutex_unlock(&syscall_trace_lock);
	return ret;
}
405

406
void unreg_event_syscall_exit(struct ftrace_event_call *call)
407 408
{
	int num;
409

410
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
411
	if (num < 0 || num >= NR_syscalls)
412 413 414 415 416
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_exit--;
	clear_bit(num, enabled_exit_syscalls);
	if (!sys_refcount_exit)
417
		unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
418
	mutex_unlock(&syscall_trace_lock);
419
}
420

421 422 423 424
int init_syscall_trace(struct ftrace_event_call *call)
{
	int id;

425 426 427
	if (set_syscall_print_fmt(call) < 0)
		return -ENOMEM;

428 429 430
	id = trace_event_raw_init(call);

	if (id < 0) {
431
		free_syscall_print_fmt(call);
432
		return id;
433
	}
434 435

	return id;
436 437
}

438 439 440 441 442
/*
 * Translate a syscall number into the address of its handler by indexing
 * the architecture's sys_call_table.
 */
unsigned long __init arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}

443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458
/*
 * Boot-time setup: allocate the syscall-number -> metadata table and fill
 * it by resolving every syscall table entry against the metadata section.
 * Slots without metadata (unimplemented syscalls) stay NULL.
 */
int __init init_ftrace_syscalls(void)
{
	struct syscall_metadata *meta;
	unsigned long addr;
	int i;

	/* kcalloc() zeroes like kzalloc() and also checks the count*size
	 * multiplication for overflow. */
	syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata),
				    GFP_KERNEL);
	if (!syscalls_metadata) {
		WARN_ON(1);
		return -ENOMEM;
	}

	for (i = 0; i < NR_syscalls; i++) {
		addr = arch_syscall_addr(i);
		meta = find_syscall_meta(addr);
		if (!meta)
			continue;

		meta->syscall_nr = i;
		syscalls_metadata[i] = meta;
	}

	return 0;
}
core_initcall(init_ftrace_syscalls);

470
#ifdef CONFIG_PERF_EVENTS
471

472 473 474 475
/* Perf counterparts of the ftrace enable bitmaps and refcounts; written
 * under syscall_trace_lock. */
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
476

477
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
478 479
{
	struct syscall_metadata *sys_data;
480 481
	struct syscall_trace_enter *rec;
	unsigned long flags;
482
	int syscall_nr;
483
	int rctx;
484
	int size;
485 486

	syscall_nr = syscall_get_nr(current, regs);
487
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
488 489 490 491 492 493
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

494 495 496 497 498
	/* get the size after alignment with the u32 buffer size field */
	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

499 500
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		      "perf buffer not large enough"))
501 502
		return;

503
	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
504 505 506
				sys_data->enter_event->id, &rctx, &flags);
	if (!rec)
		return;
507 508 509 510

	rec->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
			       (unsigned long *)&rec->args);
511
	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
512 513
}

514
int perf_sysenter_enable(struct ftrace_event_call *call)
515 516 517 518
{
	int ret = 0;
	int num;

519
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
520 521

	mutex_lock(&syscall_trace_lock);
522
	if (!sys_perf_refcount_enter)
523
		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
524 525 526 527
	if (ret) {
		pr_info("event trace: Could not activate"
				"syscall entry trace point");
	} else {
528 529
		set_bit(num, enabled_perf_enter_syscalls);
		sys_perf_refcount_enter++;
530 531 532 533 534
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

535
void perf_sysenter_disable(struct ftrace_event_call *call)
536 537 538
{
	int num;

539
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
540 541

	mutex_lock(&syscall_trace_lock);
542 543 544
	sys_perf_refcount_enter--;
	clear_bit(num, enabled_perf_enter_syscalls);
	if (!sys_perf_refcount_enter)
545
		unregister_trace_sys_enter(perf_syscall_enter, NULL);
546 547 548
	mutex_unlock(&syscall_trace_lock);
}

549
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
550 551
{
	struct syscall_metadata *sys_data;
552 553
	struct syscall_trace_exit *rec;
	unsigned long flags;
554
	int syscall_nr;
555
	int rctx;
556
	int size;
557 558

	syscall_nr = syscall_get_nr(current, regs);
559
	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
560 561 562 563 564 565
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

566 567 568
	/* We can probably do that at build time */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);
569

570 571 572 573
	/*
	 * Impossible, but be paranoid with the future
	 * How to put this check outside runtime?
	 */
574 575
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		"exit event has grown above perf buffer size"))
576 577
		return;

578
	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
579 580 581
				sys_data->exit_event->id, &rctx, &flags);
	if (!rec)
		return;
582 583 584 585

	rec->nr = syscall_nr;
	rec->ret = syscall_get_return_value(current, regs);

586
	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
587 588
}

589
int perf_sysexit_enable(struct ftrace_event_call *call)
590 591 592 593
{
	int ret = 0;
	int num;

594
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
595 596

	mutex_lock(&syscall_trace_lock);
597
	if (!sys_perf_refcount_exit)
598
		ret = register_trace_sys_exit(perf_syscall_exit, NULL);
599 600
	if (ret) {
		pr_info("event trace: Could not activate"
601
				"syscall exit trace point");
602
	} else {
603 604
		set_bit(num, enabled_perf_exit_syscalls);
		sys_perf_refcount_exit++;
605 606 607 608 609
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

610
void perf_sysexit_disable(struct ftrace_event_call *call)
611 612 613
{
	int num;

614
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
615 616

	mutex_lock(&syscall_trace_lock);
617 618 619
	sys_perf_refcount_exit--;
	clear_bit(num, enabled_perf_exit_syscalls);
	if (!sys_perf_refcount_exit)
620
		unregister_trace_sys_exit(perf_syscall_exit, NULL);
621 622 623
	mutex_unlock(&syscall_trace_lock);
}

624
#endif /* CONFIG_PERF_EVENTS */
625

626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666
/*
 * Event-class registration callback: dispatch trace core requests for a
 * syscall-entry event to the ftrace or perf backend.
 */
static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type)
{
	if (type == TRACE_REG_REGISTER)
		return reg_event_syscall_enter(event);
	if (type == TRACE_REG_UNREGISTER) {
		unreg_event_syscall_enter(event);
		return 0;
	}

#ifdef CONFIG_PERF_EVENTS
	if (type == TRACE_REG_PERF_REGISTER)
		return perf_sysenter_enable(event);
	if (type == TRACE_REG_PERF_UNREGISTER) {
		perf_sysenter_disable(event);
		return 0;
	}
#endif
	return 0;
}

/*
 * Event-class registration callback: dispatch trace core requests for a
 * syscall-exit event to the ftrace or perf backend.
 */
static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type)
{
	if (type == TRACE_REG_REGISTER)
		return reg_event_syscall_exit(event);
	if (type == TRACE_REG_UNREGISTER) {
		unreg_event_syscall_exit(event);
		return 0;
	}

#ifdef CONFIG_PERF_EVENTS
	if (type == TRACE_REG_PERF_REGISTER)
		return perf_sysexit_enable(event);
	if (type == TRACE_REG_PERF_UNREGISTER) {
		perf_sysexit_disable(event);
		return 0;
	}
#endif
	return 0;
}