#include <trace/syscall.h>
#include <trace/events/syscalls.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h>	/* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
#include <linux/ftrace.h>
#include <linux/perf_event.h>
#include <asm/syscall.h>
#include <asm/asm-offsets.h>

#include "trace_output.h"
#include "trace.h"

static DEFINE_MUTEX(syscall_trace_lock);
static int sys_refcount_enter;
static int sys_refcount_exit;
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);

static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type);
static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type);

static int syscall_enter_define_fields(struct ftrace_event_call *call);
static int syscall_exit_define_fields(struct ftrace_event_call *call);

static struct list_head *
syscall_get_enter_fields(struct ftrace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	return &entry->enter_fields;
}

struct trace_event_functions enter_syscall_print_funcs = {
	.trace		= print_syscall_enter,
};

struct trace_event_functions exit_syscall_print_funcs = {
	.trace		= print_syscall_exit,
};

struct ftrace_event_class event_class_syscall_enter = {
	.system		= "syscalls",
	.reg		= syscall_enter_register,
	.define_fields	= syscall_enter_define_fields,
	.get_fields	= syscall_get_enter_fields,
	.raw_init	= init_syscall_trace,
};

struct ftrace_event_class event_class_syscall_exit = {
	.system		= "syscalls",
	.reg		= syscall_exit_register,
	.define_fields	= syscall_exit_define_fields,
	.fields		= LIST_HEAD_INIT(event_class_syscall_exit.fields),
	.raw_init	= init_syscall_trace,
};

extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];
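/*
 * The two symbols above are the linker-provided bounds of the
 * __syscalls_metadata section: every syscall declared through
 * SYSCALL_DEFINEx() contributes one struct syscall_metadata entry to
 * that section, so walking from start to stop visits each compiled-in
 * syscall exactly once.
 */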

static struct syscall_metadata **syscalls_metadata;

#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
{
	/*
	 * Only compare after the "sys" prefix. Archs that use
	 * syscall wrappers may have syscall symbol aliases prefixed
	 * with "SyS" instead of "sys", leading to an unwanted
	 * mismatch.
	 */
	return !strcmp(sym + 3, name + 3);
}
#endif
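/*
 * Example: on an arch with syscall wrappers, kallsyms may resolve a
 * syscall address to "SyS_read" while the metadata records "sys_read".
 * Skipping the three-byte prefix compares "_read" against "_read", so
 * the alias still matches.
 */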

static __init struct syscall_metadata *
find_syscall_meta(unsigned long syscall)
{
	struct syscall_metadata **start;
	struct syscall_metadata **stop;
	char str[KSYM_SYMBOL_LEN];

	start = __start_syscalls_metadata;
	stop = __stop_syscalls_metadata;
	kallsyms_lookup(syscall, NULL, NULL, NULL, str);

	if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
		return NULL;

	for ( ; start < stop; start++) {
		if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
			return *start;
	}
	return NULL;
}

static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
		return NULL;

	return syscalls_metadata[nr];
}

enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
		    struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, ret, syscall;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry)
		goto end;

	if (entry->enter_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		goto end;
	}

	ret = trace_seq_printf(s, "%s(", entry->name);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	for (i = 0; i < entry->nb_args; i++) {
		/* parameter types */
		if (trace_flags & TRACE_ITER_VERBOSE) {
			ret = trace_seq_printf(s, "%s ", entry->types[i]);
			if (!ret)
				return TRACE_TYPE_PARTIAL_LINE;
		}
		/* parameter values */
		ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
				       trace->args[i],
				       i == entry->nb_args - 1 ? "" : ", ");
		if (!ret)
			return TRACE_TYPE_PARTIAL_LINE;
	}

	ret = trace_seq_putc(s, ')');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

end:
	ret = trace_seq_putc(s, '\n');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}
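/*
 * A line rendered by print_syscall_enter() looks like (illustrative
 * values):
 *
 *	sys_read(fd: 3, buf: 7fffffffe000, count: 400)
 *
 * With TRACE_ITER_VERBOSE set, each value is preceded by its type,
 * e.g. "unsigned int fd: 3".
 */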

enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags,
		   struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_exit *trace;
	int syscall;
	struct syscall_metadata *entry;
	int ret;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry) {
		trace_seq_printf(s, "\n");
		return TRACE_TYPE_HANDLED;
	}

	if (entry->exit_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		return TRACE_TYPE_UNHANDLED;
	}

	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
				trace->ret);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

extern char *__bad_type_size(void);

#define SYSCALL_FIELD(type, name)					\
	sizeof(type) != sizeof(trace.name) ?				\
		__bad_type_size() :					\
		#type, #name, offsetof(typeof(trace), name),		\
		sizeof(trace.name), is_signed_type(type)
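/*
 * For instance, SYSCALL_FIELD(int, nr) expands (when the sizes agree)
 * to the argument list
 *
 *	"int", "nr", offsetof(typeof(trace), nr),
 *	sizeof(trace.nr), is_signed_type(int)
 *
 * for trace_define_field(); if sizeof(type) and sizeof(trace.name)
 * ever diverge, the call to the undefined __bad_type_size() breaks
 * the build instead.
 */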

static int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	int i;
	int pos = 0;

	/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
				entry->args[i], sizeof(unsigned long),
				i == entry->nb_args - 1 ? "" : ", ");
	}
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);
	}

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}
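/*
 * Illustrative result for a hypothetical two-argument syscall on a
 * 64-bit arch, where sizeof(unsigned long) == 8 fills the %zu width:
 *
 *	"fd: 0x%08lx, buf: 0x%08lx", ((unsigned long)(REC->fd)),
 *	((unsigned long)(REC->buf))
 */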

static int set_syscall_print_fmt(struct ftrace_event_call *call)
{
	char *print_fmt;
	int len;
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event != call) {
		call->print_fmt = "\"0x%lx\", REC->ret";
		return 0;
	}

	/* First: called with 0 length to calculate the needed length */
	len = __set_enter_print_fmt(entry, NULL, 0);

	print_fmt = kmalloc(len + 1, GFP_KERNEL);
	if (!print_fmt)
		return -ENOMEM;

	/* Second: actually write the @print_fmt */
	__set_enter_print_fmt(entry, print_fmt, len + 1);
	call->print_fmt = print_fmt;

	return 0;
}

static void free_syscall_print_fmt(struct ftrace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event == call)
		kfree(call->print_fmt);
}

static int syscall_enter_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_enter trace;
	struct syscall_metadata *meta = call->data;
	int ret;
	int i;
	int offset = offsetof(typeof(trace), args);

	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

	for (i = 0; i < meta->nb_args; i++) {
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
		offset += sizeof(unsigned long);
	}

	return ret;
}

static int syscall_exit_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_exit trace;
	int ret;

	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
				 FILTER_OTHER);

	return ret;
}

void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int size;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->enter_event->event.type, size, 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);

	if (!filter_current_check_discard(buffer, sys_data->enter_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	if (!filter_current_check_discard(buffer, sys_data->exit_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

int reg_event_syscall_enter(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_enter)
		ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
	if (!ret) {
		set_bit(num, enabled_enter_syscalls);
		sys_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void unreg_event_syscall_enter(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_enter--;
	clear_bit(num, enabled_enter_syscalls);
	if (!sys_refcount_enter)
		unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
	mutex_unlock(&syscall_trace_lock);
}

int reg_event_syscall_exit(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_exit)
		ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
	if (!ret) {
		set_bit(num, enabled_exit_syscalls);
		sys_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void unreg_event_syscall_exit(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_exit--;
	clear_bit(num, enabled_exit_syscalls);
	if (!sys_refcount_exit)
		unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
	mutex_unlock(&syscall_trace_lock);
}

int init_syscall_trace(struct ftrace_event_call *call)
{
	int id;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls) {
		pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
				((struct syscall_metadata *)call->data)->name);
		return -ENOSYS;
	}

	if (set_syscall_print_fmt(call) < 0)
		return -ENOMEM;

	id = trace_event_raw_init(call);

	if (id < 0) {
		free_syscall_print_fmt(call);
		return id;
	}

	return id;
}

unsigned long __init __weak arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}

int __init init_ftrace_syscalls(void)
{
	struct syscall_metadata *meta;
	unsigned long addr;
	int i;

	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
					NR_syscalls, GFP_KERNEL);
	if (!syscalls_metadata) {
		WARN_ON(1);
		return -ENOMEM;
	}

	for (i = 0; i < NR_syscalls; i++) {
		addr = arch_syscall_addr(i);
		meta = find_syscall_meta(addr);
		if (!meta)
			continue;

		meta->syscall_nr = i;
		syscalls_metadata[i] = meta;
	}

	return 0;
}
core_initcall(init_ftrace_syscalls);

#ifdef CONFIG_PERF_EVENTS

static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;

static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_enter *rec;
	struct hlist_head *head;
	int syscall_nr;
	int rctx;
	int size;

	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* get the size after alignment with the u32 buffer size field */
	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);
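	/*
	 * Worked example with hypothetical sizes: if the record plus
	 * args totals 44 bytes, adding the u32 header gives 48, which
	 * is already u64-aligned, so size stays 44 and header plus
	 * payload end on a u64 boundary in the perf buffer.
	 */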

	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		      "perf buffer not large enough"))
		return;

	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
				sys_data->enter_event->event.type, regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
			       (unsigned long *)&rec->args);

	head = this_cpu_ptr(sys_data->enter_event->perf_events);
	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
}

int perf_sysenter_enable(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	if (!sys_perf_refcount_enter)
		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
	if (ret) {
		pr_info("event trace: Could not activate "
				"syscall entry trace point");
	} else {
		set_bit(num, enabled_perf_enter_syscalls);
		sys_perf_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void perf_sysenter_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_perf_refcount_enter--;
	clear_bit(num, enabled_perf_enter_syscalls);
	if (!sys_perf_refcount_enter)
		unregister_trace_sys_enter(perf_syscall_enter, NULL);
	mutex_unlock(&syscall_trace_lock);
}

static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_exit *rec;
	struct hlist_head *head;
	int syscall_nr;
	int rctx;
	int size;

	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* We can probably do that at build time */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	/*
	 * Impossible today, but stay paranoid about future changes.
	 * Ideally this check would happen at build time.
	 */
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		"exit event has grown above perf buffer size"))
		return;

	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
				sys_data->exit_event->event.type, regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	rec->ret = syscall_get_return_value(current, regs);

	head = this_cpu_ptr(sys_data->exit_event->perf_events);
	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
}

int perf_sysexit_enable(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	if (!sys_perf_refcount_exit)
		ret = register_trace_sys_exit(perf_syscall_exit, NULL);
	if (ret) {
		pr_info("event trace: Could not activate "
				"syscall exit trace point");
	} else {
		set_bit(num, enabled_perf_exit_syscalls);
		sys_perf_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void perf_sysexit_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_perf_refcount_exit--;
	clear_bit(num, enabled_perf_exit_syscalls);
	if (!sys_perf_refcount_exit)
		unregister_trace_sys_exit(perf_syscall_exit, NULL);
	mutex_unlock(&syscall_trace_lock);
}

#endif /* CONFIG_PERF_EVENTS */

static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type)
{
	switch (type) {
	case TRACE_REG_REGISTER:
		return reg_event_syscall_enter(event);
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_enter(event);
		return 0;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return perf_sysenter_enable(event);
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysenter_disable(event);
		return 0;
#endif
	}
	return 0;
}

static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type)
{
	switch (type) {
	case TRACE_REG_REGISTER:
		return reg_event_syscall_exit(event);
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_exit(event);
		return 0;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return perf_sysexit_enable(event);
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysexit_disable(event);
		return 0;
#endif
	}
	return 0;
}