#include <trace/syscall.h>
#include <trace/events/syscalls.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h>	/* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
#include <linux/ftrace.h>
#include <linux/perf_event.h>
#include <asm/syscall.h>

#include "trace_output.h"
#include "trace.h"

static DEFINE_MUTEX(syscall_trace_lock);
static int sys_refcount_enter;
static int sys_refcount_exit;
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);

static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type);
static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type);

static int syscall_enter_define_fields(struct ftrace_event_call *call);
static int syscall_exit_define_fields(struct ftrace_event_call *call);

static struct list_head *
syscall_get_enter_fields(struct ftrace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	return &entry->enter_fields;
}

struct trace_event_functions enter_syscall_print_funcs = {
	.trace		= print_syscall_enter,
};

struct trace_event_functions exit_syscall_print_funcs = {
	.trace		= print_syscall_exit,
};

struct ftrace_event_class event_class_syscall_enter = {
	.system		= "syscalls",
	.reg		= syscall_enter_register,
	.define_fields	= syscall_enter_define_fields,
	.get_fields	= syscall_get_enter_fields,
	.raw_init	= init_syscall_trace,
};

struct ftrace_event_class event_class_syscall_exit = {
	.system		= "syscalls",
	.reg		= syscall_exit_register,
	.define_fields	= syscall_exit_define_fields,
	.fields		= LIST_HEAD_INIT(event_class_syscall_exit.fields),
	.raw_init	= init_syscall_trace,
};

extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];

static struct syscall_metadata **syscalls_metadata;

#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
{
	/*
	 * Only compare after the "sys" prefix. Archs that use
	 * syscall wrappers may have syscall symbol aliases prefixed
	 * with "SyS" instead of "sys", leading to an unwanted
	 * mismatch.
	 */
	return !strcmp(sym + 3, name + 3);
}
#endif

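/*
 * Resolve a syscall's entry address to its metadata record: look the
 * address up in kallsyms, then match the symbol name against each
 * compiled-in syscall_metadata entry.
 */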
static __init struct syscall_metadata *
find_syscall_meta(unsigned long syscall)
{
	struct syscall_metadata **start;
	struct syscall_metadata **stop;
	char str[KSYM_SYMBOL_LEN];

	start = __start_syscalls_metadata;
	stop = __stop_syscalls_metadata;
	kallsyms_lookup(syscall, NULL, NULL, NULL, str);

	if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
		return NULL;

	for ( ; start < stop; start++) {
		if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
			return *start;
	}
	return NULL;
}

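/* Bounds-checked lookup in the nr-to-metadata table built at boot. */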
static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
		return NULL;

	return syscalls_metadata[nr];
}

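/*
 * "trace" output handler for sys_enter events: prints
 * "name(arg: value, ...)", with argument types prepended when the
 * verbose trace flag is set.
 */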
enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
		    struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, ret, syscall;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry)
		goto end;

	if (entry->enter_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		goto end;
	}

	ret = trace_seq_printf(s, "%s(", entry->name);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	for (i = 0; i < entry->nb_args; i++) {
		/* parameter types */
		if (trace_flags & TRACE_ITER_VERBOSE) {
			ret = trace_seq_printf(s, "%s ", entry->types[i]);
			if (!ret)
				return TRACE_TYPE_PARTIAL_LINE;
		}
		/* parameter values */
		ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
				       trace->args[i],
				       i == entry->nb_args - 1 ? "" : ", ");
		if (!ret)
			return TRACE_TYPE_PARTIAL_LINE;
	}

	ret = trace_seq_putc(s, ')');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

end:
	ret = trace_seq_putc(s, '\n');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

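/* "trace" output handler for sys_exit events: prints "name -> 0xret". */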
enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags,
		   struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_exit *trace;
	int syscall;
	struct syscall_metadata *entry;
	int ret;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry) {
		trace_seq_printf(s, "\n");
		return TRACE_TYPE_HANDLED;
	}

	if (entry->exit_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		return TRACE_TYPE_UNHANDLED;
	}

	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
				trace->ret);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

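/*
 * SYSCALL_FIELD expands to the type/name/offset/size/signedness
 * arguments for trace_define_field(). The sizeof comparison resolves to
 * a call to the never-defined __bad_type_size() on a mismatch, turning
 * a wrong field type into a link-time error.
 */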
extern char *__bad_type_size(void);

#define SYSCALL_FIELD(type, name)					\
	sizeof(type) != sizeof(trace.name) ?				\
		__bad_type_size() :					\
		#type, #name, offsetof(typeof(trace), name),		\
		sizeof(trace.name), is_signed_type(type)

static int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	int i;
	int pos = 0;

	/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
				entry->args[i], sizeof(unsigned long),
				i == entry->nb_args - 1 ? "" : ", ");
	}
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);
	}

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}

static int set_syscall_print_fmt(struct ftrace_event_call *call)
{
	char *print_fmt;
	int len;
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event != call) {
		call->print_fmt = "\"0x%lx\", REC->ret";
		return 0;
	}

	/* First: called with 0 length to calculate the needed length */
	len = __set_enter_print_fmt(entry, NULL, 0);

	print_fmt = kmalloc(len + 1, GFP_KERNEL);
	if (!print_fmt)
		return -ENOMEM;

	/* Second: actually write the @print_fmt */
	__set_enter_print_fmt(entry, print_fmt, len + 1);
	call->print_fmt = print_fmt;

	return 0;
}

static void free_syscall_print_fmt(struct ftrace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event == call)
		kfree(call->print_fmt);
}

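/*
 * Describe the fields of a sys_enter event to the filter code: the
 * syscall nr plus one unsigned-long slot per argument.
 */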
static int syscall_enter_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_enter trace;
	struct syscall_metadata *meta = call->data;
	int ret;
	int i;
	int offset = offsetof(typeof(trace), args);

	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

	for (i = 0; i < meta->nb_args; i++) {
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
		offset += sizeof(unsigned long);
	}

	return ret;
}

static int syscall_exit_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_exit trace;
	int ret;

	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
				 FILTER_OTHER);

	return ret;
}

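/*
 * Tracepoint probe for sys_enter: if this syscall's enter event is
 * enabled, record the syscall nr and arguments in the ftrace ring
 * buffer.
 */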
void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int size;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->enter_event->event.type, size, 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);

	if (!filter_current_check_discard(buffer, sys_data->enter_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

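/*
 * Tracepoint probe for sys_exit: if this syscall's exit event is
 * enabled, record the syscall nr and return value.
 */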
void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	if (!filter_current_check_discard(buffer, sys_data->exit_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

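/*
 * Enable one syscall's enter event. The sys_enter tracepoint probe is
 * registered only for the first user; later users just set their bit
 * under syscall_trace_lock.
 */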
int reg_event_syscall_enter(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_enter)
		ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
	if (!ret) {
		set_bit(num, enabled_enter_syscalls);
		sys_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void unreg_event_syscall_enter(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_enter--;
	clear_bit(num, enabled_enter_syscalls);
	if (!sys_refcount_enter)
		unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
	mutex_unlock(&syscall_trace_lock);
}

int reg_event_syscall_exit(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_exit)
		ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
	if (!ret) {
		set_bit(num, enabled_exit_syscalls);
		sys_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void unreg_event_syscall_exit(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_exit--;
	clear_bit(num, enabled_exit_syscalls);
	if (!sys_refcount_exit)
		unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
	mutex_unlock(&syscall_trace_lock);
}

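/*
 * raw_init callback for both syscall event classes: checks that the
 * syscall has mapped metadata, builds its print_fmt, and registers the
 * trace event.
 */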
int init_syscall_trace(struct ftrace_event_call *call)
{
	int id;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls) {
		pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
				((struct syscall_metadata *)call->data)->name);
		return -ENOSYS;
	}

	if (set_syscall_print_fmt(call) < 0)
		return -ENOMEM;

	id = trace_event_raw_init(call);

	if (id < 0) {
		free_syscall_print_fmt(call);
		return id;
	}

	return id;
}

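/* Weak default; archs with a non-standard syscall table override this. */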
unsigned long __init __weak arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}

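/*
 * Boot-time setup: allocate the nr-to-metadata table and fill it by
 * resolving every sys_call_table entry back to its metadata record.
 */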
int __init init_ftrace_syscalls(void)
{
	struct syscall_metadata *meta;
	unsigned long addr;
	int i;

	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
					NR_syscalls, GFP_KERNEL);
	if (!syscalls_metadata) {
		WARN_ON(1);
		return -ENOMEM;
	}

	for (i = 0; i < NR_syscalls; i++) {
		addr = arch_syscall_addr(i);
		meta = find_syscall_meta(addr);
		if (!meta)
			continue;

		meta->syscall_nr = i;
		syscalls_metadata[i] = meta;
	}

	return 0;
}
core_initcall(init_ftrace_syscalls);

#ifdef CONFIG_PERF_EVENTS

static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;

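/*
 * perf probe for sys_enter: like ftrace_syscall_enter(), but the record
 * goes to the per-cpu perf buffer, padded so that the record plus
 * perf's u32 size field stays u64-aligned.
 */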
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_enter *rec;
	struct hlist_head *head;
	int syscall_nr;
	int rctx;
	int size;

	syscall_nr = syscall_get_nr(current, regs);
	/* syscall_get_nr() can return -1; guard it like the ftrace probes */
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* get the size after alignment with the u32 buffer size field */
	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		      "perf buffer not large enough"))
		return;

	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
				sys_data->enter_event->event.type, regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
			       (unsigned long *)&rec->args);

	head = this_cpu_ptr(sys_data->enter_event->perf_events);
	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
}

int perf_sysenter_enable(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	if (!sys_perf_refcount_enter)
		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
	if (ret) {
		pr_info("event trace: Could not activate "
				"syscall entry trace point");
	} else {
		set_bit(num, enabled_perf_enter_syscalls);
		sys_perf_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void perf_sysenter_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_perf_refcount_enter--;
	clear_bit(num, enabled_perf_enter_syscalls);
	if (!sys_perf_refcount_enter)
		unregister_trace_sys_enter(perf_syscall_enter, NULL);
	mutex_unlock(&syscall_trace_lock);
}

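/* perf probe for sys_exit: the return-value counterpart of perf_syscall_enter(). */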
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_exit *rec;
	struct hlist_head *head;
	int syscall_nr;
	int rctx;
	int size;

	syscall_nr = syscall_get_nr(current, regs);
	/* syscall_get_nr() can return -1; guard it like the ftrace probes */
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* We can probably do that at build time */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	/*
	 * Impossible, but be paranoid with the future
	 * How to put this check outside runtime?
	 */
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		"exit event has grown above perf buffer size"))
		return;

	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
				sys_data->exit_event->event.type, regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	rec->ret = syscall_get_return_value(current, regs);

	head = this_cpu_ptr(sys_data->exit_event->perf_events);
	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
}

int perf_sysexit_enable(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	if (!sys_perf_refcount_exit)
		ret = register_trace_sys_exit(perf_syscall_exit, NULL);
	if (ret) {
		pr_info("event trace: Could not activate "
				"syscall exit trace point");
	} else {
		set_bit(num, enabled_perf_exit_syscalls);
		sys_perf_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void perf_sysexit_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_perf_refcount_exit--;
	clear_bit(num, enabled_perf_exit_syscalls);
	if (!sys_perf_refcount_exit)
		unregister_trace_sys_exit(perf_syscall_exit, NULL);
	mutex_unlock(&syscall_trace_lock);
}

#endif /* CONFIG_PERF_EVENTS */

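/*
 * TRACE_REG_* dispatchers wired into the event classes' .reg methods:
 * route ftrace and perf (un)registration to the helpers above.
 */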
static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type)
{
	switch (type) {
	case TRACE_REG_REGISTER:
		return reg_event_syscall_enter(event);
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_enter(event);
		return 0;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return perf_sysenter_enable(event);
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysenter_disable(event);
		return 0;
#endif
	}
	return 0;
}

static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type)
{
	switch (type) {
	case TRACE_REG_REGISTER:
		return reg_event_syscall_exit(event);
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_exit(event);
		return 0;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return perf_sysexit_enable(event);
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysexit_disable(event);
		return 0;
#endif
	}
	return 0;
}