trace_syscalls.c 13.8 KB
Newer Older
1
#include <trace/syscall.h>
2
#include <trace/events/syscalls.h>
3
#include <linux/slab.h>
4
#include <linux/kernel.h>
5
#include <linux/ftrace.h>
6
#include <linux/perf_event.h>
7 8 9 10 11
#include <asm/syscall.h>

#include "trace_output.h"
#include "trace.h"

12
static DEFINE_MUTEX(syscall_trace_lock);
13 14
static int sys_refcount_enter;
static int sys_refcount_exit;
15 16
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17

18 19 20 21
/* Event class shared by every syscall trace event ("syscalls" subsystem). */
struct ftrace_event_class event_class_syscalls = {
	.system			= "syscalls"
};

22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
/* Linker-provided bounds of the syscalls metadata section. */
extern unsigned long __start_syscalls_metadata[];
extern unsigned long __stop_syscalls_metadata[];

/* Syscall-number -> metadata table, populated by init_ftrace_syscalls(). */
static struct syscall_metadata **syscalls_metadata;

/*
 * Find the metadata entry whose syscall name matches the symbol at
 * @syscall (the handler address), or NULL if there is none.
 */
static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
{
	struct syscall_metadata *meta;
	struct syscall_metadata *end;
	char str[KSYM_SYMBOL_LEN];

	meta = (struct syscall_metadata *)__start_syscalls_metadata;
	end = (struct syscall_metadata *)__stop_syscalls_metadata;
	kallsyms_lookup(syscall, NULL, NULL, NULL, str);

	while (meta < end) {
		/*
		 * Compare past the first three characters: archs using
		 * syscall wrappers may alias symbols with a "SyS" prefix
		 * instead of "sys", which would otherwise cause a
		 * spurious mismatch.
		 */
		if (meta->name && strcmp(meta->name + 3, str + 3) == 0)
			return meta;
		meta++;
	}
	return NULL;
}

/* Map a syscall number to its metadata; NULL when unknown or out of range. */
static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (nr < 0 || nr >= NR_syscalls || !syscalls_metadata)
		return NULL;

	return syscalls_metadata[nr];
}

59 60 61 62 63 64 65 66 67
enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, ret, syscall;

68
	trace = (typeof(trace))ent;
69 70
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);
71

72 73 74
	if (!entry)
		goto end;

75
	if (entry->enter_event->id != ent->type) {
76 77 78 79
		WARN_ON_ONCE(1);
		goto end;
	}

80 81 82 83 84 85
	ret = trace_seq_printf(s, "%s(", entry->name);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	for (i = 0; i < entry->nb_args; i++) {
		/* parameter types */
86
		if (trace_flags & TRACE_ITER_VERBOSE) {
87 88 89 90 91
			ret = trace_seq_printf(s, "%s ", entry->types[i]);
			if (!ret)
				return TRACE_TYPE_PARTIAL_LINE;
		}
		/* parameter values */
92
		ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
93
				       trace->args[i],
94
				       i == entry->nb_args - 1 ? "" : ", ");
95 96 97 98
		if (!ret)
			return TRACE_TYPE_PARTIAL_LINE;
	}

99 100 101 102
	ret = trace_seq_putc(s, ')');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

103
end:
104 105 106 107
	ret =  trace_seq_putc(s, '\n');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

108 109 110 111 112 113 114 115 116 117 118 119 120
	return TRACE_TYPE_HANDLED;
}

enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_exit *trace;
	int syscall;
	struct syscall_metadata *entry;
	int ret;

121
	trace = (typeof(trace))ent;
122 123
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);
124

125 126 127 128 129
	if (!entry) {
		trace_seq_printf(s, "\n");
		return TRACE_TYPE_HANDLED;
	}

130
	if (entry->exit_event->id != ent->type) {
131 132 133 134
		WARN_ON_ONCE(1);
		return TRACE_TYPE_UNHANDLED;
	}

135 136 137 138 139 140 141 142
	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
				trace->ret);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

143 144 145 146 147
/* Never defined: referencing it fails the link when a size check fails. */
extern char *__bad_type_size(void);

/*
 * Expand to the (type-string, name-string, offset, size, is_signed)
 * argument list expected by trace_define_field(), with a link-time
 * check that sizeof(type) matches the local "trace" struct's field.
 */
#define SYSCALL_FIELD(type, name)					\
	sizeof(type) != sizeof(trace.name) ?				\
		__bad_type_size() :					\
		#type, #name, offsetof(typeof(trace), name),		\
		sizeof(trace.name), is_signed_type(type)

151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211
/*
 * Build the print_fmt string for a syscall-enter event into @buf.
 * Called once with len == 0 to size the buffer, then again to fill it.
 * Returns the length of the generated format (excluding the NUL).
 */
static int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf,
				 int len)
{
	int pos = 0;
	int i;

	/* When len == 0, snprintf() gets a zero bound: only count. */
#define LEN_OR_ZERO (len ? len - pos : 0)

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++)
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
				entry->args[i], sizeof(unsigned long),
				i == entry->nb_args - 1 ? "" : ", ");
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	for (i = 0; i < entry->nb_args; i++)
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}

/*
 * Install call->print_fmt: a static string for exit events, a kmalloc'd
 * per-event format for enter events. Returns 0 or -ENOMEM.
 */
static int set_syscall_print_fmt(struct ftrace_event_call *call)
{
	struct syscall_metadata *entry = call->data;
	char *print_fmt;
	int len;

	/* Exit events all share one constant format; nothing to allocate. */
	if (entry->enter_event != call) {
		call->print_fmt = "\"0x%lx\", REC->ret";
		return 0;
	}

	/* Size the format first, then allocate and fill it. */
	len = __set_enter_print_fmt(entry, NULL, 0);
	print_fmt = kmalloc(len + 1, GFP_KERNEL);
	if (!print_fmt)
		return -ENOMEM;

	__set_enter_print_fmt(entry, print_fmt, len + 1);
	call->print_fmt = print_fmt;

	return 0;
}

/* Release print_fmt; only enter events own a kmalloc'd format string. */
static void free_syscall_print_fmt(struct ftrace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event == call)
		kfree(call->print_fmt);
}

212 213 214
int syscall_enter_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_enter trace;
215
	struct syscall_metadata *meta = call->data;
216 217 218 219
	int ret;
	int i;
	int offset = offsetof(typeof(trace), args);

220 221 222 223
	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

224
	for (i = 0; i < meta->nb_args; i++) {
225 226
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
227 228
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
229 230 231 232 233 234 235 236 237 238 239
		offset += sizeof(unsigned long);
	}

	return ret;
}

int syscall_exit_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_exit trace;
	int ret;

240 241 242 243
	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

244
	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
245
				 FILTER_OTHER);
246 247 248 249

	return ret;
}

250
void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
251
{
252 253 254
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
255
	struct ring_buffer *buffer;
256
	int size;
257 258 259
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
260 261
	if (syscall_nr < 0)
		return;
262 263
	if (!test_bit(syscall_nr, enabled_enter_syscalls))
		return;
264

265 266 267 268 269 270
	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

271 272
	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->enter_event->id, size, 0, 0);
273 274 275 276 277 278 279
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);

280 281 282
	if (!filter_current_check_discard(buffer, sys_data->enter_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
283 284
}

285
void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
286
{
287 288 289
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
290
	struct ring_buffer *buffer;
291 292 293
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
294 295
	if (syscall_nr < 0)
		return;
296 297
	if (!test_bit(syscall_nr, enabled_exit_syscalls))
		return;
298

299 300 301 302
	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

303 304
	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->exit_event->id, sizeof(*entry), 0, 0);
305 306 307 308 309 310 311
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

312 313 314
	if (!filter_current_check_discard(buffer, sys_data->exit_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
315 316
}

317
int reg_event_syscall_enter(struct ftrace_event_call *call)
318
{
319 320 321
	int ret = 0;
	int num;

322
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
323
	if (num < 0 || num >= NR_syscalls)
324 325 326
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_enter)
327
		ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
328
	if (!ret) {
329 330 331 332 333
		set_bit(num, enabled_enter_syscalls);
		sys_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
334 335
}

336
void unreg_event_syscall_enter(struct ftrace_event_call *call)
337
{
338
	int num;
339

340
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
341
	if (num < 0 || num >= NR_syscalls)
342 343 344 345 346
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_enter--;
	clear_bit(num, enabled_enter_syscalls);
	if (!sys_refcount_enter)
347
		unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
348 349
	mutex_unlock(&syscall_trace_lock);
}
350

351
int reg_event_syscall_exit(struct ftrace_event_call *call)
352
{
353 354 355
	int ret = 0;
	int num;

356
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
357
	if (num < 0 || num >= NR_syscalls)
358 359 360
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_exit)
361
		ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
362
	if (!ret) {
363 364
		set_bit(num, enabled_exit_syscalls);
		sys_refcount_exit++;
365
	}
366 367 368
	mutex_unlock(&syscall_trace_lock);
	return ret;
}
369

370
void unreg_event_syscall_exit(struct ftrace_event_call *call)
371 372
{
	int num;
373

374
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
375
	if (num < 0 || num >= NR_syscalls)
376 377 378 379 380
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_exit--;
	clear_bit(num, enabled_exit_syscalls);
	if (!sys_refcount_exit)
381
		unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
382
	mutex_unlock(&syscall_trace_lock);
383
}
384

385 386 387 388
int init_syscall_trace(struct ftrace_event_call *call)
{
	int id;

389 390 391
	if (set_syscall_print_fmt(call) < 0)
		return -ENOMEM;

392 393 394
	id = trace_event_raw_init(call);

	if (id < 0) {
395
		free_syscall_print_fmt(call);
396
		return id;
397
	}
398 399

	return id;
400 401
}

402 403 404 405 406
/* Resolve syscall number @nr to the address of its handler. */
unsigned long __init arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}

407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422
/*
 * Boot-time setup: allocate the syscall_nr -> metadata table and fill it
 * by matching each sys_call_table entry against the metadata section.
 * Entries with no metadata (unimplemented syscalls) stay NULL.
 */
int __init init_ftrace_syscalls(void)
{
	struct syscall_metadata *meta;
	int i;

	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * NR_syscalls,
				    GFP_KERNEL);
	if (!syscalls_metadata) {
		WARN_ON(1);
		return -ENOMEM;
	}

	for (i = 0; i < NR_syscalls; i++) {
		meta = find_syscall_meta(arch_syscall_addr(i));
		if (!meta)
			continue;

		meta->syscall_nr = i;
		syscalls_metadata[i] = meta;
	}

	return 0;
}
core_initcall(init_ftrace_syscalls);

434
#ifdef CONFIG_PERF_EVENTS
435

436 437 438 439
/* Per-syscall enable bitmaps and probe refcounts for the perf paths. */
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
440

441
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
442 443
{
	struct syscall_metadata *sys_data;
444 445
	struct syscall_trace_enter *rec;
	unsigned long flags;
446
	int syscall_nr;
447
	int rctx;
448
	int size;
449 450

	syscall_nr = syscall_get_nr(current, regs);
451
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
452 453 454 455 456 457
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

458 459 460 461 462
	/* get the size after alignment with the u32 buffer size field */
	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

463 464
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		      "perf buffer not large enough"))
465 466
		return;

467
	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
468 469 470
				sys_data->enter_event->id, &rctx, &flags);
	if (!rec)
		return;
471 472 473 474

	rec->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
			       (unsigned long *)&rec->args);
475
	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
476 477
}

478
int perf_sysenter_enable(struct ftrace_event_call *call)
479 480 481 482
{
	int ret = 0;
	int num;

483
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
484 485

	mutex_lock(&syscall_trace_lock);
486
	if (!sys_perf_refcount_enter)
487
		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
488 489 490 491
	if (ret) {
		pr_info("event trace: Could not activate"
				"syscall entry trace point");
	} else {
492 493
		set_bit(num, enabled_perf_enter_syscalls);
		sys_perf_refcount_enter++;
494 495 496 497 498
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

499
void perf_sysenter_disable(struct ftrace_event_call *call)
500 501 502
{
	int num;

503
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
504 505

	mutex_lock(&syscall_trace_lock);
506 507 508
	sys_perf_refcount_enter--;
	clear_bit(num, enabled_perf_enter_syscalls);
	if (!sys_perf_refcount_enter)
509
		unregister_trace_sys_enter(perf_syscall_enter, NULL);
510 511 512
	mutex_unlock(&syscall_trace_lock);
}

513
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
514 515
{
	struct syscall_metadata *sys_data;
516 517
	struct syscall_trace_exit *rec;
	unsigned long flags;
518
	int syscall_nr;
519
	int rctx;
520
	int size;
521 522

	syscall_nr = syscall_get_nr(current, regs);
523
	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
524 525 526 527 528 529
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

530 531 532
	/* We can probably do that at build time */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);
533

534 535 536 537
	/*
	 * Impossible, but be paranoid with the future
	 * How to put this check outside runtime?
	 */
538 539
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		"exit event has grown above perf buffer size"))
540 541
		return;

542
	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
543 544 545
				sys_data->exit_event->id, &rctx, &flags);
	if (!rec)
		return;
546 547 548 549

	rec->nr = syscall_nr;
	rec->ret = syscall_get_return_value(current, regs);

550
	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
551 552
}

553
int perf_sysexit_enable(struct ftrace_event_call *call)
554 555 556 557
{
	int ret = 0;
	int num;

558
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
559 560

	mutex_lock(&syscall_trace_lock);
561
	if (!sys_perf_refcount_exit)
562
		ret = register_trace_sys_exit(perf_syscall_exit, NULL);
563 564
	if (ret) {
		pr_info("event trace: Could not activate"
565
				"syscall exit trace point");
566
	} else {
567 568
		set_bit(num, enabled_perf_exit_syscalls);
		sys_perf_refcount_exit++;
569 570 571 572 573
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

574
void perf_sysexit_disable(struct ftrace_event_call *call)
575 576 577
{
	int num;

578
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
579 580

	mutex_lock(&syscall_trace_lock);
581 582 583
	sys_perf_refcount_exit--;
	clear_bit(num, enabled_perf_exit_syscalls);
	if (!sys_perf_refcount_exit)
584
		unregister_trace_sys_exit(perf_syscall_exit, NULL);
585 586 587
	mutex_unlock(&syscall_trace_lock);
}

588
#endif /* CONFIG_PERF_EVENTS */
589