/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
#include <linux/filter.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include "trace.h"

/**
 * trace_call_bpf - invoke BPF program
 * @prog: BPF program
 * @ctx: opaque context pointer
 *
 * kprobe handlers execute BPF programs via this helper.
 * Can be used from static tracepoints in the future.
 *
 * Return: BPF programs always return an integer which is interpreted by
 * kprobe handler as:
 * 0 - return from kprobe (event is filtered out)
 * 1 - store kprobe event into ring buffer
 * Other values are reserved and currently alias to 1
 */
unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
{
	unsigned int ret;

	if (in_nmi()) /* not supported yet */
		return 1;

	/*
	 * Disable preemption so that bpf_prog_active acts as a reliable
	 * per-cpu recursion counter for the duration of the program run.
	 */
	preempt_disable();

	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
		/*
		 * since some bpf program is already running on this cpu,
		 * don't call into another bpf program (same or different)
		 * and don't send kprobe event into ring-buffer,
		 * so return zero here
		 */
		ret = 0;
		goto out;
	}

	/* run the program under the RCU read-side lock */
	rcu_read_lock();
	ret = BPF_PROG_RUN(prog, ctx);
	rcu_read_unlock();

 out:
	/* pairs with __this_cpu_inc_return() above, on both paths */
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();

	return ret;
}
EXPORT_SYMBOL_GPL(trace_call_bpf);

64
BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
65
{
66
	int ret;
67

68 69 70 71 72
	ret = probe_kernel_read(dst, unsafe_ptr, size);
	if (unlikely(ret < 0))
		memset(dst, 0, size);

	return ret;
73 74 75 76 77 78
}

static const struct bpf_func_proto bpf_probe_read_proto = {
	.func		= bpf_probe_read,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
79 80
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE,
81 82 83
	.arg3_type	= ARG_ANYTHING,
};

84 85
BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
	   u32, size)
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
{
	/*
	 * Ensure we're in user context which is safe for the helper to
	 * run. This helper has no business in a kthread.
	 *
	 * access_ok() should prevent writing to non-user memory, but in
	 * some situations (nommu, temporary switch, etc) access_ok() does
	 * not provide enough validation, hence the check on KERNEL_DS.
	 */

	if (unlikely(in_interrupt() ||
		     current->flags & (PF_KTHREAD | PF_EXITING)))
		return -EPERM;
	if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
		return -EPERM;
	if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
		return -EPERM;

	return probe_kernel_write(unsafe_ptr, src, size);
}

static const struct bpf_func_proto bpf_probe_write_user_proto = {
	.func		= bpf_probe_write_user,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_ANYTHING,
112 113
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE,
114 115 116 117 118 119 120 121 122 123
};

/*
 * Hand out the bpf_probe_write_user proto while loudly warning in the kernel
 * log: a program writing to user memory can corrupt the traced process.
 * Rate-limited so a program loader cannot flood the log.
 */
static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
{
	pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
			    current->comm, task_pid_nr(current));

	return &bpf_probe_write_user_proto;
}

/*
 * limited trace_printk()
 * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
 */
BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
	   u64, arg2, u64, arg3)
{
	bool got_str = false;	/* at most one '%s' is permitted */
	int arg_mod[3] = {};	/* per-arg length modifier: 0=int, 1=long, 2=long long/ptr */
	int arg_cnt = 0;	/* number of conversion specifiers seen so far */
	u64 str_addr;
	char strbuf[64];	/* bounce buffer for the one '%s' argument */
	int pos;

	/*
	 * bpf_check()->check_func_arg()->check_stack_boundary()
	 * guarantees that fmt points to bpf program stack,
	 * fmt_size bytes of it were initialized and fmt_size > 0
	 */
	if (fmt[--fmt_size] != 0)
		return -EINVAL;

	/* walk the format string and validate every conversion specifier */
	for (pos = 0; pos < fmt_size; pos++) {
		if ((!isprint(fmt[pos]) && !isspace(fmt[pos])) ||
		    !isascii(fmt[pos]))
			return -EINVAL;

		if (fmt[pos] != '%')
			continue;

		if (arg_cnt >= 3)
			return -EINVAL;

		/* fmt[pos] != 0 && fmt[last] == 0, so fmt[pos + 1] is valid */
		pos++;
		if (fmt[pos] == 'l') {
			arg_mod[arg_cnt]++;
			pos++;
		} else if (fmt[pos] == 'p' || fmt[pos] == 's') {
			arg_mod[arg_cnt]++;
			pos++;
			if (!isspace(fmt[pos]) && !ispunct(fmt[pos]) &&
			    fmt[pos] != 0)
				return -EINVAL;
			arg_cnt++;
			if (fmt[pos - 1] == 's') {
				if (got_str)
					/* allow only one '%s' per fmt string */
					return -EINVAL;
				got_str = true;

				/*
				 * Redirect the matching argument at the bounce
				 * buffer and fetch the string from the unsafe
				 * address into it.
				 */
				switch (arg_cnt) {
				case 1:
					str_addr = arg1;
					arg1 = (long) strbuf;
					break;
				case 2:
					str_addr = arg2;
					arg2 = (long) strbuf;
					break;
				case 3:
					str_addr = arg3;
					arg3 = (long) strbuf;
					break;
				}
				strbuf[0] = 0;
				strncpy_from_unsafe(strbuf,
						    (void *) (long) str_addr,
						    sizeof(strbuf));
			}
			continue;
		}

		if (fmt[pos] == 'l') {
			arg_mod[arg_cnt]++;
			pos++;
		}

		if (fmt[pos] != 'd' && fmt[pos] != 'u' && fmt[pos] != 'x')
			return -EINVAL;
		arg_cnt++;
	}

	/* cast each argument to the width its recorded modifier demands */
	return __trace_printk(1/* fake ip will not be printed */, fmt,
			      arg_mod[0] == 2 ? arg1 : arg_mod[0] == 1 ? (long) arg1 : (u32) arg1,
			      arg_mod[1] == 2 ? arg2 : arg_mod[1] == 1 ? (long) arg2 : (u32) arg2,
			      arg_mod[2] == 2 ? arg3 : arg_mod[2] == 1 ? (long) arg3 : (u32) arg3);
}

static const struct bpf_func_proto bpf_trace_printk_proto = {
	.func		= bpf_trace_printk,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
216 217
	.arg1_type	= ARG_PTR_TO_MEM,
	.arg2_type	= ARG_CONST_SIZE,
218 219
};

220 221 222 223 224 225 226 227 228 229 230
const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
{
	/*
	 * this program might be calling bpf_trace_printk,
	 * so allocate per-cpu printk buffers
	 */
	trace_printk_init_buffers();

	return &bpf_trace_printk_proto;
}

231
BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
232 233
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
234 235
	unsigned int cpu = smp_processor_id();
	u64 index = flags & BPF_F_INDEX_MASK;
236
	struct bpf_event_entry *ee;
237 238
	struct perf_event *event;

239 240 241 242
	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
		return -EINVAL;
	if (index == BPF_F_CURRENT_CPU)
		index = cpu;
243 244 245
	if (unlikely(index >= array->map.max_entries))
		return -E2BIG;

246
	ee = READ_ONCE(array->ptrs[index]);
247
	if (!ee)
248 249
		return -ENOENT;

250
	event = ee->event;
251 252 253 254
	if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
		     event->attr.type != PERF_TYPE_RAW))
		return -EINVAL;

255
	/* make sure event is local and doesn't have pmu::count */
256
	if (unlikely(event->oncpu != cpu || event->pmu->count))
257 258
		return -EINVAL;

259 260 261 262 263 264 265 266
	/*
	 * we don't know if the function is run successfully by the
	 * return value. It can be judged in other places, such as
	 * eBPF programs.
	 */
	return perf_event_read_local(event);
}

267
static const struct bpf_func_proto bpf_perf_event_read_proto = {
268
	.func		= bpf_perf_event_read,
269
	.gpl_only	= true,
270 271 272 273 274
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_ANYTHING,
};

275 276 277
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
			u64 flags, struct perf_raw_record *raw)
278 279
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
280
	unsigned int cpu = smp_processor_id();
281
	u64 index = flags & BPF_F_INDEX_MASK;
282
	struct perf_sample_data sample_data;
283
	struct bpf_event_entry *ee;
284 285
	struct perf_event *event;

286
	if (index == BPF_F_CURRENT_CPU)
287
		index = cpu;
288 289 290
	if (unlikely(index >= array->map.max_entries))
		return -E2BIG;

291
	ee = READ_ONCE(array->ptrs[index]);
292
	if (!ee)
293 294
		return -ENOENT;

295
	event = ee->event;
296 297 298 299
	if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
		     event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
		return -EINVAL;

300
	if (unlikely(event->oncpu != cpu))
301 302 303
		return -EOPNOTSUPP;

	perf_sample_data_init(&sample_data, 0, 0);
304
	sample_data.raw = raw;
305 306 307 308
	perf_event_output(event, &sample_data, regs);
	return 0;
}

309 310
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags, void *, data, u64, size)
311 312 313 314 315 316 317 318 319 320 321 322 323 324
{
	struct perf_raw_record raw = {
		.frag = {
			.size = size,
			.data = data,
		},
	};

	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
		return -EINVAL;

	return __bpf_perf_event_output(regs, map, flags, &raw);
}

325 326
static const struct bpf_func_proto bpf_perf_event_output_proto = {
	.func		= bpf_perf_event_output,
327
	.gpl_only	= true,
328 329 330 331
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
332 333
	.arg4_type	= ARG_PTR_TO_MEM,
	.arg5_type	= ARG_CONST_SIZE,
334 335
};

336 337
static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);

338 339
u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
340 341
{
	struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
342 343 344 345 346 347 348
	struct perf_raw_frag frag = {
		.copy		= ctx_copy,
		.size		= ctx_size,
		.data		= ctx,
	};
	struct perf_raw_record raw = {
		.frag = {
349 350 351
			{
				.next	= ctx_size ? &frag : NULL,
			},
352 353 354 355
			.size	= meta_size,
			.data	= meta,
		},
	};
356 357 358

	perf_fetch_caller_regs(regs);

359
	return __bpf_perf_event_output(regs, map, flags, &raw);
360 361
}

362
BPF_CALL_0(bpf_get_current_task)
363 364 365 366 367 368 369 370 371 372
{
	return (long) current;
}

static const struct bpf_func_proto bpf_get_current_task_proto = {
	.func		= bpf_get_current_task,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
};

373
BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct cgroup *cgrp;

	if (unlikely(in_interrupt()))
		return -EINVAL;
	if (unlikely(idx >= array->map.max_entries))
		return -E2BIG;

	cgrp = READ_ONCE(array->ptrs[idx]);
	if (unlikely(!cgrp))
		return -EAGAIN;

	return task_under_cgroup_hierarchy(current, cgrp);
}

static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
	.func           = bpf_current_task_under_cgroup,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_CONST_MAP_PTR,
	.arg2_type      = ARG_ANYTHING,
};

398
static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
399 400 401 402 403 404 405 406 407 408
{
	switch (func_id) {
	case BPF_FUNC_map_lookup_elem:
		return &bpf_map_lookup_elem_proto;
	case BPF_FUNC_map_update_elem:
		return &bpf_map_update_elem_proto;
	case BPF_FUNC_map_delete_elem:
		return &bpf_map_delete_elem_proto;
	case BPF_FUNC_probe_read:
		return &bpf_probe_read_proto;
409 410
	case BPF_FUNC_ktime_get_ns:
		return &bpf_ktime_get_ns_proto;
411 412
	case BPF_FUNC_tail_call:
		return &bpf_tail_call_proto;
413 414
	case BPF_FUNC_get_current_pid_tgid:
		return &bpf_get_current_pid_tgid_proto;
415 416
	case BPF_FUNC_get_current_task:
		return &bpf_get_current_task_proto;
417 418 419 420
	case BPF_FUNC_get_current_uid_gid:
		return &bpf_get_current_uid_gid_proto;
	case BPF_FUNC_get_current_comm:
		return &bpf_get_current_comm_proto;
421
	case BPF_FUNC_trace_printk:
422
		return bpf_get_trace_printk_proto();
423 424
	case BPF_FUNC_get_smp_processor_id:
		return &bpf_get_smp_processor_id_proto;
425 426
	case BPF_FUNC_get_numa_node_id:
		return &bpf_get_numa_node_id_proto;
427 428
	case BPF_FUNC_perf_event_read:
		return &bpf_perf_event_read_proto;
429 430
	case BPF_FUNC_probe_write_user:
		return bpf_get_probe_write_proto();
431 432
	case BPF_FUNC_current_task_under_cgroup:
		return &bpf_current_task_under_cgroup_proto;
433 434
	case BPF_FUNC_get_prandom_u32:
		return &bpf_get_prandom_u32_proto;
435 436 437 438 439 440 441 442
	default:
		return NULL;
	}
}

static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
443 444
	case BPF_FUNC_perf_event_output:
		return &bpf_perf_event_output_proto;
445 446
	case BPF_FUNC_get_stackid:
		return &bpf_get_stackid_proto;
447
	default:
448
		return tracing_func_proto(func_id);
449 450 451 452
	}
}

/* bpf+kprobe programs can access fields of 'struct pt_regs' */
453 454
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
					enum bpf_reg_type *reg_type)
455 456 457 458 459 460 461 462 463 464
{
	if (off < 0 || off >= sizeof(struct pt_regs))
		return false;
	if (type != BPF_READ)
		return false;
	if (off % size != 0)
		return false;
	return true;
}

465
static const struct bpf_verifier_ops kprobe_prog_ops = {
466 467 468 469 470 471 472 473 474
	.get_func_proto  = kprobe_prog_func_proto,
	.is_valid_access = kprobe_prog_is_valid_access,
};

static struct bpf_prog_type_list kprobe_tl = {
	.ops	= &kprobe_prog_ops,
	.type	= BPF_PROG_TYPE_KPROBE,
};

475 476
BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
	   u64, flags, void *, data, u64, size)
477
{
478 479
	struct pt_regs *regs = *(struct pt_regs **)tp_buff;

480 481 482
	/*
	 * r1 points to perf tracepoint buffer where first 8 bytes are hidden
	 * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
483
	 * from there and call the same bpf_perf_event_output() helper inline.
484
	 */
485
	return ____bpf_perf_event_output(regs, map, flags, data, size);
486 487 488 489 490 491 492 493 494
}

static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
	.func		= bpf_perf_event_output_tp,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
495 496
	.arg4_type	= ARG_PTR_TO_MEM,
	.arg5_type	= ARG_CONST_SIZE,
497 498
};

499 500
BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
	   u64, flags)
501
{
502
	struct pt_regs *regs = *(struct pt_regs **)tp_buff;
503

504 505 506 507 508 509 510
	/*
	 * Same comment as in bpf_perf_event_output_tp(), only that this time
	 * the other helper's function body cannot be inlined due to being
	 * external, thus we need to call raw helper function.
	 */
	return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
			       flags, 0, 0);
511 512 513 514 515 516 517 518 519 520 521
}

/* Tracepoint flavor of the get_stackid proto. */
static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
	.func		= bpf_get_stackid_tp,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};

522 523 524 525
static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_perf_event_output:
526
		return &bpf_perf_event_output_proto_tp;
527
	case BPF_FUNC_get_stackid:
528
		return &bpf_get_stackid_proto_tp;
529 530 531 532 533
	default:
		return tracing_func_proto(func_id);
	}
}

534 535
static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
				    enum bpf_reg_type *reg_type)
536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555
{
	if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
		return false;
	if (type != BPF_READ)
		return false;
	if (off % size != 0)
		return false;
	return true;
}

/* Verifier callbacks for BPF_PROG_TYPE_TRACEPOINT programs. */
static const struct bpf_verifier_ops tracepoint_prog_ops = {
	.get_func_proto  = tp_prog_func_proto,
	.is_valid_access = tp_prog_is_valid_access,
};

/* Registration record tying the tracepoint prog type to its ops. */
static struct bpf_prog_type_list tracepoint_tl = {
	.ops	= &tracepoint_prog_ops,
	.type	= BPF_PROG_TYPE_TRACEPOINT,
};

556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
				    enum bpf_reg_type *reg_type)
{
	if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
		return false;
	if (type != BPF_READ)
		return false;
	if (off % size != 0)
		return false;
	if (off == offsetof(struct bpf_perf_event_data, sample_period)) {
		if (size != sizeof(u64))
			return false;
	} else {
		if (size != sizeof(long))
			return false;
	}
	return true;
}

/*
 * Rewrite loads from the user-visible 'struct bpf_perf_event_data' into
 * loads through the kernel-side 'struct bpf_perf_event_data_kern': one
 * load to fetch the inner pointer (data or regs), then one load at the
 * final offset. Returns the number of instructions emitted.
 */
static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg,
				      int src_reg, int ctx_off,
				      struct bpf_insn *insn_buf,
				      struct bpf_prog *prog)
{
	struct bpf_insn *insn = insn_buf;

	switch (ctx_off) {
	case offsetof(struct bpf_perf_event_data, sample_period):
		BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));

		/* dst = ((bpf_perf_event_data_kern *)src)->data */
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
						       data), dst_reg, src_reg,
				      offsetof(struct bpf_perf_event_data_kern, data));
		/* dst = ((perf_sample_data *)dst)->period */
		*insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg,
				      offsetof(struct perf_sample_data, period));
		break;
	default:
		/* dst = ((bpf_perf_event_data_kern *)src)->regs */
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
						       regs), dst_reg, src_reg,
				      offsetof(struct bpf_perf_event_data_kern, regs));
		/* dst = *(long *)(dst + ctx_off) — a pt_regs field */
		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off);
		break;
	}

	return insn - insn_buf;
}

/* Verifier callbacks for BPF_PROG_TYPE_PERF_EVENT programs. */
static const struct bpf_verifier_ops perf_event_prog_ops = {
	.get_func_proto		= tp_prog_func_proto,
	.is_valid_access	= pe_prog_is_valid_access,
	.convert_ctx_access	= pe_prog_convert_ctx_access,
};

/* Registration record tying the perf_event prog type to its ops. */
static struct bpf_prog_type_list perf_event_tl = {
	.ops	= &perf_event_prog_ops,
	.type	= BPF_PROG_TYPE_PERF_EVENT,
};

614 615 616
static int __init register_kprobe_prog_ops(void)
{
	bpf_register_prog_type(&kprobe_tl);
617
	bpf_register_prog_type(&tracepoint_tl);
618
	bpf_register_prog_type(&perf_event_tl);
619 620 621
	return 0;
}
late_initcall(register_kprobe_prog_ops);