/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
#include <linux/filter.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include "trace.h"

/**
 * trace_call_bpf - invoke BPF program
 * @prog: BPF program
 * @ctx: opaque context pointer
 *
 * kprobe handlers execute BPF programs via this helper.
 * Can be used from static tracepoints in the future.
 *
 * Return: BPF programs always return an integer which is interpreted by the
 * kprobe handler as:
 * 0 - return from kprobe (event is filtered out)
 * 1 - store kprobe event into ring buffer
 * Other values are reserved and currently alias to 1
 */
unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
{
	unsigned int ret;

	if (in_nmi()) /* not supported yet */
		return 1;

	preempt_disable();

	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
		/*
		 * since some bpf program is already running on this cpu,
		 * don't call into another bpf program (same or different)
		 * and don't send kprobe event into ring-buffer,
		 * so return zero here
		 */
		ret = 0;
		goto out;
	}

	rcu_read_lock();
	ret = BPF_PROG_RUN(prog, ctx);
	rcu_read_unlock();

 out:
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();

	return ret;
}
EXPORT_SYMBOL_GPL(trace_call_bpf);
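/*
 * Illustrative sketch (not part of this file): roughly how a kprobe handler,
 * e.g. kprobe_perf_func() in kernel/trace/trace_kprobe.c, consumes the return
 * value documented above. A zero return filters the event out before it is
 * written to the perf ring buffer; surrounding handler details are elided.
 *
 *	if (prog && !trace_call_bpf(prog, regs))
 *		return;		// program returned 0: drop this kprobe event
 *	// otherwise fall through and submit the event as usual
 */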

static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	void *dst = (void *) (long) r1;
	int ret, size = (int) r2;
	void *unsafe_ptr = (void *) (long) r3;

	ret = probe_kernel_read(dst, unsafe_ptr, size);
	if (unlikely(ret < 0))
		memset(dst, 0, size);

	return ret;
}

static const struct bpf_func_proto bpf_probe_read_proto = {
	.func		= bpf_probe_read,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_RAW_STACK,
	.arg2_type	= ARG_CONST_STACK_SIZE,
	.arg3_type	= ARG_ANYTHING,
};
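/*
 * Illustrative sketch (not part of this file): a minimal kprobe program using
 * the bpf_probe_read() helper defined above to copy kernel memory onto its own
 * stack. Assumes the SEC() macro, helper declarations and PT_REGS_PARM1()
 * accessor from samples/bpf/bpf_helpers.h; attach point and names are
 * hypothetical.
 *
 *	SEC("kprobe/vfs_read")
 *	int bpf_prog_probe_read(struct pt_regs *ctx)
 *	{
 *		void *file = (void *)PT_REGS_PARM1(ctx);
 *		char buf[16];
 *
 *		// dst may be uninitialized stack (ARG_PTR_TO_RAW_STACK); on
 *		// failure the helper zeroes the buffer and returns < 0
 *		bpf_probe_read(buf, sizeof(buf), file);
 *		return 0;
 *	}
 */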

static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	void *unsafe_ptr = (void *) (long) r1;
	void *src = (void *) (long) r2;
	int size = (int) r3;

	/*
	 * Ensure we're in user context which is safe for the helper to
	 * run. This helper has no business in a kthread.
	 *
	 * access_ok() should prevent writing to non-user memory, but in
	 * some situations (nommu, temporary switch, etc) access_ok() does
	 * not provide enough validation, hence the check on KERNEL_DS.
	 */

	if (unlikely(in_interrupt() ||
		     current->flags & (PF_KTHREAD | PF_EXITING)))
		return -EPERM;
	if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
		return -EPERM;
	if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
		return -EPERM;

	return probe_kernel_write(unsafe_ptr, src, size);
}

static const struct bpf_func_proto bpf_probe_write_user_proto = {
	.func		= bpf_probe_write_user,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_ANYTHING,
	.arg2_type	= ARG_PTR_TO_STACK,
	.arg3_type	= ARG_CONST_STACK_SIZE,
};

static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
{
	pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
			    current->comm, task_pid_nr(current));

	return &bpf_probe_write_user_proto;
}
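/*
 * Illustrative sketch (not part of this file): bpf_probe_write_user() may only
 * be used from process context on user addresses, and merely loading a program
 * that requests it triggers the ratelimited warning above. Assumes
 * samples/bpf-style helper declarations; the attach point and names are
 * hypothetical. The sketch overwrites the first bytes of the user buffer that
 * the probed syscall was handed.
 *
 *	SEC("kprobe/sys_nanosleep")
 *	int bpf_prog_patch_user(struct pt_regs *ctx)
 *	{
 *		void *user_buf = (void *)PT_REGS_PARM1(ctx);
 *		u32 val = 0;
 *
 *		// fails with -EPERM from irq context, kthreads or KERNEL_DS
 *		bpf_probe_write_user(user_buf, &val, sizeof(val));
 *		return 0;
 *	}
 */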

/*
 * limited trace_printk()
 * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
 */
static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
{
	char *fmt = (char *) (long) r1;
	bool str_seen = false;
	int mod[3] = {};
	int fmt_cnt = 0;
	u64 unsafe_addr;
	char buf[64];
	int i;

	/*
	 * bpf_check()->check_func_arg()->check_stack_boundary()
	 * guarantees that fmt points to bpf program stack,
	 * fmt_size bytes of it were initialized and fmt_size > 0
	 */
	if (fmt[--fmt_size] != 0)
		return -EINVAL;

	/* check format string for allowed specifiers */
	for (i = 0; i < fmt_size; i++) {
		if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
			return -EINVAL;

		if (fmt[i] != '%')
			continue;

		if (fmt_cnt >= 3)
			return -EINVAL;

		/* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
		i++;
		if (fmt[i] == 'l') {
			mod[fmt_cnt]++;
			i++;
		} else if (fmt[i] == 'p' || fmt[i] == 's') {
			mod[fmt_cnt]++;
			i++;
			if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
				return -EINVAL;
			fmt_cnt++;
			if (fmt[i - 1] == 's') {
				if (str_seen)
					/* allow only one '%s' per fmt string */
					return -EINVAL;
				str_seen = true;

				switch (fmt_cnt) {
				case 1:
					unsafe_addr = r3;
					r3 = (long) buf;
					break;
				case 2:
					unsafe_addr = r4;
					r4 = (long) buf;
					break;
				case 3:
					unsafe_addr = r5;
					r5 = (long) buf;
					break;
				}
				buf[0] = 0;
				strncpy_from_unsafe(buf,
						    (void *) (long) unsafe_addr,
						    sizeof(buf));
			}
			continue;
		}

		if (fmt[i] == 'l') {
			mod[fmt_cnt]++;
			i++;
		}

		if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
			return -EINVAL;
		fmt_cnt++;
	}

	return __trace_printk(1/* fake ip will not be printed */, fmt,
			      mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3,
			      mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4,
			      mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5);
}

static const struct bpf_func_proto bpf_trace_printk_proto = {
	.func		= bpf_trace_printk,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_STACK,
	.arg2_type	= ARG_CONST_STACK_SIZE,
};
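/*
 * Illustrative sketch (not part of this file): calling the limited
 * bpf_trace_printk() from a program. The format string must live on the BPF
 * stack (ARG_PTR_TO_STACK), use only the specifiers listed above, take at most
 * three arguments and at most one %s. Helper declaration as in
 * samples/bpf/bpf_helpers.h; fd and count stand in for locals of the
 * surrounding program.
 *
 *	char fmt[] = "sys_write fd=%d count=%lu\n";
 *
 *	bpf_trace_printk(fmt, sizeof(fmt), fd, count);
 *
 * The output appears in /sys/kernel/debug/tracing/trace_pipe.
 */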

const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
{
	/*
	 * this program might be calling bpf_trace_printk,
	 * so allocate per-cpu printk buffers
	 */
	trace_printk_init_buffers();

	return &bpf_trace_printk_proto;
}

static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5)
{
	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	unsigned int cpu = smp_processor_id();
	u64 index = flags & BPF_F_INDEX_MASK;
	struct bpf_event_entry *ee;
	struct perf_event *event;

	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
		return -EINVAL;
	if (index == BPF_F_CURRENT_CPU)
		index = cpu;
	if (unlikely(index >= array->map.max_entries))
		return -E2BIG;

	ee = READ_ONCE(array->ptrs[index]);
	if (!ee)
		return -ENOENT;

	event = ee->event;
	if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
		     event->attr.type != PERF_TYPE_RAW))
		return -EINVAL;

	/* make sure event is local and doesn't have pmu::count */
	if (unlikely(event->oncpu != cpu || event->pmu->count))
		return -EINVAL;

	/*
	 * The value returned here does not tell us whether the read
	 * succeeded; that has to be judged elsewhere, for example by the
	 * eBPF program consuming it.
	 */
	return perf_event_read_local(event);
}

static const struct bpf_func_proto bpf_perf_event_read_proto = {
	.func		= bpf_perf_event_read,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_ANYTHING,
};
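/*
 * Illustrative sketch (not part of this file): reading a counter from a
 * BPF_MAP_TYPE_PERF_EVENT_ARRAY that user space has populated with per-CPU
 * hardware events (e.g. PERF_COUNT_HW_CPU_CYCLES). Assumes the struct
 * bpf_map_def and helper declarations from samples/bpf/bpf_helpers.h; the map
 * name is hypothetical.
 *
 *	struct bpf_map_def SEC("maps") cycles_map = {
 *		.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 *		.key_size = sizeof(int),
 *		.value_size = sizeof(u32),
 *		.max_entries = 64,	// at least one slot per possible CPU
 *	};
 *
 *	u64 cycles = bpf_perf_event_read(&cycles_map, BPF_F_CURRENT_CPU);
 *
 * With BPF_F_CURRENT_CPU the index is replaced by smp_processor_id(), and the
 * event must be local to that CPU as checked above.
 */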

static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
			u64 flags, struct perf_raw_record *raw)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	unsigned int cpu = smp_processor_id();
	u64 index = flags & BPF_F_INDEX_MASK;
	struct perf_sample_data sample_data;
	struct bpf_event_entry *ee;
	struct perf_event *event;

	if (index == BPF_F_CURRENT_CPU)
		index = cpu;
	if (unlikely(index >= array->map.max_entries))
		return -E2BIG;

	ee = READ_ONCE(array->ptrs[index]);
	if (!ee)
		return -ENOENT;

	event = ee->event;
	if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
		     event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
		return -EINVAL;

	if (unlikely(event->oncpu != cpu))
		return -EOPNOTSUPP;

	perf_sample_data_init(&sample_data, 0, 0);
	sample_data.raw = raw;
	perf_event_output(event, &sample_data, regs);
	return 0;
}

static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
{
	struct pt_regs *regs = (struct pt_regs *)(long) r1;
	struct bpf_map *map  = (struct bpf_map *)(long) r2;
	void *data = (void *)(long) r4;
	struct perf_raw_record raw = {
		.frag = {
			.size = size,
			.data = data,
		},
	};

	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
		return -EINVAL;

	return __bpf_perf_event_output(regs, map, flags, &raw);
}

static const struct bpf_func_proto bpf_perf_event_output_proto = {
	.func		= bpf_perf_event_output,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_STACK,
	.arg5_type	= ARG_CONST_STACK_SIZE,
};
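/*
 * Illustrative sketch (not part of this file): streaming a sample from a
 * kprobe program to user space through a BPF_MAP_TYPE_PERF_EVENT_ARRAY of
 * PERF_COUNT_SW_BPF_OUTPUT events. Assumes samples/bpf-style declarations;
 * events_map is a hypothetical PERF_EVENT_ARRAY map declared like cycles_map
 * in the earlier sketch, and the event struct layout is made up.
 *
 *	struct event {
 *		u32 pid;
 *		u64 ts;
 *	};
 *
 *	SEC("kprobe/sys_execve")
 *	int bpf_prog_output(struct pt_regs *ctx)
 *	{
 *		struct event e = {
 *			.pid = bpf_get_current_pid_tgid() >> 32,
 *			.ts  = bpf_ktime_get_ns(),
 *		};
 *
 *		bpf_perf_event_output(ctx, &events_map, BPF_F_CURRENT_CPU,
 *				      &e, sizeof(e));
 *		return 0;
 *	}
 */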

static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);

u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
	struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
	struct perf_raw_frag frag = {
		.copy		= ctx_copy,
		.size		= ctx_size,
		.data		= ctx,
	};
	struct perf_raw_record raw = {
		.frag = {
			{
				.next	= ctx_size ? &frag : NULL,
			},
			.size	= meta_size,
			.data	= meta,
		},
	};

	perf_fetch_caller_regs(regs);

	return __bpf_perf_event_output(regs, map, flags, &raw);
}

static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	return (long) current;
}

static const struct bpf_func_proto bpf_get_current_task_proto = {
	.func		= bpf_get_current_task,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
};

static u64 bpf_current_task_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	struct bpf_map *map = (struct bpf_map *)(long)r1;
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct cgroup *cgrp;
	u32 idx = (u32)r2;

	if (unlikely(in_interrupt()))
		return -EINVAL;

	if (unlikely(idx >= array->map.max_entries))
		return -E2BIG;

	cgrp = READ_ONCE(array->ptrs[idx]);
	if (unlikely(!cgrp))
		return -EAGAIN;

	return task_under_cgroup_hierarchy(current, cgrp);
}

static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
	.func           = bpf_current_task_under_cgroup,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_CONST_MAP_PTR,
	.arg2_type      = ARG_ANYTHING,
};
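/*
 * Illustrative sketch (not part of this file): filtering on cgroup membership.
 * User space stores a cgroup directory fd in slot 0 of a
 * BPF_MAP_TYPE_CGROUP_ARRAY; the program then checks whether current sits in
 * that cgroup's hierarchy. Assumes samples/bpf-style declarations; cgroup_map
 * is hypothetical.
 *
 *	if (bpf_current_task_under_cgroup(&cgroup_map, 0) != 1)
 *		return 0;	// not in the cgroup (or error): skip
 *	// ... current is inside the cgroup hierarchy ...
 */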

static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_map_lookup_elem:
		return &bpf_map_lookup_elem_proto;
	case BPF_FUNC_map_update_elem:
		return &bpf_map_update_elem_proto;
	case BPF_FUNC_map_delete_elem:
		return &bpf_map_delete_elem_proto;
	case BPF_FUNC_probe_read:
		return &bpf_probe_read_proto;
	case BPF_FUNC_ktime_get_ns:
		return &bpf_ktime_get_ns_proto;
	case BPF_FUNC_tail_call:
		return &bpf_tail_call_proto;
	case BPF_FUNC_get_current_pid_tgid:
		return &bpf_get_current_pid_tgid_proto;
	case BPF_FUNC_get_current_task:
		return &bpf_get_current_task_proto;
	case BPF_FUNC_get_current_uid_gid:
		return &bpf_get_current_uid_gid_proto;
	case BPF_FUNC_get_current_comm:
		return &bpf_get_current_comm_proto;
	case BPF_FUNC_trace_printk:
		return bpf_get_trace_printk_proto();
	case BPF_FUNC_get_smp_processor_id:
		return &bpf_get_smp_processor_id_proto;
	case BPF_FUNC_perf_event_read:
		return &bpf_perf_event_read_proto;
	case BPF_FUNC_probe_write_user:
		return bpf_get_probe_write_proto();
	case BPF_FUNC_current_task_under_cgroup:
		return &bpf_current_task_under_cgroup_proto;
	case BPF_FUNC_get_prandom_u32:
		return &bpf_get_prandom_u32_proto;
	default:
		return NULL;
	}
}

static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_perf_event_output:
		return &bpf_perf_event_output_proto;
	case BPF_FUNC_get_stackid:
		return &bpf_get_stackid_proto;
	default:
		return tracing_func_proto(func_id);
	}
}

/* bpf+kprobe programs can access fields of 'struct pt_regs' */
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
					enum bpf_reg_type *reg_type)
{
	if (off < 0 || off >= sizeof(struct pt_regs))
		return false;
	if (type != BPF_READ)
		return false;
	if (off % size != 0)
		return false;
	return true;
}
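/*
 * Illustrative sketch (not part of this file): because the context of a
 * BPF_PROG_TYPE_KPROBE program is 'struct pt_regs' itself, aligned read-only
 * loads within the struct are allowed directly, e.g. on x86_64:
 *
 *	SEC("kprobe/sys_nanosleep")
 *	int bpf_prog_regs(struct pt_regs *ctx)
 *	{
 *		long arg1 = ctx->di;	// first argument register
 *		long ip   = ctx->ip;	// probed instruction pointer
 *		return 0;
 *	}
 *
 * Attach point and names are hypothetical; samples/bpf wraps such accesses in
 * PT_REGS_PARM*() macros for portability across architectures.
 */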

static const struct bpf_verifier_ops kprobe_prog_ops = {
	.get_func_proto  = kprobe_prog_func_proto,
	.is_valid_access = kprobe_prog_is_valid_access,
};

static struct bpf_prog_type_list kprobe_tl = {
	.ops	= &kprobe_prog_ops,
	.type	= BPF_PROG_TYPE_KPROBE,
};

static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
{
	/*
	 * r1 points to perf tracepoint buffer where first 8 bytes are hidden
	 * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
	 * from there and call the same bpf_perf_event_output() helper
	 */
	u64 ctx = *(long *)(uintptr_t)r1;

	return bpf_perf_event_output(ctx, r2, index, r4, size);
}

static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
	.func		= bpf_perf_event_output_tp,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_STACK,
	.arg5_type	= ARG_CONST_STACK_SIZE,
};
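/*
 * Illustrative sketch (not part of this file): a tracepoint program issues the
 * same helper call as a kprobe program; the _tp wrapper above digs the hidden
 * pt_regs pointer out of the front of the tracepoint buffer. Assumes
 * samples/bpf-style declarations; events_map and the args struct (mirroring
 * the tracepoint's format file) are hypothetical.
 *
 *	SEC("tracepoint/syscalls/sys_enter_write")
 *	int bpf_prog_tp(struct sys_enter_write_args *args)
 *	{
 *		u32 fd = args->fd;
 *
 *		bpf_perf_event_output(args, &events_map, BPF_F_CURRENT_CPU,
 *				      &fd, sizeof(fd));
 *		return 0;
 *	}
 */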

static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	u64 ctx = *(long *)(uintptr_t)r1;

	return bpf_get_stackid(ctx, r2, r3, r4, r5);
}

static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
	.func		= bpf_get_stackid_tp,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};

static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_perf_event_output:
		return &bpf_perf_event_output_proto_tp;
	case BPF_FUNC_get_stackid:
		return &bpf_get_stackid_proto_tp;
	default:
		return tracing_func_proto(func_id);
	}
}

static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
				    enum bpf_reg_type *reg_type)
{
	if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
		return false;
	if (type != BPF_READ)
		return false;
	if (off % size != 0)
		return false;
	return true;
}

static const struct bpf_verifier_ops tracepoint_prog_ops = {
	.get_func_proto  = tp_prog_func_proto,
	.is_valid_access = tp_prog_is_valid_access,
};

static struct bpf_prog_type_list tracepoint_tl = {
	.ops	= &tracepoint_prog_ops,
	.type	= BPF_PROG_TYPE_TRACEPOINT,
};

static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
				    enum bpf_reg_type *reg_type)
{
	if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
		return false;
	if (type != BPF_READ)
		return false;
	if (off % size != 0)
		return false;
	if (off == offsetof(struct bpf_perf_event_data, sample_period)) {
		if (size != sizeof(u64))
			return false;
	} else {
		if (size != sizeof(long))
			return false;
	}
	return true;
}

static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg,
				      int src_reg, int ctx_off,
				      struct bpf_insn *insn_buf,
				      struct bpf_prog *prog)
{
	struct bpf_insn *insn = insn_buf;

	switch (ctx_off) {
	case offsetof(struct bpf_perf_event_data, sample_period):
		BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
						       data), dst_reg, src_reg,
				      offsetof(struct bpf_perf_event_data_kern, data));
		*insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg,
				      offsetof(struct perf_sample_data, period));
		break;
	default:
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
						       regs), dst_reg, src_reg,
				      offsetof(struct bpf_perf_event_data_kern, regs));
		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off);
		break;
	}

	return insn - insn_buf;
}
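/*
 * Illustrative sketch (not part of this file): from the program's point of
 * view, the rewrite above makes 'struct bpf_perf_event_data' look like a plain
 * struct (x86_64 shown, names hypothetical):
 *
 *	SEC("perf_event")
 *	int bpf_prog_pe(struct bpf_perf_event_data *ctx)
 *	{
 *		u64 period = ctx->sample_period;	// becomes two loads:
 *							// kern->data, then ->period
 *		long ip = ctx->regs.ip;			// becomes a load of
 *							// kern->regs, then +offset
 *		return 0;
 *	}
 */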

static const struct bpf_verifier_ops perf_event_prog_ops = {
	.get_func_proto		= tp_prog_func_proto,
	.is_valid_access	= pe_prog_is_valid_access,
	.convert_ctx_access	= pe_prog_convert_ctx_access,
};

static struct bpf_prog_type_list perf_event_tl = {
	.ops	= &perf_event_prog_ops,
	.type	= BPF_PROG_TYPE_PERF_EVENT,
};

static int __init register_kprobe_prog_ops(void)
{
	bpf_register_prog_type(&kprobe_tl);
	bpf_register_prog_type(&tracepoint_tl);
	bpf_register_prog_type(&perf_event_tl);
	return 0;
}
late_initcall(register_kprobe_prog_ops);