bpf_trace.c
/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include "trace.h"

/**
 * trace_call_bpf - invoke BPF program
 * @prog: BPF program
 * @ctx: opaque context pointer
 *
 * kprobe handlers execute BPF programs via this helper.
 * Can be used from static tracepoints in the future.
 *
 * Return: BPF programs always return an integer which is interpreted by
 * the kprobe handler as:
 * 0 - return from kprobe (event is filtered out)
 * 1 - store kprobe event into ring buffer
 * Other values are reserved and currently alias to 1
 */
unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
{
	unsigned int ret;

	if (in_nmi()) /* not supported yet */
		return 1;

	preempt_disable();

	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
		/*
		 * since some bpf program is already running on this cpu,
		 * don't call into another bpf program (same or different)
		 * and don't send kprobe event into ring-buffer,
		 * so return zero here
		 */
		ret = 0;
		goto out;
	}

	rcu_read_lock();
	ret = BPF_PROG_RUN(prog, ctx);
	rcu_read_unlock();

 out:
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();

	return ret;
}
EXPORT_SYMBOL_GPL(trace_call_bpf);
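/*
 * Illustrative sketch (not part of this file) of how a caller is expected
 * to consume the return value; the perf kprobe handler does roughly:
 *
 *	if (prog && !trace_call_bpf(prog, regs))
 *		return;			<- 0: event filtered out, drop it
 *	... otherwise build the kprobe event and store it into the ring buffer ...
 */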

static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	void *dst = (void *) (long) r1;
	int ret, size = (int) r2;
	void *unsafe_ptr = (void *) (long) r3;

	ret = probe_kernel_read(dst, unsafe_ptr, size);
	if (unlikely(ret < 0))
		memset(dst, 0, size);

	return ret;
}

static const struct bpf_func_proto bpf_probe_read_proto = {
	.func		= bpf_probe_read,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_RAW_STACK,
	.arg2_type	= ARG_CONST_STACK_SIZE,
	.arg3_type	= ARG_ANYTHING,
};

/*
 * limited trace_printk()
 * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
 */
static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
{
	char *fmt = (char *) (long) r1;
	bool str_seen = false;
	int mod[3] = {};
	int fmt_cnt = 0;
	u64 unsafe_addr;
	char buf[64];
	int i;

	/*
	 * bpf_check()->check_func_arg()->check_stack_boundary()
	 * guarantees that fmt points to bpf program stack,
	 * fmt_size bytes of it were initialized and fmt_size > 0
	 */
	if (fmt[--fmt_size] != 0)
		return -EINVAL;

	/* check format string for allowed specifiers */
	for (i = 0; i < fmt_size; i++) {
		if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
			return -EINVAL;

		if (fmt[i] != '%')
			continue;

		if (fmt_cnt >= 3)
			return -EINVAL;

		/* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
		i++;
		if (fmt[i] == 'l') {
			mod[fmt_cnt]++;
			i++;
		} else if (fmt[i] == 'p' || fmt[i] == 's') {
			mod[fmt_cnt]++;
			i++;
			if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
				return -EINVAL;
			fmt_cnt++;
			if (fmt[i - 1] == 's') {
				if (str_seen)
					/* allow only one '%s' per fmt string */
					return -EINVAL;
				str_seen = true;

				switch (fmt_cnt) {
				case 1:
					unsafe_addr = r3;
					r3 = (long) buf;
					break;
				case 2:
					unsafe_addr = r4;
					r4 = (long) buf;
					break;
				case 3:
					unsafe_addr = r5;
					r5 = (long) buf;
					break;
				}
				buf[0] = 0;
				strncpy_from_unsafe(buf,
						    (void *) (long) unsafe_addr,
						    sizeof(buf));
			}
			continue;
		}

		if (fmt[i] == 'l') {
			mod[fmt_cnt]++;
			i++;
		}

		if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
			return -EINVAL;
		fmt_cnt++;
	}

	return __trace_printk(1/* fake ip will not be printed */, fmt,
			      mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3,
			      mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4,
			      mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5);
}

static const struct bpf_func_proto bpf_trace_printk_proto = {
	.func		= bpf_trace_printk,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_STACK,
	.arg2_type	= ARG_CONST_STACK_SIZE,
};
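/*
 * Illustrative sketch (not part of this file): from the BPF program side,
 * assuming the helper wrapper used by samples/bpf, a call that passes the
 * checks above looks like
 *
 *	char fmt[] = "comm %s pid %d\n";	<- lives on the program stack
 *	bpf_trace_printk(fmt, sizeof(fmt), comm, pid);
 *
 * i.e. at most three conversions, at most one %s, and a format string of
 * constant, NUL-terminated size on the BPF stack.
 */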

const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
{
	/*
	 * this program might be calling bpf_trace_printk,
	 * so allocate per-cpu printk buffers
	 */
	trace_printk_init_buffers();

	return &bpf_trace_printk_proto;
}

static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
{
	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct perf_event *event;
	struct file *file;

	if (unlikely(index >= array->map.max_entries))
		return -E2BIG;

	file = (struct file *)array->ptrs[index];
	if (unlikely(!file))
		return -ENOENT;

	event = file->private_data;

	/* make sure event is local and doesn't have pmu::count */
	if (event->oncpu != smp_processor_id() ||
	    event->pmu->count)
		return -EINVAL;

	/*
	 * the return value alone does not tell whether the read itself
	 * succeeded; that has to be judged elsewhere, such as in the
	 * eBPF program consuming it.
	 */
	return perf_event_read_local(event);
}

static const struct bpf_func_proto bpf_perf_event_read_proto = {
	.func		= bpf_perf_event_read,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_ANYTHING,
};
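/*
 * Illustrative sketch (not part of this file): a BPF program would use
 * this through a BPF_MAP_TYPE_PERF_EVENT_ARRAY map (here called
 * 'counters', a hypothetical name) that user space has populated with
 * perf event fds, typically one per CPU:
 *
 *	u64 count = bpf_perf_event_read(&counters,
 *					bpf_get_smp_processor_id());
 *
 * The event must be active on the local CPU and must not provide
 * pmu->count, as checked above.
 */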

static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
{
	struct pt_regs *regs = (struct pt_regs *) (long) r1;
	struct bpf_map *map = (struct bpf_map *) (long) r2;
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *data = (void *) (long) r4;
	struct perf_sample_data sample_data;
	struct perf_event *event;
	struct file *file;
	struct perf_raw_record raw = {
		.size = size,
		.data = data,
	};

	if (unlikely(index >= array->map.max_entries))
		return -E2BIG;

	file = (struct file *)array->ptrs[index];
	if (unlikely(!file))
		return -ENOENT;

	event = file->private_data;

	if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
		     event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
		return -EINVAL;

	if (unlikely(event->oncpu != smp_processor_id()))
		return -EOPNOTSUPP;

	perf_sample_data_init(&sample_data, 0, 0);
	sample_data.raw = &raw;
	perf_event_output(event, &sample_data, regs);
	return 0;
}

static const struct bpf_func_proto bpf_perf_event_output_proto = {
	.func		= bpf_perf_event_output,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_STACK,
	.arg5_type	= ARG_CONST_STACK_SIZE,
};
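/*
 * Illustrative sketch (not part of this file): a kprobe program can push
 * arbitrary data to user space through a BPF_MAP_TYPE_PERF_EVENT_ARRAY
 * map (here called 'events', a hypothetical name) whose entries are perf
 * events opened with type PERF_TYPE_SOFTWARE and config
 * PERF_COUNT_SW_BPF_OUTPUT:
 *
 *	struct event_data data = { .pid = pid, .ts = bpf_ktime_get_ns() };
 *	bpf_perf_event_output(ctx, &events, bpf_get_smp_processor_id(),
 *			      &data, sizeof(data));
 *
 * 'struct event_data' is likewise hypothetical; the index must name an
 * event on the current CPU, matching the oncpu check above.
 */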

static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_map_lookup_elem:
		return &bpf_map_lookup_elem_proto;
	case BPF_FUNC_map_update_elem:
		return &bpf_map_update_elem_proto;
	case BPF_FUNC_map_delete_elem:
		return &bpf_map_delete_elem_proto;
	case BPF_FUNC_probe_read:
		return &bpf_probe_read_proto;
	case BPF_FUNC_ktime_get_ns:
		return &bpf_ktime_get_ns_proto;
	case BPF_FUNC_tail_call:
		return &bpf_tail_call_proto;
	case BPF_FUNC_get_current_pid_tgid:
		return &bpf_get_current_pid_tgid_proto;
	case BPF_FUNC_get_current_uid_gid:
		return &bpf_get_current_uid_gid_proto;
	case BPF_FUNC_get_current_comm:
		return &bpf_get_current_comm_proto;
	case BPF_FUNC_trace_printk:
		return bpf_get_trace_printk_proto();
	case BPF_FUNC_get_smp_processor_id:
		return &bpf_get_smp_processor_id_proto;
	case BPF_FUNC_perf_event_read:
		return &bpf_perf_event_read_proto;
	default:
		return NULL;
	}
}

static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_perf_event_output:
		return &bpf_perf_event_output_proto;
	case BPF_FUNC_get_stackid:
		return &bpf_get_stackid_proto;
	default:
		return tracing_func_proto(func_id);
	}
}

/* bpf+kprobe programs can access fields of 'struct pt_regs' */
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
{
	/* check bounds */
	if (off < 0 || off >= sizeof(struct pt_regs))
		return false;

	/* only read is allowed */
	if (type != BPF_READ)
		return false;

	/* disallow misaligned access */
	if (off % size != 0)
		return false;

	return true;
}
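/*
 * Illustrative sketch (not part of this file): under these rules a kprobe
 * program may, e.g. on x86_64, do
 *
 *	struct pt_regs *regs = ctx;
 *	u64 ip = regs->ip;		<- read, in bounds, naturally aligned
 *
 * while writes to the context, out-of-bounds offsets, or accesses not
 * aligned to their own size are rejected by the verifier.
 */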

static const struct bpf_verifier_ops kprobe_prog_ops = {
	.get_func_proto  = kprobe_prog_func_proto,
	.is_valid_access = kprobe_prog_is_valid_access,
};

static struct bpf_prog_type_list kprobe_tl = {
	.ops	= &kprobe_prog_ops,
	.type	= BPF_PROG_TYPE_KPROBE,
};

static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
{
	/*
	 * r1 points to perf tracepoint buffer where first 8 bytes are hidden
	 * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
	 * from there and call the same bpf_perf_event_output() helper
	 */
	u64 ctx = *(long *)r1;

	return bpf_perf_event_output(ctx, r2, index, r4, size);
}

static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
	.func		= bpf_perf_event_output_tp,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_STACK,
	.arg5_type	= ARG_CONST_STACK_SIZE,
};

static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	u64 ctx = *(long *)r1;

	return bpf_get_stackid(ctx, r2, r3, r4, r5);
}

static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
	.func		= bpf_get_stackid_tp,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};

static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_perf_event_output:
		return &bpf_perf_event_output_proto_tp;
	case BPF_FUNC_get_stackid:
		return &bpf_get_stackid_proto_tp;
	default:
		return tracing_func_proto(func_id);
	}
}

static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type)
{
	if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
		return false;
	if (type != BPF_READ)
		return false;
	if (off % size != 0)
		return false;
	return true;
}
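/*
 * Tracepoint programs see the raw perf tracepoint buffer as their context.
 * Its first sizeof(void *) bytes carry the hidden 'struct pt_regs' pointer
 * consumed by the *_tp wrappers above, which is why tp_prog_is_valid_access()
 * rejects offsets below sizeof(void *).
 */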

static const struct bpf_verifier_ops tracepoint_prog_ops = {
	.get_func_proto  = tp_prog_func_proto,
	.is_valid_access = tp_prog_is_valid_access,
};

static struct bpf_prog_type_list tracepoint_tl = {
	.ops	= &tracepoint_prog_ops,
	.type	= BPF_PROG_TYPE_TRACEPOINT,
};

static int __init register_kprobe_prog_ops(void)
{
	bpf_register_prog_type(&kprobe_tl);
	bpf_register_prog_type(&tracepoint_tl);
	return 0;
}
late_initcall(register_kprobe_prog_ops);
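/*
 * Illustrative sketch (user space, not part of this file): once these
 * program types are registered, a program loaded via bpf(BPF_PROG_LOAD, ...)
 * with prog_type BPF_PROG_TYPE_KPROBE or BPF_PROG_TYPE_TRACEPOINT is
 * typically attached by opening the corresponding perf event and issuing
 *
 *	ioctl(perf_event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
 *
 * as the samples under samples/bpf do.
 */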