/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/filter.h>
#include <net/netlink.h>
#include <linux/file.h>
#include <linux/vmalloc.h>
#include <linux/stringify.h>
#include <linux/bsearch.h>
#include <linux/sort.h>

#include "disasm.h"

static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
#define BPF_PROG_TYPE(_id, _name) \
	[_id] = & _name ## _verifier_ops,
#define BPF_MAP_TYPE(_id, _ops)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
};

/* bpf_check() is a static code analyzer that walks eBPF program
 * instruction by instruction and updates register/stack state.
 * All paths of conditional branches are analyzed until 'bpf_exit' insn.
 *
 * The first pass is depth-first-search to check that the program is a DAG.
 * It rejects the following programs:
 * - larger than BPF_MAXINSNS insns
 * - if loop is present (detected via back-edge)
 * - unreachable insns exist (shouldn't be a forest. program = one function)
 * - out of bounds or malformed jumps
 * The second pass is all possible path descent from the 1st insn.
 * Since it's analyzing all paths through the program, the length of the
 * analysis is limited to 64k insn, which may be hit even if total number of
 * insn is less than 4K, but there are too many branches that change stack/regs.
 * Number of 'branches to be analyzed' is limited to 1k
 *
 * On entry to each instruction, each register has a type, and the instruction
 * changes the types of the registers depending on instruction semantics.
 * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is
 * copied to R1.
 *
 * All registers are 64-bit.
 * R0 - return register
 * R1-R5 argument passing registers
 * R6-R9 callee saved registers
 * R10 - frame pointer read-only
 *
 * At the start of BPF program the register R1 contains a pointer to bpf_context
 * and has type PTR_TO_CTX.
 *
 * Verifier tracks arithmetic operations on pointers in case:
 *    BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
 *    BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20),
 * 1st insn copies R10 (which has FRAME_PTR) type into R1
 * and 2nd arithmetic instruction is pattern matched to recognize
 * that it wants to construct a pointer to some element within stack.
 * So after 2nd insn, the register R1 has type PTR_TO_STACK
 * (and -20 constant is saved for further stack bounds checking).
 * Meaning that this reg is a pointer to stack plus known immediate constant.
 *
 * Most of the time the registers have SCALAR_VALUE type, which
 * means the register has some value, but it's not a valid pointer.
 * (like pointer plus pointer becomes SCALAR_VALUE type)
 *
 * When verifier sees load or store instructions the type of base register
 * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK. These are three pointer
 * types recognized by check_mem_access() function.
 *
 * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
 * and the range of [ptr, ptr + map's value_size) is accessible.
 *
 * registers used to pass values to function calls are checked against
 * function argument constraints.
 *
 * ARG_PTR_TO_MAP_KEY is one of such argument constraints.
 * It means that the register type passed to this function must be
 * PTR_TO_STACK and it will be used inside the function as
 * 'pointer to map element key'
 *
 * For example the argument constraints for bpf_map_lookup_elem():
 *   .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
 *   .arg1_type = ARG_CONST_MAP_PTR,
 *   .arg2_type = ARG_PTR_TO_MAP_KEY,
 *
 * ret_type says that this function returns 'pointer to map elem value or null'
 * function expects 1st argument to be a const pointer to 'struct bpf_map' and
 * 2nd argument should be a pointer to stack, which will be used inside
 * the helper function as a pointer to map element key.
 *
 * On the kernel side the helper function looks like:
 * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 * {
 *    struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
 *    void *key = (void *) (unsigned long) r2;
 *    void *value;
 *
 *    here kernel can access 'key' and 'map' pointers safely, knowing that
 *    [key, key + map->key_size) bytes are valid and were initialized on
 *    the stack of eBPF program.
 * }
 *
 * Corresponding eBPF program may look like:
 *    BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),  // after this insn R2 type is FRAME_PTR
 *    BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK
 *    BPF_LD_MAP_FD(BPF_REG_1, map_fd),      // after this insn R1 type is CONST_PTR_TO_MAP
 *    BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 * here verifier looks at prototype of map_lookup_elem() and sees:
 * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok,
 * Now verifier knows that this map has key of R1->map_ptr->key_size bytes
 *
 * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far,
 * Now verifier checks that [R2, R2 + map's key_size) are within stack limits
 * and were initialized prior to this call.
 * If it's ok, then verifier allows this BPF_CALL insn and looks at
 * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
 * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
 * returns either a pointer to map value or NULL.
 *
 * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
 * insn, the register holding that pointer in the true branch changes state to
 * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false
 * branch. See check_cond_jmp_op().
 *
 * After the call R0 is set to return type of the function and registers R1-R5
 * are set to NOT_INIT to indicate that they are no longer readable.
 */

/* verifier_state + insn_idx are pushed to stack when branch is encountered */
struct bpf_verifier_stack_elem {
	/* verifier state is 'st'
	 * before processing instruction 'insn_idx'
	 * and after processing instruction 'prev_insn_idx'
	 */
	struct bpf_verifier_state st;
	int insn_idx;
	int prev_insn_idx;
	struct bpf_verifier_stack_elem *next;
};

#define BPF_COMPLEXITY_LIMIT_INSNS	131072
#define BPF_COMPLEXITY_LIMIT_STACK	1024

#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)

struct bpf_call_arg_meta {
	struct bpf_map *map_ptr;
	bool raw_mode;
	bool pkt_access;
	int regno;
	int access_size;
};

static DEFINE_MUTEX(bpf_verifier_lock);

/* log_level controls verbosity level of eBPF verifier.
 * verbose() is used to dump the verification trace to the log, so the user
 * can figure out what's wrong with the program
 */
static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
				   const char *fmt, ...)
{
	struct bpf_verifer_log *log = &env->log;
	unsigned int n;
	va_list args;

	if (!log->level || !log->ubuf || bpf_verifier_log_full(log))
		return;

	va_start(args, fmt);
	n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
	va_end(args);

	WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
		  "verifier log line truncated - local buffer too short\n");

	n = min(log->len_total - log->len_used - 1, n);
	log->kbuf[n] = '\0';

	if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
		log->len_used += n;
	else
		log->ubuf = NULL;
}

static bool type_is_pkt_pointer(enum bpf_reg_type type)
{
	return type == PTR_TO_PACKET ||
	       type == PTR_TO_PACKET_META;
}

/* string representation of 'enum bpf_reg_type' */
static const char * const reg_type_str[] = {
	[NOT_INIT]		= "?",
	[SCALAR_VALUE]		= "inv",
	[PTR_TO_CTX]		= "ctx",
	[CONST_PTR_TO_MAP]	= "map_ptr",
	[PTR_TO_MAP_VALUE]	= "map_value",
	[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
	[PTR_TO_STACK]		= "fp",
	[PTR_TO_PACKET]		= "pkt",
	[PTR_TO_PACKET_META]	= "pkt_meta",
	[PTR_TO_PACKET_END]	= "pkt_end",
};

static void print_liveness(struct bpf_verifier_env *env,
			   enum bpf_reg_liveness live)
{
	if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN))
	    verbose(env, "_");
	if (live & REG_LIVE_READ)
		verbose(env, "r");
	if (live & REG_LIVE_WRITTEN)
		verbose(env, "w");
}

static struct bpf_func_state *func(struct bpf_verifier_env *env,
				   const struct bpf_reg_state *reg)
{
	struct bpf_verifier_state *cur = env->cur_state;

	return cur->frame[reg->frameno];
}

static void print_verifier_state(struct bpf_verifier_env *env,
				 const struct bpf_func_state *state)
{
	const struct bpf_reg_state *reg;
	enum bpf_reg_type t;
	int i;

	if (state->frameno)
		verbose(env, " frame%d:", state->frameno);
	for (i = 0; i < MAX_BPF_REG; i++) {
		reg = &state->regs[i];
		t = reg->type;
		if (t == NOT_INIT)
			continue;
		verbose(env, " R%d", i);
		print_liveness(env, reg->live);
		verbose(env, "=%s", reg_type_str[t]);
		if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
		    tnum_is_const(reg->var_off)) {
			/* reg->off should be 0 for SCALAR_VALUE */
			verbose(env, "%lld", reg->var_off.value + reg->off);
			if (t == PTR_TO_STACK)
				verbose(env, ",call_%d", func(env, reg)->callsite);
		} else {
			verbose(env, "(id=%d", reg->id);
			if (t != SCALAR_VALUE)
				verbose(env, ",off=%d", reg->off);
			if (type_is_pkt_pointer(t))
				verbose(env, ",r=%d", reg->range);
			else if (t == CONST_PTR_TO_MAP ||
				 t == PTR_TO_MAP_VALUE ||
				 t == PTR_TO_MAP_VALUE_OR_NULL)
				verbose(env, ",ks=%d,vs=%d",
					reg->map_ptr->key_size,
					reg->map_ptr->value_size);
			if (tnum_is_const(reg->var_off)) {
				/* Typically an immediate SCALAR_VALUE, but
				 * could be a pointer whose offset is too big
				 * for reg->off
				 */
				verbose(env, ",imm=%llx", reg->var_off.value);
			} else {
				if (reg->smin_value != reg->umin_value &&
				    reg->smin_value != S64_MIN)
					verbose(env, ",smin_value=%lld",
						(long long)reg->smin_value);
				if (reg->smax_value != reg->umax_value &&
				    reg->smax_value != S64_MAX)
					verbose(env, ",smax_value=%lld",
						(long long)reg->smax_value);
				if (reg->umin_value != 0)
					verbose(env, ",umin_value=%llu",
						(unsigned long long)reg->umin_value);
				if (reg->umax_value != U64_MAX)
					verbose(env, ",umax_value=%llu",
						(unsigned long long)reg->umax_value);
				if (!tnum_is_unknown(reg->var_off)) {
					char tn_buf[48];

					tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
					verbose(env, ",var_off=%s", tn_buf);
				}
			}
			verbose(env, ")");
		}
	}
	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
		if (state->stack[i].slot_type[0] == STACK_SPILL) {
			verbose(env, " fp%d",
				(-i - 1) * BPF_REG_SIZE);
			print_liveness(env, state->stack[i].spilled_ptr.live);
			verbose(env, "=%s",
				reg_type_str[state->stack[i].spilled_ptr.type]);
		}
		if (state->stack[i].slot_type[0] == STACK_ZERO)
			verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE);
	}
	verbose(env, "\n");
}
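
/* For illustration: with a non-zero log_level, a dump emitted by
 * print_verifier_state() for a program that has copied ctx into R6
 * (write-marked) and spilled it to fp-8 might look roughly like
 *
 *   R1=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 fp-8_w=ctx
 *
 * The exact fields depend on which bounds and var_off bits are known for
 * each register; "inv" marks a SCALAR_VALUE.
 */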

static int copy_stack_state(struct bpf_func_state *dst,
			    const struct bpf_func_state *src)
{
	if (!src->stack)
		return 0;
	if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) {
		/* internal bug, make state invalid to reject the program */
		memset(dst, 0, sizeof(*dst));
		return -EFAULT;
	}
	memcpy(dst->stack, src->stack,
	       sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE));
	return 0;
}

/* do_check() starts with zero-sized stack in struct bpf_verifier_state to
 * make it consume minimal amount of memory. check_stack_write() access from
 * the program calls into realloc_func_state() to grow the stack size.
 * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
 * which this function copies over. It points to previous bpf_verifier_state
 * which is never reallocated
 */
static int realloc_func_state(struct bpf_func_state *state, int size,
			      bool copy_old)
{
	u32 old_size = state->allocated_stack;
	struct bpf_stack_state *new_stack;
	int slot = size / BPF_REG_SIZE;

	if (size <= old_size || !size) {
		if (copy_old)
			return 0;
		state->allocated_stack = slot * BPF_REG_SIZE;
		if (!size && old_size) {
			kfree(state->stack);
			state->stack = NULL;
		}
		return 0;
	}
	new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state),
				  GFP_KERNEL);
	if (!new_stack)
		return -ENOMEM;
	if (copy_old) {
		if (state->stack)
			memcpy(new_stack, state->stack,
			       sizeof(*new_stack) * (old_size / BPF_REG_SIZE));
		memset(new_stack + old_size / BPF_REG_SIZE, 0,
		       sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE);
	}
	state->allocated_stack = slot * BPF_REG_SIZE;
	kfree(state->stack);
	state->stack = new_stack;
	return 0;
}

static void free_func_state(struct bpf_func_state *state)
{
	kfree(state->stack);
	kfree(state);
}

static void free_verifier_state(struct bpf_verifier_state *state,
				bool free_self)
{
	int i;

	for (i = 0; i <= state->curframe; i++) {
		free_func_state(state->frame[i]);
		state->frame[i] = NULL;
	}
	if (free_self)
		kfree(state);
}

/* copy verifier state from src to dst growing dst stack space
 * when necessary to accommodate larger src stack
 */
static int copy_func_state(struct bpf_func_state *dst,
			   const struct bpf_func_state *src)
{
	int err;

	err = realloc_func_state(dst, src->allocated_stack, false);
	if (err)
		return err;
	memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack));
	return copy_stack_state(dst, src);
}

static int copy_verifier_state(struct bpf_verifier_state *dst_state,
			       const struct bpf_verifier_state *src)
{
	struct bpf_func_state *dst;
	int i, err;

	/* if dst has more stack frames then src frame, free them */
	for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
		free_func_state(dst_state->frame[i]);
		dst_state->frame[i] = NULL;
	}
	dst_state->curframe = src->curframe;
	dst_state->parent = src->parent;
	for (i = 0; i <= src->curframe; i++) {
		dst = dst_state->frame[i];
		if (!dst) {
			dst = kzalloc(sizeof(*dst), GFP_KERNEL);
			if (!dst)
				return -ENOMEM;
			dst_state->frame[i] = dst;
		}
		err = copy_func_state(dst, src->frame[i]);
		if (err)
			return err;
	}
	return 0;
}

static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
		     int *insn_idx)
{
	struct bpf_verifier_state *cur = env->cur_state;
	struct bpf_verifier_stack_elem *elem, *head = env->head;
	int err;

	if (env->head == NULL)
		return -ENOENT;

	if (cur) {
		err = copy_verifier_state(cur, &head->st);
		if (err)
			return err;
	}
	if (insn_idx)
		*insn_idx = head->insn_idx;
	if (prev_insn_idx)
		*prev_insn_idx = head->prev_insn_idx;
	elem = head->next;
	free_verifier_state(&head->st, false);
	kfree(head);
	env->head = elem;
	env->stack_size--;
	return 0;
}

static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
					     int insn_idx, int prev_insn_idx)
{
	struct bpf_verifier_state *cur = env->cur_state;
	struct bpf_verifier_stack_elem *elem;
	int err;

	elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
	if (!elem)
		goto err;

	elem->insn_idx = insn_idx;
	elem->prev_insn_idx = prev_insn_idx;
	elem->next = env->head;
	env->head = elem;
	env->stack_size++;
	err = copy_verifier_state(&elem->st, cur);
	if (err)
		goto err;
	if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
		verbose(env, "BPF program is too complex\n");
		goto err;
	}
	return &elem->st;
err:
	/* pop all elements and return */
	while (!pop_stack(env, NULL, NULL));
	return NULL;
}

#define CALLER_SAVED_REGS 6
static const int caller_saved[CALLER_SAVED_REGS] = {
	BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
#define CALLEE_SAVED_REGS 5
static const int callee_saved[CALLEE_SAVED_REGS] = {
	BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9
};

static void __mark_reg_not_init(struct bpf_reg_state *reg);

/* Mark the unknown part of a register (variable offset or scalar value) as
 * known to have the value @imm.
 */
static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
	reg->id = 0;
	reg->var_off = tnum_const(imm);
	reg->smin_value = (s64)imm;
	reg->smax_value = (s64)imm;
	reg->umin_value = imm;
	reg->umax_value = imm;
}

/* Mark the 'variable offset' part of a register as zero.  This should be
 * used only on registers holding a pointer type.
 */
static void __mark_reg_known_zero(struct bpf_reg_state *reg)
{
	__mark_reg_known(reg, 0);
}
static void __mark_reg_const_zero(struct bpf_reg_state *reg)
{
	__mark_reg_known(reg, 0);
	reg->off = 0;
	reg->type = SCALAR_VALUE;
}

static void mark_reg_known_zero(struct bpf_verifier_env *env,
				struct bpf_reg_state *regs, u32 regno)
{
	if (WARN_ON(regno >= MAX_BPF_REG)) {
		verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
		/* Something bad happened, let's kill all regs */
		for (regno = 0; regno < MAX_BPF_REG; regno++)
			__mark_reg_not_init(regs + regno);
		return;
	}
	__mark_reg_known_zero(regs + regno);
}

static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
{
	return type_is_pkt_pointer(reg->type);
}

static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
{
	return reg_is_pkt_pointer(reg) ||
	       reg->type == PTR_TO_PACKET_END;
}

/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
				    enum bpf_reg_type which)
{
	/* The register can already have a range from prior markings.
	 * This is fine as long as it hasn't been advanced from its
	 * origin.
	 */
	return reg->type == which &&
	       reg->id == 0 &&
	       reg->off == 0 &&
	       tnum_equals_const(reg->var_off, 0);
}

/* Attempts to improve min/max values based on var_off information */
static void __update_reg_bounds(struct bpf_reg_state *reg)
{
	/* min signed is max(sign bit) | min(other bits) */
	reg->smin_value = max_t(s64, reg->smin_value,
				reg->var_off.value | (reg->var_off.mask & S64_MIN));
	/* max signed is min(sign bit) | max(other bits) */
	reg->smax_value = min_t(s64, reg->smax_value,
				reg->var_off.value | (reg->var_off.mask & S64_MAX));
	reg->umin_value = max(reg->umin_value, reg->var_off.value);
	reg->umax_value = min(reg->umax_value,
			      reg->var_off.value | reg->var_off.mask);
}

/* Uses signed min/max values to inform unsigned, and vice-versa */
static void __reg_deduce_bounds(struct bpf_reg_state *reg)
{
	/* Learn sign from signed bounds.
	 * If we cannot cross the sign boundary, then signed and unsigned bounds
	 * are the same, so combine.  This works even in the negative case, e.g.
	 * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
	 */
	if (reg->smin_value >= 0 || reg->smax_value < 0) {
		reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
							  reg->umin_value);
		reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
							  reg->umax_value);
		return;
	}
	/* Learn sign from unsigned bounds.  Signed bounds cross the sign
	 * boundary, so we must be careful.
	 */
	if ((s64)reg->umax_value >= 0) {
		/* Positive.  We can't learn anything from the smin, but smax
		 * is positive, hence safe.
		 */
		reg->smin_value = reg->umin_value;
		reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
							  reg->umax_value);
	} else if ((s64)reg->umin_value < 0) {
		/* Negative.  We can't learn anything from the smax, but smin
		 * is negative, hence safe.
		 */
		reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
							  reg->umin_value);
		reg->smax_value = reg->umax_value;
	}
}
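
/* Worked example (illustrative): if only the unsigned bounds are known,
 * say umin_value == 1 and umax_value == 0x7fffffff, then (s64)umax_value
 * is >= 0 and the "positive" branch above applies, so smin_value becomes 1
 * and smax_value becomes 0x7fffffff.  If umax_value were instead U64_MAX,
 * the unsigned range would cross the sign boundary and neither branch
 * could refine the signed bounds.
 */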

/* Attempts to improve var_off based on unsigned min/max information */
static void __reg_bound_offset(struct bpf_reg_state *reg)
{
	reg->var_off = tnum_intersect(reg->var_off,
				      tnum_range(reg->umin_value,
						 reg->umax_value));
}

/* Reset the min/max bounds of a register */
static void __mark_reg_unbounded(struct bpf_reg_state *reg)
{
	reg->smin_value = S64_MIN;
	reg->smax_value = S64_MAX;
	reg->umin_value = 0;
	reg->umax_value = U64_MAX;
}

/* Mark a register as having a completely unknown (scalar) value. */
static void __mark_reg_unknown(struct bpf_reg_state *reg)
{
	reg->type = SCALAR_VALUE;
	reg->id = 0;
	reg->off = 0;
	reg->var_off = tnum_unknown;
	reg->frameno = 0;
	__mark_reg_unbounded(reg);
}

static void mark_reg_unknown(struct bpf_verifier_env *env,
			     struct bpf_reg_state *regs, u32 regno)
{
	if (WARN_ON(regno >= MAX_BPF_REG)) {
		verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
		/* Something bad happened, let's kill all regs except FP */
		for (regno = 0; regno < BPF_REG_FP; regno++)
			__mark_reg_not_init(regs + regno);
		return;
	}
	__mark_reg_unknown(regs + regno);
}

static void __mark_reg_not_init(struct bpf_reg_state *reg)
{
	__mark_reg_unknown(reg);
	reg->type = NOT_INIT;
}

static void mark_reg_not_init(struct bpf_verifier_env *env,
			      struct bpf_reg_state *regs, u32 regno)
{
	if (WARN_ON(regno >= MAX_BPF_REG)) {
		verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
		/* Something bad happened, let's kill all regs except FP */
		for (regno = 0; regno < BPF_REG_FP; regno++)
			__mark_reg_not_init(regs + regno);
		return;
	}
	__mark_reg_not_init(regs + regno);
}

static void init_reg_state(struct bpf_verifier_env *env,
			   struct bpf_func_state *state)
{
	struct bpf_reg_state *regs = state->regs;
	int i;

	for (i = 0; i < MAX_BPF_REG; i++) {
		mark_reg_not_init(env, regs, i);
		regs[i].live = REG_LIVE_NONE;
	}

	/* frame pointer */
	regs[BPF_REG_FP].type = PTR_TO_STACK;
	mark_reg_known_zero(env, regs, BPF_REG_FP);
	regs[BPF_REG_FP].frameno = state->frameno;

	/* 1st arg to a function */
	regs[BPF_REG_1].type = PTR_TO_CTX;
	mark_reg_known_zero(env, regs, BPF_REG_1);
}

#define BPF_MAIN_FUNC (-1)
static void init_func_state(struct bpf_verifier_env *env,
			    struct bpf_func_state *state,
			    int callsite, int frameno, int subprogno)
{
	state->callsite = callsite;
	state->frameno = frameno;
	state->subprogno = subprogno;
	init_reg_state(env, state);
}

713 714 715 716 717 718
enum reg_arg_type {
	SRC_OP,		/* register is used as source operand */
	DST_OP,		/* register is used as destination operand */
	DST_OP_NO_MARK	/* same as above, check only, don't mark */
};

719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774
static int cmp_subprogs(const void *a, const void *b)
{
	return *(int *)a - *(int *)b;
}

static int find_subprog(struct bpf_verifier_env *env, int off)
{
	u32 *p;

	p = bsearch(&off, env->subprog_starts, env->subprog_cnt,
		    sizeof(env->subprog_starts[0]), cmp_subprogs);
	if (!p)
		return -ENOENT;
	return p - env->subprog_starts;

}

static int add_subprog(struct bpf_verifier_env *env, int off)
{
	int insn_cnt = env->prog->len;
	int ret;

	if (off >= insn_cnt || off < 0) {
		verbose(env, "call to invalid destination\n");
		return -EINVAL;
	}
	ret = find_subprog(env, off);
	if (ret >= 0)
		return 0;
	if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
		verbose(env, "too many subprograms\n");
		return -E2BIG;
	}
	env->subprog_starts[env->subprog_cnt++] = off;
	sort(env->subprog_starts, env->subprog_cnt,
	     sizeof(env->subprog_starts[0]), cmp_subprogs, NULL);
	return 0;
}

static int check_subprogs(struct bpf_verifier_env *env)
{
	int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
	struct bpf_insn *insn = env->prog->insnsi;
	int insn_cnt = env->prog->len;

	/* determine subprog starts. The end is one before the next starts */
	for (i = 0; i < insn_cnt; i++) {
		if (insn[i].code != (BPF_JMP | BPF_CALL))
			continue;
		if (insn[i].src_reg != BPF_PSEUDO_CALL)
			continue;
		if (!env->allow_ptr_leaks) {
			verbose(env, "function calls to other bpf functions are allowed for root only\n");
			return -EPERM;
		}
		if (bpf_prog_is_dev_bound(env->prog->aux)) {
			verbose(env, "function calls in offloaded programs are not supported yet\n");
			return -EINVAL;
		}
		ret = add_subprog(env, i + insn[i].imm + 1);
		if (ret < 0)
			return ret;
	}

	if (env->log.level > 1)
		for (i = 0; i < env->subprog_cnt; i++)
			verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]);

	/* now check that all jumps are within the same subprog */
	subprog_start = 0;
	if (env->subprog_cnt == cur_subprog)
		subprog_end = insn_cnt;
	else
		subprog_end = env->subprog_starts[cur_subprog++];
	for (i = 0; i < insn_cnt; i++) {
		u8 code = insn[i].code;

		if (BPF_CLASS(code) != BPF_JMP)
			goto next;
		if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
			goto next;
		off = i + insn[i].off + 1;
		if (off < subprog_start || off >= subprog_end) {
			verbose(env, "jump out of range from insn %d to %d\n", i, off);
			return -EINVAL;
		}
next:
		if (i == subprog_end - 1) {
			/* to avoid fall-through from one subprog into another
			 * the last insn of the subprog should be either exit
			 * or unconditional jump back
			 */
			if (code != (BPF_JMP | BPF_EXIT) &&
			    code != (BPF_JMP | BPF_JA)) {
				verbose(env, "last insn is not an exit or jmp\n");
				return -EINVAL;
			}
			subprog_start = subprog_end;
			if (env->subprog_cnt == cur_subprog)
				subprog_end = insn_cnt;
			else
				subprog_end = env->subprog_starts[cur_subprog++];
		}
	}
	return 0;
}
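
/* Example (illustrative) of the layout this enforces:
 *
 *   insn 0: r1 = 0
 *   insn 1: call pc+1     // BPF_PSEUDO_CALL, target is insn 3
 *   insn 2: exit
 *   insn 3: r0 = 1
 *   insn 4: exit
 *
 * add_subprog() records insn 3 as a subprogram start, so insns 0-2 form the
 * main function and insns 3-4 the callee.  A BPF_JMP with an offset that
 * crosses the boundary between the two ranges is rejected above as
 * "jump out of range".
 */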

static
struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env,
				       const struct bpf_verifier_state *state,
				       struct bpf_verifier_state *parent,
				       u32 regno)
{
	struct bpf_verifier_state *tmp = NULL;

	/* 'parent' could be a state of caller and
	 * 'state' could be a state of callee. In such case
	 * parent->curframe < state->curframe
	 * and it's ok for r1 - r5 registers
	 *
	 * 'parent' could be a callee's state after it bpf_exit-ed.
	 * In such case parent->curframe > state->curframe
	 * and it's ok for r0 only
	 */
	if (parent->curframe == state->curframe ||
	    (parent->curframe < state->curframe &&
	     regno >= BPF_REG_1 && regno <= BPF_REG_5) ||
	    (parent->curframe > state->curframe &&
	       regno == BPF_REG_0))
		return parent;

	if (parent->curframe > state->curframe &&
	    regno >= BPF_REG_6) {
		/* for callee saved regs we have to skip the whole chain
		 * of states that belong to callee and mark as LIVE_READ
		 * the registers before the call
		 */
		tmp = parent;
		while (tmp && tmp->curframe != state->curframe) {
			tmp = tmp->parent;
		}
		if (!tmp)
			goto bug;
		parent = tmp;
	} else {
		goto bug;
	}
	return parent;
bug:
	verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp);
	verbose(env, "regno %d parent frame %d current frame %d\n",
		regno, parent->curframe, state->curframe);
871
	return NULL;
872 873 874 875 876 877 878 879
}

static int mark_reg_read(struct bpf_verifier_env *env,
			 const struct bpf_verifier_state *state,
			 struct bpf_verifier_state *parent,
			 u32 regno)
{
	bool writes = parent == state->parent; /* Observe write marks */
	if (regno == BPF_REG_FP)
		/* We don't need to worry about FP liveness because it's read-only */
883
		return 0;
	while (parent) {
		/* if read wasn't screened by an earlier write ... */
887
		if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN)
888
			break;
889 890 891
		parent = skip_callee(env, state, parent, regno);
		if (!parent)
			return -EFAULT;
892
		/* ... then we depend on parent's value */
893
		parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ;
894 895
		state = parent;
		parent = state->parent;
896
		writes = true;
897
	}
898
	return 0;
899 900 901
}

static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
902 903
			 enum reg_arg_type t)
{
904 905 906
	struct bpf_verifier_state *vstate = env->cur_state;
	struct bpf_func_state *state = vstate->frame[vstate->curframe];
	struct bpf_reg_state *regs = state->regs;
907

908
	if (regno >= MAX_BPF_REG) {
909
		verbose(env, "R%d is invalid\n", regno);
910 911 912 913 914 915
		return -EINVAL;
	}

	if (t == SRC_OP) {
		/* check whether register used as source operand can be read */
		if (regs[regno].type == NOT_INIT) {
916
			verbose(env, "R%d !read_ok\n", regno);
917 918
			return -EACCES;
		}
919
		return mark_reg_read(env, vstate, vstate->parent, regno);
920 921 922
	} else {
		/* check whether register used as dest operand can be written to */
		if (regno == BPF_REG_FP) {
923
			verbose(env, "frame pointer is read only\n");
924 925
			return -EACCES;
		}
926
		regs[regno].live |= REG_LIVE_WRITTEN;
927
		if (t == DST_OP)
928
			mark_reg_unknown(env, regs, regno);
929 930 931 932
	}
	return 0;
}

933 934 935 936 937 938 939
static bool is_spillable_regtype(enum bpf_reg_type type)
{
	switch (type) {
	case PTR_TO_MAP_VALUE:
	case PTR_TO_MAP_VALUE_OR_NULL:
	case PTR_TO_STACK:
	case PTR_TO_CTX:
	case PTR_TO_PACKET:
941
	case PTR_TO_PACKET_META:
	case PTR_TO_PACKET_END:
943 944 945 946 947 948 949
	case CONST_PTR_TO_MAP:
		return true;
	default:
		return false;
	}
}

950 951 952 953 954 955
/* Does this register contain a constant zero? */
static bool register_is_null(struct bpf_reg_state *reg)
{
	return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
}

956 957 958
/* check_stack_read/write functions track spill/fill of registers,
 * stack boundary and alignment are checked in check_mem_access()
 */
959
static int check_stack_write(struct bpf_verifier_env *env,
960 961
			     struct bpf_func_state *state, /* func where register points to */
			     int off, int size, int value_regno)
962
{
963
	struct bpf_func_state *cur; /* state of the current function */
964
	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
965
	enum bpf_reg_type type;
966

967 968
	err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
				 true);
969 970
	if (err)
		return err;
971 972 973
	/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
	 * so it's aligned access and [off, off + size) are within stack limits
	 */
974 975 976 977 978 979
	if (!env->allow_ptr_leaks &&
	    state->stack[spi].slot_type[0] == STACK_SPILL &&
	    size != BPF_REG_SIZE) {
		verbose(env, "attempt to corrupt spilled pointer on stack\n");
		return -EACCES;
	}
980

981
	cur = env->cur_state->frame[env->cur_state->curframe];
982
	if (value_regno >= 0 &&
983
	    is_spillable_regtype((type = cur->regs[value_regno].type))) {
984 985

		/* register containing pointer is being spilled into stack */
986
		if (size != BPF_REG_SIZE) {
987
			verbose(env, "invalid size of register spill\n");
988 989 990
			return -EACCES;
		}

991 992 993 994 995
		if (state != cur && type == PTR_TO_STACK) {
			verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
			return -EINVAL;
		}

996
		/* save register state */
997
		state->stack[spi].spilled_ptr = cur->regs[value_regno];
998
		state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
999

1000
		for (i = 0; i < BPF_REG_SIZE; i++)
1001
			state->stack[spi].slot_type[i] = STACK_SPILL;
1002
	} else {
1003 1004
		u8 type = STACK_MISC;

1005
		/* regular write of data into stack */
1006
		state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
1007

		/* only mark the slot as written if all 8 bytes were written
		 * otherwise read propagation may incorrectly stop too soon
		 * when stack slots are partially written.
		 * This heuristic means that read propagation will be
		 * conservative, since it will add reg_live_read marks
		 * to stack slots all the way to first state when programs
		 * writes+reads less than 8 bytes
		 */
		if (size == BPF_REG_SIZE)
			state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;

		/* when we zero initialize stack slots mark them as such */
		if (value_regno >= 0 &&
		    register_is_null(&cur->regs[value_regno]))
			type = STACK_ZERO;

1024
		for (i = 0; i < size; i++)
1025
			state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
1026
				type;
	}
	return 0;
}
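
/* Example (illustrative): a store such as
 *
 *   BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8)
 *
 * spills R1 (PTR_TO_CTX at program entry) into fp-8.  Because the type is
 * spillable and the store is a full BPF_REG_SIZE bytes, all eight slot_type
 * bytes become STACK_SPILL and spilled_ptr keeps the complete register
 * state, so a later 8-byte fill from fp-8 restores a usable ctx pointer.
 */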

/* registers of every function are unique and mark_reg_read() propagates
 * the liveness in the following cases:
 * - from callee into caller for R1 - R5 that were used as arguments
 * - from caller into callee for R0 that used as result of the call
 * - from caller to the same caller skipping states of the callee for R6 - R9,
 *   since R6 - R9 are callee saved by implicit function prologue and
 *   caller's R6 != callee's R6, so when we propagate liveness up to
 *   parent states we need to skip callee states for R6 - R9.
 *
 * stack slot marking is different, since stacks of caller and callee are
 * accessible in both (since caller can pass a pointer to caller's stack to
 * callee which can pass it to another function), hence mark_stack_slot_read()
 * has to propagate the stack liveness to all parent states at given frame number.
 * Consider code:
 * f1() {
 *   ptr = fp - 8;
 *   *ptr = ctx;
 *   call f2 {
 *      .. = *ptr;
 *   }
 *   .. = *ptr;
 * }
 * First *ptr is reading from f1's stack and mark_stack_slot_read() has
 * to mark liveness at the f1's frame and not f2's frame.
 * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has
 * to propagate liveness to f2 states at f1's frame level and further into
 * f1 states at f1's frame level until write into that stack slot
 */
static void mark_stack_slot_read(struct bpf_verifier_env *env,
				 const struct bpf_verifier_state *state,
				 struct bpf_verifier_state *parent,
				 int slot, int frameno)
1063
{
1064
	bool writes = parent == state->parent; /* Observe write marks */
1065 1066

	while (parent) {
		if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE)
			/* since LIVE_WRITTEN mark is only done for full 8-byte
			 * write the read marks are conservative and parent
			 * state may not even have the stack allocated. In such case
			 * end the propagation, since the loop reached beginning
			 * of the function
			 */
			break;
1075
		/* if read wasn't screened by an earlier write ... */
1076
		if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
1077 1078
			break;
		/* ... then we depend on parent's value */
1079
		parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
1080 1081
		state = parent;
		parent = state->parent;
1082
		writes = true;
1083 1084 1085
	}
}

1086
static int check_stack_read(struct bpf_verifier_env *env,
1087 1088
			    struct bpf_func_state *reg_state /* func where register points to */,
			    int off, int size, int value_regno)
1089
{
1090 1091
	struct bpf_verifier_state *vstate = env->cur_state;
	struct bpf_func_state *state = vstate->frame[vstate->curframe];
1092 1093
	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
	u8 *stype;
1094

1095
	if (reg_state->allocated_stack <= slot) {
1096 1097 1098 1099
		verbose(env, "invalid read from stack off %d+0 size %d\n",
			off, size);
		return -EACCES;
	}
1100
	stype = reg_state->stack[spi].slot_type;
1101

1102
	if (stype[0] == STACK_SPILL) {
1103
		if (size != BPF_REG_SIZE) {
1104
			verbose(env, "invalid size of register spill\n");
1105 1106
			return -EACCES;
		}
1107
		for (i = 1; i < BPF_REG_SIZE; i++) {
1108
			if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
1109
				verbose(env, "corrupted spill memory\n");
1110 1111 1112 1113
				return -EACCES;
			}
		}

1114
		if (value_regno >= 0) {
1115
			/* restore register state from stack */
1116
			state->regs[value_regno] = reg_state->stack[spi].spilled_ptr;
1117 1118 1119 1120 1121
			/* mark reg as written since spilled pointer state likely
			 * has its liveness marks cleared by is_state_visited()
			 * which resets stack/reg liveness for state transitions
			 */
			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
1122
		}
1123 1124
		mark_stack_slot_read(env, vstate, vstate->parent, spi,
				     reg_state->frameno);
1125 1126
		return 0;
	} else {
1127 1128
		int zeros = 0;

1129
		for (i = 0; i < size; i++) {
1130 1131 1132 1133 1134
			if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC)
				continue;
			if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) {
				zeros++;
				continue;
1135
			}
			verbose(env, "invalid read from stack off %d+%d size %d\n",
				off, i, size);
			return -EACCES;
		}
		mark_stack_slot_read(env, vstate, vstate->parent, spi,
				     reg_state->frameno);
		if (value_regno >= 0) {
			if (zeros == size) {
				/* any size read into register is zero extended,
				 * so the whole register == const_zero
				 */
				__mark_reg_const_zero(&state->regs[value_regno]);
			} else {
				/* have read misc data from the stack */
				mark_reg_unknown(env, state->regs, value_regno);
			}
			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
		}
		return 0;
	}
}

/* check read/write into map element returned by bpf_map_lookup_elem() */
1159
static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
1160
			      int size, bool zero_size_allowed)
1161
{
1162 1163
	struct bpf_reg_state *regs = cur_regs(env);
	struct bpf_map *map = regs[regno].map_ptr;
1164

1165 1166
	if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
	    off + size > map->value_size) {
1167
		verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
			map->value_size, off, size);
		return -EACCES;
	}
	return 0;
}

1174 1175
/* check read/write into a map element with possible variable offset */
static int check_map_access(struct bpf_verifier_env *env, u32 regno,
1176
			    int off, int size, bool zero_size_allowed)
1177
{
1178 1179
	struct bpf_verifier_state *vstate = env->cur_state;
	struct bpf_func_state *state = vstate->frame[vstate->curframe];
1180 1181 1182
	struct bpf_reg_state *reg = &state->regs[regno];
	int err;

1183 1184 1185
	/* We may have adjusted the register to this map value, so we
	 * need to try adding each of min_value and max_value to off
	 * to make sure our theoretical access will be safe.
1186
	 */
1187 1188
	if (env->log.level)
		print_verifier_state(env, state);
1189 1190 1191 1192 1193 1194
	/* The minimum value is only important with signed
	 * comparisons where we can't assume the floor of a
	 * value is 0.  If we are using signed variables for our
	 * indexes we need to make sure that whatever we use
	 * will have a set floor within our range.
	 */
1195
	if (reg->smin_value < 0) {
1196
		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
1197 1198 1199
			regno);
		return -EACCES;
	}
1200 1201
	err = __check_map_access(env, regno, reg->smin_value + off, size,
				 zero_size_allowed);
1202
	if (err) {
1203 1204
		verbose(env, "R%d min value is outside of the array range\n",
			regno);
1205 1206 1207
		return err;
	}

1208 1209 1210
	/* If we haven't set a max value then we need to bail since we can't be
	 * sure we won't do bad things.
	 * If reg->umax_value + off could overflow, treat that as unbounded too.
1211
	 */
1212
	if (reg->umax_value >= BPF_MAX_VAR_OFF) {
1213
		verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n",
1214 1215 1216
			regno);
		return -EACCES;
	}
1217 1218
	err = __check_map_access(env, regno, reg->umax_value + off, size,
				 zero_size_allowed);
1219
	if (err)
1220 1221
		verbose(env, "R%d max value is outside of the array range\n",
			regno);
	return err;
}
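
/* Example (illustrative): if the map value pointer in R0 was adjusted by a
 * register known to lie in [0, 16] (smin_value == 0, umax_value == 16), an
 * 8-byte access at off == 24 is checked twice above: at smin_value + 24 = 24
 * and at umax_value + 24 = 40, so the map's value_size must be at least 48
 * for the access to be allowed.
 */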

#define MAX_PACKET_OFF 0xffff

1227
static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
1228 1229
				       const struct bpf_call_arg_meta *meta,
				       enum bpf_access_type t)
1230
{
1231
	switch (env->prog->type) {
1232 1233 1234 1235 1236
	case BPF_PROG_TYPE_LWT_IN:
	case BPF_PROG_TYPE_LWT_OUT:
		/* dst_input() and dst_output() can't write for now */
		if (t == BPF_WRITE)
			return false;
1237
		/* fallthrough */
1238 1239
	case BPF_PROG_TYPE_SCHED_CLS:
	case BPF_PROG_TYPE_SCHED_ACT:
1240
	case BPF_PROG_TYPE_XDP:
1241
	case BPF_PROG_TYPE_LWT_XMIT:
1242
	case BPF_PROG_TYPE_SK_SKB:
1243 1244 1245 1246
		if (meta)
			return meta->pkt_access;

		env->seen_direct_write = true;
		return true;
	default:
		return false;
	}
}

1253
static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
1254
				 int off, int size, bool zero_size_allowed)
{
1256
	struct bpf_reg_state *regs = cur_regs(env);
1257
	struct bpf_reg_state *reg = &regs[regno];
	if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
	    (u64)off + size > reg->range) {
1261
		verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
1262
			off, size, regno, reg->id, reg->off, reg->range);
		return -EACCES;
	}
	return 0;
}

1268
static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
1269
			       int size, bool zero_size_allowed)
1270
{
1271
	struct bpf_reg_state *regs = cur_regs(env);
	struct bpf_reg_state *reg = &regs[regno];
	int err;

	/* We may have added a variable offset to the packet pointer; but any
	 * reg->range we have comes after that.  We are only checking the fixed
	 * offset.
	 */

	/* We don't allow negative numbers, because we aren't tracking enough
	 * detail to prove they're safe.
	 */
1283
	if (reg->smin_value < 0) {
1284
		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
1285 1286 1287
			regno);
		return -EACCES;
	}
1288
	err = __check_packet_access(env, regno, off, size, zero_size_allowed);
1289
	if (err) {
1290
		verbose(env, "R%d offset is outside of the packet\n", regno);
		return err;
	}
	return err;
}

/* check access to 'struct bpf_context' fields.  Supports fixed offsets only */
1297
static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
1298
			    enum bpf_access_type t, enum bpf_reg_type *reg_type)
1299
{
1300 1301 1302
	struct bpf_insn_access_aux info = {
		.reg_type = *reg_type,
	};
1303

1304 1305
	if (env->ops->is_valid_access &&
	    env->ops->is_valid_access(off, size, t, &info)) {
		/* A non zero info.ctx_field_size indicates that this field is a
		 * candidate for later verifier transformation to load the whole
		 * field and then apply a mask when accessed with a narrower
		 * access than actual ctx access size. A zero info.ctx_field_size
		 * will only allow for whole field access and rejects any other
		 * type of narrower access.
1312
		 */
1313
		*reg_type = info.reg_type;
1314

1315
		env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
1316 1317 1318
		/* remember the offset of last byte accessed in ctx */
		if (env->prog->aux->max_ctx_offset < off + size)
			env->prog->aux->max_ctx_offset = off + size;
1319
		return 0;
1320
	}
1321

1322
	verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size);
1323 1324 1325
	return -EACCES;
}

1326 1327
static bool __is_pointer_value(bool allow_ptr_leaks,
			       const struct bpf_reg_state *reg)
1328
{
1329
	if (allow_ptr_leaks)
1330 1331
		return false;

1332
	return reg->type != SCALAR_VALUE;
1333 1334
}

1335 1336
static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
{
1337
	return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
1338 1339
}

1340 1341
static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
				   const struct bpf_reg_state *reg,
1342
				   int off, int size, bool strict)
{
1344
	struct tnum reg_off;
1345
	int ip_align;
1346 1347 1348 1349 1350

	/* Byte size accesses are always allowed. */
	if (!strict || size == 1)
		return 0;

	/* For platforms that do not have a Kconfig enabling
	 * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of
	 * NET_IP_ALIGN is universally set to '2'.  And on platforms
	 * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get
	 * to this code only in strict mode where we want to emulate
	 * the NET_IP_ALIGN==2 checking.  Therefore use an
	 * unconditional IP align value of '2'.
1358
	 */
1359
	ip_align = 2;
1360 1361 1362 1363 1364 1365

	reg_off = tnum_add(reg->var_off, tnum_const(ip_align + reg->off + off));
	if (!tnum_is_aligned(reg_off, size)) {
		char tn_buf[48];

		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
1366 1367
		verbose(env,
			"misaligned packet access off %d+%s+%d+%d size %d\n",
1368
			ip_align, tn_buf, reg->off, off, size);
A
	}
1371

A
Alexei Starovoitov 已提交
1372 1373 1374
	return 0;
}

static int check_generic_ptr_alignment(struct bpf_verifier_env *env,
				       const struct bpf_reg_state *reg,
1377 1378
				       const char *pointer_desc,
				       int off, int size, bool strict)
1379
{
	struct tnum reg_off;

	/* Byte size accesses are always allowed. */
	if (!strict || size == 1)
		return 0;

	reg_off = tnum_add(reg->var_off, tnum_const(reg->off + off));
	if (!tnum_is_aligned(reg_off, size)) {
		char tn_buf[48];

		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
1391
		verbose(env, "misaligned %saccess off %s+%d+%d size %d\n",
1392
			pointer_desc, tn_buf, reg->off, off, size);
1393 1394 1395
		return -EACCES;
	}

A
}

1399 1400
static int check_ptr_alignment(struct bpf_verifier_env *env,
			       const struct bpf_reg_state *reg,
1401 1402
			       int off, int size)
{
1403
	bool strict = env->strict_alignment;
1404
	const char *pointer_desc = "";
1405

1406 1407
	switch (reg->type) {
	case PTR_TO_PACKET:
1408 1409 1410 1411
	case PTR_TO_PACKET_META:
		/* Special case, because of NET_IP_ALIGN. Given metadata sits
		 * right in front, treat it the very same way.
		 */
1412
		return check_pkt_ptr_alignment(env, reg, off, size, strict);
	case PTR_TO_MAP_VALUE:
		pointer_desc = "value ";
		break;
	case PTR_TO_CTX:
		pointer_desc = "context ";
		break;
	case PTR_TO_STACK:
		pointer_desc = "stack ";
		/* The stack spill tracking logic in check_stack_write()
		 * and check_stack_read() relies on stack accesses being
		 * aligned.
		 */
		strict = true;
1426
		break;
1427
	default:
1428
		break;
1429
	}
1430 1431
	return check_generic_ptr_alignment(env, reg, pointer_desc, off, size,
					   strict);
1432 1433
}

1434 1435 1436 1437
static int update_stack_depth(struct bpf_verifier_env *env,
			      const struct bpf_func_state *func,
			      int off)
{
1438
	u16 stack = env->subprog_stack_depth[func->subprogno];

	if (stack >= -off)
		return 0;

	/* update known max for given subprogram */
	env->subprog_stack_depth[func->subprogno] = -off;
1445 1446
	return 0;
}
1447

/* starting from main bpf function walk all instructions of the function
 * and recursively walk all callees that given function can call.
 * Ignore jump and exit insns.
 * Since recursion is prevented by check_cfg() this algorithm
 * only needs a local stack of MAX_CALL_FRAMES to remember callsites
 */
static int check_max_stack_depth(struct bpf_verifier_env *env)
{
	int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end;
	struct bpf_insn *insn = env->prog->insnsi;
	int insn_cnt = env->prog->len;
	int ret_insn[MAX_CALL_FRAMES];
	int ret_prog[MAX_CALL_FRAMES];
1461

process_func:
	/* round up to 32-bytes, since this is granularity
	 * of interpreter stack size
	 */
	depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
	if (depth > MAX_BPF_STACK) {
1468
		verbose(env, "combined stack size of %d calls is %d. Too large\n",
1469
			frame + 1, depth);
1470 1471
		return -EACCES;
	}
continue_func:
	if (env->subprog_cnt == subprog)
		subprog_end = insn_cnt;
	else
		subprog_end = env->subprog_starts[subprog];
	for (; i < subprog_end; i++) {
		if (insn[i].code != (BPF_JMP | BPF_CALL))
			continue;
		if (insn[i].src_reg != BPF_PSEUDO_CALL)
			continue;
		/* remember insn and function to return to */
		ret_insn[frame] = i + 1;
		ret_prog[frame] = subprog;

		/* find the callee */
		i = i + insn[i].imm + 1;
		subprog = find_subprog(env, i);
		if (subprog < 0) {
			WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
				  i);
			return -EFAULT;
		}
		subprog++;
		frame++;
		if (frame >= MAX_CALL_FRAMES) {
			WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
			return -EFAULT;
		}
		goto process_func;
	}
	/* end of for() loop means the last insn of the 'subprog'
	 * was reached. Doesn't matter whether it was JA or EXIT
	 */
	if (frame == 0)
		return 0;
	depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
	frame--;
	i = ret_insn[frame];
	subprog = ret_prog[frame];
	goto continue_func;
}
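
/* Example (illustrative): if the main function uses 40 bytes of stack and
 * calls one subprogram that uses 80, the loop above rounds each up to a
 * multiple of 32 and sums along the deepest chain: 64 + 96 = 160 bytes,
 * which is within MAX_BPF_STACK (512), so the program is accepted.
 */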

static int get_callee_stack_depth(struct bpf_verifier_env *env,
				  const struct bpf_insn *insn, int idx)
{
	int start = idx + insn->imm + 1, subprog;

	subprog = find_subprog(env, start);
	if (subprog < 0) {
		WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
			  start);
		return -EFAULT;
	}
	subprog++;
	return env->subprog_stack_depth[subprog];
}

/* truncate register to smaller size (in bytes)
 * must be called with size < BPF_REG_SIZE
 */
static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
{
	u64 mask;

	/* clear high bits in bit representation */
	reg->var_off = tnum_cast(reg->var_off, size);

	/* fix arithmetic bounds */
	mask = ((u64)1 << (size * 8)) - 1;
	if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) {
		reg->umin_value &= mask;
		reg->umax_value &= mask;
	} else {
		reg->umin_value = 0;
		reg->umax_value = mask;
	}
	reg->smin_value = reg->umin_value;
	reg->smax_value = reg->umax_value;
}
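
/* Example (illustrative): coercing a register known to lie in [0, 0x1ff]
 * down to size == 1 keeps only the low byte.  The old umin/umax differ in
 * the bits being cleared, so the bounds fall back to the conservative
 * [0, 0xff], and the signed bounds are then copied from the unsigned ones.
 */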

/* check whether memory at (regno + off) is accessible for t = (read | write)
 * if t==write, value_regno is a register which value is stored into memory
 * if t==read, value_regno is a register which will receive the value from memory
 * if t==write && value_regno==-1, some unknown value is stored into memory
 * if t==read && value_regno==-1, don't care what we read from memory
 */
1558
static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off,
1559 1560 1561
			    int bpf_size, enum bpf_access_type t,
			    int value_regno)
{
1562 1563
	struct bpf_reg_state *regs = cur_regs(env);
	struct bpf_reg_state *reg = regs + regno;
1564
	struct bpf_func_state *state;
	int size, err = 0;

	size = bpf_size_to_bytes(bpf_size);
	if (size < 0)
		return size;

1571
	/* alignment checks will add in reg->off themselves */
1572
	err = check_ptr_alignment(env, reg, off, size);
	if (err)
		return err;
1575

1576 1577 1578 1579
	/* for access checks, reg->off is just part of off */
	off += reg->off;

	if (reg->type == PTR_TO_MAP_VALUE) {
1580 1581
		if (t == BPF_WRITE && value_regno >= 0 &&
		    is_pointer_value(env, value_regno)) {
1582
			verbose(env, "R%d leaks addr into map\n", value_regno);
1583 1584
			return -EACCES;
		}
1585

1586
		err = check_map_access(env, regno, off, size, false);
1587
		if (!err && t == BPF_READ && value_regno >= 0)
1588
			mark_reg_unknown(env, regs, value_regno);
	} else if (reg->type == PTR_TO_CTX) {
1591
		enum bpf_reg_type reg_type = SCALAR_VALUE;
1592

1593 1594
		if (t == BPF_WRITE && value_regno >= 0 &&
		    is_pointer_value(env, value_regno)) {
1595
			verbose(env, "R%d leaks addr into ctx\n", value_regno);
1596 1597
			return -EACCES;
		}
1598 1599 1600
		/* ctx accesses must be at a fixed offset, so that we can
		 * determine what type of data were returned.
		 */
1601
		if (reg->off) {
1602 1603
			verbose(env,
				"dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
1604 1605 1606 1607
				regno, reg->off, off - reg->off);
			return -EACCES;
		}
		if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
			char tn_buf[48];

			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
			verbose(env,
				"variable ctx access var_off=%s off=%d size=%d",
				tn_buf, off, size);
			return -EACCES;
		}
		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
		if (!err && t == BPF_READ && value_regno >= 0) {
			/* ctx access returns either a scalar, or a
			 * PTR_TO_PACKET[_META,_END]. In the latter
			 * case, we know the offset is zero.
			 */
			if (reg_type == SCALAR_VALUE)
				mark_reg_unknown(env, regs, value_regno);
			else
				mark_reg_known_zero(env, regs,
						    value_regno);
			regs[value_regno].id = 0;
			regs[value_regno].off = 0;
			regs[value_regno].range = 0;
			regs[value_regno].type = reg_type;
		}

	} else if (reg->type == PTR_TO_STACK) {
		/* stack accesses must be at a fixed offset, so that we can
		 * determine what type of data were returned.
		 * See check_stack_read().
		 */
		if (!tnum_is_const(reg->var_off)) {
			char tn_buf[48];

			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
			verbose(env, "variable stack access var_off=%s off=%d size=%d",
				tn_buf, off, size);
			return -EACCES;
		}
		off += reg->var_off.value;
		if (off >= 0 || off < -MAX_BPF_STACK) {
			verbose(env, "invalid stack off=%d size=%d\n", off,
				size);
			return -EACCES;
		}

		state = func(env, reg);
		err = update_stack_depth(env, state, off);
		if (err)
			return err;

		if (t == BPF_WRITE)
			err = check_stack_write(env, state, off, size,
						value_regno);
		else
			err = check_stack_read(env, state, off, size,
					       value_regno);
	} else if (reg_is_pkt_pointer(reg)) {
		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
			verbose(env, "cannot write into packet\n");
			return -EACCES;
		}
		if (t == BPF_WRITE && value_regno >= 0 &&
		    is_pointer_value(env, value_regno)) {
			verbose(env, "R%d leaks addr into packet\n",
				value_regno);
			return -EACCES;
		}
		err = check_packet_access(env, regno, off, size, false);
		if (!err && t == BPF_READ && value_regno >= 0)
			mark_reg_unknown(env, regs, value_regno);
	} else {
		verbose(env, "R%d invalid mem access '%s'\n", regno,
			reg_type_str[reg->type]);
		return -EACCES;
	}

	if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
	    regs[value_regno].type == SCALAR_VALUE) {
		/* b/h/w load zero-extends, mark upper bits as known 0 */
		coerce_reg_to_size(&regs[value_regno], size);
	}
	return err;
}

static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
{
	int err;

	if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
	    insn->imm != 0) {
		verbose(env, "BPF_XADD uses reserved fields\n");
		return -EINVAL;
	}

	/* check src1 operand */
	err = check_reg_arg(env, insn->src_reg, SRC_OP);
	if (err)
		return err;

	/* check src2 operand */
	err = check_reg_arg(env, insn->dst_reg, SRC_OP);
	if (err)
		return err;

	if (is_pointer_value(env, insn->src_reg)) {
		verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
		return -EACCES;
	}

	/* check whether atomic_add can read the memory */
	err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
			       BPF_SIZE(insn->code), BPF_READ, -1);
	if (err)
		return err;

	/* check whether atomic_add can write into the same memory */
	return check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
				BPF_SIZE(insn->code), BPF_WRITE, -1);
}
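
/* Illustrative note (not part of the original file): an insn that reaches
 * check_xadd() is typically built with the BPF_STX_XADD() macro, e.g.
 *   BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_0, -8)
 * which atomically adds R0 into the 8 bytes at fp-8; the verifier models it
 * as a read followed by a write of the same memory, as seen above.
 */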

/* when register 'regno' is passed into function that will read 'access_size'
 * bytes from that pointer, make sure that it's within stack boundary
 * and all elements of stack are initialized.
 * Unlike most pointer bounds-checking functions, this one doesn't take an
 * 'off' argument, so it has to add in reg->off itself.
 */
static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
				int access_size, bool zero_size_allowed,
				struct bpf_call_arg_meta *meta)
{
	struct bpf_reg_state *reg = cur_regs(env) + regno;
	struct bpf_func_state *state = func(env, reg);
	int off, i, slot, spi;

	if (reg->type != PTR_TO_STACK) {
		/* Allow zero-byte read from NULL, regardless of pointer type */
		if (zero_size_allowed && access_size == 0 &&
		    register_is_null(reg))
			return 0;

		verbose(env, "R%d type=%s expected=%s\n", regno,
			reg_type_str[reg->type],
			reg_type_str[PTR_TO_STACK]);
		return -EACCES;
	}

	/* Only allow fixed-offset stack reads */
	if (!tnum_is_const(reg->var_off)) {
		char tn_buf[48];

		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
		verbose(env, "invalid variable stack read R%d var_off=%s\n",
			regno, tn_buf);
		return -EACCES;
	}
	off = reg->off + reg->var_off.value;
	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
	    access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
		verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
			regno, off, access_size);
		return -EACCES;
	}

	if (meta && meta->raw_mode) {
		meta->access_size = access_size;
		meta->regno = regno;
		return 0;
	}

	for (i = 0; i < access_size; i++) {
		u8 *stype;

		slot = -(off + i) - 1;
		spi = slot / BPF_REG_SIZE;
		if (state->allocated_stack <= slot)
			goto err;
		stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
		if (*stype == STACK_MISC)
			goto mark;
		if (*stype == STACK_ZERO) {
			/* helper can write anything into the stack */
			*stype = STACK_MISC;
			goto mark;
		}
err:
		verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
			off, i, access_size);
		return -EACCES;
mark:
		/* reading any byte out of 8-byte 'spill_slot' will cause
		 * the whole slot to be marked as 'read'
		 */
		mark_stack_slot_read(env, env->cur_state, env->cur_state->parent,
				     spi, state->frameno);
	}
	return update_stack_depth(env, state, off);
}

static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
				   int access_size, bool zero_size_allowed,
				   struct bpf_call_arg_meta *meta)
{
	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];

	switch (reg->type) {
	case PTR_TO_PACKET:
	case PTR_TO_PACKET_META:
		return check_packet_access(env, regno, reg->off, access_size,
					   zero_size_allowed);
	case PTR_TO_MAP_VALUE:
		return check_map_access(env, regno, reg->off, access_size,
					zero_size_allowed);
	default: /* scalar_value|ptr_to_stack or invalid ptr */
		return check_stack_boundary(env, regno, access_size,
					    zero_size_allowed, meta);
	}
}

static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
			  enum bpf_arg_type arg_type,
			  struct bpf_call_arg_meta *meta)
{
	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
	enum bpf_reg_type expected_type, type = reg->type;
	int err = 0;

	if (arg_type == ARG_DONTCARE)
		return 0;

	err = check_reg_arg(env, regno, SRC_OP);
	if (err)
		return err;

	if (arg_type == ARG_ANYTHING) {
		if (is_pointer_value(env, regno)) {
			verbose(env, "R%d leaks addr into helper function\n",
				regno);
			return -EACCES;
		}
		return 0;
	}

	if (type_is_pkt_pointer(type) &&
	    !may_access_direct_pkt_data(env, meta, BPF_READ)) {
		verbose(env, "helper access to the packet is not allowed\n");
		return -EACCES;
	}

	if (arg_type == ARG_PTR_TO_MAP_KEY ||
	    arg_type == ARG_PTR_TO_MAP_VALUE) {
		expected_type = PTR_TO_STACK;
		if (!type_is_pkt_pointer(type) &&
		    type != expected_type)
			goto err_type;
	} else if (arg_type == ARG_CONST_SIZE ||
		   arg_type == ARG_CONST_SIZE_OR_ZERO) {
		expected_type = SCALAR_VALUE;
		if (type != expected_type)
			goto err_type;
	} else if (arg_type == ARG_CONST_MAP_PTR) {
		expected_type = CONST_PTR_TO_MAP;
		if (type != expected_type)
			goto err_type;
	} else if (arg_type == ARG_PTR_TO_CTX) {
		expected_type = PTR_TO_CTX;
		if (type != expected_type)
			goto err_type;
	} else if (arg_type == ARG_PTR_TO_MEM ||
		   arg_type == ARG_PTR_TO_MEM_OR_NULL ||
		   arg_type == ARG_PTR_TO_UNINIT_MEM) {
		expected_type = PTR_TO_STACK;
		/* One exception here. In case function allows for NULL to be
		 * passed in as argument, it's a SCALAR_VALUE type. Final test
		 * happens during stack boundary checking.
		 */
		if (register_is_null(reg) &&
		    arg_type == ARG_PTR_TO_MEM_OR_NULL)
			/* final test in check_stack_boundary() */;
		else if (!type_is_pkt_pointer(type) &&
			 type != PTR_TO_MAP_VALUE &&
			 type != expected_type)
			goto err_type;
		meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
	} else {
		verbose(env, "unsupported arg_type %d\n", arg_type);
		return -EFAULT;
	}

	if (arg_type == ARG_CONST_MAP_PTR) {
		/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
		meta->map_ptr = reg->map_ptr;
	} else if (arg_type == ARG_PTR_TO_MAP_KEY) {
		/* bpf_map_xxx(..., map_ptr, ..., key) call:
		 * check that [key, key + map->key_size) are within
		 * stack limits and initialized
		 */
		if (!meta->map_ptr) {
			/* in function declaration map_ptr must come before
			 * map_key, so that it's verified and known before
			 * we have to check map_key here. Otherwise it means
			 * that kernel subsystem misconfigured verifier
			 */
			verbose(env, "invalid map_ptr to access map->key\n");
			return -EACCES;
		}
		if (type_is_pkt_pointer(type))
			err = check_packet_access(env, regno, reg->off,
						  meta->map_ptr->key_size,
						  false);
		else
			err = check_stack_boundary(env, regno,
						   meta->map_ptr->key_size,
						   false, NULL);
	} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
		/* bpf_map_xxx(..., map_ptr, ..., value) call:
		 * check [value, value + map->value_size) validity
		 */
		if (!meta->map_ptr) {
			/* kernel subsystem misconfigured verifier */
			verbose(env, "invalid map_ptr to access map->value\n");
			return -EACCES;
		}
		if (type_is_pkt_pointer(type))
			err = check_packet_access(env, regno, reg->off,
						  meta->map_ptr->value_size,
						  false);
		else
			err = check_stack_boundary(env, regno,
						   meta->map_ptr->value_size,
						   false, NULL);
	} else if (arg_type == ARG_CONST_SIZE ||
		   arg_type == ARG_CONST_SIZE_OR_ZERO) {
		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);

		/* bpf_xxx(..., buf, len) call will access 'len' bytes
		 * from stack pointer 'buf'. Check it
		 * note: regno == len, regno - 1 == buf
		 */
		if (regno == 0) {
			/* kernel subsystem misconfigured verifier */
			verbose(env,
				"ARG_CONST_SIZE cannot be first argument\n");
			return -EACCES;
		}

		/* The register is SCALAR_VALUE; the access check
		 * happens using its boundaries.
		 */

		if (!tnum_is_const(reg->var_off))
			/* For unprivileged variable accesses, disable raw
			 * mode so that the program is required to
			 * initialize all the memory that the helper could
			 * just partially fill up.
			 */
			meta = NULL;

		if (reg->smin_value < 0) {
			verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
				regno);
			return -EACCES;
		}

		if (reg->umin_value == 0) {
			err = check_helper_mem_access(env, regno - 1, 0,
						      zero_size_allowed,
						      meta);
			if (err)
				return err;
		}

		if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
			verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
				regno);
			return -EACCES;
		}
		err = check_helper_mem_access(env, regno - 1,
					      reg->umax_value,
					      zero_size_allowed, meta);
	}

	return err;
err_type:
	verbose(env, "R%d type=%s expected=%s\n", regno,
		reg_type_str[type], reg_type_str[expected_type]);
	return -EACCES;
}

static int check_map_func_compatibility(struct bpf_verifier_env *env,
					struct bpf_map *map, int func_id)
{
	if (!map)
		return 0;

	/* We need a two way check, first is from map perspective ... */
	switch (map->map_type) {
	case BPF_MAP_TYPE_PROG_ARRAY:
		if (func_id != BPF_FUNC_tail_call)
			goto error;
		break;
	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
		if (func_id != BPF_FUNC_perf_event_read &&
		    func_id != BPF_FUNC_perf_event_output &&
		    func_id != BPF_FUNC_perf_event_read_value)
			goto error;
		break;
	case BPF_MAP_TYPE_STACK_TRACE:
		if (func_id != BPF_FUNC_get_stackid)
			goto error;
		break;
	case BPF_MAP_TYPE_CGROUP_ARRAY:
		if (func_id != BPF_FUNC_skb_under_cgroup &&
		    func_id != BPF_FUNC_current_task_under_cgroup)
			goto error;
		break;
	/* devmap returns a pointer to a live net_device ifindex that we cannot
	 * allow to be modified from bpf side. So do not allow lookup elements
	 * for now.
	 */
	case BPF_MAP_TYPE_DEVMAP:
		if (func_id != BPF_FUNC_redirect_map)
			goto error;
		break;
	/* Restrict bpf side of cpumap, open when use-cases appear */
	case BPF_MAP_TYPE_CPUMAP:
		if (func_id != BPF_FUNC_redirect_map)
			goto error;
		break;
	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
	case BPF_MAP_TYPE_HASH_OF_MAPS:
		if (func_id != BPF_FUNC_map_lookup_elem)
			goto error;
		break;
	case BPF_MAP_TYPE_SOCKMAP:
		if (func_id != BPF_FUNC_sk_redirect_map &&
		    func_id != BPF_FUNC_sock_map_update &&
		    func_id != BPF_FUNC_map_delete_elem)
			goto error;
		break;
	default:
		break;
	}

	/* ... and second from the function itself. */
	switch (func_id) {
	case BPF_FUNC_tail_call:
		if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
			goto error;
		if (env->subprog_cnt) {
			verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n");
			return -EINVAL;
		}
		break;
	case BPF_FUNC_perf_event_read:
	case BPF_FUNC_perf_event_output:
	case BPF_FUNC_perf_event_read_value:
		if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
			goto error;
		break;
	case BPF_FUNC_get_stackid:
		if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
			goto error;
		break;
	case BPF_FUNC_current_task_under_cgroup:
	case BPF_FUNC_skb_under_cgroup:
		if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
			goto error;
		break;
	case BPF_FUNC_redirect_map:
		if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
		    map->map_type != BPF_MAP_TYPE_CPUMAP)
			goto error;
		break;
	case BPF_FUNC_sk_redirect_map:
		if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
			goto error;
		break;
	case BPF_FUNC_sock_map_update:
		if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
			goto error;
		break;
	default:
		break;
	}

	return 0;
error:
	verbose(env, "cannot pass map_type %d into func %s#%d\n",
		map->map_type, func_id_name(func_id), func_id);
	return -EINVAL;
}

static int check_raw_mode(const struct bpf_func_proto *fn)
{
	int count = 0;

	if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM)
		count++;
	if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM)
		count++;
	if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM)
		count++;
	if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM)
		count++;
	if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM)
		count++;

	return count > 1 ? -EINVAL : 0;
}
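
/* Example (illustrative): a helper such as bpf_probe_read() declares its
 * destination buffer as ARG_PTR_TO_UNINIT_MEM, so check_raw_mode() counts
 * one raw-mode argument and accepts the prototype; a prototype declaring
 * two such arguments would be rejected as misconfigured.
 */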

/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
 * are now invalid, so turn them into unknown SCALAR_VALUE.
 */
static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
				     struct bpf_func_state *state)
{
	struct bpf_reg_state *regs = state->regs, *reg;
	int i;

	for (i = 0; i < MAX_BPF_REG; i++)
		if (reg_is_pkt_pointer_any(&regs[i]))
			mark_reg_unknown(env, regs, i);

	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
		if (state->stack[i].slot_type[0] != STACK_SPILL)
			continue;
		reg = &state->stack[i].spilled_ptr;
		if (reg_is_pkt_pointer_any(reg))
			__mark_reg_unknown(reg);
	}
}

static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
	struct bpf_verifier_state *vstate = env->cur_state;
	int i;

	for (i = 0; i <= vstate->curframe; i++)
		__clear_all_pkt_pointers(env, vstate->frame[i]);
}

static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
			   int *insn_idx)
{
	struct bpf_verifier_state *state = env->cur_state;
	struct bpf_func_state *caller, *callee;
	int i, subprog, target_insn;

	if (state->curframe + 1 >= MAX_CALL_FRAMES) {
		verbose(env, "the call stack of %d frames is too deep\n",
			state->curframe + 2);
		return -E2BIG;
	}

	target_insn = *insn_idx + insn->imm;
	subprog = find_subprog(env, target_insn + 1);
	if (subprog < 0) {
		verbose(env, "verifier bug. No program starts at insn %d\n",
			target_insn + 1);
		return -EFAULT;
	}

	caller = state->frame[state->curframe];
	if (state->frame[state->curframe + 1]) {
		verbose(env, "verifier bug. Frame %d already allocated\n",
			state->curframe + 1);
		return -EFAULT;
	}

	callee = kzalloc(sizeof(*callee), GFP_KERNEL);
	if (!callee)
		return -ENOMEM;
	state->frame[state->curframe + 1] = callee;

	/* callee cannot access r0, r6 - r9 for reading and has to write
	 * into its own stack before reading from it.
	 * callee can read/write into caller's stack
	 */
	init_func_state(env, callee,
			/* remember the callsite, it will be used by bpf_exit */
			*insn_idx /* callsite */,
			state->curframe + 1 /* frameno within this callchain */,
			subprog + 1 /* subprog number within this prog */);

	/* copy r1 - r5 args that callee can access */
	for (i = BPF_REG_1; i <= BPF_REG_5; i++)
		callee->regs[i] = caller->regs[i];

	/* after the call registers r0 - r5 were scratched */
	for (i = 0; i < CALLER_SAVED_REGS; i++) {
		mark_reg_not_init(env, caller->regs, caller_saved[i]);
		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
	}

	/* only increment it after check_reg_arg() finished */
	state->curframe++;

	/* and go analyze first insn of the callee */
	*insn_idx = target_insn;

	if (env->log.level) {
		verbose(env, "caller:\n");
		print_verifier_state(env, caller);
		verbose(env, "callee:\n");
		print_verifier_state(env, callee);
	}
	return 0;
}
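
/* Illustrative note (not from the original source): a bpf-to-bpf call is a
 * BPF_JMP | BPF_CALL insn with src_reg == BPF_PSEUDO_CALL and a relative
 * insn->imm, so the callee body starts at insn_idx + imm + 1 (matching the
 * target_insn + 1 lookup above); r1-r5 are passed by value and the caller's
 * r6-r9 are preserved across the call from the verifier's point of view.
 */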

static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
	struct bpf_verifier_state *state = env->cur_state;
	struct bpf_func_state *caller, *callee;
	struct bpf_reg_state *r0;

	callee = state->frame[state->curframe];
	r0 = &callee->regs[BPF_REG_0];
	if (r0->type == PTR_TO_STACK) {
		/* technically it's ok to return caller's stack pointer
		 * (or caller's caller's pointer) back to the caller,
		 * since these pointers are valid. Only current stack
		 * pointer will be invalid as soon as function exits,
		 * but let's be conservative
		 */
		verbose(env, "cannot return stack pointer to the caller\n");
		return -EINVAL;
	}

	state->curframe--;
	caller = state->frame[state->curframe];
	/* return to the caller whatever r0 had in the callee */
	caller->regs[BPF_REG_0] = *r0;

	*insn_idx = callee->callsite + 1;
	if (env->log.level) {
		verbose(env, "returning from callee:\n");
		print_verifier_state(env, callee);
		verbose(env, "to caller at %d:\n", *insn_idx);
		print_verifier_state(env, caller);
	}
	/* clear everything in the callee */
	free_func_state(callee);
	state->frame[state->curframe + 1] = NULL;
	return 0;
}

static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
{
	const struct bpf_func_proto *fn = NULL;
	struct bpf_reg_state *regs;
	struct bpf_call_arg_meta meta;
	bool changes_data;
	int i, err;

	/* find function prototype */
	if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
		verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
			func_id);
		return -EINVAL;
	}

	if (env->ops->get_func_proto)
		fn = env->ops->get_func_proto(func_id);

	if (!fn) {
		verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
			func_id);
		return -EINVAL;
	}

	/* eBPF programs must be GPL compatible to use GPL-ed functions */
	if (!env->prog->gpl_compatible && fn->gpl_only) {
		verbose(env, "cannot call GPL only function from proprietary program\n");
		return -EINVAL;
	}

	/* With LD_ABS/IND some JITs save/restore skb from r1. */
	changes_data = bpf_helper_changes_pkt_data(fn->func);
	if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
		verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n",
			func_id_name(func_id), func_id);
		return -EINVAL;
	}

	memset(&meta, 0, sizeof(meta));
	meta.pkt_access = fn->pkt_access;

	/* We only support one arg being in raw mode at the moment, which
	 * is sufficient for the helper functions we have right now.
	 */
	err = check_raw_mode(fn);
	if (err) {
		verbose(env, "kernel subsystem misconfigured func %s#%d\n",
			func_id_name(func_id), func_id);
		return err;
	}

	/* check args */
	err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
	if (err)
		return err;
	err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
	if (err)
		return err;
	err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
	if (err)
		return err;
	err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &meta);
	if (err)
		return err;
	err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &meta);
	if (err)
		return err;

	/* Mark slots with STACK_MISC in case of raw mode, stack offset
	 * is inferred from register state.
	 */
	for (i = 0; i < meta.access_size; i++) {
		err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1);
		if (err)
			return err;
	}

	regs = cur_regs(env);
	/* reset caller saved regs */
	for (i = 0; i < CALLER_SAVED_REGS; i++) {
		mark_reg_not_init(env, regs, caller_saved[i]);
		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
	}

	/* update return register (already marked as written above) */
	if (fn->ret_type == RET_INTEGER) {
		/* sets type to SCALAR_VALUE */
		mark_reg_unknown(env, regs, BPF_REG_0);
	} else if (fn->ret_type == RET_VOID) {
		regs[BPF_REG_0].type = NOT_INIT;
	} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
		struct bpf_insn_aux_data *insn_aux;

		regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
		/* There is no offset yet applied, variable or fixed */
		mark_reg_known_zero(env, regs, BPF_REG_0);
		regs[BPF_REG_0].off = 0;
		/* remember map_ptr, so that check_map_access()
		 * can check 'value_size' boundary of memory access
		 * to map element returned from bpf_map_lookup_elem()
		 */
		if (meta.map_ptr == NULL) {
			verbose(env,
				"kernel subsystem misconfigured verifier\n");
			return -EINVAL;
		}
		regs[BPF_REG_0].map_ptr = meta.map_ptr;
		regs[BPF_REG_0].id = ++env->id_gen;
		insn_aux = &env->insn_aux_data[insn_idx];
		if (!insn_aux->map_ptr)
			insn_aux->map_ptr = meta.map_ptr;
		else if (insn_aux->map_ptr != meta.map_ptr)
			insn_aux->map_ptr = BPF_MAP_PTR_POISON;
	} else {
		verbose(env, "unknown return type %d of func %s#%d\n",
			fn->ret_type, func_id_name(func_id), func_id);
		return -EINVAL;
	}

	err = check_map_func_compatibility(env, meta.map_ptr, func_id);
	if (err)
		return err;

	if (changes_data)
		clear_all_pkt_pointers(env);
	return 0;
}

static bool signed_add_overflows(s64 a, s64 b)
{
	/* Do the add in u64, where overflow is well-defined */
	s64 res = (s64)((u64)a + (u64)b);

	if (b < 0)
		return res > a;
	return res < a;
}

static bool signed_sub_overflows(s64 a, s64 b)
{
	/* Do the sub in u64, where overflow is well-defined */
	s64 res = (s64)((u64)a - (u64)b);

	if (b < 0)
		return res < a;
	return res > a;
}
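
/* Quick sanity examples (illustrative only):
 *   signed_add_overflows(S64_MAX, 1) -> true  (wraps to S64_MIN)
 *   signed_add_overflows(-1, 1)      -> false (result is 0)
 *   signed_sub_overflows(S64_MIN, 1) -> true  (wraps to S64_MAX)
 */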

static bool check_reg_sane_offset(struct bpf_verifier_env *env,
				  const struct bpf_reg_state *reg,
				  enum bpf_reg_type type)
{
	bool known = tnum_is_const(reg->var_off);
	s64 val = reg->var_off.value;
	s64 smin = reg->smin_value;

	if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
		verbose(env, "math between %s pointer and %lld is not allowed\n",
			reg_type_str[type], val);
		return false;
	}

	if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
		verbose(env, "%s pointer offset %d is not allowed\n",
			reg_type_str[type], reg->off);
		return false;
	}

	if (smin == S64_MIN) {
		verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
			reg_type_str[type]);
		return false;
	}

	if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
		verbose(env, "value %lld makes %s pointer be out of bounds\n",
			smin, reg_type_str[type]);
		return false;
	}

	return true;
}

/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
 * Caller should also handle BPF_MOV case separately.
 * If we return -EACCES, caller may want to try again treating pointer as a
 * scalar.  So we only emit a diagnostic if !env->allow_ptr_leaks.
 */
static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
				   struct bpf_insn *insn,
				   const struct bpf_reg_state *ptr_reg,
				   const struct bpf_reg_state *off_reg)
{
	struct bpf_verifier_state *vstate = env->cur_state;
	struct bpf_func_state *state = vstate->frame[vstate->curframe];
	struct bpf_reg_state *regs = state->regs, *dst_reg;
	bool known = tnum_is_const(off_reg->var_off);
	s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
	    smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
	u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
	    umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
	u8 opcode = BPF_OP(insn->code);
	u32 dst = insn->dst_reg;

	dst_reg = &regs[dst];

	if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
		print_verifier_state(env, state);
		verbose(env,
			"verifier internal error: known but bad sbounds\n");
		return -EINVAL;
	}
	if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
		print_verifier_state(env, state);
		verbose(env,
			"verifier internal error: known but bad ubounds\n");
		return -EINVAL;
	}

	if (BPF_CLASS(insn->code) != BPF_ALU64) {
		/* 32-bit ALU ops on pointers produce (meaningless) scalars */
		verbose(env,
			"R%d 32-bit pointer arithmetic prohibited\n",
			dst);
		return -EACCES;
	}

	if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
		verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
			dst);
		return -EACCES;
	}
	if (ptr_reg->type == CONST_PTR_TO_MAP) {
		verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
			dst);
		return -EACCES;
	}
	if (ptr_reg->type == PTR_TO_PACKET_END) {
		verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
			dst);
		return -EACCES;
	}

	/* In case of 'scalar += pointer', dst_reg inherits pointer type and id.
	 * The id may be overwritten later if we create a new variable offset.
	 */
	dst_reg->type = ptr_reg->type;
	dst_reg->id = ptr_reg->id;

	if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) ||
	    !check_reg_sane_offset(env, ptr_reg, ptr_reg->type))
		return -EINVAL;

	switch (opcode) {
	case BPF_ADD:
		/* We can take a fixed offset as long as it doesn't overflow
		 * the s32 'off' field
		 */
		if (known && (ptr_reg->off + smin_val ==
			      (s64)(s32)(ptr_reg->off + smin_val))) {
			/* pointer += K.  Accumulate it into fixed offset */
			dst_reg->smin_value = smin_ptr;
			dst_reg->smax_value = smax_ptr;
			dst_reg->umin_value = umin_ptr;
			dst_reg->umax_value = umax_ptr;
			dst_reg->var_off = ptr_reg->var_off;
			dst_reg->off = ptr_reg->off + smin_val;
			dst_reg->range = ptr_reg->range;
			break;
		}
		/* A new variable offset is created.  Note that off_reg->off
		 * == 0, since it's a scalar.
		 * dst_reg gets the pointer type and since some positive
		 * integer value was added to the pointer, give it a new 'id'
		 * if it's a PTR_TO_PACKET.
		 * this creates a new 'base' pointer, off_reg (variable) gets
		 * added into the variable offset, and we copy the fixed offset
		 * from ptr_reg.
		 */
		if (signed_add_overflows(smin_ptr, smin_val) ||
		    signed_add_overflows(smax_ptr, smax_val)) {
			dst_reg->smin_value = S64_MIN;
			dst_reg->smax_value = S64_MAX;
		} else {
			dst_reg->smin_value = smin_ptr + smin_val;
			dst_reg->smax_value = smax_ptr + smax_val;
		}
		if (umin_ptr + umin_val < umin_ptr ||
		    umax_ptr + umax_val < umax_ptr) {
			dst_reg->umin_value = 0;
			dst_reg->umax_value = U64_MAX;
		} else {
			dst_reg->umin_value = umin_ptr + umin_val;
			dst_reg->umax_value = umax_ptr + umax_val;
		}
		dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
		dst_reg->off = ptr_reg->off;
		if (reg_is_pkt_pointer(ptr_reg)) {
			dst_reg->id = ++env->id_gen;
			/* something was added to pkt_ptr, set range to zero */
			dst_reg->range = 0;
		}
		break;
	case BPF_SUB:
		if (dst_reg == off_reg) {
			/* scalar -= pointer.  Creates an unknown scalar */
			verbose(env, "R%d tried to subtract pointer from scalar\n",
				dst);
			return -EACCES;
		}
		/* We don't allow subtraction from FP, because (according to
		 * test_verifier.c test "invalid fp arithmetic", JITs might not
		 * be able to deal with it.
		 */
		if (ptr_reg->type == PTR_TO_STACK) {
			verbose(env, "R%d subtraction from stack pointer prohibited\n",
				dst);
			return -EACCES;
		}
		if (known && (ptr_reg->off - smin_val ==
			      (s64)(s32)(ptr_reg->off - smin_val))) {
			/* pointer -= K.  Subtract it from fixed offset */
			dst_reg->smin_value = smin_ptr;
			dst_reg->smax_value = smax_ptr;
			dst_reg->umin_value = umin_ptr;
			dst_reg->umax_value = umax_ptr;
			dst_reg->var_off = ptr_reg->var_off;
			dst_reg->id = ptr_reg->id;
			dst_reg->off = ptr_reg->off - smin_val;
			dst_reg->range = ptr_reg->range;
			break;
		}
		/* A new variable offset is created.  If the subtrahend is known
		 * nonnegative, then any reg->range we had before is still good.
		 */
		if (signed_sub_overflows(smin_ptr, smax_val) ||
		    signed_sub_overflows(smax_ptr, smin_val)) {
			/* Overflow possible, we know nothing */
			dst_reg->smin_value = S64_MIN;
			dst_reg->smax_value = S64_MAX;
		} else {
			dst_reg->smin_value = smin_ptr - smax_val;
			dst_reg->smax_value = smax_ptr - smin_val;
		}
		if (umin_ptr < umax_val) {
			/* Overflow possible, we know nothing */
			dst_reg->umin_value = 0;
			dst_reg->umax_value = U64_MAX;
		} else {
			/* Cannot overflow (as long as bounds are consistent) */
			dst_reg->umin_value = umin_ptr - umax_val;
			dst_reg->umax_value = umax_ptr - umin_val;
		}
		dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
		dst_reg->off = ptr_reg->off;
		if (reg_is_pkt_pointer(ptr_reg)) {
			dst_reg->id = ++env->id_gen;
			/* something was added to pkt_ptr, set range to zero */
			if (smin_val < 0)
				dst_reg->range = 0;
		}
		break;
	case BPF_AND:
	case BPF_OR:
	case BPF_XOR:
		/* bitwise ops on pointers are troublesome, prohibit. */
		verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
			dst, bpf_alu_string[opcode >> 4]);
		return -EACCES;
	default:
		/* other operators (e.g. MUL,LSH) produce non-pointer results */
		verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
			dst, bpf_alu_string[opcode >> 4]);
		return -EACCES;
	}

	if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
		return -EINVAL;

	__update_reg_bounds(dst_reg);
	__reg_deduce_bounds(dst_reg);
	__reg_bound_offset(dst_reg);
	return 0;
}
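
/* Illustrative example (not in the original source): for
 *   r2 = r1;   with r1 being PTR_TO_MAP_VALUE, off=0
 *   r2 += 8;   with a known constant
 * the BPF_ADD branch above folds the constant into dst_reg->off, so r2
 * becomes PTR_TO_MAP_VALUE with off=8 and unchanged var_off; adding an
 * unknown-but-bounded scalar instead takes the variable-offset path and
 * grows var_off via tnum_add().
 */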

/* WARNING: This function does calculations on 64-bit values, but the actual
 * execution may occur on 32-bit values. Therefore, things like bitshifts
 * need extra checks in the 32-bit case.
 */
static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
				      struct bpf_insn *insn,
				      struct bpf_reg_state *dst_reg,
				      struct bpf_reg_state src_reg)
{
	struct bpf_reg_state *regs = cur_regs(env);
	u8 opcode = BPF_OP(insn->code);
	bool src_known, dst_known;
	s64 smin_val, smax_val;
	u64 umin_val, umax_val;
	u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;

	smin_val = src_reg.smin_value;
	smax_val = src_reg.smax_value;
	umin_val = src_reg.umin_value;
	umax_val = src_reg.umax_value;
	src_known = tnum_is_const(src_reg.var_off);
	dst_known = tnum_is_const(dst_reg->var_off);

	if (!src_known &&
	    opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
		__mark_reg_unknown(dst_reg);
		return 0;
	}

	switch (opcode) {
	case BPF_ADD:
		if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
		    signed_add_overflows(dst_reg->smax_value, smax_val)) {
			dst_reg->smin_value = S64_MIN;
			dst_reg->smax_value = S64_MAX;
		} else {
			dst_reg->smin_value += smin_val;
			dst_reg->smax_value += smax_val;
		}
		if (dst_reg->umin_value + umin_val < umin_val ||
		    dst_reg->umax_value + umax_val < umax_val) {
			dst_reg->umin_value = 0;
			dst_reg->umax_value = U64_MAX;
		} else {
			dst_reg->umin_value += umin_val;
			dst_reg->umax_value += umax_val;
		}
		dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
		break;
	case BPF_SUB:
		if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
		    signed_sub_overflows(dst_reg->smax_value, smin_val)) {
			/* Overflow possible, we know nothing */
			dst_reg->smin_value = S64_MIN;
			dst_reg->smax_value = S64_MAX;
		} else {
			dst_reg->smin_value -= smax_val;
			dst_reg->smax_value -= smin_val;
		}
		if (dst_reg->umin_value < umax_val) {
			/* Overflow possible, we know nothing */
			dst_reg->umin_value = 0;
			dst_reg->umax_value = U64_MAX;
		} else {
			/* Cannot overflow (as long as bounds are consistent) */
			dst_reg->umin_value -= umax_val;
			dst_reg->umax_value -= umin_val;
		}
		dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
		break;
	case BPF_MUL:
		dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off);
		if (smin_val < 0 || dst_reg->smin_value < 0) {
			/* Ain't nobody got time to multiply that sign */
			__mark_reg_unbounded(dst_reg);
			__update_reg_bounds(dst_reg);
			break;
		}
		/* Both values are positive, so we can work with unsigned and
		 * copy the result to signed (unless it exceeds S64_MAX).
		 */
		if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) {
			/* Potential overflow, we know nothing */
			__mark_reg_unbounded(dst_reg);
			/* (except what we can learn from the var_off) */
			__update_reg_bounds(dst_reg);
			break;
		}
		dst_reg->umin_value *= umin_val;
		dst_reg->umax_value *= umax_val;
		if (dst_reg->umax_value > S64_MAX) {
			/* Overflow possible, we know nothing */
			dst_reg->smin_value = S64_MIN;
			dst_reg->smax_value = S64_MAX;
		} else {
			dst_reg->smin_value = dst_reg->umin_value;
			dst_reg->smax_value = dst_reg->umax_value;
		}
		break;
	case BPF_AND:
		if (src_known && dst_known) {
			__mark_reg_known(dst_reg, dst_reg->var_off.value &
						  src_reg.var_off.value);
			break;
		}
		/* We get our minimum from the var_off, since that's inherently
		 * bitwise.  Our maximum is the minimum of the operands' maxima.
		 */
		dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
		dst_reg->umin_value = dst_reg->var_off.value;
		dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
		if (dst_reg->smin_value < 0 || smin_val < 0) {
			/* Lose signed bounds when ANDing negative numbers,
			 * ain't nobody got time for that.
			 */
			dst_reg->smin_value = S64_MIN;
			dst_reg->smax_value = S64_MAX;
		} else {
			/* ANDing two positives gives a positive, so safe to
			 * cast result into s64.
			 */
			dst_reg->smin_value = dst_reg->umin_value;
			dst_reg->smax_value = dst_reg->umax_value;
		}
		/* We may learn something more from the var_off */
		__update_reg_bounds(dst_reg);
		break;
	case BPF_OR:
		if (src_known && dst_known) {
			__mark_reg_known(dst_reg, dst_reg->var_off.value |
						  src_reg.var_off.value);
			break;
		}
		/* We get our maximum from the var_off, and our minimum is the
		 * maximum of the operands' minima
		 */
		dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
		dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
		dst_reg->umax_value = dst_reg->var_off.value |
				      dst_reg->var_off.mask;
		if (dst_reg->smin_value < 0 || smin_val < 0) {
			/* Lose signed bounds when ORing negative numbers,
			 * ain't nobody got time for that.
			 */
			dst_reg->smin_value = S64_MIN;
			dst_reg->smax_value = S64_MAX;
		} else {
			/* ORing two positives gives a positive, so safe to
			 * cast result into s64.
			 */
			dst_reg->smin_value = dst_reg->umin_value;
			dst_reg->smax_value = dst_reg->umax_value;
		}
		/* We may learn something more from the var_off */
		__update_reg_bounds(dst_reg);
		break;
	case BPF_LSH:
		if (umax_val >= insn_bitness) {
			/* Shifts greater than 31 or 63 are undefined.
			 * This includes shifts by a negative number.
			 */
			mark_reg_unknown(env, regs, insn->dst_reg);
			break;
		}
		/* We lose all sign bit information (except what we can pick
		 * up from var_off)
		 */
		dst_reg->smin_value = S64_MIN;
		dst_reg->smax_value = S64_MAX;
		/* If we might shift our top bit out, then we know nothing */
		if (dst_reg->umax_value > 1ULL << (63 - umax_val)) {
			dst_reg->umin_value = 0;
			dst_reg->umax_value = U64_MAX;
		} else {
			dst_reg->umin_value <<= umin_val;
			dst_reg->umax_value <<= umax_val;
		}
		if (src_known)
			dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
		else
			dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val);
		/* We may learn something more from the var_off */
		__update_reg_bounds(dst_reg);
		break;
	case BPF_RSH:
		if (umax_val >= insn_bitness) {
			/* Shifts greater than 31 or 63 are undefined.
			 * This includes shifts by a negative number.
			 */
			mark_reg_unknown(env, regs, insn->dst_reg);
			break;
		}
		/* BPF_RSH is an unsigned shift.  If the value in dst_reg might
		 * be negative, then either:
		 * 1) src_reg might be zero, so the sign bit of the result is
		 *    unknown, so we lose our signed bounds
		 * 2) it's known negative, thus the unsigned bounds capture the
		 *    signed bounds
		 * 3) the signed bounds cross zero, so they tell us nothing
		 *    about the result
		 * If the value in dst_reg is known nonnegative, then again the
		 * unsigned bounds capture the signed bounds.
		 * Thus, in all cases it suffices to blow away our signed bounds
		 * and rely on inferring new ones from the unsigned bounds and
		 * var_off of the result.
		 */
		dst_reg->smin_value = S64_MIN;
		dst_reg->smax_value = S64_MAX;
		if (src_known)
			dst_reg->var_off = tnum_rshift(dst_reg->var_off,
						       umin_val);
		else
			dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
		dst_reg->umin_value >>= umax_val;
		dst_reg->umax_value >>= umin_val;
		/* We may learn something more from the var_off */
		__update_reg_bounds(dst_reg);
		break;
	default:
		mark_reg_unknown(env, regs, insn->dst_reg);
		break;
	}

	if (BPF_CLASS(insn->code) != BPF_ALU64) {
		/* 32-bit ALU ops are (32,32)->32 */
		coerce_reg_to_size(dst_reg, 4);
		coerce_reg_to_size(&src_reg, 4);
	}

	__reg_deduce_bounds(dst_reg);
	__reg_bound_offset(dst_reg);
	return 0;
}
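
/* Illustrative example (not in the original source): for BPF_ADD with
 * dst bounded to [10, 20] and src bounded to [1, 2], neither addition
 * overflows, so the result is tracked as [11, 22] in both the signed and
 * unsigned bounds, and var_off is widened by tnum_add() accordingly.
 */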

/* Handles ALU ops other than BPF_END, BPF_NEG and BPF_MOV: computes new min/max
 * and var_off.
 */
static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
				   struct bpf_insn *insn)
{
	struct bpf_verifier_state *vstate = env->cur_state;
	struct bpf_func_state *state = vstate->frame[vstate->curframe];
	struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
	struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
	u8 opcode = BPF_OP(insn->code);

	dst_reg = &regs[insn->dst_reg];
	src_reg = NULL;
	if (dst_reg->type != SCALAR_VALUE)
		ptr_reg = dst_reg;
	if (BPF_SRC(insn->code) == BPF_X) {
		src_reg = &regs[insn->src_reg];
		if (src_reg->type != SCALAR_VALUE) {
			if (dst_reg->type != SCALAR_VALUE) {
				/* Combining two pointers by any ALU op yields
				 * an arbitrary scalar. Disallow all math except
				 * pointer subtraction
				 */
				if (opcode == BPF_SUB) {
					mark_reg_unknown(env, regs, insn->dst_reg);
					return 0;
				}
				verbose(env, "R%d pointer %s pointer prohibited\n",
					insn->dst_reg,
					bpf_alu_string[opcode >> 4]);
				return -EACCES;
			} else {
				/* scalar += pointer
				 * This is legal, but we have to reverse our
				 * src/dest handling in computing the range
				 */
				return adjust_ptr_min_max_vals(env, insn,
							       src_reg, dst_reg);
			}
		} else if (ptr_reg) {
			/* pointer += scalar */
			return adjust_ptr_min_max_vals(env, insn,
						       dst_reg, src_reg);
		}
	} else {
		/* Pretend the src is a reg with a known value, since we only
		 * need to be able to read from this state.
		 */
		off_reg.type = SCALAR_VALUE;
		__mark_reg_known(&off_reg, insn->imm);
		src_reg = &off_reg;
		if (ptr_reg) /* pointer += K */
			return adjust_ptr_min_max_vals(env, insn,
						       ptr_reg, src_reg);
	}

	/* Got here implies adding two SCALAR_VALUEs */
	if (WARN_ON_ONCE(ptr_reg)) {
		print_verifier_state(env, state);
		verbose(env, "verifier internal error: unexpected ptr_reg\n");
		return -EINVAL;
	}
	if (WARN_ON(!src_reg)) {
		print_verifier_state(env, state);
		verbose(env, "verifier internal error: no src_reg\n");
		return -EINVAL;
	}
	return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
}

/* check validity of 32-bit and 64-bit arithmetic operations */
static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
	struct bpf_reg_state *regs = cur_regs(env);
	u8 opcode = BPF_OP(insn->code);
	int err;

	if (opcode == BPF_END || opcode == BPF_NEG) {
		if (opcode == BPF_NEG) {
			if (BPF_SRC(insn->code) != 0 ||
			    insn->src_reg != BPF_REG_0 ||
			    insn->off != 0 || insn->imm != 0) {
				verbose(env, "BPF_NEG uses reserved fields\n");
				return -EINVAL;
			}
		} else {
			if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
			    (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
			    BPF_CLASS(insn->code) == BPF_ALU64) {
				verbose(env, "BPF_END uses reserved fields\n");
				return -EINVAL;
			}
		}

		/* check src operand */
		err = check_reg_arg(env, insn->dst_reg, SRC_OP);
		if (err)
			return err;

		if (is_pointer_value(env, insn->dst_reg)) {
			verbose(env, "R%d pointer arithmetic prohibited\n",
				insn->dst_reg);
			return -EACCES;
		}

		/* check dest operand */
		err = check_reg_arg(env, insn->dst_reg, DST_OP);
		if (err)
			return err;

	} else if (opcode == BPF_MOV) {

		if (BPF_SRC(insn->code) == BPF_X) {
			if (insn->imm != 0 || insn->off != 0) {
				verbose(env, "BPF_MOV uses reserved fields\n");
				return -EINVAL;
			}

			/* check src operand */
			err = check_reg_arg(env, insn->src_reg, SRC_OP);
			if (err)
				return err;
		} else {
			if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
				verbose(env, "BPF_MOV uses reserved fields\n");
				return -EINVAL;
			}
		}

		/* check dest operand */
		err = check_reg_arg(env, insn->dst_reg, DST_OP);
		if (err)
			return err;

		if (BPF_SRC(insn->code) == BPF_X) {
			if (BPF_CLASS(insn->code) == BPF_ALU64) {
				/* case: R1 = R2
				 * copy register state to dest reg
				 */
				regs[insn->dst_reg] = regs[insn->src_reg];
				regs[insn->dst_reg].live |= REG_LIVE_WRITTEN;
			} else {
				/* R1 = (u32) R2 */
				if (is_pointer_value(env, insn->src_reg)) {
					verbose(env,
						"R%d partial copy of pointer\n",
						insn->src_reg);
					return -EACCES;
				}
				mark_reg_unknown(env, regs, insn->dst_reg);
				coerce_reg_to_size(&regs[insn->dst_reg], 4);
			}
		} else {
			/* case: R = imm
			 * remember the value we stored into this reg
			 */
			regs[insn->dst_reg].type = SCALAR_VALUE;
			if (BPF_CLASS(insn->code) == BPF_ALU64) {
				__mark_reg_known(regs + insn->dst_reg,
						 insn->imm);
			} else {
				__mark_reg_known(regs + insn->dst_reg,
						 (u32)insn->imm);
			}
		}

	} else if (opcode > BPF_END) {
		verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
		return -EINVAL;

	} else {	/* all other ALU ops: and, sub, xor, add, ... */

		if (BPF_SRC(insn->code) == BPF_X) {
			if (insn->imm != 0 || insn->off != 0) {
				verbose(env, "BPF_ALU uses reserved fields\n");
				return -EINVAL;
			}
			/* check src1 operand */
			err = check_reg_arg(env, insn->src_reg, SRC_OP);
			if (err)
				return err;
		} else {
			if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
				verbose(env, "BPF_ALU uses reserved fields\n");
				return -EINVAL;
			}
		}

		/* check src2 operand */
		err = check_reg_arg(env, insn->dst_reg, SRC_OP);
		if (err)
			return err;

		if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
		    BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
			verbose(env, "div by zero\n");
			return -EINVAL;
		}

		if ((opcode == BPF_LSH || opcode == BPF_RSH ||
		     opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
			int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;

			if (insn->imm < 0 || insn->imm >= size) {
				verbose(env, "invalid shift %d\n", insn->imm);
				return -EINVAL;
			}
		}

		/* check dest operand */
		err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
		if (err)
			return err;

		return adjust_reg_min_max_vals(env, insn);
	}

	return 0;
}

static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
				   struct bpf_reg_state *dst_reg,
				   enum bpf_reg_type type,
				   bool range_right_open)
{
	struct bpf_func_state *state = vstate->frame[vstate->curframe];
	struct bpf_reg_state *regs = state->regs, *reg;
	u16 new_range;
	int i, j;

	if (dst_reg->off < 0 ||
	    (dst_reg->off == 0 && range_right_open))
		/* This doesn't give us any range */
		return;

	if (dst_reg->umax_value > MAX_PACKET_OFF ||
	    dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF)
		/* Risk of overflow.  For instance, ptr + (1<<63) may be less
		 * than pkt_end, but that's because it's also less than pkt.
		 */
		return;

	new_range = dst_reg->off;
	if (range_right_open)
		new_range--;

	/* Examples for register markings:
	 *
	 * pkt_data in dst register:
	 *
	 *   r2 = r3;
	 *   r2 += 8;
	 *   if (r2 > pkt_end) goto <handle exception>
	 *   <access okay>
	 *
	 *   r2 = r3;
	 *   r2 += 8;
	 *   if (r2 < pkt_end) goto <access okay>
	 *   <handle exception>
	 *
	 *   Where:
	 *     r2 == dst_reg, pkt_end == src_reg
	 *     r2=pkt(id=n,off=8,r=0)
	 *     r3=pkt(id=n,off=0,r=0)
	 *
	 * pkt_data in src register:
	 *
	 *   r2 = r3;
	 *   r2 += 8;
	 *   if (pkt_end >= r2) goto <access okay>
	 *   <handle exception>
	 *
	 *   r2 = r3;
	 *   r2 += 8;
	 *   if (pkt_end <= r2) goto <handle exception>
	 *   <access okay>
	 *
	 *   Where:
	 *     pkt_end == dst_reg, r2 == src_reg
	 *     r2=pkt(id=n,off=8,r=0)
	 *     r3=pkt(id=n,off=0,r=0)
	 *
	 * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
	 * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8)
	 * and [r3, r3 + 8-1) respectively is safe to access depending on
	 * the check.
	 */

	/* If our ids match, then we must have the same max_value.  And we
	 * don't care about the other reg's fixed offset, since if it's too big
	 * the range won't allow anything.
	 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
	 */
	for (i = 0; i < MAX_BPF_REG; i++)
		if (regs[i].type == type && regs[i].id == dst_reg->id)
			/* keep the maximum range already checked */
			regs[i].range = max(regs[i].range, new_range);

	for (j = 0; j <= vstate->curframe; j++) {
		state = vstate->frame[j];
		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
			if (state->stack[i].slot_type[0] != STACK_SPILL)
				continue;
			reg = &state->stack[i].spilled_ptr;
			if (reg->type == type && reg->id == dst_reg->id)
				reg->range = max(reg->range, new_range);
		}
	}
}
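
/* Illustrative usage from the program's side (not part of this file): a
 * typical XDP or cls_act program does
 *   if (data + 8 > data_end)
 *           return XDP_DROP;
 * and the fall-through branch of that compare is what lets
 * find_good_pkt_pointers() mark every register sharing the packet id with a
 * range covering the first 8 bytes, so loads of data[0..7] then pass
 * check_packet_access().
 */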

/* Adjusts the register min/max values in the case that the dst_reg is the
 * variable register that we are working on, and src_reg is a constant or we're
 * simply doing a BPF_K check.
 * In JEQ/JNE cases we also adjust the var_off values.
 */
static void reg_set_min_max(struct bpf_reg_state *true_reg,
			    struct bpf_reg_state *false_reg, u64 val,
			    u8 opcode)
{
	/* If the dst_reg is a pointer, we can't learn anything about its
	 * variable offset from the compare (unless src_reg were a pointer into
	 * the same object, but we don't bother with that.
	 * Since false_reg and true_reg have the same type by construction, we
	 * only need to check one of them for pointerness.
	 */
	if (__is_pointer_value(false, false_reg))
		return;

	switch (opcode) {
	case BPF_JEQ:
		/* If this is false then we know nothing Jon Snow, but if it is
		 * true then we know for sure.
		 */
		__mark_reg_known(true_reg, val);
		break;
	case BPF_JNE:
		/* If this is true we know nothing Jon Snow, but if it is false
		 * we know the value for sure;
		 */
		__mark_reg_known(false_reg, val);
		break;
	case BPF_JGT:
		false_reg->umax_value = min(false_reg->umax_value, val);
		true_reg->umin_value = max(true_reg->umin_value, val + 1);
		break;
	case BPF_JSGT:
		false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
		true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
		break;
	case BPF_JLT:
		false_reg->umin_value = max(false_reg->umin_value, val);
		true_reg->umax_value = min(true_reg->umax_value, val - 1);
		break;
	case BPF_JSLT:
		false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
		true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
		break;
	case BPF_JGE:
		false_reg->umax_value = min(false_reg->umax_value, val - 1);
		true_reg->umin_value = max(true_reg->umin_value, val);
		break;
	case BPF_JSGE:
		false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
		true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
		break;
	case BPF_JLE:
		false_reg->umin_value = max(false_reg->umin_value, val + 1);
		true_reg->umax_value = min(true_reg->umax_value, val);
		break;
	case BPF_JSLE:
		false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
		true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
		break;
	default:
		break;
	}

	__reg_deduce_bounds(false_reg);
	__reg_deduce_bounds(true_reg);
	/* We might have learned some bits from the bounds. */
	__reg_bound_offset(false_reg);
	__reg_bound_offset(true_reg);
	/* Intersecting with the old var_off might have improved our bounds
	 * slightly.  e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
	 * then new var_off is (0; 0x7f...fc) which improves our umax.
	 */
	__update_reg_bounds(false_reg);
	__update_reg_bounds(true_reg);
}
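
/* Worked example for the helper above (illustrative only): for
 *	if (r1 > 5) goto pc+off
 * with r1 an unknown SCALAR_VALUE, the BPF_JGT case leaves the taken branch
 * with true_reg->umin_value == 6 and the fall-through branch with
 * false_reg->umax_value == 5; the __reg_deduce_bounds()/__reg_bound_offset()/
 * __update_reg_bounds() calls at the end of the function then refine the
 * signed bounds and var_off from that new knowledge.
 */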

/* Same as above, but for the case that dst_reg holds a constant and src_reg is
 * the variable reg.
 */
static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
				struct bpf_reg_state *false_reg, u64 val,
				u8 opcode)
{
	if (__is_pointer_value(false, false_reg))
		return;

	switch (opcode) {
	case BPF_JEQ:
		/* If this is false then we know nothing Jon Snow, but if it is
		 * true then we know for sure.
		 */
		__mark_reg_known(true_reg, val);
		break;
	case BPF_JNE:
		/* If this is true we know nothing Jon Snow, but if it is false
		 * we know the value for sure;
		 */
		__mark_reg_known(false_reg, val);
		break;
	case BPF_JGT:
		true_reg->umax_value = min(true_reg->umax_value, val - 1);
		false_reg->umin_value = max(false_reg->umin_value, val);
		break;
	case BPF_JSGT:
		true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
		false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
		break;
	case BPF_JLT:
		true_reg->umin_value = max(true_reg->umin_value, val + 1);
		false_reg->umax_value = min(false_reg->umax_value, val);
		break;
	case BPF_JSLT:
		true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
		false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
		break;
	case BPF_JGE:
		true_reg->umax_value = min(true_reg->umax_value, val);
		false_reg->umin_value = max(false_reg->umin_value, val + 1);
		break;
	case BPF_JSGE:
		true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
		false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
		break;
	case BPF_JLE:
		true_reg->umin_value = max(true_reg->umin_value, val);
		false_reg->umax_value = min(false_reg->umax_value, val - 1);
		break;
	case BPF_JSLE:
		true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
		false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
		break;
	default:
		break;
	}

	__reg_deduce_bounds(false_reg);
	__reg_deduce_bounds(true_reg);
	/* We might have learned some bits from the bounds. */
	__reg_bound_offset(false_reg);
	__reg_bound_offset(true_reg);
	/* Intersecting with the old var_off might have improved our bounds
	 * slightly.  e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
	 * then new var_off is (0; 0x7f...fc) which improves our umax.
	 */
	__update_reg_bounds(false_reg);
	__update_reg_bounds(true_reg);
}
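
/* Worked example for the swapped form above (illustrative only): for
 *	if (5 > r1) goto pc+off
 * the constant sits in dst_reg and r1 in src_reg, so the BPF_JGT case here
 * teaches the taken branch r1.umax_value == 4 and the fall-through branch
 * r1.umin_value == 5.
 */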

/* Regs are known to be equal, so intersect their min/max/var_off */
static void __reg_combine_min_max(struct bpf_reg_state *src_reg,
				  struct bpf_reg_state *dst_reg)
{
	src_reg->umin_value = dst_reg->umin_value = max(src_reg->umin_value,
							dst_reg->umin_value);
	src_reg->umax_value = dst_reg->umax_value = min(src_reg->umax_value,
							dst_reg->umax_value);
	src_reg->smin_value = dst_reg->smin_value = max(src_reg->smin_value,
							dst_reg->smin_value);
	src_reg->smax_value = dst_reg->smax_value = min(src_reg->smax_value,
							dst_reg->smax_value);
	src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off,
							     dst_reg->var_off);
	/* We might have learned new bounds from the var_off. */
	__update_reg_bounds(src_reg);
	__update_reg_bounds(dst_reg);
	/* We might have learned something about the sign bit. */
	__reg_deduce_bounds(src_reg);
	__reg_deduce_bounds(dst_reg);
	/* We might have learned some bits from the bounds. */
	__reg_bound_offset(src_reg);
	__reg_bound_offset(dst_reg);
	/* Intersecting with the old var_off might have improved our bounds
	 * slightly.  e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
	 * then new var_off is (0; 0x7f...fc) which improves our umax.
	 */
	__update_reg_bounds(src_reg);
	__update_reg_bounds(dst_reg);
}
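
/* Illustrative example for __reg_combine_min_max(): if the old knowledge was
 * src in [0, 100] and dst in [50, 200], a taken JEQ lets both registers share
 * the intersected range [50, 100], and tnum_intersect() likewise merges the
 * individually known bits of their var_off.
 */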

static void reg_combine_min_max(struct bpf_reg_state *true_src,
				struct bpf_reg_state *true_dst,
				struct bpf_reg_state *false_src,
				struct bpf_reg_state *false_dst,
				u8 opcode)
{
	switch (opcode) {
	case BPF_JEQ:
		__reg_combine_min_max(true_src, true_dst);
		break;
	case BPF_JNE:
		__reg_combine_min_max(false_src, false_dst);
		break;
	}
}

static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
			 bool is_null)
{
	struct bpf_reg_state *reg = &regs[regno];

	if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
		/* Old offset (both fixed and variable parts) should
		 * have been known-zero, because we don't allow pointer
		 * arithmetic on pointers that might be NULL.
		 */
		if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
				 !tnum_equals_const(reg->var_off, 0) ||
				 reg->off)) {
			__mark_reg_known_zero(reg);
			reg->off = 0;
		}
		if (is_null) {
			reg->type = SCALAR_VALUE;
		} else if (reg->map_ptr->inner_map_meta) {
			reg->type = CONST_PTR_TO_MAP;
			reg->map_ptr = reg->map_ptr->inner_map_meta;
		} else {
			reg->type = PTR_TO_MAP_VALUE;
		}
		/* We don't need id from this point onwards anymore, thus we
		 * should better reset it, so that state pruning has chances
		 * to take effect.
		 */
		reg->id = 0;
	}
}
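
/* Illustrative source-level pattern (an assumption for the example, not part
 * of the verifier) that drives mark_map_reg() above and mark_map_regs() below:
 *
 *	val = bpf_map_lookup_elem(&map, &key);	// R0 = PTR_TO_MAP_VALUE_OR_NULL, id=N
 *	if (!val)				// JEQ #0 splits the state
 *		return 0;			// this branch: id=N regs become SCALAR_VALUE
 *	(*val)++;				// other branch: id=N regs become PTR_TO_MAP_VALUE
 */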

/* The logic is similar to find_good_pkt_pointers(), both could eventually
 * be folded together at some point.
 */
static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno,
			  bool is_null)
{
	struct bpf_func_state *state = vstate->frame[vstate->curframe];
	struct bpf_reg_state *regs = state->regs;
	u32 id = regs[regno].id;
	int i, j;

	for (i = 0; i < MAX_BPF_REG; i++)
		mark_map_reg(regs, i, id, is_null);

	for (j = 0; j <= vstate->curframe; j++) {
		state = vstate->frame[j];
		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
			if (state->stack[i].slot_type[0] != STACK_SPILL)
				continue;
			mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
		}
	}
}

static bool try_match_pkt_pointers(const struct bpf_insn *insn,
				   struct bpf_reg_state *dst_reg,
				   struct bpf_reg_state *src_reg,
				   struct bpf_verifier_state *this_branch,
				   struct bpf_verifier_state *other_branch)
{
	if (BPF_SRC(insn->code) != BPF_X)
		return false;

	switch (BPF_OP(insn->code)) {
	case BPF_JGT:
		if ((dst_reg->type == PTR_TO_PACKET &&
		     src_reg->type == PTR_TO_PACKET_END) ||
		    (dst_reg->type == PTR_TO_PACKET_META &&
		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
			/* pkt_data' > pkt_end, pkt_meta' > pkt_data */
			find_good_pkt_pointers(this_branch, dst_reg,
					       dst_reg->type, false);
		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
			    src_reg->type == PTR_TO_PACKET) ||
			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
			    src_reg->type == PTR_TO_PACKET_META)) {
			/* pkt_end > pkt_data', pkt_data > pkt_meta' */
			find_good_pkt_pointers(other_branch, src_reg,
					       src_reg->type, true);
		} else {
			return false;
		}
		break;
	case BPF_JLT:
		if ((dst_reg->type == PTR_TO_PACKET &&
		     src_reg->type == PTR_TO_PACKET_END) ||
		    (dst_reg->type == PTR_TO_PACKET_META &&
		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
			/* pkt_data' < pkt_end, pkt_meta' < pkt_data */
			find_good_pkt_pointers(other_branch, dst_reg,
					       dst_reg->type, true);
		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
			    src_reg->type == PTR_TO_PACKET) ||
			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
			    src_reg->type == PTR_TO_PACKET_META)) {
			/* pkt_end < pkt_data', pkt_data < pkt_meta' */
			find_good_pkt_pointers(this_branch, src_reg,
					       src_reg->type, false);
		} else {
			return false;
		}
		break;
	case BPF_JGE:
		if ((dst_reg->type == PTR_TO_PACKET &&
		     src_reg->type == PTR_TO_PACKET_END) ||
		    (dst_reg->type == PTR_TO_PACKET_META &&
		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
			/* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */
			find_good_pkt_pointers(this_branch, dst_reg,
					       dst_reg->type, true);
		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
			    src_reg->type == PTR_TO_PACKET) ||
			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
			    src_reg->type == PTR_TO_PACKET_META)) {
			/* pkt_end >= pkt_data', pkt_data >= pkt_meta' */
			find_good_pkt_pointers(other_branch, src_reg,
					       src_reg->type, false);
		} else {
			return false;
		}
		break;
	case BPF_JLE:
		if ((dst_reg->type == PTR_TO_PACKET &&
		     src_reg->type == PTR_TO_PACKET_END) ||
		    (dst_reg->type == PTR_TO_PACKET_META &&
		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
			/* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */
			find_good_pkt_pointers(other_branch, dst_reg,
					       dst_reg->type, false);
		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
			    src_reg->type == PTR_TO_PACKET) ||
			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
			    src_reg->type == PTR_TO_PACKET_META)) {
			/* pkt_end <= pkt_data', pkt_data <= pkt_meta' */
			find_good_pkt_pointers(this_branch, src_reg,
					       src_reg->type, true);
		} else {
			return false;
		}
		break;
	default:
		return false;
	}

	return true;
}

static int check_cond_jmp_op(struct bpf_verifier_env *env,
			     struct bpf_insn *insn, int *insn_idx)
{
	struct bpf_verifier_state *this_branch = env->cur_state;
	struct bpf_verifier_state *other_branch;
	struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
	struct bpf_reg_state *dst_reg, *other_branch_regs;
	u8 opcode = BPF_OP(insn->code);
	int err;

	if (opcode > BPF_JSLE) {
		verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
		return -EINVAL;
	}

	if (BPF_SRC(insn->code) == BPF_X) {
		if (insn->imm != 0) {
			verbose(env, "BPF_JMP uses reserved fields\n");
			return -EINVAL;
		}

		/* check src1 operand */
		err = check_reg_arg(env, insn->src_reg, SRC_OP);
		if (err)
			return err;

		if (is_pointer_value(env, insn->src_reg)) {
			verbose(env, "R%d pointer comparison prohibited\n",
				insn->src_reg);
			return -EACCES;
		}
	} else {
		if (insn->src_reg != BPF_REG_0) {
			verbose(env, "BPF_JMP uses reserved fields\n");
			return -EINVAL;
		}
	}

	/* check src2 operand */
	err = check_reg_arg(env, insn->dst_reg, SRC_OP);
	if (err)
		return err;

	dst_reg = &regs[insn->dst_reg];

	/* detect if R == 0 where R was initialized to zero earlier */
	if (BPF_SRC(insn->code) == BPF_K &&
	    (opcode == BPF_JEQ || opcode == BPF_JNE) &&
	    dst_reg->type == SCALAR_VALUE &&
	    tnum_is_const(dst_reg->var_off)) {
		if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) ||
		    (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) {
			/* if (imm == imm) goto pc+off;
			 * only follow the goto, ignore fall-through
			 */
			*insn_idx += insn->off;
			return 0;
		} else {
			/* if (imm != imm) goto pc+off;
			 * only follow fall-through branch, since
			 * that's where the program will go
			 */
			return 0;
		}
	}

	other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
	if (!other_branch)
		return -EFAULT;
	other_branch_regs = other_branch->frame[other_branch->curframe]->regs;

	/* detect if we are comparing against a constant value so we can adjust
	 * our min/max values for our dst register.
	 * this is only legit if both are scalars (or pointers to the same
	 * object, I suppose, but we don't support that right now), because
	 * otherwise the different base pointers mean the offsets aren't
	 * comparable.
	 */
	if (BPF_SRC(insn->code) == BPF_X) {
		if (dst_reg->type == SCALAR_VALUE &&
		    regs[insn->src_reg].type == SCALAR_VALUE) {
			if (tnum_is_const(regs[insn->src_reg].var_off))
				reg_set_min_max(&other_branch_regs[insn->dst_reg],
						dst_reg, regs[insn->src_reg].var_off.value,
						opcode);
			else if (tnum_is_const(dst_reg->var_off))
				reg_set_min_max_inv(&other_branch_regs[insn->src_reg],
						    &regs[insn->src_reg],
						    dst_reg->var_off.value, opcode);
			else if (opcode == BPF_JEQ || opcode == BPF_JNE)
				/* Comparing for equality, we can combine knowledge */
				reg_combine_min_max(&other_branch_regs[insn->src_reg],
						    &other_branch_regs[insn->dst_reg],
						    &regs[insn->src_reg],
						    &regs[insn->dst_reg], opcode);
		}
	} else if (dst_reg->type == SCALAR_VALUE) {
		reg_set_min_max(&other_branch_regs[insn->dst_reg],
					dst_reg, insn->imm, opcode);
	}

	/* detect if R == 0 where R is returned from bpf_map_lookup_elem() */
	if (BPF_SRC(insn->code) == BPF_K &&
	    insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
	    dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
		/* Mark all identical map registers in each branch as either
		 * safe or unknown depending on the R == 0 or R != 0 conditional.
		 */
		mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
		mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
	} else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg],
					   this_branch, other_branch) &&
		   is_pointer_value(env, insn->dst_reg)) {
		verbose(env, "R%d pointer comparison prohibited\n",
			insn->dst_reg);
		return -EACCES;
	}
	if (env->log.level)
		print_verifier_state(env, this_branch->frame[this_branch->curframe]);
	return 0;
}

/* return the map pointer stored inside BPF_LD_IMM64 instruction */
static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
{
	u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;

	return (struct bpf_map *) (unsigned long) imm64;
}

/* verify BPF_LD_IMM64 instruction */
static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
	struct bpf_reg_state *regs = cur_regs(env);
	int err;

	if (BPF_SIZE(insn->code) != BPF_DW) {
		verbose(env, "invalid BPF_LD_IMM insn\n");
		return -EINVAL;
	}
	if (insn->off != 0) {
		verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
		return -EINVAL;
	}

	err = check_reg_arg(env, insn->dst_reg, DST_OP);
	if (err)
		return err;

	if (insn->src_reg == 0) {
		u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;

		regs[insn->dst_reg].type = SCALAR_VALUE;
		__mark_reg_known(&regs[insn->dst_reg], imm);
		return 0;
	}

	/* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
	BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);

	regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
	regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
	return 0;
}

static bool may_access_skb(enum bpf_prog_type type)
{
	switch (type) {
	case BPF_PROG_TYPE_SOCKET_FILTER:
	case BPF_PROG_TYPE_SCHED_CLS:
	case BPF_PROG_TYPE_SCHED_ACT:
		return true;
	default:
		return false;
	}
}

/* verify safety of LD_ABS|LD_IND instructions:
 * - they can only appear in the programs where ctx == skb
 * - since they are wrappers of function calls, they scratch R1-R5 registers,
 *   preserve R6-R9, and store return value into R0
 *
 * Implicit input:
 *   ctx == skb == R6 == CTX
 *
 * Explicit input:
 *   SRC == any register
 *   IMM == 32-bit immediate
 *
 * Output:
 *   R0 - 8/16/32-bit skb data converted to cpu endianness
 */
static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
	struct bpf_reg_state *regs = cur_regs(env);
	u8 mode = BPF_MODE(insn->code);
	int i, err;

	if (!may_access_skb(env->prog->type)) {
		verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
		return -EINVAL;
	}

	if (env->subprog_cnt) {
		/* When a program contains an LD_ABS insn, JITs and the interpreter
		 * assume that r1 == ctx == skb, which is not the case for callees
		 * that can have arbitrary arguments. It's problematic for the main
		 * prog as well, since JITs would need to analyze all functions in
		 * order to make proper register save/restore decisions in the main
		 * prog. Hence disallow LD_ABS together with bpf-to-bpf calls.
		 */
		verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n");
		return -EINVAL;
	}

	if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
	    BPF_SIZE(insn->code) == BPF_DW ||
	    (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
		verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
		return -EINVAL;
	}

	/* check whether implicit source operand (register R6) is readable */
	err = check_reg_arg(env, BPF_REG_6, SRC_OP);
	if (err)
		return err;

	if (regs[BPF_REG_6].type != PTR_TO_CTX) {
		verbose(env,
			"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
		return -EINVAL;
	}

	if (mode == BPF_IND) {
		/* check explicit source operand */
		err = check_reg_arg(env, insn->src_reg, SRC_OP);
		if (err)
			return err;
	}

	/* reset caller saved regs to unreadable */
	for (i = 0; i < CALLER_SAVED_REGS; i++) {
		mark_reg_not_init(env, regs, caller_saved[i]);
		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
	}

	/* mark destination R0 register as readable, since it contains
	 * the value fetched from the packet.
	 * Already marked as written above.
	 */
	mark_reg_unknown(env, regs, BPF_REG_0);
	return 0;
}

static int check_return_code(struct bpf_verifier_env *env)
{
	struct bpf_reg_state *reg;
	struct tnum range = tnum_range(0, 1);

	switch (env->prog->type) {
	case BPF_PROG_TYPE_CGROUP_SKB:
	case BPF_PROG_TYPE_CGROUP_SOCK:
	case BPF_PROG_TYPE_SOCK_OPS:
	case BPF_PROG_TYPE_CGROUP_DEVICE:
		break;
	default:
		return 0;
	}

	reg = cur_regs(env) + BPF_REG_0;
	if (reg->type != SCALAR_VALUE) {
		verbose(env, "At program exit the register R0 is not a known value (%s)\n",
			reg_type_str[reg->type]);
		return -EINVAL;
	}

	if (!tnum_in(range, reg->var_off)) {
		verbose(env, "At program exit the register R0 ");
		if (!tnum_is_unknown(reg->var_off)) {
			char tn_buf[48];

			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
			verbose(env, "has value %s", tn_buf);
		} else {
			verbose(env, "has unknown scalar value");
		}
		verbose(env, " should have been 0 or 1\n");
		return -EINVAL;
	}
	return 0;
}
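
/* For the program types listed above the only verifiable exit values are the
 * constants 0 and 1; e.g. a cgroup/skb filter is expected to end in
 * 'return 1;' (allow) or 'return 0;' (drop), and returning an unbounded or
 * unknown scalar in R0 is rejected by check_return_code().
 */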

/* non-recursive DFS pseudo code
 * 1  procedure DFS-iterative(G,v):
 * 2      label v as discovered
 * 3      let S be a stack
 * 4      S.push(v)
 * 5      while S is not empty
 * 6            t <- S.pop()
 * 7            if t is what we're looking for:
 * 8                return t
 * 9            for all edges e in G.adjacentEdges(t) do
 * 10               if edge e is already labelled
 * 11                   continue with the next edge
 * 12               w <- G.adjacentVertex(t,e)
 * 13               if vertex w is not discovered and not explored
 * 14                   label e as tree-edge
 * 15                   label w as discovered
 * 16                   S.push(w)
 * 17                   continue at 5
 * 18               else if vertex w is discovered
 * 19                   label e as back-edge
 * 20               else
 * 21                   // vertex w is explored
 * 22                   label e as forward- or cross-edge
 * 23           label t as explored
 * 24           S.pop()
 *
 * convention:
 * 0x10 - discovered
 * 0x11 - discovered and fall-through edge labelled
 * 0x12 - discovered and fall-through and branch edges labelled
 * 0x20 - explored
 */

enum {
	DISCOVERED = 0x10,
	EXPLORED = 0x20,
	FALLTHROUGH = 1,
	BRANCH = 2,
};
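
/* Example of the encoding above: after push_insn() labels the fall-through
 * edge of a discovered insn, its state is DISCOVERED | FALLTHROUGH == 0x11;
 * labelling the branch edge afterwards stores DISCOVERED | BRANCH == 0x12,
 * which by the convention above means both edges have been labelled.
 */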

#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L)

static int *insn_stack;	/* stack of insns to process */
static int cur_stack;	/* current stack index */
static int *insn_state;

/* t, w, e - match pseudo-code above:
 * t - index of current instruction
 * w - next instruction
 * e - edge
 */
static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
{
	if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
		return 0;

	if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
		return 0;

	if (w < 0 || w >= env->prog->len) {
		verbose(env, "jump out of range from insn %d to %d\n", t, w);
		return -EINVAL;
	}

	if (e == BRANCH)
		/* mark branch target for state pruning */
		env->explored_states[w] = STATE_LIST_MARK;

	if (insn_state[w] == 0) {
		/* tree-edge */
		insn_state[t] = DISCOVERED | e;
		insn_state[w] = DISCOVERED;
		if (cur_stack >= env->prog->len)
			return -E2BIG;
		insn_stack[cur_stack++] = w;
		return 1;
	} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
		verbose(env, "back-edge from insn %d to %d\n", t, w);
		return -EINVAL;
	} else if (insn_state[w] == EXPLORED) {
		/* forward- or cross-edge */
		insn_state[t] = DISCOVERED | e;
	} else {
		verbose(env, "insn state internal bug\n");
		return -EFAULT;
	}
	return 0;
}

/* non-recursive depth-first-search to detect loops in BPF program
 * loop == back-edge in directed graph
 */
static int check_cfg(struct bpf_verifier_env *env)
{
	struct bpf_insn *insns = env->prog->insnsi;
	int insn_cnt = env->prog->len;
	int ret = 0;
	int i, t;

	ret = check_subprogs(env);
	if (ret < 0)
		return ret;

	insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
	if (!insn_state)
		return -ENOMEM;

	insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
	if (!insn_stack) {
		kfree(insn_state);
		return -ENOMEM;
	}

	insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
	insn_stack[0] = 0; /* 0 is the first instruction */
	cur_stack = 1;

peek_stack:
	if (cur_stack == 0)
		goto check_state;
	t = insn_stack[cur_stack - 1];

	if (BPF_CLASS(insns[t].code) == BPF_JMP) {
		u8 opcode = BPF_OP(insns[t].code);

		if (opcode == BPF_EXIT) {
			goto mark_explored;
		} else if (opcode == BPF_CALL) {
			ret = push_insn(t, t + 1, FALLTHROUGH, env);
			if (ret == 1)
				goto peek_stack;
			else if (ret < 0)
				goto err_free;
			if (t + 1 < insn_cnt)
				env->explored_states[t + 1] = STATE_LIST_MARK;
			if (insns[t].src_reg == BPF_PSEUDO_CALL) {
				env->explored_states[t] = STATE_LIST_MARK;
				ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
				if (ret == 1)
					goto peek_stack;
				else if (ret < 0)
					goto err_free;
			}
3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978
		} else if (opcode == BPF_JA) {
			if (BPF_SRC(insns[t].code) != BPF_K) {
				ret = -EINVAL;
				goto err_free;
			}
			/* unconditional jump with single edge */
			ret = push_insn(t, t + insns[t].off + 1,
					FALLTHROUGH, env);
			if (ret == 1)
				goto peek_stack;
			else if (ret < 0)
				goto err_free;
			/* tell verifier to check for equivalent states
			 * after every call and jump
			 */
			if (t + 1 < insn_cnt)
				env->explored_states[t + 1] = STATE_LIST_MARK;
		} else {
			/* conditional jump with two edges */
			env->explored_states[t] = STATE_LIST_MARK;
			ret = push_insn(t, t + 1, FALLTHROUGH, env);
			if (ret == 1)
				goto peek_stack;
			else if (ret < 0)
				goto err_free;

			ret = push_insn(t, t + insns[t].off + 1, BRANCH, env);
			if (ret == 1)
				goto peek_stack;
			else if (ret < 0)
				goto err_free;
		}
	} else {
		/* all other non-branch instructions with single
		 * fall-through edge
		 */
		ret = push_insn(t, t + 1, FALLTHROUGH, env);
		if (ret == 1)
			goto peek_stack;
		else if (ret < 0)
			goto err_free;
	}

mark_explored:
	insn_state[t] = EXPLORED;
	if (cur_stack-- <= 0) {
		verbose(env, "pop stack internal bug\n");
		ret = -EFAULT;
		goto err_free;
	}
	goto peek_stack;

check_state:
	for (i = 0; i < insn_cnt; i++) {
		if (insn_state[i] != EXPLORED) {
			verbose(env, "unreachable insn %d\n", i);
			ret = -EINVAL;
			goto err_free;
		}
	}
	ret = 0; /* cfg looks good */

err_free:
	kfree(insn_state);
	kfree(insn_stack);
	return ret;
}

/* check %cur's range satisfies %old's */
static bool range_within(struct bpf_reg_state *old,
			 struct bpf_reg_state *cur)
{
	return old->umin_value <= cur->umin_value &&
	       old->umax_value >= cur->umax_value &&
	       old->smin_value <= cur->smin_value &&
	       old->smax_value >= cur->smax_value;
}

/* Maximum number of register states that can exist at once */
#define ID_MAP_SIZE	(MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE)
struct idpair {
	u32 old;
	u32 cur;
};

/* If in the old state two registers had the same id, then they need to have
 * the same id in the new state as well.  But that id could be different from
 * the old state, so we need to track the mapping from old to new ids.
 * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent
 * regs with old id 5 must also have new id 9 for the new state to be safe.  But
 * regs with a different old id could still have new id 9, we don't care about
 * that.
 * So we look through our idmap to see if this old id has been seen before.  If
 * so, we require the new id to match; otherwise, we add the id pair to the map.
 */
static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap)
{
	unsigned int i;

	for (i = 0; i < ID_MAP_SIZE; i++) {
		if (!idmap[i].old) {
			/* Reached an empty slot; haven't seen this id before */
			idmap[i].old = old_id;
			idmap[i].cur = cur_id;
			return true;
		}
		if (idmap[i].old == old_id)
			return idmap[i].cur == cur_id;
	}
	/* We ran out of idmap slots, which should be impossible */
	WARN_ON_ONCE(1);
	return false;
}
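
/* Example for check_ids(): if in the explored (old) state two registers
 * shared id 5 and in the current state they share id 9, the first call
 * records the pair (5, 9) and the second call matches it; if the second
 * register instead carried id 7, the mapping breaks and the states are not
 * considered equivalent.
 */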

/* Returns true if (rold safe implies rcur safe) */
static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
		    struct idpair *idmap)
{
	bool equal;

	if (!(rold->live & REG_LIVE_READ))
		/* explored state didn't use this */
		return true;

	equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0;

	if (rold->type == PTR_TO_STACK)
		/* two stack pointers are equal only if they're pointing to
		 * the same stack frame, since fp-8 in foo != fp-8 in bar
		 */
		return equal && rold->frameno == rcur->frameno;

	if (equal)
		return true;

	if (rold->type == NOT_INIT)
		/* explored state can't have used this */
		return true;
	if (rcur->type == NOT_INIT)
		return false;
	switch (rold->type) {
	case SCALAR_VALUE:
		if (rcur->type == SCALAR_VALUE) {
			/* new val must satisfy old val knowledge */
			return range_within(rold, rcur) &&
			       tnum_in(rold->var_off, rcur->var_off);
		} else {
			/* We're trying to use a pointer in place of a scalar.
			 * Even if the scalar was unbounded, this could lead to
			 * pointer leaks because scalars are allowed to leak
			 * while pointers are not. We could make this safe in
			 * special cases if root is calling us, but it's
			 * probably not worth the hassle.
			 */
			return false;
		}
	case PTR_TO_MAP_VALUE:
		/* If the new min/max/var_off satisfy the old ones and
		 * everything else matches, we are OK.
		 * We don't care about the 'id' value, because nothing
		 * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL)
		 */
		return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
		       range_within(rold, rcur) &&
		       tnum_in(rold->var_off, rcur->var_off);
	case PTR_TO_MAP_VALUE_OR_NULL:
		/* a PTR_TO_MAP_VALUE could be safe to use as a
		 * PTR_TO_MAP_VALUE_OR_NULL into the same map.
		 * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
		 * checked, doing so could have affected others with the same
		 * id, and we can't check for that because we lost the id when
		 * we converted to a PTR_TO_MAP_VALUE.
		 */
		if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL)
			return false;
		if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
			return false;
		/* Check our ids match any regs they're supposed to */
		return check_ids(rold->id, rcur->id, idmap);
	case PTR_TO_PACKET_META:
	case PTR_TO_PACKET:
		if (rcur->type != rold->type)
			return false;
		/* We must have at least as much range as the old ptr
		 * did, so that any accesses which were safe before are
		 * still safe.  This is true even if old range < old off,
		 * since someone could have accessed through (ptr - k), or
		 * even done ptr -= k in a register, to get a safe access.
		 */
		if (rold->range > rcur->range)
			return false;
		/* If the offsets don't match, we can't trust our alignment;
		 * nor can we be sure that we won't fall out of range.
		 */
		if (rold->off != rcur->off)
			return false;
		/* id relations must be preserved */
		if (rold->id && !check_ids(rold->id, rcur->id, idmap))
			return false;
		/* new val must satisfy old val knowledge */
		return range_within(rold, rcur) &&
		       tnum_in(rold->var_off, rcur->var_off);
	case PTR_TO_CTX:
	case CONST_PTR_TO_MAP:
	case PTR_TO_PACKET_END:
		/* Only valid matches are exact, which memcmp() above
		 * would have accepted
		 */
	default:
		/* Don't know what's going on, just say it's not safe */
		return false;
	}

	/* Shouldn't get here; if we do, say it's not safe */
	WARN_ON_ONCE(1);
	return false;
}
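
/* Example for regsafe(): an old PTR_TO_PACKET with range == 14 is a safe
 * template for a current PTR_TO_PACKET with the same off and id mapping but
 * range == 64, since every access proven safe with 14 bytes of headroom is
 * still safe with 64; the opposite direction fails the
 * 'rold->range > rcur->range' check above.
 */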

static bool stacksafe(struct bpf_func_state *old,
		      struct bpf_func_state *cur,
		      struct idpair *idmap)
{
	int i, spi;

	/* if explored stack has more populated slots than current stack
	 * such stacks are not equivalent
	 */
	if (old->allocated_stack > cur->allocated_stack)
		return false;

	/* walk slots of the explored stack and ignore any additional
	 * slots in the current stack, since explored(safe) state
	 * didn't use them
	 */
	for (i = 0; i < old->allocated_stack; i++) {
		spi = i / BPF_REG_SIZE;

		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ))
			/* explored state didn't use this */
			continue;

		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
			continue;
		/* if old state was safe with misc data in the stack
		 * it will be safe with zero-initialized stack.
		 * The opposite is not true
		 */
		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC &&
		    cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO)
			continue;
		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
		    cur->stack[spi].slot_type[i % BPF_REG_SIZE])
			/* Ex: old explored (safe) state has STACK_SPILL in
			 * this stack slot, but current has STACK_MISC ->
			 * these verifier states are not equivalent,
			 * return false to continue verification of this path
			 */
			return false;
		if (i % BPF_REG_SIZE)
			continue;
		if (old->stack[spi].slot_type[0] != STACK_SPILL)
			continue;
		if (!regsafe(&old->stack[spi].spilled_ptr,
			     &cur->stack[spi].spilled_ptr,
			     idmap))
			/* when explored and current stack slot are both storing
			 * spilled registers, check that stored pointers types
			 * are the same as well.
			 * Ex: explored safe path could have stored
			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
			 * but current path has stored:
			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
			 * such verifier states are not equivalent.
			 * return false to continue verification of this path
			 */
			return false;
	}
	return true;
}
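
/* Example for stacksafe(): an old state that had STACK_MISC at some slot is
 * still a safe template for a current state that has STACK_ZERO there, since
 * zero-initialized stack is just a special case of 'misc' data; the reverse
 * (old STACK_ZERO, current STACK_MISC) is rejected.
 */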

/* compare two verifier states
 *
 * all states stored in state_list are known to be valid, since
 * verifier reached 'bpf_exit' instruction through them
 *
 * this function is called when verifier exploring different branches of
 * execution popped from the state stack. If it sees an old state that has
 * more strict register state and more strict stack state then this execution
 * branch doesn't need to be explored further, since verifier already
 * concluded that more strict state leads to valid finish.
 *
 * Therefore two states are equivalent if register state is more conservative
 * and explored stack state is more conservative than the current one.
 * Example:
 *       explored                   current
 * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC)
 * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC)
 *
 * In other words if current stack state (one being explored) has more
 * valid slots than old one that already passed validation, it means
 * the verifier can stop exploring and conclude that current state is valid too
 *
 * Similarly with registers. If explored state has register type as invalid
 * whereas register type in current state is meaningful, it means that
 * the current state will reach 'bpf_exit' instruction safely
 */
static bool func_states_equal(struct bpf_func_state *old,
			      struct bpf_func_state *cur)
{
	struct idpair *idmap;
	bool ret = false;
	int i;

	idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL);
	/* If we failed to allocate the idmap, just say it's not safe */
	if (!idmap)
		return false;

	for (i = 0; i < MAX_BPF_REG; i++) {
		if (!regsafe(&old->regs[i], &cur->regs[i], idmap))
			goto out_free;
	}

	if (!stacksafe(old, cur, idmap))
		goto out_free;
	ret = true;
out_free:
	kfree(idmap);
	return ret;
}

static bool states_equal(struct bpf_verifier_env *env,
			 struct bpf_verifier_state *old,
			 struct bpf_verifier_state *cur)
{
	int i;

	if (old->curframe != cur->curframe)
		return false;

	/* for states to be equal callsites have to be the same
	 * and all frame states need to be equivalent
	 */
	for (i = 0; i <= old->curframe; i++) {
		if (old->frame[i]->callsite != cur->frame[i]->callsite)
			return false;
		if (!func_states_equal(old->frame[i], cur->frame[i]))
			return false;
	}
	return true;
}

/* A write screens off any subsequent reads; but write marks come from the
 * straight-line code between a state and its parent.  When we arrive at an
 * equivalent state (jump target or such) we didn't arrive by the straight-line
 * code, so read marks in the state must propagate to the parent regardless
 * of the state's write marks. That's what 'parent == state->parent' comparison
 * in mark_reg_read() and mark_stack_slot_read() is for.
 */
static int propagate_liveness(struct bpf_verifier_env *env,
			      const struct bpf_verifier_state *vstate,
			      struct bpf_verifier_state *vparent)
{
	int i, frame, err = 0;
	struct bpf_func_state *state, *parent;

	if (vparent->curframe != vstate->curframe) {
		WARN(1, "propagate_live: parent frame %d current frame %d\n",
		     vparent->curframe, vstate->curframe);
		return -EFAULT;
	}
	/* Propagate read liveness of registers... */
	BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
	/* We don't need to worry about FP liveness because it's read-only */
	for (i = 0; i < BPF_REG_FP; i++) {
		if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ)
			continue;
		if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) {
			err = mark_reg_read(env, vstate, vparent, i);
			if (err)
				return err;
		}
	}

	/* ... and stack slots */
	for (frame = 0; frame <= vstate->curframe; frame++) {
		state = vstate->frame[frame];
		parent = vparent->frame[frame];
		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
			    i < parent->allocated_stack / BPF_REG_SIZE; i++) {
			if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
				continue;
			if (state->stack[i].spilled_ptr.live & REG_LIVE_READ)
				mark_stack_slot_read(env, vstate, vparent, i, frame);
		}
	}
	return err;
}

static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
	struct bpf_verifier_state_list *new_sl;
	struct bpf_verifier_state_list *sl;
	struct bpf_verifier_state *cur = env->cur_state;
	int i, j, err;

	sl = env->explored_states[insn_idx];
	if (!sl)
		/* this 'insn_idx' instruction wasn't marked, so we will not
		 * be doing state search here
		 */
		return 0;

	while (sl != STATE_LIST_MARK) {
		if (states_equal(env, &sl->state, cur)) {
			/* reached equivalent register/stack state,
			 * prune the search.
			 * Registers read by the continuation are read by us.
			 * If we have any write marks in env->cur_state, they
			 * will prevent corresponding reads in the continuation
			 * from reaching our parent (an explored_state).  Our
			 * own state will get the read marks recorded, but
			 * they'll be immediately forgotten as we're pruning
			 * this state and will pop a new one.
			 */
			err = propagate_liveness(env, &sl->state, cur);
			if (err)
				return err;
			return 1;
		}
		sl = sl->next;
	}

	/* there were no equivalent states, remember current one.
	 * technically the current state is not proven to be safe yet,
	 * but it will either reach outer most bpf_exit (which means it's safe)
	 * or it will be rejected. Since there are no loops, we won't be
	 * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
	 * again on the way to bpf_exit
	 */
	new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
	if (!new_sl)
		return -ENOMEM;

	/* add new state to the head of linked list */
	err = copy_verifier_state(&new_sl->state, cur);
	if (err) {
		free_verifier_state(&new_sl->state, false);
		kfree(new_sl);
		return err;
	}
	new_sl->next = env->explored_states[insn_idx];
	env->explored_states[insn_idx] = new_sl;
	/* connect new state to parentage chain */
	cur->parent = &new_sl->state;
	/* clear write marks in current state: the writes we did are not writes
	 * our child did, so they don't screen off its reads from us.
	 * (There are no read marks in current state, because reads always mark
	 * their parent and current state never has children yet.  Only
	 * explored_states can get read marks.)
	 */
	for (i = 0; i < BPF_REG_FP; i++)
		cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE;

	/* all stack frames are accessible from callee, clear them all */
	for (j = 0; j <= cur->curframe; j++) {
		struct bpf_func_state *frame = cur->frame[j];

		for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++)
			frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
	}
	return 0;
}

static int do_check(struct bpf_verifier_env *env)
{
	struct bpf_verifier_state *state;
	struct bpf_insn *insns = env->prog->insnsi;
	struct bpf_reg_state *regs;
	int insn_cnt = env->prog->len, i;
	int insn_idx, prev_insn_idx = 0;
	int insn_processed = 0;
	bool do_print_state = false;

	state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
	if (!state)
		return -ENOMEM;
	state->curframe = 0;
	state->parent = NULL;
	state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
	if (!state->frame[0]) {
		kfree(state);
		return -ENOMEM;
	}
	env->cur_state = state;
	init_func_state(env, state->frame[0],
			BPF_MAIN_FUNC /* callsite */,
			0 /* frameno */,
			0 /* subprogno, zero == main subprog */);
	insn_idx = 0;
	for (;;) {
		struct bpf_insn *insn;
		u8 class;
		int err;

		if (insn_idx >= insn_cnt) {
			verbose(env, "invalid insn idx %d insn_cnt %d\n",
				insn_idx, insn_cnt);
			return -EFAULT;
		}

		insn = &insns[insn_idx];
		class = BPF_CLASS(insn->code);

		if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
			verbose(env,
				"BPF program is too large. Processed %d insn\n",
				insn_processed);
			return -E2BIG;
		}

		err = is_state_visited(env, insn_idx);
		if (err < 0)
			return err;
		if (err == 1) {
			/* found equivalent state, can prune the search */
			if (env->log.level) {
				if (do_print_state)
					verbose(env, "\nfrom %d to %d: safe\n",
						prev_insn_idx, insn_idx);
				else
					verbose(env, "%d: safe\n", insn_idx);
			}
			goto process_bpf_exit;
		}

		if (need_resched())
			cond_resched();

		if (env->log.level > 1 || (env->log.level && do_print_state)) {
			if (env->log.level > 1)
				verbose(env, "%d:", insn_idx);
			else
				verbose(env, "\nfrom %d to %d:",
					prev_insn_idx, insn_idx);
			print_verifier_state(env, state->frame[state->curframe]);
			do_print_state = false;
		}

		if (env->log.level) {
			const struct bpf_insn_cbs cbs = {
				.cb_print	= verbose,
			};

			verbose(env, "%d: ", insn_idx);
			print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks);
		}

		if (bpf_prog_is_dev_bound(env->prog->aux)) {
			err = bpf_prog_offload_verify_insn(env, insn_idx,
							   prev_insn_idx);
			if (err)
				return err;
		}

		regs = cur_regs(env);
		env->insn_aux_data[insn_idx].seen = true;
		if (class == BPF_ALU || class == BPF_ALU64) {
			err = check_alu_op(env, insn);
			if (err)
				return err;

		} else if (class == BPF_LDX) {
			enum bpf_reg_type *prev_src_type, src_reg_type;

			/* check for reserved fields is already done */

			/* check src operand */
			err = check_reg_arg(env, insn->src_reg, SRC_OP);
			if (err)
				return err;

			err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
			if (err)
				return err;

			src_reg_type = regs[insn->src_reg].type;

			/* check that memory (src_reg + off) is readable,
			 * the state of dst_reg will be updated by this func
			 */
			err = check_mem_access(env, insn_idx, insn->src_reg, insn->off,
					       BPF_SIZE(insn->code), BPF_READ,
					       insn->dst_reg);
			if (err)
				return err;

			prev_src_type = &env->insn_aux_data[insn_idx].ptr_type;

			if (*prev_src_type == NOT_INIT) {
				/* saw a valid insn
				 * dst_reg = *(u32 *)(src_reg + off)
				 * save type to validate intersecting paths
				 */
				*prev_src_type = src_reg_type;

			} else if (src_reg_type != *prev_src_type &&
				   (src_reg_type == PTR_TO_CTX ||
				    *prev_src_type == PTR_TO_CTX)) {
				/* A user program is trying to use the same insn
				 * dst_reg = *(u32*) (src_reg + off)
				 * with different pointer types:
				 * src_reg == ctx in one branch and
				 * src_reg == stack|map in some other branch.
				 * Reject it.
				 */
				verbose(env, "same insn cannot be used with different pointers\n");
				return -EINVAL;
			}

		} else if (class == BPF_STX) {
			enum bpf_reg_type *prev_dst_type, dst_reg_type;

			if (BPF_MODE(insn->code) == BPF_XADD) {
				err = check_xadd(env, insn_idx, insn);
				if (err)
					return err;
				insn_idx++;
				continue;
			}

			/* check src1 operand */
			err = check_reg_arg(env, insn->src_reg, SRC_OP);
			if (err)
				return err;
			/* check src2 operand */
			err = check_reg_arg(env, insn->dst_reg, SRC_OP);
			if (err)
				return err;

			dst_reg_type = regs[insn->dst_reg].type;

			/* check that memory (dst_reg + off) is writeable */
			err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
					       BPF_SIZE(insn->code), BPF_WRITE,
					       insn->src_reg);
			if (err)
				return err;

			prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type;

			if (*prev_dst_type == NOT_INIT) {
				*prev_dst_type = dst_reg_type;
			} else if (dst_reg_type != *prev_dst_type &&
				   (dst_reg_type == PTR_TO_CTX ||
				    *prev_dst_type == PTR_TO_CTX)) {
				verbose(env, "same insn cannot be used with different pointers\n");
				return -EINVAL;
			}

		} else if (class == BPF_ST) {
			if (BPF_MODE(insn->code) != BPF_MEM ||
			    insn->src_reg != BPF_REG_0) {
				verbose(env, "BPF_ST uses reserved fields\n");
				return -EINVAL;
			}
			/* check src operand */
			err = check_reg_arg(env, insn->dst_reg, SRC_OP);
			if (err)
				return err;

			/* check that memory (dst_reg + off) is writeable */
			err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
					       BPF_SIZE(insn->code), BPF_WRITE,
					       -1);
			if (err)
				return err;

		} else if (class == BPF_JMP) {
			u8 opcode = BPF_OP(insn->code);

			if (opcode == BPF_CALL) {
				if (BPF_SRC(insn->code) != BPF_K ||
				    insn->off != 0 ||
				    (insn->src_reg != BPF_REG_0 &&
				     insn->src_reg != BPF_PSEUDO_CALL) ||
				    insn->dst_reg != BPF_REG_0) {
					verbose(env, "BPF_CALL uses reserved fields\n");
					return -EINVAL;
				}

				if (insn->src_reg == BPF_PSEUDO_CALL)
					err = check_func_call(env, insn, &insn_idx);
				else
					err = check_helper_call(env, insn->imm, insn_idx);
				if (err)
					return err;

			} else if (opcode == BPF_JA) {
				if (BPF_SRC(insn->code) != BPF_K ||
				    insn->imm != 0 ||
				    insn->src_reg != BPF_REG_0 ||
				    insn->dst_reg != BPF_REG_0) {
					verbose(env, "BPF_JA uses reserved fields\n");
					return -EINVAL;
				}

				insn_idx += insn->off + 1;
				continue;

			} else if (opcode == BPF_EXIT) {
				if (BPF_SRC(insn->code) != BPF_K ||
				    insn->imm != 0 ||
				    insn->src_reg != BPF_REG_0 ||
				    insn->dst_reg != BPF_REG_0) {
					verbose(env, "BPF_EXIT uses reserved fields\n");
					return -EINVAL;
				}

				if (state->curframe) {
					/* exit from nested function */
					prev_insn_idx = insn_idx;
					err = prepare_func_exit(env, &insn_idx);
					if (err)
						return err;
					do_print_state = true;
					continue;
				}

				/* eBPF calling convention is such that R0 is used
				 * to return the value from eBPF program.
				 * Make sure that it's readable at this time
				 * of bpf_exit, which means that program wrote
				 * something into it earlier
				 */
				err = check_reg_arg(env, BPF_REG_0, SRC_OP);
				if (err)
					return err;

				if (is_pointer_value(env, BPF_REG_0)) {
					verbose(env, "R0 leaks addr as return value\n");
					return -EACCES;
				}

				err = check_return_code(env);
				if (err)
					return err;
process_bpf_exit:
				err = pop_stack(env, &prev_insn_idx, &insn_idx);
				if (err < 0) {
					if (err != -ENOENT)
						return err;
					break;
				} else {
					do_print_state = true;
					continue;
				}
			} else {
				err = check_cond_jmp_op(env, insn, &insn_idx);
				if (err)
					return err;
			}
		} else if (class == BPF_LD) {
			u8 mode = BPF_MODE(insn->code);

			if (mode == BPF_ABS || mode == BPF_IND) {
				err = check_ld_abs(env, insn);
				if (err)
					return err;

			} else if (mode == BPF_IMM) {
				err = check_ld_imm(env, insn);
				if (err)
					return err;

				insn_idx++;
				env->insn_aux_data[insn_idx].seen = true;
			} else {
				verbose(env, "invalid BPF_LD mode\n");
				return -EINVAL;
			}
		} else {
			verbose(env, "unknown insn class %d\n", class);
			return -EINVAL;
		}

		insn_idx++;
	}

	verbose(env, "processed %d insns, stack depth ", insn_processed);
	for (i = 0; i < env->subprog_cnt + 1; i++) {
		u32 depth = env->subprog_stack_depth[i];

		verbose(env, "%d", depth);
		if (i + 1 < env->subprog_cnt + 1)
			verbose(env, "+");
	}
	verbose(env, "\n");
	env->prog->aux->stack_depth = env->subprog_stack_depth[0];
	return 0;
}

static int check_map_prealloc(struct bpf_map *map)
{
	return (map->map_type != BPF_MAP_TYPE_HASH &&
		map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
		map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) ||
		!(map->map_flags & BPF_F_NO_PREALLOC);
}

static int check_map_prog_compatibility(struct bpf_verifier_env *env,
					struct bpf_map *map,
					struct bpf_prog *prog)
{
	/* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use
	 * preallocated hash maps, since doing memory allocation
	 * in overflow_handler can crash depending on where nmi got
	 * triggered.
	 */
	if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
		if (!check_map_prealloc(map)) {
			verbose(env, "perf_event programs can only use preallocated hash map\n");
			return -EINVAL;
		}
		if (map->inner_map_meta &&
		    !check_map_prealloc(map->inner_map_meta)) {
			verbose(env, "perf_event programs can only use preallocated inner hash map\n");
			return -EINVAL;
		}
	}
	return 0;
}

/* look for pseudo eBPF instructions that access map FDs and
 * replace them with actual map pointers
 */
static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
{
	struct bpf_insn *insn = env->prog->insnsi;
	int insn_cnt = env->prog->len;
	int i, j, err;

	err = bpf_prog_calc_tag(env->prog);
	if (err)
		return err;

	for (i = 0; i < insn_cnt; i++, insn++) {
		if (BPF_CLASS(insn->code) == BPF_LDX &&
		    (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
			verbose(env, "BPF_LDX uses reserved fields\n");
			return -EINVAL;
		}

		if (BPF_CLASS(insn->code) == BPF_STX &&
		    ((BPF_MODE(insn->code) != BPF_MEM &&
		      BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
			verbose(env, "BPF_STX uses reserved fields\n");
			return -EINVAL;
		}

		if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
			struct bpf_map *map;
			struct fd f;

			if (i == insn_cnt - 1 || insn[1].code != 0 ||
			    insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
			    insn[1].off != 0) {
				verbose(env, "invalid bpf_ld_imm64 insn\n");
				return -EINVAL;
			}

			if (insn->src_reg == 0)
				/* valid generic load 64-bit imm */
				goto next_insn;

			if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
				verbose(env,
					"unrecognized bpf_ld_imm64 insn\n");
				return -EINVAL;
			}

			f = fdget(insn->imm);
			map = __bpf_map_get(f);
			if (IS_ERR(map)) {
				verbose(env, "fd %d is not pointing to valid bpf_map\n",
					insn->imm);
				return PTR_ERR(map);
			}

			err = check_map_prog_compatibility(env, map, env->prog);
			if (err) {
				fdput(f);
				return err;
			}

			/* store map pointer inside BPF_LD_IMM64 instruction */
			insn[0].imm = (u32) (unsigned long) map;
			insn[1].imm = ((u64) (unsigned long) map) >> 32;

			/* check whether we recorded this map already */
			for (j = 0; j < env->used_map_cnt; j++)
				if (env->used_maps[j] == map) {
					fdput(f);
					goto next_insn;
				}

			if (env->used_map_cnt >= MAX_USED_MAPS) {
				fdput(f);
				return -E2BIG;
			}

			/* hold the map. If the program is rejected by verifier,
			 * the map will be released by release_maps() or it
			 * will be used by the valid program until it's unloaded
			 * and all maps are released in free_bpf_prog_info()
			 */
			map = bpf_map_inc(map, false);
			if (IS_ERR(map)) {
				fdput(f);
				return PTR_ERR(map);
			}
			env->used_maps[env->used_map_cnt++] = map;

			fdput(f);
next_insn:
			insn++;
			i++;
		}
	}

	/* now all pseudo BPF_LD_IMM64 instructions load valid
	 * 'struct bpf_map *' into a register instead of user map_fd.
	 * These pointers will be used later by verifier to validate map access.
	 */
	return 0;
}

/* drop refcnt of maps used by the rejected program */
static void release_maps(struct bpf_verifier_env *env)
{
	int i;

	for (i = 0; i < env->used_map_cnt; i++)
		bpf_map_put(env->used_maps[i]);
}

/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
{
	struct bpf_insn *insn = env->prog->insnsi;
	int insn_cnt = env->prog->len;
	int i;

	for (i = 0; i < insn_cnt; i++, insn++)
		if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
			insn->src_reg = 0;
}

/* single env->prog->insnsi[off] instruction was replaced with the range
 * insnsi[off, off + cnt).  Adjust corresponding insn_aux_data by copying
 * [0, off) and [off, end) to new locations, so the patched range stays zero
 */
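/* Worked example (illustrative): patching the single insn at off == 10
 * with a 3-insn sequence (cnt == 3) copies aux[0..9] unchanged, moves the
 * old aux[10] to new index 12 (off + cnt - 1) and leaves the fresh slots
 * 10 and 11 zeroed; the loop below then marks those fresh slots as 'seen'
 * so that sanitize_dead_code() does not turn the patched insns into nops.
 */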
static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
				u32 off, u32 cnt)
{
	struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
	int i;

	if (cnt == 1)
		return 0;
	new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len);
	if (!new_data)
		return -ENOMEM;
	memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
	memcpy(new_data + off + cnt - 1, old_data + off,
	       sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
	for (i = off; i < off + cnt - 1; i++)
		new_data[i].seen = true;
	env->insn_aux_data = new_data;
	vfree(old_data);
	return 0;
}

static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
{
	int i;

	if (len == 1)
		return;
	for (i = 0; i < env->subprog_cnt; i++) {
		if (env->subprog_starts[i] < off)
			continue;
		env->subprog_starts[i] += len - 1;
	}
}

static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
					    const struct bpf_insn *patch, u32 len)
{
	struct bpf_prog *new_prog;

	new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
	if (!new_prog)
		return NULL;
	if (adjust_insn_aux_data(env, new_prog->len, off, len))
		return NULL;
	adjust_subprog_starts(env, off, len);
	return new_prog;
}

/* The verifier does more data flow analysis than llvm and will not explore
 * branches that are dead at run time. Malicious programs can have dead code
 * too. Therefore replace all dead at-run-time code with nops.
 */
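/* A minimal sketch of the effect (assumed example): insns that no explored
 * path ever reached (e.g. the body of a branch the verifier proved cannot
 * be taken) were never marked 'seen' and are overwritten below with the
 * harmless 'r0 = r0' move used as a nop.
 */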
static void sanitize_dead_code(struct bpf_verifier_env *env)
{
	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
	struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0);
	struct bpf_insn *insn = env->prog->insnsi;
	const int insn_cnt = env->prog->len;
	int i;

	for (i = 0; i < insn_cnt; i++) {
		if (aux_data[i].seen)
			continue;
		memcpy(insn + i, &nop, sizeof(nop));
	}
}

/* convert load instructions that access fields of 'struct __sk_buff'
 * into a sequence of instructions that access fields of 'struct sk_buff'
 */
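/* For example (illustrative; the exact rewrite depends on the program
 * type's convert_ctx_access() callback): a socket filter's load
 *	r0 = *(u32 *)(r1 + offsetof(struct __sk_buff, len))
 * is rewritten into an equivalent load of skb->len at its real offset
 * inside 'struct sk_buff'.
 */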
static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
	const struct bpf_verifier_ops *ops = env->ops;
	int i, cnt, size, ctx_field_size, delta = 0;
	const int insn_cnt = env->prog->len;
	struct bpf_insn insn_buf[16], *insn;
	struct bpf_prog *new_prog;
	enum bpf_access_type type;
	bool is_narrower_load;
	u32 target_size;

	if (ops->gen_prologue) {
		cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
					env->prog);
		if (cnt >= ARRAY_SIZE(insn_buf)) {
			verbose(env, "bpf verifier is misconfigured\n");
			return -EINVAL;
		} else if (cnt) {
			new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
			if (!new_prog)
				return -ENOMEM;

			env->prog = new_prog;
			delta += cnt - 1;
		}
	}

	if (!ops->convert_ctx_access)
		return 0;

	insn = env->prog->insnsi + delta;

	for (i = 0; i < insn_cnt; i++, insn++) {
		if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
		    insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
		    insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
		    insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
			type = BPF_READ;
		else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
			 insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
			 insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
			 insn->code == (BPF_STX | BPF_MEM | BPF_DW))
			type = BPF_WRITE;
		else
			continue;

		if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
			continue;

		ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
		size = BPF_LDST_BYTES(insn);

		/* If the read access is a narrower load of the field,
		 * convert to a 4/8-byte load, to minimize program type specific
		 * convert_ctx_access changes. If conversion is successful,
		 * we will apply proper mask to the result.
		 */
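		/* E.g. (illustrative) a 1-byte read of a 4-byte context field
		 * becomes a 4-byte load of the aligned field here; the
		 * BPF_ALU32_IMM(BPF_AND, dst_reg, 0xff) appended further down
		 * masks the result back to the requested width.
		 */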
		is_narrower_load = size < ctx_field_size;
		if (is_narrower_load) {
			u32 off = insn->off;
			u8 size_code;

			if (type == BPF_WRITE) {
				verbose(env, "bpf verifier narrow ctx access misconfigured\n");
				return -EINVAL;
			}

			size_code = BPF_H;
			if (ctx_field_size == 4)
				size_code = BPF_W;
			else if (ctx_field_size == 8)
				size_code = BPF_DW;

			insn->off = off & ~(ctx_field_size - 1);
			insn->code = BPF_LDX | BPF_MEM | size_code;
		}

		target_size = 0;
		cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog,
					      &target_size);
		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
		    (ctx_field_size && !target_size)) {
			verbose(env, "bpf verifier is misconfigured\n");
			return -EINVAL;
		}

		if (is_narrower_load && size < target_size) {
			if (ctx_field_size <= 4)
				insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
								(1 << size * 8) - 1);
			else
				insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
								(1 << size * 8) - 1);
		}

		new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
		if (!new_prog)
			return -ENOMEM;

		delta += cnt - 1;

		/* keep walking new program and skip insns we just inserted */
		env->prog = new_prog;
		insn      = new_prog->insnsi + i + delta;
	}

	return 0;
}

static int jit_subprogs(struct bpf_verifier_env *env)
{
	struct bpf_prog *prog = env->prog, **func, *tmp;
	int i, j, subprog_start, subprog_end = 0, len, subprog;
	struct bpf_insn *insn;
	void *old_bpf_func;
	int err = -ENOMEM;

	if (env->subprog_cnt == 0)
		return 0;

	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
		if (insn->code != (BPF_JMP | BPF_CALL) ||
		    insn->src_reg != BPF_PSEUDO_CALL)
			continue;
		subprog = find_subprog(env, i + insn->imm + 1);
		if (subprog < 0) {
			WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
				  i + insn->imm + 1);
			return -EFAULT;
		}
		/* temporarily remember subprog id inside insn instead of
		 * aux_data, since next loop will split up all insns into funcs
		 */
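		/* e.g. (illustrative) a bpf-to-bpf call at index i with
		 * imm == +5 targets insn i + 6; find_subprog() above mapped
		 * that target to its subprogram index, which (plus one) is
		 * stashed in insn->off until the insns are split per func.
		 */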
		insn->off = subprog + 1;
		/* remember original imm in case JIT fails and fallback
		 * to interpreter will be needed
		 */
		env->insn_aux_data[i].call_imm = insn->imm;
		/* point imm to __bpf_call_base+1 from JITs point of view */
		insn->imm = 1;
	}

	func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL);
	if (!func)
		return -ENOMEM;

	for (i = 0; i <= env->subprog_cnt; i++) {
		subprog_start = subprog_end;
		if (env->subprog_cnt == i)
			subprog_end = prog->len;
		else
			subprog_end = env->subprog_starts[i];

		len = subprog_end - subprog_start;
		func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
		if (!func[i])
			goto out_free;
		memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
		       len * sizeof(struct bpf_insn));
		func[i]->type = prog->type;
		func[i]->len = len;
		if (bpf_prog_calc_tag(func[i]))
			goto out_free;
		func[i]->is_func = 1;
		/* Use bpf_prog_F_tag to indicate functions in stack traces.
		 * Long term would need debug info to populate names
		 */
		func[i]->aux->name[0] = 'F';
		func[i]->aux->stack_depth = env->subprog_stack_depth[i];
		func[i]->jit_requested = 1;
		func[i] = bpf_int_jit_compile(func[i]);
		if (!func[i]->jited) {
			err = -ENOTSUPP;
			goto out_free;
		}
		cond_resched();
	}
	/* at this point all bpf functions were successfully JITed
	 * now populate all bpf_calls with correct addresses and
	 * run last pass of JIT
	 */
	for (i = 0; i <= env->subprog_cnt; i++) {
		insn = func[i]->insnsi;
		for (j = 0; j < func[i]->len; j++, insn++) {
			if (insn->code != (BPF_JMP | BPF_CALL) ||
			    insn->src_reg != BPF_PSEUDO_CALL)
				continue;
			subprog = insn->off;
			insn->off = 0;
			insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
				func[subprog]->bpf_func -
				__bpf_call_base;
		}
	}
	for (i = 0; i <= env->subprog_cnt; i++) {
		old_bpf_func = func[i]->bpf_func;
		tmp = bpf_int_jit_compile(func[i]);
		if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
			verbose(env, "JIT doesn't support bpf-to-bpf calls\n");
			err = -EFAULT;
			goto out_free;
		}
		cond_resched();
	}

	/* finally lock prog and jit images for all functions and
	 * populate kallsyms
	 */
	for (i = 0; i <= env->subprog_cnt; i++) {
		bpf_prog_lock_ro(func[i]);
		bpf_prog_kallsyms_add(func[i]);
	}

	/* Last step: make now unused interpreter insns from main
	 * prog consistent for later dump requests, so they can
	 * later look the same as if they were interpreted only.
	 */
	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
		unsigned long addr;

		if (insn->code != (BPF_JMP | BPF_CALL) ||
		    insn->src_reg != BPF_PSEUDO_CALL)
			continue;
		insn->off = env->insn_aux_data[i].call_imm;
		subprog = find_subprog(env, i + insn->off + 1);
		addr  = (unsigned long)func[subprog + 1]->bpf_func;
		addr &= PAGE_MASK;
		insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
			    addr - __bpf_call_base;
	}

	prog->jited = 1;
	prog->bpf_func = func[0]->bpf_func;
	prog->aux->func = func;
	prog->aux->func_cnt = env->subprog_cnt + 1;
	return 0;
out_free:
	for (i = 0; i <= env->subprog_cnt; i++)
		if (func[i])
			bpf_jit_free(func[i]);
	kfree(func);
	/* cleanup main prog to be interpreted */
	prog->jit_requested = 0;
	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
		if (insn->code != (BPF_JMP | BPF_CALL) ||
		    insn->src_reg != BPF_PSEUDO_CALL)
			continue;
		insn->off = 0;
		insn->imm = env->insn_aux_data[i].call_imm;
	}
	return err;
}

static int fixup_call_args(struct bpf_verifier_env *env)
{
	struct bpf_prog *prog = env->prog;
	struct bpf_insn *insn = prog->insnsi;
	int i, depth;

	if (env->prog->jit_requested)
		if (jit_subprogs(env) == 0)
			return 0;

	for (i = 0; i < prog->len; i++, insn++) {
		if (insn->code != (BPF_JMP | BPF_CALL) ||
		    insn->src_reg != BPF_PSEUDO_CALL)
			continue;
		depth = get_callee_stack_depth(env, insn, i);
		if (depth < 0)
			return depth;
		bpf_patch_call_args(insn, depth);
	}
	return 0;
}

/* fixup insn->imm field of bpf_call instructions
 * and inline eligible helpers as explicit sequence of BPF instructions
 *
 * this function is called after eBPF program passed verification
 */
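/* Two illustrative outcomes (assumed examples): a bpf_map_lookup_elem()
 * call on an array map may be replaced inline by the map's
 * ->map_gen_lookup() sequence of ALU/branch insns, while e.g. a
 * bpf_get_prandom_u32() call simply has insn->imm rewritten to the
 * helper's address relative to __bpf_call_base.
 */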
static int fixup_bpf_calls(struct bpf_verifier_env *env)
{
	struct bpf_prog *prog = env->prog;
	struct bpf_insn *insn = prog->insnsi;
	const struct bpf_func_proto *fn;
	const int insn_cnt = prog->len;
	struct bpf_insn insn_buf[16];
	struct bpf_prog *new_prog;
	struct bpf_map *map_ptr;
	int i, cnt, delta = 0;

	for (i = 0; i < insn_cnt; i++, insn++) {
		if (insn->code != (BPF_JMP | BPF_CALL))
			continue;
		if (insn->src_reg == BPF_PSEUDO_CALL)
			continue;

		if (insn->imm == BPF_FUNC_get_route_realm)
			prog->dst_needed = 1;
		if (insn->imm == BPF_FUNC_get_prandom_u32)
			bpf_user_rnd_init_once();
		if (insn->imm == BPF_FUNC_override_return)
			prog->kprobe_override = 1;
		if (insn->imm == BPF_FUNC_tail_call) {
			/* If we tail call into other programs, we
			 * cannot make any assumptions since they can
			 * be replaced dynamically during runtime in
			 * the program array.
			 */
			prog->cb_access = 1;
			env->prog->aux->stack_depth = MAX_BPF_STACK;

			/* mark bpf_tail_call as different opcode to avoid
			 * conditional branch in the interpreter for every normal
			 * call and to prevent accidental JITing by JIT compiler
			 * that doesn't support bpf_tail_call yet
			 */
			insn->imm = 0;
			insn->code = BPF_JMP | BPF_TAIL_CALL;
			continue;
		}

		/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
		 * handlers are currently limited to 64 bit only.
		 */
		if (prog->jit_requested && BITS_PER_LONG == 64 &&
		    insn->imm == BPF_FUNC_map_lookup_elem) {
			map_ptr = env->insn_aux_data[i + delta].map_ptr;
			if (map_ptr == BPF_MAP_PTR_POISON ||
			    !map_ptr->ops->map_gen_lookup)
				goto patch_call_imm;

			cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
			if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
				verbose(env, "bpf verifier is misconfigured\n");
				return -EINVAL;
			}

			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
						       cnt);
			if (!new_prog)
				return -ENOMEM;

			delta += cnt - 1;

			/* keep walking new program and skip insns we just inserted */
			env->prog = prog = new_prog;
			insn      = new_prog->insnsi + i + delta;
			continue;
		}

		if (insn->imm == BPF_FUNC_redirect_map) {
			/* Note, we cannot use prog directly as imm as subsequent
			 * rewrites would still change the prog pointer. The only
			 * stable address we can use is aux, which also works with
			 * prog clones during blinding.
			 */
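			/* Illustratively, the patch below turns
			 *	call bpf_redirect_map
			 * into
			 *	r4 = (u64)prog->aux	; BPF_LD_IMM64
			 *	call bpf_redirect_map
			 * so the helper can identify the calling program.
			 */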
			u64 addr = (unsigned long)prog->aux;
			struct bpf_insn r4_ld[] = {
				BPF_LD_IMM64(BPF_REG_4, addr),
				*insn,
			};
			cnt = ARRAY_SIZE(r4_ld);

			new_prog = bpf_patch_insn_data(env, i + delta, r4_ld, cnt);
			if (!new_prog)
				return -ENOMEM;

			delta    += cnt - 1;
			env->prog = prog = new_prog;
			insn      = new_prog->insnsi + i + delta;
		}
patch_call_imm:
		fn = env->ops->get_func_proto(insn->imm);
		/* all functions that have prototype and verifier allowed
		 * programs to call them, must be real in-kernel functions
		 */
		if (!fn->func) {
			verbose(env,
				"kernel subsystem misconfigured func %s#%d\n",
				func_id_name(insn->imm), insn->imm);
			return -EFAULT;
		}
		insn->imm = fn->func - __bpf_call_base;
	}

	return 0;
}

static void free_states(struct bpf_verifier_env *env)
{
	struct bpf_verifier_state_list *sl, *sln;
	int i;

	if (!env->explored_states)
		return;

	for (i = 0; i < env->prog->len; i++) {
		sl = env->explored_states[i];

		if (sl)
			while (sl != STATE_LIST_MARK) {
				sln = sl->next;
				free_verifier_state(&sl->state, false);
				kfree(sl);
				sl = sln;
			}
	}

	kfree(env->explored_states);
}

int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
	struct bpf_verifier_env *env;
	struct bpf_verifer_log *log;
	int ret = -EINVAL;

	/* no program is valid */
	if (ARRAY_SIZE(bpf_verifier_ops) == 0)
		return -EINVAL;

	/* 'struct bpf_verifier_env' can be global, but since it's not small,
	 * allocate/free it every time bpf_check() is called
	 */
	env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
	if (!env)
		return -ENOMEM;
	log = &env->log;

	env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
				     (*prog)->len);
	ret = -ENOMEM;
	if (!env->insn_aux_data)
		goto err_free_env;
	env->prog = *prog;
	env->ops = bpf_verifier_ops[env->prog->type];

	/* grab the mutex to protect few globals used by verifier */
	mutex_lock(&bpf_verifier_lock);

	if (attr->log_level || attr->log_buf || attr->log_size) {
		/* user requested verbose verifier output
		 * and supplied buffer to store the verification trace
		 */
		log->level = attr->log_level;
		log->ubuf = (char __user *) (unsigned long) attr->log_buf;
		log->len_total = attr->log_size;

		ret = -EINVAL;
		/* log attributes have to be sane */
		if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
		    !log->level || !log->ubuf)
			goto err_unlock;
	}

	env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
		env->strict_alignment = true;

	if (bpf_prog_is_dev_bound(env->prog->aux)) {
		ret = bpf_prog_offload_verifier_prep(env);
		if (ret)
			goto err_unlock;
	}

	ret = replace_map_fd_with_map_ptr(env);
	if (ret < 0)
		goto skip_full_check;

	env->explored_states = kcalloc(env->prog->len,
				       sizeof(struct bpf_verifier_state_list *),
				       GFP_USER);
	ret = -ENOMEM;
	if (!env->explored_states)
		goto skip_full_check;

	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);

	ret = check_cfg(env);
	if (ret < 0)
		goto skip_full_check;

	ret = do_check(env);
	if (env->cur_state) {
		free_verifier_state(env->cur_state, true);
		env->cur_state = NULL;
	}

skip_full_check:
	while (!pop_stack(env, NULL, NULL));
	free_states(env);

	if (ret == 0)
		sanitize_dead_code(env);

	if (ret == 0)
		ret = check_max_stack_depth(env);

	if (ret == 0)
		/* program is valid, convert *(u32*)(ctx + off) accesses */
		ret = convert_ctx_accesses(env);

	if (ret == 0)
		ret = fixup_bpf_calls(env);

	if (ret == 0)
		ret = fixup_call_args(env);

	if (log->level && bpf_verifier_log_full(log))
		ret = -ENOSPC;
	if (log->level && !log->ubuf) {
		ret = -EFAULT;
		goto err_release_maps;
	}

	if (ret == 0 && env->used_map_cnt) {
		/* if program passed verifier, update used_maps in bpf_prog_info */
		env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
							  sizeof(env->used_maps[0]),
							  GFP_KERNEL);

		if (!env->prog->aux->used_maps) {
			ret = -ENOMEM;
			goto err_release_maps;
		}

		memcpy(env->prog->aux->used_maps, env->used_maps,
		       sizeof(env->used_maps[0]) * env->used_map_cnt);
		env->prog->aux->used_map_cnt = env->used_map_cnt;

		/* program is valid. Convert pseudo bpf_ld_imm64 into generic
		 * bpf_ld_imm64 instructions
		 */
		convert_pseudo_ld_imm64(env);
	}

err_release_maps:
	if (!env->prog->aux->used_maps)
		/* if we didn't copy map pointers into bpf_prog_info, release
		 * them now. Otherwise free_bpf_prog_info() will release them.
		 */
		release_maps(env);
	*prog = env->prog;
err_unlock:
	mutex_unlock(&bpf_verifier_lock);
	vfree(env->insn_aux_data);
err_free_env:
	kfree(env);
	return ret;
}