filter.c 18.0 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Author:
 *     Jay Schulist <jschlst@samba.org>
 *
 * Based on the design of:
 *     - The Berkeley Packet Filter
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in sk_chk_filter()
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
28
#include <linux/gfp.h>
L
Linus Torvalds 已提交
29 30
#include <net/ip.h>
#include <net/protocol.h>
31
#include <net/netlink.h>
L
Linus Torvalds 已提交
32 33 34 35 36
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
37
#include <asm/unaligned.h>
L
Linus Torvalds 已提交
38
#include <linux/filter.h>
E
Eric Dumazet 已提交
39
#include <linux/reciprocal_div.h>
40
#include <linux/ratelimit.h>
41
#include <linux/seccomp.h>
L
Linus Torvalds 已提交
42

43 44 45 46 47
/* No hurry in this branch
 *
 * Exported for the bpf jit load helper.
 */
void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
L
Linus Torvalds 已提交
48 49 50 51
{
	u8 *ptr = NULL;

	if (k >= SKF_NET_OFF)
52
		ptr = skb_network_header(skb) + k - SKF_NET_OFF;
L
Linus Torvalds 已提交
53
	else if (k >= SKF_LL_OFF)
54
		ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
L
Linus Torvalds 已提交
55

56
	if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
L
Linus Torvalds 已提交
57 58 59 60
		return ptr;
	return NULL;
}

E
Eric Dumazet 已提交
61
/*
 * Turn BPF offset @k into a pointer to @size bytes of packet data.
 * Negative offsets go through the special-offset helper; non-negative
 * ones are handed to skb_header_pointer(), which is given @buffer as
 * scratch space. Returns NULL when the bytes are not available.
 */
static inline void *load_pointer(const struct sk_buff *skb, int k,
				 unsigned int size, void *buffer)
{
	if (k < 0)
		return bpf_internal_load_pointer_neg_helper(skb, k, size);

	return skb_header_pointer(skb, k, size, buffer);
}

S
Stephen Hemminger 已提交
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
/**
 *	sk_filter - run a packet through a socket filter
 *	@sk: sock associated with &sk_buff
 *	@skb: buffer to filter
 *
 * Run the filter code and then cut skb->data to correct size returned by
 * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller
 * than pkt_len we keep whole skb->data. This is the socket level
 * wrapper to sk_run_filter. It returns 0 if the packet should
 * be accepted or -EPERM if the packet should be tossed.
 *
 */
int sk_filter(struct sock *sk, struct sk_buff *skb)
{
	int err;
	struct sk_filter *filter;

86 87 88 89 90 91 92 93
	/*
	 * If the skb was allocated from pfmemalloc reserves, only
	 * allow SOCK_MEMALLOC sockets to use it as this socket is
	 * helping free memory
	 */
	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
		return -ENOMEM;

S
Stephen Hemminger 已提交
94 95 96 97
	err = security_sock_rcv_skb(sk, skb);
	if (err)
		return err;

98 99
	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
S
Stephen Hemminger 已提交
100
	if (filter) {
101
		unsigned int pkt_len = SK_RUN_FILTER(filter, skb);
102

S
Stephen Hemminger 已提交
103 104
		err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
	}
105
	rcu_read_unlock();
S
Stephen Hemminger 已提交
106 107 108 109 110

	return err;
}
EXPORT_SYMBOL(sk_filter);

L
Linus Torvalds 已提交
111
/**
112
 *	sk_run_filter - run a filter on a socket
L
Linus Torvalds 已提交
113
 *	@skb: buffer to run the filter on
114
 *	@fentry: filter to apply
L
Linus Torvalds 已提交
115 116
 *
 * Decode and apply filter instructions to the skb->data.
E
Eric Dumazet 已提交
117 118 119 120 121
 * Return length to keep, 0 for none. @skb is the data we are
 * filtering, @filter is the array of filter instructions.
 * Because all jumps are guaranteed to be before last instruction,
 * and last instruction guaranteed to be a RET, we dont need to check
 * flen. (We used to pass to this function the length of filter)
L
Linus Torvalds 已提交
122
 */
E
Eric Dumazet 已提交
123 124
unsigned int sk_run_filter(const struct sk_buff *skb,
			   const struct sock_filter *fentry)
L
Linus Torvalds 已提交
125
{
126
	void *ptr;
127 128
	u32 A = 0;			/* Accumulator */
	u32 X = 0;			/* Index Register */
L
Linus Torvalds 已提交
129
	u32 mem[BPF_MEMWORDS];		/* Scratch Memory Store */
130
	u32 tmp;
L
Linus Torvalds 已提交
131 132 133 134 135
	int k;

	/*
	 * Process array of filter instructions.
	 */
E
Eric Dumazet 已提交
136 137 138 139 140 141
	for (;; fentry++) {
#if defined(CONFIG_X86_32)
#define	K (fentry->k)
#else
		const u32 K = fentry->k;
#endif
142

L
Linus Torvalds 已提交
143
		switch (fentry->code) {
144
		case BPF_S_ALU_ADD_X:
L
Linus Torvalds 已提交
145 146
			A += X;
			continue;
147
		case BPF_S_ALU_ADD_K:
E
Eric Dumazet 已提交
148
			A += K;
L
Linus Torvalds 已提交
149
			continue;
150
		case BPF_S_ALU_SUB_X:
L
Linus Torvalds 已提交
151 152
			A -= X;
			continue;
153
		case BPF_S_ALU_SUB_K:
E
Eric Dumazet 已提交
154
			A -= K;
L
Linus Torvalds 已提交
155
			continue;
156
		case BPF_S_ALU_MUL_X:
L
Linus Torvalds 已提交
157 158
			A *= X;
			continue;
159
		case BPF_S_ALU_MUL_K:
E
Eric Dumazet 已提交
160
			A *= K;
L
Linus Torvalds 已提交
161
			continue;
162
		case BPF_S_ALU_DIV_X:
L
Linus Torvalds 已提交
163 164 165 166
			if (X == 0)
				return 0;
			A /= X;
			continue;
167
		case BPF_S_ALU_DIV_K:
E
Eric Dumazet 已提交
168
			A = reciprocal_divide(A, K);
L
Linus Torvalds 已提交
169
			continue;
E
Eric Dumazet 已提交
170 171 172 173 174 175 176 177
		case BPF_S_ALU_MOD_X:
			if (X == 0)
				return 0;
			A %= X;
			continue;
		case BPF_S_ALU_MOD_K:
			A %= K;
			continue;
178
		case BPF_S_ALU_AND_X:
L
Linus Torvalds 已提交
179 180
			A &= X;
			continue;
181
		case BPF_S_ALU_AND_K:
E
Eric Dumazet 已提交
182
			A &= K;
L
Linus Torvalds 已提交
183
			continue;
184
		case BPF_S_ALU_OR_X:
L
Linus Torvalds 已提交
185 186
			A |= X;
			continue;
187
		case BPF_S_ALU_OR_K:
E
Eric Dumazet 已提交
188
			A |= K;
L
Linus Torvalds 已提交
189
			continue;
190 191 192 193 194 195 196
		case BPF_S_ANC_ALU_XOR_X:
		case BPF_S_ALU_XOR_X:
			A ^= X;
			continue;
		case BPF_S_ALU_XOR_K:
			A ^= K;
			continue;
197
		case BPF_S_ALU_LSH_X:
L
Linus Torvalds 已提交
198 199
			A <<= X;
			continue;
200
		case BPF_S_ALU_LSH_K:
E
Eric Dumazet 已提交
201
			A <<= K;
L
Linus Torvalds 已提交
202
			continue;
203
		case BPF_S_ALU_RSH_X:
L
Linus Torvalds 已提交
204 205
			A >>= X;
			continue;
206
		case BPF_S_ALU_RSH_K:
E
Eric Dumazet 已提交
207
			A >>= K;
L
Linus Torvalds 已提交
208
			continue;
209
		case BPF_S_ALU_NEG:
L
Linus Torvalds 已提交
210 211
			A = -A;
			continue;
212
		case BPF_S_JMP_JA:
E
Eric Dumazet 已提交
213
			fentry += K;
L
Linus Torvalds 已提交
214
			continue;
215
		case BPF_S_JMP_JGT_K:
E
Eric Dumazet 已提交
216
			fentry += (A > K) ? fentry->jt : fentry->jf;
L
Linus Torvalds 已提交
217
			continue;
218
		case BPF_S_JMP_JGE_K:
E
Eric Dumazet 已提交
219
			fentry += (A >= K) ? fentry->jt : fentry->jf;
L
Linus Torvalds 已提交
220
			continue;
221
		case BPF_S_JMP_JEQ_K:
E
Eric Dumazet 已提交
222
			fentry += (A == K) ? fentry->jt : fentry->jf;
L
Linus Torvalds 已提交
223
			continue;
224
		case BPF_S_JMP_JSET_K:
E
Eric Dumazet 已提交
225
			fentry += (A & K) ? fentry->jt : fentry->jf;
L
Linus Torvalds 已提交
226
			continue;
227
		case BPF_S_JMP_JGT_X:
E
Eric Dumazet 已提交
228
			fentry += (A > X) ? fentry->jt : fentry->jf;
L
Linus Torvalds 已提交
229
			continue;
230
		case BPF_S_JMP_JGE_X:
E
Eric Dumazet 已提交
231
			fentry += (A >= X) ? fentry->jt : fentry->jf;
L
Linus Torvalds 已提交
232
			continue;
233
		case BPF_S_JMP_JEQ_X:
E
Eric Dumazet 已提交
234
			fentry += (A == X) ? fentry->jt : fentry->jf;
L
Linus Torvalds 已提交
235
			continue;
236
		case BPF_S_JMP_JSET_X:
E
Eric Dumazet 已提交
237
			fentry += (A & X) ? fentry->jt : fentry->jf;
L
Linus Torvalds 已提交
238
			continue;
239
		case BPF_S_LD_W_ABS:
E
Eric Dumazet 已提交
240
			k = K;
241
load_w:
242 243
			ptr = load_pointer(skb, k, 4, &tmp);
			if (ptr != NULL) {
244
				A = get_unaligned_be32(ptr);
245
				continue;
L
Linus Torvalds 已提交
246
			}
247
			return 0;
248
		case BPF_S_LD_H_ABS:
E
Eric Dumazet 已提交
249
			k = K;
250
load_h:
251 252
			ptr = load_pointer(skb, k, 2, &tmp);
			if (ptr != NULL) {
253
				A = get_unaligned_be16(ptr);
254
				continue;
L
Linus Torvalds 已提交
255
			}
256
			return 0;
257
		case BPF_S_LD_B_ABS:
E
Eric Dumazet 已提交
258
			k = K;
L
Linus Torvalds 已提交
259
load_b:
260 261 262 263
			ptr = load_pointer(skb, k, 1, &tmp);
			if (ptr != NULL) {
				A = *(u8 *)ptr;
				continue;
L
Linus Torvalds 已提交
264
			}
265
			return 0;
266
		case BPF_S_LD_W_LEN:
267
			A = skb->len;
L
Linus Torvalds 已提交
268
			continue;
269
		case BPF_S_LDX_W_LEN:
270
			X = skb->len;
L
Linus Torvalds 已提交
271
			continue;
272
		case BPF_S_LD_W_IND:
E
Eric Dumazet 已提交
273
			k = X + K;
L
Linus Torvalds 已提交
274
			goto load_w;
275
		case BPF_S_LD_H_IND:
E
Eric Dumazet 已提交
276
			k = X + K;
L
Linus Torvalds 已提交
277
			goto load_h;
278
		case BPF_S_LD_B_IND:
E
Eric Dumazet 已提交
279
			k = X + K;
L
Linus Torvalds 已提交
280
			goto load_b;
281
		case BPF_S_LDX_B_MSH:
E
Eric Dumazet 已提交
282
			ptr = load_pointer(skb, K, 1, &tmp);
283 284 285 286 287
			if (ptr != NULL) {
				X = (*(u8 *)ptr & 0xf) << 2;
				continue;
			}
			return 0;
288
		case BPF_S_LD_IMM:
E
Eric Dumazet 已提交
289
			A = K;
L
Linus Torvalds 已提交
290
			continue;
291
		case BPF_S_LDX_IMM:
E
Eric Dumazet 已提交
292
			X = K;
L
Linus Torvalds 已提交
293
			continue;
294
		case BPF_S_LD_MEM:
295
			A = mem[K];
L
Linus Torvalds 已提交
296
			continue;
297
		case BPF_S_LDX_MEM:
298
			X = mem[K];
L
Linus Torvalds 已提交
299
			continue;
300
		case BPF_S_MISC_TAX:
L
Linus Torvalds 已提交
301 302
			X = A;
			continue;
303
		case BPF_S_MISC_TXA:
L
Linus Torvalds 已提交
304 305
			A = X;
			continue;
306
		case BPF_S_RET_K:
E
Eric Dumazet 已提交
307
			return K;
308
		case BPF_S_RET_A:
309
			return A;
310
		case BPF_S_ST:
E
Eric Dumazet 已提交
311
			mem[K] = A;
L
Linus Torvalds 已提交
312
			continue;
313
		case BPF_S_STX:
E
Eric Dumazet 已提交
314
			mem[K] = X;
L
Linus Torvalds 已提交
315
			continue;
316
		case BPF_S_ANC_PROTOCOL:
A
Al Viro 已提交
317
			A = ntohs(skb->protocol);
L
Linus Torvalds 已提交
318
			continue;
319
		case BPF_S_ANC_PKTTYPE:
L
Linus Torvalds 已提交
320 321
			A = skb->pkt_type;
			continue;
322
		case BPF_S_ANC_IFINDEX:
323 324
			if (!skb->dev)
				return 0;
L
Linus Torvalds 已提交
325 326
			A = skb->dev->ifindex;
			continue;
327
		case BPF_S_ANC_MARK:
J
jamal 已提交
328 329
			A = skb->mark;
			continue;
330
		case BPF_S_ANC_QUEUE:
331 332
			A = skb->queue_mapping;
			continue;
333
		case BPF_S_ANC_HATYPE:
334 335 336 337
			if (!skb->dev)
				return 0;
			A = skb->dev->type;
			continue;
338
		case BPF_S_ANC_RXHASH:
339 340
			A = skb->rxhash;
			continue;
341
		case BPF_S_ANC_CPU:
342 343
			A = raw_smp_processor_id();
			continue;
344
		case BPF_S_ANC_NLATTR: {
345 346 347 348 349 350 351 352 353 354 355 356 357 358 359
			struct nlattr *nla;

			if (skb_is_nonlinear(skb))
				return 0;
			if (A > skb->len - sizeof(struct nlattr))
				return 0;

			nla = nla_find((struct nlattr *)&skb->data[A],
				       skb->len - A, X);
			if (nla)
				A = (void *)nla - (void *)skb->data;
			else
				A = 0;
			continue;
		}
360
		case BPF_S_ANC_NLATTR_NEST: {
361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
			struct nlattr *nla;

			if (skb_is_nonlinear(skb))
				return 0;
			if (A > skb->len - sizeof(struct nlattr))
				return 0;

			nla = (struct nlattr *)&skb->data[A];
			if (nla->nla_len > A - skb->len)
				return 0;

			nla = nla_find_nested(nla, X);
			if (nla)
				A = (void *)nla - (void *)skb->data;
			else
				A = 0;
			continue;
		}
379 380 381 382 383
#ifdef CONFIG_SECCOMP_FILTER
		case BPF_S_ANC_SECCOMP_LD_W:
			A = seccomp_bpf_load(fentry->k);
			continue;
#endif
L
Linus Torvalds 已提交
384
		default:
J
Joe Perches 已提交
385 386 387
			WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
				       fentry->code, fentry->jt,
				       fentry->jf, fentry->k);
L
Linus Torvalds 已提交
388 389 390 391 392 393
			return 0;
		}
	}

	return 0;
}
394
EXPORT_SYMBOL(sk_run_filter);
L
Linus Torvalds 已提交
395

396 397 398 399 400 401 402
/*
 * Security :
 * A BPF program is able to use 16 cells of memory to store intermediate
 * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter())
 * As we dont want to clear mem[] array for each packet going through
 * sk_run_filter(), we check that filter loaded by user never try to read
 * a cell if not previously written, and we check all branches to be sure
L
Lucas De Marchi 已提交
403
 * a malicious user doesn't try to abuse us.
404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455
 */
static int check_load_and_stores(struct sock_filter *filter, int flen)
{
	u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */
	int pc, ret = 0;

	BUILD_BUG_ON(BPF_MEMWORDS > 16);
	masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL);
	if (!masks)
		return -ENOMEM;
	memset(masks, 0xff, flen * sizeof(*masks));

	for (pc = 0; pc < flen; pc++) {
		memvalid &= masks[pc];

		switch (filter[pc].code) {
		case BPF_S_ST:
		case BPF_S_STX:
			memvalid |= (1 << filter[pc].k);
			break;
		case BPF_S_LD_MEM:
		case BPF_S_LDX_MEM:
			if (!(memvalid & (1 << filter[pc].k))) {
				ret = -EINVAL;
				goto error;
			}
			break;
		case BPF_S_JMP_JA:
			/* a jump must set masks on target */
			masks[pc + 1 + filter[pc].k] &= memvalid;
			memvalid = ~0;
			break;
		case BPF_S_JMP_JEQ_K:
		case BPF_S_JMP_JEQ_X:
		case BPF_S_JMP_JGE_K:
		case BPF_S_JMP_JGE_X:
		case BPF_S_JMP_JGT_K:
		case BPF_S_JMP_JGT_X:
		case BPF_S_JMP_JSET_X:
		case BPF_S_JMP_JSET_K:
			/* a jump must set masks on targets */
			masks[pc + 1 + filter[pc].jt] &= memvalid;
			masks[pc + 1 + filter[pc].jf] &= memvalid;
			memvalid = ~0;
			break;
		}
	}
error:
	kfree(masks);
	return ret;
}

L
Linus Torvalds 已提交
456 457 458 459 460 461 462
/**
 *	sk_chk_filter - verify socket filter code
 *	@filter: filter to verify
 *	@flen: length of filter
 *
 * Check the user's filter code. If we let some ugly
 * filter code slip through kaboom! The filter must contain
463 464
 * no references or jumps that are out of range, no illegal
 * instructions, and must end with a RET instruction.
L
Linus Torvalds 已提交
465
 *
466 467 468
 * All jumps are forward as they are not signed.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
L
Linus Torvalds 已提交
469
 */
470
int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
L
Linus Torvalds 已提交
471
{
472 473 474 475 476
	/*
	 * Valid instructions are initialized to non-0.
	 * Invalid instructions are initialized to 0.
	 */
	static const u8 codes[] = {
E
Eric Dumazet 已提交
477 478 479 480 481 482 483
		[BPF_ALU|BPF_ADD|BPF_K]  = BPF_S_ALU_ADD_K,
		[BPF_ALU|BPF_ADD|BPF_X]  = BPF_S_ALU_ADD_X,
		[BPF_ALU|BPF_SUB|BPF_K]  = BPF_S_ALU_SUB_K,
		[BPF_ALU|BPF_SUB|BPF_X]  = BPF_S_ALU_SUB_X,
		[BPF_ALU|BPF_MUL|BPF_K]  = BPF_S_ALU_MUL_K,
		[BPF_ALU|BPF_MUL|BPF_X]  = BPF_S_ALU_MUL_X,
		[BPF_ALU|BPF_DIV|BPF_X]  = BPF_S_ALU_DIV_X,
E
Eric Dumazet 已提交
484 485
		[BPF_ALU|BPF_MOD|BPF_K]  = BPF_S_ALU_MOD_K,
		[BPF_ALU|BPF_MOD|BPF_X]  = BPF_S_ALU_MOD_X,
E
Eric Dumazet 已提交
486 487 488 489
		[BPF_ALU|BPF_AND|BPF_K]  = BPF_S_ALU_AND_K,
		[BPF_ALU|BPF_AND|BPF_X]  = BPF_S_ALU_AND_X,
		[BPF_ALU|BPF_OR|BPF_K]   = BPF_S_ALU_OR_K,
		[BPF_ALU|BPF_OR|BPF_X]   = BPF_S_ALU_OR_X,
490 491
		[BPF_ALU|BPF_XOR|BPF_K]  = BPF_S_ALU_XOR_K,
		[BPF_ALU|BPF_XOR|BPF_X]  = BPF_S_ALU_XOR_X,
E
Eric Dumazet 已提交
492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525
		[BPF_ALU|BPF_LSH|BPF_K]  = BPF_S_ALU_LSH_K,
		[BPF_ALU|BPF_LSH|BPF_X]  = BPF_S_ALU_LSH_X,
		[BPF_ALU|BPF_RSH|BPF_K]  = BPF_S_ALU_RSH_K,
		[BPF_ALU|BPF_RSH|BPF_X]  = BPF_S_ALU_RSH_X,
		[BPF_ALU|BPF_NEG]        = BPF_S_ALU_NEG,
		[BPF_LD|BPF_W|BPF_ABS]   = BPF_S_LD_W_ABS,
		[BPF_LD|BPF_H|BPF_ABS]   = BPF_S_LD_H_ABS,
		[BPF_LD|BPF_B|BPF_ABS]   = BPF_S_LD_B_ABS,
		[BPF_LD|BPF_W|BPF_LEN]   = BPF_S_LD_W_LEN,
		[BPF_LD|BPF_W|BPF_IND]   = BPF_S_LD_W_IND,
		[BPF_LD|BPF_H|BPF_IND]   = BPF_S_LD_H_IND,
		[BPF_LD|BPF_B|BPF_IND]   = BPF_S_LD_B_IND,
		[BPF_LD|BPF_IMM]         = BPF_S_LD_IMM,
		[BPF_LDX|BPF_W|BPF_LEN]  = BPF_S_LDX_W_LEN,
		[BPF_LDX|BPF_B|BPF_MSH]  = BPF_S_LDX_B_MSH,
		[BPF_LDX|BPF_IMM]        = BPF_S_LDX_IMM,
		[BPF_MISC|BPF_TAX]       = BPF_S_MISC_TAX,
		[BPF_MISC|BPF_TXA]       = BPF_S_MISC_TXA,
		[BPF_RET|BPF_K]          = BPF_S_RET_K,
		[BPF_RET|BPF_A]          = BPF_S_RET_A,
		[BPF_ALU|BPF_DIV|BPF_K]  = BPF_S_ALU_DIV_K,
		[BPF_LD|BPF_MEM]         = BPF_S_LD_MEM,
		[BPF_LDX|BPF_MEM]        = BPF_S_LDX_MEM,
		[BPF_ST]                 = BPF_S_ST,
		[BPF_STX]                = BPF_S_STX,
		[BPF_JMP|BPF_JA]         = BPF_S_JMP_JA,
		[BPF_JMP|BPF_JEQ|BPF_K]  = BPF_S_JMP_JEQ_K,
		[BPF_JMP|BPF_JEQ|BPF_X]  = BPF_S_JMP_JEQ_X,
		[BPF_JMP|BPF_JGE|BPF_K]  = BPF_S_JMP_JGE_K,
		[BPF_JMP|BPF_JGE|BPF_X]  = BPF_S_JMP_JGE_X,
		[BPF_JMP|BPF_JGT|BPF_K]  = BPF_S_JMP_JGT_K,
		[BPF_JMP|BPF_JGT|BPF_X]  = BPF_S_JMP_JGT_X,
		[BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K,
		[BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X,
526
	};
L
Linus Torvalds 已提交
527 528
	int pc;

529
	if (flen == 0 || flen > BPF_MAXINSNS)
L
Linus Torvalds 已提交
530 531 532 533
		return -EINVAL;

	/* check the filter code now */
	for (pc = 0; pc < flen; pc++) {
534 535
		struct sock_filter *ftest = &filter[pc];
		u16 code = ftest->code;
536

537 538 539
		if (code >= ARRAY_SIZE(codes))
			return -EINVAL;
		code = codes[code];
E
Eric Dumazet 已提交
540
		if (!code)
541
			return -EINVAL;
542
		/* Some instructions need special checks */
543 544
		switch (code) {
		case BPF_S_ALU_DIV_K:
545 546
			/* check for division by zero */
			if (ftest->k == 0)
L
Linus Torvalds 已提交
547
				return -EINVAL;
E
Eric Dumazet 已提交
548
			ftest->k = reciprocal_value(ftest->k);
549
			break;
E
Eric Dumazet 已提交
550 551 552 553 554
		case BPF_S_ALU_MOD_K:
			/* check for division by zero */
			if (ftest->k == 0)
				return -EINVAL;
			break;
555 556 557 558 559
		case BPF_S_LD_MEM:
		case BPF_S_LDX_MEM:
		case BPF_S_ST:
		case BPF_S_STX:
			/* check for invalid memory addresses */
560 561 562
			if (ftest->k >= BPF_MEMWORDS)
				return -EINVAL;
			break;
563
		case BPF_S_JMP_JA:
564 565 566 567 568
			/*
			 * Note, the large ftest->k might cause loops.
			 * Compare this with conditional jumps below,
			 * where offsets are limited. --ANK (981016)
			 */
569
			if (ftest->k >= (unsigned int)(flen-pc-1))
570
				return -EINVAL;
571 572 573 574 575 576 577 578 579
			break;
		case BPF_S_JMP_JEQ_K:
		case BPF_S_JMP_JEQ_X:
		case BPF_S_JMP_JGE_K:
		case BPF_S_JMP_JGE_X:
		case BPF_S_JMP_JGT_K:
		case BPF_S_JMP_JGT_X:
		case BPF_S_JMP_JSET_X:
		case BPF_S_JMP_JSET_K:
580
			/* for conditionals both must be safe */
581
			if (pc + ftest->jt + 1 >= flen ||
582 583
			    pc + ftest->jf + 1 >= flen)
				return -EINVAL;
584
			break;
585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601
		case BPF_S_LD_W_ABS:
		case BPF_S_LD_H_ABS:
		case BPF_S_LD_B_ABS:
#define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE:	\
				code = BPF_S_ANC_##CODE;	\
				break
			switch (ftest->k) {
			ANCILLARY(PROTOCOL);
			ANCILLARY(PKTTYPE);
			ANCILLARY(IFINDEX);
			ANCILLARY(NLATTR);
			ANCILLARY(NLATTR_NEST);
			ANCILLARY(MARK);
			ANCILLARY(QUEUE);
			ANCILLARY(HATYPE);
			ANCILLARY(RXHASH);
			ANCILLARY(CPU);
J
Jiri Pirko 已提交
602
			ANCILLARY(ALU_XOR_X);
603
			}
604
		}
605
		ftest->code = code;
606
	}
607

608 609 610 611
	/* last instruction must be a RET code */
	switch (filter[flen - 1].code) {
	case BPF_S_RET_K:
	case BPF_S_RET_A:
612
		return check_load_and_stores(filter, flen);
613 614
	}
	return -EINVAL;
L
Linus Torvalds 已提交
615
}
616
EXPORT_SYMBOL(sk_chk_filter);
L
Linus Torvalds 已提交
617

618
/**
E
Eric Dumazet 已提交
619
 * 	sk_filter_release_rcu - Release a socket filter by rcu_head
620 621
 *	@rcu: rcu_head that contains the sk_filter to free
 */
E
Eric Dumazet 已提交
622
void sk_filter_release_rcu(struct rcu_head *rcu)
623 624 625
{
	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);

626
	bpf_jit_free(fp);
E
Eric Dumazet 已提交
627
	kfree(fp);
628
}
E
Eric Dumazet 已提交
629
EXPORT_SYMBOL(sk_filter_release_rcu);
630

631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647
/*
 * Make a freshly filled-in filter runnable: install the interpreter as
 * the default entry point, validate the program and, only if it is
 * valid, give the JIT a chance to compile it.
 * Returns 0 on success or the error from sk_chk_filter().
 */
static int __sk_prepare_filter(struct sk_filter *fp)
{
	int rc;

	fp->bpf_func = sk_run_filter;

	rc = sk_chk_filter(fp->insns, fp->len);
	if (!rc)
		bpf_jit_compile(fp);

	return rc;
}

/**
 *	sk_unattached_filter_create - create an unattached filter
 *	@pfp: the unattached filter that is created
 *	@fprog: the filter program
 *
 * Create a filter independent of any socket. The instructions are
 * copied from @fprog->filter with memcpy() and then run through the
 * usual sanity checks to make sure the filter does not explode on us
 * later. @pfp is only written on success.
 *
 * Returns zero on success, -EINVAL if @fprog has no instructions
 * pointer, -ENOMEM if there is insufficient memory for the filter, or
 * the negative errno produced by the checks.
 */
int sk_unattached_filter_create(struct sk_filter **pfp,
				struct sock_fprog *fprog)
{
	unsigned int insn_bytes = sizeof(struct sock_filter) * fprog->len;
	struct sk_filter *filter;
	int rc;

	/* Make sure new filter is there and in the right amounts. */
	if (!fprog->filter)
		return -EINVAL;

	filter = kmalloc(insn_bytes + sizeof(*filter), GFP_KERNEL);
	if (!filter)
		return -ENOMEM;

	memcpy(filter->insns, fprog->filter, insn_bytes);
	filter->len = fprog->len;
	atomic_set(&filter->refcnt, 1);

	rc = __sk_prepare_filter(filter);
	if (rc) {
		kfree(filter);
		return rc;
	}

	*pfp = filter;
	return 0;
}
EXPORT_SYMBOL_GPL(sk_unattached_filter_create);

/**
 *	sk_unattached_filter_destroy - dispose of an unattached filter
 *	@fp: the filter to dispose of
 *
 * Simply forwards @fp to sk_filter_release(). Presumably meant as the
 * counterpart of sk_unattached_filter_create() - confirm with callers.
 */
void sk_unattached_filter_destroy(struct sk_filter *fp)
{
	sk_filter_release(fp);
}
EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy);

L
Linus Torvalds 已提交
692 693 694 695 696 697 698 699 700 701 702 703
/**
 *	sk_attach_filter - attach a socket filter
 *	@fprog: the filter program
 *	@sk: the socket to use
 *
 * Attach the user's filter code. We first run some sanity checks on
 * it to make sure it does not explode on us later. If an error
 * occurs or there is insufficient memory for the filter a negative
 * errno code is returned. On success the return is zero.
 */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
704
	struct sk_filter *fp, *old_fp;
L
Linus Torvalds 已提交
705 706 707 708
	unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
	int err;

	/* Make sure new filter is there and in the right amounts. */
709 710
	if (fprog->filter == NULL)
		return -EINVAL;
L
Linus Torvalds 已提交
711 712 713 714 715

	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
	if (!fp)
		return -ENOMEM;
	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
716
		sock_kfree_s(sk, fp, fsize+sizeof(*fp));
L
Linus Torvalds 已提交
717 718 719 720 721 722
		return -EFAULT;
	}

	atomic_set(&fp->refcnt, 1);
	fp->len = fprog->len;

723
	err = __sk_prepare_filter(fp);
724 725 726
	if (err) {
		sk_filter_uncharge(sk, fp);
		return err;
L
Linus Torvalds 已提交
727 728
	}

729 730
	old_fp = rcu_dereference_protected(sk->sk_filter,
					   sock_owned_by_user(sk));
731 732
	rcu_assign_pointer(sk->sk_filter, fp);

733
	if (old_fp)
E
Eric Dumazet 已提交
734
		sk_filter_uncharge(sk, old_fp);
735
	return 0;
L
Linus Torvalds 已提交
736
}
737
EXPORT_SYMBOL_GPL(sk_attach_filter);
L
Linus Torvalds 已提交
738

739 740 741 742 743
int sk_detach_filter(struct sock *sk)
{
	int ret = -ENOENT;
	struct sk_filter *filter;

744 745
	filter = rcu_dereference_protected(sk->sk_filter,
					   sock_owned_by_user(sk));
746
	if (filter) {
747
		RCU_INIT_POINTER(sk->sk_filter, NULL);
E
Eric Dumazet 已提交
748
		sk_filter_uncharge(sk, filter);
749 750 751 752
		ret = 0;
	}
	return ret;
}
753
EXPORT_SYMBOL_GPL(sk_detach_filter);