/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Author:
 *     Jay Schulist <jschlst@samba.org>
 *
 * Based on the design of:
 *     - The Berkeley Packet Filter
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in sk_chk_filter()
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
28
#include <linux/gfp.h>
L
Linus Torvalds 已提交
29 30
#include <net/ip.h>
#include <net/protocol.h>
31
#include <net/netlink.h>
L
Linus Torvalds 已提交
32 33 34 35 36 37
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
38
#include <asm/unaligned.h>
L
Linus Torvalds 已提交
39
#include <linux/filter.h>
E
Eric Dumazet 已提交
40
#include <linux/reciprocal_div.h>
L
Linus Torvalds 已提交
41

42
/*
 * Internal opcode numbering used by sk_run_filter().
 *
 * sk_chk_filter() translates the user-supplied instruction codes into these
 * values.  Numbering starts at 1 so that 0 can mark an invalid entry in the
 * translation table.  The order of the enumerators defines their values and
 * must not be changed casually.
 */
enum {
	/* Return */
	BPF_S_RET_K = 1,
	BPF_S_RET_A,

	/* Arithmetic / logic on the accumulator */
	BPF_S_ALU_ADD_K,
	BPF_S_ALU_ADD_X,
	BPF_S_ALU_SUB_K,
	BPF_S_ALU_SUB_X,
	BPF_S_ALU_MUL_K,
	BPF_S_ALU_MUL_X,
	BPF_S_ALU_DIV_X,
	BPF_S_ALU_AND_K,
	BPF_S_ALU_AND_X,
	BPF_S_ALU_OR_K,
	BPF_S_ALU_OR_X,
	BPF_S_ALU_LSH_K,
	BPF_S_ALU_LSH_X,
	BPF_S_ALU_RSH_K,
	BPF_S_ALU_RSH_X,
	BPF_S_ALU_NEG,

	/* Loads into A */
	BPF_S_LD_W_ABS,
	BPF_S_LD_H_ABS,
	BPF_S_LD_B_ABS,
	BPF_S_LD_W_LEN,
	BPF_S_LD_W_IND,
	BPF_S_LD_H_IND,
	BPF_S_LD_B_IND,
	BPF_S_LD_IMM,

	/* Loads into X */
	BPF_S_LDX_W_LEN,
	BPF_S_LDX_B_MSH,
	BPF_S_LDX_IMM,

	/* Register transfers */
	BPF_S_MISC_TAX,
	BPF_S_MISC_TXA,

	/* Division by constant (K is pre-processed by sk_chk_filter()) */
	BPF_S_ALU_DIV_K,

	/* Scratch memory access */
	BPF_S_LD_MEM,
	BPF_S_LDX_MEM,
	BPF_S_ST,
	BPF_S_STX,

	/* Jumps */
	BPF_S_JMP_JA,
	BPF_S_JMP_JEQ_K,
	BPF_S_JMP_JEQ_X,
	BPF_S_JMP_JGE_K,
	BPF_S_JMP_JGE_X,
	BPF_S_JMP_JGT_K,
	BPF_S_JMP_JGT_X,
	BPF_S_JMP_JSET_K,
	BPF_S_JMP_JSET_X,

	/* Ancillary data */
	BPF_S_ANC_PROTOCOL,
	BPF_S_ANC_PKTTYPE,
	BPF_S_ANC_IFINDEX,
	BPF_S_ANC_NLATTR,
	BPF_S_ANC_NLATTR_NEST,
	BPF_S_ANC_MARK,
	BPF_S_ANC_QUEUE,
	BPF_S_ANC_HATYPE,
	BPF_S_ANC_RXHASH,
	BPF_S_ANC_CPU,
};

L
Linus Torvalds 已提交
101
/* No hurry in this branch */
102
static void *__load_pointer(const struct sk_buff *skb, int k, unsigned int size)
L
Linus Torvalds 已提交
103 104 105 106
{
	u8 *ptr = NULL;

	if (k >= SKF_NET_OFF)
107
		ptr = skb_network_header(skb) + k - SKF_NET_OFF;
L
Linus Torvalds 已提交
108
	else if (k >= SKF_LL_OFF)
109
		ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
L
Linus Torvalds 已提交
110

111
	if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
L
Linus Torvalds 已提交
112 113 114 115
		return ptr;
	return NULL;
}

E
Eric Dumazet 已提交
116
/*
 * Return a pointer to @size bytes of packet data at offset @k, or NULL.
 *
 * Non-negative offsets are handed to skb_header_pointer() together with
 * the caller's @buffer; negative offsets are the special SKF_*_OFF
 * encodings and are resolved by __load_pointer().
 */
static inline void *load_pointer(const struct sk_buff *skb, int k,
				 unsigned int size, void *buffer)
{
	return k < 0 ? __load_pointer(skb, k, size)
		     : skb_header_pointer(skb, k, size, buffer);
}

S
Stephen Hemminger 已提交
124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
/**
 *	sk_filter - run a packet through a socket filter
 *	@sk: sock associated with &sk_buff
 *	@skb: buffer to filter
 *
 * Run the filter code and then cut skb->data to correct size returned by
 * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller
 * than pkt_len we keep whole skb->data. This is the socket level
 * wrapper to sk_run_filter. It returns 0 if the packet should
 * be accepted or -EPERM if the packet should be tossed.
 *
 */
int sk_filter(struct sock *sk, struct sk_buff *skb)
{
	int err;
	struct sk_filter *filter;

	err = security_sock_rcv_skb(sk, skb);
	if (err)
		return err;

145 146
	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
S
Stephen Hemminger 已提交
147
	if (filter) {
E
Eric Dumazet 已提交
148
		unsigned int pkt_len = sk_run_filter(skb, filter->insns);
149

S
Stephen Hemminger 已提交
150 151
		err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
	}
152
	rcu_read_unlock();
S
Stephen Hemminger 已提交
153 154 155 156 157

	return err;
}
EXPORT_SYMBOL(sk_filter);

L
Linus Torvalds 已提交
158
/**
159
 *	sk_run_filter - run a filter on a socket
L
Linus Torvalds 已提交
160
 *	@skb: buffer to run the filter on
161
 *	@fentry: filter to apply
L
Linus Torvalds 已提交
162 163
 *
 * Decode and apply filter instructions to the skb->data.
E
Eric Dumazet 已提交
164 165 166 167 168
 * Return length to keep, 0 for none. @skb is the data we are
 * filtering, @filter is the array of filter instructions.
 * Because all jumps are guaranteed to be before last instruction,
 * and last instruction guaranteed to be a RET, we dont need to check
 * flen. (We used to pass to this function the length of filter)
L
Linus Torvalds 已提交
169
 */
E
Eric Dumazet 已提交
170 171
unsigned int sk_run_filter(const struct sk_buff *skb,
			   const struct sock_filter *fentry)
L
Linus Torvalds 已提交
172
{
173
	void *ptr;
174 175
	u32 A = 0;			/* Accumulator */
	u32 X = 0;			/* Index Register */
L
Linus Torvalds 已提交
176
	u32 mem[BPF_MEMWORDS];		/* Scratch Memory Store */
177
	u32 tmp;
L
Linus Torvalds 已提交
178 179 180 181 182
	int k;

	/*
	 * Process array of filter instructions.
	 */
E
Eric Dumazet 已提交
183 184 185 186 187 188
	for (;; fentry++) {
#if defined(CONFIG_X86_32)
#define	K (fentry->k)
#else
		const u32 K = fentry->k;
#endif
189

L
Linus Torvalds 已提交
190
		switch (fentry->code) {
191
		case BPF_S_ALU_ADD_X:
L
Linus Torvalds 已提交
192 193
			A += X;
			continue;
194
		case BPF_S_ALU_ADD_K:
E
Eric Dumazet 已提交
195
			A += K;
L
Linus Torvalds 已提交
196
			continue;
197
		case BPF_S_ALU_SUB_X:
L
Linus Torvalds 已提交
198 199
			A -= X;
			continue;
200
		case BPF_S_ALU_SUB_K:
E
Eric Dumazet 已提交
201
			A -= K;
L
Linus Torvalds 已提交
202
			continue;
203
		case BPF_S_ALU_MUL_X:
L
Linus Torvalds 已提交
204 205
			A *= X;
			continue;
206
		case BPF_S_ALU_MUL_K:
E
Eric Dumazet 已提交
207
			A *= K;
L
Linus Torvalds 已提交
208
			continue;
209
		case BPF_S_ALU_DIV_X:
L
Linus Torvalds 已提交
210 211 212 213
			if (X == 0)
				return 0;
			A /= X;
			continue;
214
		case BPF_S_ALU_DIV_K:
E
Eric Dumazet 已提交
215
			A = reciprocal_divide(A, K);
L
Linus Torvalds 已提交
216
			continue;
217
		case BPF_S_ALU_AND_X:
L
Linus Torvalds 已提交
218 219
			A &= X;
			continue;
220
		case BPF_S_ALU_AND_K:
E
Eric Dumazet 已提交
221
			A &= K;
L
Linus Torvalds 已提交
222
			continue;
223
		case BPF_S_ALU_OR_X:
L
Linus Torvalds 已提交
224 225
			A |= X;
			continue;
226
		case BPF_S_ALU_OR_K:
E
Eric Dumazet 已提交
227
			A |= K;
L
Linus Torvalds 已提交
228
			continue;
229
		case BPF_S_ALU_LSH_X:
L
Linus Torvalds 已提交
230 231
			A <<= X;
			continue;
232
		case BPF_S_ALU_LSH_K:
E
Eric Dumazet 已提交
233
			A <<= K;
L
Linus Torvalds 已提交
234
			continue;
235
		case BPF_S_ALU_RSH_X:
L
Linus Torvalds 已提交
236 237
			A >>= X;
			continue;
238
		case BPF_S_ALU_RSH_K:
E
Eric Dumazet 已提交
239
			A >>= K;
L
Linus Torvalds 已提交
240
			continue;
241
		case BPF_S_ALU_NEG:
L
Linus Torvalds 已提交
242 243
			A = -A;
			continue;
244
		case BPF_S_JMP_JA:
E
Eric Dumazet 已提交
245
			fentry += K;
L
Linus Torvalds 已提交
246
			continue;
247
		case BPF_S_JMP_JGT_K:
E
Eric Dumazet 已提交
248
			fentry += (A > K) ? fentry->jt : fentry->jf;
L
Linus Torvalds 已提交
249
			continue;
250
		case BPF_S_JMP_JGE_K:
E
Eric Dumazet 已提交
251
			fentry += (A >= K) ? fentry->jt : fentry->jf;
L
Linus Torvalds 已提交
252
			continue;
253
		case BPF_S_JMP_JEQ_K:
E
Eric Dumazet 已提交
254
			fentry += (A == K) ? fentry->jt : fentry->jf;
L
Linus Torvalds 已提交
255
			continue;
256
		case BPF_S_JMP_JSET_K:
E
Eric Dumazet 已提交
257
			fentry += (A & K) ? fentry->jt : fentry->jf;
L
Linus Torvalds 已提交
258
			continue;
259
		case BPF_S_JMP_JGT_X:
E
Eric Dumazet 已提交
260
			fentry += (A > X) ? fentry->jt : fentry->jf;
L
Linus Torvalds 已提交
261
			continue;
262
		case BPF_S_JMP_JGE_X:
E
Eric Dumazet 已提交
263
			fentry += (A >= X) ? fentry->jt : fentry->jf;
L
Linus Torvalds 已提交
264
			continue;
265
		case BPF_S_JMP_JEQ_X:
E
Eric Dumazet 已提交
266
			fentry += (A == X) ? fentry->jt : fentry->jf;
L
Linus Torvalds 已提交
267
			continue;
268
		case BPF_S_JMP_JSET_X:
E
Eric Dumazet 已提交
269
			fentry += (A & X) ? fentry->jt : fentry->jf;
L
Linus Torvalds 已提交
270
			continue;
271
		case BPF_S_LD_W_ABS:
E
Eric Dumazet 已提交
272
			k = K;
273
load_w:
274 275
			ptr = load_pointer(skb, k, 4, &tmp);
			if (ptr != NULL) {
276
				A = get_unaligned_be32(ptr);
277
				continue;
L
Linus Torvalds 已提交
278
			}
279
			return 0;
280
		case BPF_S_LD_H_ABS:
E
Eric Dumazet 已提交
281
			k = K;
282
load_h:
283 284
			ptr = load_pointer(skb, k, 2, &tmp);
			if (ptr != NULL) {
285
				A = get_unaligned_be16(ptr);
286
				continue;
L
Linus Torvalds 已提交
287
			}
288
			return 0;
289
		case BPF_S_LD_B_ABS:
E
Eric Dumazet 已提交
290
			k = K;
L
Linus Torvalds 已提交
291
load_b:
292 293 294 295
			ptr = load_pointer(skb, k, 1, &tmp);
			if (ptr != NULL) {
				A = *(u8 *)ptr;
				continue;
L
Linus Torvalds 已提交
296
			}
297
			return 0;
298
		case BPF_S_LD_W_LEN:
299
			A = skb->len;
L
Linus Torvalds 已提交
300
			continue;
301
		case BPF_S_LDX_W_LEN:
302
			X = skb->len;
L
Linus Torvalds 已提交
303
			continue;
304
		case BPF_S_LD_W_IND:
E
Eric Dumazet 已提交
305
			k = X + K;
L
Linus Torvalds 已提交
306
			goto load_w;
307
		case BPF_S_LD_H_IND:
E
Eric Dumazet 已提交
308
			k = X + K;
L
Linus Torvalds 已提交
309
			goto load_h;
310
		case BPF_S_LD_B_IND:
E
Eric Dumazet 已提交
311
			k = X + K;
L
Linus Torvalds 已提交
312
			goto load_b;
313
		case BPF_S_LDX_B_MSH:
E
Eric Dumazet 已提交
314
			ptr = load_pointer(skb, K, 1, &tmp);
315 316 317 318 319
			if (ptr != NULL) {
				X = (*(u8 *)ptr & 0xf) << 2;
				continue;
			}
			return 0;
320
		case BPF_S_LD_IMM:
E
Eric Dumazet 已提交
321
			A = K;
L
Linus Torvalds 已提交
322
			continue;
323
		case BPF_S_LDX_IMM:
E
Eric Dumazet 已提交
324
			X = K;
L
Linus Torvalds 已提交
325
			continue;
326
		case BPF_S_LD_MEM:
327
			A = mem[K];
L
Linus Torvalds 已提交
328
			continue;
329
		case BPF_S_LDX_MEM:
330
			X = mem[K];
L
Linus Torvalds 已提交
331
			continue;
332
		case BPF_S_MISC_TAX:
L
Linus Torvalds 已提交
333 334
			X = A;
			continue;
335
		case BPF_S_MISC_TXA:
L
Linus Torvalds 已提交
336 337
			A = X;
			continue;
338
		case BPF_S_RET_K:
E
Eric Dumazet 已提交
339
			return K;
340
		case BPF_S_RET_A:
341
			return A;
342
		case BPF_S_ST:
E
Eric Dumazet 已提交
343
			mem[K] = A;
L
Linus Torvalds 已提交
344
			continue;
345
		case BPF_S_STX:
E
Eric Dumazet 已提交
346
			mem[K] = X;
L
Linus Torvalds 已提交
347
			continue;
348
		case BPF_S_ANC_PROTOCOL:
A
Al Viro 已提交
349
			A = ntohs(skb->protocol);
L
Linus Torvalds 已提交
350
			continue;
351
		case BPF_S_ANC_PKTTYPE:
L
Linus Torvalds 已提交
352 353
			A = skb->pkt_type;
			continue;
354
		case BPF_S_ANC_IFINDEX:
355 356
			if (!skb->dev)
				return 0;
L
Linus Torvalds 已提交
357 358
			A = skb->dev->ifindex;
			continue;
359
		case BPF_S_ANC_MARK:
J
jamal 已提交
360 361
			A = skb->mark;
			continue;
362
		case BPF_S_ANC_QUEUE:
363 364
			A = skb->queue_mapping;
			continue;
365
		case BPF_S_ANC_HATYPE:
366 367 368 369
			if (!skb->dev)
				return 0;
			A = skb->dev->type;
			continue;
370
		case BPF_S_ANC_RXHASH:
371 372
			A = skb->rxhash;
			continue;
373
		case BPF_S_ANC_CPU:
374 375
			A = raw_smp_processor_id();
			continue;
376
		case BPF_S_ANC_NLATTR: {
377 378 379 380 381 382 383 384 385 386 387 388 389 390 391
			struct nlattr *nla;

			if (skb_is_nonlinear(skb))
				return 0;
			if (A > skb->len - sizeof(struct nlattr))
				return 0;

			nla = nla_find((struct nlattr *)&skb->data[A],
				       skb->len - A, X);
			if (nla)
				A = (void *)nla - (void *)skb->data;
			else
				A = 0;
			continue;
		}
392
		case BPF_S_ANC_NLATTR_NEST: {
393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410
			struct nlattr *nla;

			if (skb_is_nonlinear(skb))
				return 0;
			if (A > skb->len - sizeof(struct nlattr))
				return 0;

			nla = (struct nlattr *)&skb->data[A];
			if (nla->nla_len > A - skb->len)
				return 0;

			nla = nla_find_nested(nla, X);
			if (nla)
				A = (void *)nla - (void *)skb->data;
			else
				A = 0;
			continue;
		}
L
Linus Torvalds 已提交
411
		default:
412
			WARN_ON(1);
L
Linus Torvalds 已提交
413 414 415 416 417 418
			return 0;
		}
	}

	return 0;
}
419
EXPORT_SYMBOL(sk_run_filter);
L
Linus Torvalds 已提交
420

421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480
/*
 * Security :
 * A BPF program is able to use 16 cells of memory to store intermediate
 * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter())
 * As we dont want to clear mem[] array for each packet going through
 * sk_run_filter(), we check that filter loaded by user never try to read
 * a cell if not previously written, and we check all branches to be sure
 * a malicious user doesnt try to abuse us.
 */
static int check_load_and_stores(struct sock_filter *filter, int flen)
{
	u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */
	int pc, ret = 0;

	BUILD_BUG_ON(BPF_MEMWORDS > 16);
	masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL);
	if (!masks)
		return -ENOMEM;
	memset(masks, 0xff, flen * sizeof(*masks));

	for (pc = 0; pc < flen; pc++) {
		memvalid &= masks[pc];

		switch (filter[pc].code) {
		case BPF_S_ST:
		case BPF_S_STX:
			memvalid |= (1 << filter[pc].k);
			break;
		case BPF_S_LD_MEM:
		case BPF_S_LDX_MEM:
			if (!(memvalid & (1 << filter[pc].k))) {
				ret = -EINVAL;
				goto error;
			}
			break;
		case BPF_S_JMP_JA:
			/* a jump must set masks on target */
			masks[pc + 1 + filter[pc].k] &= memvalid;
			memvalid = ~0;
			break;
		case BPF_S_JMP_JEQ_K:
		case BPF_S_JMP_JEQ_X:
		case BPF_S_JMP_JGE_K:
		case BPF_S_JMP_JGE_X:
		case BPF_S_JMP_JGT_K:
		case BPF_S_JMP_JGT_X:
		case BPF_S_JMP_JSET_X:
		case BPF_S_JMP_JSET_K:
			/* a jump must set masks on targets */
			masks[pc + 1 + filter[pc].jt] &= memvalid;
			masks[pc + 1 + filter[pc].jf] &= memvalid;
			memvalid = ~0;
			break;
		}
	}
error:
	kfree(masks);
	return ret;
}

L
Linus Torvalds 已提交
481 482 483 484 485 486 487
/**
 *	sk_chk_filter - verify socket filter code
 *	@filter: filter to verify
 *	@flen: length of filter
 *
 * Check the user's filter code. If we let some ugly
 * filter code slip through kaboom! The filter must contain
488 489
 * no references or jumps that are out of range, no illegal
 * instructions, and must end with a RET instruction.
L
Linus Torvalds 已提交
490
 *
491 492 493
 * All jumps are forward as they are not signed.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
L
Linus Torvalds 已提交
494 495 496
 */
int sk_chk_filter(struct sock_filter *filter, int flen)
{
497 498 499 500 501
	/*
	 * Valid instructions are initialized to non-0.
	 * Invalid instructions are initialized to 0.
	 */
	static const u8 codes[] = {
E
Eric Dumazet 已提交
502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546
		[BPF_ALU|BPF_ADD|BPF_K]  = BPF_S_ALU_ADD_K,
		[BPF_ALU|BPF_ADD|BPF_X]  = BPF_S_ALU_ADD_X,
		[BPF_ALU|BPF_SUB|BPF_K]  = BPF_S_ALU_SUB_K,
		[BPF_ALU|BPF_SUB|BPF_X]  = BPF_S_ALU_SUB_X,
		[BPF_ALU|BPF_MUL|BPF_K]  = BPF_S_ALU_MUL_K,
		[BPF_ALU|BPF_MUL|BPF_X]  = BPF_S_ALU_MUL_X,
		[BPF_ALU|BPF_DIV|BPF_X]  = BPF_S_ALU_DIV_X,
		[BPF_ALU|BPF_AND|BPF_K]  = BPF_S_ALU_AND_K,
		[BPF_ALU|BPF_AND|BPF_X]  = BPF_S_ALU_AND_X,
		[BPF_ALU|BPF_OR|BPF_K]   = BPF_S_ALU_OR_K,
		[BPF_ALU|BPF_OR|BPF_X]   = BPF_S_ALU_OR_X,
		[BPF_ALU|BPF_LSH|BPF_K]  = BPF_S_ALU_LSH_K,
		[BPF_ALU|BPF_LSH|BPF_X]  = BPF_S_ALU_LSH_X,
		[BPF_ALU|BPF_RSH|BPF_K]  = BPF_S_ALU_RSH_K,
		[BPF_ALU|BPF_RSH|BPF_X]  = BPF_S_ALU_RSH_X,
		[BPF_ALU|BPF_NEG]        = BPF_S_ALU_NEG,
		[BPF_LD|BPF_W|BPF_ABS]   = BPF_S_LD_W_ABS,
		[BPF_LD|BPF_H|BPF_ABS]   = BPF_S_LD_H_ABS,
		[BPF_LD|BPF_B|BPF_ABS]   = BPF_S_LD_B_ABS,
		[BPF_LD|BPF_W|BPF_LEN]   = BPF_S_LD_W_LEN,
		[BPF_LD|BPF_W|BPF_IND]   = BPF_S_LD_W_IND,
		[BPF_LD|BPF_H|BPF_IND]   = BPF_S_LD_H_IND,
		[BPF_LD|BPF_B|BPF_IND]   = BPF_S_LD_B_IND,
		[BPF_LD|BPF_IMM]         = BPF_S_LD_IMM,
		[BPF_LDX|BPF_W|BPF_LEN]  = BPF_S_LDX_W_LEN,
		[BPF_LDX|BPF_B|BPF_MSH]  = BPF_S_LDX_B_MSH,
		[BPF_LDX|BPF_IMM]        = BPF_S_LDX_IMM,
		[BPF_MISC|BPF_TAX]       = BPF_S_MISC_TAX,
		[BPF_MISC|BPF_TXA]       = BPF_S_MISC_TXA,
		[BPF_RET|BPF_K]          = BPF_S_RET_K,
		[BPF_RET|BPF_A]          = BPF_S_RET_A,
		[BPF_ALU|BPF_DIV|BPF_K]  = BPF_S_ALU_DIV_K,
		[BPF_LD|BPF_MEM]         = BPF_S_LD_MEM,
		[BPF_LDX|BPF_MEM]        = BPF_S_LDX_MEM,
		[BPF_ST]                 = BPF_S_ST,
		[BPF_STX]                = BPF_S_STX,
		[BPF_JMP|BPF_JA]         = BPF_S_JMP_JA,
		[BPF_JMP|BPF_JEQ|BPF_K]  = BPF_S_JMP_JEQ_K,
		[BPF_JMP|BPF_JEQ|BPF_X]  = BPF_S_JMP_JEQ_X,
		[BPF_JMP|BPF_JGE|BPF_K]  = BPF_S_JMP_JGE_K,
		[BPF_JMP|BPF_JGE|BPF_X]  = BPF_S_JMP_JGE_X,
		[BPF_JMP|BPF_JGT|BPF_K]  = BPF_S_JMP_JGT_K,
		[BPF_JMP|BPF_JGT|BPF_X]  = BPF_S_JMP_JGT_X,
		[BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K,
		[BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X,
547
	};
L
Linus Torvalds 已提交
548 549
	int pc;

550
	if (flen == 0 || flen > BPF_MAXINSNS)
L
Linus Torvalds 已提交
551 552 553 554
		return -EINVAL;

	/* check the filter code now */
	for (pc = 0; pc < flen; pc++) {
555 556
		struct sock_filter *ftest = &filter[pc];
		u16 code = ftest->code;
557

558 559 560
		if (code >= ARRAY_SIZE(codes))
			return -EINVAL;
		code = codes[code];
E
Eric Dumazet 已提交
561
		if (!code)
562
			return -EINVAL;
563
		/* Some instructions need special checks */
564 565
		switch (code) {
		case BPF_S_ALU_DIV_K:
566 567
			/* check for division by zero */
			if (ftest->k == 0)
L
Linus Torvalds 已提交
568
				return -EINVAL;
E
Eric Dumazet 已提交
569
			ftest->k = reciprocal_value(ftest->k);
570
			break;
571 572 573 574 575
		case BPF_S_LD_MEM:
		case BPF_S_LDX_MEM:
		case BPF_S_ST:
		case BPF_S_STX:
			/* check for invalid memory addresses */
576 577 578
			if (ftest->k >= BPF_MEMWORDS)
				return -EINVAL;
			break;
579
		case BPF_S_JMP_JA:
580 581 582 583 584 585 586
			/*
			 * Note, the large ftest->k might cause loops.
			 * Compare this with conditional jumps below,
			 * where offsets are limited. --ANK (981016)
			 */
			if (ftest->k >= (unsigned)(flen-pc-1))
				return -EINVAL;
587 588 589 590 591 592 593 594 595
			break;
		case BPF_S_JMP_JEQ_K:
		case BPF_S_JMP_JEQ_X:
		case BPF_S_JMP_JGE_K:
		case BPF_S_JMP_JGE_X:
		case BPF_S_JMP_JGT_K:
		case BPF_S_JMP_JGT_X:
		case BPF_S_JMP_JSET_X:
		case BPF_S_JMP_JSET_K:
596
			/* for conditionals both must be safe */
597
			if (pc + ftest->jt + 1 >= flen ||
598 599
			    pc + ftest->jf + 1 >= flen)
				return -EINVAL;
600
			break;
601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618
		case BPF_S_LD_W_ABS:
		case BPF_S_LD_H_ABS:
		case BPF_S_LD_B_ABS:
#define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE:	\
				code = BPF_S_ANC_##CODE;	\
				break
			switch (ftest->k) {
			ANCILLARY(PROTOCOL);
			ANCILLARY(PKTTYPE);
			ANCILLARY(IFINDEX);
			ANCILLARY(NLATTR);
			ANCILLARY(NLATTR_NEST);
			ANCILLARY(MARK);
			ANCILLARY(QUEUE);
			ANCILLARY(HATYPE);
			ANCILLARY(RXHASH);
			ANCILLARY(CPU);
			}
619
		}
620
		ftest->code = code;
621
	}
622

623 624 625 626
	/* last instruction must be a RET code */
	switch (filter[flen - 1].code) {
	case BPF_S_RET_K:
	case BPF_S_RET_A:
627
		return check_load_and_stores(filter, flen);
628 629
	}
	return -EINVAL;
L
Linus Torvalds 已提交
630
}
631
EXPORT_SYMBOL(sk_chk_filter);
L
Linus Torvalds 已提交
632

633
/**
E
Eric Dumazet 已提交
634
 * 	sk_filter_release_rcu - Release a socket filter by rcu_head
635 636
 *	@rcu: rcu_head that contains the sk_filter to free
 */
E
Eric Dumazet 已提交
637
void sk_filter_release_rcu(struct rcu_head *rcu)
638 639 640
{
	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);

E
Eric Dumazet 已提交
641
	kfree(fp);
642
}
E
Eric Dumazet 已提交
643
EXPORT_SYMBOL(sk_filter_release_rcu);
644

L
Linus Torvalds 已提交
645 646 647 648 649 650 651 652 653 654 655 656
/**
 *	sk_attach_filter - attach a socket filter
 *	@fprog: the filter program
 *	@sk: the socket to use
 *
 * Attach the user's filter code. We first run some sanity checks on
 * it to make sure it does not explode on us later. If an error
 * occurs or there is insufficient memory for the filter a negative
 * errno code is returned. On success the return is zero.
 */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
657
	struct sk_filter *fp, *old_fp;
L
Linus Torvalds 已提交
658 659 660 661
	unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
	int err;

	/* Make sure new filter is there and in the right amounts. */
662 663
	if (fprog->filter == NULL)
		return -EINVAL;
L
Linus Torvalds 已提交
664 665 666 667 668

	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
	if (!fp)
		return -ENOMEM;
	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
669
		sock_kfree_s(sk, fp, fsize+sizeof(*fp));
L
Linus Torvalds 已提交
670 671 672 673 674 675 676
		return -EFAULT;
	}

	atomic_set(&fp->refcnt, 1);
	fp->len = fprog->len;

	err = sk_chk_filter(fp->insns, fp->len);
677 678 679
	if (err) {
		sk_filter_uncharge(sk, fp);
		return err;
L
Linus Torvalds 已提交
680 681
	}

682 683
	old_fp = rcu_dereference_protected(sk->sk_filter,
					   sock_owned_by_user(sk));
684 685
	rcu_assign_pointer(sk->sk_filter, fp);

686
	if (old_fp)
E
Eric Dumazet 已提交
687
		sk_filter_uncharge(sk, old_fp);
688
	return 0;
L
Linus Torvalds 已提交
689
}
690
EXPORT_SYMBOL_GPL(sk_attach_filter);
L
Linus Torvalds 已提交
691

692 693 694 695 696
int sk_detach_filter(struct sock *sk)
{
	int ret = -ENOENT;
	struct sk_filter *filter;

697 698
	filter = rcu_dereference_protected(sk->sk_filter,
					   sock_owned_by_user(sk));
699 700
	if (filter) {
		rcu_assign_pointer(sk->sk_filter, NULL);
E
Eric Dumazet 已提交
701
		sk_filter_uncharge(sk, filter);
702 703 704 705
		ret = 0;
	}
	return ret;
}
706
EXPORT_SYMBOL_GPL(sk_detach_filter);