/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Author:
 *     Jay Schulist <jschlst@samba.org>
 *
 * Based on the design of:
 *     - The Berkeley Packet Filter
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in sk_chk_filter()
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/gfp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/unaligned.h>
#include <linux/filter.h>
#include <linux/reciprocal_div.h>
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
#include <linux/if_vlan.h>

/* No hurry in this branch
 *
 * Exported for the bpf jit load helper.
 */
void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
{
	u8 *ptr = NULL;

	if (k >= SKF_NET_OFF)
		ptr = skb_network_header(skb) + k - SKF_NET_OFF;
	else if (k >= SKF_LL_OFF)
		ptr = skb_mac_header(skb) + k - SKF_LL_OFF;

	if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
		return ptr;
	return NULL;
}

static inline void *load_pointer(const struct sk_buff *skb, int k,
				 unsigned int size, void *buffer)
{
	if (k >= 0)
		return skb_header_pointer(skb, k, size, buffer);
	return bpf_internal_load_pointer_neg_helper(skb, k, size);
}
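
/*
 * Illustrative sketch only (not part of the original file): a classic BPF
 * program relying on the negative-offset convention handled by the helpers
 * above.  SKF_NET_OFF + 9 loads the byte at offset 9 of the network header
 * (the IPv4 protocol field), so the program accepts TCP packets and drops
 * everything else.  Like any filter, it must pass sk_chk_filter() before
 * sk_run_filter() may execute it.
 */
static const struct sock_filter accept_tcp_prog[] __maybe_unused = {
	BPF_STMT(BPF_LD | BPF_B | BPF_ABS, SKF_NET_OFF + 9),	 /* A = ip proto */
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, IPPROTO_TCP, 0, 1), /* TCP?         */
	BPF_STMT(BPF_RET | BPF_K, 0xffff),			 /* yes: accept  */
	BPF_STMT(BPF_RET | BPF_K, 0),				 /* no:  drop    */
};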

/**
 *	sk_filter - run a packet through a socket filter
 *	@sk: sock associated with &sk_buff
 *	@skb: buffer to filter
 *
 * Run the filter code and then trim skb->data to the size returned by
 * sk_run_filter. If pkt_len is 0 we toss the packet. If skb->len is smaller
 * than pkt_len we keep the whole skb->data. This is the socket level
 * wrapper to sk_run_filter. It returns 0 if the packet should
 * be accepted or -EPERM if the packet should be tossed.
 *
 */
int sk_filter(struct sock *sk, struct sk_buff *skb)
{
	int err;
	struct sk_filter *filter;

	/*
	 * If the skb was allocated from pfmemalloc reserves, only
	 * allow SOCK_MEMALLOC sockets to use it as this socket is
	 * helping free memory
	 */
	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
		return -ENOMEM;

	err = security_sock_rcv_skb(sk, skb);
	if (err)
		return err;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter) {
		unsigned int pkt_len = SK_RUN_FILTER(filter, skb);

		err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
	}
	rcu_read_unlock();

	return err;
}
EXPORT_SYMBOL(sk_filter);
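
/*
 * Illustrative sketch only: how a protocol receive path might consult
 * sk_filter() before queueing a packet.  my_proto_queue_rcv() is a
 * hypothetical caller, not part of this file; real callers such as
 * sock_queue_rcv_skb() add receive-buffer charging and queueing on top.
 */
static inline int my_proto_queue_rcv(struct sock *sk, struct sk_buff *skb)
{
	int err = sk_filter(sk, skb);	/* may trim skb->data on success */

	if (err) {			/* -EPERM (dropped) or -ENOMEM */
		kfree_skb(skb);
		return err;
	}
	/* ... queue skb on sk->sk_receive_queue ... */
	return 0;
}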

/**
 *	sk_run_filter - run a filter on a socket
 *	@skb: buffer to run the filter on
 *	@fentry: filter to apply
 *
 * Decode and apply filter instructions to the skb->data.
 * Return the length to keep, 0 for none. @skb is the data we are
 * filtering, @fentry is the array of filter instructions.
 * Because all jumps are guaranteed to be before the last instruction,
 * and the last instruction is guaranteed to be a RET, we don't need to check
 * flen. (We used to pass the length of the filter to this function.)
 */
unsigned int sk_run_filter(const struct sk_buff *skb,
			   const struct sock_filter *fentry)
{
	void *ptr;
	u32 A = 0;			/* Accumulator */
	u32 X = 0;			/* Index Register */
	u32 mem[BPF_MEMWORDS];		/* Scratch Memory Store */
	u32 tmp;
	int k;

	/*
	 * Process array of filter instructions.
	 */
	for (;; fentry++) {
#if defined(CONFIG_X86_32)
#define	K (fentry->k)
#else
		const u32 K = fentry->k;
#endif

		switch (fentry->code) {
		case BPF_S_ALU_ADD_X:
			A += X;
			continue;
		case BPF_S_ALU_ADD_K:
			A += K;
			continue;
		case BPF_S_ALU_SUB_X:
			A -= X;
			continue;
		case BPF_S_ALU_SUB_K:
			A -= K;
			continue;
		case BPF_S_ALU_MUL_X:
			A *= X;
			continue;
		case BPF_S_ALU_MUL_K:
			A *= K;
			continue;
		case BPF_S_ALU_DIV_X:
			if (X == 0)
				return 0;
			A /= X;
			continue;
		case BPF_S_ALU_DIV_K:
			A = reciprocal_divide(A, K);
			continue;
		case BPF_S_ALU_MOD_X:
			if (X == 0)
				return 0;
			A %= X;
			continue;
		case BPF_S_ALU_MOD_K:
			A %= K;
			continue;
		case BPF_S_ALU_AND_X:
			A &= X;
			continue;
		case BPF_S_ALU_AND_K:
			A &= K;
			continue;
		case BPF_S_ALU_OR_X:
			A |= X;
			continue;
		case BPF_S_ALU_OR_K:
			A |= K;
			continue;
		case BPF_S_ANC_ALU_XOR_X:
		case BPF_S_ALU_XOR_X:
			A ^= X;
			continue;
		case BPF_S_ALU_XOR_K:
			A ^= K;
			continue;
		case BPF_S_ALU_LSH_X:
			A <<= X;
			continue;
		case BPF_S_ALU_LSH_K:
			A <<= K;
			continue;
		case BPF_S_ALU_RSH_X:
			A >>= X;
			continue;
		case BPF_S_ALU_RSH_K:
			A >>= K;
			continue;
		case BPF_S_ALU_NEG:
			A = -A;
			continue;
		case BPF_S_JMP_JA:
			fentry += K;
			continue;
		case BPF_S_JMP_JGT_K:
			fentry += (A > K) ? fentry->jt : fentry->jf;
			continue;
		case BPF_S_JMP_JGE_K:
			fentry += (A >= K) ? fentry->jt : fentry->jf;
			continue;
		case BPF_S_JMP_JEQ_K:
			fentry += (A == K) ? fentry->jt : fentry->jf;
			continue;
		case BPF_S_JMP_JSET_K:
			fentry += (A & K) ? fentry->jt : fentry->jf;
			continue;
		case BPF_S_JMP_JGT_X:
			fentry += (A > X) ? fentry->jt : fentry->jf;
			continue;
		case BPF_S_JMP_JGE_X:
			fentry += (A >= X) ? fentry->jt : fentry->jf;
			continue;
		case BPF_S_JMP_JEQ_X:
			fentry += (A == X) ? fentry->jt : fentry->jf;
			continue;
		case BPF_S_JMP_JSET_X:
			fentry += (A & X) ? fentry->jt : fentry->jf;
			continue;
		case BPF_S_LD_W_ABS:
			k = K;
load_w:
			ptr = load_pointer(skb, k, 4, &tmp);
			if (ptr != NULL) {
				A = get_unaligned_be32(ptr);
				continue;
			}
			return 0;
		case BPF_S_LD_H_ABS:
			k = K;
load_h:
			ptr = load_pointer(skb, k, 2, &tmp);
			if (ptr != NULL) {
				A = get_unaligned_be16(ptr);
				continue;
			}
			return 0;
		case BPF_S_LD_B_ABS:
			k = K;
load_b:
			ptr = load_pointer(skb, k, 1, &tmp);
			if (ptr != NULL) {
				A = *(u8 *)ptr;
				continue;
			}
			return 0;
		case BPF_S_LD_W_LEN:
			A = skb->len;
			continue;
		case BPF_S_LDX_W_LEN:
			X = skb->len;
			continue;
		case BPF_S_LD_W_IND:
			k = X + K;
			goto load_w;
		case BPF_S_LD_H_IND:
			k = X + K;
			goto load_h;
		case BPF_S_LD_B_IND:
			k = X + K;
			goto load_b;
		case BPF_S_LDX_B_MSH:
			ptr = load_pointer(skb, K, 1, &tmp);
			if (ptr != NULL) {
				X = (*(u8 *)ptr & 0xf) << 2;
				continue;
			}
			return 0;
		case BPF_S_LD_IMM:
			A = K;
			continue;
		case BPF_S_LDX_IMM:
			X = K;
			continue;
		case BPF_S_LD_MEM:
			A = mem[K];
			continue;
		case BPF_S_LDX_MEM:
			X = mem[K];
			continue;
		case BPF_S_MISC_TAX:
			X = A;
			continue;
		case BPF_S_MISC_TXA:
			A = X;
			continue;
		case BPF_S_RET_K:
			return K;
		case BPF_S_RET_A:
			return A;
		case BPF_S_ST:
			mem[K] = A;
			continue;
		case BPF_S_STX:
			mem[K] = X;
			continue;
		case BPF_S_ANC_PROTOCOL:
			A = ntohs(skb->protocol);
			continue;
		case BPF_S_ANC_PKTTYPE:
			A = skb->pkt_type;
			continue;
		case BPF_S_ANC_IFINDEX:
			if (!skb->dev)
				return 0;
			A = skb->dev->ifindex;
			continue;
		case BPF_S_ANC_MARK:
			A = skb->mark;
			continue;
		case BPF_S_ANC_QUEUE:
			A = skb->queue_mapping;
			continue;
		case BPF_S_ANC_HATYPE:
			if (!skb->dev)
				return 0;
			A = skb->dev->type;
			continue;
		case BPF_S_ANC_RXHASH:
			A = skb->rxhash;
			continue;
		case BPF_S_ANC_CPU:
			A = raw_smp_processor_id();
			continue;
		case BPF_S_ANC_VLAN_TAG:
			A = vlan_tx_tag_get(skb);
			continue;
		case BPF_S_ANC_VLAN_TAG_PRESENT:
			A = !!vlan_tx_tag_present(skb);
			continue;
		case BPF_S_ANC_NLATTR: {
			struct nlattr *nla;

			if (skb_is_nonlinear(skb))
				return 0;
			if (A > skb->len - sizeof(struct nlattr))
				return 0;

			nla = nla_find((struct nlattr *)&skb->data[A],
				       skb->len - A, X);
			if (nla)
				A = (void *)nla - (void *)skb->data;
			else
				A = 0;
			continue;
		}
		case BPF_S_ANC_NLATTR_NEST: {
			struct nlattr *nla;

			if (skb_is_nonlinear(skb))
				return 0;
			if (A > skb->len - sizeof(struct nlattr))
				return 0;

			nla = (struct nlattr *)&skb->data[A];
			if (nla->nla_len > skb->len - A)
				return 0;

			nla = nla_find_nested(nla, X);
			if (nla)
				A = (void *)nla - (void *)skb->data;
			else
				A = 0;
			continue;
		}
#ifdef CONFIG_SECCOMP_FILTER
		case BPF_S_ANC_SECCOMP_LD_W:
			A = seccomp_bpf_load(fentry->k);
			continue;
#endif
		default:
			WARN_RATELIMIT(1, "Unknown code:%u jt:%u jf:%u k:%u\n",
				       fentry->code, fentry->jt,
				       fentry->jf, fentry->k);
			return 0;
		}
	}

	return 0;
}
EXPORT_SYMBOL(sk_run_filter);
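
/*
 * Illustrative sketch only: a minimal program for the interpreter above.
 * It loads the 16-bit ethertype at offset 12 (assuming an Ethernet link
 * layer), accepts IPv4 frames up to 64KB and drops everything else.  The
 * raw opcodes below are what user space supplies; sk_chk_filter() rewrites
 * them into the BPF_S_* codes the switch above dispatches on.
 */
static const struct sock_filter ipv4_only_prog[] __maybe_unused = {
	BPF_STMT(BPF_LD | BPF_H | BPF_ABS, 12),		      /* A = ethertype */
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_IP, 0, 1), /* IPv4?         */
	BPF_STMT(BPF_RET | BPF_K, 0xffff),		      /* accept        */
	BPF_STMT(BPF_RET | BPF_K, 0),			      /* drop          */
};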

/*
 * Security :
 * A BPF program is able to use 16 cells of memory to store intermediate
 * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter())
 * As we don't want to clear the mem[] array for each packet going through
 * sk_run_filter(), we check that filters loaded by the user never read
 * a cell before writing it, and we check all branches to make sure
 * a malicious user doesn't try to abuse us.
 */
static int check_load_and_stores(struct sock_filter *filter, int flen)
{
	u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */
	int pc, ret = 0;

	BUILD_BUG_ON(BPF_MEMWORDS > 16);
	masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL);
	if (!masks)
		return -ENOMEM;
	memset(masks, 0xff, flen * sizeof(*masks));

	for (pc = 0; pc < flen; pc++) {
		memvalid &= masks[pc];

		switch (filter[pc].code) {
		case BPF_S_ST:
		case BPF_S_STX:
			memvalid |= (1 << filter[pc].k);
			break;
		case BPF_S_LD_MEM:
		case BPF_S_LDX_MEM:
			if (!(memvalid & (1 << filter[pc].k))) {
				ret = -EINVAL;
				goto error;
			}
			break;
		case BPF_S_JMP_JA:
			/* a jump must set masks on target */
			masks[pc + 1 + filter[pc].k] &= memvalid;
			memvalid = ~0;
			break;
		case BPF_S_JMP_JEQ_K:
		case BPF_S_JMP_JEQ_X:
		case BPF_S_JMP_JGE_K:
		case BPF_S_JMP_JGE_X:
		case BPF_S_JMP_JGT_K:
		case BPF_S_JMP_JGT_X:
		case BPF_S_JMP_JSET_X:
		case BPF_S_JMP_JSET_K:
			/* a jump must set masks on targets */
			masks[pc + 1 + filter[pc].jt] &= memvalid;
			masks[pc + 1 + filter[pc].jf] &= memvalid;
			memvalid = ~0;
			break;
		}
	}
error:
	kfree(masks);
	return ret;
}
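
/*
 * Illustrative sketch only: a program that check_load_and_stores() rejects.
 * The load from scratch cell 0 happens before any ST/STX initialises it, so
 * sk_chk_filter() (which calls the function above) returns -EINVAL.
 */
static const struct sock_filter uninit_scratch_prog[] __maybe_unused = {
	BPF_STMT(BPF_LD | BPF_MEM, 0),	/* A = mem[0], never written */
	BPF_STMT(BPF_RET | BPF_A, 0),
};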

/**
 *	sk_chk_filter - verify socket filter code
 *	@filter: filter to verify
 *	@flen: length of filter
 *
 * Check the user's filter code. If we let some ugly
 * filter code slip through kaboom! The filter must contain
 * no references or jumps that are out of range, no illegal
 * instructions, and must end with a RET instruction.
 *
 * All jumps are forward as they are not signed.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
{
	/*
	 * Valid instructions are initialized to non-0.
	 * Invalid instructions are initialized to 0.
	 */
	static const u8 codes[] = {
		[BPF_ALU|BPF_ADD|BPF_K]  = BPF_S_ALU_ADD_K,
		[BPF_ALU|BPF_ADD|BPF_X]  = BPF_S_ALU_ADD_X,
		[BPF_ALU|BPF_SUB|BPF_K]  = BPF_S_ALU_SUB_K,
		[BPF_ALU|BPF_SUB|BPF_X]  = BPF_S_ALU_SUB_X,
		[BPF_ALU|BPF_MUL|BPF_K]  = BPF_S_ALU_MUL_K,
		[BPF_ALU|BPF_MUL|BPF_X]  = BPF_S_ALU_MUL_X,
		[BPF_ALU|BPF_DIV|BPF_X]  = BPF_S_ALU_DIV_X,
		[BPF_ALU|BPF_MOD|BPF_K]  = BPF_S_ALU_MOD_K,
		[BPF_ALU|BPF_MOD|BPF_X]  = BPF_S_ALU_MOD_X,
		[BPF_ALU|BPF_AND|BPF_K]  = BPF_S_ALU_AND_K,
		[BPF_ALU|BPF_AND|BPF_X]  = BPF_S_ALU_AND_X,
		[BPF_ALU|BPF_OR|BPF_K]   = BPF_S_ALU_OR_K,
		[BPF_ALU|BPF_OR|BPF_X]   = BPF_S_ALU_OR_X,
		[BPF_ALU|BPF_XOR|BPF_K]  = BPF_S_ALU_XOR_K,
		[BPF_ALU|BPF_XOR|BPF_X]  = BPF_S_ALU_XOR_X,
		[BPF_ALU|BPF_LSH|BPF_K]  = BPF_S_ALU_LSH_K,
		[BPF_ALU|BPF_LSH|BPF_X]  = BPF_S_ALU_LSH_X,
		[BPF_ALU|BPF_RSH|BPF_K]  = BPF_S_ALU_RSH_K,
		[BPF_ALU|BPF_RSH|BPF_X]  = BPF_S_ALU_RSH_X,
		[BPF_ALU|BPF_NEG]        = BPF_S_ALU_NEG,
		[BPF_LD|BPF_W|BPF_ABS]   = BPF_S_LD_W_ABS,
		[BPF_LD|BPF_H|BPF_ABS]   = BPF_S_LD_H_ABS,
		[BPF_LD|BPF_B|BPF_ABS]   = BPF_S_LD_B_ABS,
		[BPF_LD|BPF_W|BPF_LEN]   = BPF_S_LD_W_LEN,
		[BPF_LD|BPF_W|BPF_IND]   = BPF_S_LD_W_IND,
		[BPF_LD|BPF_H|BPF_IND]   = BPF_S_LD_H_IND,
		[BPF_LD|BPF_B|BPF_IND]   = BPF_S_LD_B_IND,
		[BPF_LD|BPF_IMM]         = BPF_S_LD_IMM,
		[BPF_LDX|BPF_W|BPF_LEN]  = BPF_S_LDX_W_LEN,
		[BPF_LDX|BPF_B|BPF_MSH]  = BPF_S_LDX_B_MSH,
		[BPF_LDX|BPF_IMM]        = BPF_S_LDX_IMM,
		[BPF_MISC|BPF_TAX]       = BPF_S_MISC_TAX,
		[BPF_MISC|BPF_TXA]       = BPF_S_MISC_TXA,
		[BPF_RET|BPF_K]          = BPF_S_RET_K,
		[BPF_RET|BPF_A]          = BPF_S_RET_A,
		[BPF_ALU|BPF_DIV|BPF_K]  = BPF_S_ALU_DIV_K,
		[BPF_LD|BPF_MEM]         = BPF_S_LD_MEM,
		[BPF_LDX|BPF_MEM]        = BPF_S_LDX_MEM,
		[BPF_ST]                 = BPF_S_ST,
		[BPF_STX]                = BPF_S_STX,
		[BPF_JMP|BPF_JA]         = BPF_S_JMP_JA,
		[BPF_JMP|BPF_JEQ|BPF_K]  = BPF_S_JMP_JEQ_K,
		[BPF_JMP|BPF_JEQ|BPF_X]  = BPF_S_JMP_JEQ_X,
		[BPF_JMP|BPF_JGE|BPF_K]  = BPF_S_JMP_JGE_K,
		[BPF_JMP|BPF_JGE|BPF_X]  = BPF_S_JMP_JGE_X,
		[BPF_JMP|BPF_JGT|BPF_K]  = BPF_S_JMP_JGT_K,
		[BPF_JMP|BPF_JGT|BPF_X]  = BPF_S_JMP_JGT_X,
		[BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K,
		[BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X,
	};
	int pc;
	bool anc_found;

	if (flen == 0 || flen > BPF_MAXINSNS)
		return -EINVAL;

	/* check the filter code now */
	for (pc = 0; pc < flen; pc++) {
		struct sock_filter *ftest = &filter[pc];
		u16 code = ftest->code;

		if (code >= ARRAY_SIZE(codes))
			return -EINVAL;
		code = codes[code];
		if (!code)
			return -EINVAL;
		/* Some instructions need special checks */
		switch (code) {
		case BPF_S_ALU_DIV_K:
			/* check for division by zero */
			if (ftest->k == 0)
				return -EINVAL;
			ftest->k = reciprocal_value(ftest->k);
			break;
		case BPF_S_ALU_MOD_K:
			/* check for division by zero */
			if (ftest->k == 0)
				return -EINVAL;
			break;
		case BPF_S_LD_MEM:
		case BPF_S_LDX_MEM:
		case BPF_S_ST:
		case BPF_S_STX:
			/* check for invalid memory addresses */
			if (ftest->k >= BPF_MEMWORDS)
				return -EINVAL;
			break;
		case BPF_S_JMP_JA:
			/*
			 * Note, the large ftest->k might cause loops.
			 * Compare this with conditional jumps below,
			 * where offsets are limited. --ANK (981016)
			 */
			if (ftest->k >= (unsigned int)(flen-pc-1))
				return -EINVAL;
			break;
		case BPF_S_JMP_JEQ_K:
		case BPF_S_JMP_JEQ_X:
		case BPF_S_JMP_JGE_K:
		case BPF_S_JMP_JGE_X:
		case BPF_S_JMP_JGT_K:
		case BPF_S_JMP_JGT_X:
		case BPF_S_JMP_JSET_X:
		case BPF_S_JMP_JSET_K:
			/* for conditionals both must be safe */
			if (pc + ftest->jt + 1 >= flen ||
			    pc + ftest->jf + 1 >= flen)
				return -EINVAL;
			break;
		case BPF_S_LD_W_ABS:
		case BPF_S_LD_H_ABS:
		case BPF_S_LD_B_ABS:
			anc_found = false;
#define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE:	\
				code = BPF_S_ANC_##CODE;	\
				anc_found = true;		\
				break
			switch (ftest->k) {
			ANCILLARY(PROTOCOL);
			ANCILLARY(PKTTYPE);
			ANCILLARY(IFINDEX);
			ANCILLARY(NLATTR);
			ANCILLARY(NLATTR_NEST);
			ANCILLARY(MARK);
			ANCILLARY(QUEUE);
			ANCILLARY(HATYPE);
			ANCILLARY(RXHASH);
			ANCILLARY(CPU);
			ANCILLARY(ALU_XOR_X);
			ANCILLARY(VLAN_TAG);
			ANCILLARY(VLAN_TAG_PRESENT);
			}

			/* ancillary operation unknown or unsupported */
			if (anc_found == false && ftest->k >= SKF_AD_OFF)
				return -EINVAL;
		}
		ftest->code = code;
	}

	/* last instruction must be a RET code */
	switch (filter[flen - 1].code) {
	case BPF_S_RET_K:
	case BPF_S_RET_A:
		return check_load_and_stores(filter, flen);
	}
	return -EINVAL;
}
EXPORT_SYMBOL(sk_chk_filter);
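
/*
 * Illustrative sketch only: a program using the ancillary loads recognised
 * by the ANCILLARY() rewriting above.  The load from SKF_AD_OFF +
 * SKF_AD_MARK is turned into BPF_S_ANC_MARK, so the accumulator receives
 * skb->mark and only packets with mark 1 are accepted.
 */
static const struct sock_filter mark_match_prog[] __maybe_unused = {
	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, SKF_AD_OFF + SKF_AD_MARK),
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 1, 0, 1),	/* skb->mark == 1 ? */
	BPF_STMT(BPF_RET | BPF_K, 0xffff),		/* yes: accept      */
	BPF_STMT(BPF_RET | BPF_K, 0),			/* no:  drop        */
};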

/**
 * 	sk_filter_release_rcu - Release a socket filter by rcu_head
 *	@rcu: rcu_head that contains the sk_filter to free
 */
void sk_filter_release_rcu(struct rcu_head *rcu)
{
	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);

	bpf_jit_free(fp);
	kfree(fp);
}
EXPORT_SYMBOL(sk_filter_release_rcu);

static int __sk_prepare_filter(struct sk_filter *fp)
{
	int err;

	fp->bpf_func = sk_run_filter;

	err = sk_chk_filter(fp->insns, fp->len);
	if (err)
		return err;

	bpf_jit_compile(fp);
	return 0;
}

/**
 *	sk_unattached_filter_create - create an unattached filter
 *	@fprog: the filter program
 *	@pfp: the unattached filter that is created
 *
 * Create a filter independent of any socket. We first run some
 * sanity checks on it to make sure it does not explode on us later.
 * If an error occurs or there is insufficient memory for the filter
 * a negative errno code is returned. On success the return is zero.
 */
int sk_unattached_filter_create(struct sk_filter **pfp,
				struct sock_fprog *fprog)
{
	struct sk_filter *fp;
	unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
	int err;

	/* Make sure new filter is there and in the right amounts. */
	if (fprog->filter == NULL)
		return -EINVAL;

	fp = kmalloc(fsize + sizeof(*fp), GFP_KERNEL);
	if (!fp)
		return -ENOMEM;
	memcpy(fp->insns, fprog->filter, fsize);

	atomic_set(&fp->refcnt, 1);
	fp->len = fprog->len;

	err = __sk_prepare_filter(fp);
	if (err)
		goto free_mem;

	*pfp = fp;
	return 0;
free_mem:
	kfree(fp);
	return err;
}
EXPORT_SYMBOL_GPL(sk_unattached_filter_create);

void sk_unattached_filter_destroy(struct sk_filter *fp)
{
	sk_filter_release(fp);
}
EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy);
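
/*
 * Illustrative sketch only: how a kernel user might create and later release
 * an unattached filter with the two helpers above.  "insns" and "len" stand
 * for any classic BPF program; error handling is abbreviated and sparse's
 * __user annotation on sock_fprog.filter is ignored for brevity.
 */
static inline int my_install_unattached_filter(struct sk_filter **pfp,
					       struct sock_filter *insns,
					       unsigned short len)
{
	struct sock_fprog fprog = {
		.len	= len,
		.filter	= insns,
	};

	return sk_unattached_filter_create(pfp, &fprog);
	/* ... and when finished: sk_unattached_filter_destroy(*pfp); */
}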

/**
 *	sk_attach_filter - attach a socket filter
 *	@fprog: the filter program
 *	@sk: the socket to use
 *
 * Attach the user's filter code. We first run some sanity checks on
 * it to make sure it does not explode on us later. If an error
 * occurs or there is insufficient memory for the filter a negative
 * errno code is returned. On success the return is zero.
 */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
	struct sk_filter *fp, *old_fp;
	unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
	int err;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return -EPERM;

	/* Make sure new filter is there and in the right amounts. */
	if (fprog->filter == NULL)
		return -EINVAL;

	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
	if (!fp)
		return -ENOMEM;
	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
		sock_kfree_s(sk, fp, fsize+sizeof(*fp));
		return -EFAULT;
	}

	atomic_set(&fp->refcnt, 1);
	fp->len = fprog->len;

	err = __sk_prepare_filter(fp);
	if (err) {
		sk_filter_uncharge(sk, fp);
		return err;
	}

	old_fp = rcu_dereference_protected(sk->sk_filter,
					   sock_owned_by_user(sk));
	rcu_assign_pointer(sk->sk_filter, fp);

	if (old_fp)
		sk_filter_uncharge(sk, old_fp);
	return 0;
}
EXPORT_SYMBOL_GPL(sk_attach_filter);
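
/*
 * Illustrative sketch only: sk_attach_filter() is normally reached from
 * sock_setsockopt(SO_ATTACH_FILTER).  The matching user-space side looks
 * roughly like the fragment below (kept in a comment since it is not
 * kernel code):
 *
 *	struct sock_filter prog[] = { ... };
 *	struct sock_fprog fprog = {
 *		.len	= sizeof(prog) / sizeof(prog[0]),
 *		.filter	= prog,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
 *	...
 *	setsockopt(fd, SOL_SOCKET, SO_DETACH_FILTER, NULL, 0);
 */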

int sk_detach_filter(struct sock *sk)
{
	int ret = -ENOENT;
	struct sk_filter *filter;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return -EPERM;

	filter = rcu_dereference_protected(sk->sk_filter,
					   sock_owned_by_user(sk));
	if (filter) {
		RCU_INIT_POINTER(sk->sk_filter, NULL);
		sk_filter_uncharge(sk, filter);
		ret = 0;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(sk_detach_filter);

static void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
{
	static const u16 decodes[] = {
		[BPF_S_ALU_ADD_K]	= BPF_ALU|BPF_ADD|BPF_K,
		[BPF_S_ALU_ADD_X]	= BPF_ALU|BPF_ADD|BPF_X,
		[BPF_S_ALU_SUB_K]	= BPF_ALU|BPF_SUB|BPF_K,
		[BPF_S_ALU_SUB_X]	= BPF_ALU|BPF_SUB|BPF_X,
		[BPF_S_ALU_MUL_K]	= BPF_ALU|BPF_MUL|BPF_K,
		[BPF_S_ALU_MUL_X]	= BPF_ALU|BPF_MUL|BPF_X,
		[BPF_S_ALU_DIV_X]	= BPF_ALU|BPF_DIV|BPF_X,
		[BPF_S_ALU_MOD_K]	= BPF_ALU|BPF_MOD|BPF_K,
		[BPF_S_ALU_MOD_X]	= BPF_ALU|BPF_MOD|BPF_X,
		[BPF_S_ALU_AND_K]	= BPF_ALU|BPF_AND|BPF_K,
		[BPF_S_ALU_AND_X]	= BPF_ALU|BPF_AND|BPF_X,
		[BPF_S_ALU_OR_K]	= BPF_ALU|BPF_OR|BPF_K,
		[BPF_S_ALU_OR_X]	= BPF_ALU|BPF_OR|BPF_X,
		[BPF_S_ALU_XOR_K]	= BPF_ALU|BPF_XOR|BPF_K,
		[BPF_S_ALU_XOR_X]	= BPF_ALU|BPF_XOR|BPF_X,
		[BPF_S_ALU_LSH_K]	= BPF_ALU|BPF_LSH|BPF_K,
		[BPF_S_ALU_LSH_X]	= BPF_ALU|BPF_LSH|BPF_X,
		[BPF_S_ALU_RSH_K]	= BPF_ALU|BPF_RSH|BPF_K,
		[BPF_S_ALU_RSH_X]	= BPF_ALU|BPF_RSH|BPF_X,
		[BPF_S_ALU_NEG]		= BPF_ALU|BPF_NEG,
		[BPF_S_LD_W_ABS]	= BPF_LD|BPF_W|BPF_ABS,
		[BPF_S_LD_H_ABS]	= BPF_LD|BPF_H|BPF_ABS,
		[BPF_S_LD_B_ABS]	= BPF_LD|BPF_B|BPF_ABS,
		[BPF_S_ANC_PROTOCOL]	= BPF_LD|BPF_B|BPF_ABS,
		[BPF_S_ANC_PKTTYPE]	= BPF_LD|BPF_B|BPF_ABS,
		[BPF_S_ANC_IFINDEX]	= BPF_LD|BPF_B|BPF_ABS,
		[BPF_S_ANC_NLATTR]	= BPF_LD|BPF_B|BPF_ABS,
		[BPF_S_ANC_NLATTR_NEST]	= BPF_LD|BPF_B|BPF_ABS,
		[BPF_S_ANC_MARK]	= BPF_LD|BPF_B|BPF_ABS,
		[BPF_S_ANC_QUEUE]	= BPF_LD|BPF_B|BPF_ABS,
		[BPF_S_ANC_HATYPE]	= BPF_LD|BPF_B|BPF_ABS,
		[BPF_S_ANC_RXHASH]	= BPF_LD|BPF_B|BPF_ABS,
		[BPF_S_ANC_CPU]		= BPF_LD|BPF_B|BPF_ABS,
		[BPF_S_ANC_ALU_XOR_X]	= BPF_LD|BPF_B|BPF_ABS,
		[BPF_S_ANC_SECCOMP_LD_W] = BPF_LD|BPF_B|BPF_ABS,
		[BPF_S_ANC_VLAN_TAG]	= BPF_LD|BPF_B|BPF_ABS,
		[BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS,
		[BPF_S_LD_W_LEN]	= BPF_LD|BPF_W|BPF_LEN,
		[BPF_S_LD_W_IND]	= BPF_LD|BPF_W|BPF_IND,
		[BPF_S_LD_H_IND]	= BPF_LD|BPF_H|BPF_IND,
		[BPF_S_LD_B_IND]	= BPF_LD|BPF_B|BPF_IND,
		[BPF_S_LD_IMM]		= BPF_LD|BPF_IMM,
		[BPF_S_LDX_W_LEN]	= BPF_LDX|BPF_W|BPF_LEN,
		[BPF_S_LDX_B_MSH]	= BPF_LDX|BPF_B|BPF_MSH,
		[BPF_S_LDX_IMM]		= BPF_LDX|BPF_IMM,
		[BPF_S_MISC_TAX]	= BPF_MISC|BPF_TAX,
		[BPF_S_MISC_TXA]	= BPF_MISC|BPF_TXA,
		[BPF_S_RET_K]		= BPF_RET|BPF_K,
		[BPF_S_RET_A]		= BPF_RET|BPF_A,
		[BPF_S_ALU_DIV_K]	= BPF_ALU|BPF_DIV|BPF_K,
		[BPF_S_LD_MEM]		= BPF_LD|BPF_MEM,
		[BPF_S_LDX_MEM]		= BPF_LDX|BPF_MEM,
		[BPF_S_ST]		= BPF_ST,
		[BPF_S_STX]		= BPF_STX,
		[BPF_S_JMP_JA]		= BPF_JMP|BPF_JA,
		[BPF_S_JMP_JEQ_K]	= BPF_JMP|BPF_JEQ|BPF_K,
		[BPF_S_JMP_JEQ_X]	= BPF_JMP|BPF_JEQ|BPF_X,
		[BPF_S_JMP_JGE_K]	= BPF_JMP|BPF_JGE|BPF_K,
		[BPF_S_JMP_JGE_X]	= BPF_JMP|BPF_JGE|BPF_X,
		[BPF_S_JMP_JGT_K]	= BPF_JMP|BPF_JGT|BPF_K,
		[BPF_S_JMP_JGT_X]	= BPF_JMP|BPF_JGT|BPF_X,
		[BPF_S_JMP_JSET_K]	= BPF_JMP|BPF_JSET|BPF_K,
		[BPF_S_JMP_JSET_X]	= BPF_JMP|BPF_JSET|BPF_X,
	};
	u16 code;

	code = filt->code;

	to->code = decodes[code];
	to->jt = filt->jt;
	to->jf = filt->jf;

	if (code == BPF_S_ALU_DIV_K) {
		/*
		 * When this rule was loaded, the user gave us X, which was
		 * translated into R = r(X). Now we calculate RR = r(R)
		 * and report it back. If this value is later loaded
		 * again and RRR = r(RR) is calculated, then R == RRR
		 * will hold.
		 *
		 * One exception. X == 1 translates into R == 0 and
		 * we can't calculate RR out of it with r().
		 */

		if (filt->k == 0)
			to->k = 1;
		else
			to->k = reciprocal_value(filt->k);

		BUG_ON(reciprocal_value(to->k) != filt->k);
	} else
		to->k = filt->k;
}

int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len)
{
	struct sk_filter *filter;
	int i, ret;

	lock_sock(sk);
	filter = rcu_dereference_protected(sk->sk_filter,
			sock_owned_by_user(sk));
	ret = 0;
	if (!filter)
		goto out;
	ret = filter->len;
	if (!len)
		goto out;
	ret = -EINVAL;
	if (len < filter->len)
		goto out;

	ret = -EFAULT;
	for (i = 0; i < filter->len; i++) {
		struct sock_filter fb;

		sk_decode_filter(&filter->insns[i], &fb);
		if (copy_to_user(&ubuf[i], &fb, sizeof(fb)))
			goto out;
	}

	ret = filter->len;
out:
	release_sock(sk);
	return ret;
}