/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/vmalloc.h>
#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/idr.h>
#include <linux/cred.h>
#include <linux/timekeeping.h>
#include <linux/ctype.h>

#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
			   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
			   (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
			   (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))

#define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)

DEFINE_PER_CPU(int, bpf_prog_active);
static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
static DEFINE_IDR(map_idr);
static DEFINE_SPINLOCK(map_idr_lock);

int sysctl_unprivileged_bpf_disabled __read_mostly;

static const struct bpf_map_ops * const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _ops)
#define BPF_MAP_TYPE(_id, _ops) \
	[_id] = &_ops,
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
};

/*
 * If we're handed a bigger struct than we know of, ensure all the unknown bits
 * are 0 - i.e. new user-space does not rely on any kernel feature extensions
 * we don't know about yet.
 *
 * There is a ToCToU between this function call and the following
 * copy_from_user() call. However, this is not a concern since this function is
 * meant to be a future-proofing of bits.
 */
static int check_uarg_tail_zero(void __user *uaddr,
				size_t expected_size,
				size_t actual_size)
{
	unsigned char __user *addr;
	unsigned char __user *end;
	unsigned char val;
	int err;

	if (unlikely(actual_size > PAGE_SIZE))	/* silly large */
		return -E2BIG;

	if (unlikely(!access_ok(VERIFY_READ, uaddr, actual_size)))
		return -EFAULT;

	if (actual_size <= expected_size)
		return 0;

	addr = uaddr + expected_size;
	end  = uaddr + actual_size;

	for (; addr < end; addr++) {
		err = get_user(val, addr);
		if (err)
			return err;
		if (val)
			return -E2BIG;
	}

	return 0;
}

static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
	struct bpf_map *map;

	if (attr->map_type >= ARRAY_SIZE(bpf_map_types) ||
	    !bpf_map_types[attr->map_type])
		return ERR_PTR(-EINVAL);

	map = bpf_map_types[attr->map_type]->map_alloc(attr);
	if (IS_ERR(map))
		return map;
	map->ops = bpf_map_types[attr->map_type];
	map->map_type = attr->map_type;
	return map;
}

void *bpf_map_area_alloc(size_t size, int numa_node)
{
	/* We definitely need __GFP_NORETRY, so OOM killer doesn't
	 * trigger under memory pressure as we really just want to
	 * fail instead.
	 */
	const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
	void *area;

	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
		area = kmalloc_node(size, GFP_USER | flags, numa_node);
		if (area != NULL)
			return area;
	}

	return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags,
					   __builtin_return_address(0));
}
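
/* A sizing note for the fast path above: PAGE_ALLOC_COSTLY_ORDER is 3,
 * so with 4 KiB pages kmalloc_node() is only attempted for requests up
 * to PAGE_SIZE << 3 = 32 KiB; larger map areas go straight to the
 * vmalloc-based fallback.
 */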

void bpf_map_area_free(void *area)
{
	kvfree(area);
}

int bpf_map_precharge_memlock(u32 pages)
{
	struct user_struct *user = get_current_user();
	unsigned long memlock_limit, cur;

	memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	cur = atomic_long_read(&user->locked_vm);
	free_uid(user);
	if (cur + pages > memlock_limit)
		return -EPERM;
	return 0;
}
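
/* Worked example of the check above: with RLIMIT_MEMLOCK at its common
 * 64 KiB default and 4 KiB pages, memlock_limit is 65536 >> 12 = 16
 * pages, so a map needing more pages than 16 minus the user's current
 * locked_vm is refused with -EPERM until the rlimit is raised
 * (e.g. via setrlimit() or ulimit -l).
 */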

static int bpf_map_charge_memlock(struct bpf_map *map)
{
	struct user_struct *user = get_current_user();
	unsigned long memlock_limit;

	memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	atomic_long_add(map->pages, &user->locked_vm);

	if (atomic_long_read(&user->locked_vm) > memlock_limit) {
		atomic_long_sub(map->pages, &user->locked_vm);
		free_uid(user);
		return -EPERM;
	}
	map->user = user;
	return 0;
}

static void bpf_map_uncharge_memlock(struct bpf_map *map)
{
	struct user_struct *user = map->user;

	atomic_long_sub(map->pages, &user->locked_vm);
	free_uid(user);
}

static int bpf_map_alloc_id(struct bpf_map *map)
{
	int id;

	spin_lock_bh(&map_idr_lock);
	id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
	if (id > 0)
		map->id = id;
	spin_unlock_bh(&map_idr_lock);

	if (WARN_ON_ONCE(!id))
		return -ENOSPC;

	return id > 0 ? 0 : id;
}
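
/* idr_alloc_cyclic() scans upward from the last allocated ID and wraps
 * around within [1, INT_MAX), so a just-freed map ID is not immediately
 * reused by the next BPF_MAP_CREATE.
 */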

static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
{
	unsigned long flags;

	if (do_idr_lock)
		spin_lock_irqsave(&map_idr_lock, flags);
	else
		__acquire(&map_idr_lock);

	idr_remove(&map_idr, map->id);

	if (do_idr_lock)
		spin_unlock_irqrestore(&map_idr_lock, flags);
	else
		__release(&map_idr_lock);
}

/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
	struct bpf_map *map = container_of(work, struct bpf_map, work);

	bpf_map_uncharge_memlock(map);
	/* implementation dependent freeing */
	map->ops->map_free(map);
}

static void bpf_map_put_uref(struct bpf_map *map)
{
	if (atomic_dec_and_test(&map->usercnt)) {
		if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
			bpf_fd_array_map_clear(map);
	}
}

/* decrement map refcnt and schedule it for freeing via workqueue
 * (underlying map implementation ops->map_free() might sleep)
 */
static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
{
	if (atomic_dec_and_test(&map->refcnt)) {
		/* bpf_map_free_id() must be called first */
		bpf_map_free_id(map, do_idr_lock);
		INIT_WORK(&map->work, bpf_map_free_deferred);
		schedule_work(&map->work);
	}
}

void bpf_map_put(struct bpf_map *map)
{
	__bpf_map_put(map, true);
}

void bpf_map_put_with_uref(struct bpf_map *map)
{
	bpf_map_put_uref(map);
	bpf_map_put(map);
}

static int bpf_map_release(struct inode *inode, struct file *filp)
{
	struct bpf_map *map = filp->private_data;

	if (map->ops->map_release)
		map->ops->map_release(map, filp);

	bpf_map_put_with_uref(map);
	return 0;
}

#ifdef CONFIG_PROC_FS
static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
	const struct bpf_map *map = filp->private_data;
	const struct bpf_array *array;
	u32 owner_prog_type = 0;
	u32 owner_jited = 0;

	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
		array = container_of(map, struct bpf_array, map);
		owner_prog_type = array->owner_prog_type;
		owner_jited = array->owner_jited;
	}

	seq_printf(m,
		   "map_type:\t%u\n"
		   "key_size:\t%u\n"
		   "value_size:\t%u\n"
		   "max_entries:\t%u\n"
		   "map_flags:\t%#x\n"
		   "memlock:\t%llu\n",
		   map->map_type,
		   map->key_size,
		   map->value_size,
		   map->max_entries,
		   map->map_flags,
		   map->pages * 1ULL << PAGE_SHIFT);

	if (owner_prog_type) {
		seq_printf(m, "owner_prog_type:\t%u\n",
			   owner_prog_type);
		seq_printf(m, "owner_jited:\t%u\n",
			   owner_jited);
	}
}
#endif

static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
			      loff_t *ppos)
{
	/* We need this handler such that alloc_file() enables
	 * f_mode with FMODE_CAN_READ.
	 */
	return -EINVAL;
}

static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
			       size_t siz, loff_t *ppos)
{
	/* We need this handler such that alloc_file() enables
	 * f_mode with FMODE_CAN_WRITE.
	 */
	return -EINVAL;
}

static const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= bpf_map_show_fdinfo,
#endif
	.release	= bpf_map_release,
	.read		= bpf_dummy_read,
	.write		= bpf_dummy_write,
};

int bpf_map_new_fd(struct bpf_map *map, int flags)
{
	return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
				flags | O_CLOEXEC);
}

int bpf_get_file_flag(int flags)
{
	if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
		return -EINVAL;
	if (flags & BPF_F_RDONLY)
		return O_RDONLY;
	if (flags & BPF_F_WRONLY)
		return O_WRONLY;
	return O_RDWR;
}

/* helper macro to check that unused fields 'union bpf_attr' are zero */
#define CHECK_ATTR(CMD) \
	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		   sizeof(attr->CMD##_LAST_FIELD), 0, \
		   sizeof(*attr) - \
		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
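
/* For example, CHECK_ATTR(BPF_MAP_CREATE) with BPF_MAP_CREATE_LAST_FIELD
 * defined as map_name (below) evaluates to true iff some byte between
 * the end of attr->map_name and the end of 'union bpf_attr' is non-zero,
 * i.e. user space set fields this command does not know about.
 */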

/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes.
 * Return 0 on success and < 0 on error.
 */
static int bpf_obj_name_cpy(char *dst, const char *src)
{
	const char *end = src + BPF_OBJ_NAME_LEN;

	memset(dst, 0, BPF_OBJ_NAME_LEN);

	/* Copy all isalnum() and '_' chars */
	while (src < end && *src) {
		if (!isalnum(*src) && *src != '_')
			return -EINVAL;
		*dst++ = *src++;
	}

	/* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */
	if (src == end)
		return -EINVAL;

	return 0;
}
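
/* Illustrative examples: "tx_filter2" copies cleanly; "tx filter" fails
 * the isalnum()/'_' test with -EINVAL; and a name filling all
 * BPF_OBJ_NAME_LEN bytes without a terminating '\0' is caught by the
 * src == end check above.
 */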

#define BPF_MAP_CREATE_LAST_FIELD map_name
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
	int numa_node = bpf_map_attr_numa_node(attr);
	struct bpf_map *map;
	int f_flags;
	int err;

	err = CHECK_ATTR(BPF_MAP_CREATE);
	if (err)
		return -EINVAL;

	f_flags = bpf_get_file_flag(attr->map_flags);
	if (f_flags < 0)
		return f_flags;

	if (numa_node != NUMA_NO_NODE &&
	    ((unsigned int)numa_node >= nr_node_ids ||
	     !node_online(numa_node)))
		return -EINVAL;

	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
	map = find_and_alloc_map(attr);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = bpf_obj_name_cpy(map->name, attr->map_name);
	if (err)
		goto free_map_nouncharge;

	atomic_set(&map->refcnt, 1);
	atomic_set(&map->usercnt, 1);

	err = bpf_map_charge_memlock(map);
	if (err)
		goto free_map_nouncharge;

	err = bpf_map_alloc_id(map);
	if (err)
		goto free_map;

	err = bpf_map_new_fd(map, f_flags);
	if (err < 0) {
		/* failed to allocate fd.
		 * bpf_map_put() is needed because the above
		 * bpf_map_alloc_id() has published the map
		 * to the userspace and the userspace may
		 * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
		 */
		bpf_map_put(map);
		return err;
	}

	trace_bpf_map_create(map, err);
	return err;

free_map:
	bpf_map_uncharge_memlock(map);
free_map_nouncharge:
	map->ops->map_free(map);
	return err;
}
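
/* Userspace view of this command (illustrative sketch, not part of the
 * kernel build):
 *
 *	union bpf_attr attr = {
 *		.map_type    = BPF_MAP_TYPE_HASH,
 *		.key_size    = sizeof(__u32),
 *		.value_size  = sizeof(__u64),
 *		.max_entries = 1024,
 *	};
 *	int fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 *
 * Any non-zero byte past the fields a given kernel knows about makes
 * CHECK_ATTR() reject the call with -EINVAL.
 */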

/* if error is returned, fd is released.
 * On success caller should complete fd access with matching fdput()
 */
struct bpf_map *__bpf_map_get(struct fd f)
{
	if (!f.file)
		return ERR_PTR(-EBADF);
	if (f.file->f_op != &bpf_map_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	return f.file->private_data;
}

/* prog's and map's refcnt limit */
#define BPF_MAX_REFCNT 32768

struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref)
{
	if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) {
		atomic_dec(&map->refcnt);
		return ERR_PTR(-EBUSY);
	}
	if (uref)
		atomic_inc(&map->usercnt);
	return map;
}

struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
	struct fd f = fdget(ufd);
	struct bpf_map *map;

	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return map;

	map = bpf_map_inc(map, true);
	fdput(f);

	return map;
}

/* map_idr_lock should have been held */
static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
					    bool uref)
{
	int refold;

	refold = __atomic_add_unless(&map->refcnt, 1, 0);

	if (refold >= BPF_MAX_REFCNT) {
		__bpf_map_put(map, false);
		return ERR_PTR(-EBUSY);
	}

	if (!refold)
		return ERR_PTR(-ENOENT);

	if (uref)
		atomic_inc(&map->usercnt);

	return map;
}

int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
	return -ENOTSUPP;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value

static int map_lookup_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_user_ptr(attr->key);
	void __user *uvalue = u64_to_user_ptr(attr->value);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *value, *ptr;
	u32 value_size;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
		return -EINVAL;

	f = fdget(ufd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	if (!(f.file->f_mode & FMODE_CAN_READ)) {
		err = -EPERM;
		goto err_put;
	}

	key = memdup_user(ukey, map->key_size);
	if (IS_ERR(key)) {
		err = PTR_ERR(key);
		goto err_put;
	}

	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		value_size = round_up(map->value_size, 8) * num_possible_cpus();
	else if (IS_FD_MAP(map))
		value_size = sizeof(u32);
	else
		value_size = map->value_size;

	err = -ENOMEM;
	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
	if (!value)
		goto free_key;

	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
		err = bpf_percpu_hash_copy(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
		err = bpf_percpu_array_copy(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
		err = bpf_stackmap_copy(map, key, value);
	} else if (IS_FD_ARRAY(map)) {
		err = bpf_fd_array_map_lookup_elem(map, key, value);
	} else if (IS_FD_HASH(map)) {
		err = bpf_fd_htab_map_lookup_elem(map, key, value);
	} else {
		rcu_read_lock();
		ptr = map->ops->map_lookup_elem(map, key);
		if (ptr)
			memcpy(value, ptr, value_size);
		rcu_read_unlock();
		err = ptr ? 0 : -ENOENT;
	}

	if (err)
		goto free_value;

	err = -EFAULT;
	if (copy_to_user(uvalue, value, value_size) != 0)
		goto free_value;

	trace_bpf_map_lookup_elem(map, ufd, key, value);
	err = 0;

free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags

static int map_update_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_user_ptr(attr->key);
	void __user *uvalue = u64_to_user_ptr(attr->value);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *value;
	u32 value_size;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
		return -EINVAL;

	f = fdget(ufd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
		err = -EPERM;
		goto err_put;
	}

	key = memdup_user(ukey, map->key_size);
	if (IS_ERR(key)) {
		err = PTR_ERR(key);
		goto err_put;
	}

	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		value_size = round_up(map->value_size, 8) * num_possible_cpus();
	else
		value_size = map->value_size;

	err = -ENOMEM;
	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
	if (!value)
		goto free_key;

	err = -EFAULT;
	if (copy_from_user(value, uvalue, value_size) != 0)
		goto free_value;

	/* Need to create a kthread, thus must support schedule */
	if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
		err = map->ops->map_update_elem(map, key, value, attr->flags);
		goto out;
	}

	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
	 * inside bpf map update or delete otherwise deadlocks are possible
	 */
	preempt_disable();
	__this_cpu_inc(bpf_prog_active);
	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
		err = bpf_percpu_hash_update(map, key, value, attr->flags);
	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
		err = bpf_percpu_array_update(map, key, value, attr->flags);
	} else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
		   map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
		   map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY ||
		   map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) {
		rcu_read_lock();
		err = bpf_fd_array_map_update_elem(map, f.file, key, value,
						   attr->flags);
		rcu_read_unlock();
	} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
		rcu_read_lock();
		err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
						  attr->flags);
		rcu_read_unlock();
	} else {
		rcu_read_lock();
		err = map->ops->map_update_elem(map, key, value, attr->flags);
		rcu_read_unlock();
	}
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();
out:
	if (!err)
		trace_bpf_map_update_elem(map, ufd, key, value);
free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

#define BPF_MAP_DELETE_ELEM_LAST_FIELD key

static int map_delete_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_user_ptr(attr->key);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	struct fd f;
	void *key;
	int err;

	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
		return -EINVAL;

	f = fdget(ufd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
		err = -EPERM;
		goto err_put;
	}

	key = memdup_user(ukey, map->key_size);
	if (IS_ERR(key)) {
		err = PTR_ERR(key);
		goto err_put;
	}

	preempt_disable();
	__this_cpu_inc(bpf_prog_active);
	rcu_read_lock();
	err = map->ops->map_delete_elem(map, key);
	rcu_read_unlock();
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();

	if (!err)
		trace_bpf_map_delete_elem(map, ufd, key);
	kfree(key);
err_put:
	fdput(f);
	return err;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key

static int map_get_next_key(union bpf_attr *attr)
{
	void __user *ukey = u64_to_user_ptr(attr->key);
	void __user *unext_key = u64_to_user_ptr(attr->next_key);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *next_key;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
		return -EINVAL;

	f = fdget(ufd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	if (!(f.file->f_mode & FMODE_CAN_READ)) {
		err = -EPERM;
		goto err_put;
	}

	if (ukey) {
		key = memdup_user(ukey, map->key_size);
		if (IS_ERR(key)) {
			err = PTR_ERR(key);
			goto err_put;
		}
	} else {
		key = NULL;
	}

	err = -ENOMEM;
	next_key = kmalloc(map->key_size, GFP_USER);
	if (!next_key)
		goto free_key;

	rcu_read_lock();
	err = map->ops->map_get_next_key(map, key, next_key);
	rcu_read_unlock();
	if (err)
		goto free_next_key;

	err = -EFAULT;
	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
		goto free_next_key;

	trace_bpf_map_next_key(map, ufd, key, next_key);
	err = 0;

free_next_key:
	kfree(next_key);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}
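
/* Typical userspace iteration built on this command (illustrative
 * sketch): pass attr.key = 0 (NULL) to fetch the first key, then feed
 * each result back until the syscall fails with -ENOENT:
 *
 *	attr.map_fd = fd;
 *	attr.key = 0;
 *	attr.next_key = (__u64)(unsigned long)&next;
 *	while (!syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr))) {
 *		use(next);
 *		cur = next;
 *		attr.key = (__u64)(unsigned long)&cur;
 *	}
 */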

static const struct bpf_prog_ops * const bpf_prog_types[] = {
#define BPF_PROG_TYPE(_id, _name) \
	[_id] = & _name ## _prog_ops,
#define BPF_MAP_TYPE(_id, _ops)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
};

static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
	if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
		return -EINVAL;

	prog->aux->ops = bpf_prog_types[type];
	prog->type = type;
	return 0;
}

/* drop refcnt on maps used by eBPF program and free auxiliary data */
static void free_used_maps(struct bpf_prog_aux *aux)
{
	int i;

	for (i = 0; i < aux->used_map_cnt; i++)
		bpf_map_put(aux->used_maps[i]);

	kfree(aux->used_maps);
}

int __bpf_prog_charge(struct user_struct *user, u32 pages)
{
	unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	unsigned long user_bufs;

	if (user) {
		user_bufs = atomic_long_add_return(pages, &user->locked_vm);
		if (user_bufs > memlock_limit) {
			atomic_long_sub(pages, &user->locked_vm);
			return -EPERM;
		}
	}

	return 0;
}

void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
{
	if (user)
		atomic_long_sub(pages, &user->locked_vm);
}

static int bpf_prog_charge_memlock(struct bpf_prog *prog)
{
	struct user_struct *user = get_current_user();
	int ret;

	ret = __bpf_prog_charge(user, prog->pages);
	if (ret) {
		free_uid(user);
		return ret;
	}

	prog->aux->user = user;
	return 0;
}

static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
{
	struct user_struct *user = prog->aux->user;

	__bpf_prog_uncharge(user, prog->pages);
	free_uid(user);
}

static int bpf_prog_alloc_id(struct bpf_prog *prog)
{
	int id;

	spin_lock_bh(&prog_idr_lock);
	id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
	if (id > 0)
		prog->aux->id = id;
	spin_unlock_bh(&prog_idr_lock);

	/* id is in [1, INT_MAX) */
	if (WARN_ON_ONCE(!id))
		return -ENOSPC;

	return id > 0 ? 0 : id;
}

static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
{
	/* cBPF to eBPF migrations are currently not in the idr store. */
	if (!prog->aux->id)
		return;

	if (do_idr_lock)
		spin_lock_bh(&prog_idr_lock);
	else
		__acquire(&prog_idr_lock);

	idr_remove(&prog_idr, prog->aux->id);

	if (do_idr_lock)
		spin_unlock_bh(&prog_idr_lock);
	else
		__release(&prog_idr_lock);
}

static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);

	free_used_maps(aux);
	bpf_prog_uncharge_memlock(aux->prog);
	bpf_prog_free(aux->prog);
}

static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
	if (atomic_dec_and_test(&prog->aux->refcnt)) {
		trace_bpf_prog_put_rcu(prog);
		/* bpf_prog_free_id() must be called first */
		bpf_prog_free_id(prog, do_idr_lock);
		bpf_prog_kallsyms_del(prog);
		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
	}
}

void bpf_prog_put(struct bpf_prog *prog)
{
	__bpf_prog_put(prog, true);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);

static int bpf_prog_release(struct inode *inode, struct file *filp)
{
	struct bpf_prog *prog = filp->private_data;

	bpf_prog_put(prog);
	return 0;
}

#ifdef CONFIG_PROC_FS
static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
	const struct bpf_prog *prog = filp->private_data;
	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };

	bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
	seq_printf(m,
		   "prog_type:\t%u\n"
		   "prog_jited:\t%u\n"
		   "prog_tag:\t%s\n"
		   "memlock:\t%llu\n",
		   prog->type,
		   prog->jited,
		   prog_tag,
		   prog->pages * 1ULL << PAGE_SHIFT);
}
#endif

static const struct file_operations bpf_prog_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= bpf_prog_show_fdinfo,
#endif
	.release	= bpf_prog_release,
	.read		= bpf_dummy_read,
	.write		= bpf_dummy_write,
};

int bpf_prog_new_fd(struct bpf_prog *prog)
{
	return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
				O_RDWR | O_CLOEXEC);
}

static struct bpf_prog *____bpf_prog_get(struct fd f)
{
	if (!f.file)
		return ERR_PTR(-EBADF);
	if (f.file->f_op != &bpf_prog_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	return f.file->private_data;
}

struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
{
	if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
		atomic_sub(i, &prog->aux->refcnt);
		return ERR_PTR(-EBUSY);
	}
	return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_add);

void bpf_prog_sub(struct bpf_prog *prog, int i)
{
	/* Only to be used for undoing previous bpf_prog_add() in some
	 * error path. We still know that another entity in our call
	 * path holds a reference to the program, thus atomic_sub() can
	 * be safely used in such cases!
	 */
	WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0);
}
EXPORT_SYMBOL_GPL(bpf_prog_sub);

struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
{
	return bpf_prog_add(prog, 1);
}
EXPORT_SYMBOL_GPL(bpf_prog_inc);

/* prog_idr_lock should have been held */
struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
{
	int refold;

	refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0);

	if (refold >= BPF_MAX_REFCNT) {
		__bpf_prog_put(prog, false);
		return ERR_PTR(-EBUSY);
	}

	if (!refold)
		return ERR_PTR(-ENOENT);

	return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);

static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
{
	struct fd f = fdget(ufd);
	struct bpf_prog *prog;

	prog = ____bpf_prog_get(f);
	if (IS_ERR(prog))
		return prog;
	if (type && prog->type != *type) {
		prog = ERR_PTR(-EINVAL);
		goto out;
	}

	prog = bpf_prog_inc(prog);
out:
	fdput(f);
	return prog;
}

struct bpf_prog *bpf_prog_get(u32 ufd)
{
	return __bpf_prog_get(ufd, NULL);
}

struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
{
	struct bpf_prog *prog = __bpf_prog_get(ufd, &type);

	if (!IS_ERR(prog))
		trace_bpf_prog_get_type(prog);
	return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_get_type);

/* last field in 'union bpf_attr' used by this command */
#define	BPF_PROG_LOAD_LAST_FIELD prog_name

static int bpf_prog_load(union bpf_attr *attr)
{
	enum bpf_prog_type type = attr->prog_type;
	struct bpf_prog *prog;
	int err;
	char license[128];
	bool is_gpl;

	if (CHECK_ATTR(BPF_PROG_LOAD))
		return -EINVAL;

	if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
		return -EINVAL;

	/* copy eBPF program license from user space */
	if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
			      sizeof(license) - 1) < 0)
		return -EFAULT;
	license[sizeof(license) - 1] = 0;

	/* eBPF programs must be GPL compatible to use GPL-ed functions */
	is_gpl = license_is_gpl_compatible(license);

	if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS)
		return -E2BIG;

	if (type == BPF_PROG_TYPE_KPROBE &&
	    attr->kern_version != LINUX_VERSION_CODE)
		return -EINVAL;

	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
	    type != BPF_PROG_TYPE_CGROUP_SKB &&
	    !capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* plain bpf_prog allocation */
	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
	if (!prog)
		return -ENOMEM;

	err = bpf_prog_charge_memlock(prog);
	if (err)
		goto free_prog_nouncharge;

	prog->len = attr->insn_cnt;

	err = -EFAULT;
	if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
			   bpf_prog_insn_size(prog)) != 0)
		goto free_prog;

	prog->orig_prog = NULL;
	prog->jited = 0;

	atomic_set(&prog->aux->refcnt, 1);
	prog->gpl_compatible = is_gpl ? 1 : 0;

	/* find program type: socket_filter vs tracing_filter */
	err = find_prog_type(type, prog);
	if (err < 0)
		goto free_prog;

	prog->aux->load_time = ktime_get_boot_ns();
	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
	if (err)
		goto free_prog;

	/* run eBPF verifier */
	err = bpf_check(&prog, attr);
	if (err < 0)
		goto free_used_maps;

	/* eBPF program is ready to be JITed */
	prog = bpf_prog_select_runtime(prog, &err);
	if (err < 0)
		goto free_used_maps;

	err = bpf_prog_alloc_id(prog);
	if (err)
		goto free_used_maps;

	err = bpf_prog_new_fd(prog);
	if (err < 0) {
		/* failed to allocate fd.
		 * bpf_prog_put() is needed because the above
		 * bpf_prog_alloc_id() has published the prog
		 * to the userspace and the userspace may
		 * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID.
		 */
		bpf_prog_put(prog);
		return err;
	}

	bpf_prog_kallsyms_add(prog);
	trace_bpf_prog_load(prog, err);
	return err;

free_used_maps:
	free_used_maps(prog->aux);
free_prog:
	bpf_prog_uncharge_memlock(prog);
free_prog_nouncharge:
	bpf_prog_free(prog);
	return err;
}
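
/* Userspace view of BPF_PROG_LOAD (illustrative sketch, not part of the
 * kernel build): the smallest valid program simply returns 0:
 *
 *	struct bpf_insn insns[] = {
 *		BPF_MOV64_IMM(BPF_REG_0, 0),
 *		BPF_EXIT_INSN(),
 *	};
 *	union bpf_attr attr = {
 *		.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
 *		.insns     = (__u64)(unsigned long)insns,
 *		.insn_cnt  = 2,
 *		.license   = (__u64)(unsigned long)"GPL",
 *	};
 *	int fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
 *
 * bpf_check() above must accept the instructions before an fd is
 * returned.
 */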

#define BPF_OBJ_LAST_FIELD file_flags

static int bpf_obj_pin(const union bpf_attr *attr)
{
	if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
		return -EINVAL;

	return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
}

static int bpf_obj_get(const union bpf_attr *attr)
{
	if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
	    attr->file_flags & ~BPF_OBJ_FLAG_MASK)
		return -EINVAL;

	return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
				attr->file_flags);
}

#ifdef CONFIG_CGROUP_BPF

#define BPF_PROG_ATTACH_LAST_FIELD attach_flags

static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
{
	struct bpf_prog *prog = NULL;
	int ufd = attr->target_fd;
	struct bpf_map *map;
	struct fd f;
	int err;

	f = fdget(ufd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	if (attach) {
		prog = bpf_prog_get_type(attr->attach_bpf_fd,
					 BPF_PROG_TYPE_SK_SKB);
		if (IS_ERR(prog)) {
			fdput(f);
			return PTR_ERR(prog);
		}
	}

	err = sock_map_prog(map, prog, attr->attach_type);
	if (err) {
		fdput(f);
		if (prog)
			bpf_prog_put(prog);
		return err;
	}

	fdput(f);
	return 0;
}

#define BPF_F_ATTACH_MASK \
	(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)

static int bpf_prog_attach(const union bpf_attr *attr)
{
	enum bpf_prog_type ptype;
	struct bpf_prog *prog;
	struct cgroup *cgrp;
	int ret;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	if (CHECK_ATTR(BPF_PROG_ATTACH))
		return -EINVAL;

	if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
		return -EINVAL;

	switch (attr->attach_type) {
	case BPF_CGROUP_INET_INGRESS:
	case BPF_CGROUP_INET_EGRESS:
		ptype = BPF_PROG_TYPE_CGROUP_SKB;
		break;
	case BPF_CGROUP_INET_SOCK_CREATE:
		ptype = BPF_PROG_TYPE_CGROUP_SOCK;
		break;
	case BPF_CGROUP_SOCK_OPS:
		ptype = BPF_PROG_TYPE_SOCK_OPS;
		break;
	case BPF_SK_SKB_STREAM_PARSER:
	case BPF_SK_SKB_STREAM_VERDICT:
		return sockmap_get_from_fd(attr, true);
	default:
		return -EINVAL;
	}

	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	cgrp = cgroup_get_from_fd(attr->target_fd);
	if (IS_ERR(cgrp)) {
		bpf_prog_put(prog);
		return PTR_ERR(cgrp);
	}

	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
				attr->attach_flags);
	if (ret)
		bpf_prog_put(prog);
	cgroup_put(cgrp);

	return ret;
}

#define BPF_PROG_DETACH_LAST_FIELD attach_type

static int bpf_prog_detach(const union bpf_attr *attr)
{
	enum bpf_prog_type ptype;
	struct bpf_prog *prog;
	struct cgroup *cgrp;
	int ret;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	if (CHECK_ATTR(BPF_PROG_DETACH))
		return -EINVAL;

	switch (attr->attach_type) {
	case BPF_CGROUP_INET_INGRESS:
	case BPF_CGROUP_INET_EGRESS:
		ptype = BPF_PROG_TYPE_CGROUP_SKB;
		break;
	case BPF_CGROUP_INET_SOCK_CREATE:
		ptype = BPF_PROG_TYPE_CGROUP_SOCK;
		break;
	case BPF_CGROUP_SOCK_OPS:
		ptype = BPF_PROG_TYPE_SOCK_OPS;
		break;
	case BPF_SK_SKB_STREAM_PARSER:
	case BPF_SK_SKB_STREAM_VERDICT:
		return sockmap_get_from_fd(attr, false);
	default:
		return -EINVAL;
	}

	cgrp = cgroup_get_from_fd(attr->target_fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
	if (IS_ERR(prog))
		prog = NULL;

	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
	if (prog)
		bpf_prog_put(prog);
	cgroup_put(cgrp);
	return ret;
}

#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt

static int bpf_prog_query(const union bpf_attr *attr,
			  union bpf_attr __user *uattr)
{
	struct cgroup *cgrp;
	int ret;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;
	if (CHECK_ATTR(BPF_PROG_QUERY))
		return -EINVAL;
	if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
		return -EINVAL;

	switch (attr->query.attach_type) {
	case BPF_CGROUP_INET_INGRESS:
	case BPF_CGROUP_INET_EGRESS:
	case BPF_CGROUP_INET_SOCK_CREATE:
	case BPF_CGROUP_SOCK_OPS:
		break;
	default:
		return -EINVAL;
	}
	cgrp = cgroup_get_from_fd(attr->query.target_fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);
	ret = cgroup_bpf_query(cgrp, attr, uattr);
	cgroup_put(cgrp);
	return ret;
}
#endif /* CONFIG_CGROUP_BPF */

#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration

static int bpf_prog_test_run(const union bpf_attr *attr,
			     union bpf_attr __user *uattr)
{
	struct bpf_prog *prog;
	int ret = -ENOTSUPP;

	if (CHECK_ATTR(BPF_PROG_TEST_RUN))
		return -EINVAL;

	prog = bpf_prog_get(attr->test.prog_fd);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	if (prog->aux->ops->test_run)
		ret = prog->aux->ops->test_run(prog, attr, uattr);

	bpf_prog_put(prog);
	return ret;
}

#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id

static int bpf_obj_get_next_id(const union bpf_attr *attr,
			       union bpf_attr __user *uattr,
			       struct idr *idr,
			       spinlock_t *lock)
{
	u32 next_id = attr->start_id;
	int err = 0;

	if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	next_id++;
	spin_lock_bh(lock);
	if (!idr_get_next(idr, &next_id))
		err = -ENOENT;
	spin_unlock_bh(lock);

	if (!err)
		err = put_user(next_id, &uattr->next_id);

	return err;
}
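
/* Illustrative use: paired with the *_GET_FD_BY_ID commands below, this
 * lets a CAP_SYS_ADMIN tool walk all loaded objects:
 *
 *	attr.start_id = 0;
 *	while (!syscall(__NR_bpf, BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr))) {
 *		open_by_id(attr.next_id);
 *		attr.start_id = attr.next_id;
 *	}
 *
 * The -ENOENT return here terminates the walk.
 */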

#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id

static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
{
	struct bpf_prog *prog;
	u32 id = attr->prog_id;
	int fd;

	if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	spin_lock_bh(&prog_idr_lock);
	prog = idr_find(&prog_idr, id);
	if (prog)
		prog = bpf_prog_inc_not_zero(prog);
	else
		prog = ERR_PTR(-ENOENT);
	spin_unlock_bh(&prog_idr_lock);

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	fd = bpf_prog_new_fd(prog);
	if (fd < 0)
		bpf_prog_put(prog);

	return fd;
}

#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags

static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
{
	struct bpf_map *map;
	u32 id = attr->map_id;
	int f_flags;
	int fd;

	if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
	    attr->open_flags & ~BPF_OBJ_FLAG_MASK)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	f_flags = bpf_get_file_flag(attr->open_flags);
	if (f_flags < 0)
		return f_flags;

	spin_lock_bh(&map_idr_lock);
	map = idr_find(&map_idr, id);
	if (map)
		map = bpf_map_inc_not_zero(map, true);
	else
		map = ERR_PTR(-ENOENT);
	spin_unlock_bh(&map_idr_lock);

	if (IS_ERR(map))
		return PTR_ERR(map);

	fd = bpf_map_new_fd(map, f_flags);
	if (fd < 0)
		bpf_map_put(map);

	return fd;
}

static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
				   const union bpf_attr *attr,
				   union bpf_attr __user *uattr)
{
	struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
	struct bpf_prog_info info = {};
	u32 info_len = attr->info.info_len;
	char __user *uinsns;
	u32 ulen;
	int err;

	err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
	if (err)
		return err;
	info_len = min_t(u32, sizeof(info), info_len);

	if (copy_from_user(&info, uinfo, info_len))
		return -EFAULT;

	info.type = prog->type;
	info.id = prog->aux->id;
	info.load_time = prog->aux->load_time;
	info.created_by_uid = from_kuid_munged(current_user_ns(),
					       prog->aux->user->uid);

	memcpy(info.tag, prog->tag, sizeof(prog->tag));
	memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));

	ulen = info.nr_map_ids;
	info.nr_map_ids = prog->aux->used_map_cnt;
	ulen = min_t(u32, info.nr_map_ids, ulen);
	if (ulen) {
		u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
		u32 i;

		for (i = 0; i < ulen; i++)
			if (put_user(prog->aux->used_maps[i]->id,
				     &user_map_ids[i]))
				return -EFAULT;
	}

	if (!capable(CAP_SYS_ADMIN)) {
		info.jited_prog_len = 0;
		info.xlated_prog_len = 0;
		goto done;
	}

	ulen = info.jited_prog_len;
	info.jited_prog_len = prog->jited_len;
	if (info.jited_prog_len && ulen) {
		uinsns = u64_to_user_ptr(info.jited_prog_insns);
		ulen = min_t(u32, info.jited_prog_len, ulen);
		if (copy_to_user(uinsns, prog->bpf_func, ulen))
			return -EFAULT;
	}

	ulen = info.xlated_prog_len;
	info.xlated_prog_len = bpf_prog_insn_size(prog);
	if (info.xlated_prog_len && ulen) {
		uinsns = u64_to_user_ptr(info.xlated_prog_insns);
		ulen = min_t(u32, info.xlated_prog_len, ulen);
		if (copy_to_user(uinsns, prog->insnsi, ulen))
			return -EFAULT;
	}

done:
	if (copy_to_user(uinfo, &info, info_len) ||
	    put_user(info_len, &uattr->info.info_len))
		return -EFAULT;

	return 0;
}

static int bpf_map_get_info_by_fd(struct bpf_map *map,
				  const union bpf_attr *attr,
				  union bpf_attr __user *uattr)
{
	struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
	struct bpf_map_info info = {};
	u32 info_len = attr->info.info_len;
	int err;

	err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
	if (err)
		return err;
	info_len = min_t(u32, sizeof(info), info_len);

	info.type = map->map_type;
	info.id = map->id;
	info.key_size = map->key_size;
	info.value_size = map->value_size;
	info.max_entries = map->max_entries;
	info.map_flags = map->map_flags;
	memcpy(info.name, map->name, sizeof(map->name));

	if (copy_to_user(uinfo, &info, info_len) ||
	    put_user(info_len, &uattr->info.info_len))
		return -EFAULT;

	return 0;
}

#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info

static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
				  union bpf_attr __user *uattr)
{
	int ufd = attr->info.bpf_fd;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
		return -EINVAL;

	f = fdget(ufd);
	if (!f.file)
		return -EBADFD;

	if (f.file->f_op == &bpf_prog_fops)
		err = bpf_prog_get_info_by_fd(f.file->private_data, attr,
					      uattr);
	else if (f.file->f_op == &bpf_map_fops)
		err = bpf_map_get_info_by_fd(f.file->private_data, attr,
					     uattr);
	else
		err = -EINVAL;

	fdput(f);
	return err;
}

SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	union bpf_attr attr = {};
	int err;

	if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
		return -EPERM;

	err = check_uarg_tail_zero(uattr, sizeof(attr), size);
	if (err)
		return err;
	size = min_t(u32, size, sizeof(attr));

	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
	if (copy_from_user(&attr, uattr, size) != 0)
		return -EFAULT;

	switch (cmd) {
	case BPF_MAP_CREATE:
		err = map_create(&attr);
		break;
	case BPF_MAP_LOOKUP_ELEM:
		err = map_lookup_elem(&attr);
		break;
	case BPF_MAP_UPDATE_ELEM:
		err = map_update_elem(&attr);
		break;
	case BPF_MAP_DELETE_ELEM:
		err = map_delete_elem(&attr);
		break;
	case BPF_MAP_GET_NEXT_KEY:
		err = map_get_next_key(&attr);
		break;
	case BPF_PROG_LOAD:
		err = bpf_prog_load(&attr);
		break;
	case BPF_OBJ_PIN:
		err = bpf_obj_pin(&attr);
		break;
	case BPF_OBJ_GET:
		err = bpf_obj_get(&attr);
		break;
#ifdef CONFIG_CGROUP_BPF
	case BPF_PROG_ATTACH:
		err = bpf_prog_attach(&attr);
		break;
	case BPF_PROG_DETACH:
		err = bpf_prog_detach(&attr);
		break;
	case BPF_PROG_QUERY:
		err = bpf_prog_query(&attr, uattr);
		break;
#endif
	case BPF_PROG_TEST_RUN:
		err = bpf_prog_test_run(&attr, uattr);
		break;
	case BPF_PROG_GET_NEXT_ID:
		err = bpf_obj_get_next_id(&attr, uattr,
					  &prog_idr, &prog_idr_lock);
		break;
	case BPF_MAP_GET_NEXT_ID:
		err = bpf_obj_get_next_id(&attr, uattr,
					  &map_idr, &map_idr_lock);
		break;
	case BPF_PROG_GET_FD_BY_ID:
		err = bpf_prog_get_fd_by_id(&attr);
		break;
	case BPF_MAP_GET_FD_BY_ID:
		err = bpf_map_get_fd_by_id(&attr);
		break;
	case BPF_OBJ_GET_INFO_BY_FD:
		err = bpf_obj_get_info_by_fd(&attr, uattr);
		break;
	default:
		err = -EINVAL;
		break;
	}

	return err;
}