/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016,2017 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
#include <linux/perf_event.h>
#include <uapi/linux/btf.h>

#include "map_in_map.h"

/* Creation flags accepted by array_map_alloc_check(); any other bit set
 * in attr->map_flags is rejected with -EINVAL.
 */
#define ARRAY_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)

27 28 29 30
static void bpf_array_free_percpu(struct bpf_array *array)
{
	int i;

31
	for (i = 0; i < array->map.max_entries; i++) {
32
		free_percpu(array->pptrs[i]);
33 34
		cond_resched();
	}
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
}

/* Allocate one per-cpu area of array->elem_size bytes (8-byte aligned)
 * for every element slot of @array.
 *
 * Returns 0 on success. On the first failed allocation everything is
 * released through bpf_array_free_percpu() and -ENOMEM is returned.
 */
static int bpf_array_alloc_percpu(struct bpf_array *array)
{
	void __percpu *ptr;
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		ptr = __alloc_percpu_gfp(array->elem_size, 8,
					 GFP_USER | __GFP_NOWARN);
		if (!ptr) {
			/* The free helper walks every slot, including the
			 * ones not filled yet; NOTE(review): this relies on
			 * those slots still being NULL (the caller is
			 * expected to hand in a zeroed array) - confirm.
			 */
			bpf_array_free_percpu(array);
			return -ENOMEM;
		}
		array->pptrs[i] = ptr;
		cond_resched();
	}

	return 0;
}

56
/* Called from syscall */
57
int array_map_alloc_check(union bpf_attr *attr)
58
{
59
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
60
	int numa_node = bpf_map_attr_numa_node(attr);
61 62 63

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
64 65
	    attr->value_size == 0 ||
	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
66
	    (percpu && numa_node != NUMA_NO_NODE))
67
		return -EINVAL;
68

M
Michal Hocko 已提交
69
	if (attr->value_size > KMALLOC_MAX_SIZE)
70 71 72
		/* if value_size is bigger, the user space won't be able to
		 * access the elements.
		 */
73 74 75 76 77 78 79 80
		return -E2BIG;

	return 0;
}

static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
81
	int ret, numa_node = bpf_map_attr_numa_node(attr);
82 83
	u32 elem_size, index_mask, max_entries;
	bool unpriv = !capable(CAP_SYS_ADMIN);
84
	u64 cost, array_size, mask64;
85
	struct bpf_array *array;
86

87 88
	elem_size = round_up(attr->value_size, 8);

89 90
	max_entries = attr->max_entries;

91 92 93 94 95 96 97 98 99 100
	/* On 32 bit archs roundup_pow_of_two() with max_entries that has
	 * upper most bit set in u32 space is undefined behavior due to
	 * resulting 1U << 32, so do it manually here in u64 space.
	 */
	mask64 = fls_long(max_entries - 1);
	mask64 = 1ULL << mask64;
	mask64 -= 1;

	index_mask = mask64;
	if (unpriv) {
101 102 103 104
		/* round up array size to nearest power of 2,
		 * since cpu will speculate within index_mask limits
		 */
		max_entries = index_mask + 1;
105 106 107 108
		/* Check for overflows. */
		if (max_entries < attr->max_entries)
			return ERR_PTR(-E2BIG);
	}
109

110 111
	array_size = sizeof(*array);
	if (percpu)
112
		array_size += (u64) max_entries * sizeof(void *);
113
	else
114
		array_size += (u64) max_entries * elem_size;
115 116

	/* make sure there is no u32 overflow later in round_up() */
117 118
	cost = array_size;
	if (cost >= U32_MAX - PAGE_SIZE)
119
		return ERR_PTR(-ENOMEM);
120 121 122 123 124 125 126 127 128 129
	if (percpu) {
		cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
		if (cost >= U32_MAX - PAGE_SIZE)
			return ERR_PTR(-ENOMEM);
	}
	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	ret = bpf_map_precharge_memlock(cost);
	if (ret < 0)
		return ERR_PTR(ret);
130

131
	/* allocate all map elements and zero-initialize them */
132
	array = bpf_map_area_alloc(array_size, numa_node);
133 134
	if (!array)
		return ERR_PTR(-ENOMEM);
135 136
	array->index_mask = index_mask;
	array->map.unpriv_array = unpriv;
137 138

	/* copy mandatory map attributes */
139
	bpf_map_init_from_attr(&array->map, attr);
140
	array->map.pages = cost;
141 142
	array->elem_size = elem_size;

143
	if (percpu && bpf_array_alloc_percpu(array)) {
144
		bpf_map_area_free(array);
145 146 147
		return ERR_PTR(-ENOMEM);
	}

148 149 150 151 152 153 154 155 156
	return &array->map;
}

/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

157
	if (unlikely(index >= array->map.max_entries))
158 159
		return NULL;

160
	return array->value + array->elem_size * (index & array->index_mask);
161 162
}

163 164 165
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
166
	struct bpf_array *array = container_of(map, struct bpf_array, map);
167
	struct bpf_insn *insn = insn_buf;
168
	u32 elem_size = round_up(map->value_size, 8);
169 170 171 172 173 174
	const int ret = BPF_REG_0;
	const int map_ptr = BPF_REG_1;
	const int index = BPF_REG_2;

	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
175 176 177 178 179 180
	if (map->unpriv_array) {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
	} else {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
	}
181 182

	if (is_power_of_2(elem_size)) {
183 184 185 186 187 188 189 190 191 192
		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
	} else {
		*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
	}
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(ret, 0);
	return insn - insn_buf;
}

193 194 195 196 197 198 199 200 201
/* Called from eBPF program */
static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

202
	return this_cpu_ptr(array->pptrs[index & array->index_mask]);
203 204
}

205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(index >= array->map.max_entries))
		return -ENOENT;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
222
	pptr = array->pptrs[index & array->index_mask];
223 224 225 226 227 228 229 230
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

231 232 233 234
/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
235
	u32 index = key ? *(u32 *)key : U32_MAX;
236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256
	u32 *next = (u32 *)next_key;

	if (index >= array->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == array->map.max_entries - 1)
		return -ENOENT;

	*next = index + 1;
	return 0;
}

/* Called from syscall or from eBPF program
 *
 * Overwrite the element selected by the u32 index stored at @key with
 * value_size bytes from @value. For a per-cpu array only the current
 * cpu's copy is written.
 *
 * Returns 0 on success, -EINVAL for unknown @map_flags, -E2BIG for an
 * index that is not below max_entries and -EEXIST for BPF_NOEXIST.
 */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
		       value, map->value_size);
	else
		memcpy(array->value +
		       array->elem_size * (index & array->index_mask),
		       value, map->value_size);
	return 0;
}

279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307
int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
			    u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	/* the user space will provide round_up(value_size, 8) bytes that
	 * will be copied into per-cpu area. bpf programs can only access
	 * value_size of it. During lookup the same extra bytes will be
	 * returned or zeros which were zero-filled by percpu_alloc,
	 * so no kernel data leaks possible
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
308
	pptr = array->pptrs[index & array->index_mask];
309 310 311 312 313 314 315 316
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334
/* Called from syscall or from eBPF program */
static int array_map_delete_elem(struct bpf_map *map, void *key)
{
	return -EINVAL;
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding programs to complete
	 * and free the array
	 */
	synchronize_rcu();

335 336 337
	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		bpf_array_free_percpu(array);

338
	bpf_map_area_free(array);
339 340
}

341 342 343 344 345 346 347 348 349 350 351 352 353 354
static void array_map_seq_show_elem(struct bpf_map *map, void *key,
				    struct seq_file *m)
{
	void *value;

	rcu_read_lock();

	value = array_map_lookup_elem(map, key);
	if (!value) {
		rcu_read_unlock();
		return;
	}

	seq_printf(m, "%u: ", *(u32 *)key);
355
	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
356 357 358 359 360
	seq_puts(m, "\n");

	rcu_read_unlock();
}

361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383
static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
					   struct seq_file *m)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu;

	rcu_read_lock();

	seq_printf(m, "%u: {\n", *(u32 *)key);
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		seq_printf(m, "\tcpu%d: ", cpu);
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  per_cpu_ptr(pptr, cpu), m);
		seq_puts(m, "\n");
	}
	seq_puts(m, "}\n");

	rcu_read_unlock();
}

384 385 386
static int array_map_check_btf(const struct bpf_map *map,
			       const struct btf_type *key_type,
			       const struct btf_type *value_type)
387 388 389
{
	u32 int_data;

390
	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
391 392 393
		return -EINVAL;

	int_data = *(u32 *)(key_type + 1);
394 395
	/* bpf array can only take a u32 key. This check makes sure
	 * that the btf matches the attr used during map_create.
396
	 */
397
	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
398 399 400 401 402
		return -EINVAL;

	return 0;
}

403
const struct bpf_map_ops array_map_ops = {
404
	.map_alloc_check = array_map_alloc_check,
405 406 407 408 409 410
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
411
	.map_gen_lookup = array_map_gen_lookup,
412 413
	.map_seq_show_elem = array_map_seq_show_elem,
	.map_check_btf = array_map_check_btf,
414 415
};

416
const struct bpf_map_ops percpu_array_map_ops = {
417
	.map_alloc_check = array_map_alloc_check,
418 419 420 421 422 423
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = percpu_array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
424
	.map_seq_show_elem = percpu_array_map_seq_show_elem,
425
	.map_check_btf = array_map_check_btf,
426 427
};

428
static int fd_array_map_alloc_check(union bpf_attr *attr)
429
{
430
	/* only file descriptors can be stored in this type of map */
431
	if (attr->value_size != sizeof(u32))
432 433
		return -EINVAL;
	return array_map_alloc_check(attr);
434 435
}

436
static void fd_array_map_free(struct bpf_map *map)
437 438 439 440 441 442 443 444
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	synchronize_rcu();

	/* make sure it's empty */
	for (i = 0; i < array->map.max_entries; i++)
445
		BUG_ON(array->ptrs[i] != NULL);
446 447

	bpf_map_area_free(array);
448 449
}

450
static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
451 452 453 454
{
	return NULL;
}

455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474
/* only called from syscall */
int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
{
	void **elem, *ptr;
	int ret =  0;

	if (!map->ops->map_fd_sys_lookup_elem)
		return -ENOTSUPP;

	rcu_read_lock();
	elem = array_map_lookup_elem(map, key);
	if (elem && (ptr = READ_ONCE(*elem)))
		*value = map->ops->map_fd_sys_lookup_elem(ptr);
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}

475
/* only called from syscall */
476 477
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
				 void *key, void *value, u64 map_flags)
478 479
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
480
	void *new_ptr, *old_ptr;
481 482 483 484 485 486 487 488 489
	u32 index = *(u32 *)key, ufd;

	if (map_flags != BPF_ANY)
		return -EINVAL;

	if (index >= array->map.max_entries)
		return -E2BIG;

	ufd = *(u32 *)value;
490
	new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
491 492
	if (IS_ERR(new_ptr))
		return PTR_ERR(new_ptr);
493

494 495 496
	old_ptr = xchg(array->ptrs + index, new_ptr);
	if (old_ptr)
		map->ops->map_fd_put_ptr(old_ptr);
497 498 499 500

	return 0;
}

501
static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
502 503
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
504
	void *old_ptr;
505 506 507 508 509
	u32 index = *(u32 *)key;

	if (index >= array->map.max_entries)
		return -E2BIG;

510 511 512
	old_ptr = xchg(array->ptrs + index, NULL);
	if (old_ptr) {
		map->ops->map_fd_put_ptr(old_ptr);
513 514 515 516 517 518
		return 0;
	} else {
		return -ENOENT;
	}
}

519 520
static void *prog_fd_array_get_ptr(struct bpf_map *map,
				   struct file *map_file, int fd)
521 522 523
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *prog = bpf_prog_get(fd);
524

525 526 527 528 529 530 531
	if (IS_ERR(prog))
		return prog;

	if (!bpf_prog_array_compatible(array, prog)) {
		bpf_prog_put(prog);
		return ERR_PTR(-EINVAL);
	}
532

533 534 535 536 537
	return prog;
}

/* map_fd_put_ptr callback of the prog array: release @ptr with
 * bpf_prog_put().
 */
static void prog_fd_array_put_ptr(void *ptr)
{
	bpf_prog_put(ptr);
}

541 542 543 544 545
static u32 prog_fd_array_sys_lookup_elem(void *ptr)
{
	return ((struct bpf_prog *)ptr)->aux->id;
}

546
/* decrement refcnt of all bpf_progs that are stored in this map */
547
static void bpf_fd_array_map_clear(struct bpf_map *map)
548 549 550 551 552
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	for (i = 0; i < array->map.max_entries; i++)
553
		fd_array_map_delete_elem(map, &i);
554 555
}

556
const struct bpf_map_ops prog_array_map_ops = {
557 558
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_map_alloc,
559
	.map_free = fd_array_map_free,
560
	.map_get_next_key = array_map_get_next_key,
561 562 563 564
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = prog_fd_array_get_ptr,
	.map_fd_put_ptr = prog_fd_array_put_ptr,
565
	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
566
	.map_release_uref = bpf_fd_array_map_clear,
567
	.map_check_btf = map_check_no_btf,
568 569
};

570 571
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
						   struct file *map_file)
572
{
573 574
	struct bpf_event_entry *ee;

575
	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596
	if (ee) {
		ee->event = perf_file->private_data;
		ee->perf_file = perf_file;
		ee->map_file = map_file;
	}

	return ee;
}

/* RCU callback: drop the perf file reference held by the entry that
 * embeds @rcu, then free the entry.
 */
static void __bpf_event_entry_free(struct rcu_head *rcu)
{
	struct bpf_event_entry *entry =
		container_of(rcu, struct bpf_event_entry, rcu);

	fput(entry->perf_file);
	kfree(entry);
}

/* Queue @ee to be released by __bpf_event_entry_free() via call_rcu(). */
static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
{
	call_rcu(&ee->rcu, __bpf_event_entry_free);
}

599 600
static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
					 struct file *map_file, int fd)
601
{
602 603 604
	struct bpf_event_entry *ee;
	struct perf_event *event;
	struct file *perf_file;
605
	u64 value;
606

607 608 609
	perf_file = perf_event_get(fd);
	if (IS_ERR(perf_file))
		return perf_file;
610

611
	ee = ERR_PTR(-EOPNOTSUPP);
612
	event = perf_file->private_data;
613
	if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
614 615
		goto err_out;

616 617 618 619
	ee = bpf_event_entry_gen(perf_file, map_file);
	if (ee)
		return ee;
	ee = ERR_PTR(-ENOMEM);
620 621 622
err_out:
	fput(perf_file);
	return ee;
623 624 625 626
}

/* map_fd_put_ptr callback of the perf event array: hand the entry at
 * @ptr to bpf_event_entry_free_rcu().
 */
static void perf_event_fd_array_put_ptr(void *ptr)
{
	bpf_event_entry_free_rcu(ptr);
}

static void perf_event_fd_array_release(struct bpf_map *map,
					struct file *map_file)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_event_entry *ee;
	int i;

	rcu_read_lock();
	for (i = 0; i < array->map.max_entries; i++) {
		ee = READ_ONCE(array->ptrs[i]);
		if (ee && ee->map_file == map_file)
			fd_array_map_delete_elem(map, &i);
	}
	rcu_read_unlock();
644 645
}

646
const struct bpf_map_ops perf_event_array_map_ops = {
647 648
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_map_alloc,
649
	.map_free = fd_array_map_free,
650 651 652 653 654
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
655
	.map_release = perf_event_fd_array_release,
656
	.map_check_btf = map_check_no_btf,
657 658
};

#ifdef CONFIG_CGROUPS
/* map_fd_get_ptr callback of the cgroup array: returns whatever
 * cgroup_get_from_fd() yields for @fd. @map and @map_file are not used.
 */
static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
				     struct file *map_file /* not used */,
				     int fd)
{
	return cgroup_get_from_fd(fd);
}

/* map_fd_put_ptr callback of the cgroup array: drop the reference on
 * the cgroup stored at @ptr.
 */
static void cgroup_fd_array_put_ptr(void *ptr)
{
	struct cgroup *cgrp = ptr;

	/* cgroup_put free cgrp after a rcu grace period */
	cgroup_put(cgrp);
}

/* map_free callback of the cgroup array. */
static void cgroup_fd_array_free(struct bpf_map *map)
{
	/* empty the slots first, fd_array_map_free() insists on that */
	bpf_fd_array_map_clear(map);

	fd_array_map_free(map);
}

679
const struct bpf_map_ops cgroup_array_map_ops = {
680 681
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_map_alloc,
682 683 684 685 686 687
	.map_free = cgroup_fd_array_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
688
	.map_check_btf = map_check_no_btf,
689 690
};
#endif

static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
{
	struct bpf_map *map, *inner_map_meta;

	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

700
	map = array_map_alloc(attr);
701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
		return map;
	}

	map->inner_map_meta = inner_map_meta;

	return map;
}

/* map_free callback of the array-of-maps: release the inner map meta,
 * empty all slots and finally free the array itself.
 */
static void array_of_map_free(struct bpf_map *map)
{
	struct bpf_map *meta = map->inner_map_meta;

	/* map->inner_map_meta is only accessed by syscall which
	 * is protected by fdget/fdput.
	 */
	bpf_map_meta_free(meta);

	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

/* Returns the inner map pointer stored at index @key (read once), or
 * NULL when array_map_lookup_elem() finds no slot for @key.
 */
static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_map **slot = array_map_lookup_elem(map, key);

	return slot ? READ_ONCE(*slot) : NULL;
}

731 732 733
static u32 array_of_map_gen_lookup(struct bpf_map *map,
				   struct bpf_insn *insn_buf)
{
734
	struct bpf_array *array = container_of(map, struct bpf_array, map);
735 736 737 738 739 740 741 742
	u32 elem_size = round_up(map->value_size, 8);
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;
	const int map_ptr = BPF_REG_1;
	const int index = BPF_REG_2;

	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
743 744 745 746 747 748
	if (map->unpriv_array) {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
	} else {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
	}
749 750 751 752 753 754 755 756 757 758 759 760 761
	if (is_power_of_2(elem_size))
		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
	else
		*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(ret, 0);

	return insn - insn_buf;
}

762
const struct bpf_map_ops array_of_maps_map_ops = {
763
	.map_alloc_check = fd_array_map_alloc_check,
764 765 766 767 768 769 770
	.map_alloc = array_of_map_alloc,
	.map_free = array_of_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_of_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = bpf_map_fd_get_ptr,
	.map_fd_put_ptr = bpf_map_fd_put_ptr,
771
	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
772
	.map_gen_lookup = array_of_map_gen_lookup,
773
	.map_check_btf = map_check_no_btf,
774
};