xsk.c 21.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)

/*
 * AF_XDP user-space access library.
 *
 * Copyright(c) 2018 - 2019 Intel Corporation.
 *
 * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
 */

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <asm/barrier.h>
#include <linux/compiler.h>
#include <linux/ethtool.h>
#include <linux/filter.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/if_xdp.h>
23
#include <linux/kernel.h>
24
#include <linux/list.h>
25 26 27 28 29 30 31 32 33
#include <linux/sockios.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/types.h>

#include "bpf.h"
#include "libbpf.h"
34
#include "libbpf_internal.h"
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
#include "xsk.h"

#ifndef SOL_XDP
 #define SOL_XDP 283
#endif

#ifndef AF_XDP
 #define AF_XDP 44
#endif

#ifndef PF_XDP
 #define PF_XDP AF_XDP
#endif

struct xsk_umem {
50 51
	struct xsk_ring_prod *fill_save;
	struct xsk_ring_cons *comp_save;
52 53 54 55
	char *umem_area;
	struct xsk_umem_config config;
	int fd;
	int refcount;
56
	struct list_head ctx_list;
57 58
	bool rx_ring_setup_done;
	bool tx_ring_setup_done;
59 60 61 62 63 64 65 66 67 68 69 70 71
};

struct xsk_ctx {
	struct xsk_ring_prod *fill;
	struct xsk_ring_cons *comp;
	__u32 queue_id;
	struct xsk_umem *umem;
	int refcount;
	int ifindex;
	struct list_head list;
	int prog_fd;
	int xsks_map_fd;
	char ifname[IFNAMSIZ];
72 73 74 75 76 77
};

struct xsk_socket {
	struct xsk_ring_cons *rx;
	struct xsk_ring_prod *tx;
	__u64 outstanding_tx;
78
	struct xsk_ctx *ctx;
79 80 81 82 83 84 85 86 87 88
	struct xsk_socket_config config;
	int fd;
};

struct xsk_nl_info {
	bool xdp_prog_attached;
	int ifindex;
	int fd;
};

89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
/* Up until and including Linux 5.3 */
struct xdp_ring_offset_v1 {
	__u64 producer;
	__u64 consumer;
	__u64 desc;
};

/* Up until and including Linux 5.3 */
struct xdp_mmap_offsets_v1 {
	struct xdp_ring_offset_v1 rx;
	struct xdp_ring_offset_v1 tx;
	struct xdp_ring_offset_v1 fr;
	struct xdp_ring_offset_v1 cr;
};

104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
int xsk_umem__fd(const struct xsk_umem *umem)
{
	return umem ? umem->fd : -EINVAL;
}

int xsk_socket__fd(const struct xsk_socket *xsk)
{
	return xsk ? xsk->fd : -EINVAL;
}

static bool xsk_page_aligned(void *buffer)
{
	unsigned long addr = (unsigned long)buffer;

	return !(addr & (getpagesize() - 1));
}

static void xsk_set_umem_config(struct xsk_umem_config *cfg,
				const struct xsk_umem_config *usr_cfg)
{
	if (!usr_cfg) {
		cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
		cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
		cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
		cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
K
Kevin Laatz 已提交
129
		cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
130 131 132 133 134 135 136
		return;
	}

	cfg->fill_size = usr_cfg->fill_size;
	cfg->comp_size = usr_cfg->comp_size;
	cfg->frame_size = usr_cfg->frame_size;
	cfg->frame_headroom = usr_cfg->frame_headroom;
K
Kevin Laatz 已提交
137
	cfg->flags = usr_cfg->flags;
138 139
}

140 141
static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
				     const struct xsk_socket_config *usr_cfg)
142 143 144 145 146 147 148
{
	if (!usr_cfg) {
		cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
		cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
		cfg->libbpf_flags = 0;
		cfg->xdp_flags = 0;
		cfg->bind_flags = 0;
149
		return 0;
150 151
	}

152 153 154
	if (usr_cfg->libbpf_flags & ~XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)
		return -EINVAL;

155 156 157 158 159
	cfg->rx_size = usr_cfg->rx_size;
	cfg->tx_size = usr_cfg->tx_size;
	cfg->libbpf_flags = usr_cfg->libbpf_flags;
	cfg->xdp_flags = usr_cfg->xdp_flags;
	cfg->bind_flags = usr_cfg->bind_flags;
160 161

	return 0;
162 163
}

164 165 166 167 168 169 170 171 172 173 174 175 176
static void xsk_mmap_offsets_v1(struct xdp_mmap_offsets *off)
{
	struct xdp_mmap_offsets_v1 off_v1;

	/* getsockopt on a kernel <= 5.3 has no flags fields.
	 * Copy over the offsets to the correct places in the >=5.4 format
	 * and put the flags where they would have been on that kernel.
	 */
	memcpy(&off_v1, off, sizeof(off_v1));

	off->rx.producer = off_v1.rx.producer;
	off->rx.consumer = off_v1.rx.consumer;
	off->rx.desc = off_v1.rx.desc;
177
	off->rx.flags = off_v1.rx.consumer + sizeof(__u32);
178 179 180 181

	off->tx.producer = off_v1.tx.producer;
	off->tx.consumer = off_v1.tx.consumer;
	off->tx.desc = off_v1.tx.desc;
182
	off->tx.flags = off_v1.tx.consumer + sizeof(__u32);
183 184 185 186

	off->fr.producer = off_v1.fr.producer;
	off->fr.consumer = off_v1.fr.consumer;
	off->fr.desc = off_v1.fr.desc;
187
	off->fr.flags = off_v1.fr.consumer + sizeof(__u32);
188 189 190 191

	off->cr.producer = off_v1.cr.producer;
	off->cr.consumer = off_v1.cr.consumer;
	off->cr.desc = off_v1.cr.desc;
192
	off->cr.flags = off_v1.cr.consumer + sizeof(__u32);
193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
}

static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off)
{
	socklen_t optlen;
	int err;

	optlen = sizeof(*off);
	err = getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen);
	if (err)
		return err;

	if (optlen == sizeof(*off))
		return 0;

	if (optlen == sizeof(struct xdp_mmap_offsets_v1)) {
		xsk_mmap_offsets_v1(off);
		return 0;
	}

	return -EINVAL;
}

216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275
static int xsk_create_umem_rings(struct xsk_umem *umem, int fd,
				 struct xsk_ring_prod *fill,
				 struct xsk_ring_cons *comp)
{
	struct xdp_mmap_offsets off;
	void *map;
	int err;

	err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING,
			 &umem->config.fill_size,
			 sizeof(umem->config.fill_size));
	if (err)
		return -errno;

	err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
			 &umem->config.comp_size,
			 sizeof(umem->config.comp_size));
	if (err)
		return -errno;

	err = xsk_get_mmap_offsets(fd, &off);
	if (err)
		return -errno;

	map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
		   XDP_UMEM_PGOFF_FILL_RING);
	if (map == MAP_FAILED)
		return -errno;

	fill->mask = umem->config.fill_size - 1;
	fill->size = umem->config.fill_size;
	fill->producer = map + off.fr.producer;
	fill->consumer = map + off.fr.consumer;
	fill->flags = map + off.fr.flags;
	fill->ring = map + off.fr.desc;
	fill->cached_cons = umem->config.fill_size;

	map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
		   XDP_UMEM_PGOFF_COMPLETION_RING);
	if (map == MAP_FAILED) {
		err = -errno;
		goto out_mmap;
	}

	comp->mask = umem->config.comp_size - 1;
	comp->size = umem->config.comp_size;
	comp->producer = map + off.cr.producer;
	comp->consumer = map + off.cr.consumer;
	comp->flags = map + off.cr.flags;
	comp->ring = map + off.cr.desc;

	return 0;

out_mmap:
	munmap(map, off.fr.desc + umem->config.fill_size * sizeof(__u64));
	return err;
}

K
Kevin Laatz 已提交
276 277 278 279
int xsk_umem__create_v0_0_4(struct xsk_umem **umem_ptr, void *umem_area,
			    __u64 size, struct xsk_ring_prod *fill,
			    struct xsk_ring_cons *comp,
			    const struct xsk_umem_config *usr_config)
280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300
{
	struct xdp_umem_reg mr;
	struct xsk_umem *umem;
	int err;

	if (!umem_area || !umem_ptr || !fill || !comp)
		return -EFAULT;
	if (!size && !xsk_page_aligned(umem_area))
		return -EINVAL;

	umem = calloc(1, sizeof(*umem));
	if (!umem)
		return -ENOMEM;

	umem->fd = socket(AF_XDP, SOCK_RAW, 0);
	if (umem->fd < 0) {
		err = -errno;
		goto out_umem_alloc;
	}

	umem->umem_area = umem_area;
301
	INIT_LIST_HEAD(&umem->ctx_list);
302 303
	xsk_set_umem_config(&umem->config, usr_config);

304
	memset(&mr, 0, sizeof(mr));
305 306 307 308
	mr.addr = (uintptr_t)umem_area;
	mr.len = size;
	mr.chunk_size = umem->config.frame_size;
	mr.headroom = umem->config.frame_headroom;
K
Kevin Laatz 已提交
309
	mr.flags = umem->config.flags;
310 311 312 313 314 315 316

	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
	if (err) {
		err = -errno;
		goto out_socket;
	}

317 318
	err = xsk_create_umem_rings(umem, umem->fd, fill, comp);
	if (err)
319 320
		goto out_socket;

321 322
	umem->fill_save = fill;
	umem->comp_save = comp;
323 324 325 326 327 328 329 330 331 332
	*umem_ptr = umem;
	return 0;

out_socket:
	close(umem->fd);
out_umem_alloc:
	free(umem);
	return err;
}

K
Kevin Laatz 已提交
333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352
struct xsk_umem_config_v1 {
	__u32 fill_size;
	__u32 comp_size;
	__u32 frame_size;
	__u32 frame_headroom;
};

int xsk_umem__create_v0_0_2(struct xsk_umem **umem_ptr, void *umem_area,
			    __u64 size, struct xsk_ring_prod *fill,
			    struct xsk_ring_cons *comp,
			    const struct xsk_umem_config *usr_config)
{
	struct xsk_umem_config config;

	memcpy(&config, usr_config, sizeof(struct xsk_umem_config_v1));
	config.flags = 0;

	return xsk_umem__create_v0_0_4(umem_ptr, umem_area, size, fill, comp,
					&config);
}
353 354
COMPAT_VERSION(xsk_umem__create_v0_0_2, xsk_umem__create, LIBBPF_0.0.2)
DEFAULT_VERSION(xsk_umem__create_v0_0_4, xsk_umem__create, LIBBPF_0.0.4)
K
Kevin Laatz 已提交
355

356 357
static int xsk_load_xdp_prog(struct xsk_socket *xsk)
{
358
	static const int log_buf_size = 16 * 1024;
359
	struct xsk_ctx *ctx = xsk->ctx;
360
	char log_buf[log_buf_size];
361 362 363 364 365
	int err, prog_fd;

	/* This is the C-program:
	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
	 * {
366
	 *     int ret, index = ctx->rx_queue_index;
367 368 369
	 *
	 *     // A set entry here means that the correspnding queue_id
	 *     // has an active AF_XDP socket bound to it.
370 371 372 373 374 375
	 *     ret = bpf_redirect_map(&xsks_map, index, XDP_PASS);
	 *     if (ret > 0)
	 *         return ret;
	 *
	 *     // Fallback for pre-5.3 kernels, not supporting default
	 *     // action in the flags parameter.
376
	 *     if (bpf_map_lookup_elem(&xsks_map, &index))
377 378 379 380 381
	 *         return bpf_redirect_map(&xsks_map, index, 0);
	 *     return XDP_PASS;
	 * }
	 */
	struct bpf_insn prog[] = {
382 383 384 385 386
		/* r2 = *(u32 *)(r1 + 16) */
		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
		/* *(u32 *)(r10 - 4) = r2 */
		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -4),
		/* r1 = xskmap[] */
387
		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
388 389 390 391 392 393 394
		/* r3 = XDP_PASS */
		BPF_MOV64_IMM(BPF_REG_3, 2),
		/* call bpf_redirect_map */
		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
		/* if w0 != 0 goto pc+13 */
		BPF_JMP32_IMM(BPF_JSGT, BPF_REG_0, 0, 13),
		/* r2 = r10 */
395
		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
396
		/* r2 += -4 */
397
		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
398
		/* r1 = xskmap[] */
399
		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
400
		/* call bpf_map_lookup_elem */
401
		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
402
		/* r1 = r0 */
403
		BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
404 405 406
		/* r0 = XDP_PASS */
		BPF_MOV64_IMM(BPF_REG_0, 2),
		/* if r1 == 0 goto pc+5 */
407 408 409
		BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
		/* r2 = *(u32 *)(r10 - 4) */
		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4),
410
		/* r1 = xskmap[] */
411
		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
412 413 414
		/* r3 = 0 */
		BPF_MOV64_IMM(BPF_REG_3, 0),
		/* call bpf_redirect_map */
415 416 417 418 419 420 421
		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
		/* The jumps are to this instruction */
		BPF_EXIT_INSN(),
	};
	size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);

	prog_fd = bpf_load_program(BPF_PROG_TYPE_XDP, prog, insns_cnt,
422 423
				   "LGPL-2.1 or BSD-2-Clause", 0, log_buf,
				   log_buf_size);
424
	if (prog_fd < 0) {
425
		pr_warn("BPF log buffer:\n%s", log_buf);
426 427 428
		return prog_fd;
	}

429 430
	err = bpf_set_link_xdp_fd(xsk->ctx->ifindex, prog_fd,
				  xsk->config.xdp_flags);
431 432 433 434 435
	if (err) {
		close(prog_fd);
		return err;
	}

436
	ctx->prog_fd = prog_fd;
437 438 439 440 441
	return 0;
}

static int xsk_get_max_queues(struct xsk_socket *xsk)
{
442
	struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
443
	struct xsk_ctx *ctx = xsk->ctx;
444
	struct ifreq ifr = {};
445 446 447 448 449 450 451
	int fd, err, ret;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0)
		return -errno;

	ifr.ifr_data = (void *)&channels;
452
	memcpy(ifr.ifr_name, ctx->ifname, IFNAMSIZ - 1);
453
	ifr.ifr_name[IFNAMSIZ - 1] = '\0';
454 455 456 457 458 459
	err = ioctl(fd, SIOCETHTOOL, &ifr);
	if (err && errno != EOPNOTSUPP) {
		ret = -errno;
		goto out;
	}

460
	if (err) {
461 462 463 464
		/* If the device says it has no channels, then all traffic
		 * is sent to a single stream, so max queues = 1.
		 */
		ret = 1;
465 466 467 468 469 470 471
	} else {
		/* Take the max of rx, tx, combined. Drivers return
		 * the number of channels in different ways.
		 */
		ret = max(channels.max_rx, channels.max_tx);
		ret = max(ret, (int)channels.max_combined);
	}
472 473 474 475 476 477 478 479

out:
	close(fd);
	return ret;
}

static int xsk_create_bpf_maps(struct xsk_socket *xsk)
{
480
	struct xsk_ctx *ctx = xsk->ctx;
481 482 483 484 485 486 487
	int max_queues;
	int fd;

	max_queues = xsk_get_max_queues(xsk);
	if (max_queues < 0)
		return max_queues;

488
	fd = bpf_create_map_name(BPF_MAP_TYPE_XSKMAP, "xsks_map",
489 490 491 492
				 sizeof(int), sizeof(int), max_queues, 0);
	if (fd < 0)
		return fd;

493
	ctx->xsks_map_fd = fd;
494 495 496 497 498 499

	return 0;
}

static void xsk_delete_bpf_maps(struct xsk_socket *xsk)
{
500 501 502 503
	struct xsk_ctx *ctx = xsk->ctx;

	bpf_map_delete_elem(ctx->xsks_map_fd, &ctx->queue_id);
	close(ctx->xsks_map_fd);
504 505
}

B
Björn Töpel 已提交
506
static int xsk_lookup_bpf_maps(struct xsk_socket *xsk)
507
{
B
Björn Töpel 已提交
508 509
	__u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info);
	__u32 map_len = sizeof(struct bpf_map_info);
510
	struct bpf_prog_info prog_info = {};
511
	struct xsk_ctx *ctx = xsk->ctx;
512
	struct bpf_map_info map_info;
B
Björn Töpel 已提交
513
	int fd, err;
514

515
	err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
516 517 518 519 520 521 522 523 524 525 526 527 528
	if (err)
		return err;

	num_maps = prog_info.nr_map_ids;

	map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids));
	if (!map_ids)
		return -ENOMEM;

	memset(&prog_info, 0, prog_len);
	prog_info.nr_map_ids = num_maps;
	prog_info.map_ids = (__u64)(unsigned long)map_ids;

529
	err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
530 531 532
	if (err)
		goto out_map_ids;

533
	ctx->xsks_map_fd = -1;
534

535
	for (i = 0; i < prog_info.nr_map_ids; i++) {
536
		fd = bpf_map_get_fd_by_id(map_ids[i]);
B
Björn Töpel 已提交
537 538
		if (fd < 0)
			continue;
539

540
		memset(&map_info, 0, map_len);
541
		err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len);
B
Björn Töpel 已提交
542 543 544 545
		if (err) {
			close(fd);
			continue;
		}
546

547
		if (!strncmp(map_info.name, "xsks_map", sizeof(map_info.name))) {
548
			ctx->xsks_map_fd = fd;
549
			break;
550 551
		}

B
Björn Töpel 已提交
552
		close(fd);
553 554
	}

B
Björn Töpel 已提交
555
	err = 0;
556
	if (ctx->xsks_map_fd == -1)
557 558 559 560 561 562 563
		err = -ENOENT;

out_map_ids:
	free(map_ids);
	return err;
}

B
Björn Töpel 已提交
564 565
static int xsk_set_bpf_maps(struct xsk_socket *xsk)
{
566 567 568
	struct xsk_ctx *ctx = xsk->ctx;

	return bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id,
569
				   &xsk->fd, 0);
B
Björn Töpel 已提交
570 571
}

572 573
static int xsk_setup_xdp_prog(struct xsk_socket *xsk)
{
574
	struct xsk_ctx *ctx = xsk->ctx;
575 576 577
	__u32 prog_id = 0;
	int err;

578
	err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id,
579 580 581 582 583 584 585 586 587 588
				  xsk->config.xdp_flags);
	if (err)
		return err;

	if (!prog_id) {
		err = xsk_create_bpf_maps(xsk);
		if (err)
			return err;

		err = xsk_load_xdp_prog(xsk);
589 590 591 592
		if (err) {
			xsk_delete_bpf_maps(xsk);
			return err;
		}
593
	} else {
594 595
		ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id);
		if (ctx->prog_fd < 0)
596
			return -errno;
B
Björn Töpel 已提交
597
		err = xsk_lookup_bpf_maps(xsk);
598
		if (err) {
599
			close(ctx->prog_fd);
600 601
			return err;
		}
602 603
	}

604 605
	if (xsk->rx)
		err = xsk_set_bpf_maps(xsk);
606 607
	if (err) {
		xsk_delete_bpf_maps(xsk);
608
		close(ctx->prog_fd);
609 610
		return err;
	}
611 612 613 614

	return 0;
}

615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632
static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex,
				   __u32 queue_id)
{
	struct xsk_ctx *ctx;

	if (list_empty(&umem->ctx_list))
		return NULL;

	list_for_each_entry(ctx, &umem->ctx_list, list) {
		if (ctx->ifindex == ifindex && ctx->queue_id == queue_id) {
			ctx->refcount++;
			return ctx;
		}
	}

	return NULL;
}

633
static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap)
634 635 636 637 638
{
	struct xsk_umem *umem = ctx->umem;
	struct xdp_mmap_offsets off;
	int err;

639 640
	if (--ctx->refcount)
		return;
641

642 643 644 645 646 647 648 649 650 651 652 653 654 655 656
	if (!unmap)
		goto out_free;

	err = xsk_get_mmap_offsets(umem->fd, &off);
	if (err)
		goto out_free;

	munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size *
	       sizeof(__u64));
	munmap(ctx->comp->ring - off.cr.desc, off.cr.desc + umem->config.comp_size *
	       sizeof(__u64));

out_free:
	list_del(&ctx->list);
	free(ctx);
657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704
}

static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
				      struct xsk_umem *umem, int ifindex,
				      const char *ifname, __u32 queue_id,
				      struct xsk_ring_prod *fill,
				      struct xsk_ring_cons *comp)
{
	struct xsk_ctx *ctx;
	int err;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx)
		return NULL;

	if (!umem->fill_save) {
		err = xsk_create_umem_rings(umem, xsk->fd, fill, comp);
		if (err) {
			free(ctx);
			return NULL;
		}
	} else if (umem->fill_save != fill || umem->comp_save != comp) {
		/* Copy over rings to new structs. */
		memcpy(fill, umem->fill_save, sizeof(*fill));
		memcpy(comp, umem->comp_save, sizeof(*comp));
	}

	ctx->ifindex = ifindex;
	ctx->refcount = 1;
	ctx->umem = umem;
	ctx->queue_id = queue_id;
	memcpy(ctx->ifname, ifname, IFNAMSIZ - 1);
	ctx->ifname[IFNAMSIZ - 1] = '\0';

	ctx->fill = fill;
	ctx->comp = comp;
	list_add(&ctx->list, &umem->ctx_list);
	return ctx;
}

int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
			      const char *ifname,
			      __u32 queue_id, struct xsk_umem *umem,
			      struct xsk_ring_cons *rx,
			      struct xsk_ring_prod *tx,
			      struct xsk_ring_prod *fill,
			      struct xsk_ring_cons *comp,
			      const struct xsk_socket_config *usr_config)
705
{
706
	bool unmap, rx_setup_done = false, tx_setup_done = false;
B
Björn Töpel 已提交
707
	void *rx_map = NULL, *tx_map = NULL;
708 709 710
	struct sockaddr_xdp sxdp = {};
	struct xdp_mmap_offsets off;
	struct xsk_socket *xsk;
711 712
	struct xsk_ctx *ctx;
	int err, ifindex;
713

714
	if (!umem || !xsk_ptr || !(rx || tx))
715 716
		return -EFAULT;

717 718
	unmap = umem->fill_save != fill;

719 720 721 722
	xsk = calloc(1, sizeof(*xsk));
	if (!xsk)
		return -ENOMEM;

723 724 725 726
	err = xsk_set_xdp_socket_config(&xsk->config, usr_config);
	if (err)
		goto out_xsk_alloc;

727 728 729 730
	xsk->outstanding_tx = 0;
	ifindex = if_nametoindex(ifname);
	if (!ifindex) {
		err = -errno;
731 732 733
		goto out_xsk_alloc;
	}

734 735 736 737 738 739 740 741
	if (umem->refcount++ > 0) {
		xsk->fd = socket(AF_XDP, SOCK_RAW, 0);
		if (xsk->fd < 0) {
			err = -errno;
			goto out_xsk_alloc;
		}
	} else {
		xsk->fd = umem->fd;
742 743
		rx_setup_done = umem->rx_ring_setup_done;
		tx_setup_done = umem->tx_ring_setup_done;
744 745
	}

746 747
	ctx = xsk_get_ctx(umem, ifindex, queue_id);
	if (!ctx) {
748 749 750 751 752
		if (!fill || !comp) {
			err = -EFAULT;
			goto out_socket;
		}

753 754 755 756 757 758
		ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id,
				     fill, comp);
		if (!ctx) {
			err = -ENOMEM;
			goto out_socket;
		}
759
	}
760
	xsk->ctx = ctx;
761

762
	if (rx && !rx_setup_done) {
763 764 765 766 767
		err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
				 &xsk->config.rx_size,
				 sizeof(xsk->config.rx_size));
		if (err) {
			err = -errno;
768
			goto out_put_ctx;
769
		}
770 771
		if (xsk->fd == umem->fd)
			umem->rx_ring_setup_done = true;
772
	}
773
	if (tx && !tx_setup_done) {
774 775 776 777 778
		err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
				 &xsk->config.tx_size,
				 sizeof(xsk->config.tx_size));
		if (err) {
			err = -errno;
779
			goto out_put_ctx;
780
		}
781 782
		if (xsk->fd == umem->fd)
			umem->rx_ring_setup_done = true;
783 784
	}

785
	err = xsk_get_mmap_offsets(xsk->fd, &off);
786 787
	if (err) {
		err = -errno;
788
		goto out_put_ctx;
789 790 791
	}

	if (rx) {
792 793 794 795
		rx_map = mmap(NULL, off.rx.desc +
			      xsk->config.rx_size * sizeof(struct xdp_desc),
			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
			      xsk->fd, XDP_PGOFF_RX_RING);
B
Björn Töpel 已提交
796
		if (rx_map == MAP_FAILED) {
797
			err = -errno;
798
			goto out_put_ctx;
799 800 801 802
		}

		rx->mask = xsk->config.rx_size - 1;
		rx->size = xsk->config.rx_size;
B
Björn Töpel 已提交
803 804
		rx->producer = rx_map + off.rx.producer;
		rx->consumer = rx_map + off.rx.consumer;
805
		rx->flags = rx_map + off.rx.flags;
B
Björn Töpel 已提交
806
		rx->ring = rx_map + off.rx.desc;
807 808
		rx->cached_prod = *rx->producer;
		rx->cached_cons = *rx->consumer;
809 810 811 812
	}
	xsk->rx = rx;

	if (tx) {
813 814 815 816
		tx_map = mmap(NULL, off.tx.desc +
			      xsk->config.tx_size * sizeof(struct xdp_desc),
			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
			      xsk->fd, XDP_PGOFF_TX_RING);
B
Björn Töpel 已提交
817
		if (tx_map == MAP_FAILED) {
818 819 820 821 822 823
			err = -errno;
			goto out_mmap_rx;
		}

		tx->mask = xsk->config.tx_size - 1;
		tx->size = xsk->config.tx_size;
B
Björn Töpel 已提交
824 825
		tx->producer = tx_map + off.tx.producer;
		tx->consumer = tx_map + off.tx.consumer;
826
		tx->flags = tx_map + off.tx.flags;
B
Björn Töpel 已提交
827
		tx->ring = tx_map + off.tx.desc;
828 829 830 831 832
		tx->cached_prod = *tx->producer;
		/* cached_cons is r->size bigger than the real consumer pointer
		 * See xsk_prod_nb_free
		 */
		tx->cached_cons = *tx->consumer + xsk->config.tx_size;
833 834 835 836
	}
	xsk->tx = tx;

	sxdp.sxdp_family = PF_XDP;
837 838
	sxdp.sxdp_ifindex = ctx->ifindex;
	sxdp.sxdp_queue_id = ctx->queue_id;
839
	if (umem->refcount > 1) {
840
		sxdp.sxdp_flags |= XDP_SHARED_UMEM;
841 842 843 844
		sxdp.sxdp_shared_umem_fd = umem->fd;
	} else {
		sxdp.sxdp_flags = xsk->config.bind_flags;
	}
845 846 847 848 849 850 851

	err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
	if (err) {
		err = -errno;
		goto out_mmap_tx;
	}

852
	ctx->prog_fd = -1;
853

854 855 856 857 858 859 860
	if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
		err = xsk_setup_xdp_prog(xsk);
		if (err)
			goto out_mmap_tx;
	}

	*xsk_ptr = xsk;
861 862
	umem->fill_save = NULL;
	umem->comp_save = NULL;
863 864 865 866
	return 0;

out_mmap_tx:
	if (tx)
B
Björn Töpel 已提交
867
		munmap(tx_map, off.tx.desc +
868 869 870
		       xsk->config.tx_size * sizeof(struct xdp_desc));
out_mmap_rx:
	if (rx)
B
Björn Töpel 已提交
871
		munmap(rx_map, off.rx.desc +
872
		       xsk->config.rx_size * sizeof(struct xdp_desc));
873
out_put_ctx:
874
	xsk_put_ctx(ctx, unmap);
875 876 877 878 879 880 881 882
out_socket:
	if (--umem->refcount)
		close(xsk->fd);
out_xsk_alloc:
	free(xsk);
	return err;
}

883 884 885 886
int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
		       __u32 queue_id, struct xsk_umem *umem,
		       struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
		       const struct xsk_socket_config *usr_config)
887
{
888 889 890
	if (!umem)
		return -EFAULT;

891 892 893 894
	return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem,
					 rx, tx, umem->fill_save,
					 umem->comp_save, usr_config);
}
895

896 897
int xsk_umem__delete(struct xsk_umem *umem)
{
898 899 900 901 902 903 904 905 906 907 908 909 910 911
	if (!umem)
		return 0;

	if (umem->refcount)
		return -EBUSY;

	close(umem->fd);
	free(umem);

	return 0;
}

void xsk_socket__delete(struct xsk_socket *xsk)
{
B
Björn Töpel 已提交
912
	size_t desc_sz = sizeof(struct xdp_desc);
913
	struct xdp_mmap_offsets off;
914
	struct xsk_umem *umem;
915
	struct xsk_ctx *ctx;
916 917 918 919 920
	int err;

	if (!xsk)
		return;

921
	ctx = xsk->ctx;
922
	umem = ctx->umem;
923
	if (ctx->prog_fd != -1) {
924
		xsk_delete_bpf_maps(xsk);
925
		close(ctx->prog_fd);
926
	}
927

928
	err = xsk_get_mmap_offsets(xsk->fd, &off);
929
	if (!err) {
B
Björn Töpel 已提交
930
		if (xsk->rx) {
931 932
			munmap(xsk->rx->ring - off.rx.desc,
			       off.rx.desc + xsk->config.rx_size * desc_sz);
B
Björn Töpel 已提交
933 934
		}
		if (xsk->tx) {
935 936
			munmap(xsk->tx->ring - off.tx.desc,
			       off.tx.desc + xsk->config.tx_size * desc_sz);
B
Björn Töpel 已提交
937
		}
938 939
	}

940
	xsk_put_ctx(ctx, true);
941

942
	umem->refcount--;
943 944 945
	/* Do not close an fd that also has an associated umem connected
	 * to it.
	 */
946
	if (xsk->fd != umem->fd)
947 948 949
		close(xsk->fd);
	free(xsk);
}