// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)

/*
 * AF_XDP user-space access library.
 *
 * Copyright(c) 2018 - 2019 Intel Corporation.
 *
 * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
 */

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <asm/barrier.h>
#include <linux/compiler.h>
#include <linux/ethtool.h>
#include <linux/filter.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/if_xdp.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/sockios.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/types.h>

#include "bpf.h"
#include "libbpf.h"
#include "libbpf_internal.h"
#include "xsk.h"

#ifndef SOL_XDP
 #define SOL_XDP 283
#endif

#ifndef AF_XDP
 #define AF_XDP 44
#endif

#ifndef PF_XDP
 #define PF_XDP AF_XDP
#endif

struct xsk_umem {
	struct xsk_ring_prod *fill_save;
	struct xsk_ring_cons *comp_save;
	char *umem_area;
	struct xsk_umem_config config;
	int fd;
	int refcount;
	struct list_head ctx_list;
	bool rx_ring_setup_done;
	bool tx_ring_setup_done;
};

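/* A context holds the state shared by every socket bound to the same
 * (umem, ifindex, queue_id) triple. Contexts are refcounted and kept on
 * the owning umem's ctx_list.
 */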
struct xsk_ctx {
	struct xsk_ring_prod *fill;
	struct xsk_ring_cons *comp;
	__u32 queue_id;
	struct xsk_umem *umem;
	int refcount;
	int ifindex;
	struct list_head list;
	int prog_fd;
	int xsks_map_fd;
	char ifname[IFNAMSIZ];
};

struct xsk_socket {
	struct xsk_ring_cons *rx;
	struct xsk_ring_prod *tx;
	__u64 outstanding_tx;
	struct xsk_ctx *ctx;
	struct xsk_socket_config config;
	int fd;
};

struct xsk_nl_info {
	bool xdp_prog_attached;
	int ifindex;
	int fd;
};

/* Up until and including Linux 5.3 */
struct xdp_ring_offset_v1 {
	__u64 producer;
	__u64 consumer;
	__u64 desc;
};

/* Up until and including Linux 5.3 */
struct xdp_mmap_offsets_v1 {
	struct xdp_ring_offset_v1 rx;
	struct xdp_ring_offset_v1 tx;
	struct xdp_ring_offset_v1 fr;
	struct xdp_ring_offset_v1 cr;
};

int xsk_umem__fd(const struct xsk_umem *umem)
{
	return umem ? umem->fd : -EINVAL;
}

int xsk_socket__fd(const struct xsk_socket *xsk)
{
	return xsk ? xsk->fd : -EINVAL;
}

static bool xsk_page_aligned(void *buffer)
{
	unsigned long addr = (unsigned long)buffer;

	return !(addr & (getpagesize() - 1));
}

static void xsk_set_umem_config(struct xsk_umem_config *cfg,
				const struct xsk_umem_config *usr_cfg)
{
	if (!usr_cfg) {
		cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
		cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
		cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
		cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
		cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
		return;
	}

	cfg->fill_size = usr_cfg->fill_size;
	cfg->comp_size = usr_cfg->comp_size;
	cfg->frame_size = usr_cfg->frame_size;
	cfg->frame_headroom = usr_cfg->frame_headroom;
	cfg->flags = usr_cfg->flags;
}

static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
				     const struct xsk_socket_config *usr_cfg)
{
	if (!usr_cfg) {
		cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
		cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
		cfg->libbpf_flags = 0;
		cfg->xdp_flags = 0;
		cfg->bind_flags = 0;
		return 0;
	}

	if (usr_cfg->libbpf_flags & ~XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)
		return -EINVAL;

	cfg->rx_size = usr_cfg->rx_size;
	cfg->tx_size = usr_cfg->tx_size;
	cfg->libbpf_flags = usr_cfg->libbpf_flags;
	cfg->xdp_flags = usr_cfg->xdp_flags;
	cfg->bind_flags = usr_cfg->bind_flags;

	return 0;
}

static void xsk_mmap_offsets_v1(struct xdp_mmap_offsets *off)
{
	struct xdp_mmap_offsets_v1 off_v1;

	/* getsockopt on a kernel <= 5.3 has no flags fields.
	 * Copy over the offsets to the correct places in the >=5.4 format
	 * and put the flags where they would have been on that kernel.
	 */
	memcpy(&off_v1, off, sizeof(off_v1));

	off->rx.producer = off_v1.rx.producer;
	off->rx.consumer = off_v1.rx.consumer;
	off->rx.desc = off_v1.rx.desc;
	off->rx.flags = off_v1.rx.consumer + sizeof(__u32);

	off->tx.producer = off_v1.tx.producer;
	off->tx.consumer = off_v1.tx.consumer;
	off->tx.desc = off_v1.tx.desc;
	off->tx.flags = off_v1.tx.consumer + sizeof(__u32);

	off->fr.producer = off_v1.fr.producer;
	off->fr.consumer = off_v1.fr.consumer;
	off->fr.desc = off_v1.fr.desc;
	off->fr.flags = off_v1.fr.consumer + sizeof(__u32);

	off->cr.producer = off_v1.cr.producer;
	off->cr.consumer = off_v1.cr.consumer;
	off->cr.desc = off_v1.cr.desc;
	off->cr.flags = off_v1.cr.consumer + sizeof(__u32);
}

static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off)
{
	socklen_t optlen;
	int err;

	optlen = sizeof(*off);
	err = getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen);
	if (err)
		return err;

	if (optlen == sizeof(*off))
		return 0;

	if (optlen == sizeof(struct xdp_mmap_offsets_v1)) {
		xsk_mmap_offsets_v1(off);
		return 0;
	}

	return -EINVAL;
}

static int xsk_create_umem_rings(struct xsk_umem *umem, int fd,
				 struct xsk_ring_prod *fill,
				 struct xsk_ring_cons *comp)
{
	struct xdp_mmap_offsets off;
	void *map;
	int err;

	err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING,
			 &umem->config.fill_size,
			 sizeof(umem->config.fill_size));
	if (err)
		return -errno;

	err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
			 &umem->config.comp_size,
			 sizeof(umem->config.comp_size));
	if (err)
		return -errno;

	err = xsk_get_mmap_offsets(fd, &off);
	if (err)
		return -errno;

	map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
		   XDP_UMEM_PGOFF_FILL_RING);
	if (map == MAP_FAILED)
		return -errno;

	fill->mask = umem->config.fill_size - 1;
	fill->size = umem->config.fill_size;
	fill->producer = map + off.fr.producer;
	fill->consumer = map + off.fr.consumer;
	fill->flags = map + off.fr.flags;
	fill->ring = map + off.fr.desc;
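	/* Like the TX ring, cached_cons is preloaded with the ring size so
	 * that xsk_prod_nb_free() sees the whole ring as free before the
	 * kernel has consumed anything; see the tx->cached_cons comment in
	 * xsk_socket__create_shared().
	 */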
	fill->cached_cons = umem->config.fill_size;

	map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
		   XDP_UMEM_PGOFF_COMPLETION_RING);
	if (map == MAP_FAILED) {
		err = -errno;
		goto out_mmap;
	}

	comp->mask = umem->config.comp_size - 1;
	comp->size = umem->config.comp_size;
	comp->producer = map + off.cr.producer;
	comp->consumer = map + off.cr.consumer;
	comp->flags = map + off.cr.flags;
	comp->ring = map + off.cr.desc;

	return 0;

out_mmap:
	munmap(map, off.fr.desc + umem->config.fill_size * sizeof(__u64));
	return err;
}

int xsk_umem__create_v0_0_4(struct xsk_umem **umem_ptr, void *umem_area,
			    __u64 size, struct xsk_ring_prod *fill,
			    struct xsk_ring_cons *comp,
			    const struct xsk_umem_config *usr_config)
{
	struct xdp_umem_reg mr;
	struct xsk_umem *umem;
	int err;

	if (!umem_area || !umem_ptr || !fill || !comp)
		return -EFAULT;
	if (!size && !xsk_page_aligned(umem_area))
		return -EINVAL;

	umem = calloc(1, sizeof(*umem));
	if (!umem)
		return -ENOMEM;

	umem->fd = socket(AF_XDP, SOCK_RAW, 0);
	if (umem->fd < 0) {
		err = -errno;
		goto out_umem_alloc;
	}

	umem->umem_area = umem_area;
	INIT_LIST_HEAD(&umem->ctx_list);
	xsk_set_umem_config(&umem->config, usr_config);

	memset(&mr, 0, sizeof(mr));
	mr.addr = (uintptr_t)umem_area;
	mr.len = size;
	mr.chunk_size = umem->config.frame_size;
	mr.headroom = umem->config.frame_headroom;
	mr.flags = umem->config.flags;

	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
	if (err) {
		err = -errno;
		goto out_socket;
	}

	err = xsk_create_umem_rings(umem, umem->fd, fill, comp);
	if (err)
		goto out_socket;

	umem->fill_save = fill;
	umem->comp_save = comp;
	*umem_ptr = umem;
	return 0;

out_socket:
	close(umem->fd);
out_umem_alloc:
	free(umem);
	return err;
}

struct xsk_umem_config_v1 {
	__u32 fill_size;
	__u32 comp_size;
	__u32 frame_size;
	__u32 frame_headroom;
};

int xsk_umem__create_v0_0_2(struct xsk_umem **umem_ptr, void *umem_area,
			    __u64 size, struct xsk_ring_prod *fill,
			    struct xsk_ring_cons *comp,
			    const struct xsk_umem_config *usr_config)
{
	struct xsk_umem_config config;

	memcpy(&config, usr_config, sizeof(struct xsk_umem_config_v1));
	config.flags = 0;

	return xsk_umem__create_v0_0_4(umem_ptr, umem_area, size, fill, comp,
					&config);
}
COMPAT_VERSION(xsk_umem__create_v0_0_2, xsk_umem__create, LIBBPF_0.0.2)
DEFAULT_VERSION(xsk_umem__create_v0_0_4, xsk_umem__create, LIBBPF_0.0.4)
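
/* A minimal usage sketch for the umem API above (illustrative comment only,
 * not part of the library; error handling trimmed). The caller hands in a
 * page-aligned buffer and gets back a umem handle plus initialized fill
 * and completion rings:
 *
 *	struct xsk_ring_prod fill;
 *	struct xsk_ring_cons comp;
 *	struct xsk_umem *umem;
 *	__u64 size = XSK_RING_PROD__DEFAULT_NUM_DESCS *
 *		     XSK_UMEM__DEFAULT_FRAME_SIZE;
 *	void *bufs;
 *
 *	if (posix_memalign(&bufs, getpagesize(), size))
 *		exit(EXIT_FAILURE);
 *	if (xsk_umem__create(&umem, bufs, size, &fill, &comp, NULL))
 *		exit(EXIT_FAILURE);
 */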

static int xsk_load_xdp_prog(struct xsk_socket *xsk)
{
	static const int log_buf_size = 16 * 1024;
	struct xsk_ctx *ctx = xsk->ctx;
	char log_buf[log_buf_size];
	int err, prog_fd;

	/* This is the C-program:
	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
	 * {
	 *     int ret, index = ctx->rx_queue_index;
	 *
	 *     // A set entry here means that the corresponding queue_id
	 *     // has an active AF_XDP socket bound to it.
	 *     ret = bpf_redirect_map(&xsks_map, index, XDP_PASS);
	 *     if (ret > 0)
	 *         return ret;
	 *
	 *     // Fallback for pre-5.3 kernels, not supporting default
	 *     // action in the flags parameter.
	 *     if (bpf_map_lookup_elem(&xsks_map, &index))
	 *         return bpf_redirect_map(&xsks_map, index, 0);
	 *     return XDP_PASS;
	 * }
	 */
	struct bpf_insn prog[] = {
		/* r2 = *(u32 *)(r1 + 16) */
		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
		/* *(u32 *)(r10 - 4) = r2 */
		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -4),
		/* r1 = xskmap[] */
		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
		/* r3 = XDP_PASS */
		BPF_MOV64_IMM(BPF_REG_3, 2),
		/* call bpf_redirect_map */
		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
		/* if w0 != 0 goto pc+13 */
		BPF_JMP32_IMM(BPF_JSGT, BPF_REG_0, 0, 13),
		/* r2 = r10 */
		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
		/* r2 += -4 */
		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
		/* r1 = xskmap[] */
		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
		/* call bpf_map_lookup_elem */
		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
		/* r1 = r0 */
		BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
		/* r0 = XDP_PASS */
		BPF_MOV64_IMM(BPF_REG_0, 2),
		/* if r1 == 0 goto pc+5 */
		BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
		/* r2 = *(u32 *)(r10 - 4) */
		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4),
		/* r1 = xskmap[] */
		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
		/* r3 = 0 */
		BPF_MOV64_IMM(BPF_REG_3, 0),
		/* call bpf_redirect_map */
		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
		/* The jumps are to this instruction */
		BPF_EXIT_INSN(),
	};
	size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);

	prog_fd = bpf_load_program(BPF_PROG_TYPE_XDP, prog, insns_cnt,
				   "LGPL-2.1 or BSD-2-Clause", 0, log_buf,
				   log_buf_size);
	if (prog_fd < 0) {
		pr_warn("BPF log buffer:\n%s", log_buf);
		return prog_fd;
	}

	err = bpf_set_link_xdp_fd(xsk->ctx->ifindex, prog_fd,
				  xsk->config.xdp_flags);
	if (err) {
		close(prog_fd);
		return err;
	}

	ctx->prog_fd = prog_fd;
	return 0;
}

static int xsk_get_max_queues(struct xsk_socket *xsk)
{
	struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
	struct xsk_ctx *ctx = xsk->ctx;
	struct ifreq ifr = {};
	int fd, err, ret;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0)
		return -errno;

	ifr.ifr_data = (void *)&channels;
	memcpy(ifr.ifr_name, ctx->ifname, IFNAMSIZ - 1);
	ifr.ifr_name[IFNAMSIZ - 1] = '\0';
	err = ioctl(fd, SIOCETHTOOL, &ifr);
	if (err && errno != EOPNOTSUPP) {
		ret = -errno;
		goto out;
	}

	if (err) {
		/* If the device says it has no channels, then all traffic
		 * is sent to a single stream, so max queues = 1.
		 */
		ret = 1;
	} else {
		/* Take the max of rx, tx, combined. Drivers return
		 * the number of channels in different ways.
		 */
		ret = max(channels.max_rx, channels.max_tx);
		ret = max(ret, (int)channels.max_combined);
	}

out:
	close(fd);
	return ret;
}

static int xsk_create_bpf_maps(struct xsk_socket *xsk)
{
	struct xsk_ctx *ctx = xsk->ctx;
	int max_queues;
	int fd;

	max_queues = xsk_get_max_queues(xsk);
	if (max_queues < 0)
		return max_queues;

	fd = bpf_create_map_name(BPF_MAP_TYPE_XSKMAP, "xsks_map",
				 sizeof(int), sizeof(int), max_queues, 0);
	if (fd < 0)
		return fd;

	ctx->xsks_map_fd = fd;

	return 0;
}

static void xsk_delete_bpf_maps(struct xsk_socket *xsk)
{
	struct xsk_ctx *ctx = xsk->ctx;

	bpf_map_delete_elem(ctx->xsks_map_fd, &ctx->queue_id);
	close(ctx->xsks_map_fd);
}

static int xsk_lookup_bpf_maps(struct xsk_socket *xsk)
{
	__u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info);
	__u32 map_len = sizeof(struct bpf_map_info);
	struct bpf_prog_info prog_info = {};
	struct xsk_ctx *ctx = xsk->ctx;
	struct bpf_map_info map_info;
	int fd, err;

	err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
	if (err)
		return err;

	num_maps = prog_info.nr_map_ids;

	map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids));
	if (!map_ids)
		return -ENOMEM;

	memset(&prog_info, 0, prog_len);
	prog_info.nr_map_ids = num_maps;
	prog_info.map_ids = (__u64)(unsigned long)map_ids;

	err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
	if (err)
		goto out_map_ids;

	ctx->xsks_map_fd = -1;

	for (i = 0; i < prog_info.nr_map_ids; i++) {
		fd = bpf_map_get_fd_by_id(map_ids[i]);
		if (fd < 0)
			continue;

		memset(&map_info, 0, map_len);
		err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len);
		if (err) {
			close(fd);
			continue;
		}

		if (!strncmp(map_info.name, "xsks_map", sizeof(map_info.name))) {
			ctx->xsks_map_fd = fd;
			break;
		}

		close(fd);
	}

	err = 0;
	if (ctx->xsks_map_fd == -1)
		err = -ENOENT;

out_map_ids:
	free(map_ids);
	return err;
}

static int xsk_set_bpf_maps(struct xsk_socket *xsk)
{
	struct xsk_ctx *ctx = xsk->ctx;

	return bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id,
				   &xsk->fd, 0);
}

static int xsk_setup_xdp_prog(struct xsk_socket *xsk)
{
	struct xsk_ctx *ctx = xsk->ctx;
	__u32 prog_id = 0;
	int err;

	err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id,
				  xsk->config.xdp_flags);
	if (err)
		return err;

	if (!prog_id) {
		err = xsk_create_bpf_maps(xsk);
		if (err)
			return err;

		err = xsk_load_xdp_prog(xsk);
		if (err) {
			xsk_delete_bpf_maps(xsk);
			return err;
		}
	} else {
		ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id);
		if (ctx->prog_fd < 0)
			return -errno;
		err = xsk_lookup_bpf_maps(xsk);
		if (err) {
			close(ctx->prog_fd);
			return err;
		}
	}

	if (xsk->rx)
		err = xsk_set_bpf_maps(xsk);
	if (err) {
		xsk_delete_bpf_maps(xsk);
		close(ctx->prog_fd);
		return err;
	}

	return 0;
}

static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex,
				   __u32 queue_id)
{
	struct xsk_ctx *ctx;

	if (list_empty(&umem->ctx_list))
		return NULL;

	list_for_each_entry(ctx, &umem->ctx_list, list) {
		if (ctx->ifindex == ifindex && ctx->queue_id == queue_id) {
			ctx->refcount++;
			return ctx;
		}
	}

	return NULL;
}

static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap)
{
	struct xsk_umem *umem = ctx->umem;
	struct xdp_mmap_offsets off;
	int err;

	if (--ctx->refcount)
		return;

	if (!unmap)
		goto out_free;

	err = xsk_get_mmap_offsets(umem->fd, &off);
	if (err)
		goto out_free;

	munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size *
	       sizeof(__u64));
	munmap(ctx->comp->ring - off.cr.desc, off.cr.desc + umem->config.comp_size *
	       sizeof(__u64));

out_free:
	list_del(&ctx->list);
	free(ctx);
}

static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
				      struct xsk_umem *umem, int ifindex,
				      const char *ifname, __u32 queue_id,
				      struct xsk_ring_prod *fill,
				      struct xsk_ring_cons *comp)
{
	struct xsk_ctx *ctx;
	int err;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx)
		return NULL;

	if (!umem->fill_save) {
		err = xsk_create_umem_rings(umem, xsk->fd, fill, comp);
		if (err) {
			free(ctx);
			return NULL;
		}
	} else if (umem->fill_save != fill || umem->comp_save != comp) {
		/* Copy over rings to new structs. */
		memcpy(fill, umem->fill_save, sizeof(*fill));
		memcpy(comp, umem->comp_save, sizeof(*comp));
	}

	ctx->ifindex = ifindex;
	ctx->refcount = 1;
	ctx->umem = umem;
	ctx->queue_id = queue_id;
	memcpy(ctx->ifname, ifname, IFNAMSIZ - 1);
	ctx->ifname[IFNAMSIZ - 1] = '\0';

	ctx->fill = fill;
	ctx->comp = comp;
	list_add(&ctx->list, &umem->ctx_list);
	return ctx;
}

int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
			      const char *ifname,
			      __u32 queue_id, struct xsk_umem *umem,
			      struct xsk_ring_cons *rx,
			      struct xsk_ring_prod *tx,
			      struct xsk_ring_prod *fill,
			      struct xsk_ring_cons *comp,
			      const struct xsk_socket_config *usr_config)
{
	void *rx_map = NULL, *tx_map = NULL;
	struct sockaddr_xdp sxdp = {};
	struct xdp_mmap_offsets off;
	struct xsk_socket *xsk;
	struct xsk_ctx *ctx;
	int err, ifindex;
	bool unmap, rx_setup_done = false, tx_setup_done = false;

	if (!umem || !xsk_ptr || !(rx || tx))
		return -EFAULT;

	unmap = umem->fill_save != fill;

	xsk = calloc(1, sizeof(*xsk));
	if (!xsk)
		return -ENOMEM;

	err = xsk_set_xdp_socket_config(&xsk->config, usr_config);
	if (err)
		goto out_xsk_alloc;

	xsk->outstanding_tx = 0;
	ifindex = if_nametoindex(ifname);
	if (!ifindex) {
		err = -errno;
		goto out_xsk_alloc;
	}

	if (umem->refcount++ > 0) {
		xsk->fd = socket(AF_XDP, SOCK_RAW, 0);
		if (xsk->fd < 0) {
			err = -errno;
			goto out_xsk_alloc;
		}
	} else {
		xsk->fd = umem->fd;
		rx_setup_done = umem->rx_ring_setup_done;
		tx_setup_done = umem->tx_ring_setup_done;
	}

	ctx = xsk_get_ctx(umem, ifindex, queue_id);
	if (!ctx) {
		if (!fill || !comp) {
			err = -EFAULT;
			goto out_socket;
		}

		ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id,
				     fill, comp);
		if (!ctx) {
			err = -ENOMEM;
			goto out_socket;
		}
	}
	xsk->ctx = ctx;

	if (rx && !rx_setup_done) {
		err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
				 &xsk->config.rx_size,
				 sizeof(xsk->config.rx_size));
		if (err) {
			err = -errno;
			goto out_put_ctx;
		}
		if (xsk->fd == umem->fd)
			umem->rx_ring_setup_done = true;
	}
	if (tx && !tx_setup_done) {
		err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
				 &xsk->config.tx_size,
				 sizeof(xsk->config.tx_size));
		if (err) {
			err = -errno;
			goto out_put_ctx;
		}
		if (xsk->fd == umem->fd)
			umem->tx_ring_setup_done = true;
	}

	err = xsk_get_mmap_offsets(xsk->fd, &off);
	if (err) {
		err = -errno;
		goto out_put_ctx;
	}

	if (rx) {
		rx_map = mmap(NULL, off.rx.desc +
			      xsk->config.rx_size * sizeof(struct xdp_desc),
			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
			      xsk->fd, XDP_PGOFF_RX_RING);
		if (rx_map == MAP_FAILED) {
			err = -errno;
			goto out_put_ctx;
		}

		rx->mask = xsk->config.rx_size - 1;
		rx->size = xsk->config.rx_size;
		rx->producer = rx_map + off.rx.producer;
		rx->consumer = rx_map + off.rx.consumer;
		rx->flags = rx_map + off.rx.flags;
		rx->ring = rx_map + off.rx.desc;
		rx->cached_prod = *rx->producer;
		rx->cached_cons = *rx->consumer;
	}
	xsk->rx = rx;

	if (tx) {
		tx_map = mmap(NULL, off.tx.desc +
			      xsk->config.tx_size * sizeof(struct xdp_desc),
			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
			      xsk->fd, XDP_PGOFF_TX_RING);
		if (tx_map == MAP_FAILED) {
			err = -errno;
			goto out_mmap_rx;
		}

		tx->mask = xsk->config.tx_size - 1;
		tx->size = xsk->config.tx_size;
		tx->producer = tx_map + off.tx.producer;
		tx->consumer = tx_map + off.tx.consumer;
		tx->flags = tx_map + off.tx.flags;
		tx->ring = tx_map + off.tx.desc;
		tx->cached_prod = *tx->producer;
		/* cached_cons is r->size bigger than the real consumer pointer
		 * See xsk_prod_nb_free
		 */
		tx->cached_cons = *tx->consumer + xsk->config.tx_size;
	}
	xsk->tx = tx;

	sxdp.sxdp_family = PF_XDP;
	sxdp.sxdp_ifindex = ctx->ifindex;
	sxdp.sxdp_queue_id = ctx->queue_id;
	if (umem->refcount > 1) {
		sxdp.sxdp_flags |= XDP_SHARED_UMEM;
		sxdp.sxdp_shared_umem_fd = umem->fd;
	} else {
		sxdp.sxdp_flags = xsk->config.bind_flags;
	}

	err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
	if (err) {
		err = -errno;
		goto out_mmap_tx;
	}

	ctx->prog_fd = -1;

	if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
		err = xsk_setup_xdp_prog(xsk);
		if (err)
			goto out_mmap_tx;
	}

	*xsk_ptr = xsk;
	umem->fill_save = NULL;
	umem->comp_save = NULL;
	return 0;

out_mmap_tx:
	if (tx)
		munmap(tx_map, off.tx.desc +
		       xsk->config.tx_size * sizeof(struct xdp_desc));
out_mmap_rx:
	if (rx)
		munmap(rx_map, off.rx.desc +
		       xsk->config.rx_size * sizeof(struct xdp_desc));
out_put_ctx:
	xsk_put_ctx(ctx, unmap);
out_socket:
	if (--umem->refcount)
		close(xsk->fd);
out_xsk_alloc:
	free(xsk);
	return err;
}

int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
		       __u32 queue_id, struct xsk_umem *umem,
		       struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
		       const struct xsk_socket_config *usr_config)
{
	if (!umem)
		return -EFAULT;

	return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem,
					 rx, tx, umem->fill_save,
					 umem->comp_save, usr_config);
}
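
/* A minimal end-to-end sketch (illustrative comment only, not part of the
 * library): create a socket with default config on queue 0 of "eth0",
 * reusing the fill/completion rings saved by xsk_umem__create(), then hand
 * all frames to the kernel via the fill ring so RX can start. "umem" and
 * "fill" are assumed to come from the umem example further up; the ring
 * accessors are the static inlines from xsk.h.
 *
 *	struct xsk_ring_cons rx;
 *	struct xsk_ring_prod tx;
 *	struct xsk_socket *xsk;
 *	__u32 idx, i, nb = XSK_RING_PROD__DEFAULT_NUM_DESCS;
 *
 *	if (xsk_socket__create(&xsk, "eth0", 0, umem, &rx, &tx, NULL))
 *		exit(EXIT_FAILURE);
 *
 *	if (xsk_ring_prod__reserve(&fill, nb, &idx) == nb) {
 *		for (i = 0; i < nb; i++)
 *			*xsk_ring_prod__fill_addr(&fill, idx + i) =
 *				i * XSK_UMEM__DEFAULT_FRAME_SIZE;
 *		xsk_ring_prod__submit(&fill, nb);
 *	}
 */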

int xsk_umem__delete(struct xsk_umem *umem)
{
	if (!umem)
		return 0;

	if (umem->refcount)
		return -EBUSY;

	close(umem->fd);
	free(umem);

	return 0;
}

void xsk_socket__delete(struct xsk_socket *xsk)
{
	size_t desc_sz = sizeof(struct xdp_desc);
	struct xdp_mmap_offsets off;
	struct xsk_umem *umem;
	struct xsk_ctx *ctx;
	int err;

	if (!xsk)
		return;

	ctx = xsk->ctx;
	umem = ctx->umem;
	if (ctx->prog_fd != -1) {
		xsk_delete_bpf_maps(xsk);
		close(ctx->prog_fd);
	}

	err = xsk_get_mmap_offsets(xsk->fd, &off);
	if (!err) {
		if (xsk->rx) {
			munmap(xsk->rx->ring - off.rx.desc,
			       off.rx.desc + xsk->config.rx_size * desc_sz);
		}
		if (xsk->tx) {
			munmap(xsk->tx->ring - off.tx.desc,
			       off.tx.desc + xsk->config.tx_size * desc_sz);
		}
	}

	xsk_put_ctx(ctx, true);

	umem->refcount--;
	/* Do not close an fd that also has an associated umem connected
	 * to it.
	 */
	if (xsk->fd != umem->fd)
		close(xsk->fd);
	free(xsk);
}