// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2017 - 2018 Intel Corporation. */

#include <asm/barrier.h>
#include <errno.h>
#include <getopt.h>
#include <libgen.h>
#include <linux/bpf.h>
#include <linux/compiler.h>
#include <linux/if_link.h>
#include <linux/if_xdp.h>
#include <linux/if_ether.h>
#include <locale.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <poll.h>
#include <pthread.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>

#include "libbpf.h"
#include "xsk.h"
#include "xdpsock.h"
#include <bpf/bpf.h>

#ifndef SOL_XDP
#define SOL_XDP 283
#endif

#ifndef AF_XDP
#define AF_XDP 44
#endif

#ifndef PF_XDP
#define PF_XDP AF_XDP
#endif

47
#define NUM_FRAMES (4 * 1024)
48 49 50

#define DEBUG_HEXDUMP 0

B
Björn Töpel 已提交
51
typedef __u64 u64;
52 53 54 55 56 57 58 59 60 61 62
typedef __u32 u32;

static unsigned long prev_time;

enum benchmark_type {
	BENCH_RXDROP = 0,
	BENCH_TXONLY = 1,
	BENCH_L2FWD = 2,
};

static enum benchmark_type opt_bench = BENCH_RXDROP;
63
static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
64 65 66
static const char *opt_if = "";
static int opt_ifindex;
static int opt_queue;
67 68 69
static unsigned long opt_duration;
static unsigned long start_time;
static bool benchmark_done;
70
static u32 opt_batch_size = 64;
71
static int opt_pkt_count;
72 73
static int opt_poll;
static int opt_interval = 1;
74
static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP;
75 76
static u32 opt_umem_flags;
static int opt_unaligned_chunks;
77
static int opt_mmap_flags;
78
static u32 opt_xdp_bind_flags;
79
static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
80 81
static int opt_timeout = 1000;
static bool opt_need_wakeup = true;
82 83
static u32 opt_num_xsks = 1;
static u32 prog_id;
84

85 86 87 88 89
struct xsk_umem_info {
	struct xsk_ring_prod fq;
	struct xsk_ring_cons cq;
	struct xsk_umem *umem;
	void *buffer;
90 91
};

92 93 94 95 96
struct xsk_socket_info {
	struct xsk_ring_cons rx;
	struct xsk_ring_prod tx;
	struct xsk_umem_info *umem;
	struct xsk_socket *xsk;
97 98 99 100
	unsigned long rx_npkts;
	unsigned long tx_npkts;
	unsigned long prev_rx_npkts;
	unsigned long prev_tx_npkts;
101
	u32 outstanding_tx;
102 103 104
};

static int num_socks;
105
struct xsk_socket_info *xsks[MAX_SOCKS];
106 107 108 109 110 111 112 113 114

static unsigned long get_nsecs(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000UL + ts.tv_nsec;
}

115
static void print_benchmark(bool running)
116
{
117
	const char *bench_str = "INVALID";
118

119 120 121 122 123 124
	if (opt_bench == BENCH_RXDROP)
		bench_str = "rxdrop";
	else if (opt_bench == BENCH_TXONLY)
		bench_str = "txonly";
	else if (opt_bench == BENCH_L2FWD)
		bench_str = "l2fwd";
125

126 127 128 129 130 131 132
	printf("%s:%d %s ", opt_if, opt_queue, bench_str);
	if (opt_xdp_flags & XDP_FLAGS_SKB_MODE)
		printf("xdp-skb ");
	else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE)
		printf("xdp-drv ");
	else
		printf("	");
133

134 135
	if (opt_poll)
		printf("poll() ");
136

137 138 139
	if (running) {
		printf("running...");
		fflush(stdout);
140 141 142
	}
}

143
static void dump_stats(void)
144
{
145 146 147
	unsigned long now = get_nsecs();
	long dt = now - prev_time;
	int i;
148

149
	prev_time = now;
150

151 152 153
	for (i = 0; i < num_socks && xsks[i]; i++) {
		char *fmt = "%-15s %'-11.0f %'-11lu\n";
		double rx_pps, tx_pps;
154

155 156 157 158
		rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) *
			 1000000000. / dt;
		tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) *
			 1000000000. / dt;
159

160 161 162
		printf("\n sock%d@", i);
		print_benchmark(false);
		printf("\n");
163

164 165 166 167
		printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts",
		       dt / 1000000000.);
		printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts);
		printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts);
168

169 170
		xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts;
		xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts;
171 172 173
	}
}

174 175 176 177 178 179 180 181 182 183 184
static bool is_benchmark_done(void)
{
	if (opt_duration > 0) {
		unsigned long dt = (get_nsecs() - start_time);

		if (dt >= opt_duration)
			benchmark_done = true;
	}
	return benchmark_done;
}

185
static void *poller(void *arg)
186
{
187
	(void)arg;
188
	while (!is_benchmark_done()) {
189 190
		sleep(opt_interval);
		dump_stats();
191 192
	}

193
	return NULL;
194 195
}

196
static void remove_xdp_program(void)
197
{
198
	u32 curr_prog_id = 0;
199

200 201 202
	if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) {
		printf("bpf_get_link_xdp_id failed\n");
		exit(EXIT_FAILURE);
203
	}
204 205 206 207 208 209
	if (prog_id == curr_prog_id)
		bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
	else if (!curr_prog_id)
		printf("couldn't find a prog id on a given interface\n");
	else
		printf("program on interface changed, not removing\n");
210 211
}

212
static void int_exit(int sig)
213 214 215 216 217
{
	benchmark_done = true;
}

static void xdpsock_cleanup(void)
218
{
219
	struct xsk_umem *umem = xsks[0]->umem->umem;
220
	int i;
221

222
	dump_stats();
223 224
	for (i = 0; i < num_socks; i++)
		xsk_socket__delete(xsks[i]->xsk);
225 226
	(void)xsk_umem__delete(umem);
	remove_xdp_program();
227 228
}

/* Report a fatal error with its source location, dump final statistics,
 * detach the XDP program and terminate.  Use via the exit_with_error()
 * macro, which fills in file/function/line.
 */
static void __exit_with_error(int error, const char *file, const char *func,
			      int line)
{
	fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func,
		line, error, strerror(error));
	dump_stats();
	remove_xdp_program();
	exit(EXIT_FAILURE);
}

#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, \
						 __LINE__)

/* Canned Ethernet/IPv4/UDP frame payload used by the txonly benchmark
 * (sizeof(pkt_data) - 1 bytes; the implicit NUL is not transmitted).
 */
static const char pkt_data[] =
	"\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00"
	"\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14"
	"\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b"
	"\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa";
/* Swap the source and destination MAC addresses of an Ethernet frame in
 * place; data must point at the start of the Ethernet header.
 */
static void swap_mac_addresses(void *data)
{
	struct ether_header *eth = (struct ether_header *)data;
	struct ether_addr *src_addr = (struct ether_addr *)&eth->ether_shost;
	struct ether_addr *dst_addr = (struct ether_addr *)&eth->ether_dhost;
	struct ether_addr tmp;

	tmp = *src_addr;
	*src_addr = *dst_addr;
	*dst_addr = tmp;
}

B
Björn Töpel 已提交
259
static void hex_dump(void *pkt, size_t length, u64 addr)
260 261 262 263 264
{
	const unsigned char *address = (unsigned char *)pkt;
	const unsigned char *line = address;
	size_t line_size = 32;
	unsigned char c;
B
Björn Töpel 已提交
265 266
	char buf[32];
	int i = 0;
267

B
Björn Töpel 已提交
268 269 270 271
	if (!DEBUG_HEXDUMP)
		return;

	sprintf(buf, "addr=%llu", addr);
272
	printf("length = %zu\n", length);
B
Björn Töpel 已提交
273
	printf("%s | ", buf);
274 275 276 277 278 279 280 281 282 283 284 285 286 287
	while (length-- > 0) {
		printf("%02X ", *address++);
		if (!(++i % line_size) || (length == 0 && i % line_size)) {
			if (length == 0) {
				while (i++ % line_size)
					printf("__ ");
			}
			printf(" | ");	/* right close */
			while (line < address) {
				c = *line++;
				printf("%c", (c < 33 || c == 255) ? 0x2E : c);
			}
			printf("\n");
			if (length > 0)
B
Björn Töpel 已提交
288
				printf("%s | ", buf);
289 290 291 292 293
		}
	}
	printf("\n");
}

294
static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr)
295
{
296 297
	memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data,
	       sizeof(pkt_data) - 1);
298 299
}

300
static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
301
{
302
	struct xsk_umem_info *umem;
303 304 305 306 307
	struct xsk_umem_config cfg = {
		.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
		.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
		.frame_size = opt_xsk_frame_size,
		.frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM,
308
		.flags = opt_umem_flags
309
	};
310
	int ret;
311 312

	umem = calloc(1, sizeof(*umem));
313 314
	if (!umem)
		exit_with_error(errno);
315

316
	ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq,
317
			       &cfg);
318 319
	if (ret)
		exit_with_error(-ret);
320

321 322 323 324 325 326 327 328 329
	umem->buffer = buffer;
	return umem;
}

/* Hand the first XSK_RING_PROD__DEFAULT_NUM_DESCS frames of the umem to
 * the kernel via the fill ring so the driver has rx buffers to use.
 */
static void xsk_populate_fill_ring(struct xsk_umem_info *umem)
{
	int ret, i;
	u32 idx;

	ret = xsk_ring_prod__reserve(&umem->fq,
				     XSK_RING_PROD__DEFAULT_NUM_DESCS, &idx);
	if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS)
		exit_with_error(-ret);
	for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++)
		*xsk_ring_prod__fill_addr(&umem->fq, idx++) =
			i * opt_xsk_frame_size;
	xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS);
}

340 341
static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem,
						    bool rx, bool tx)
342
{
343 344
	struct xsk_socket_config cfg;
	struct xsk_socket_info *xsk;
345 346
	struct xsk_ring_cons *rxr;
	struct xsk_ring_prod *txr;
347
	int ret;
348 349

	xsk = calloc(1, sizeof(*xsk));
350 351 352 353 354 355
	if (!xsk)
		exit_with_error(errno);

	xsk->umem = umem;
	cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
	cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
356 357 358 359
	if (opt_num_xsks > 1)
		cfg.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD;
	else
		cfg.libbpf_flags = 0;
360 361
	cfg.xdp_flags = opt_xdp_flags;
	cfg.bind_flags = opt_xdp_bind_flags;
362

363 364 365 366
	rxr = rx ? &xsk->rx : NULL;
	txr = tx ? &xsk->tx : NULL;
	ret = xsk_socket__create(&xsk->xsk, opt_if, opt_queue, umem->umem,
				 rxr, txr, &cfg);
367 368 369 370 371 372 373
	if (ret)
		exit_with_error(-ret);

	ret = bpf_get_link_xdp_id(opt_ifindex, &prog_id, opt_xdp_flags);
	if (ret)
		exit_with_error(-ret);

374 375 376 377 378 379 380 381 382 383 384 385 386
	return xsk;
}

/* Long command-line options; each maps to the matching short option. */
static struct option long_options[] = {
	{"rxdrop", no_argument, 0, 'r'},
	{"txonly", no_argument, 0, 't'},
	{"l2fwd", no_argument, 0, 'l'},
	{"interface", required_argument, 0, 'i'},
	{"queue", required_argument, 0, 'q'},
	{"poll", no_argument, 0, 'p'},
	{"xdp-skb", no_argument, 0, 'S'},
	{"xdp-native", no_argument, 0, 'N'},
	{"interval", required_argument, 0, 'n'},
	{"zero-copy", no_argument, 0, 'z'},
	{"copy", no_argument, 0, 'c'},
	{"frame-size", required_argument, 0, 'f'},
	{"no-need-wakeup", no_argument, 0, 'm'},
	{"unaligned", no_argument, 0, 'u'},
	{"shared-umem", no_argument, 0, 'M'},
	{"force", no_argument, 0, 'F'},
	{"duration", required_argument, 0, 'd'},
	{"batch-size", required_argument, 0, 'b'},
	{"tx-pkt-count", required_argument, 0, 'C'},
	{0, 0, 0, 0}
};

static void usage(const char *prog)
{
	const char *str =
		"  Usage: %s [OPTIONS]\n"
		"  Options:\n"
		"  -r, --rxdrop		Discard all incoming packets (default)\n"
		"  -t, --txonly		Only send packets\n"
		"  -l, --l2fwd		MAC swap L2 forwarding\n"
		"  -i, --interface=n	Run on interface n\n"
		"  -q, --queue=n	Use queue n (default 0)\n"
		"  -p, --poll		Use poll syscall\n"
		"  -S, --xdp-skb=n	Use XDP skb-mod\n"
412
		"  -N, --xdp-native=n	Enforce XDP native mode\n"
413
		"  -n, --interval=n	Specify statistics update interval (default 1 sec).\n"
414 415
		"  -z, --zero-copy      Force zero-copy mode.\n"
		"  -c, --copy           Force copy mode.\n"
416
		"  -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n"
417 418
		"  -f, --frame-size=n   Set the frame size (must be a power of two in aligned mode, default is %d).\n"
		"  -u, --unaligned	Enable unaligned chunk placement\n"
419
		"  -M, --shared-umem	Enable XDP_SHARED_UMEM\n"
420
		"  -F, --force		Force loading the XDP prog\n"
421 422
		"  -d, --duration=n	Duration in secs to run command.\n"
		"			Default: forever.\n"
423 424
		"  -b, --batch-size=n	Batch size for sending or receiving\n"
		"			packets. Default: %d\n"
425 426
		"  -C, --tx-pkt-count=n	Number of packets to send.\n"
		"			Default: Continuous packets.\n"
427
		"\n";
428 429
	fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE,
		opt_batch_size);
430 431 432 433 434 435 436 437 438 439
	exit(EXIT_FAILURE);
}

static void parse_command_line(int argc, char **argv)
{
	int option_index, c;

	opterr = 0;

	for (;;) {
440
		c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:",
441
				long_options, &option_index);
442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465
		if (c == -1)
			break;

		switch (c) {
		case 'r':
			opt_bench = BENCH_RXDROP;
			break;
		case 't':
			opt_bench = BENCH_TXONLY;
			break;
		case 'l':
			opt_bench = BENCH_L2FWD;
			break;
		case 'i':
			opt_if = optarg;
			break;
		case 'q':
			opt_queue = atoi(optarg);
			break;
		case 'p':
			opt_poll = 1;
			break;
		case 'S':
			opt_xdp_flags |= XDP_FLAGS_SKB_MODE;
466
			opt_xdp_bind_flags |= XDP_COPY;
467 468
			break;
		case 'N':
469
			/* default, set below */
470 471 472 473
			break;
		case 'n':
			opt_interval = atoi(optarg);
			break;
474 475 476 477 478 479
		case 'z':
			opt_xdp_bind_flags |= XDP_ZEROCOPY;
			break;
		case 'c':
			opt_xdp_bind_flags |= XDP_COPY;
			break;
480 481 482
		case 'u':
			opt_umem_flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG;
			opt_unaligned_chunks = 1;
483
			opt_mmap_flags = MAP_HUGETLB;
484
			break;
485 486 487
		case 'F':
			opt_xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
			break;
488 489
		case 'f':
			opt_xsk_frame_size = atoi(optarg);
490
			break;
491 492 493
		case 'm':
			opt_need_wakeup = false;
			opt_xdp_bind_flags &= ~XDP_USE_NEED_WAKEUP;
494
			break;
495 496 497
		case 'M':
			opt_num_xsks = MAX_SOCKS;
			break;
498 499 500 501
		case 'd':
			opt_duration = atoi(optarg);
			opt_duration *= 1000000000;
			break;
502 503 504
		case 'b':
			opt_batch_size = atoi(optarg);
			break;
505 506 507
		case 'C':
			opt_pkt_count = atoi(optarg);
			break;
508 509 510 511 512
		default:
			usage(basename(argv[0]));
		}
	}

513 514 515
	if (!(opt_xdp_flags & XDP_FLAGS_SKB_MODE))
		opt_xdp_flags |= XDP_FLAGS_DRV_MODE;

516 517 518 519 520 521
	opt_ifindex = if_nametoindex(opt_if);
	if (!opt_ifindex) {
		fprintf(stderr, "ERROR: interface \"%s\" does not exist\n",
			opt_if);
		usage(basename(argv[0]));
	}
522

523 524
	if ((opt_xsk_frame_size & (opt_xsk_frame_size - 1)) &&
	    !opt_unaligned_chunks) {
525 526 527 528
		fprintf(stderr, "--frame-size=%d is not a power of two\n",
			opt_xsk_frame_size);
		usage(basename(argv[0]));
	}
529 530
}

531
static void kick_tx(struct xsk_socket_info *xsk)
532 533 534
{
	int ret;

535
	ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
536
	if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || errno == EBUSY)
537
		return;
538
	exit_with_error(errno);
539 540
}

541 542
static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
				     struct pollfd *fds)
543
{
544
	struct xsk_umem_info *umem = xsk->umem;
545
	u32 idx_cq = 0, idx_fq = 0;
546 547 548 549 550 551
	unsigned int rcvd;
	size_t ndescs;

	if (!xsk->outstanding_tx)
		return;

552 553 554
	if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
		kick_tx(xsk);

555
	ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size :
556
		xsk->outstanding_tx;
557 558

	/* re-add completed Tx buffers */
559
	rcvd = xsk_ring_cons__peek(&umem->cq, ndescs, &idx_cq);
560
	if (rcvd > 0) {
561 562 563
		unsigned int i;
		int ret;

564
		ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
565 566 567
		while (ret != rcvd) {
			if (ret < 0)
				exit_with_error(-ret);
568
			if (xsk_ring_prod__needs_wakeup(&umem->fq))
569
				ret = poll(fds, num_socks, opt_timeout);
570
			ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
571
		}
572

573
		for (i = 0; i < rcvd; i++)
574 575
			*xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) =
				*xsk_ring_cons__comp_addr(&umem->cq, idx_cq++);
576 577 578

		xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
		xsk_ring_cons__release(&xsk->umem->cq, rcvd);
579 580 581 582 583
		xsk->outstanding_tx -= rcvd;
		xsk->tx_npkts += rcvd;
	}
}

584 585
static inline void complete_tx_only(struct xsk_socket_info *xsk,
				    int batch_size)
586 587
{
	unsigned int rcvd;
588
	u32 idx;
589 590 591 592

	if (!xsk->outstanding_tx)
		return;

593 594
	if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
		kick_tx(xsk);
595

596
	rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx);
597
	if (rcvd > 0) {
598
		xsk_ring_cons__release(&xsk->umem->cq, rcvd);
599 600 601 602 603
		xsk->outstanding_tx -= rcvd;
		xsk->tx_npkts += rcvd;
	}
}

604
static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
605 606
{
	unsigned int rcvd, i;
607
	u32 idx_rx = 0, idx_fq = 0;
608
	int ret;
609

610
	rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx);
611 612 613
	if (!rcvd) {
		if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
			ret = poll(fds, num_socks, opt_timeout);
614
		return;
615
	}
616

617 618 619 620
	ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
	while (ret != rcvd) {
		if (ret < 0)
			exit_with_error(-ret);
621 622
		if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
			ret = poll(fds, num_socks, opt_timeout);
623 624 625
		ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
	}

626
	for (i = 0; i < rcvd; i++) {
627 628
		u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
		u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
629 630 631
		u64 orig = xsk_umem__extract_addr(addr);

		addr = xsk_umem__add_offset_to_addr(addr);
632
		char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
633

634
		hex_dump(pkt, len, addr);
635
		*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig;
636 637
	}

638 639
	xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
	xsk_ring_cons__release(&xsk->rx, rcvd);
640 641 642 643 644
	xsk->rx_npkts += rcvd;
}

static void rx_drop_all(void)
{
645
	struct pollfd fds[MAX_SOCKS] = {};
646
	int i, ret;
647 648

	for (i = 0; i < num_socks; i++) {
649
		fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
650 651 652 653 654
		fds[i].events = POLLIN;
	}

	for (;;) {
		if (opt_poll) {
655
			ret = poll(fds, num_socks, opt_timeout);
656 657 658 659 660
			if (ret <= 0)
				continue;
		}

		for (i = 0; i < num_socks; i++)
661
			rx_drop(xsks[i], fds);
662 663 664

		if (benchmark_done)
			break;
665 666 667
	}
}

668
static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb, int batch_size)
669 670
{
	u32 idx;
671
	unsigned int i;
672

673 674 675
	while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) <
				      batch_size) {
		complete_tx_only(xsk, batch_size);
676
	}
677

678
	for (i = 0; i < batch_size; i++) {
679 680 681 682
		struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx,
								  idx + i);
		tx_desc->addr = (frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT;
		tx_desc->len = sizeof(pkt_data) - 1;
683
	}
684

685 686 687
	xsk_ring_prod__submit(&xsk->tx, batch_size);
	xsk->outstanding_tx += batch_size;
	frame_nb += batch_size;
688
	frame_nb %= NUM_FRAMES;
689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716
	complete_tx_only(xsk, batch_size);
}

/* Batch size for the next tx round: the configured batch size, clamped so
 * that a finite --tx-pkt-count is never exceeded.
 */
static inline int get_batch_size(int pkt_cnt)
{
	if (!opt_pkt_count)
		return opt_batch_size;

	if (pkt_cnt + opt_batch_size <= opt_pkt_count)
		return opt_batch_size;

	return opt_pkt_count - pkt_cnt;
}

/* Drain all sockets: keep reaping completions until no socket has
 * outstanding tx descriptors (used before exiting with --tx-pkt-count).
 */
static void complete_tx_only_all(void)
{
	bool pending;
	int i;

	do {
		pending = false;
		for (i = 0; i < num_socks; i++) {
			if (xsks[i]->outstanding_tx) {
				complete_tx_only(xsks[i], opt_batch_size);
				pending = !!xsks[i]->outstanding_tx;
			}
		}
	} while (pending);
}

719
static void tx_only_all(void)
720
{
721
	struct pollfd fds[MAX_SOCKS] = {};
722
	u32 frame_nb[MAX_SOCKS] = {};
723
	int pkt_cnt = 0;
724
	int i, ret;
725

726 727 728 729
	for (i = 0; i < num_socks; i++) {
		fds[0].fd = xsk_socket__fd(xsks[i]->xsk);
		fds[0].events = POLLOUT;
	}
730

731 732 733
	while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) {
		int batch_size = get_batch_size(pkt_cnt);

734
		if (opt_poll) {
735
			ret = poll(fds, num_socks, opt_timeout);
736 737 738
			if (ret <= 0)
				continue;

739
			if (!(fds[0].revents & POLLOUT))
740 741 742
				continue;
		}

743
		for (i = 0; i < num_socks; i++)
744 745 746
			tx_only(xsks[i], frame_nb[i], batch_size);

		pkt_cnt += batch_size;
747 748 749

		if (benchmark_done)
			break;
750
	}
751 752 753

	if (opt_pkt_count)
		complete_tx_only_all();
754 755
}

756
static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
757
{
758 759 760
	unsigned int rcvd, i;
	u32 idx_rx = 0, idx_tx = 0;
	int ret;
761

762
	complete_tx_l2fwd(xsk, fds);
763

764
	rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx);
765 766 767 768 769
	if (!rcvd) {
		if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
			ret = poll(fds, num_socks, opt_timeout);
		return;
	}
770

771 772 773 774 775 776
	ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
	while (ret != rcvd) {
		if (ret < 0)
			exit_with_error(-ret);
		if (xsk_ring_prod__needs_wakeup(&xsk->tx))
			kick_tx(xsk);
777
		ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
778 779 780 781 782
	}

	for (i = 0; i < rcvd; i++) {
		u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
		u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
783
		u64 orig = addr;
784 785

		addr = xsk_umem__add_offset_to_addr(addr);
786 787 788
		char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);

		swap_mac_addresses(pkt);
789

790
		hex_dump(pkt, len, addr);
791
		xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = orig;
792 793
		xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
	}
794

795 796
	xsk_ring_prod__submit(&xsk->tx, rcvd);
	xsk_ring_cons__release(&xsk->rx, rcvd);
797

798 799 800 801 802 803
	xsk->rx_npkts += rcvd;
	xsk->outstanding_tx += rcvd;
}

static void l2fwd_all(void)
{
804
	struct pollfd fds[MAX_SOCKS] = {};
805 806 807 808 809 810
	int i, ret;

	for (i = 0; i < num_socks; i++) {
		fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
		fds[i].events = POLLOUT | POLLIN;
	}
811

812 813 814 815 816 817
	for (;;) {
		if (opt_poll) {
			ret = poll(fds, num_socks, opt_timeout);
			if (ret <= 0)
				continue;
		}
818

819 820
		for (i = 0; i < num_socks; i++)
			l2fwd(xsks[i], fds);
821 822 823

		if (benchmark_done)
			break;
824 825 826
	}
}

827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877
static void load_xdp_program(char **argv, struct bpf_object **obj)
{
	struct bpf_prog_load_attr prog_load_attr = {
		.prog_type      = BPF_PROG_TYPE_XDP,
	};
	char xdp_filename[256];
	int prog_fd;

	snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]);
	prog_load_attr.file = xdp_filename;

	if (bpf_prog_load_xattr(&prog_load_attr, obj, &prog_fd))
		exit(EXIT_FAILURE);
	if (prog_fd < 0) {
		fprintf(stderr, "ERROR: no program found: %s\n",
			strerror(prog_fd));
		exit(EXIT_FAILURE);
	}

	if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd, opt_xdp_flags) < 0) {
		fprintf(stderr, "ERROR: link set xdp fd failed\n");
		exit(EXIT_FAILURE);
	}
}

/* Insert every socket fd into the "xsks_map" BPF map, keyed by socket
 * index, so our XDP program can redirect packets to them.
 */
static void enter_xsks_into_map(struct bpf_object *obj)
{
	struct bpf_map *map;
	int i, xsks_map;

	map = bpf_object__find_map_by_name(obj, "xsks_map");
	xsks_map = bpf_map__fd(map);
	if (xsks_map < 0) {
		fprintf(stderr, "ERROR: no xsks map found: %s\n",
			strerror(xsks_map));
		exit(EXIT_FAILURE);
	}

	for (i = 0; i < num_socks; i++) {
		int fd = xsk_socket__fd(xsks[i]->xsk);
		int key, ret;

		key = i;
		ret = bpf_map_update_elem(xsks_map, &key, &fd, 0);
		if (ret) {
			fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i);
			exit(EXIT_FAILURE);
		}
	}
}

878 879 880
int main(int argc, char **argv)
{
	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
881
	bool rx = false, tx = false;
882
	struct xsk_umem_info *umem;
883
	struct bpf_object *obj;
884
	pthread_t pt;
885
	int i, ret;
886
	void *bufs;
887 888 889 890 891 892 893 894 895

	parse_command_line(argc, argv);

	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
		fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n",
			strerror(errno));
		exit(EXIT_FAILURE);
	}

896 897 898
	if (opt_num_xsks > 1)
		load_xdp_program(argv, &obj);

899 900 901 902 903 904 905 906
	/* Reserve memory for the umem. Use hugepages if unaligned chunk mode */
	bufs = mmap(NULL, NUM_FRAMES * opt_xsk_frame_size,
		    PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS | opt_mmap_flags, -1, 0);
	if (bufs == MAP_FAILED) {
		printf("ERROR: mmap failed\n");
		exit(EXIT_FAILURE);
	}
907 908

	/* Create sockets... */
909
	umem = xsk_configure_umem(bufs, NUM_FRAMES * opt_xsk_frame_size);
910 911 912 913 914 915
	if (opt_bench == BENCH_RXDROP || opt_bench == BENCH_L2FWD) {
		rx = true;
		xsk_populate_fill_ring(umem);
	}
	if (opt_bench == BENCH_L2FWD || opt_bench == BENCH_TXONLY)
		tx = true;
916
	for (i = 0; i < opt_num_xsks; i++)
917
		xsks[num_socks++] = xsk_configure_socket(umem, rx, tx);
918

919 920 921
	if (opt_bench == BENCH_TXONLY)
		for (i = 0; i < NUM_FRAMES; i++)
			gen_eth_frame(umem, i * opt_xsk_frame_size);
922

923 924
	if (opt_num_xsks > 1 && opt_bench != BENCH_TXONLY)
		enter_xsks_into_map(obj);
925 926 927 928 929 930 931 932

	signal(SIGINT, int_exit);
	signal(SIGTERM, int_exit);
	signal(SIGABRT, int_exit);

	setlocale(LC_ALL, "");

	ret = pthread_create(&pt, NULL, poller, NULL);
933 934
	if (ret)
		exit_with_error(ret);
935 936

	prev_time = get_nsecs();
937
	start_time = prev_time;
938 939 940 941

	if (opt_bench == BENCH_RXDROP)
		rx_drop_all();
	else if (opt_bench == BENCH_TXONLY)
942
		tx_only_all();
943
	else
944
		l2fwd_all();
945

946 947
	benchmark_done = true;

948 949
	pthread_join(pt, NULL);

950 951
	xdpsock_cleanup();

952 953
	return 0;
}