// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allows a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock_drv.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"

#define TX_BATCH_SIZE 16

static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);

bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
	return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
		READ_ONCE(xs->umem->fq);
}

void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
{
	if (umem->need_wakeup & XDP_WAKEUP_RX)
		return;

	umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
	umem->need_wakeup |= XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_set_rx_need_wakeup);

void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	if (umem->need_wakeup & XDP_WAKEUP_TX)
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	umem->need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);

void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
{
	if (!(umem->need_wakeup & XDP_WAKEUP_RX))
		return;

	umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	umem->need_wakeup &= ~XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);

void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	if (!(umem->need_wakeup & XDP_WAKEUP_TX))
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	umem->need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);

bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
{
	return umem->flags & XDP_UMEM_USES_NEED_WAKEUP;
}
EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
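
/* Purely illustrative sketch, not part of this file: a zero-copy driver's
 * napi poll loop might use the helpers above roughly as follows, where
 * fill_queue_ran_dry() stands in for whatever driver-private accounting
 * signals that no fill entries were available:
 *
 *	if (xsk_umem_uses_need_wakeup(umem)) {
 *		if (fill_queue_ran_dry())
 *			xsk_set_rx_need_wakeup(umem);
 *		else
 *			xsk_clear_rx_need_wakeup(umem);
 *	}
 *
 * Setting the flag tells user space that a wakeup syscall is required;
 * clearing it keeps the data path completely syscall free.
 */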

static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
	u64 addr;
	int err;

	addr = xp_get_handle(xskb);
	err = xskq_prod_reserve_desc(xs->rx, addr, len);
	if (err) {
		xs->rx_dropped++;
		return err;
	}

	xp_release(xskb);
	return 0;
}

static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
{
	void *from_buf, *to_buf;
	u32 metalen;

	if (unlikely(xdp_data_meta_unsupported(from))) {
		from_buf = from->data;
		to_buf = to->data;
		metalen = 0;
	} else {
		from_buf = from->data_meta;
		metalen = from->data - from->data_meta;
		to_buf = to->data - metalen;
	}

	memcpy(to_buf, from_buf, len + metalen);
}

static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
		     bool explicit_free)
{
	struct xdp_buff *xsk_xdp;
	int err;

	if (len > xsk_umem_get_rx_frame_size(xs->umem)) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	xsk_xdp = xsk_buff_alloc(xs->umem);
	if (!xsk_xdp) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	xsk_copy_xdp(xsk_xdp, xdp, len);
	err = __xsk_rcv_zc(xs, xsk_xdp, len);
	if (err) {
		xsk_buff_free(xsk_xdp);
		return err;
	}
	if (explicit_free)
		xdp_return_buff(xdp);
	return 0;
}

static bool xsk_is_bound(struct xdp_sock *xs)
{
	if (READ_ONCE(xs->state) == XSK_BOUND) {
		/* Matches smp_wmb() in bind(). */
		smp_rmb();
		return true;
	}
	return false;
}

static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
		   bool explicit_free)
{
	u32 len;

	if (!xsk_is_bound(xs))
		return -EINVAL;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	len = xdp->data_end - xdp->data;

	return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ?
		__xsk_rcv_zc(xs, xdp, len) :
		__xsk_rcv(xs, xdp, len, explicit_free);
}

static void xsk_flush(struct xdp_sock *xs)
{
	xskq_prod_submit(xs->rx);
	__xskq_cons_release(xs->umem->fq);
	sock_def_readable(&xs->sk);
}

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;

	spin_lock_bh(&xs->rx_lock);
	err = xsk_rcv(xs, xdp, false);
	xsk_flush(xs);
	spin_unlock_bh(&xs->rx_lock);
	return err;
}

int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
	int err;

	err = xsk_rcv(xs, xdp, true);
	if (err)
		return err;

	if (!xs->flush_node.prev)
		list_add(&xs->flush_node, flush_list);

	return 0;
}

void __xsk_map_flush(void)
{
	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
	struct xdp_sock *xs, *tmp;

	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
		xsk_flush(xs);
		__list_del_clearprev(&xs->flush_node);
	}
}

void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
	xskq_prod_submit_n(umem->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_umem_complete_tx);

void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
		__xskq_cons_release(xs->tx);
		xs->sk.sk_write_space(&xs->sk);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);

bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
		if (!xskq_cons_peek_desc(xs->tx, desc, umem))
			continue;

		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		if (xskq_prod_reserve_addr(umem->cq, desc->addr))
			goto out;

		xskq_cons_release(xs->tx);
		rcu_read_unlock();
		return true;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_umem_consume_tx);
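
/* Purely illustrative sketch, not part of this file: a zero-copy driver's
 * Tx side typically drains descriptors with xsk_umem_consume_tx() and, once
 * the hardware has finished with them, returns the same number through
 * xsk_umem_complete_tx() from its completion handler. hw_ring_has_room()
 * and post_to_hw() are hypothetical driver internals:
 *
 *	struct xdp_desc desc;
 *
 *	while (hw_ring_has_room() && xsk_umem_consume_tx(umem, &desc))
 *		post_to_hw(xsk_buff_raw_get_data(umem, desc.addr), desc.len);
 *	xsk_umem_consume_tx_done(umem);
 *
 * The xskq_prod_reserve_addr() call above is the backpressure point: a
 * descriptor is only handed out if its completion can be posted later.
 */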

static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
	struct net_device *dev = xs->dev;
	int err;

	rcu_read_lock();
	err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
	rcu_read_unlock();

	return err;
}

static int xsk_zc_xmit(struct xdp_sock *xs)
{
	return xsk_wakeup(xs, XDP_WAKEUP_TX);
}

static void xsk_destruct_skb(struct sk_buff *skb)
{
	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);
	unsigned long flags;

	spin_lock_irqsave(&xs->tx_completion_lock, flags);
	xskq_prod_submit_addr(xs->umem->cq, addr);
	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);

	sock_wfree(skb);
}

static int xsk_generic_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	u32 max_batch = TX_BATCH_SIZE;
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	int err = 0;

	mutex_lock(&xs->mutex);

	if (xs->queue_id >= xs->dev->real_num_tx_queues)
		goto out;

	while (xskq_cons_peek_desc(xs->tx, &desc, xs->umem)) {
		char *buffer;
		u64 addr;
		u32 len;

		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		len = desc.len;
		skb = sock_alloc_send_skb(sk, len, 1, &err);
		if (unlikely(!skb)) {
			err = -EAGAIN;
			goto out;
		}

		skb_put(skb, len);
		addr = desc.addr;
		buffer = xsk_buff_raw_get_data(xs->umem, addr);
		err = skb_store_bits(skb, 0, buffer, len);
		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		if (unlikely(err) || xskq_prod_reserve(xs->umem->cq)) {
			kfree_skb(skb);
			goto out;
		}

		skb->dev = xs->dev;
		skb->priority = sk->sk_priority;
		skb->mark = sk->sk_mark;
		skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
		skb->destructor = xsk_destruct_skb;

		err = dev_direct_xmit(skb, xs->queue_id);
		xskq_cons_release(xs->tx);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
			/* SKB completed but not sent */
			err = -EBUSY;
			goto out;
		}

		sent_frame = true;
	}

out:
	if (sent_frame)
		sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}

static int __xsk_sendmsg(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->tx))
		return -ENOBUFS;

	return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xsk_is_bound(xs)))
		return -ENXIO;
	if (unlikely(need_wait))
		return -EOPNOTSUPP;

	return __xsk_sendmsg(sk);
}
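
/* Purely illustrative, not part of this file: from user space, Tx is kicked
 * with an empty non-blocking sendto() once descriptors have been placed on
 * the Tx ring (xsk_fd is assumed to be a bound AF_XDP socket):
 *
 *	sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
 *
 * Blocking sends are rejected with -EOPNOTSUPP above, so MSG_DONTWAIT is
 * effectively mandatory.
 */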

static __poll_t xsk_poll(struct file *file, struct socket *sock,
			     struct poll_table_struct *wait)
{
	__poll_t mask = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct xdp_umem *umem;

	if (unlikely(!xsk_is_bound(xs)))
		return mask;

	umem = xs->umem;

	if (umem->need_wakeup) {
		if (xs->zc)
			xsk_wakeup(xs, umem->need_wakeup);
		else
			/* Poll needs to drive Tx also in copy mode */
			__xsk_sendmsg(sk);
	}

	if (xs->rx && !xskq_prod_is_empty(xs->rx))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (xs->tx && !xskq_cons_is_full(xs->tx))
		mask |= EPOLLOUT | EPOLLWRNORM;

	return mask;
}
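
/* Purely illustrative, not part of this file: for sockets bound with
 * XDP_USE_NEED_WAKEUP, user space is expected to test the flag raised by
 * xsk_set_rx_need_wakeup()/xsk_set_tx_need_wakeup() before issuing any
 * syscall (ring_flags and pfd are assumed user-space variables):
 *
 *	if (*ring_flags & XDP_RING_NEED_WAKEUP)
 *		poll(&pfd, 1, timeout);
 *
 * which is what keeps the fast path free of syscalls when the driver does
 * not need them.
 */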

static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	/* Make sure queue is ready before it can be seen by others */
	smp_wmb();
	WRITE_ONCE(*queue, q);
	return 0;
}

static void xsk_unbind_dev(struct xdp_sock *xs)
{
	struct net_device *dev = xs->dev;

	if (xs->state != XSK_BOUND)
		return;
	WRITE_ONCE(xs->state, XSK_UNBOUND);

	/* Wait for driver to stop using the xdp socket. */
	xdp_del_sk_umem(xs->umem, xs);
	xs->dev = NULL;
	synchronize_net();
	dev_put(dev);
}

static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
					      struct xdp_sock ***map_entry)
{
	struct xsk_map *map = NULL;
	struct xsk_map_node *node;

	*map_entry = NULL;

	spin_lock_bh(&xs->map_list_lock);
	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
					node);
	if (node) {
		WARN_ON(xsk_map_inc(node->map));
		map = node->map;
		*map_entry = node->map_entry;
	}
	spin_unlock_bh(&xs->map_list_lock);
	return map;
}

static void xsk_delete_from_maps(struct xdp_sock *xs)
{
	/* This function removes the current XDP socket from all the
	 * maps it resides in. We need to take extra care here, due to
	 * the two locks involved. Each map has a lock synchronizing
	 * updates to the entries, and each socket has a lock that
	 * synchronizes access to the list of maps (map_list). For
	 * deadlock avoidance the locks need to be taken in the order
	 * "map lock"->"socket map list lock". We start off by
	 * accessing the socket map list, and take a reference to the
	 * map to guarantee existence between the
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
	 * calls. Then we ask the map to remove the socket, which
	 * tries to remove the socket from the map. Note that there
	 * might be updates to the map between
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
	 */
	struct xdp_sock **map_entry = NULL;
	struct xsk_map *map;

	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
		xsk_map_try_sock_delete(map, xs, map_entry);
		xsk_map_put(map);
	}
}

static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	mutex_lock(&net->xdp.lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	xsk_delete_from_maps(xs);
	mutex_lock(&xs->mutex);
	xsk_unbind_dev(xs);
	mutex_unlock(&xs->mutex);

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}

static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	u32 flags, qid;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	flags = sxdp->sxdp_flags;
	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
		      XDP_USE_NEED_WAKEUP))
		return -EINVAL;

	rtnl_lock();
	mutex_lock(&xs->mutex);
	if (xs->state != XSK_READY) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	qid = sxdp->sxdp_queue_id;

	if (flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
		    (flags & XDP_USE_NEED_WAKEUP)) {
			/* Cannot specify flags for shared sockets. */
			err = -EINVAL;
			goto out_unlock;
		}

		if (xs->umem) {
			/* We have already our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!xsk_is_bound(umem_xs)) {
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		}
		if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
			err = -EINVAL;
			sockfd_put(sock);
			goto out_unlock;
		}

		xdp_get_umem(umem_xs->umem);
		WRITE_ONCE(xs->umem, umem_xs->umem);
		sockfd_put(sock);
	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
		if (err)
			goto out_unlock;
	}

	xs->dev = dev;
	xs->zc = xs->umem->zc;
	xs->queue_id = qid;
	xdp_add_sk_umem(xs->umem, xs);

out_unlock:
	if (err) {
		dev_put(dev);
	} else {
		/* Matches smp_rmb() in bind() for shared umem
		 * sockets, and xsk_is_bound().
		 */
		smp_wmb();
		WRITE_ONCE(xs->state, XSK_BOUND);
	}
out_release:
	mutex_unlock(&xs->mutex);
	rtnl_unlock();
	return err;
}
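
/* Purely illustrative, not part of this file: the user-space sequence that
 * leads up to this bind() is roughly the following (error handling omitted;
 * umem_area, size, ifindex and the ring size n are assumptions):
 *
 *	int fd = socket(AF_XDP, SOCK_RAW, 0);
 *	struct xdp_umem_reg mr = { .addr = (__u64)umem_area, .len = size,
 *				   .chunk_size = 2048, .headroom = 0 };
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &n, sizeof(n));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &n, sizeof(n));
 *	setsockopt(fd, SOL_XDP, XDP_RX_RING, &n, sizeof(n));
 *	... mmap() the rings using the offsets from XDP_MMAP_OFFSETS ...
 *	struct sockaddr_xdp sxdp = { .sxdp_family = AF_XDP,
 *				     .sxdp_ifindex = ifindex,
 *				     .sxdp_queue_id = 0 };
 *	bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
 *
 * The fill and completion rings must exist before bind() for a socket that
 * brings its own umem, which is what xdp_umem_validate_queues() checks.
 */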

struct xdp_umem_reg_v1 {
	__u64 addr; /* Start of packet data area */
	__u64 len; /* Length of packet data area */
	__u32 chunk_size;
	__u32 headroom;
};

static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		if (!err && optname == XDP_TX_RING)
			/* Tx needs to be explicitly woken up the first time */
			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		size_t mr_size = sizeof(struct xdp_umem_reg);
		struct xdp_umem_reg mr = {};
		struct xdp_umem *umem;

		if (optlen < sizeof(struct xdp_umem_reg_v1))
			return -EINVAL;
		else if (optlen < sizeof(mr))
			mr_size = sizeof(struct xdp_umem_reg_v1);

		if (copy_from_user(&mr, optval, mr_size))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY || xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		umem = xdp_umem_create(&mr);
		if (IS_ERR(umem)) {
			mutex_unlock(&xs->mutex);
			return PTR_ERR(umem);
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();
		WRITE_ONCE(xs->umem, umem);
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		if (!xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EINVAL;
		}

		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
			&xs->umem->cq;
		err = xsk_init_queue(entries, q, true);
		if (optname == XDP_UMEM_FILL_RING)
			xp_set_fq(xs->umem->pool, *q);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}

static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
}

static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_umem_ring, desc);
}

static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats;

		if (len < sizeof(stats))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, sizeof(stats)))
			return -EFAULT;
		if (put_user(sizeof(stats), optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;
		struct xdp_mmap_offsets_v1 off_v1;
		bool flags_supported = true;
		void *to_copy;

		if (len < sizeof(off_v1))
			return -EINVAL;
		else if (len < sizeof(off))
			flags_supported = false;

		if (flags_supported) {
			/* xdp_ring_offset is identical to xdp_ring_offset_v1
			 * except for the flags field added to the end.
			 */
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.rx);
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.tx);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.fr);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.cr);
			off.rx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.tx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.fr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);
			off.cr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);

			len = sizeof(off);
			to_copy = &off;
		} else {
			xsk_enter_rxtx_offsets(&off_v1.rx);
			xsk_enter_rxtx_offsets(&off_v1.tx);
			xsk_enter_umem_offsets(&off_v1.fr);
			xsk_enter_umem_offsets(&off_v1.cr);

			len = sizeof(off_v1);
			to_copy = &off_v1;
		}

		if (copy_to_user(optval, to_copy, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_OPTIONS:
	{
		struct xdp_options opts = {};

		if (len < sizeof(opts))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		if (xs->zc)
			opts.flags |= XDP_OPTIONS_ZEROCOPY;
		mutex_unlock(&xs->mutex);

		len = sizeof(opts);
		if (copy_to_user(optval, &opts, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}

static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	struct xdp_umem *umem;
	unsigned long pfn;
	struct page *qpg;

	if (READ_ONCE(xs->state) != XSK_READY)
		return -EBUSY;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		umem = READ_ONCE(xs->umem);
		if (!umem)
			return -EINVAL;

		/* Matches the smp_wmb() in XDP_UMEM_REG */
		smp_rmb();
		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = READ_ONCE(umem->fq);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = READ_ONCE(umem->cq);
	}

	if (!q)
		return -EINVAL;

	/* Matches the smp_wmb() in xsk_init_queue */
	smp_rmb();
	qpg = virt_to_head_page(q->ring);
	if (size > page_size(qpg))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}

static int xsk_notifier(struct notifier_block *this,
			unsigned long msg, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct sock *sk;

	switch (msg) {
	case NETDEV_UNREGISTER:
		mutex_lock(&net->xdp.lock);
		sk_for_each(sk, &net->xdp.list) {
			struct xdp_sock *xs = xdp_sk(sk);

			mutex_lock(&xs->mutex);
			if (xs->dev == dev) {
				sk->sk_err = ENETDOWN;
				if (!sock_flag(sk, SOCK_DEAD))
					sk->sk_error_report(sk);

				xsk_unbind_dev(xs);

				/* Clear device references in umem. */
				xdp_umem_clear_dev(xs->umem);
			}
			mutex_unlock(&xs->mutex);
		}
		mutex_unlock(&net->xdp.lock);
		break;
	}
	return NOTIFY_DONE;
}

static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family		= PF_XDP,
	.owner		= THIS_MODULE,
	.release	= xsk_release,
	.bind		= xsk_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= xsk_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= xsk_mmap,
	.sendpage	= sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	xdp_put_umem(xs->umem);

	sk_refcnt_debug_dec(sk);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	struct xdp_sock *xs;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	sock_set_flag(sk, SOCK_RCU_FREE);

	xs = xdp_sk(sk);
	xs->state = XSK_READY;
	mutex_init(&xs->mutex);
	spin_lock_init(&xs->rx_lock);
	spin_lock_init(&xs->tx_completion_lock);

	INIT_LIST_HEAD(&xs->map_list);
	spin_lock_init(&xs->map_list_lock);

	mutex_lock(&net->xdp.lock);
	sk_add_node_rcu(sk, &net->xdp.list);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};

static struct notifier_block xsk_netdev_notifier = {
	.notifier_call	= xsk_notifier,
};

static int __net_init xsk_net_init(struct net *net)
{
	mutex_init(&net->xdp.lock);
	INIT_HLIST_HEAD(&net->xdp.list);
	return 0;
}

static void __net_exit xsk_net_exit(struct net *net)
{
	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}

static struct pernet_operations xsk_net_ops = {
	.init = xsk_net_init,
	.exit = xsk_net_exit,
};

static int __init xsk_init(void)
{
	int err, cpu;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	err = register_pernet_subsys(&xsk_net_ops);
	if (err)
		goto out_sk;

	err = register_netdevice_notifier(&xsk_netdev_notifier);
	if (err)
		goto out_pernet;

	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
	return 0;

out_pernet:
	unregister_pernet_subsys(&xsk_net_ops);
out_sk:
	sock_unregister(PF_XDP);
out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);