// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */
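
/* For orientation, a rough copy-mode user-space setup sequence (details and
 * error handling omitted) that exercises the handlers in this file:
 *
 *	fd = socket(AF_XDP, SOCK_RAW, 0);
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &entries, sizeof(entries));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &entries, sizeof(entries));
 *	setsockopt(fd, SOL_XDP, XDP_RX_RING, &entries, sizeof(entries));
 *	setsockopt(fd, SOL_XDP, XDP_TX_RING, &entries, sizeof(entries));
 *	mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, XDP_PGOFF_RX_RING);
 *	bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
 *
 * xsk_create() creates the socket, xsk_setsockopt() registers the umem and
 * rings, xsk_mmap() maps the rings into user space and xsk_bind() attaches
 * the socket to a device queue.
 */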

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"

#define TX_BATCH_SIZE 16

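/* True once the RX ring, umem and fill queue are all in place, i.e. the
 * socket is fully set up and may be referenced from an XSKMAP.
 */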
bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
	return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
		READ_ONCE(xs->umem->fq);
}

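/* Fill queue accessors exported for driver use: check for, peek at and
 * consume umem addresses that user space has posted on the fill ring.
 */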
bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
{
	return xskq_has_addrs(umem->fq, cnt);
}
EXPORT_SYMBOL(xsk_umem_has_addrs);

u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
	return xskq_peek_addr(umem->fq, addr);
}
EXPORT_SYMBOL(xsk_umem_peek_addr);

void xsk_umem_discard_addr(struct xdp_umem *umem)
{
	xskq_discard_addr(umem->fq);
}
EXPORT_SYMBOL(xsk_umem_discard_addr);

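/* Copy-mode RX: reserve a frame from the fill queue, copy the packet and
 * its metadata into the umem and post a descriptor on the RX ring. The
 * xdp_buff is returned to the driver once the copy has been made.
 */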
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	void *to_buf, *from_buf;
	u32 metalen;
	u64 addr;
	int err;

	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	addr += xs->umem->headroom;

	if (unlikely(xdp_data_meta_unsupported(xdp))) {
		from_buf = xdp->data;
		metalen = 0;
	} else {
		from_buf = xdp->data_meta;
		metalen = xdp->data - xdp->data_meta;
	}

	to_buf = xdp_umem_get_data(xs->umem, addr);
	memcpy(to_buf, from_buf, len + metalen);
	addr += metalen;
	err = xskq_produce_batch_desc(xs->rx, addr, len);
	if (!err) {
		xskq_discard_addr(xs->umem->fq);
		xdp_return_buff(xdp);
		return 0;
	}

	xs->rx_dropped++;
	return err;
}

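/* Zero-copy RX: the frame already lives in the umem, so only a descriptor
 * pointing at the buffer handle needs to be posted on the RX ring.
 */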
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);

	if (err)
		xs->rx_dropped++;

	return err;
}

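/* RX entry point for packets redirected to an XDP socket. Dispatches to the
 * zero-copy or copy path depending on how the driver registered its RX
 * memory model.
 */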
int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 len;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	len = xdp->data_end - xdp->data;

	return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
		__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
}

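/* Publish any pending RX descriptors to user space and wake up the socket. */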
void xsk_flush(struct xdp_sock *xs)
{
	xskq_produce_flush_desc(xs->rx);
	xs->sk.sk_data_ready(&xs->sk);
}

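/* Copy-mode RX used on the generic (skb-based) XDP receive path;
 * xs->rx_lock serializes concurrent callers.
 */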
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 metalen = xdp->data - xdp->data_meta;
	u32 len = xdp->data_end - xdp->data;
	void *buffer;
	u64 addr;
	int err;

	spin_lock_bh(&xs->rx_lock);

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) {
		err = -EINVAL;
		goto out_unlock;
	}

	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
		err = -ENOSPC;
		goto out_drop;
	}

	addr += xs->umem->headroom;

	buffer = xdp_umem_get_data(xs->umem, addr);
	memcpy(buffer, xdp->data_meta, len + metalen);
	addr += metalen;
	err = xskq_produce_batch_desc(xs->rx, addr, len);
	if (err)
		goto out_drop;

	xskq_discard_addr(xs->umem->fq);
	xskq_produce_flush_desc(xs->rx);

	spin_unlock_bh(&xs->rx_lock);

	xs->sk.sk_data_ready(&xs->sk);
	return 0;

out_drop:
	xs->rx_dropped++;
out_unlock:
	spin_unlock_bh(&xs->rx_lock);
	return err;
}

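/* TX completion helpers for drivers: xsk_umem_complete_tx() publishes
 * finished transmissions on the completion ring and
 * xsk_umem_consume_tx_done() wakes every socket sharing the umem.
 */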
void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
	xskq_produce_flush_addr_n(umem->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_umem_complete_tx);

void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		xs->sk.sk_write_space(&xs->sk);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);

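/* Fetch the next TX descriptor from any socket bound to this umem. The
 * frame address is queued (lazily) on the completion ring before the
 * descriptor is handed to the driver, so a completion slot is guaranteed.
 */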
bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		if (!xskq_peek_desc(xs->tx, desc))
			continue;

		if (xskq_produce_addr_lazy(umem->cq, desc->addr))
			goto out;

		xskq_discard_desc(xs->tx);
		rcu_read_unlock();
		return true;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_umem_consume_tx);

static int xsk_zc_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev = xs->dev;

	return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
}

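/* skb destructor for copy-mode TX: once the skb has been consumed by the
 * driver, post the frame address on the completion ring so that user space
 * can reuse the buffer.
 */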
static void xsk_destruct_skb(struct sk_buff *skb)
{
	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);
	unsigned long flags;

	spin_lock_irqsave(&xs->tx_completion_lock, flags);
	WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);

	sock_wfree(skb);
}

static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
			    size_t total_len)
{
	u32 max_batch = TX_BATCH_SIZE;
	struct xdp_sock *xs = xdp_sk(sk);
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	int err = 0;

	mutex_lock(&xs->mutex);

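	/* The queue id we are bound to must still be a valid TX queue;
	 * it may have shrunk after a device reconfiguration.
	 */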
	if (xs->queue_id >= xs->dev->real_num_tx_queues)
		goto out;

	while (xskq_peek_desc(xs->tx, &desc)) {
		char *buffer;
		u64 addr;
		u32 len;

		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		len = desc.len;
		skb = sock_alloc_send_skb(sk, len, 1, &err);
		if (unlikely(!skb)) {
			err = -EAGAIN;
			goto out;
		}

		skb_put(skb, len);
		addr = desc.addr;
		buffer = xdp_umem_get_data(xs->umem, addr);
		err = skb_store_bits(skb, 0, buffer, len);
		if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) {
			kfree_skb(skb);
			goto out;
		}

		skb->dev = xs->dev;
		skb->priority = sk->sk_priority;
		skb->mark = sk->sk_mark;
		skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
		skb->destructor = xsk_destruct_skb;

		err = dev_direct_xmit(skb, xs->queue_id);
		xskq_discard_desc(xs->tx);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
			/* SKB completed but not sent */
			err = -EBUSY;
			goto out;
		}

		sent_frame = true;
	}

out:
	if (sent_frame)
		sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xs->dev))
		return -ENXIO;
	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->tx))
		return -ENOBUFS;
	if (need_wait)
		return -EOPNOTSUPP;

	return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
}

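/* In addition to the normal datagram poll, report POLLIN when the RX ring
 * has descriptors to consume and POLLOUT while the TX ring has room.
 */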
static unsigned int xsk_poll(struct file *file, struct socket *sock,
			     struct poll_table_struct *wait)
{
	unsigned int mask = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (xs->rx && !xskq_empty_desc(xs->rx))
		mask |= POLLIN | POLLRDNORM;
	if (xs->tx && !xskq_full_desc(xs->tx))
		mask |= POLLOUT | POLLWRNORM;

	return mask;
}

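/* Allocate a descriptor or umem ring with a power-of-2 number of entries
 * and publish it; the write barrier pairs with the read barrier in
 * xsk_mmap().
 */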
static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	/* Make sure queue is ready before it can be seen by others */
	smp_wmb();
	*queue = q;
	return 0;
}

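/* Detach the socket from its device and queue, waiting for any driver that
 * is still using the socket to finish before the device reference is
 * dropped.
 */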
static void xsk_unbind_dev(struct xdp_sock *xs)
{
	struct net_device *dev = xs->dev;

	if (!dev || xs->state != XSK_BOUND)
		return;

	xs->state = XSK_UNBOUND;

	/* Wait for driver to stop using the xdp socket. */
	xdp_del_sk_umem(xs->umem, xs);
	xs->dev = NULL;
	synchronize_net();
	dev_put(dev);
}

static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	mutex_lock(&net->xdp.lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	xsk_unbind_dev(xs);

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}

static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	u32 flags, qid;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	flags = sxdp->sxdp_flags;
	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY))
		return -EINVAL;

	mutex_lock(&xs->mutex);
	if (xs->state != XSK_READY) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	qid = sxdp->sxdp_queue_id;

	if (flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
			/* Cannot specify flags for shared sockets. */
			err = -EINVAL;
			goto out_unlock;
		}

		if (xs->umem) {
			/* We have already our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!umem_xs->umem) {
			/* No umem to inherit. */
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
			err = -EINVAL;
			sockfd_put(sock);
			goto out_unlock;
		}

		xdp_get_umem(umem_xs->umem);
		xs->umem = umem_xs->umem;
		sockfd_put(sock);
	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xskq_set_umem(xs->umem->fq, xs->umem->size,
			      xs->umem->chunk_mask);
		xskq_set_umem(xs->umem->cq, xs->umem->size,
			      xs->umem->chunk_mask);

		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
		if (err)
			goto out_unlock;
	}

	xs->dev = dev;
	xs->zc = xs->umem->zc;
	xs->queue_id = qid;
	xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
	xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
	xdp_add_sk_umem(xs->umem, xs);

out_unlock:
	if (err)
		dev_put(dev);
	else
		xs->state = XSK_BOUND;
out_release:
	mutex_unlock(&xs->mutex);
	return err;
}

static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		struct xdp_umem_reg mr;
		struct xdp_umem *umem;

		if (copy_from_user(&mr, optval, sizeof(mr)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY || xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		umem = xdp_umem_create(&mr);
		if (IS_ERR(umem)) {
			mutex_unlock(&xs->mutex);
			return PTR_ERR(umem);
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();
		xs->umem = umem;
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		if (!xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EINVAL;
		}

		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
			&xs->umem->cq;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}

static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats;

		if (len < sizeof(stats))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, sizeof(stats)))
			return -EFAULT;
		if (put_user(sizeof(stats), optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;

		if (len < sizeof(off))
			return -EINVAL;

		off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
		off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
		off.rx.desc	= offsetof(struct xdp_rxtx_ring, desc);
		off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
		off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
		off.tx.desc	= offsetof(struct xdp_rxtx_ring, desc);

		off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
		off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
		off.fr.desc	= offsetof(struct xdp_umem_ring, desc);
		off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
		off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
		off.cr.desc	= offsetof(struct xdp_umem_ring, desc);

		len = sizeof(off);
		if (copy_to_user(optval, &off, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_OPTIONS:
	{
		struct xdp_options opts = {};

		if (len < sizeof(opts))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		if (xs->zc)
			opts.flags |= XDP_OPTIONS_ZEROCOPY;
		mutex_unlock(&xs->mutex);

		len = sizeof(opts);
		if (copy_to_user(optval, &opts, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}

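/* Map one of the four rings into user space; the ring is selected by the
 * page offset passed to mmap() (XDP_PGOFF_*_RING / XDP_UMEM_PGOFF_*_RING).
 */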
static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	struct xdp_umem *umem;
	unsigned long pfn;
	struct page *qpg;

	if (xs->state != XSK_READY)
		return -EBUSY;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		umem = READ_ONCE(xs->umem);
		if (!umem)
			return -EINVAL;

		/* Matches the smp_wmb() in XDP_UMEM_REG */
		smp_rmb();
		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = READ_ONCE(umem->fq);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = READ_ONCE(umem->cq);
	}

	if (!q)
		return -EINVAL;

	/* Matches the smp_wmb() in xsk_init_queue */
	smp_rmb();
	qpg = virt_to_head_page(q->ring);
	if (size > (PAGE_SIZE << compound_order(qpg)))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}

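/* On NETDEV_UNREGISTER, report ENETDOWN to every socket bound to the
 * disappearing device and drop the device references held by the socket
 * and its umem.
 */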
static int xsk_notifier(struct notifier_block *this,
			unsigned long msg, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct sock *sk;

	switch (msg) {
	case NETDEV_UNREGISTER:
		mutex_lock(&net->xdp.lock);
		sk_for_each(sk, &net->xdp.list) {
			struct xdp_sock *xs = xdp_sk(sk);

			mutex_lock(&xs->mutex);
			if (xs->dev == dev) {
				sk->sk_err = ENETDOWN;
				if (!sock_flag(sk, SOCK_DEAD))
					sk->sk_error_report(sk);

				xsk_unbind_dev(xs);

				/* Clear device references in umem. */
				xdp_umem_clear_dev(xs->umem);
			}
			mutex_unlock(&xs->mutex);
		}
		mutex_unlock(&net->xdp.lock);
		break;
	}
	return NOTIFY_DONE;
}

static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family		= PF_XDP,
	.owner		= THIS_MODULE,
	.release	= xsk_release,
	.bind		= xsk_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= xsk_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= xsk_mmap,
	.sendpage	= sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	xdp_put_umem(xs->umem);

	sk_refcnt_debug_dec(sk);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	struct xdp_sock *xs;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	sock_set_flag(sk, SOCK_RCU_FREE);

	xs = xdp_sk(sk);
	xs->state = XSK_READY;
	mutex_init(&xs->mutex);
	spin_lock_init(&xs->rx_lock);
	spin_lock_init(&xs->tx_completion_lock);

	mutex_lock(&net->xdp.lock);
	sk_add_node_rcu(sk, &net->xdp.list);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};

static struct notifier_block xsk_netdev_notifier = {
	.notifier_call	= xsk_notifier,
};

static int __net_init xsk_net_init(struct net *net)
{
	mutex_init(&net->xdp.lock);
	INIT_HLIST_HEAD(&net->xdp.list);
	return 0;
}

static void __net_exit xsk_net_exit(struct net *net)
{
	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}

static struct pernet_operations xsk_net_ops = {
	.init = xsk_net_init,
	.exit = xsk_net_exit,
};

static int __init xsk_init(void)
{
	int err;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	err = register_pernet_subsys(&xsk_net_ops);
	if (err)
		goto out_sk;

	err = register_netdevice_notifier(&xsk_netdev_notifier);
	if (err)
		goto out_pernet;

	return 0;

out_pernet:
	unregister_pernet_subsys(&xsk_net_ops);
out_sk:
	sock_unregister(PF_XDP);
out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);