// SPDX-License-Identifier: GPL-2.0-or-later
/* A network driver using virtio.
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/scatterlist.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/average.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <net/route.h>
#include <net/xdp.h>
#include <net/net_failover.h>

static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);

static bool csum = true, gso = true, napi_tx = true;
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);
module_param(napi_tx, bool, 0644);

/* FIXME: MTU in config. */
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
#define GOOD_COPY_LEN	128

#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
#define VIRTIO_XDP_HEADROOM 256

/* Separating two types of XDP xmit */
#define VIRTIO_XDP_TX		BIT(0)
#define VIRTIO_XDP_REDIR	BIT(1)

#define VIRTIO_XDP_FLAG	BIT(0)

/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
 */
DECLARE_EWMA(pkt_len, 0, 64)

#define VIRTNET_DRIVER_VERSION "1.0.0"

static const unsigned long guest_offloads[] = {
	VIRTIO_NET_F_GUEST_TSO4,
	VIRTIO_NET_F_GUEST_TSO6,
	VIRTIO_NET_F_GUEST_ECN,
	VIRTIO_NET_F_GUEST_UFO,
	VIRTIO_NET_F_GUEST_CSUM
};

#define GUEST_OFFLOAD_LRO_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
				(1ULL << VIRTIO_NET_F_GUEST_ECN)  | \
				(1ULL << VIRTIO_NET_F_GUEST_UFO))

struct virtnet_stat_desc {
	char desc[ETH_GSTRING_LEN];
	size_t offset;
};

struct virtnet_sq_stats {
	struct u64_stats_sync syncp;
	u64 packets;
	u64 bytes;
	u64 xdp_tx;
	u64 xdp_tx_drops;
	u64 kicks;
};

struct virtnet_rq_stats {
	struct u64_stats_sync syncp;
	u64 packets;
	u64 bytes;
	u64 drops;
	u64 xdp_packets;
	u64 xdp_tx;
	u64 xdp_redirects;
	u64 xdp_drops;
	u64 kicks;
};

#define VIRTNET_SQ_STAT(m)	offsetof(struct virtnet_sq_stats, m)
#define VIRTNET_RQ_STAT(m)	offsetof(struct virtnet_rq_stats, m)

static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
	{ "packets",		VIRTNET_SQ_STAT(packets) },
	{ "bytes",		VIRTNET_SQ_STAT(bytes) },
	{ "xdp_tx",		VIRTNET_SQ_STAT(xdp_tx) },
	{ "xdp_tx_drops",	VIRTNET_SQ_STAT(xdp_tx_drops) },
	{ "kicks",		VIRTNET_SQ_STAT(kicks) },
};

static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
	{ "packets",		VIRTNET_RQ_STAT(packets) },
	{ "bytes",		VIRTNET_RQ_STAT(bytes) },
	{ "drops",		VIRTNET_RQ_STAT(drops) },
	{ "xdp_packets",	VIRTNET_RQ_STAT(xdp_packets) },
	{ "xdp_tx",		VIRTNET_RQ_STAT(xdp_tx) },
	{ "xdp_redirects",	VIRTNET_RQ_STAT(xdp_redirects) },
	{ "xdp_drops",		VIRTNET_RQ_STAT(xdp_drops) },
	{ "kicks",		VIRTNET_RQ_STAT(kicks) },
};

#define VIRTNET_SQ_STATS_LEN	ARRAY_SIZE(virtnet_sq_stats_desc)
#define VIRTNET_RQ_STATS_LEN	ARRAY_SIZE(virtnet_rq_stats_desc)

/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send_queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of the send queue: output.$index */
	char name[40];

	struct virtnet_sq_stats stats;

	struct napi_struct napi;
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

	struct napi_struct napi;

	struct bpf_prog __rcu *xdp_prog;

	struct virtnet_rq_stats stats;

	/* Chain pages by the private ptr. */
	struct page *pages;

	/* Average packet length for mergeable receive buffers. */
	struct ewma_pkt_len mrg_avg_pkt_len;

	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Min single buffer size for mergeable buffers case. */
	unsigned int min_buf_len;

	/* Name of this receive queue: input.$index */
	char name[40];

	struct xdp_rxq_info xdp_rxq;
};

/* Control VQ buffers: protected by the rtnl lock */
struct control_buf {
	struct virtio_net_ctrl_hdr hdr;
	virtio_net_ctrl_ack status;
	struct virtio_net_ctrl_mq mq;
	u8 promisc;
	u8 allmulti;
	__virtio16 vid;
	__virtio64 offloads;
};

struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
	struct send_queue *sq;
	struct receive_queue *rq;
	unsigned int status;

	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

	/* # of XDP queue pairs currently used by the driver */
	u16 xdp_queue_pairs;

	/* I like... big packets and I cannot lie! */
	bool big_packets;

	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

	/* Has control virtqueue */
	bool has_cvq;

	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

	/* Packet virtio header size */
	u8 hdr_len;

	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

	/* Work struct for config space updates */
	struct work_struct config_work;

	/* Is the affinity hint set for virtqueues? */
	bool affinity_hint_set;

	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;

	struct control_buf *ctrl;

	/* Ethtool settings */
	u8 duplex;
	u32 speed;

	unsigned long guest_offloads;
	unsigned long guest_offloads_capable;

	/* failover when STANDBY feature enabled */
	struct failover *failover;
};

struct padded_vnet_hdr {
	struct virtio_net_hdr_mrg_rxbuf hdr;
	/*
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header.
	 */
	char padding[4];
};

static bool is_xdp_frame(void *ptr)
{
	return (unsigned long)ptr & VIRTIO_XDP_FLAG;
}

static void *xdp_to_ptr(struct xdp_frame *ptr)
{
	return (void *)((unsigned long)ptr | VIRTIO_XDP_FLAG);
}

static struct xdp_frame *ptr_to_xdp(void *ptr)
{
	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
}

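/* Added note (commentary, not from the original source): the send virtqueue
 * carries two kinds of tokens, regular skbs and xdp_frames queued by
 * XDP_TX/ndo_xdp_xmit.  Both pointers have more than byte alignment, so bit 0
 * (VIRTIO_XDP_FLAG) is free to act as a type tag: xdp_to_ptr() sets it and
 * ptr_to_xdp() masks it off, and the TX completion paths rely on
 * is_xdp_frame() to pick the matching free routine.
 */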
/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
	return (vq->index - 1) / 2;
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
	return vq->index / 2;
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}

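/* Illustrative example of the mapping above (added): with two queue pairs the
 * device exposes vq0=rx0, vq1=tx0, vq2=rx1, vq3=tx1 and, when a control queue
 * is negotiated, vq4 as the cvq.  So vq2txq() on virtqueue index 3 yields tx
 * queue 1, and txq2vq(1) == 3.
 */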
static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
{
	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
}

/*
 * private is used to chain pages for big packets, put the whole
 * most recently used list in the beginning for reuse
 */
static void give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end;

	/* Find end of list, sew whole thing into vi->rq.pages. */
	for (end = page; end->private; end = (struct page *)end->private);
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}

static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
{
	struct page *p = rq->pages;

	if (p) {
		rq->pages = (struct page *)p->private;
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
		p = alloc_page(gfp_mask);
	return p;
}

static void virtqueue_napi_schedule(struct napi_struct *napi,
				    struct virtqueue *vq)
{
	if (napi_schedule_prep(napi)) {
		virtqueue_disable_cb(vq);
		__napi_schedule(napi);
	}
}

static void virtqueue_napi_complete(struct napi_struct *napi,
				    struct virtqueue *vq, int processed)
{
	int opaque;

	opaque = virtqueue_enable_cb_prepare(vq);
	if (napi_complete_done(napi, processed)) {
		if (unlikely(virtqueue_poll(vq, opaque)))
			virtqueue_napi_schedule(napi, vq);
	} else {
		virtqueue_disable_cb(vq);
	}
}

static void skb_xmit_done(struct virtqueue *vq)
{
	struct virtnet_info *vi = vq->vdev->priv;
	struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;

	/* Suppress further interrupts. */
	virtqueue_disable_cb(vq);

	if (napi->weight)
		virtqueue_napi_schedule(napi, vq);
	else
		/* We were probably waiting for more output buffers. */
		netif_wake_subqueue(vi->dev, vq2txq(vq));
}

#define MRG_CTX_HEADER_SHIFT 22
static void *mergeable_len_to_ctx(unsigned int truesize,
				  unsigned int headroom)
{
	return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize);
}

static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx)
{
	return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT;
}

static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
{
	return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1);
}

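/* Illustrative note (added): for mergeable buffers the per-buffer "ctx" is not
 * a real pointer but a packed integer - the low MRG_CTX_HEADER_SHIFT (22) bits
 * hold the buffer truesize and the upper bits hold the headroom.  For example
 * mergeable_len_to_ctx(1536, 256) encodes (256 << 22) | 1536, and the two
 * accessors above recover each field on the receive path.
 */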
374
/* Called from bottom half context */
M
Michael S. Tsirkin 已提交
375 376
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
				   struct receive_queue *rq,
377
				   struct page *page, unsigned int offset,
378
				   unsigned int len, unsigned int truesize,
379
				   bool hdr_valid, unsigned int metasize)
380 381
{
	struct sk_buff *skb;
382
	struct virtio_net_hdr_mrg_rxbuf *hdr;
383
	unsigned int copy, hdr_len, hdr_padded_len;
384
	char *p;
385

386
	p = page_address(page) + offset;
387

388
	/* copy small packet so we can reuse these pages for small data */
389
	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
390 391
	if (unlikely(!skb))
		return NULL;
392

393
	hdr = skb_vnet_hdr(skb);
394

395 396
	hdr_len = vi->hdr_len;
	if (vi->mergeable_rx_bufs)
397
		hdr_padded_len = sizeof(*hdr);
398
	else
399
		hdr_padded_len = sizeof(struct padded_vnet_hdr);
400

401
	/* hdr_valid means no XDP, so we can copy the vnet header */
402 403
	if (hdr_valid)
		memcpy(hdr, p, hdr_len);
404

405
	len -= hdr_len;
406 407
	offset += hdr_padded_len;
	p += hdr_padded_len;
408

409 410 411
	copy = len;
	if (copy > skb_tailroom(skb))
		copy = skb_tailroom(skb);
412
	skb_put_data(skb, p, copy);
413

414 415 416 417 418
	if (metasize) {
		__skb_pull(skb, metasize);
		skb_metadata_set(skb, metasize);
	}

419 420
	len -= copy;
	offset += copy;
421

422 423 424 425 426 427 428 429
	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
			put_page(page);
		return skb;
	}

430 431 432 433 434 435 436
	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
437
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
438 439 440
		dev_kfree_skb(skb);
		return NULL;
	}
441
	BUG_ON(offset >= PAGE_SIZE);
442
	while (len) {
443 444 445 446
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
447 448 449
		page = (struct page *)page->private;
		offset = 0;
	}
450

451
	if (page)
452
		give_pages(rq, page);
453

454 455
	return skb;
}
456

457 458 459
static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
				   struct send_queue *sq,
				   struct xdp_frame *xdpf)
J
John Fastabend 已提交
460 461 462 463
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	int err;

464 465 466 467 468
	if (unlikely(xdpf->headroom < vi->hdr_len))
		return -EOVERFLOW;

	/* Make room for virtqueue hdr (also change xdpf->headroom?) */
	xdpf->data -= vi->hdr_len;
469
	/* Zero header and leave csum up to XDP layers */
470
	hdr = xdpf->data;
471
	memset(hdr, 0, vi->hdr_len);
472
	xdpf->len   += vi->hdr_len;
473

474
	sg_init_one(sq->sg, xdpf->data, xdpf->len);
475

476 477
	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp_to_ptr(xdpf),
				   GFP_ATOMIC);
478
	if (unlikely(err))
479
		return -ENOSPC; /* Caller handle free/refcnt */
J
John Fastabend 已提交
480

481
	return 0;
J
John Fastabend 已提交
482 483
}

484 485 486 487 488 489 490 491
static struct send_queue *virtnet_xdp_sq(struct virtnet_info *vi)
{
	unsigned int qp;

	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
	return &vi->sq[qp];
}
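/* Added note: the XDP transmit queues are the last xdp_queue_pairs entries of
 * vi->sq[], so the formula above hands each CPU its own send queue, e.g. with
 * curr_queue_pairs == 8 and xdp_queue_pairs == 4, CPU 1 maps to sq[5]
 * (8 - 4 + 1).  This assumes the XDP attach path reserves one such queue per
 * possible CPU, which is not shown in this excerpt.
 */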

492
static int virtnet_xdp_xmit(struct net_device *dev,
493
			    int n, struct xdp_frame **frames, u32 flags)
J
Jason Wang 已提交
494 495
{
	struct virtnet_info *vi = netdev_priv(dev);
496 497
	struct receive_queue *rq = vi->rq;
	struct bpf_prog *xdp_prog;
498 499
	struct send_queue *sq;
	unsigned int len;
500 501
	int packets = 0;
	int bytes = 0;
502
	int drops = 0;
T
Toshiaki Makita 已提交
503
	int kicks = 0;
504
	int ret, err;
505
	void *ptr;
506 507
	int i;

508 509 510
	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
	 * indicate XDP resources have been successfully allocated.
	 */
511
	xdp_prog = rcu_access_pointer(rq->xdp_prog);
512 513 514 515 516 517 518
	if (!xdp_prog)
		return -ENXIO;

	sq = virtnet_xdp_sq(vi);

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
		ret = -EINVAL;
519 520 521
		drops = n;
		goto out;
	}
522

523
	/* Free up any pending old buffers before queueing new ones. */
524
	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
525 526 527 528 529 530 531 532 533 534 535 536
		if (likely(is_xdp_frame(ptr))) {
			struct xdp_frame *frame = ptr_to_xdp(ptr);

			bytes += frame->len;
			xdp_return_frame(frame);
		} else {
			struct sk_buff *skb = ptr;

			bytes += skb->len;
			napi_consume_skb(skb, false);
		}
		packets++;
537
	}
538 539 540 541 542 543 544 545 546 547

	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];

		err = __virtnet_xdp_xmit_one(vi, sq, xdpf);
		if (err) {
			xdp_return_frame_rx_napi(xdpf);
			drops++;
		}
	}
548
	ret = n - drops;
549

T
Toshiaki Makita 已提交
550 551 552 553
	if (flags & XDP_XMIT_FLUSH) {
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
			kicks = 1;
	}
554 555
out:
	u64_stats_update_begin(&sq->stats.syncp);
556 557
	sq->stats.bytes += bytes;
	sq->stats.packets += packets;
558 559
	sq->stats.xdp_tx += n;
	sq->stats.xdp_tx_drops += drops;
T
Toshiaki Makita 已提交
560
	sq->stats.kicks += kicks;
561
	u64_stats_update_end(&sq->stats.syncp);
562

563
	return ret;
J
Jason Wang 已提交
564 565
}

566 567 568 569 570
static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
{
	return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
}

571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600
/* We copy the packet for XDP in the following cases:
 *
 * 1) Packet is scattered across multiple rx buffers.
 * 2) Headroom space is insufficient.
 *
 * This is inefficient but it's a temporary condition that
 * we hit right after XDP is enabled and until queue is refilled
 * with large buffers with sufficient headroom - so it should affect
 * at most queue size packets.
 * Afterwards, the conditions to enable
 * XDP should preclude the underlying device from sending packets
 * across multiple buffers (num_buf > 1), and we make sure buffers
 * have enough headroom.
 */
static struct page *xdp_linearize_page(struct receive_queue *rq,
				       u16 *num_buf,
				       struct page *p,
				       int offset,
				       int page_off,
				       unsigned int *len)
{
	struct page *page = alloc_page(GFP_ATOMIC);

	if (!page)
		return NULL;

	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
	page_off += *len;

	while (--*num_buf) {
601
		int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
602 603 604 605 606 607 608 609 610 611 612 613 614 615
		unsigned int buflen;
		void *buf;
		int off;

		buf = virtqueue_get_buf(rq->vq, &buflen);
		if (unlikely(!buf))
			goto err_buf;

		p = virt_to_head_page(buf);
		off = buf - page_address(p);

		/* guard against a misconfigured or uncooperative backend that
		 * is sending packet larger than the MTU.
		 */
616
		if ((page_off + buflen + tailroom) > PAGE_SIZE) {
617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634
			put_page(p);
			goto err_buf;
		}

		memcpy(page_address(page) + page_off,
		       page_address(p) + off, buflen);
		page_off += buflen;
		put_page(p);
	}

	/* Headroom does not contribute to packet length */
	*len = page_off - VIRTIO_XDP_HEADROOM;
	return page;
err_buf:
	__free_pages(page, 0);
	return NULL;
}

635 636 637
static struct sk_buff *receive_small(struct net_device *dev,
				     struct virtnet_info *vi,
				     struct receive_queue *rq,
638
				     void *buf, void *ctx,
J
Jason Wang 已提交
639
				     unsigned int len,
640
				     unsigned int *xdp_xmit,
641
				     struct virtnet_rq_stats *stats)
642
{
643
	struct sk_buff *skb;
644
	struct bpf_prog *xdp_prog;
645
	unsigned int xdp_headroom = (unsigned long)ctx;
646 647 648 649
	unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
	unsigned int headroom = vi->hdr_len + header_offset;
	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
650
	struct page *page = virt_to_head_page(buf);
651
	unsigned int delta = 0;
652
	struct page *xdp_page;
653
	int err;
654
	unsigned int metasize = 0;
655

656
	len -= vi->hdr_len;
657
	stats->bytes += len;
658

659 660 661
	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
662
		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
663
		struct xdp_frame *xdpf;
664
		struct xdp_buff xdp;
665
		void *orig_data;
666 667
		u32 act;

668
		if (unlikely(hdr->hdr.gso_type))
669
			goto err_xdp;
670

671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691
		if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
			int offset = buf - page_address(page) + header_offset;
			unsigned int tlen = len + vi->hdr_len;
			u16 num_buf = 1;

			xdp_headroom = virtnet_get_headroom(vi);
			header_offset = VIRTNET_RX_PAD + xdp_headroom;
			headroom = vi->hdr_len + header_offset;
			buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
				 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
			xdp_page = xdp_linearize_page(rq, &num_buf, page,
						      offset, header_offset,
						      &tlen);
			if (!xdp_page)
				goto err_xdp;

			buf = page_address(xdp_page);
			put_page(page);
			page = xdp_page;
		}

692
		xdp_init_buff(&xdp, buflen, &rq->xdp_rxq);
693 694
		xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
		xdp.data = xdp.data_hard_start + xdp_headroom;
695
		xdp.data_end = xdp.data + len;
696
		xdp.data_meta = xdp.data;
697
		orig_data = xdp.data;
698
		act = bpf_prog_run_xdp(xdp_prog, &xdp);
699
		stats->xdp_packets++;
700

701 702
		switch (act) {
		case XDP_PASS:
703
			/* Recalculate length in case bpf program changed it */
704
			delta = orig_data - xdp.data;
705
			len = xdp.data_end - xdp.data;
706
			metasize = xdp.data - xdp.data_meta;
707 708
			break;
		case XDP_TX:
709
			stats->xdp_tx++;
710
			xdpf = xdp_convert_buff_to_frame(&xdp);
711 712
			if (unlikely(!xdpf))
				goto err_xdp;
713 714
			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
			if (unlikely(err < 0)) {
715
				trace_xdp_exception(vi->dev, xdp_prog, act);
716 717
				goto err_xdp;
			}
718
			*xdp_xmit |= VIRTIO_XDP_TX;
J
Jason Wang 已提交
719 720 721
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
722
			stats->xdp_redirects++;
J
Jason Wang 已提交
723
			err = xdp_do_redirect(dev, &xdp, xdp_prog);
724 725
			if (err)
				goto err_xdp;
726
			*xdp_xmit |= VIRTIO_XDP_REDIR;
727 728 729
			rcu_read_unlock();
			goto xdp_xmit;
		default:
730
			bpf_warn_invalid_xdp_action(act);
731
			fallthrough;
732 733 734
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
			fallthrough;
		case XDP_DROP:
735 736 737 738 739
			goto err_xdp;
		}
	}
	rcu_read_unlock();

740 741
	skb = build_skb(buf, buflen);
	if (!skb) {
742
		put_page(page);
743 744 745
		goto err;
	}
	skb_reserve(skb, headroom - delta);
746
	skb_put(skb, len);
747
	if (!xdp_prog) {
748 749
		buf += header_offset;
		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
750
	} /* keep zeroed vnet hdr since XDP is loaded */
751

752 753 754
	if (metasize)
		skb_metadata_set(skb, metasize);

755
err:
756
	return skb;
757 758 759

err_xdp:
	rcu_read_unlock();
760 761
	stats->xdp_drops++;
	stats->drops++;
762
	put_page(page);
763 764
xdp_xmit:
	return NULL;
765 766 767
}

static struct sk_buff *receive_big(struct net_device *dev,
M
Michael S. Tsirkin 已提交
768
				   struct virtnet_info *vi,
769 770
				   struct receive_queue *rq,
				   void *buf,
771
				   unsigned int len,
772
				   struct virtnet_rq_stats *stats)
773 774
{
	struct page *page = buf;
775 776
	struct sk_buff *skb =
		page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0);
J
John Fastabend 已提交
777

778
	stats->bytes += len - vi->hdr_len;
779 780 781 782 783 784
	if (unlikely(!skb))
		goto err;

	return skb;

err:
785
	stats->drops++;
786 787 788 789
	give_pages(rq, page);
	return NULL;
}

790
static struct sk_buff *receive_mergeable(struct net_device *dev,
M
Michael S. Tsirkin 已提交
791
					 struct virtnet_info *vi,
792
					 struct receive_queue *rq,
793 794
					 void *buf,
					 void *ctx,
J
Jason Wang 已提交
795
					 unsigned int len,
796
					 unsigned int *xdp_xmit,
797
					 struct virtnet_rq_stats *stats)
798
{
799 800
	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
801 802
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
J
John Fastabend 已提交
803 804
	struct sk_buff *head_skb, *curr_skb;
	struct bpf_prog *xdp_prog;
805
	unsigned int truesize = mergeable_ctx_to_truesize(ctx);
806
	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
807
	unsigned int metasize = 0;
808 809
	unsigned int frame_sz;
	int err;
J
John Fastabend 已提交
810

J
John Fastabend 已提交
811
	head_skb = NULL;
812
	stats->bytes += len - vi->hdr_len;
J
John Fastabend 已提交
813

J
John Fastabend 已提交
814 815 816
	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
817
		struct xdp_frame *xdpf;
818
		struct page *xdp_page;
819 820
		struct xdp_buff xdp;
		void *data;
J
John Fastabend 已提交
821 822
		u32 act;

823 824 825 826 827 828 829
		/* Transient failure which in theory could occur if
		 * in-flight packets from before XDP was enabled reach
		 * the receive path after XDP is loaded.
		 */
		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;

830 831 832 833 834
		/* Buffers with headroom use PAGE_SIZE as alloc size,
		 * see add_recvbuf_mergeable() + get_mergeable_buf_len()
		 */
		frame_sz = headroom ? PAGE_SIZE : truesize;

835 836 837 838 839 840
		/* This happens when the rx buffer size is underestimated
		 * or the headroom is not enough because the buffer
		 * was refilled before XDP was set. This should only
		 * happen for the first several packets, so we don't
		 * care much about its performance.
		 */
841 842
		if (unlikely(num_buf > 1 ||
			     headroom < virtnet_get_headroom(vi))) {
843
			/* linearize data for XDP */
844
			xdp_page = xdp_linearize_page(rq, &num_buf,
845 846 847
						      page, offset,
						      VIRTIO_XDP_HEADROOM,
						      &len);
848 849
			frame_sz = PAGE_SIZE;

850 851
			if (!xdp_page)
				goto err_xdp;
852
			offset = VIRTIO_XDP_HEADROOM;
853 854
		} else {
			xdp_page = page;
J
John Fastabend 已提交
855 856
		}

857 858 859
		/* Allow consuming headroom but reserve enough space to push
		 * the descriptor on if we get an XDP_TX return code.
		 */
860
		data = page_address(xdp_page) + offset;
861
		xdp_init_buff(&xdp, frame_sz - vi->hdr_len, &rq->xdp_rxq);
862
		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
863 864
		xdp.data = data + vi->hdr_len;
		xdp.data_end = xdp.data + (len - vi->hdr_len);
865
		xdp.data_meta = xdp.data;
866

867
		act = bpf_prog_run_xdp(xdp_prog, &xdp);
868
		stats->xdp_packets++;
869

J
John Fastabend 已提交
870 871
		switch (act) {
		case XDP_PASS:
872 873
			metasize = xdp.data - xdp.data_meta;

874
			/* recalculate offset to account for any header
875 876 877
			 * adjustments and minus the metasize to copy the
			 * metadata in page_to_skb(). Note other cases do not
			 * build an skb and avoid using offset
878
			 */
879 880
			offset = xdp.data - page_address(xdp_page) -
				 vi->hdr_len - metasize;
881

882 883
			/* recalculate len if xdp.data, xdp.data_end or
			 * xdp.data_meta were adjusted
884
			 */
885
			len = xdp.data_end - xdp.data + vi->hdr_len + metasize;
886 887 888 889
			/* We can only create skb based on xdp_page. */
			if (unlikely(xdp_page != page)) {
				rcu_read_unlock();
				put_page(page);
890 891 892
				head_skb = page_to_skb(vi, rq, xdp_page, offset,
						       len, PAGE_SIZE, false,
						       metasize);
893 894
				return head_skb;
			}
J
John Fastabend 已提交
895 896
			break;
		case XDP_TX:
897
			stats->xdp_tx++;
898
			xdpf = xdp_convert_buff_to_frame(&xdp);
899 900
			if (unlikely(!xdpf))
				goto err_xdp;
901 902
			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
			if (unlikely(err < 0)) {
903
				trace_xdp_exception(vi->dev, xdp_prog, act);
904 905 906 907
				if (unlikely(xdp_page != page))
					put_page(xdp_page);
				goto err_xdp;
			}
908
			*xdp_xmit |= VIRTIO_XDP_TX;
909
			if (unlikely(xdp_page != page))
910
				put_page(page);
J
John Fastabend 已提交
911 912
			rcu_read_unlock();
			goto xdp_xmit;
913
		case XDP_REDIRECT:
914
			stats->xdp_redirects++;
915 916 917 918 919 920
			err = xdp_do_redirect(dev, &xdp, xdp_prog);
			if (err) {
				if (unlikely(xdp_page != page))
					put_page(xdp_page);
				goto err_xdp;
			}
921
			*xdp_xmit |= VIRTIO_XDP_REDIR;
922
			if (unlikely(xdp_page != page))
923
				put_page(page);
924 925
			rcu_read_unlock();
			goto xdp_xmit;
J
John Fastabend 已提交
926
		default:
927
			bpf_warn_invalid_xdp_action(act);
928
			fallthrough;
929 930
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
931
			fallthrough;
932
		case XDP_DROP:
933 934
			if (unlikely(xdp_page != page))
				__free_pages(xdp_page, 0);
J
John Fastabend 已提交
935
			goto err_xdp;
J
John Fastabend 已提交
936
		}
J
John Fastabend 已提交
937 938
	}
	rcu_read_unlock();
939

940
	if (unlikely(len > truesize)) {
941
		pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
942 943 944 945
			 dev->name, len, (unsigned long)ctx);
		dev->stats.rx_length_errors++;
		goto err_skb;
	}
946

947 948
	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog,
			       metasize);
J
John Fastabend 已提交
949
	curr_skb = head_skb;
950

951 952
	if (unlikely(!curr_skb))
		goto err_skb;
953
	while (--num_buf) {
954 955
		int num_skb_frags;

956
		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
957
		if (unlikely(!buf)) {
958
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
M
Michael S. Tsirkin 已提交
959
				 dev->name, num_buf,
960 961
				 virtio16_to_cpu(vi->vdev,
						 hdr->num_buffers));
962 963
			dev->stats.rx_length_errors++;
			goto err_buf;
964
		}
965

966
		stats->bytes += len;
967
		page = virt_to_head_page(buf);
968 969 970

		truesize = mergeable_ctx_to_truesize(ctx);
		if (unlikely(len > truesize)) {
971
			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
972 973 974 975
				 dev->name, len, (unsigned long)ctx);
			dev->stats.rx_length_errors++;
			goto err_skb;
		}
976 977

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
978 979
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
980 981 982

			if (unlikely(!nskb))
				goto err_skb;
983 984 985 986 987 988 989 990 991 992 993
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
994
			head_skb->truesize += truesize;
995
		}
996
		offset = buf - page_address(page);
997 998 999
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
1000
					     len, truesize);
1001 1002
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
1003
					offset, len, truesize);
1004
		}
1005 1006
	}

J
Johannes Berg 已提交
1007
	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
1008 1009
	return head_skb;

J
John Fastabend 已提交
1010 1011
err_xdp:
	rcu_read_unlock();
1012
	stats->xdp_drops++;
1013 1014
err_skb:
	put_page(page);
1015
	while (num_buf-- > 1) {
1016 1017
		buf = virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!buf)) {
1018 1019 1020 1021 1022
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
1023
		stats->bytes += len;
1024
		page = virt_to_head_page(buf);
1025
		put_page(page);
1026
	}
1027
err_buf:
1028
	stats->drops++;
1029
	dev_kfree_skb(head_skb);
J
John Fastabend 已提交
1030
xdp_xmit:
1031
	return NULL;
1032 1033
}

1034 1035
static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
			void *buf, unsigned int len, void **ctx,
1036
			unsigned int *xdp_xmit,
1037
			struct virtnet_rq_stats *stats)
1038
{
1039
	struct net_device *dev = vi->dev;
1040
	struct sk_buff *skb;
1041
	struct virtio_net_hdr_mrg_rxbuf *hdr;
1042

1043
	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
1044 1045
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
1046
		if (vi->mergeable_rx_bufs) {
1047
			put_page(virt_to_head_page(buf));
1048
		} else if (vi->big_packets) {
1049
			give_pages(rq, buf);
1050
		} else {
1051
			put_page(virt_to_head_page(buf));
1052
		}
1053
		return;
1054
	}
1055

1056
	if (vi->mergeable_rx_bufs)
1057
		skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit,
1058
					stats);
1059
	else if (vi->big_packets)
1060
		skb = receive_big(dev, vi, rq, buf, len, stats);
1061
	else
1062
		skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats);
1063 1064

	if (unlikely(!skb))
1065
		return;
1066

1067
	hdr = skb_vnet_hdr(skb);
1068

1069
	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
1070
		skb->ip_summed = CHECKSUM_UNNECESSARY;
R
Rusty Russell 已提交
1071

1072 1073 1074 1075 1076 1077
	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
				  virtio_is_little_endian(vi->vdev))) {
		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
				     dev->name, hdr->hdr.gso_type,
				     hdr->hdr.gso_size);
		goto frame_err;
R
Rusty Russell 已提交
1078 1079
	}

1080
	skb_record_rx_queue(skb, vq2rxq(rq->vq));
1081 1082 1083 1084
	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

E
Eric Dumazet 已提交
1085
	napi_gro_receive(&rq->napi, skb);
1086
	return;
R
Rusty Russell 已提交
1087 1088 1089 1090 1091 1092

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
}

1093 1094 1095 1096 1097
/* Unlike mergeable buffers, all buffers are allocated to the
 * same size, except for the headroom. For this reason we do
 * not need to use  mergeable_len_to_ctx here - it is enough
 * to store the headroom as the context ignoring the truesize.
 */
M
Michael S. Tsirkin 已提交
1098 1099
static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
			     gfp_t gfp)
R
Rusty Russell 已提交
1100
{
1101 1102
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
1103
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
1104
	void *ctx = (void *)(unsigned long)xdp_headroom;
1105
	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
1106
	int err;
1107

1108 1109 1110
	len = SKB_DATA_ALIGN(len) +
	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
1111
		return -ENOMEM;
R
Rusty Russell 已提交
1112

1113 1114 1115 1116 1117
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
		    vi->hdr_len + GOOD_PACKET_LEN);
1118
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
1119
	if (err < 0)
1120
		put_page(virt_to_head_page(buf));
1121 1122
	return err;
}
1123

1124 1125
static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
			   gfp_t gfp)
1126 1127 1128 1129 1130
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

1131 1132
	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);

1133
	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
1134
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
1135
		first = get_a_page(rq, gfp);
1136 1137
		if (!first) {
			if (list)
1138
				give_pages(rq, list);
1139
			return -ENOMEM;
1140
		}
1141
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
1142

1143 1144 1145 1146
		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}
R
Rusty Russell 已提交
1147

1148
	first = get_a_page(rq, gfp);
1149
	if (!first) {
1150
		give_pages(rq, list);
1151 1152 1153 1154
		return -ENOMEM;
	}
	p = page_address(first);

1155
	/* rq->sg[0], rq->sg[1] share the same page */
1156 1157
	/* a separated rq->sg[0] for header - required in case !any_header_sg */
	sg_set_buf(&rq->sg[0], p, vi->hdr_len);
1158

1159
	/* rq->sg[1] for data packet, from offset */
1160
	offset = sizeof(struct padded_vnet_hdr);
1161
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
1162 1163 1164

	/* chain first in list head */
	first->private = (unsigned long)list;
1165 1166
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
1167
	if (err < 0)
1168
		give_pages(rq, first);
1169 1170

	return err;
R
Rusty Russell 已提交
1171 1172
}

1173
static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
1174 1175
					  struct ewma_pkt_len *avg_pkt_len,
					  unsigned int room)
1176
{
1177
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1178 1179
	unsigned int len;

1180 1181 1182 1183
	if (room)
		return PAGE_SIZE - room;

	len = hdr_len +	clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
1184
				rq->min_buf_len, PAGE_SIZE - hdr_len);
1185

1186
	return ALIGN(len, L1_CACHE_BYTES);
1187 1188
}
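/* Worked example (added): with room == 0 and an EWMA of received packet
 * lengths around 1200 bytes, len becomes
 * hdr_len + clamp(1200, rq->min_buf_len, PAGE_SIZE - hdr_len), rounded up to
 * the cache-line size; when headroom is in use (room != 0) the whole
 * remaining PAGE_SIZE - room is used instead.
 */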

1189 1190
static int add_recvbuf_mergeable(struct virtnet_info *vi,
				 struct receive_queue *rq, gfp_t gfp)
1191
{
1192
	struct page_frag *alloc_frag = &rq->alloc_frag;
1193
	unsigned int headroom = virtnet_get_headroom(vi);
1194 1195
	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
1196
	char *buf;
1197
	void *ctx;
1198
	int err;
1199
	unsigned int len, hole;
1200

1201 1202 1203 1204 1205 1206
	/* Extra tailroom is needed to satisfy XDP's assumption. This
	 * means rx frag coalescing won't work, but since we've
	 * disabled GSO for XDP it won't be a big issue.
	 */
	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
1207
		return -ENOMEM;
1208

1209
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
1210
	buf += headroom; /* advance address leaving hole at front of pkt */
1211
	get_page(alloc_frag->page);
1212
	alloc_frag->offset += len + room;
1213
	hole = alloc_frag->size - alloc_frag->offset;
1214
	if (hole < len + room) {
1215 1216
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
1217
		 * the current buffer.
1218
		 */
1219 1220 1221
		len += hole;
		alloc_frag->offset += hole;
	}
1222

1223
	sg_init_one(rq->sg, buf, len);
1224
	ctx = mergeable_len_to_ctx(len, headroom);
1225
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
1226
	if (err < 0)
1227
		put_page(virt_to_head_page(buf));
1228

1229 1230
	return err;
}
1231

1232 1233 1234 1235 1236 1237 1238
/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
M
Michael S. Tsirkin 已提交
1239 1240
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
1241 1242
{
	int err;
1243
	bool oom;
1244

1245 1246
	do {
		if (vi->mergeable_rx_bufs)
1247
			err = add_recvbuf_mergeable(vi, rq, gfp);
1248
		else if (vi->big_packets)
1249
			err = add_recvbuf_big(vi, rq, gfp);
1250
		else
M
Michael S. Tsirkin 已提交
1251
			err = add_recvbuf_small(vi, rq, gfp);
1252

1253
		oom = err == -ENOMEM;
1254
		if (err)
1255
			break;
1256
	} while (rq->vq->num_free);
T
Toshiaki Makita 已提交
1257
	if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) {
1258 1259 1260
		unsigned long flags;

		flags = u64_stats_update_begin_irqsave(&rq->stats.syncp);
1261
		rq->stats.kicks++;
1262
		u64_stats_update_end_irqrestore(&rq->stats.syncp, flags);
T
Toshiaki Makita 已提交
1263 1264
	}

1265
	return !oom;
1266 1267
}

1268
static void skb_recv_done(struct virtqueue *rvq)
R
Rusty Russell 已提交
1269 1270
{
	struct virtnet_info *vi = rvq->vdev->priv;
J
Jason Wang 已提交
1271
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
1272

1273
	virtqueue_napi_schedule(&rq->napi, rvq);
R
Rusty Russell 已提交
1274 1275
}

1276
static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
1277
{
1278
	napi_enable(napi);
1279 1280

	/* If all buffers were filled by other side before we napi_enabled, we
1281 1282 1283 1284 1285 1286
	 * won't get another interrupt, so process any outstanding packets now.
	 * Call local_bh_enable after to trigger softIRQ processing.
	 */
	local_bh_disable();
	virtqueue_napi_schedule(napi, vq);
	local_bh_enable();
1287 1288
}

W
Willem de Bruijn 已提交
1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306
static void virtnet_napi_tx_enable(struct virtnet_info *vi,
				   struct virtqueue *vq,
				   struct napi_struct *napi)
{
	if (!napi->weight)
		return;

	/* Tx napi touches cachelines on the cpu handling tx interrupts. Only
	 * enable the feature if this is likely affine with the transmit path.
	 */
	if (!vi->affinity_hint_set) {
		napi->weight = 0;
		return;
	}

	return virtnet_napi_enable(vq, napi);
}
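/* Added note: clearing napi->weight is how TX napi is effectively turned off
 * here - skb_xmit_done(), virtnet_poll_cleantx() and start_xmit() all check
 * sq->napi.weight and fall back to the interrupt-driven
 * free_old_xmit_skbs() path when it is zero.
 */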

1307 1308 1309 1310 1311 1312
static void virtnet_napi_tx_disable(struct napi_struct *napi)
{
	if (napi->weight)
		napi_disable(napi);
}

1313 1314
static void refill_work(struct work_struct *work)
{
1315 1316
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
1317
	bool still_empty;
J
Jason Wang 已提交
1318 1319
	int i;

1320
	for (i = 0; i < vi->curr_queue_pairs; i++) {
J
Jason Wang 已提交
1321
		struct receive_queue *rq = &vi->rq[i];
1322

J
Jason Wang 已提交
1323
		napi_disable(&rq->napi);
M
Michael S. Tsirkin 已提交
1324
		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
1325
		virtnet_napi_enable(rq->vq, &rq->napi);
1326

J
Jason Wang 已提交
1327 1328 1329 1330 1331 1332
		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
1333 1334
}

1335 1336
static int virtnet_receive(struct receive_queue *rq, int budget,
			   unsigned int *xdp_xmit)
R
Rusty Russell 已提交
1337
{
1338
	struct virtnet_info *vi = rq->vq->vdev->priv;
1339
	struct virtnet_rq_stats stats = {};
1340
	unsigned int len;
1341
	void *buf;
1342
	int i;
R
Rusty Russell 已提交
1343

1344
	if (!vi->big_packets || vi->mergeable_rx_bufs) {
1345 1346
		void *ctx;

1347
		while (stats.packets < budget &&
1348
		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
1349
			receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
1350
			stats.packets++;
1351 1352
		}
	} else {
1353
		while (stats.packets < budget &&
1354
		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
1355
			receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
1356
			stats.packets++;
1357
		}
R
Rusty Russell 已提交
1358 1359
	}

1360
	if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
M
Michael S. Tsirkin 已提交
1361
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
1362
			schedule_delayed_work(&vi->refill, 0);
1363
	}
R
Rusty Russell 已提交
1364

T
Toshiaki Makita 已提交
1365
	u64_stats_update_begin(&rq->stats.syncp);
1366 1367 1368 1369
	for (i = 0; i < VIRTNET_RQ_STATS_LEN; i++) {
		size_t offset = virtnet_rq_stats_desc[i].offset;
		u64 *item;

1370 1371
		item = (u64 *)((u8 *)&rq->stats + offset);
		*item += *(u64 *)((u8 *)&stats + offset);
1372
	}
T
Toshiaki Makita 已提交
1373
	u64_stats_update_end(&rq->stats.syncp);
J
Jason Wang 已提交
1374

1375
	return stats.packets;
1376 1377
}

1378
static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
1379 1380 1381 1382
{
	unsigned int len;
	unsigned int packets = 0;
	unsigned int bytes = 0;
1383
	void *ptr;
1384

1385 1386 1387
	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		if (likely(!is_xdp_frame(ptr))) {
			struct sk_buff *skb = ptr;
1388

1389
			pr_debug("Sent skb %p\n", skb);
1390

1391 1392 1393 1394
			bytes += skb->len;
			napi_consume_skb(skb, in_napi);
		} else {
			struct xdp_frame *frame = ptr_to_xdp(ptr);
1395

1396 1397 1398 1399
			bytes += frame->len;
			xdp_return_frame(frame);
		}
		packets++;
1400 1401 1402 1403 1404 1405 1406 1407
	}

	/* Avoid overhead when no packets have been processed
	 * happens when called speculatively from start_xmit.
	 */
	if (!packets)
		return;

T
Toshiaki Makita 已提交
1408 1409 1410 1411
	u64_stats_update_begin(&sq->stats.syncp);
	sq->stats.bytes += bytes;
	sq->stats.packets += packets;
	u64_stats_update_end(&sq->stats.syncp);
1412 1413
}

1414 1415 1416 1417 1418 1419 1420 1421 1422 1423
static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
{
	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
		return false;
	else if (q < vi->curr_queue_pairs)
		return true;
	else
		return false;
}
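/* Added note: per the test above, queue indices
 * [0, curr_queue_pairs - xdp_queue_pairs) carry normal skbs,
 * [curr_queue_pairs - xdp_queue_pairs, curr_queue_pairs) are the XDP queues
 * whose buffers are raw xdp_frames, and anything at or above curr_queue_pairs
 * is currently unused.
 */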

1424 1425 1426 1427 1428 1429 1430
static void virtnet_poll_cleantx(struct receive_queue *rq)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	unsigned int index = vq2rxq(rq->vq);
	struct send_queue *sq = &vi->sq[index];
	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);

1431
	if (!sq->napi.weight || is_xdp_raw_buffer_queue(vi, index))
1432 1433 1434
		return;

	if (__netif_tx_trylock(txq)) {
1435
		free_old_xmit_skbs(sq, true);
1436 1437 1438 1439 1440 1441 1442
		__netif_tx_unlock(txq);
	}

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);
}

1443 1444 1445 1446
static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
1447 1448
	struct virtnet_info *vi = rq->vq->vdev->priv;
	struct send_queue *sq;
1449
	unsigned int received;
1450
	unsigned int xdp_xmit = 0;
1451

1452 1453
	virtnet_poll_cleantx(rq);

J
Jason Wang 已提交
1454
	received = virtnet_receive(rq, budget, &xdp_xmit);
1455

1456
	/* Out of packets? */
1457 1458
	if (received < budget)
		virtqueue_napi_complete(napi, rq->vq, received);
R
Rusty Russell 已提交
1459

1460
	if (xdp_xmit & VIRTIO_XDP_REDIR)
1461
		xdp_do_flush();
1462 1463

	if (xdp_xmit & VIRTIO_XDP_TX) {
1464
		sq = virtnet_xdp_sq(vi);
T
Toshiaki Makita 已提交
1465 1466 1467 1468 1469
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
			u64_stats_update_begin(&sq->stats.syncp);
			sq->stats.kicks++;
			u64_stats_update_end(&sq->stats.syncp);
		}
1470
	}
J
Jason Wang 已提交
1471

R
Rusty Russell 已提交
1472 1473 1474
	return received;
}

J
Jason Wang 已提交
1475 1476 1477
static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
1478
	int i, err;
J
Jason Wang 已提交
1479

1480 1481 1482
	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
M
Michael S. Tsirkin 已提交
1483
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1484
				schedule_delayed_work(&vi->refill, 0);
1485

1486
		err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, vi->rq[i].napi.napi_id);
1487 1488 1489
		if (err < 0)
			return err;

1490 1491 1492 1493 1494 1495 1496
		err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
						 MEM_TYPE_PAGE_SHARED, NULL);
		if (err < 0) {
			xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
			return err;
		}

1497
		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
W
Willem de Bruijn 已提交
1498
		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
J
Jason Wang 已提交
1499 1500 1501 1502 1503
	}

	return 0;
}

W
Willem de Bruijn 已提交
1504 1505 1506 1507
static int virtnet_poll_tx(struct napi_struct *napi, int budget)
{
	struct send_queue *sq = container_of(napi, struct send_queue, napi);
	struct virtnet_info *vi = sq->vq->vdev->priv;
1508 1509
	unsigned int index = vq2txq(sq->vq);
	struct netdev_queue *txq;
W
Willem de Bruijn 已提交
1510

1511 1512 1513 1514 1515 1516 1517
	if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
		/* We don't need to enable cb for XDP */
		napi_complete_done(napi, 0);
		return 0;
	}

	txq = netdev_get_tx_queue(vi->dev, index);
W
Willem de Bruijn 已提交
1518
	__netif_tx_lock(txq, raw_smp_processor_id());
1519
	free_old_xmit_skbs(sq, true);
W
Willem de Bruijn 已提交
1520 1521 1522 1523 1524 1525 1526 1527 1528 1529
	__netif_tx_unlock(txq);

	virtqueue_napi_complete(napi, sq->vq, 0);

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);

	return 0;
}

1530
static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
R
Rusty Russell 已提交
1531
{
1532
	struct virtio_net_hdr_mrg_rxbuf *hdr;
R
Rusty Russell 已提交
1533
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
1534
	struct virtnet_info *vi = sq->vq->vdev->priv;
1535
	int num_sg;
1536
	unsigned hdr_len = vi->hdr_len;
1537
	bool can_push;
R
Rusty Russell 已提交
1538

J
Johannes Berg 已提交
1539
	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
1540 1541 1542 1543 1544 1545 1546

	can_push = vi->any_header_sg &&
		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
1547
		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
1548 1549
	else
		hdr = skb_vnet_hdr(skb);
R
Rusty Russell 已提交
1550

1551
	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
1552 1553
				    virtio_is_little_endian(vi->vdev), false,
				    0))
1554
		BUG();
R
Rusty Russell 已提交
1555

1556
	if (vi->mergeable_rx_bufs)
1557
		hdr->num_buffers = 0;
1558

1559
	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
1560 1561 1562
	if (can_push) {
		__skb_push(skb, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
1563 1564
		if (unlikely(num_sg < 0))
			return num_sg;
1565 1566 1567 1568
		/* Pull header back to avoid skew in tx bytes calculations. */
		__skb_pull(skb, hdr_len);
	} else {
		sg_set_buf(sq->sg, hdr, hdr_len);
1569 1570 1571 1572
		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
		if (unlikely(num_sg < 0))
			return num_sg;
		num_sg++;
1573
	}
1574
	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
1575 1576
}
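/* Added note: when the device accepts any header/data sg split
 * (any_header_sg) and the skb has enough aligned, unshared headroom, the
 * virtio header is pushed into the skb data itself so the packet fits in
 * nr_frags + 1 scatterlist entries; otherwise the header occupies its own sg
 * entry, which is why sg_init_table() above sizes the table with
 * "can_push ? 1 : 2" extra slots.
 */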

1577
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
1578 1579
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1580 1581
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
1582
	int err;
1583
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
1584
	bool kick = !netdev_xmit_more();
W
Willem de Bruijn 已提交
1585
	bool use_napi = sq->napi.weight;
1586 1587

	/* Free up any pending old buffers before queueing new ones. */
1588
	free_old_xmit_skbs(sq, false);
1589

1590 1591 1592
	if (use_napi && kick)
		virtqueue_enable_cb_delayed(sq->vq);

1593 1594 1595
	/* timestamp packet in software */
	skb_tx_timestamp(skb);

1596
	/* Try to transmit */
1597
	err = xmit_skb(sq, skb);
1598

1599
	/* This should not happen! */
1600
	if (unlikely(err)) {
1601 1602 1603
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
1604 1605
				 "Unexpected TXQ (%d) queue failure: %d\n",
				 qnum, err);
1606
		dev->stats.tx_dropped++;
1607
		dev_kfree_skb_any(skb);
1608
		return NETDEV_TX_OK;
R
Rusty Russell 已提交
1609
	}
1610

1611
	/* Don't wait up for transmitted skbs to be freed. */
W
Willem de Bruijn 已提交
1612 1613
	if (!use_napi) {
		skb_orphan(skb);
1614
		nf_reset_ct(skb);
W
Willem de Bruijn 已提交
1615
	}
1616

1617 1618 1619 1620 1621 1622 1623 1624 1625
	/* If running out of space, stop queue to avoid getting packets that we
	 * are then unable to transmit.
	 * An alternative would be to force queuing layer to requeue the skb by
	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
	 * returned in a normal path of operation: it means that driver is not
	 * maintaining the TX queue stop/start state properly, and causes
	 * the stack to do a non-trivial amount of useless work.
	 * Since most packets only take 1 or 2 ring slots, stopping the queue
	 * early means 16 slots are typically wasted.
1626
	 */
1627
	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1628
		netif_stop_subqueue(dev, qnum);
W
Willem de Bruijn 已提交
1629 1630
		if (!use_napi &&
		    unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
1631
			/* More just got used, free them then recheck. */
1632
			free_old_xmit_skbs(sq, false);
1633
			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1634
				netif_start_subqueue(dev, qnum);
1635
				virtqueue_disable_cb(sq->vq);
1636 1637
			}
		}
1638
	}
1639

T
Toshiaki Makita 已提交
1640 1641 1642 1643 1644 1645 1646
	if (kick || netif_xmit_stopped(txq)) {
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
			u64_stats_update_begin(&sq->stats.syncp);
			sq->stats.kicks++;
			u64_stats_update_end(&sq->stats.syncp);
		}
	}
R
Rusty Russell 已提交
1647

1648
	return NETDEV_TX_OK;
1649 1650
}

1651 1652 1653
/*
 * Send command via the control virtqueue and check status.  Commands
 * supported by the hypervisor, as indicated by feature bits, should
S
stephen hemminger 已提交
1654
 * never fail unless improperly formatted.
1655 1656
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
1657
				 struct scatterlist *out)
1658
{
1659
	struct scatterlist *sgs[4], hdr, stat;
1660
	unsigned out_num = 0, tmp;
1661 1662

	/* Caller should know better */
1663
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
1664

1665 1666 1667
	vi->ctrl->status = ~0;
	vi->ctrl->hdr.class = class;
	vi->ctrl->hdr.cmd = cmd;
1668
	/* Add header */
1669
	sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr));
1670
	sgs[out_num++] = &hdr;
1671

1672 1673
	if (out)
		sgs[out_num++] = out;
1674

1675
	/* Add return status. */
1676
	sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status));
1677
	sgs[out_num] = &stat;
1678

1679
	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
1680
	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
1681

1682
	if (unlikely(!virtqueue_kick(vi->cvq)))
1683
		return vi->ctrl->status == VIRTIO_NET_OK;
1684 1685 1686 1687

	/* Spin for a response, the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
1688 1689
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
1690 1691
		cpu_relax();

1692
	return vi->ctrl->status == VIRTIO_NET_OK;
1693 1694
}
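/* Usage sketch (added; mirrors virtnet_set_rx_mode() later in this file):
 * callers build a single "out" scatterlist and busy-wait for the device ack,
 * e.g.
 *
 *	sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc));
 *	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
 *				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
 *		dev_warn(&dev->dev, "Failed to set promisc mode.\n");
 *
 * The rtnl lock serializes users of the shared vi->ctrl buffers.
 */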

1695 1696 1697 1698
static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
1699
	int ret;
1700
	struct sockaddr *addr;
1701
	struct scatterlist sg;
1702

1703 1704 1705
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
		return -EOPNOTSUPP;

1706
	addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
1707 1708 1709 1710
	if (!addr)
		return -ENOMEM;

	ret = eth_prepare_mac_addr_change(dev, addr);
1711
	if (ret)
1712
		goto out;
1713

1714 1715 1716
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
		sg_init_one(&sg, addr->sa_data, dev->addr_len);
		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1717
					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
1718 1719
			dev_warn(&vdev->dev,
				 "Failed to set mac address by vq command.\n");
1720 1721
			ret = -EINVAL;
			goto out;
1722
		}
1723 1724
	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1725 1726 1727 1728 1729 1730 1731
		unsigned int i;

		/* Naturally, this has an atomicity problem. */
		for (i = 0; i < dev->addr_len; i++)
			virtio_cwrite8(vdev,
				       offsetof(struct virtio_net_config, mac) +
				       i, addr->sa_data[i]);
1732 1733 1734
	}

	eth_commit_mac_addr_change(dev, p);
1735
	ret = 0;
1736

1737 1738 1739
out:
	kfree(addr);
	return ret;
1740 1741
}

static void virtnet_stats(struct net_device *dev,
			  struct rtnl_link_stats64 *tot)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int start;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		u64 tpackets, tbytes, rpackets, rbytes, rdrops;
		struct receive_queue *rq = &vi->rq[i];
		struct send_queue *sq = &vi->sq[i];

		do {
			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
			tpackets = sq->stats.packets;
			tbytes   = sq->stats.bytes;
		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));

		do {
			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
			rpackets = rq->stats.packets;
			rbytes   = rq->stats.bytes;
			rdrops   = rq->stats.drops;
		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes   += rbytes;
		tot->tx_bytes   += tbytes;
		tot->rx_dropped += rdrops;
	}

	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
}

static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}

static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	struct scatterlist sg;
	struct net_device *dev = vi->dev;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

	vi->ctrl->mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
	sg_init_one(&sg, &vi->ctrl->mq, sizeof(vi->ctrl->mq));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
	} else {
		vi->curr_queue_pairs = queue_pairs;
		/* virtnet_open() will refill when device is going to up. */
		if (dev->flags & IFF_UP)
			schedule_delayed_work(&vi->refill, 0);
	}

	return 0;
}

static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	int err;

	rtnl_lock();
	err = _virtnet_set_queues(vi, queue_pairs);
	rtnl_unlock();
	return err;
}

static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);

	for (i = 0; i < vi->max_queue_pairs; i++) {
		xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
		napi_disable(&vi->rq[i].napi);
		virtnet_napi_tx_disable(&vi->sq[i].napi);
	}

	return 0;
}

static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg[2];
	struct virtio_net_ctrl_mac *mac_data;
	struct netdev_hw_addr *ha;
	int uc_count;
	int mc_count;
	void *buf;
	int i;

	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

	vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
	vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);

	sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
			 vi->ctrl->promisc ? "en" : "dis");

	sg_init_one(sg, &vi->ctrl->allmulti, sizeof(vi->ctrl->allmulti));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
			 vi->ctrl->allmulti ? "en" : "dis");

	uc_count = netdev_uc_count(dev);
	mc_count = netdev_mc_count(dev);
	/* MAC filter - use one buffer for both lists */
	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
	mac_data = buf;
	if (!buf)
		return;

	sg_init_table(sg, 2);

	/* Store the unicast list and count in the front of the buffer */
	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
	i = 0;
	netdev_for_each_uc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[0], mac_data,
		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));

	/* multicast list and count fill the end */
	mac_data = (void *)&mac_data->macs[uc_count][0];

	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
	i = 0;
	netdev_for_each_mc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[1], mac_data,
		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");

	kfree(buf);
}

static int virtnet_vlan_rx_add_vid(struct net_device *dev,
				   __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
	return 0;
}

static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
				    __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
	return 0;
}

static void virtnet_clean_affinity(struct virtnet_info *vi)
{
	int i;

	if (vi->affinity_hint_set) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtqueue_set_affinity(vi->rq[i].vq, NULL);
			virtqueue_set_affinity(vi->sq[i].vq, NULL);
		}

		vi->affinity_hint_set = false;
	}
}

static void virtnet_set_affinity(struct virtnet_info *vi)
{
	cpumask_var_t mask;
	int stragglers;
	int group_size;
	int i, j, cpu;
	int num_cpu;
	int stride;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
		virtnet_clean_affinity(vi);
		return;
	}

	num_cpu = num_online_cpus();
	stride = max_t(int, num_cpu / vi->curr_queue_pairs, 1);
	stragglers = num_cpu >= vi->curr_queue_pairs ?
			num_cpu % vi->curr_queue_pairs :
			0;
	cpu = cpumask_next(-1, cpu_online_mask);

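	/* Worked example (illustrative): with 8 online CPUs and 3 queue pairs,
	 * stride = 2 and stragglers = 2, so the loop below assigns CPU groups
	 * of sizes 3, 3 and 2 to queue pairs 0, 1 and 2.
	 */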
	for (i = 0; i < vi->curr_queue_pairs; i++) {
		group_size = stride + (i < stragglers ? 1 : 0);

		for (j = 0; j < group_size; j++) {
			cpumask_set_cpu(cpu, mask);
			cpu = cpumask_next_wrap(cpu, cpu_online_mask,
						nr_cpu_ids, false);
		}
		virtqueue_set_affinity(vi->rq[i].vq, mask);
		virtqueue_set_affinity(vi->sq[i].vq, mask);
		__netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, false);
		cpumask_clear(mask);
	}

	vi->affinity_hint_set = true;
	free_cpumask_var(mask);
}

static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node_dead);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);

	virtnet_clean_affinity(vi);
	return 0;
}

static enum cpuhp_state virtionet_online;

static int virtnet_cpu_notif_add(struct virtnet_info *vi)
{
	int ret;

	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
	if (ret)
		return ret;
	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					       &vi->node_dead);
	if (!ret)
		return ret;
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	return ret;
}

static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
{
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					    &vi->node_dead);
}

static void virtnet_get_ringparam(struct net_device *dev,
				struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}


static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));

}

/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
		return -EINVAL;

	/* For now we don't support modifying channels while XDP is loaded
	 * also when XDP is loaded all RX queues have XDP programs so we only
	 * need to check a single RX queue.
	 */
	if (vi->rq[0].xdp_prog)
		return -EINVAL;

	get_online_cpus();
	err = _virtnet_set_queues(vi, queue_pairs);
	if (err) {
		put_online_cpus();
		goto err;
	}
	virtnet_set_affinity(vi);
	put_online_cpus();

	netif_set_real_num_tx_queues(dev, queue_pairs);
	netif_set_real_num_rx_queues(dev, queue_pairs);
 err:
	return err;
}
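
/* The ethtool -S layout produced below: each queue pair contributes
 * VIRTNET_RQ_STATS_LEN + VIRTNET_SQ_STATS_LEN counters, named per queue,
 * e.g. "rx_queue_0_packets" and "tx_queue_0_packets".
 */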

static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
	struct virtnet_info *vi = netdev_priv(dev);
	char *p = (char *)data;
	unsigned int i, j;

	switch (stringset) {
	case ETH_SS_STATS:
		for (i = 0; i < vi->curr_queue_pairs; i++) {
			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
				snprintf(p, ETH_GSTRING_LEN, "rx_queue_%u_%s",
					 i, virtnet_rq_stats_desc[j].desc);
				p += ETH_GSTRING_LEN;
			}
		}

		for (i = 0; i < vi->curr_queue_pairs; i++) {
			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) {
				snprintf(p, ETH_GSTRING_LEN, "tx_queue_%u_%s",
					 i, virtnet_sq_stats_desc[j].desc);
				p += ETH_GSTRING_LEN;
			}
		}
		break;
	}
}

static int virtnet_get_sset_count(struct net_device *dev, int sset)
{
	struct virtnet_info *vi = netdev_priv(dev);

	switch (sset) {
	case ETH_SS_STATS:
		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
					       VIRTNET_SQ_STATS_LEN);
	default:
		return -EOPNOTSUPP;
	}
}

static void virtnet_get_ethtool_stats(struct net_device *dev,
				      struct ethtool_stats *stats, u64 *data)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int idx = 0, start, i, j;
	const u8 *stats_base;
	size_t offset;

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct receive_queue *rq = &vi->rq[i];

		stats_base = (u8 *)&rq->stats;
		do {
			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
				offset = virtnet_rq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
		idx += VIRTNET_RQ_STATS_LEN;
	}

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct send_queue *sq = &vi->sq[i];

		stats_base = (u8 *)&sq->stats;
		do {
			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) {
				offset = virtnet_sq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
		idx += VIRTNET_SQ_STATS_LEN;
	}
}

static void virtnet_get_channels(struct net_device *dev,
				 struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);

	channels->combined_count = vi->curr_queue_pairs;
	channels->max_combined = vi->max_queue_pairs;
	channels->max_other = 0;
	channels->rx_count = 0;
	channels->tx_count = 0;
	channels->other_count = 0;
}

static int virtnet_set_link_ksettings(struct net_device *dev,
				      const struct ethtool_link_ksettings *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);

	return ethtool_virtdev_set_link_ksettings(dev, cmd,
						  &vi->speed, &vi->duplex);
}

static int virtnet_get_link_ksettings(struct net_device *dev,
				      struct ethtool_link_ksettings *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);

	cmd->base.speed = vi->speed;
	cmd->base.duplex = vi->duplex;
	cmd->base.port = PORT_OTHER;

	return 0;
}

static int virtnet_set_coalesce(struct net_device *dev,
				struct ethtool_coalesce *ec)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i, napi_weight;

	if (ec->tx_max_coalesced_frames > 1 ||
	    ec->rx_max_coalesced_frames != 1)
		return -EINVAL;

	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
	if (napi_weight ^ vi->sq[0].napi.weight) {
		if (dev->flags & IFF_UP)
			return -EBUSY;
		for (i = 0; i < vi->max_queue_pairs; i++)
			vi->sq[i].napi.weight = napi_weight;
	}

	return 0;
}

static int virtnet_get_coalesce(struct net_device *dev,
				struct ethtool_coalesce *ec)
{
	struct ethtool_coalesce ec_default = {
		.cmd = ETHTOOL_GCOALESCE,
		.rx_max_coalesced_frames = 1,
	};
	struct virtnet_info *vi = netdev_priv(dev);

	memcpy(ec, &ec_default, sizeof(ec_default));

	if (vi->sq[0].napi.weight)
		ec->tx_max_coalesced_frames = 1;

	return 0;
}

static void virtnet_init_settings(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	vi->speed = SPEED_UNKNOWN;
	vi->duplex = DUPLEX_UNKNOWN;
}

static void virtnet_update_settings(struct virtnet_info *vi)
{
	u32 speed;
	u8 duplex;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
		return;

	virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);

	if (ethtool_validate_speed(speed))
		vi->speed = speed;

	virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex);

	if (ethtool_validate_duplex(duplex))
		vi->duplex = duplex;
}

static const struct ethtool_ops virtnet_ethtool_ops = {
	.supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES,
	.get_drvinfo = virtnet_get_drvinfo,
	.get_link = ethtool_op_get_link,
	.get_ringparam = virtnet_get_ringparam,
	.get_strings = virtnet_get_strings,
	.get_sset_count = virtnet_get_sset_count,
	.get_ethtool_stats = virtnet_get_ethtool_stats,
	.set_channels = virtnet_set_channels,
	.get_channels = virtnet_get_channels,
	.get_ts_info = ethtool_op_get_ts_info,
	.get_link_ksettings = virtnet_get_link_ksettings,
	.set_link_ksettings = virtnet_set_link_ksettings,
	.set_coalesce = virtnet_set_coalesce,
	.get_coalesce = virtnet_get_coalesce,
};
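
/* Suspend/resume helpers: virtnet_freeze_down() detaches the netdev and
 * parks NAPI, while virtnet_restore_up() rebuilds the virtqueues via
 * init_vqs(), refills the receive rings and re-enables NAPI.
 */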

static void virtnet_freeze_down(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int i;

	/* Make sure no work handler is accessing the device */
	flush_work(&vi->config_work);

	netif_tx_lock_bh(vi->dev);
	netif_device_detach(vi->dev);
	netif_tx_unlock_bh(vi->dev);
	cancel_delayed_work_sync(&vi->refill);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			napi_disable(&vi->rq[i].napi);
			virtnet_napi_tx_disable(&vi->sq[i].napi);
		}
	}
}

static int init_vqs(struct virtnet_info *vi);

static int virtnet_restore_up(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err, i;

	err = init_vqs(vi);
	if (err)
		return err;

	virtio_device_ready(vdev);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->curr_queue_pairs; i++)
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
	}

	netif_tx_lock_bh(vi->dev);
	netif_device_attach(vi->dev);
	netif_tx_unlock_bh(vi->dev);
	return err;
}

static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
{
	struct scatterlist sg;
	vi->ctrl->offloads = cpu_to_virtio64(vi->vdev, offloads);

	sg_init_one(&sg, &vi->ctrl->offloads, sizeof(vi->ctrl->offloads));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
				  VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
		dev_warn(&vi->dev->dev, "Fail to set guest offload.\n");
		return -EINVAL;
	}

	return 0;
}

static int virtnet_clear_guest_offloads(struct virtnet_info *vi)
{
	u64 offloads = 0;

	if (!vi->guest_offloads)
		return 0;

	return virtnet_set_guest_offloads(vi, offloads);
}

static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
{
	u64 offloads = vi->guest_offloads;

	if (!vi->guest_offloads)
		return 0;

	return virtnet_set_guest_offloads(vi, offloads);
}
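
/* Loading an XDP program reserves one extra TX queue per possible CPU for
 * XDP_TX (xdp_qp = nr_cpu_ids below); e.g. a 4-CPU guest needs
 * curr_qp + 4 free queue pairs or the setup fails with -ENOMEM.
 */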

static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			   struct netlink_ext_ack *extack)
{
	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
	struct virtnet_info *vi = netdev_priv(dev);
	struct bpf_prog *old_prog;
	u16 xdp_qp = 0, curr_qp;
	int i, err;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
	    && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	        virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	        virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))) {
		NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO/CSUM, disable LRO/CSUM first");
		return -EOPNOTSUPP;
	}

	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
		NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
		return -EINVAL;
	}

	if (dev->mtu > max_sz) {
		NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
		return -EINVAL;
	}

	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
	if (prog)
		xdp_qp = nr_cpu_ids;

	/* XDP requires extra queues for XDP_TX */
	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
		NL_SET_ERR_MSG_MOD(extack, "Too few free TX rings available");
		netdev_warn(dev, "request %i queues but max is %i\n",
			    curr_qp + xdp_qp, vi->max_queue_pairs);
		return -ENOMEM;
	}

	old_prog = rtnl_dereference(vi->rq[0].xdp_prog);
	if (!prog && !old_prog)
		return 0;

	if (prog)
		bpf_prog_add(prog, vi->max_queue_pairs - 1);

	/* Make sure NAPI is not using any XDP TX queues for RX. */
	if (netif_running(dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			napi_disable(&vi->rq[i].napi);
			virtnet_napi_tx_disable(&vi->sq[i].napi);
		}
	}

	if (!prog) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
			if (i == 0)
				virtnet_restore_guest_offloads(vi);
		}
		synchronize_net();
	}

	err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
	if (err)
		goto err;
	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
	vi->xdp_queue_pairs = xdp_qp;

	if (prog) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
			if (i == 0 && !old_prog)
				virtnet_clear_guest_offloads(vi);
		}
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (old_prog)
			bpf_prog_put(old_prog);
		if (netif_running(dev)) {
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
	}

	return 0;

err:
	if (!prog) {
		virtnet_clear_guest_offloads(vi);
		for (i = 0; i < vi->max_queue_pairs; i++)
			rcu_assign_pointer(vi->rq[i].xdp_prog, old_prog);
	}

	if (netif_running(dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
	}
	if (prog)
		bpf_prog_sub(prog, vi->max_queue_pairs - 1);
	return err;
}

static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
	default:
		return -EINVAL;
	}
}

static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
				      size_t len)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int ret;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
		return -EOPNOTSUPP;

	ret = snprintf(buf, len, "sby");
	if (ret >= len)
		return -EOPNOTSUPP;

	return 0;
}
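
/* Toggling NETIF_F_LRO in virtnet_set_features() below is translated into a
 * VIRTIO_NET_CTRL_GUEST_OFFLOADS update: disabling LRO masks the bits in
 * GUEST_OFFLOAD_LRO_MASK out of guest_offloads_capable.
 */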

static int virtnet_set_features(struct net_device *dev,
				netdev_features_t features)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u64 offloads;
	int err;

	if ((dev->features ^ features) & NETIF_F_LRO) {
		if (vi->xdp_queue_pairs)
			return -EBUSY;

		if (features & NETIF_F_LRO)
			offloads = vi->guest_offloads_capable;
		else
			offloads = vi->guest_offloads_capable &
				   ~GUEST_OFFLOAD_LRO_MASK;

		err = virtnet_set_guest_offloads(vi, offloads);
		if (err)
			return err;
		vi->guest_offloads = offloads;
	}

	return 0;
}

static const struct net_device_ops virtnet_netdev = {
	.ndo_open            = virtnet_open,
	.ndo_stop   	     = virtnet_close,
	.ndo_start_xmit      = start_xmit,
	.ndo_validate_addr   = eth_validate_addr,
	.ndo_set_mac_address = virtnet_set_mac_address,
	.ndo_set_rx_mode     = virtnet_set_rx_mode,
	.ndo_get_stats64     = virtnet_stats,
	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
	.ndo_bpf		= virtnet_xdp,
	.ndo_xdp_xmit		= virtnet_xdp_xmit,
	.ndo_features_check	= passthru_features_check,
	.ndo_get_phys_port_name	= virtnet_get_phys_port_name,
	.ndo_set_features	= virtnet_set_features,
};

static void virtnet_config_changed_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, config_work);
	u16 v;

	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
				 struct virtio_net_config, status, &v) < 0)
		return;

	if (v & VIRTIO_NET_S_ANNOUNCE) {
		netdev_notify_peers(vi->dev);
		virtnet_ack_link_announce(vi);
	}

	/* Ignore unknown (future) status bits */
	v &= VIRTIO_NET_S_LINK_UP;

	if (vi->status == v)
		return;

	vi->status = v;

	if (vi->status & VIRTIO_NET_S_LINK_UP) {
		virtnet_update_settings(vi);
		netif_carrier_on(vi->dev);
		netif_tx_wake_all_queues(vi->dev);
	} else {
		netif_carrier_off(vi->dev);
		netif_tx_stop_all_queues(vi->dev);
	}
}

static void virtnet_config_changed(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	schedule_work(&vi->config_work);
}

static void virtnet_free_queues(struct virtnet_info *vi)
{
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		__netif_napi_del(&vi->rq[i].napi);
		__netif_napi_del(&vi->sq[i].napi);
	}

	/* We called __netif_napi_del(),
	 * we need to respect an RCU grace period before freeing vi->rq
	 */
	synchronize_net();

	kfree(vi->rq);
	kfree(vi->sq);
	kfree(vi->ctrl);
}

static void _free_receive_bufs(struct virtnet_info *vi)
{
	struct bpf_prog *old_prog;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		while (vi->rq[i].pages)
			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);

		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
		if (old_prog)
			bpf_prog_put(old_prog);
	}
}

static void free_receive_bufs(struct virtnet_info *vi)
{
	rtnl_lock();
	_free_receive_bufs(vi);
	rtnl_unlock();
}

static void free_receive_page_frags(struct virtnet_info *vi)
{
	int i;
	for (i = 0; i < vi->max_queue_pairs; i++)
		if (vi->rq[i].alloc_frag.page)
			put_page(vi->rq[i].alloc_frag.page);
}
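
/* Note: the send queues can hold a mix of skbs and XDP frames; the
 * is_xdp_frame()/ptr_to_xdp() helpers used below tell them apart so each
 * kind of buffer is released through the right path.
 */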

static void free_unused_bufs(struct virtnet_info *vi)
{
	void *buf;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->sq[i].vq;
		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (!is_xdp_frame(buf))
				dev_kfree_skb(buf);
			else
				xdp_return_frame(ptr_to_xdp(buf));
		}
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->rq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (vi->mergeable_rx_bufs) {
				put_page(virt_to_head_page(buf));
			} else if (vi->big_packets) {
				give_pages(&vi->rq[i], buf);
			} else {
				put_page(virt_to_head_page(buf));
			}
		}
	}
}

static void virtnet_del_vqs(struct virtnet_info *vi)
{
	struct virtio_device *vdev = vi->vdev;

	virtnet_clean_affinity(vi);

	vdev->config->del_vqs(vdev);

	virtnet_free_queues(vi);
}

/* How large should a single buffer be so a queue full of these can fit at
 * least one full packet?
 * Logic below assumes the mergeable buffer header is used.
 */
static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
{
	const unsigned int hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int rq_size = virtqueue_get_vring_size(vq);
	unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu;
	unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len;
	unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size);
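	/* Illustrative numbers: a 16-entry ring, a 65535-byte max MTU and the
	 * 12-byte mergeable header give buf_len = 12 + 14 + 4 + 65535 = 65565,
	 * min_buf_len = DIV_ROUND_UP(65565, 16) = 4098, and a return value of
	 * 4098 - 12 = 4086; with large rings this collapses to GOOD_PACKET_LEN.
	 */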

	return max(max(min_buf_len, hdr_len) - hdr_len,
		   (unsigned int)GOOD_PACKET_LEN);
}
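
/* Illustrative layout: with two queue pairs and a control queue,
 * total_vqs = 5 and the virtqueues are ordered rx0, tx0, rx1, tx1, ctrl
 * (see rxq2vq()/txq2vq()), which is what virtnet_find_vqs() below relies on.
 */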

static int virtnet_find_vqs(struct virtnet_info *vi)
{
	vq_callback_t **callbacks;
	struct virtqueue **vqs;
	int ret = -ENOMEM;
	int i, total_vqs;
	const char **names;
	bool *ctx;

	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
	 * possible control vq.
	 */
	total_vqs = vi->max_queue_pairs * 2 +
		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);

	/* Allocate space for find_vqs parameters */
	vqs = kcalloc(total_vqs, sizeof(*vqs), GFP_KERNEL);
	if (!vqs)
		goto err_vq;
	callbacks = kmalloc_array(total_vqs, sizeof(*callbacks), GFP_KERNEL);
	if (!callbacks)
		goto err_callback;
	names = kmalloc_array(total_vqs, sizeof(*names), GFP_KERNEL);
	if (!names)
		goto err_names;
	if (!vi->big_packets || vi->mergeable_rx_bufs) {
		ctx = kcalloc(total_vqs, sizeof(*ctx), GFP_KERNEL);
		if (!ctx)
			goto err_ctx;
	} else {
		ctx = NULL;
	}

	/* Parameters for control virtqueue, if any */
	if (vi->has_cvq) {
		callbacks[total_vqs - 1] = NULL;
		names[total_vqs - 1] = "control";
	}

	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
		if (ctx)
			ctx[rxq2vq(i)] = true;
	}

	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
					 names, ctx, NULL);
	if (ret)
		goto err_find;

	if (vi->has_cvq) {
		vi->cvq = vqs[total_vqs - 1];
		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].vq = vqs[rxq2vq(i)];
		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
		vi->sq[i].vq = vqs[txq2vq(i)];
	}

	/* run here: ret == 0. */


err_find:
	kfree(ctx);
err_ctx:
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;
}

static int virtnet_alloc_queues(struct virtnet_info *vi)
{
	int i;

	vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL);
	if (!vi->ctrl)
		goto err_ctrl;
	vi->sq = kcalloc(vi->max_queue_pairs, sizeof(*vi->sq), GFP_KERNEL);
	if (!vi->sq)
		goto err_sq;
	vi->rq = kcalloc(vi->max_queue_pairs, sizeof(*vi->rq), GFP_KERNEL);
	if (!vi->rq)
		goto err_rq;

	INIT_DELAYED_WORK(&vi->refill, refill_work);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].pages = NULL;
		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
			       napi_weight);
		netif_tx_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx,
				  napi_tx ? napi_weight : 0);

		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));

		u64_stats_init(&vi->rq[i].stats.syncp);
		u64_stats_init(&vi->sq[i].stats.syncp);
	}

	return 0;

err_rq:
	kfree(vi->sq);
err_sq:
	kfree(vi->ctrl);
err_ctrl:
	return -ENOMEM;
}

static int init_vqs(struct virtnet_info *vi)
{
	int ret;

	/* Allocate send & receive queues */
	ret = virtnet_alloc_queues(vi);
	if (ret)
		goto err;

	ret = virtnet_find_vqs(vi);
	if (ret)
		goto err_free;

	get_online_cpus();
	virtnet_set_affinity(vi);
	put_online_cpus();

	return 0;

err_free:
	virtnet_free_queues(vi);
err:
	return ret;
}

#ifdef CONFIG_SYSFS
static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
		char *buf)
{
	struct virtnet_info *vi = netdev_priv(queue->dev);
	unsigned int queue_index = get_netdev_rx_queue_index(queue);
	unsigned int headroom = virtnet_get_headroom(vi);
	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
	struct ewma_pkt_len *avg;

	BUG_ON(queue_index >= vi->max_queue_pairs);
	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
	return sprintf(buf, "%u\n",
		       get_mergeable_buf_len(&vi->rq[queue_index], avg,
				       SKB_DATA_ALIGN(headroom + tailroom)));
}

static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
	__ATTR_RO(mergeable_rx_buffer_size);

static struct attribute *virtio_net_mrg_rx_attrs[] = {
	&mergeable_rx_buffer_size_attribute.attr,
	NULL
};

static const struct attribute_group virtio_net_mrg_rx_group = {
	.name = "virtio_net",
	.attrs = virtio_net_mrg_rx_attrs
};
#endif

static bool virtnet_fail_on_feature(struct virtio_device *vdev,
				    unsigned int fbit,
				    const char *fname, const char *dname)
{
	if (!virtio_has_feature(vdev, fbit))
		return false;

	dev_err(&vdev->dev, "device advertises feature %s but not %s",
		fname, dname);

	return true;
}

#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)

static bool virtnet_validate_features(struct virtio_device *vdev)
{
	if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
	    (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
			     "VIRTIO_NET_F_CTRL_VQ"))) {
		return false;
	}

	return true;
}

#define MIN_MTU ETH_MIN_MTU
#define MAX_MTU ETH_MAX_MTU

static int virtnet_validate(struct virtio_device *vdev)
{
	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

	if (!virtnet_validate_features(vdev))
		return -EINVAL;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		int mtu = virtio_cread16(vdev,
					 offsetof(struct virtio_net_config,
						  mtu));
		if (mtu < MIN_MTU)
			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
	}

	return 0;
}

static int virtnet_probe(struct virtio_device *vdev)
{
	int i, err = -ENOMEM;
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;
	int mtu;

	/* Find if host supports multiqueue virtio_net device */
	err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
				   struct virtio_net_config,
				   max_virtqueue_pairs, &max_queue_pairs);

	/* We need at least 2 queue's */
	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;

	/* Allocate ourselves a network device with room for our info */
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
	dev->netdev_ops = &virtnet_netdev;
	dev->features = NETIF_F_HIGHDMA;

	dev->ethtool_ops = &virtnet_ethtool_ops;
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
		/* This opens up the world of extra features. */
		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
		if (csum)
			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
			dev->hw_features |= NETIF_F_TSO
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
		/* Individual feature bits: what can host handle? */
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;

		dev->features |= NETIF_F_GSO_ROBUST;

		if (gso)
			dev->features |= dev->hw_features & NETIF_F_ALL_TSO;
		/* (!csum && gso) case will be fixed by register_netdev() */
	}
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
		dev->features |= NETIF_F_RXCSUM;
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6))
		dev->features |= NETIF_F_LRO;
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS))
		dev->hw_features |= NETIF_F_LRO;

	dev->vlan_features = dev->features;

	/* MTU range: 68 - 65535 */
	dev->min_mtu = MIN_MTU;
	dev->max_mtu = MAX_MTU;

	/* Configuration may specify what MAC to use.  Otherwise random. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
		virtio_cread_bytes(vdev,
				   offsetof(struct virtio_net_config, mac),
				   dev->dev_addr, dev->addr_len);
	else
		eth_hw_addr_random(dev);

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
	vdev->priv = vi;

	INIT_WORK(&vi->config_work, virtnet_config_changed_work);

	/* If we can receive ANY GSO packets, we must allocate large ones. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
		vi->big_packets = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		vi->hdr_len = sizeof(struct virtio_net_hdr);

	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->any_header_sg = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		mtu = virtio_cread16(vdev,
				     offsetof(struct virtio_net_config,
					      mtu));
		if (mtu < dev->min_mtu) {
			/* Should never trigger: MTU was previously validated
			 * in virtnet_validate.
			 */
			dev_err(&vdev->dev,
				"device MTU appears to have changed it is now %d < %d",
				mtu, dev->min_mtu);
			err = -EINVAL;
			goto free;
		}

		dev->mtu = mtu;
		dev->max_mtu = mtu;

		/* TODO: size buffers correctly in this case. */
		if (dev->mtu > ETH_DATA_LEN)
			vi->big_packets = true;
	}

	if (vi->any_header_sg)
		dev->needed_headroom = vi->hdr_len;

	/* Enable multiqueue by default */
	if (num_online_cpus() >= max_queue_pairs)
		vi->curr_queue_pairs = max_queue_pairs;
	else
		vi->curr_queue_pairs = num_online_cpus();
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
	err = init_vqs(vi);
	if (err)
		goto free;

#ifdef CONFIG_SYSFS
	if (vi->mergeable_rx_bufs)
		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
#endif
	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);

	virtnet_init_settings(dev);

	if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
		vi->failover = net_failover_create(vi->dev);
		if (IS_ERR(vi->failover)) {
			err = PTR_ERR(vi->failover);
			goto free_vqs;
		}
	}

	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
		goto free_failover;
	}

	virtio_device_ready(vdev);

	err = virtnet_cpu_notif_add(vi);
	if (err) {
		pr_debug("virtio_net: registering cpu notifier failed\n");
		goto free_unregister_netdev;
	}

	virtnet_set_queues(vi, vi->curr_queue_pairs);

	/* Assume link up if device can't report link status,
	   otherwise get link status from config. */
	netif_carrier_off(dev);
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		schedule_work(&vi->config_work);
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
		virtnet_update_settings(vi);
		netif_carrier_on(dev);
	}

	for (i = 0; i < ARRAY_SIZE(guest_offloads); i++)
		if (virtio_has_feature(vi->vdev, guest_offloads[i]))
			set_bit(guest_offloads[i], &vi->guest_offloads);
	vi->guest_offloads_capable = vi->guest_offloads;

	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
		 dev->name, max_queue_pairs);

	return 0;

free_unregister_netdev:
	vi->vdev->config->reset(vdev);

	unregister_netdev(dev);
free_failover:
	net_failover_destroy(vi->failover);
free_vqs:
	cancel_delayed_work_sync(&vi->refill);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
free:
	free_netdev(dev);
	return err;
}

static void remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);

	/* Free unused buffers in both send and recv, if any. */
	free_unused_bufs(vi);

	free_receive_bufs(vi);

	free_receive_page_frags(vi);

	virtnet_del_vqs(vi);
}

static void virtnet_remove(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device. */
	flush_work(&vi->config_work);

	unregister_netdev(vi->dev);

	net_failover_destroy(vi->failover);

	remove_vq_common(vi);

	free_netdev(vi->dev);
}

static __maybe_unused int virtnet_freeze(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);
	virtnet_freeze_down(vdev);
	remove_vq_common(vi);

	return 0;
}

static __maybe_unused int virtnet_restore(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err;

	err = virtnet_restore_up(vdev);
	if (err)
		return err;
	virtnet_set_queues(vi, vi->curr_queue_pairs);

	err = virtnet_cpu_notif_add(vi);
	if (err)
		return err;

	return 0;
}

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};
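
/* VIRTNET_FEATURES below is shared by the modern and legacy feature tables;
 * the legacy table additionally advertises VIRTIO_NET_F_GSO and
 * VIRTIO_F_ANY_LAYOUT for pre-virtio-1.0 transports.
 */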

#define VIRTNET_FEATURES \
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
	VIRTIO_NET_F_MAC, \
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
	VIRTIO_NET_F_CTRL_MAC_ADDR, \
	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
	VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY

static unsigned int features[] = {
	VIRTNET_FEATURES,
};

static unsigned int features_legacy[] = {
	VIRTNET_FEATURES,
	VIRTIO_NET_F_GSO,
	VIRTIO_F_ANY_LAYOUT,
};

static struct virtio_driver virtio_net_driver = {
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
	.feature_table_legacy = features_legacy,
	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
	.validate =	virtnet_validate,
	.probe =	virtnet_probe,
	.remove =	virtnet_remove,
	.config_changed = virtnet_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze =	virtnet_freeze,
	.restore =	virtnet_restore,
#endif
};

static __init int virtio_net_driver_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
				      virtnet_cpu_online,
				      virtnet_cpu_down_prep);
	if (ret < 0)
		goto out;
	virtionet_online = ret;
	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
				      NULL, virtnet_cpu_dead);
	if (ret)
		goto err_dead;

	ret = register_virtio_driver(&virtio_net_driver);
	if (ret)
		goto err_virtio;
	return 0;
err_virtio:
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
err_dead:
	cpuhp_remove_multi_state(virtionet_online);
out:
	return ret;
}
module_init(virtio_net_driver_init);

static __exit void virtio_net_driver_exit(void)
{
	unregister_virtio_driver(&virtio_net_driver);
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
	cpuhp_remove_multi_state(virtionet_online);
}
module_exit(virtio_net_driver_exit);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");