virtio_net.c 81.7 KB
Newer Older
1
/* A network driver using virtio.
R
Rusty Russell 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14 15
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
16
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
R
Rusty Russell 已提交
17 18 19 20
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
21
#include <linux/ethtool.h>
R
Rusty Russell 已提交
22 23 24
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
J
John Fastabend 已提交
25
#include <linux/bpf.h>
26
#include <linux/bpf_trace.h>
R
Rusty Russell 已提交
27
#include <linux/scatterlist.h>
28
#include <linux/if_vlan.h>
29
#include <linux/slab.h>
30
#include <linux/cpu.h>
31
#include <linux/average.h>
J
Jason Wang 已提交
32
#include <linux/filter.h>
33
#include <linux/kernel.h>
34
#include <linux/pci.h>
35
#include <net/route.h>
36
#include <net/xdp.h>
37
#include <net/net_failover.h>
R
Rusty Russell 已提交
38

39
static int napi_weight = NAPI_POLL_WEIGHT;
40 41
module_param(napi_weight, int, 0444);

W
Willem de Bruijn 已提交
42
static bool csum = true, gso = true, napi_tx;
R
Rusty Russell 已提交
43 44
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);
W
Willem de Bruijn 已提交
45
module_param(napi_tx, bool, 0644);
R
Rusty Russell 已提交
46

R
Rusty Russell 已提交
47
/* FIXME: MTU in config. */
48
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
49
#define GOOD_COPY_LEN	128
R
Rusty Russell 已提交
50

51 52
#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

53 54 55
/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
#define VIRTIO_XDP_HEADROOM 256

56 57 58 59
/* Separating two types of XDP xmit */
#define VIRTIO_XDP_TX		BIT(0)
#define VIRTIO_XDP_REDIR	BIT(1)

J
Johannes Berg 已提交
60 61 62 63
/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
64
 */
65
DECLARE_EWMA(pkt_len, 0, 64)
66

67
#define VIRTNET_DRIVER_VERSION "1.0.0"
68

69 70 71 72
static const unsigned long guest_offloads[] = {
	VIRTIO_NET_F_GUEST_TSO4,
	VIRTIO_NET_F_GUEST_TSO6,
	VIRTIO_NET_F_GUEST_ECN,
73 74
	VIRTIO_NET_F_GUEST_UFO,
	VIRTIO_NET_F_GUEST_CSUM
75
};
76

T
Toshiaki Makita 已提交
77 78 79
struct virtnet_stat_desc {
	char desc[ETH_GSTRING_LEN];
	size_t offset;
80 81
};

T
Toshiaki Makita 已提交
82 83 84 85
struct virtnet_sq_stats {
	struct u64_stats_sync syncp;
	u64 packets;
	u64 bytes;
86 87
	u64 xdp_tx;
	u64 xdp_tx_drops;
T
Toshiaki Makita 已提交
88
	u64 kicks;
T
Toshiaki Makita 已提交
89 90
};

91 92
struct virtnet_rq_stats {
	struct u64_stats_sync syncp;
T
Toshiaki Makita 已提交
93 94
	u64 packets;
	u64 bytes;
95
	u64 drops;
96 97 98 99
	u64 xdp_packets;
	u64 xdp_tx;
	u64 xdp_redirects;
	u64 xdp_drops;
T
Toshiaki Makita 已提交
100
	u64 kicks;
T
Toshiaki Makita 已提交
101 102 103
};

#define VIRTNET_SQ_STAT(m)	offsetof(struct virtnet_sq_stats, m)
104
#define VIRTNET_RQ_STAT(m)	offsetof(struct virtnet_rq_stats, m)
T
Toshiaki Makita 已提交
105 106

static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
107 108 109 110
	{ "packets",		VIRTNET_SQ_STAT(packets) },
	{ "bytes",		VIRTNET_SQ_STAT(bytes) },
	{ "xdp_tx",		VIRTNET_SQ_STAT(xdp_tx) },
	{ "xdp_tx_drops",	VIRTNET_SQ_STAT(xdp_tx_drops) },
T
Toshiaki Makita 已提交
111
	{ "kicks",		VIRTNET_SQ_STAT(kicks) },
T
Toshiaki Makita 已提交
112 113 114
};

static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
115 116 117 118 119 120 121
	{ "packets",		VIRTNET_RQ_STAT(packets) },
	{ "bytes",		VIRTNET_RQ_STAT(bytes) },
	{ "drops",		VIRTNET_RQ_STAT(drops) },
	{ "xdp_packets",	VIRTNET_RQ_STAT(xdp_packets) },
	{ "xdp_tx",		VIRTNET_RQ_STAT(xdp_tx) },
	{ "xdp_redirects",	VIRTNET_RQ_STAT(xdp_redirects) },
	{ "xdp_drops",		VIRTNET_RQ_STAT(xdp_drops) },
T
Toshiaki Makita 已提交
122
	{ "kicks",		VIRTNET_RQ_STAT(kicks) },
T
Toshiaki Makita 已提交
123 124 125 126 127
};

#define VIRTNET_SQ_STATS_LEN	ARRAY_SIZE(virtnet_sq_stats_desc)
#define VIRTNET_RQ_STATS_LEN	ARRAY_SIZE(virtnet_rq_stats_desc)

128 129 130 131 132 133 134
/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send _queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];
J
Jason Wang 已提交
135 136 137

	/* Name of the send queue: output.$index */
	char name[40];
W
Willem de Bruijn 已提交
138

T
Toshiaki Makita 已提交
139 140
	struct virtnet_sq_stats stats;

W
Willem de Bruijn 已提交
141
	struct napi_struct napi;
142 143 144 145 146 147 148
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

R
Rusty Russell 已提交
149 150
	struct napi_struct napi;

J
John Fastabend 已提交
151 152
	struct bpf_prog __rcu *xdp_prog;

T
Toshiaki Makita 已提交
153 154
	struct virtnet_rq_stats stats;

155 156 157
	/* Chain pages by the private ptr. */
	struct page *pages;

158
	/* Average packet length for mergeable receive buffers. */
J
Johannes Berg 已提交
159
	struct ewma_pkt_len mrg_avg_pkt_len;
160

161 162 163
	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

164 165
	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];
J
Jason Wang 已提交
166

167 168 169
	/* Min single buffer size for mergeable buffers case. */
	unsigned int min_buf_len;

J
Jason Wang 已提交
170 171
	/* Name of this receive queue: input.$index */
	char name[40];
172 173

	struct xdp_rxq_info xdp_rxq;
174 175
};

176 177 178 179 180 181 182
/* Control VQ buffers: protected by the rtnl lock */
struct control_buf {
	struct virtio_net_ctrl_hdr hdr;
	virtio_net_ctrl_ack status;
	struct virtio_net_ctrl_mq mq;
	u8 promisc;
	u8 allmulti;
183
	__virtio16 vid;
184
	__virtio64 offloads;
185 186
};

187 188 189 190
struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
J
Jason Wang 已提交
191 192
	struct send_queue *sq;
	struct receive_queue *rq;
193 194
	unsigned int status;

J
Jason Wang 已提交
195 196 197 198 199 200
	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

201 202 203
	/* # of XDP queue pairs currently used by the driver */
	u16 xdp_queue_pairs;

204 205 206
	/* I like... big packets and I cannot lie! */
	bool big_packets;

207 208 209
	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

J
Jason Wang 已提交
210 211 212
	/* Has control virtqueue */
	bool has_cvq;

213 214 215
	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

216 217 218
	/* Packet virtio header size */
	u8 hdr_len;

219 220 221
	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

222 223 224
	/* Work struct for config space updates */
	struct work_struct config_work;

J
Jason Wang 已提交
225 226
	/* Does the affinity hint is set for virtqueues? */
	bool affinity_hint_set;
227

228 229 230
	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;
231

232
	struct control_buf *ctrl;
233 234 235 236

	/* Ethtool settings */
	u8 duplex;
	u32 speed;
237 238

	unsigned long guest_offloads;
239 240 241

	/* failover when STANDBY feature enabled */
	struct failover *failover;
R
Rusty Russell 已提交
242 243
};

244
struct padded_vnet_hdr {
245
	struct virtio_net_hdr_mrg_rxbuf hdr;
246
	/*
247 248 249
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header.
250
	 */
251
	char padding[4];
252 253
};

J
Jason Wang 已提交
254 255 256 257 258
/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
259
	return (vq->index - 1) / 2;
J
Jason Wang 已提交
260 261 262 263 264 265 266 267 268
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
269
	return vq->index / 2;
J
Jason Wang 已提交
270 271 272 273 274 275 276
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}

277
static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
R
Rusty Russell 已提交
278
{
279
	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
R
Rusty Russell 已提交
280 281
}

282 283 284 285
/*
 * private is used to chain pages for big packets, put the whole
 * most recent used list in the beginning for reuse
 */
286
static void give_pages(struct receive_queue *rq, struct page *page)
287
{
288
	struct page *end;
289

290
	/* Find end of list, sew whole thing into vi->rq.pages. */
291
	for (end = page; end->private; end = (struct page *)end->private);
292 293
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
294 295
}

296
static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
297
{
298
	struct page *p = rq->pages;
299

300
	if (p) {
301
		rq->pages = (struct page *)p->private;
302 303 304
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
305 306 307 308
		p = alloc_page(gfp_mask);
	return p;
}

309 310 311 312 313 314 315 316 317 318 319 320 321 322 323
static void virtqueue_napi_schedule(struct napi_struct *napi,
				    struct virtqueue *vq)
{
	if (napi_schedule_prep(napi)) {
		virtqueue_disable_cb(vq);
		__napi_schedule(napi);
	}
}

static void virtqueue_napi_complete(struct napi_struct *napi,
				    struct virtqueue *vq, int processed)
{
	int opaque;

	opaque = virtqueue_enable_cb_prepare(vq);
324 325 326 327 328 329
	if (napi_complete_done(napi, processed)) {
		if (unlikely(virtqueue_poll(vq, opaque)))
			virtqueue_napi_schedule(napi, vq);
	} else {
		virtqueue_disable_cb(vq);
	}
330 331
}

332
static void skb_xmit_done(struct virtqueue *vq)
R
Rusty Russell 已提交
333
{
334
	struct virtnet_info *vi = vq->vdev->priv;
W
Willem de Bruijn 已提交
335
	struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;
R
Rusty Russell 已提交
336

337
	/* Suppress further interrupts. */
338
	virtqueue_disable_cb(vq);
339

W
Willem de Bruijn 已提交
340 341 342 343 344
	if (napi->weight)
		virtqueue_napi_schedule(napi, vq);
	else
		/* We were probably waiting for more output buffers. */
		netif_wake_subqueue(vi->dev, vq2txq(vq));
R
Rusty Russell 已提交
345 346
}

347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363
#define MRG_CTX_HEADER_SHIFT 22
static void *mergeable_len_to_ctx(unsigned int truesize,
				  unsigned int headroom)
{
	return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize);
}

static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx)
{
	return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT;
}

static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
{
	return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1);
}

364
/* Called from bottom half context */
M
Michael S. Tsirkin 已提交
365 366
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
				   struct receive_queue *rq,
367
				   struct page *page, unsigned int offset,
368 369
				   unsigned int len, unsigned int truesize,
				   bool hdr_valid)
370 371
{
	struct sk_buff *skb;
372
	struct virtio_net_hdr_mrg_rxbuf *hdr;
373
	unsigned int copy, hdr_len, hdr_padded_len;
374
	char *p;
375

376
	p = page_address(page) + offset;
377

378
	/* copy small packet so we can reuse these pages for small data */
379
	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
380 381
	if (unlikely(!skb))
		return NULL;
382

383
	hdr = skb_vnet_hdr(skb);
384

385 386
	hdr_len = vi->hdr_len;
	if (vi->mergeable_rx_bufs)
387
		hdr_padded_len = sizeof(*hdr);
388
	else
389
		hdr_padded_len = sizeof(struct padded_vnet_hdr);
390

391 392
	if (hdr_valid)
		memcpy(hdr, p, hdr_len);
393

394
	len -= hdr_len;
395 396
	offset += hdr_padded_len;
	p += hdr_padded_len;
397

398 399 400
	copy = len;
	if (copy > skb_tailroom(skb))
		copy = skb_tailroom(skb);
401
	skb_put_data(skb, p, copy);
402

403 404
	len -= copy;
	offset += copy;
405

406 407 408 409 410 411 412 413
	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
			put_page(page);
		return skb;
	}

414 415 416 417 418 419 420
	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
421
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
422 423 424
		dev_kfree_skb(skb);
		return NULL;
	}
425
	BUG_ON(offset >= PAGE_SIZE);
426
	while (len) {
427 428 429 430
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
431 432 433
		page = (struct page *)page->private;
		offset = 0;
	}
434

435
	if (page)
436
		give_pages(rq, page);
437

438 439
	return skb;
}
440

441 442 443
static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
				   struct send_queue *sq,
				   struct xdp_frame *xdpf)
J
John Fastabend 已提交
444 445 446 447
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	int err;

448 449 450
	/* virtqueue want to use data area in-front of packet */
	if (unlikely(xdpf->metasize > 0))
		return -EOPNOTSUPP;
J
John Fastabend 已提交
451

452 453 454 455 456
	if (unlikely(xdpf->headroom < vi->hdr_len))
		return -EOVERFLOW;

	/* Make room for virtqueue hdr (also change xdpf->headroom?) */
	xdpf->data -= vi->hdr_len;
457
	/* Zero header and leave csum up to XDP layers */
458
	hdr = xdpf->data;
459
	memset(hdr, 0, vi->hdr_len);
460
	xdpf->len   += vi->hdr_len;
461

462
	sg_init_one(sq->sg, xdpf->data, xdpf->len);
463

464
	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdpf, GFP_ATOMIC);
465
	if (unlikely(err))
466
		return -ENOSPC; /* Caller handle free/refcnt */
J
John Fastabend 已提交
467

468
	return 0;
J
John Fastabend 已提交
469 470
}

471 472 473 474 475 476 477 478
static struct send_queue *virtnet_xdp_sq(struct virtnet_info *vi)
{
	unsigned int qp;

	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
	return &vi->sq[qp];
}

479
static int virtnet_xdp_xmit(struct net_device *dev,
480
			    int n, struct xdp_frame **frames, u32 flags)
J
Jason Wang 已提交
481 482
{
	struct virtnet_info *vi = netdev_priv(dev);
483
	struct receive_queue *rq = vi->rq;
484
	struct xdp_frame *xdpf_sent;
485
	struct bpf_prog *xdp_prog;
486 487 488
	struct send_queue *sq;
	unsigned int len;
	int drops = 0;
T
Toshiaki Makita 已提交
489
	int kicks = 0;
490
	int ret, err;
491 492
	int i;

493
	sq = virtnet_xdp_sq(vi);
J
Jason Wang 已提交
494

495 496 497 498 499 500
	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
		ret = -EINVAL;
		drops = n;
		goto out;
	}

501 502 503 504
	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
	 * indicate XDP resources have been successfully allocated.
	 */
	xdp_prog = rcu_dereference(rq->xdp_prog);
505 506 507 508 509
	if (!xdp_prog) {
		ret = -ENXIO;
		drops = n;
		goto out;
	}
510

511 512 513 514 515 516 517 518 519 520 521 522 523
	/* Free up any pending old buffers before queueing new ones. */
	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
		xdp_return_frame(xdpf_sent);

	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];

		err = __virtnet_xdp_xmit_one(vi, sq, xdpf);
		if (err) {
			xdp_return_frame_rx_napi(xdpf);
			drops++;
		}
	}
524
	ret = n - drops;
525

T
Toshiaki Makita 已提交
526 527 528 529
	if (flags & XDP_XMIT_FLUSH) {
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
			kicks = 1;
	}
530 531 532 533
out:
	u64_stats_update_begin(&sq->stats.syncp);
	sq->stats.xdp_tx += n;
	sq->stats.xdp_tx_drops += drops;
T
Toshiaki Makita 已提交
534
	sq->stats.kicks += kicks;
535
	u64_stats_update_end(&sq->stats.syncp);
536

537
	return ret;
J
Jason Wang 已提交
538 539
}

540 541 542 543 544
static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
{
	return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
}

545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574
/* We copy the packet for XDP in the following cases:
 *
 * 1) Packet is scattered across multiple rx buffers.
 * 2) Headroom space is insufficient.
 *
 * This is inefficient but it's a temporary condition that
 * we hit right after XDP is enabled and until queue is refilled
 * with large buffers with sufficient headroom - so it should affect
 * at most queue size packets.
 * Afterwards, the conditions to enable
 * XDP should preclude the underlying device from sending packets
 * across multiple buffers (num_buf > 1), and we make sure buffers
 * have enough headroom.
 */
static struct page *xdp_linearize_page(struct receive_queue *rq,
				       u16 *num_buf,
				       struct page *p,
				       int offset,
				       int page_off,
				       unsigned int *len)
{
	struct page *page = alloc_page(GFP_ATOMIC);

	if (!page)
		return NULL;

	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
	page_off += *len;

	while (--*num_buf) {
575
		int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
576 577 578 579 580 581 582 583 584 585 586 587 588 589
		unsigned int buflen;
		void *buf;
		int off;

		buf = virtqueue_get_buf(rq->vq, &buflen);
		if (unlikely(!buf))
			goto err_buf;

		p = virt_to_head_page(buf);
		off = buf - page_address(p);

		/* guard against a misconfigured or uncooperative backend that
		 * is sending packet larger than the MTU.
		 */
590
		if ((page_off + buflen + tailroom) > PAGE_SIZE) {
591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608
			put_page(p);
			goto err_buf;
		}

		memcpy(page_address(page) + page_off,
		       page_address(p) + off, buflen);
		page_off += buflen;
		put_page(p);
	}

	/* Headroom does not contribute to packet length */
	*len = page_off - VIRTIO_XDP_HEADROOM;
	return page;
err_buf:
	__free_pages(page, 0);
	return NULL;
}

609 610 611
static struct sk_buff *receive_small(struct net_device *dev,
				     struct virtnet_info *vi,
				     struct receive_queue *rq,
612
				     void *buf, void *ctx,
J
Jason Wang 已提交
613
				     unsigned int len,
614
				     unsigned int *xdp_xmit,
615
				     struct virtnet_rq_stats *stats)
616
{
617
	struct sk_buff *skb;
618
	struct bpf_prog *xdp_prog;
619
	unsigned int xdp_headroom = (unsigned long)ctx;
620 621 622 623
	unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
	unsigned int headroom = vi->hdr_len + header_offset;
	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
624
	struct page *page = virt_to_head_page(buf);
625
	unsigned int delta = 0;
626
	struct page *xdp_page;
627 628
	int err;

629
	len -= vi->hdr_len;
630
	stats->bytes += len;
631

632 633 634
	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
635
		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
636
		struct xdp_frame *xdpf;
637
		struct xdp_buff xdp;
638
		void *orig_data;
639 640
		u32 act;

641
		if (unlikely(hdr->hdr.gso_type))
642
			goto err_xdp;
643

644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664
		if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
			int offset = buf - page_address(page) + header_offset;
			unsigned int tlen = len + vi->hdr_len;
			u16 num_buf = 1;

			xdp_headroom = virtnet_get_headroom(vi);
			header_offset = VIRTNET_RX_PAD + xdp_headroom;
			headroom = vi->hdr_len + header_offset;
			buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
				 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
			xdp_page = xdp_linearize_page(rq, &num_buf, page,
						      offset, header_offset,
						      &tlen);
			if (!xdp_page)
				goto err_xdp;

			buf = page_address(xdp_page);
			put_page(page);
			page = xdp_page;
		}

665 666
		xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
		xdp.data = xdp.data_hard_start + xdp_headroom;
667
		xdp_set_data_meta_invalid(&xdp);
668
		xdp.data_end = xdp.data + len;
669
		xdp.rxq = &rq->xdp_rxq;
670
		orig_data = xdp.data;
671
		act = bpf_prog_run_xdp(xdp_prog, &xdp);
672
		stats->xdp_packets++;
673

674 675
		switch (act) {
		case XDP_PASS:
676
			/* Recalculate length in case bpf program changed it */
677
			delta = orig_data - xdp.data;
678
			len = xdp.data_end - xdp.data;
679 680
			break;
		case XDP_TX:
681
			stats->xdp_tx++;
682 683 684
			xdpf = convert_to_xdp_frame(&xdp);
			if (unlikely(!xdpf))
				goto err_xdp;
685 686
			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
			if (unlikely(err < 0)) {
687
				trace_xdp_exception(vi->dev, xdp_prog, act);
688 689
				goto err_xdp;
			}
690
			*xdp_xmit |= VIRTIO_XDP_TX;
J
Jason Wang 已提交
691 692 693
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
694
			stats->xdp_redirects++;
J
Jason Wang 已提交
695
			err = xdp_do_redirect(dev, &xdp, xdp_prog);
696 697
			if (err)
				goto err_xdp;
698
			*xdp_xmit |= VIRTIO_XDP_REDIR;
699 700 701
			rcu_read_unlock();
			goto xdp_xmit;
		default:
702
			bpf_warn_invalid_xdp_action(act);
703
			/* fall through */
704 705 706
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
		case XDP_DROP:
707 708 709 710 711
			goto err_xdp;
		}
	}
	rcu_read_unlock();

712 713
	skb = build_skb(buf, buflen);
	if (!skb) {
714
		put_page(page);
715 716 717
		goto err;
	}
	skb_reserve(skb, headroom - delta);
718
	skb_put(skb, len);
719 720 721 722 723 724
	if (!delta) {
		buf += header_offset;
		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
	} /* keep zeroed vnet hdr since packet was changed by bpf */

err:
725
	return skb;
726 727 728

err_xdp:
	rcu_read_unlock();
729 730
	stats->xdp_drops++;
	stats->drops++;
731
	put_page(page);
732 733
xdp_xmit:
	return NULL;
734 735 736
}

static struct sk_buff *receive_big(struct net_device *dev,
M
Michael S. Tsirkin 已提交
737
				   struct virtnet_info *vi,
738 739
				   struct receive_queue *rq,
				   void *buf,
740
				   unsigned int len,
741
				   struct virtnet_rq_stats *stats)
742 743
{
	struct page *page = buf;
744 745
	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len,
					  PAGE_SIZE, true);
J
John Fastabend 已提交
746

747
	stats->bytes += len - vi->hdr_len;
748 749 750 751 752 753
	if (unlikely(!skb))
		goto err;

	return skb;

err:
754
	stats->drops++;
755 756 757 758
	give_pages(rq, page);
	return NULL;
}

759
static struct sk_buff *receive_mergeable(struct net_device *dev,
M
Michael S. Tsirkin 已提交
760
					 struct virtnet_info *vi,
761
					 struct receive_queue *rq,
762 763
					 void *buf,
					 void *ctx,
J
Jason Wang 已提交
764
					 unsigned int len,
765
					 unsigned int *xdp_xmit,
766
					 struct virtnet_rq_stats *stats)
767
{
768 769
	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
770 771
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
J
John Fastabend 已提交
772 773 774
	struct sk_buff *head_skb, *curr_skb;
	struct bpf_prog *xdp_prog;
	unsigned int truesize;
775
	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
776
	int err;
J
John Fastabend 已提交
777

J
John Fastabend 已提交
778
	head_skb = NULL;
779
	stats->bytes += len - vi->hdr_len;
J
John Fastabend 已提交
780

J
John Fastabend 已提交
781 782 783
	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
784
		struct xdp_frame *xdpf;
785
		struct page *xdp_page;
786 787
		struct xdp_buff xdp;
		void *data;
J
John Fastabend 已提交
788 789
		u32 act;

790 791 792 793 794 795 796
		/* Transient failure which in theory could occur if
		 * in-flight packets from before XDP was enabled reach
		 * the receive path after XDP is loaded.
		 */
		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;

797 798 799 800 801 802
		/* This happens when rx buffer size is underestimated
		 * or headroom is not enough because of the buffer
		 * was refilled before XDP is set. This should only
		 * happen for the first several packets, so we don't
		 * care much about its performance.
		 */
803 804
		if (unlikely(num_buf > 1 ||
			     headroom < virtnet_get_headroom(vi))) {
805
			/* linearize data for XDP */
806
			xdp_page = xdp_linearize_page(rq, &num_buf,
807 808 809
						      page, offset,
						      VIRTIO_XDP_HEADROOM,
						      &len);
810 811
			if (!xdp_page)
				goto err_xdp;
812
			offset = VIRTIO_XDP_HEADROOM;
813 814
		} else {
			xdp_page = page;
J
John Fastabend 已提交
815 816
		}

817 818 819
		/* Allow consuming headroom but reserve enough space to push
		 * the descriptor on if we get an XDP_TX return code.
		 */
820
		data = page_address(xdp_page) + offset;
821
		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
822
		xdp.data = data + vi->hdr_len;
823
		xdp_set_data_meta_invalid(&xdp);
824
		xdp.data_end = xdp.data + (len - vi->hdr_len);
825 826
		xdp.rxq = &rq->xdp_rxq;

827
		act = bpf_prog_run_xdp(xdp_prog, &xdp);
828
		stats->xdp_packets++;
829

J
John Fastabend 已提交
830 831
		switch (act) {
		case XDP_PASS:
832 833 834 835 836 837 838
			/* recalculate offset to account for any header
			 * adjustments. Note other cases do not build an
			 * skb and avoid using offset
			 */
			offset = xdp.data -
					page_address(xdp_page) - vi->hdr_len;

839 840 841
			/* recalculate len if xdp.data or xdp.data_end were
			 * adjusted
			 */
842
			len = xdp.data_end - xdp.data + vi->hdr_len;
843 844 845 846 847
			/* We can only create skb based on xdp_page. */
			if (unlikely(xdp_page != page)) {
				rcu_read_unlock();
				put_page(page);
				head_skb = page_to_skb(vi, rq, xdp_page,
848 849
						       offset, len,
						       PAGE_SIZE, false);
850 851
				return head_skb;
			}
J
John Fastabend 已提交
852 853
			break;
		case XDP_TX:
854
			stats->xdp_tx++;
855 856 857
			xdpf = convert_to_xdp_frame(&xdp);
			if (unlikely(!xdpf))
				goto err_xdp;
858 859
			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
			if (unlikely(err < 0)) {
860
				trace_xdp_exception(vi->dev, xdp_prog, act);
861 862 863 864
				if (unlikely(xdp_page != page))
					put_page(xdp_page);
				goto err_xdp;
			}
865
			*xdp_xmit |= VIRTIO_XDP_TX;
866
			if (unlikely(xdp_page != page))
867
				put_page(page);
J
John Fastabend 已提交
868 869
			rcu_read_unlock();
			goto xdp_xmit;
870
		case XDP_REDIRECT:
871
			stats->xdp_redirects++;
872 873 874 875 876 877
			err = xdp_do_redirect(dev, &xdp, xdp_prog);
			if (err) {
				if (unlikely(xdp_page != page))
					put_page(xdp_page);
				goto err_xdp;
			}
878
			*xdp_xmit |= VIRTIO_XDP_REDIR;
879
			if (unlikely(xdp_page != page))
880
				put_page(page);
881 882
			rcu_read_unlock();
			goto xdp_xmit;
J
John Fastabend 已提交
883
		default:
884
			bpf_warn_invalid_xdp_action(act);
885
			/* fall through */
886 887
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
888
			/* fall through */
889
		case XDP_DROP:
890 891
			if (unlikely(xdp_page != page))
				__free_pages(xdp_page, 0);
J
John Fastabend 已提交
892
			goto err_xdp;
J
John Fastabend 已提交
893
		}
J
John Fastabend 已提交
894 895
	}
	rcu_read_unlock();
896

897 898
	truesize = mergeable_ctx_to_truesize(ctx);
	if (unlikely(len > truesize)) {
899
		pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
900 901 902 903
			 dev->name, len, (unsigned long)ctx);
		dev->stats.rx_length_errors++;
		goto err_skb;
	}
904

905
	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog);
J
John Fastabend 已提交
906
	curr_skb = head_skb;
907

908 909
	if (unlikely(!curr_skb))
		goto err_skb;
910
	while (--num_buf) {
911 912
		int num_skb_frags;

913
		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
914
		if (unlikely(!buf)) {
915
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
M
Michael S. Tsirkin 已提交
916
				 dev->name, num_buf,
917 918
				 virtio16_to_cpu(vi->vdev,
						 hdr->num_buffers));
919 920
			dev->stats.rx_length_errors++;
			goto err_buf;
921
		}
922

923
		stats->bytes += len;
924
		page = virt_to_head_page(buf);
925 926 927

		truesize = mergeable_ctx_to_truesize(ctx);
		if (unlikely(len > truesize)) {
928
			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
929 930 931 932
				 dev->name, len, (unsigned long)ctx);
			dev->stats.rx_length_errors++;
			goto err_skb;
		}
933 934

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
935 936
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
937 938 939

			if (unlikely(!nskb))
				goto err_skb;
940 941 942 943 944 945 946 947 948 949 950
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
951
			head_skb->truesize += truesize;
952
		}
953
		offset = buf - page_address(page);
954 955 956
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
957
					     len, truesize);
958 959
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
960
					offset, len, truesize);
961
		}
962 963
	}

J
Johannes Berg 已提交
964
	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
965 966
	return head_skb;

J
John Fastabend 已提交
967 968
err_xdp:
	rcu_read_unlock();
969
	stats->xdp_drops++;
970 971
err_skb:
	put_page(page);
972
	while (num_buf-- > 1) {
973 974
		buf = virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!buf)) {
975 976 977 978 979
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
980
		stats->bytes += len;
981
		page = virt_to_head_page(buf);
982
		put_page(page);
983
	}
984
err_buf:
985
	stats->drops++;
986
	dev_kfree_skb(head_skb);
J
John Fastabend 已提交
987
xdp_xmit:
988
	return NULL;
989 990
}

991 992
static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
			void *buf, unsigned int len, void **ctx,
993
			unsigned int *xdp_xmit,
994
			struct virtnet_rq_stats *stats)
995
{
996
	struct net_device *dev = vi->dev;
997
	struct sk_buff *skb;
998
	struct virtio_net_hdr_mrg_rxbuf *hdr;
999

1000
	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
1001 1002
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
1003
		if (vi->mergeable_rx_bufs) {
1004
			put_page(virt_to_head_page(buf));
1005
		} else if (vi->big_packets) {
1006
			give_pages(rq, buf);
1007
		} else {
1008
			put_page(virt_to_head_page(buf));
1009
		}
1010
		return;
1011
	}
1012

1013
	if (vi->mergeable_rx_bufs)
1014
		skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit,
1015
					stats);
1016
	else if (vi->big_packets)
1017
		skb = receive_big(dev, vi, rq, buf, len, stats);
1018
	else
1019
		skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats);
1020 1021

	if (unlikely(!skb))
1022
		return;
1023

1024
	hdr = skb_vnet_hdr(skb);
1025

1026
	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
1027
		skb->ip_summed = CHECKSUM_UNNECESSARY;
R
Rusty Russell 已提交
1028

1029 1030 1031 1032 1033 1034
	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
				  virtio_is_little_endian(vi->vdev))) {
		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
				     dev->name, hdr->hdr.gso_type,
				     hdr->hdr.gso_size);
		goto frame_err;
R
Rusty Russell 已提交
1035 1036
	}

1037 1038 1039 1040
	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

E
Eric Dumazet 已提交
1041
	napi_gro_receive(&rq->napi, skb);
1042
	return;
R
Rusty Russell 已提交
1043 1044 1045 1046 1047 1048

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
}

1049 1050 1051 1052 1053
/* Unlike mergeable buffers, all buffers are allocated to the
 * same size, except for the headroom. For this reason we do
 * not need to use  mergeable_len_to_ctx here - it is enough
 * to store the headroom as the context ignoring the truesize.
 */
M
Michael S. Tsirkin 已提交
1054 1055
static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
			     gfp_t gfp)
R
Rusty Russell 已提交
1056
{
1057 1058
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
1059
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
1060
	void *ctx = (void *)(unsigned long)xdp_headroom;
1061
	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
1062
	int err;
1063

1064 1065 1066
	len = SKB_DATA_ALIGN(len) +
	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
1067
		return -ENOMEM;
R
Rusty Russell 已提交
1068

1069 1070 1071 1072 1073
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
		    vi->hdr_len + GOOD_PACKET_LEN);
1074
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
1075
	if (err < 0)
1076
		put_page(virt_to_head_page(buf));
1077 1078
	return err;
}
1079

1080 1081
static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
			   gfp_t gfp)
1082 1083 1084 1085 1086
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

1087 1088
	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);

1089
	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
1090
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
1091
		first = get_a_page(rq, gfp);
1092 1093
		if (!first) {
			if (list)
1094
				give_pages(rq, list);
1095
			return -ENOMEM;
1096
		}
1097
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
1098

1099 1100 1101 1102
		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}
R
Rusty Russell 已提交
1103

1104
	first = get_a_page(rq, gfp);
1105
	if (!first) {
1106
		give_pages(rq, list);
1107 1108 1109 1110
		return -ENOMEM;
	}
	p = page_address(first);

1111
	/* rq->sg[0], rq->sg[1] share the same page */
1112 1113
	/* a separated rq->sg[0] for header - required in case !any_header_sg */
	sg_set_buf(&rq->sg[0], p, vi->hdr_len);
1114

1115
	/* rq->sg[1] for data packet, from offset */
1116
	offset = sizeof(struct padded_vnet_hdr);
1117
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
1118 1119 1120

	/* chain first in list head */
	first->private = (unsigned long)list;
1121 1122
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
1123
	if (err < 0)
1124
		give_pages(rq, first);
1125 1126

	return err;
R
Rusty Russell 已提交
1127 1128
}

1129
static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
1130 1131
					  struct ewma_pkt_len *avg_pkt_len,
					  unsigned int room)
1132
{
1133
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1134 1135
	unsigned int len;

1136 1137 1138 1139
	if (room)
		return PAGE_SIZE - room;

	len = hdr_len +	clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
1140
				rq->min_buf_len, PAGE_SIZE - hdr_len);
1141

1142
	return ALIGN(len, L1_CACHE_BYTES);
1143 1144
}

1145 1146
static int add_recvbuf_mergeable(struct virtnet_info *vi,
				 struct receive_queue *rq, gfp_t gfp)
1147
{
1148
	struct page_frag *alloc_frag = &rq->alloc_frag;
1149
	unsigned int headroom = virtnet_get_headroom(vi);
1150 1151
	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
1152
	char *buf;
1153
	void *ctx;
1154
	int err;
1155
	unsigned int len, hole;
1156

1157 1158 1159 1160 1161 1162
	/* Extra tailroom is needed to satisfy XDP's assumption. This
	 * means rx frags coalescing won't work, but consider we've
	 * disabled GSO for XDP, it won't be a big issue.
	 */
	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
1163
		return -ENOMEM;
1164

1165
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
1166
	buf += headroom; /* advance address leaving hole at front of pkt */
1167
	get_page(alloc_frag->page);
1168
	alloc_frag->offset += len + room;
1169
	hole = alloc_frag->size - alloc_frag->offset;
1170
	if (hole < len + room) {
1171 1172
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
1173
		 * the current buffer.
1174
		 */
1175 1176 1177
		len += hole;
		alloc_frag->offset += hole;
	}
1178

1179
	sg_init_one(rq->sg, buf, len);
1180
	ctx = mergeable_len_to_ctx(len, headroom);
1181
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
1182
	if (err < 0)
1183
		put_page(virt_to_head_page(buf));
1184

1185 1186
	return err;
}
1187

1188 1189 1190 1191 1192 1193 1194
/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
M
Michael S. Tsirkin 已提交
1195 1196
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
1197 1198
{
	int err;
1199
	bool oom;
1200

1201 1202
	do {
		if (vi->mergeable_rx_bufs)
1203
			err = add_recvbuf_mergeable(vi, rq, gfp);
1204
		else if (vi->big_packets)
1205
			err = add_recvbuf_big(vi, rq, gfp);
1206
		else
M
Michael S. Tsirkin 已提交
1207
			err = add_recvbuf_small(vi, rq, gfp);
1208

1209
		oom = err == -ENOMEM;
1210
		if (err)
1211
			break;
1212
	} while (rq->vq->num_free);
T
Toshiaki Makita 已提交
1213 1214
	if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) {
		u64_stats_update_begin(&rq->stats.syncp);
1215
		rq->stats.kicks++;
T
Toshiaki Makita 已提交
1216 1217 1218
		u64_stats_update_end(&rq->stats.syncp);
	}

1219
	return !oom;
1220 1221
}

1222
static void skb_recv_done(struct virtqueue *rvq)
R
Rusty Russell 已提交
1223 1224
{
	struct virtnet_info *vi = rvq->vdev->priv;
J
Jason Wang 已提交
1225
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
1226

1227
	virtqueue_napi_schedule(&rq->napi, rvq);
R
Rusty Russell 已提交
1228 1229
}

1230
static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
1231
{
1232
	napi_enable(napi);
1233 1234

	/* If all buffers were filled by other side before we napi_enabled, we
1235 1236 1237 1238 1239 1240
	 * won't get another interrupt, so process any outstanding packets now.
	 * Call local_bh_enable after to trigger softIRQ processing.
	 */
	local_bh_disable();
	virtqueue_napi_schedule(napi, vq);
	local_bh_enable();
1241 1242
}

W
Willem de Bruijn 已提交
1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260
static void virtnet_napi_tx_enable(struct virtnet_info *vi,
				   struct virtqueue *vq,
				   struct napi_struct *napi)
{
	if (!napi->weight)
		return;

	/* Tx napi touches cachelines on the cpu handling tx interrupts. Only
	 * enable the feature if this is likely affine with the transmit path.
	 */
	if (!vi->affinity_hint_set) {
		napi->weight = 0;
		return;
	}

	return virtnet_napi_enable(vq, napi);
}

1261 1262 1263 1264 1265 1266
static void virtnet_napi_tx_disable(struct napi_struct *napi)
{
	if (napi->weight)
		napi_disable(napi);
}

1267 1268
static void refill_work(struct work_struct *work)
{
1269 1270
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
1271
	bool still_empty;
J
Jason Wang 已提交
1272 1273
	int i;

1274
	for (i = 0; i < vi->curr_queue_pairs; i++) {
J
Jason Wang 已提交
1275
		struct receive_queue *rq = &vi->rq[i];
1276

J
Jason Wang 已提交
1277
		napi_disable(&rq->napi);
M
Michael S. Tsirkin 已提交
1278
		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
1279
		virtnet_napi_enable(rq->vq, &rq->napi);
1280

J
Jason Wang 已提交
1281 1282 1283 1284 1285 1286
		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
1287 1288
}

1289 1290
static int virtnet_receive(struct receive_queue *rq, int budget,
			   unsigned int *xdp_xmit)
R
Rusty Russell 已提交
1291
{
1292
	struct virtnet_info *vi = rq->vq->vdev->priv;
1293
	struct virtnet_rq_stats stats = {};
1294
	unsigned int len;
1295
	void *buf;
1296
	int i;
R
Rusty Russell 已提交
1297

1298
	if (!vi->big_packets || vi->mergeable_rx_bufs) {
1299 1300
		void *ctx;

1301
		while (stats.packets < budget &&
1302
		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
1303
			receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
1304
			stats.packets++;
1305 1306
		}
	} else {
1307
		while (stats.packets < budget &&
1308
		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
1309
			receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
1310
			stats.packets++;
1311
		}
R
Rusty Russell 已提交
1312 1313
	}

1314
	if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
M
Michael S. Tsirkin 已提交
1315
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
1316
			schedule_delayed_work(&vi->refill, 0);
1317
	}
R
Rusty Russell 已提交
1318

T
Toshiaki Makita 已提交
1319
	u64_stats_update_begin(&rq->stats.syncp);
1320 1321 1322 1323
	for (i = 0; i < VIRTNET_RQ_STATS_LEN; i++) {
		size_t offset = virtnet_rq_stats_desc[i].offset;
		u64 *item;

1324 1325
		item = (u64 *)((u8 *)&rq->stats + offset);
		*item += *(u64 *)((u8 *)&stats + offset);
1326
	}
T
Toshiaki Makita 已提交
1327
	u64_stats_update_end(&rq->stats.syncp);
J
Jason Wang 已提交
1328

1329
	return stats.packets;
1330 1331
}

1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344
static void free_old_xmit_skbs(struct send_queue *sq)
{
	struct sk_buff *skb;
	unsigned int len;
	unsigned int packets = 0;
	unsigned int bytes = 0;

	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		pr_debug("Sent skb %p\n", skb);

		bytes += skb->len;
		packets++;

1345
		dev_consume_skb_any(skb);
1346 1347 1348 1349 1350 1351 1352 1353
	}

	/* Avoid overhead when no packets have been processed
	 * happens when called speculatively from start_xmit.
	 */
	if (!packets)
		return;

T
Toshiaki Makita 已提交
1354 1355 1356 1357
	u64_stats_update_begin(&sq->stats.syncp);
	sq->stats.bytes += bytes;
	sq->stats.packets += packets;
	u64_stats_update_end(&sq->stats.syncp);
1358 1359
}

1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378
static void virtnet_poll_cleantx(struct receive_queue *rq)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	unsigned int index = vq2rxq(rq->vq);
	struct send_queue *sq = &vi->sq[index];
	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);

	if (!sq->napi.weight)
		return;

	if (__netif_tx_trylock(txq)) {
		free_old_xmit_skbs(sq);
		__netif_tx_unlock(txq);
	}

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);
}

1379 1380 1381 1382
static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
1383 1384
	struct virtnet_info *vi = rq->vq->vdev->priv;
	struct send_queue *sq;
1385
	unsigned int received;
1386
	unsigned int xdp_xmit = 0;
1387

1388 1389
	virtnet_poll_cleantx(rq);

J
Jason Wang 已提交
1390
	received = virtnet_receive(rq, budget, &xdp_xmit);
1391

1392
	/* Out of packets? */
1393 1394
	if (received < budget)
		virtqueue_napi_complete(napi, rq->vq, received);
R
Rusty Russell 已提交
1395

1396 1397 1398 1399
	if (xdp_xmit & VIRTIO_XDP_REDIR)
		xdp_do_flush_map();

	if (xdp_xmit & VIRTIO_XDP_TX) {
1400
		sq = virtnet_xdp_sq(vi);
T
Toshiaki Makita 已提交
1401 1402 1403 1404 1405
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
			u64_stats_update_begin(&sq->stats.syncp);
			sq->stats.kicks++;
			u64_stats_update_end(&sq->stats.syncp);
		}
1406
	}
J
Jason Wang 已提交
1407

R
Rusty Russell 已提交
1408 1409 1410
	return received;
}

J
Jason Wang 已提交
1411 1412 1413
static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
1414
	int i, err;
J
Jason Wang 已提交
1415

1416 1417 1418
	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
M
Michael S. Tsirkin 已提交
1419
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1420
				schedule_delayed_work(&vi->refill, 0);
1421 1422 1423 1424 1425

		err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i);
		if (err < 0)
			return err;

1426 1427 1428 1429 1430 1431 1432
		err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
						 MEM_TYPE_PAGE_SHARED, NULL);
		if (err < 0) {
			xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
			return err;
		}

1433
		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
W
Willem de Bruijn 已提交
1434
		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
J
Jason Wang 已提交
1435 1436 1437 1438 1439
	}

	return 0;
}

W
Willem de Bruijn 已提交
1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457
static int virtnet_poll_tx(struct napi_struct *napi, int budget)
{
	struct send_queue *sq = container_of(napi, struct send_queue, napi);
	struct virtnet_info *vi = sq->vq->vdev->priv;
	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, vq2txq(sq->vq));

	__netif_tx_lock(txq, raw_smp_processor_id());
	free_old_xmit_skbs(sq);
	__netif_tx_unlock(txq);

	virtqueue_napi_complete(napi, sq->vq, 0);

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);

	return 0;
}

1458
static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
R
Rusty Russell 已提交
1459
{
1460
	struct virtio_net_hdr_mrg_rxbuf *hdr;
R
Rusty Russell 已提交
1461
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
1462
	struct virtnet_info *vi = sq->vq->vdev->priv;
1463
	int num_sg;
1464
	unsigned hdr_len = vi->hdr_len;
1465
	bool can_push;
R
Rusty Russell 已提交
1466

J
Johannes Berg 已提交
1467
	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
1468 1469 1470 1471 1472 1473 1474

	can_push = vi->any_header_sg &&
		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
1475
		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
1476 1477
	else
		hdr = skb_vnet_hdr(skb);
R
Rusty Russell 已提交
1478

1479
	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
1480 1481
				    virtio_is_little_endian(vi->vdev), false,
				    0))
1482
		BUG();
R
Rusty Russell 已提交
1483

1484
	if (vi->mergeable_rx_bufs)
1485
		hdr->num_buffers = 0;
1486

1487
	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
1488 1489 1490
	if (can_push) {
		__skb_push(skb, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
1491 1492
		if (unlikely(num_sg < 0))
			return num_sg;
1493 1494 1495 1496
		/* Pull header back to avoid skew in tx bytes calculations. */
		__skb_pull(skb, hdr_len);
	} else {
		sg_set_buf(sq->sg, hdr, hdr_len);
1497 1498 1499 1500
		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
		if (unlikely(num_sg < 0))
			return num_sg;
		num_sg++;
1501
	}
1502
	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
1503 1504
}

1505
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
1506 1507
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1508 1509
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
1510
	int err;
1511 1512
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
	bool kick = !skb->xmit_more;
W
Willem de Bruijn 已提交
1513
	bool use_napi = sq->napi.weight;
1514 1515

	/* Free up any pending old buffers before queueing new ones. */
1516
	free_old_xmit_skbs(sq);
1517

1518 1519 1520
	if (use_napi && kick)
		virtqueue_enable_cb_delayed(sq->vq);

1521 1522 1523
	/* timestamp packet in software */
	skb_tx_timestamp(skb);

1524
	/* Try to transmit */
1525
	err = xmit_skb(sq, skb);
1526

1527
	/* This should not happen! */
1528
	if (unlikely(err)) {
1529 1530 1531
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
1532
				 "Unexpected TXQ (%d) queue failure: %d\n", qnum, err);
1533
		dev->stats.tx_dropped++;
1534
		dev_kfree_skb_any(skb);
1535
		return NETDEV_TX_OK;
R
Rusty Russell 已提交
1536
	}
1537

1538
	/* Don't wait up for transmitted skbs to be freed. */
W
Willem de Bruijn 已提交
1539 1540 1541 1542
	if (!use_napi) {
		skb_orphan(skb);
		nf_reset(skb);
	}
1543

1544 1545 1546 1547 1548 1549 1550 1551 1552
	/* If running out of space, stop queue to avoid getting packets that we
	 * are then unable to transmit.
	 * An alternative would be to force queuing layer to requeue the skb by
	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
	 * returned in a normal path of operation: it means that driver is not
	 * maintaining the TX queue stop/start state properly, and causes
	 * the stack to do a non-trivial amount of useless work.
	 * Since most packets only take 1 or 2 ring slots, stopping the queue
	 * early means 16 slots are typically wasted.
1553
	 */
1554
	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1555
		netif_stop_subqueue(dev, qnum);
W
Willem de Bruijn 已提交
1556 1557
		if (!use_napi &&
		    unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
1558
			/* More just got used, free them then recheck. */
1559 1560
			free_old_xmit_skbs(sq);
			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1561
				netif_start_subqueue(dev, qnum);
1562
				virtqueue_disable_cb(sq->vq);
1563 1564
			}
		}
1565
	}
1566

T
Toshiaki Makita 已提交
1567 1568 1569 1570 1571 1572 1573
	if (kick || netif_xmit_stopped(txq)) {
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
			u64_stats_update_begin(&sq->stats.syncp);
			sq->stats.kicks++;
			u64_stats_update_end(&sq->stats.syncp);
		}
	}
R
Rusty Russell 已提交
1574

1575
	return NETDEV_TX_OK;
1576 1577
}

1578 1579 1580
/*
 * Send command via the control virtqueue and check status.  Commands
 * supported by the hypervisor, as indicated by feature bits, should
S
stephen hemminger 已提交
1581
 * never fail unless improperly formatted.
1582 1583
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
1584
				 struct scatterlist *out)
1585
{
1586
	struct scatterlist *sgs[4], hdr, stat;
1587
	unsigned out_num = 0, tmp;
1588 1589

	/* Caller should know better */
1590
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
1591

1592 1593 1594
	vi->ctrl->status = ~0;
	vi->ctrl->hdr.class = class;
	vi->ctrl->hdr.cmd = cmd;
1595
	/* Add header */
1596
	sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr));
1597
	sgs[out_num++] = &hdr;
1598

1599 1600
	if (out)
		sgs[out_num++] = out;
1601

1602
	/* Add return status. */
1603
	sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status));
1604
	sgs[out_num] = &stat;
1605

1606
	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
1607
	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
1608

1609
	if (unlikely(!virtqueue_kick(vi->cvq)))
1610
		return vi->ctrl->status == VIRTIO_NET_OK;
1611 1612 1613 1614

	/* Spin for a response, the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
1615 1616
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
1617 1618
		cpu_relax();

1619
	return vi->ctrl->status == VIRTIO_NET_OK;
1620 1621
}

1622 1623 1624 1625
static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
1626
	int ret;
1627
	struct sockaddr *addr;
1628
	struct scatterlist sg;
1629

1630 1631 1632
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
		return -EOPNOTSUPP;

1633
	addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
1634 1635 1636 1637
	if (!addr)
		return -ENOMEM;

	ret = eth_prepare_mac_addr_change(dev, addr);
1638
	if (ret)
1639
		goto out;
1640

1641 1642 1643
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
		sg_init_one(&sg, addr->sa_data, dev->addr_len);
		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1644
					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
1645 1646
			dev_warn(&vdev->dev,
				 "Failed to set mac address by vq command.\n");
1647 1648
			ret = -EINVAL;
			goto out;
1649
		}
1650 1651
	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1652 1653 1654 1655 1656 1657 1658
		unsigned int i;

		/* Naturally, this has an atomicity problem. */
		for (i = 0; i < dev->addr_len; i++)
			virtio_cwrite8(vdev,
				       offsetof(struct virtio_net_config, mac) +
				       i, addr->sa_data[i]);
1659 1660 1661
	}

	eth_commit_mac_addr_change(dev, p);
1662
	ret = 0;
1663

1664 1665 1666
out:
	kfree(addr);
	return ret;
1667 1668
}

1669 1670
static void virtnet_stats(struct net_device *dev,
			  struct rtnl_link_stats64 *tot)
1671 1672 1673
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int start;
T
Toshiaki Makita 已提交
1674
	int i;
1675

T
Toshiaki Makita 已提交
1676
	for (i = 0; i < vi->max_queue_pairs; i++) {
1677
		u64 tpackets, tbytes, rpackets, rbytes, rdrops;
T
Toshiaki Makita 已提交
1678 1679
		struct receive_queue *rq = &vi->rq[i];
		struct send_queue *sq = &vi->sq[i];
1680 1681

		do {
T
Toshiaki Makita 已提交
1682 1683 1684 1685
			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
			tpackets = sq->stats.packets;
			tbytes   = sq->stats.bytes;
		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
1686 1687

		do {
T
Toshiaki Makita 已提交
1688
			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
1689 1690 1691
			rpackets = rq->stats.packets;
			rbytes   = rq->stats.bytes;
			rdrops   = rq->stats.drops;
T
Toshiaki Makita 已提交
1692
		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
1693 1694 1695 1696 1697

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes   += rbytes;
		tot->tx_bytes   += tbytes;
1698
		tot->rx_dropped += rdrops;
1699 1700 1701
	}

	tot->tx_dropped = dev->stats.tx_dropped;
1702
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
1703 1704 1705 1706
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
}

1707 1708 1709 1710
static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
1711
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
1712 1713 1714 1715
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}

1716
static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
J
Jason Wang 已提交
1717 1718 1719 1720 1721 1722 1723
{
	struct scatterlist sg;
	struct net_device *dev = vi->dev;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

1724 1725
	vi->ctrl->mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
	sg_init_one(&sg, &vi->ctrl->mq, sizeof(vi->ctrl->mq));
J
Jason Wang 已提交
1726 1727

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
1728
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
J
Jason Wang 已提交
1729 1730 1731
		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
1732
	} else {
J
Jason Wang 已提交
1733
		vi->curr_queue_pairs = queue_pairs;
1734 1735 1736
		/* virtnet_open() will refill when device is going to up. */
		if (dev->flags & IFF_UP)
			schedule_delayed_work(&vi->refill, 0);
1737
	}
J
Jason Wang 已提交
1738 1739 1740 1741

	return 0;
}

1742 1743 1744 1745 1746 1747 1748 1749 1750 1751
static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	int err;

	rtnl_lock();
	err = _virtnet_set_queues(vi, queue_pairs);
	rtnl_unlock();
	return err;
}

static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);

	for (i = 0; i < vi->max_queue_pairs; i++) {
		xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
		napi_disable(&vi->rq[i].napi);
		virtnet_napi_tx_disable(&vi->sq[i].napi);
	}

	return 0;
}

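/* ndo_set_rx_mode hook: push the promiscuous/allmulti flags and the unicast
 * and multicast MAC filter lists to the host over the control virtqueue.
 * Runs in atomic context, hence the GFP_ATOMIC allocation below.
 */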
static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg[2];
	struct virtio_net_ctrl_mac *mac_data;
	struct netdev_hw_addr *ha;
	int uc_count;
	int mc_count;
	void *buf;
	int i;

	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

	vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
	vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);

	sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
			 vi->ctrl->promisc ? "en" : "dis");

	sg_init_one(sg, &vi->ctrl->allmulti, sizeof(vi->ctrl->allmulti));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
			 vi->ctrl->allmulti ? "en" : "dis");

	uc_count = netdev_uc_count(dev);
	mc_count = netdev_mc_count(dev);
	/* MAC filter - use one buffer for both lists */
	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
	mac_data = buf;
	if (!buf)
		return;

	sg_init_table(sg, 2);

	/* Store the unicast list and count in the front of the buffer */
	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
	i = 0;
	netdev_for_each_uc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[0], mac_data,
		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));

	/* multicast list and count fill the end */
	mac_data = (void *)&mac_data->macs[uc_count][0];

	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
	i = 0;
	netdev_for_each_mc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[1], mac_data,
		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");

	kfree(buf);
}

static int virtnet_vlan_rx_add_vid(struct net_device *dev,
				   __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
	return 0;
}

static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
				    __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
	return 0;
}

static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu)
{
	int i;

	if (vi->affinity_hint_set) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtqueue_set_affinity(vi->rq[i].vq, NULL);
			virtqueue_set_affinity(vi->sq[i].vq, NULL);
		}

		vi->affinity_hint_set = false;
	}
}

static void virtnet_set_affinity(struct virtnet_info *vi)
{
	cpumask_var_t mask;
	int stragglers;
	int group_size;
	int i, j, cpu;
	int num_cpu;
	int stride;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
		virtnet_clean_affinity(vi, -1);
		return;
	}

	num_cpu = num_online_cpus();
	stride = max_t(int, num_cpu / vi->curr_queue_pairs, 1);
	stragglers = num_cpu >= vi->curr_queue_pairs ?
			num_cpu % vi->curr_queue_pairs :
			0;
	cpu = cpumask_next(-1, cpu_online_mask);

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		group_size = stride + (i < stragglers ? 1 : 0);

		for (j = 0; j < group_size; j++) {
			cpumask_set_cpu(cpu, mask);
			cpu = cpumask_next_wrap(cpu, cpu_online_mask,
						nr_cpu_ids, false);
		}
		virtqueue_set_affinity(vi->rq[i].vq, mask);
		virtqueue_set_affinity(vi->sq[i].vq, mask);
		__netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, false);
		cpumask_clear(mask);
	}

	vi->affinity_hint_set = true;
	free_cpumask_var(mask);
}

static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node_dead);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);

	virtnet_clean_affinity(vi, cpu);
	return 0;
}

static enum cpuhp_state virtionet_online;

static int virtnet_cpu_notif_add(struct virtnet_info *vi)
{
	int ret;

	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
	if (ret)
		return ret;
	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					       &vi->node_dead);
	if (!ret)
		return ret;
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	return ret;
}

static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
{
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					    &vi->node_dead);
}

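/* ethtool -g: the virtqueue sizes are fixed by the device, so report the
 * vring size as both the maximum and the current ring parameters.
 */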
static void virtnet_get_ringparam(struct net_device *dev,
				struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}

static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));

}

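/* ethtool -L: change the number of combined channels (RX/TX queue pairs),
 * then recompute queue/CPU affinity for the new count.
 */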
/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
		return -EINVAL;

	/* For now we don't support modifying channels while XDP is loaded
	 * also when XDP is loaded all RX queues have XDP programs so we only
	 * need to check a single RX queue.
	 */
	if (vi->rq[0].xdp_prog)
		return -EINVAL;

	get_online_cpus();
	err = _virtnet_set_queues(vi, queue_pairs);
	if (!err) {
		netif_set_real_num_tx_queues(dev, queue_pairs);
		netif_set_real_num_rx_queues(dev, queue_pairs);

		virtnet_set_affinity(vi);
	}
	put_online_cpus();

	return err;
}

static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
	struct virtnet_info *vi = netdev_priv(dev);
	char *p = (char *)data;
	unsigned int i, j;

	switch (stringset) {
	case ETH_SS_STATS:
		for (i = 0; i < vi->curr_queue_pairs; i++) {
			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
				snprintf(p, ETH_GSTRING_LEN, "rx_queue_%u_%s",
					 i, virtnet_rq_stats_desc[j].desc);
				p += ETH_GSTRING_LEN;
			}
		}

		for (i = 0; i < vi->curr_queue_pairs; i++) {
			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) {
				snprintf(p, ETH_GSTRING_LEN, "tx_queue_%u_%s",
					 i, virtnet_sq_stats_desc[j].desc);
				p += ETH_GSTRING_LEN;
			}
		}
		break;
	}
}

static int virtnet_get_sset_count(struct net_device *dev, int sset)
{
	struct virtnet_info *vi = netdev_priv(dev);

	switch (sset) {
	case ETH_SS_STATS:
		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
					       VIRTNET_SQ_STATS_LEN);
	default:
		return -EOPNOTSUPP;
	}
}

static void virtnet_get_ethtool_stats(struct net_device *dev,
				      struct ethtool_stats *stats, u64 *data)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int idx = 0, start, i, j;
	const u8 *stats_base;
	size_t offset;

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct receive_queue *rq = &vi->rq[i];

		stats_base = (u8 *)&rq->stats;
		do {
			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
				offset = virtnet_rq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
		idx += VIRTNET_RQ_STATS_LEN;
	}

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct send_queue *sq = &vi->sq[i];

		stats_base = (u8 *)&sq->stats;
		do {
			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) {
				offset = virtnet_sq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
		idx += VIRTNET_SQ_STATS_LEN;
	}
}

static void virtnet_get_channels(struct net_device *dev,
				 struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);

	channels->combined_count = vi->curr_queue_pairs;
	channels->max_combined = vi->max_queue_pairs;
	channels->max_other = 0;
	channels->rx_count = 0;
	channels->tx_count = 0;
	channels->other_count = 0;
}

/* Check if the user is trying to change anything besides speed/duplex */
static bool
virtnet_validate_ethtool_cmd(const struct ethtool_link_ksettings *cmd)
{
	struct ethtool_link_ksettings diff1 = *cmd;
	struct ethtool_link_ksettings diff2 = {};

	/* cmd is always set so we need to clear it, validate the port type
	 * and also without autonegotiation we can ignore advertising
	 */
	diff1.base.speed = 0;
	diff2.base.port = PORT_OTHER;
	ethtool_link_ksettings_zero_link_mode(&diff1, advertising);
	diff1.base.duplex = 0;
	diff1.base.cmd = 0;
	diff1.base.link_mode_masks_nwords = 0;

	return !memcmp(&diff1.base, &diff2.base, sizeof(diff1.base)) &&
		bitmap_empty(diff1.link_modes.supported,
			     __ETHTOOL_LINK_MODE_MASK_NBITS) &&
		bitmap_empty(diff1.link_modes.advertising,
			     __ETHTOOL_LINK_MODE_MASK_NBITS) &&
		bitmap_empty(diff1.link_modes.lp_advertising,
			     __ETHTOOL_LINK_MODE_MASK_NBITS);
}

static int virtnet_set_link_ksettings(struct net_device *dev,
				      const struct ethtool_link_ksettings *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u32 speed;

	speed = cmd->base.speed;
	/* don't allow custom speed and duplex */
	if (!ethtool_validate_speed(speed) ||
	    !ethtool_validate_duplex(cmd->base.duplex) ||
	    !virtnet_validate_ethtool_cmd(cmd))
		return -EINVAL;
	vi->speed = speed;
	vi->duplex = cmd->base.duplex;

	return 0;
}

static int virtnet_get_link_ksettings(struct net_device *dev,
				      struct ethtool_link_ksettings *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);

	cmd->base.speed = vi->speed;
	cmd->base.duplex = vi->duplex;
	cmd->base.port = PORT_OTHER;

	return 0;
}

static void virtnet_init_settings(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	vi->speed = SPEED_UNKNOWN;
	vi->duplex = DUPLEX_UNKNOWN;
}

static void virtnet_update_settings(struct virtnet_info *vi)
{
	u32 speed;
	u8 duplex;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
		return;

	speed = virtio_cread32(vi->vdev, offsetof(struct virtio_net_config,
						  speed));
	if (ethtool_validate_speed(speed))
		vi->speed = speed;
	duplex = virtio_cread8(vi->vdev, offsetof(struct virtio_net_config,
						  duplex));
	if (ethtool_validate_duplex(duplex))
		vi->duplex = duplex;
}

static const struct ethtool_ops virtnet_ethtool_ops = {
	.get_drvinfo = virtnet_get_drvinfo,
	.get_link = ethtool_op_get_link,
	.get_ringparam = virtnet_get_ringparam,
	.get_strings = virtnet_get_strings,
	.get_sset_count = virtnet_get_sset_count,
	.get_ethtool_stats = virtnet_get_ethtool_stats,
	.set_channels = virtnet_set_channels,
	.get_channels = virtnet_get_channels,
	.get_ts_info = ethtool_op_get_ts_info,
	.get_link_ksettings = virtnet_get_link_ksettings,
	.set_link_ksettings = virtnet_set_link_ksettings,
};

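/* Quiesce the device for suspend: detach it from the stack, stop the refill
 * worker and disable every NAPI context so nothing touches the virtqueues
 * while they are torn down.
 */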
static void virtnet_freeze_down(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int i;

	/* Make sure no work handler is accessing the device */
	flush_work(&vi->config_work);

	netif_tx_lock_bh(vi->dev);
	netif_device_detach(vi->dev);
	netif_tx_unlock_bh(vi->dev);
	cancel_delayed_work_sync(&vi->refill);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			napi_disable(&vi->rq[i].napi);
			virtnet_napi_tx_disable(&vi->sq[i].napi);
		}
	}
}

static int init_vqs(struct virtnet_info *vi);

static int virtnet_restore_up(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err, i;

	err = init_vqs(vi);
	if (err)
		return err;

	virtio_device_ready(vdev);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->curr_queue_pairs; i++)
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
	}

	netif_tx_lock_bh(vi->dev);
	netif_device_attach(vi->dev);
	netif_tx_unlock_bh(vi->dev);
	return err;
}

static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
{
	struct scatterlist sg;
	vi->ctrl->offloads = cpu_to_virtio64(vi->vdev, offloads);

	sg_init_one(&sg, &vi->ctrl->offloads, sizeof(vi->ctrl->offloads));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
				  VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
		dev_warn(&vi->dev->dev, "Fail to set guest offload. \n");
		return -EINVAL;
	}

	return 0;
}

static int virtnet_clear_guest_offloads(struct virtnet_info *vi)
{
	u64 offloads = 0;

	if (!vi->guest_offloads)
		return 0;

	return virtnet_set_guest_offloads(vi, offloads);
}

static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
{
	u64 offloads = vi->guest_offloads;

	if (!vi->guest_offloads)
		return 0;

	return virtnet_set_guest_offloads(vi, offloads);
}

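/* Attach or detach an XDP program.  Receive offloads must either be absent
 * or controllable via VIRTIO_NET_CTRL_GUEST_OFFLOADS, since XDP cannot deal
 * with GSO/checksum-offloaded buffers, and extra queues are reserved so
 * XDP_TX never competes with the stack's TX queues.
 */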
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			   struct netlink_ext_ack *extack)
{
	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
	struct virtnet_info *vi = netdev_priv(dev);
	struct bpf_prog *old_prog;
	u16 xdp_qp = 0, curr_qp;
	int i, err;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
	    && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))) {
		NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO/CSUM, disable LRO/CSUM first");
		return -EOPNOTSUPP;
	}

	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
		NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
		return -EINVAL;
	}

	if (dev->mtu > max_sz) {
		NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
		return -EINVAL;
	}

	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
	if (prog)
		xdp_qp = nr_cpu_ids;

	/* XDP requires extra queues for XDP_TX */
	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
		NL_SET_ERR_MSG_MOD(extack, "Too few free TX rings available");
		netdev_warn(dev, "request %i queues but max is %i\n",
			    curr_qp + xdp_qp, vi->max_queue_pairs);
		return -ENOMEM;
	}

	if (prog) {
		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
		if (IS_ERR(prog))
			return PTR_ERR(prog);
	}

	/* Make sure NAPI is not using any XDP TX queues for RX. */
	if (netif_running(dev))
		for (i = 0; i < vi->max_queue_pairs; i++)
			napi_disable(&vi->rq[i].napi);

	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
	err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
	if (err)
		goto err;
	vi->xdp_queue_pairs = xdp_qp;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
		if (i == 0) {
			if (!old_prog)
				virtnet_clear_guest_offloads(vi);
			if (!prog)
				virtnet_restore_guest_offloads(vi);
		}
		if (old_prog)
			bpf_prog_put(old_prog);
		if (netif_running(dev))
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
	}

	return 0;

err:
	if (netif_running(dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++)
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
	}
	if (prog)
		bpf_prog_sub(prog, vi->max_queue_pairs - 1);
	return err;
}

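/* Report the id of the attached XDP program; all RX queues share the same
 * program, so the first non-NULL pointer wins.
 */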
static u32 virtnet_xdp_query(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	const struct bpf_prog *xdp_prog;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		xdp_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		if (xdp_prog)
			return xdp_prog->aux->id;
	}
	return 0;
}

static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
	case XDP_QUERY_PROG:
		xdp->prog_id = virtnet_xdp_query(dev);
		return 0;
	default:
		return -EINVAL;
	}
}

static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
				      size_t len)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int ret;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
		return -EOPNOTSUPP;

	ret = snprintf(buf, len, "sby");
	if (ret >= len)
		return -EOPNOTSUPP;

	return 0;
}

static const struct net_device_ops virtnet_netdev = {
	.ndo_open            = virtnet_open,
	.ndo_stop            = virtnet_close,
	.ndo_start_xmit      = start_xmit,
	.ndo_validate_addr   = eth_validate_addr,
	.ndo_set_mac_address = virtnet_set_mac_address,
	.ndo_set_rx_mode     = virtnet_set_rx_mode,
	.ndo_get_stats64     = virtnet_stats,
	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
	.ndo_bpf		= virtnet_xdp,
	.ndo_xdp_xmit		= virtnet_xdp_xmit,
	.ndo_features_check	= passthru_features_check,
	.ndo_get_phys_port_name	= virtnet_get_phys_port_name,
};

static void virtnet_config_changed_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, config_work);
	u16 v;

	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
				 struct virtio_net_config, status, &v) < 0)
		return;

	if (v & VIRTIO_NET_S_ANNOUNCE) {
		netdev_notify_peers(vi->dev);
		virtnet_ack_link_announce(vi);
	}

	/* Ignore unknown (future) status bits */
	v &= VIRTIO_NET_S_LINK_UP;

	if (vi->status == v)
		return;

	vi->status = v;

	if (vi->status & VIRTIO_NET_S_LINK_UP) {
		virtnet_update_settings(vi);
		netif_carrier_on(vi->dev);
		netif_tx_wake_all_queues(vi->dev);
	} else {
		netif_carrier_off(vi->dev);
		netif_tx_stop_all_queues(vi->dev);
	}
}

static void virtnet_config_changed(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	schedule_work(&vi->config_work);
}

static void virtnet_free_queues(struct virtnet_info *vi)
{
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		napi_hash_del(&vi->rq[i].napi);
		netif_napi_del(&vi->rq[i].napi);
		netif_napi_del(&vi->sq[i].napi);
	}

	/* We called napi_hash_del() before netif_napi_del(),
	 * we need to respect an RCU grace period before freeing vi->rq
	 */
	synchronize_net();

	kfree(vi->rq);
	kfree(vi->sq);
	kfree(vi->ctrl);
}

static void _free_receive_bufs(struct virtnet_info *vi)
{
	struct bpf_prog *old_prog;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		while (vi->rq[i].pages)
			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);

		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
		if (old_prog)
			bpf_prog_put(old_prog);
	}
}

static void free_receive_bufs(struct virtnet_info *vi)
{
	rtnl_lock();
	_free_receive_bufs(vi);
	rtnl_unlock();
}

static void free_receive_page_frags(struct virtnet_info *vi)
{
	int i;
	for (i = 0; i < vi->max_queue_pairs; i++)
		if (vi->rq[i].alloc_frag.page)
			put_page(vi->rq[i].alloc_frag.page);
}

static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
{
{
	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
		return false;
	else if (q < vi->curr_queue_pairs)
		return true;
	else
		return false;
}

static void free_unused_bufs(struct virtnet_info *vi)
{
	void *buf;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->sq[i].vq;
		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (!is_xdp_raw_buffer_queue(vi, i))
				dev_kfree_skb(buf);
			else
				put_page(virt_to_head_page(buf));
		}
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->rq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (vi->mergeable_rx_bufs) {
				put_page(virt_to_head_page(buf));
			} else if (vi->big_packets) {
				give_pages(&vi->rq[i], buf);
			} else {
				put_page(virt_to_head_page(buf));
			}
		}
	}
}

static void virtnet_del_vqs(struct virtnet_info *vi)
{
	struct virtio_device *vdev = vi->vdev;

	virtnet_clean_affinity(vi, -1);

	vdev->config->del_vqs(vdev);

	virtnet_free_queues(vi);
}

/* How large should a single buffer be so a queue full of these can fit at
 * least one full packet?
 * Logic below assumes the mergeable buffer header is used.
 */
static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
{
	const unsigned int hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int rq_size = virtqueue_get_vring_size(vq);
	unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu;
	unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len;
	unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size);

	return max(max(min_buf_len, hdr_len) - hdr_len,
		   (unsigned int)GOOD_PACKET_LEN);
}

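/* Discover the virtqueues: one RX/TX pair per queue pair plus an optional
 * control vq at the end.  The temporary callback/name/ctx arrays are freed
 * on both the success and error paths below.
 */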
static int virtnet_find_vqs(struct virtnet_info *vi)
{
	vq_callback_t **callbacks;
	struct virtqueue **vqs;
	int ret = -ENOMEM;
	int i, total_vqs;
	const char **names;
	bool *ctx;

	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
	 * possible control vq.
	 */
	total_vqs = vi->max_queue_pairs * 2 +
		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);

	/* Allocate space for find_vqs parameters */
	vqs = kcalloc(total_vqs, sizeof(*vqs), GFP_KERNEL);
	if (!vqs)
		goto err_vq;
	callbacks = kmalloc_array(total_vqs, sizeof(*callbacks), GFP_KERNEL);
	if (!callbacks)
		goto err_callback;
	names = kmalloc_array(total_vqs, sizeof(*names), GFP_KERNEL);
	if (!names)
		goto err_names;
	if (!vi->big_packets || vi->mergeable_rx_bufs) {
		ctx = kcalloc(total_vqs, sizeof(*ctx), GFP_KERNEL);
		if (!ctx)
			goto err_ctx;
	} else {
		ctx = NULL;
	}

	/* Parameters for control virtqueue, if any */
	if (vi->has_cvq) {
		callbacks[total_vqs - 1] = NULL;
		names[total_vqs - 1] = "control";
	}

	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
		if (ctx)
			ctx[rxq2vq(i)] = true;
	}

	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
					 names, ctx, NULL);
	if (ret)
		goto err_find;

	if (vi->has_cvq) {
		vi->cvq = vqs[total_vqs - 1];
		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].vq = vqs[rxq2vq(i)];
		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
		vi->sq[i].vq = vqs[txq2vq(i)];
	}

	/* run here: ret == 0. */

err_find:
	kfree(ctx);
err_ctx:
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;
}

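/* Allocate the control buffer and the send/receive queue arrays, then set up
 * NAPI, scatterlists and stats syncpoints for every queue pair.
 */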
static int virtnet_alloc_queues(struct virtnet_info *vi)
{
	int i;

	vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL);
	if (!vi->ctrl)
		goto err_ctrl;
	vi->sq = kcalloc(vi->max_queue_pairs, sizeof(*vi->sq), GFP_KERNEL);
	if (!vi->sq)
		goto err_sq;
	vi->rq = kcalloc(vi->max_queue_pairs, sizeof(*vi->rq), GFP_KERNEL);
	if (!vi->rq)
		goto err_rq;

	INIT_DELAYED_WORK(&vi->refill, refill_work);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].pages = NULL;
		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
			       napi_weight);
		netif_tx_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx,
				  napi_tx ? napi_weight : 0);

		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));

		u64_stats_init(&vi->rq[i].stats.syncp);
		u64_stats_init(&vi->sq[i].stats.syncp);
	}

	return 0;

err_rq:
	kfree(vi->sq);
err_sq:
	kfree(vi->ctrl);
err_ctrl:
	return -ENOMEM;
}

static int init_vqs(struct virtnet_info *vi)
{
	int ret;

	/* Allocate send & receive queues */
	ret = virtnet_alloc_queues(vi);
	if (ret)
		goto err;

	ret = virtnet_find_vqs(vi);
	if (ret)
		goto err_free;

	get_online_cpus();
	virtnet_set_affinity(vi);
	put_online_cpus();

	return 0;

err_free:
	virtnet_free_queues(vi);
err:
	return ret;
}

#ifdef CONFIG_SYSFS
static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
		char *buf)
{
	struct virtnet_info *vi = netdev_priv(queue->dev);
	unsigned int queue_index = get_netdev_rx_queue_index(queue);
	unsigned int headroom = virtnet_get_headroom(vi);
	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
	struct ewma_pkt_len *avg;

	BUG_ON(queue_index >= vi->max_queue_pairs);
	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
	return sprintf(buf, "%u\n",
		       get_mergeable_buf_len(&vi->rq[queue_index], avg,
				       SKB_DATA_ALIGN(headroom + tailroom)));
}

static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
	__ATTR_RO(mergeable_rx_buffer_size);

static struct attribute *virtio_net_mrg_rx_attrs[] = {
	&mergeable_rx_buffer_size_attribute.attr,
	NULL
};

static const struct attribute_group virtio_net_mrg_rx_group = {
	.name = "virtio_net",
	.attrs = virtio_net_mrg_rx_attrs
};
#endif

static bool virtnet_fail_on_feature(struct virtio_device *vdev,
				    unsigned int fbit,
				    const char *fname, const char *dname)
{
	if (!virtio_has_feature(vdev, fbit))
		return false;

	dev_err(&vdev->dev, "device advertises feature %s but not %s",
		fname, dname);

	return true;
}

#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)

static bool virtnet_validate_features(struct virtio_device *vdev)
{
	if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
	    (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
			     "VIRTIO_NET_F_CTRL_VQ"))) {
		return false;
	}

	return true;
}

#define MIN_MTU ETH_MIN_MTU
#define MAX_MTU ETH_MAX_MTU

static int virtnet_validate(struct virtio_device *vdev)
{
	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

	if (!virtnet_validate_features(vdev))
		return -EINVAL;
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		int mtu = virtio_cread16(vdev,
					 offsetof(struct virtio_net_config,
						  mtu));
		if (mtu < MIN_MTU)
			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
	}

	return 0;
}

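/* Probe: negotiate features, size the queue pairs, register the netdev and
 * bring the virtqueues up.  Failures unwind in reverse order through the
 * labels at the end of the function.
 */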
static int virtnet_probe(struct virtio_device *vdev)
{
	int i, err = -ENOMEM;
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;
	int mtu;

	/* Find if host supports multiqueue virtio_net device */
	err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
				   struct virtio_net_config,
				   max_virtqueue_pairs, &max_queue_pairs);

	/* We need at least 2 queue's */
	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;

	/* Allocate ourselves a network device with room for our info */
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
	dev->netdev_ops = &virtnet_netdev;
	dev->features = NETIF_F_HIGHDMA;

	dev->ethtool_ops = &virtnet_ethtool_ops;
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
		/* This opens up the world of extra features. */
		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
		if (csum)
			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
			dev->hw_features |= NETIF_F_TSO
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
		/* Individual feature bits: what can host handle? */
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;

		dev->features |= NETIF_F_GSO_ROBUST;

		if (gso)
			dev->features |= dev->hw_features & NETIF_F_ALL_TSO;
		/* (!csum && gso) case will be fixed by register_netdev() */
	}
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
		dev->features |= NETIF_F_RXCSUM;

	dev->vlan_features = dev->features;

	/* MTU range: 68 - 65535 */
	dev->min_mtu = MIN_MTU;
	dev->max_mtu = MAX_MTU;

	/* Configuration may specify what MAC to use.  Otherwise random. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
		virtio_cread_bytes(vdev,
				   offsetof(struct virtio_net_config, mac),
				   dev->dev_addr, dev->addr_len);
	else
		eth_hw_addr_random(dev);

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
	vdev->priv = vi;

	INIT_WORK(&vi->config_work, virtnet_config_changed_work);

	/* If we can receive ANY GSO packets, we must allocate large ones. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
		vi->big_packets = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		vi->hdr_len = sizeof(struct virtio_net_hdr);

	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->any_header_sg = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		mtu = virtio_cread16(vdev,
				     offsetof(struct virtio_net_config,
					      mtu));
		if (mtu < dev->min_mtu) {
			/* Should never trigger: MTU was previously validated
			 * in virtnet_validate.
			 */
			dev_err(&vdev->dev, "device MTU appears to have changed "
				"it is now %d < %d", mtu, dev->min_mtu);
			goto free;
		}

		dev->mtu = mtu;
		dev->max_mtu = mtu;

		/* TODO: size buffers correctly in this case. */
		if (dev->mtu > ETH_DATA_LEN)
			vi->big_packets = true;
	}

	if (vi->any_header_sg)
		dev->needed_headroom = vi->hdr_len;

	/* Enable multiqueue by default */
	if (num_online_cpus() >= max_queue_pairs)
		vi->curr_queue_pairs = max_queue_pairs;
	else
		vi->curr_queue_pairs = num_online_cpus();
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
	err = init_vqs(vi);
	if (err)
		goto free;

#ifdef CONFIG_SYSFS
	if (vi->mergeable_rx_bufs)
		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
#endif
	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);

	virtnet_init_settings(dev);

	if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
		vi->failover = net_failover_create(vi->dev);
		if (IS_ERR(vi->failover)) {
			err = PTR_ERR(vi->failover);
			goto free_vqs;
		}
	}

	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
		goto free_failover;
	}

	virtio_device_ready(vdev);

	err = virtnet_cpu_notif_add(vi);
	if (err) {
		pr_debug("virtio_net: registering cpu notifier failed\n");
		goto free_unregister_netdev;
	}

	virtnet_set_queues(vi, vi->curr_queue_pairs);

	/* Assume link up if device can't report link status,
	   otherwise get link status from config. */
	netif_carrier_off(dev);
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		schedule_work(&vi->config_work);
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
		virtnet_update_settings(vi);
		netif_carrier_on(dev);
	}

	for (i = 0; i < ARRAY_SIZE(guest_offloads); i++)
		if (virtio_has_feature(vi->vdev, guest_offloads[i]))
			set_bit(guest_offloads[i], &vi->guest_offloads);

	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
		 dev->name, max_queue_pairs);

	return 0;

free_unregister_netdev:
	vi->vdev->config->reset(vdev);

	unregister_netdev(dev);
free_failover:
	net_failover_destroy(vi->failover);
free_vqs:
	cancel_delayed_work_sync(&vi->refill);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
free:
	free_netdev(dev);
	return err;
}

static void remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);

	/* Free unused buffers in both send and recv, if any. */
	free_unused_bufs(vi);

	free_receive_bufs(vi);

	free_receive_page_frags(vi);

	virtnet_del_vqs(vi);
}

static void virtnet_remove(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device. */
	flush_work(&vi->config_work);

	unregister_netdev(vi->dev);

	net_failover_destroy(vi->failover);

	remove_vq_common(vi);

	free_netdev(vi->dev);
}

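/* Power-management hooks, wired up under CONFIG_PM_SLEEP below: freeze tears
 * the virtqueues down, restore rebuilds them and repopulates the RX rings.
 */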
static __maybe_unused int virtnet_freeze(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);
	virtnet_freeze_down(vdev);
	remove_vq_common(vi);

	return 0;
}

static __maybe_unused int virtnet_restore(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err;

	err = virtnet_restore_up(vdev);
	if (err)
		return err;
	virtnet_set_queues(vi, vi->curr_queue_pairs);

	err = virtnet_cpu_notif_add(vi);
	if (err)
		return err;

	return 0;
}

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

#define VIRTNET_FEATURES \
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
	VIRTIO_NET_F_MAC, \
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
	VIRTIO_NET_F_CTRL_MAC_ADDR, \
	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
	VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY

static unsigned int features[] = {
	VIRTNET_FEATURES,
};

static unsigned int features_legacy[] = {
	VIRTNET_FEATURES,
	VIRTIO_NET_F_GSO,
	VIRTIO_F_ANY_LAYOUT,
};

static struct virtio_driver virtio_net_driver = {
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
	.feature_table_legacy = features_legacy,
	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
	.validate =	virtnet_validate,
	.probe =	virtnet_probe,
	.remove =	virtnet_remove,
	.config_changed = virtnet_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze =	virtnet_freeze,
	.restore =	virtnet_restore,
#endif
};

static __init int virtio_net_driver_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
				      virtnet_cpu_online,
				      virtnet_cpu_down_prep);
	if (ret < 0)
		goto out;
	virtionet_online = ret;
	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
				      NULL, virtnet_cpu_dead);
	if (ret)
		goto err_dead;

	ret = register_virtio_driver(&virtio_net_driver);
	if (ret)
		goto err_virtio;
	return 0;
err_virtio:
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
err_dead:
	cpuhp_remove_multi_state(virtionet_online);
out:
	return ret;
}
module_init(virtio_net_driver_init);

static __exit void virtio_net_driver_exit(void)
{
	unregister_virtio_driver(&virtio_net_driver);
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
	cpuhp_remove_multi_state(virtionet_online);
}
module_exit(virtio_net_driver_exit);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");