virtio_net.c 82.8 KB
Newer Older
1
/* A network driver using virtio.
R
Rusty Russell 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14 15
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
16
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
R
Rusty Russell 已提交
17 18 19 20
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
21
#include <linux/ethtool.h>
R
Rusty Russell 已提交
22 23 24
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
J
John Fastabend 已提交
25
#include <linux/bpf.h>
26
#include <linux/bpf_trace.h>
R
Rusty Russell 已提交
27
#include <linux/scatterlist.h>
28
#include <linux/if_vlan.h>
29
#include <linux/slab.h>
30
#include <linux/cpu.h>
31
#include <linux/average.h>
J
Jason Wang 已提交
32
#include <linux/filter.h>
33
#include <linux/kernel.h>
34
#include <linux/pci.h>
35
#include <net/route.h>
36
#include <net/xdp.h>
37
#include <net/net_failover.h>
R
Rusty Russell 已提交
38

39
static int napi_weight = NAPI_POLL_WEIGHT;
40 41
module_param(napi_weight, int, 0444);

W
Willem de Bruijn 已提交
42
static bool csum = true, gso = true, napi_tx;
R
Rusty Russell 已提交
43 44
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);
W
Willem de Bruijn 已提交
45
module_param(napi_tx, bool, 0644);
R
Rusty Russell 已提交
46

R
Rusty Russell 已提交
47
/* FIXME: MTU in config. */
48
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
49
#define GOOD_COPY_LEN	128
R
Rusty Russell 已提交
50

51 52
#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

53 54 55
/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
#define VIRTIO_XDP_HEADROOM 256

56 57 58 59
/* Separating two types of XDP xmit */
#define VIRTIO_XDP_TX		BIT(0)
#define VIRTIO_XDP_REDIR	BIT(1)

J
Johannes Berg 已提交
60 61 62 63
/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
64
 */
65
DECLARE_EWMA(pkt_len, 0, 64)
66

67
#define VIRTNET_DRIVER_VERSION "1.0.0"
68

69 70 71 72
static const unsigned long guest_offloads[] = {
	VIRTIO_NET_F_GUEST_TSO4,
	VIRTIO_NET_F_GUEST_TSO6,
	VIRTIO_NET_F_GUEST_ECN,
73 74
	VIRTIO_NET_F_GUEST_UFO,
	VIRTIO_NET_F_GUEST_CSUM
75
};
76

T
Toshiaki Makita 已提交
77 78 79
struct virtnet_stat_desc {
	char desc[ETH_GSTRING_LEN];
	size_t offset;
80 81
};

T
Toshiaki Makita 已提交
82 83 84 85
struct virtnet_sq_stats {
	struct u64_stats_sync syncp;
	u64 packets;
	u64 bytes;
86 87
	u64 xdp_tx;
	u64 xdp_tx_drops;
T
Toshiaki Makita 已提交
88
	u64 kicks;
T
Toshiaki Makita 已提交
89 90
};

91 92
struct virtnet_rq_stats {
	struct u64_stats_sync syncp;
T
Toshiaki Makita 已提交
93 94
	u64 packets;
	u64 bytes;
95
	u64 drops;
96 97 98 99
	u64 xdp_packets;
	u64 xdp_tx;
	u64 xdp_redirects;
	u64 xdp_drops;
T
Toshiaki Makita 已提交
100
	u64 kicks;
T
Toshiaki Makita 已提交
101 102 103
};

#define VIRTNET_SQ_STAT(m)	offsetof(struct virtnet_sq_stats, m)
104
#define VIRTNET_RQ_STAT(m)	offsetof(struct virtnet_rq_stats, m)
T
Toshiaki Makita 已提交
105 106

static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
107 108 109 110
	{ "packets",		VIRTNET_SQ_STAT(packets) },
	{ "bytes",		VIRTNET_SQ_STAT(bytes) },
	{ "xdp_tx",		VIRTNET_SQ_STAT(xdp_tx) },
	{ "xdp_tx_drops",	VIRTNET_SQ_STAT(xdp_tx_drops) },
T
Toshiaki Makita 已提交
111
	{ "kicks",		VIRTNET_SQ_STAT(kicks) },
T
Toshiaki Makita 已提交
112 113 114
};

static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
115 116 117 118 119 120 121
	{ "packets",		VIRTNET_RQ_STAT(packets) },
	{ "bytes",		VIRTNET_RQ_STAT(bytes) },
	{ "drops",		VIRTNET_RQ_STAT(drops) },
	{ "xdp_packets",	VIRTNET_RQ_STAT(xdp_packets) },
	{ "xdp_tx",		VIRTNET_RQ_STAT(xdp_tx) },
	{ "xdp_redirects",	VIRTNET_RQ_STAT(xdp_redirects) },
	{ "xdp_drops",		VIRTNET_RQ_STAT(xdp_drops) },
T
Toshiaki Makita 已提交
122
	{ "kicks",		VIRTNET_RQ_STAT(kicks) },
T
Toshiaki Makita 已提交
123 124 125 126 127
};

#define VIRTNET_SQ_STATS_LEN	ARRAY_SIZE(virtnet_sq_stats_desc)
#define VIRTNET_RQ_STATS_LEN	ARRAY_SIZE(virtnet_rq_stats_desc)

128 129 130 131 132 133 134
/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send _queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];
J
Jason Wang 已提交
135 136 137

	/* Name of the send queue: output.$index */
	char name[40];
W
Willem de Bruijn 已提交
138

T
Toshiaki Makita 已提交
139 140
	struct virtnet_sq_stats stats;

W
Willem de Bruijn 已提交
141
	struct napi_struct napi;
142 143 144 145 146 147 148
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

R
Rusty Russell 已提交
149 150
	struct napi_struct napi;

J
John Fastabend 已提交
151 152
	struct bpf_prog __rcu *xdp_prog;

T
Toshiaki Makita 已提交
153 154
	struct virtnet_rq_stats stats;

155 156 157
	/* Chain pages by the private ptr. */
	struct page *pages;

158
	/* Average packet length for mergeable receive buffers. */
J
Johannes Berg 已提交
159
	struct ewma_pkt_len mrg_avg_pkt_len;
160

161 162 163
	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

164 165
	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];
J
Jason Wang 已提交
166

167 168 169
	/* Min single buffer size for mergeable buffers case. */
	unsigned int min_buf_len;

J
Jason Wang 已提交
170 171
	/* Name of this receive queue: input.$index */
	char name[40];
172 173

	struct xdp_rxq_info xdp_rxq;
174 175
};

176 177 178 179 180 181 182
/* Control VQ buffers: protected by the rtnl lock */
struct control_buf {
	struct virtio_net_ctrl_hdr hdr;
	virtio_net_ctrl_ack status;
	struct virtio_net_ctrl_mq mq;
	u8 promisc;
	u8 allmulti;
183
	__virtio16 vid;
184
	__virtio64 offloads;
185 186
};

187 188 189 190
struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
J
Jason Wang 已提交
191 192
	struct send_queue *sq;
	struct receive_queue *rq;
193 194
	unsigned int status;

J
Jason Wang 已提交
195 196 197 198 199 200
	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

201 202 203
	/* # of XDP queue pairs currently used by the driver */
	u16 xdp_queue_pairs;

204 205 206
	/* I like... big packets and I cannot lie! */
	bool big_packets;

207 208 209
	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

J
Jason Wang 已提交
210 211 212
	/* Has control virtqueue */
	bool has_cvq;

213 214 215
	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

216 217 218
	/* Packet virtio header size */
	u8 hdr_len;

219 220 221
	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

222 223 224
	/* Work struct for config space updates */
	struct work_struct config_work;

J
Jason Wang 已提交
225 226
	/* Does the affinity hint is set for virtqueues? */
	bool affinity_hint_set;
227

228 229 230
	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;
231

232
	struct control_buf *ctrl;
233 234 235 236

	/* Ethtool settings */
	u8 duplex;
	u32 speed;
237 238

	unsigned long guest_offloads;
239 240 241

	/* failover when STANDBY feature enabled */
	struct failover *failover;
R
Rusty Russell 已提交
242 243
};

244
struct padded_vnet_hdr {
245
	struct virtio_net_hdr_mrg_rxbuf hdr;
246
	/*
247 248 249
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header.
250
	 */
251
	char padding[4];
252 253
};

J
Jason Wang 已提交
254 255 256 257 258
/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
259
	return (vq->index - 1) / 2;
J
Jason Wang 已提交
260 261 262 263 264 265 266 267 268
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
269
	return vq->index / 2;
J
Jason Wang 已提交
270 271 272 273 274 275 276
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}

277
static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
R
Rusty Russell 已提交
278
{
279
	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
R
Rusty Russell 已提交
280 281
}

282 283 284 285
/*
 * private is used to chain pages for big packets, put the whole
 * most recent used list in the beginning for reuse
 */
286
static void give_pages(struct receive_queue *rq, struct page *page)
287
{
288
	struct page *end;
289

290
	/* Find end of list, sew whole thing into vi->rq.pages. */
291
	for (end = page; end->private; end = (struct page *)end->private);
292 293
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
294 295
}

296
static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
297
{
298
	struct page *p = rq->pages;
299

300
	if (p) {
301
		rq->pages = (struct page *)p->private;
302 303 304
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
305 306 307 308
		p = alloc_page(gfp_mask);
	return p;
}

309 310 311 312 313 314 315 316 317 318 319 320 321 322 323
static void virtqueue_napi_schedule(struct napi_struct *napi,
				    struct virtqueue *vq)
{
	if (napi_schedule_prep(napi)) {
		virtqueue_disable_cb(vq);
		__napi_schedule(napi);
	}
}

static void virtqueue_napi_complete(struct napi_struct *napi,
				    struct virtqueue *vq, int processed)
{
	int opaque;

	opaque = virtqueue_enable_cb_prepare(vq);
324 325 326 327 328 329
	if (napi_complete_done(napi, processed)) {
		if (unlikely(virtqueue_poll(vq, opaque)))
			virtqueue_napi_schedule(napi, vq);
	} else {
		virtqueue_disable_cb(vq);
	}
330 331
}

332
static void skb_xmit_done(struct virtqueue *vq)
R
Rusty Russell 已提交
333
{
334
	struct virtnet_info *vi = vq->vdev->priv;
W
Willem de Bruijn 已提交
335
	struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;
R
Rusty Russell 已提交
336

337
	/* Suppress further interrupts. */
338
	virtqueue_disable_cb(vq);
339

W
Willem de Bruijn 已提交
340 341 342 343 344
	if (napi->weight)
		virtqueue_napi_schedule(napi, vq);
	else
		/* We were probably waiting for more output buffers. */
		netif_wake_subqueue(vi->dev, vq2txq(vq));
R
Rusty Russell 已提交
345 346
}

347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363
#define MRG_CTX_HEADER_SHIFT 22
static void *mergeable_len_to_ctx(unsigned int truesize,
				  unsigned int headroom)
{
	return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize);
}

static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx)
{
	return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT;
}

static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
{
	return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1);
}

364
/* Called from bottom half context */
M
Michael S. Tsirkin 已提交
365 366
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
				   struct receive_queue *rq,
367 368
				   struct page *page, unsigned int offset,
				   unsigned int len, unsigned int truesize)
369 370
{
	struct sk_buff *skb;
371
	struct virtio_net_hdr_mrg_rxbuf *hdr;
372
	unsigned int copy, hdr_len, hdr_padded_len;
373
	char *p;
374

375
	p = page_address(page) + offset;
376

377
	/* copy small packet so we can reuse these pages for small data */
378
	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
379 380
	if (unlikely(!skb))
		return NULL;
381

382
	hdr = skb_vnet_hdr(skb);
383

384 385
	hdr_len = vi->hdr_len;
	if (vi->mergeable_rx_bufs)
386
		hdr_padded_len = sizeof(*hdr);
387
	else
388
		hdr_padded_len = sizeof(struct padded_vnet_hdr);
389

390
	memcpy(hdr, p, hdr_len);
391

392
	len -= hdr_len;
393 394
	offset += hdr_padded_len;
	p += hdr_padded_len;
395

396 397 398
	copy = len;
	if (copy > skb_tailroom(skb))
		copy = skb_tailroom(skb);
399
	skb_put_data(skb, p, copy);
400

401 402
	len -= copy;
	offset += copy;
403

404 405 406 407 408 409 410 411
	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
			put_page(page);
		return skb;
	}

412 413 414 415 416 417 418
	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
419
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
420 421 422
		dev_kfree_skb(skb);
		return NULL;
	}
423
	BUG_ON(offset >= PAGE_SIZE);
424
	while (len) {
425 426 427 428
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
429 430 431
		page = (struct page *)page->private;
		offset = 0;
	}
432

433
	if (page)
434
		give_pages(rq, page);
435

436 437
	return skb;
}
438

439 440 441
static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
				   struct send_queue *sq,
				   struct xdp_frame *xdpf)
J
John Fastabend 已提交
442 443 444 445
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	int err;

446 447 448
	/* virtqueue want to use data area in-front of packet */
	if (unlikely(xdpf->metasize > 0))
		return -EOPNOTSUPP;
J
John Fastabend 已提交
449

450 451 452 453 454
	if (unlikely(xdpf->headroom < vi->hdr_len))
		return -EOVERFLOW;

	/* Make room for virtqueue hdr (also change xdpf->headroom?) */
	xdpf->data -= vi->hdr_len;
455
	/* Zero header and leave csum up to XDP layers */
456
	hdr = xdpf->data;
457
	memset(hdr, 0, vi->hdr_len);
458
	xdpf->len   += vi->hdr_len;
459

460
	sg_init_one(sq->sg, xdpf->data, xdpf->len);
461

462
	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdpf, GFP_ATOMIC);
463
	if (unlikely(err))
464
		return -ENOSPC; /* Caller handle free/refcnt */
J
John Fastabend 已提交
465

466
	return 0;
J
John Fastabend 已提交
467 468
}

469 470 471 472 473 474 475 476
static struct send_queue *virtnet_xdp_sq(struct virtnet_info *vi)
{
	unsigned int qp;

	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
	return &vi->sq[qp];
}

477
static int virtnet_xdp_xmit(struct net_device *dev,
478
			    int n, struct xdp_frame **frames, u32 flags)
J
Jason Wang 已提交
479 480
{
	struct virtnet_info *vi = netdev_priv(dev);
481
	struct receive_queue *rq = vi->rq;
482
	struct xdp_frame *xdpf_sent;
483
	struct bpf_prog *xdp_prog;
484 485 486
	struct send_queue *sq;
	unsigned int len;
	int drops = 0;
T
Toshiaki Makita 已提交
487
	int kicks = 0;
488
	int ret, err;
489 490
	int i;

491
	sq = virtnet_xdp_sq(vi);
J
Jason Wang 已提交
492

493 494 495 496 497 498
	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
		ret = -EINVAL;
		drops = n;
		goto out;
	}

499 500 501 502
	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
	 * indicate XDP resources have been successfully allocated.
	 */
	xdp_prog = rcu_dereference(rq->xdp_prog);
503 504 505 506 507
	if (!xdp_prog) {
		ret = -ENXIO;
		drops = n;
		goto out;
	}
508

509 510 511 512 513 514 515 516 517 518 519 520 521
	/* Free up any pending old buffers before queueing new ones. */
	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
		xdp_return_frame(xdpf_sent);

	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];

		err = __virtnet_xdp_xmit_one(vi, sq, xdpf);
		if (err) {
			xdp_return_frame_rx_napi(xdpf);
			drops++;
		}
	}
522
	ret = n - drops;
523

T
Toshiaki Makita 已提交
524 525 526 527
	if (flags & XDP_XMIT_FLUSH) {
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
			kicks = 1;
	}
528 529 530 531
out:
	u64_stats_update_begin(&sq->stats.syncp);
	sq->stats.xdp_tx += n;
	sq->stats.xdp_tx_drops += drops;
T
Toshiaki Makita 已提交
532
	sq->stats.kicks += kicks;
533
	u64_stats_update_end(&sq->stats.syncp);
534

535
	return ret;
J
Jason Wang 已提交
536 537
}

538 539 540 541 542
static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
{
	return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
}

543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572
/* We copy the packet for XDP in the following cases:
 *
 * 1) Packet is scattered across multiple rx buffers.
 * 2) Headroom space is insufficient.
 *
 * This is inefficient but it's a temporary condition that
 * we hit right after XDP is enabled and until queue is refilled
 * with large buffers with sufficient headroom - so it should affect
 * at most queue size packets.
 * Afterwards, the conditions to enable
 * XDP should preclude the underlying device from sending packets
 * across multiple buffers (num_buf > 1), and we make sure buffers
 * have enough headroom.
 */
static struct page *xdp_linearize_page(struct receive_queue *rq,
				       u16 *num_buf,
				       struct page *p,
				       int offset,
				       int page_off,
				       unsigned int *len)
{
	struct page *page = alloc_page(GFP_ATOMIC);

	if (!page)
		return NULL;

	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
	page_off += *len;

	while (--*num_buf) {
573
		int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
574 575 576 577 578 579 580 581 582 583 584 585 586 587
		unsigned int buflen;
		void *buf;
		int off;

		buf = virtqueue_get_buf(rq->vq, &buflen);
		if (unlikely(!buf))
			goto err_buf;

		p = virt_to_head_page(buf);
		off = buf - page_address(p);

		/* guard against a misconfigured or uncooperative backend that
		 * is sending packet larger than the MTU.
		 */
588
		if ((page_off + buflen + tailroom) > PAGE_SIZE) {
589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606
			put_page(p);
			goto err_buf;
		}

		memcpy(page_address(page) + page_off,
		       page_address(p) + off, buflen);
		page_off += buflen;
		put_page(p);
	}

	/* Headroom does not contribute to packet length */
	*len = page_off - VIRTIO_XDP_HEADROOM;
	return page;
err_buf:
	__free_pages(page, 0);
	return NULL;
}

607 608 609
static struct sk_buff *receive_small(struct net_device *dev,
				     struct virtnet_info *vi,
				     struct receive_queue *rq,
610
				     void *buf, void *ctx,
J
Jason Wang 已提交
611
				     unsigned int len,
612
				     unsigned int *xdp_xmit,
613
				     struct virtnet_rq_stats *stats)
614
{
615
	struct sk_buff *skb;
616
	struct bpf_prog *xdp_prog;
617
	unsigned int xdp_headroom = (unsigned long)ctx;
618 619 620 621
	unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
	unsigned int headroom = vi->hdr_len + header_offset;
	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
622
	struct page *page = virt_to_head_page(buf);
623
	unsigned int delta = 0;
624
	struct page *xdp_page;
625 626
	int err;

627
	len -= vi->hdr_len;
628
	stats->bytes += len;
629

630 631 632
	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
633
		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
634
		struct xdp_frame *xdpf;
635
		struct xdp_buff xdp;
636
		void *orig_data;
637 638
		u32 act;

639
		if (unlikely(hdr->hdr.gso_type))
640
			goto err_xdp;
641

642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662
		if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
			int offset = buf - page_address(page) + header_offset;
			unsigned int tlen = len + vi->hdr_len;
			u16 num_buf = 1;

			xdp_headroom = virtnet_get_headroom(vi);
			header_offset = VIRTNET_RX_PAD + xdp_headroom;
			headroom = vi->hdr_len + header_offset;
			buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
				 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
			xdp_page = xdp_linearize_page(rq, &num_buf, page,
						      offset, header_offset,
						      &tlen);
			if (!xdp_page)
				goto err_xdp;

			buf = page_address(xdp_page);
			put_page(page);
			page = xdp_page;
		}

663 664
		xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
		xdp.data = xdp.data_hard_start + xdp_headroom;
665
		xdp_set_data_meta_invalid(&xdp);
666
		xdp.data_end = xdp.data + len;
667
		xdp.rxq = &rq->xdp_rxq;
668
		orig_data = xdp.data;
669
		act = bpf_prog_run_xdp(xdp_prog, &xdp);
670
		stats->xdp_packets++;
671

672 673
		switch (act) {
		case XDP_PASS:
674
			/* Recalculate length in case bpf program changed it */
675
			delta = orig_data - xdp.data;
676
			len = xdp.data_end - xdp.data;
677 678
			break;
		case XDP_TX:
679
			stats->xdp_tx++;
680 681 682
			xdpf = convert_to_xdp_frame(&xdp);
			if (unlikely(!xdpf))
				goto err_xdp;
683 684
			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
			if (unlikely(err < 0)) {
685
				trace_xdp_exception(vi->dev, xdp_prog, act);
686 687
				goto err_xdp;
			}
688
			*xdp_xmit |= VIRTIO_XDP_TX;
J
Jason Wang 已提交
689 690 691
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
692
			stats->xdp_redirects++;
J
Jason Wang 已提交
693
			err = xdp_do_redirect(dev, &xdp, xdp_prog);
694 695
			if (err)
				goto err_xdp;
696
			*xdp_xmit |= VIRTIO_XDP_REDIR;
697 698 699
			rcu_read_unlock();
			goto xdp_xmit;
		default:
700
			bpf_warn_invalid_xdp_action(act);
701
			/* fall through */
702 703 704
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
		case XDP_DROP:
705 706 707 708 709
			goto err_xdp;
		}
	}
	rcu_read_unlock();

710 711
	skb = build_skb(buf, buflen);
	if (!skb) {
712
		put_page(page);
713 714 715
		goto err;
	}
	skb_reserve(skb, headroom - delta);
716
	skb_put(skb, len);
717 718 719 720 721 722
	if (!delta) {
		buf += header_offset;
		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
	} /* keep zeroed vnet hdr since packet was changed by bpf */

err:
723
	return skb;
724 725 726

err_xdp:
	rcu_read_unlock();
727 728
	stats->xdp_drops++;
	stats->drops++;
729
	put_page(page);
730 731
xdp_xmit:
	return NULL;
732 733 734
}

static struct sk_buff *receive_big(struct net_device *dev,
M
Michael S. Tsirkin 已提交
735
				   struct virtnet_info *vi,
736 737
				   struct receive_queue *rq,
				   void *buf,
738
				   unsigned int len,
739
				   struct virtnet_rq_stats *stats)
740 741
{
	struct page *page = buf;
742
	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
J
John Fastabend 已提交
743

744
	stats->bytes += len - vi->hdr_len;
745 746 747 748 749 750
	if (unlikely(!skb))
		goto err;

	return skb;

err:
751
	stats->drops++;
752 753 754 755
	give_pages(rq, page);
	return NULL;
}

756
static struct sk_buff *receive_mergeable(struct net_device *dev,
M
Michael S. Tsirkin 已提交
757
					 struct virtnet_info *vi,
758
					 struct receive_queue *rq,
759 760
					 void *buf,
					 void *ctx,
J
Jason Wang 已提交
761
					 unsigned int len,
762
					 unsigned int *xdp_xmit,
763
					 struct virtnet_rq_stats *stats)
764
{
765 766
	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
767 768
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
J
John Fastabend 已提交
769 770 771
	struct sk_buff *head_skb, *curr_skb;
	struct bpf_prog *xdp_prog;
	unsigned int truesize;
772
	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
773
	int err;
J
John Fastabend 已提交
774

J
John Fastabend 已提交
775
	head_skb = NULL;
776
	stats->bytes += len - vi->hdr_len;
J
John Fastabend 已提交
777

J
John Fastabend 已提交
778 779 780
	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
781
		struct xdp_frame *xdpf;
782
		struct page *xdp_page;
783 784
		struct xdp_buff xdp;
		void *data;
J
John Fastabend 已提交
785 786
		u32 act;

787 788 789 790 791 792 793
		/* Transient failure which in theory could occur if
		 * in-flight packets from before XDP was enabled reach
		 * the receive path after XDP is loaded.
		 */
		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;

794 795 796 797 798 799
		/* This happens when rx buffer size is underestimated
		 * or headroom is not enough because of the buffer
		 * was refilled before XDP is set. This should only
		 * happen for the first several packets, so we don't
		 * care much about its performance.
		 */
800 801
		if (unlikely(num_buf > 1 ||
			     headroom < virtnet_get_headroom(vi))) {
802
			/* linearize data for XDP */
803
			xdp_page = xdp_linearize_page(rq, &num_buf,
804 805 806
						      page, offset,
						      VIRTIO_XDP_HEADROOM,
						      &len);
807 808
			if (!xdp_page)
				goto err_xdp;
809
			offset = VIRTIO_XDP_HEADROOM;
810 811
		} else {
			xdp_page = page;
J
John Fastabend 已提交
812 813
		}

814 815 816
		/* Allow consuming headroom but reserve enough space to push
		 * the descriptor on if we get an XDP_TX return code.
		 */
817
		data = page_address(xdp_page) + offset;
818
		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
819
		xdp.data = data + vi->hdr_len;
820
		xdp_set_data_meta_invalid(&xdp);
821
		xdp.data_end = xdp.data + (len - vi->hdr_len);
822 823
		xdp.rxq = &rq->xdp_rxq;

824
		act = bpf_prog_run_xdp(xdp_prog, &xdp);
825
		stats->xdp_packets++;
826

J
John Fastabend 已提交
827 828
		switch (act) {
		case XDP_PASS:
829 830 831 832 833 834 835
			/* recalculate offset to account for any header
			 * adjustments. Note other cases do not build an
			 * skb and avoid using offset
			 */
			offset = xdp.data -
					page_address(xdp_page) - vi->hdr_len;

836 837 838
			/* recalculate len if xdp.data or xdp.data_end were
			 * adjusted
			 */
839
			len = xdp.data_end - xdp.data + vi->hdr_len;
840 841 842 843 844
			/* We can only create skb based on xdp_page. */
			if (unlikely(xdp_page != page)) {
				rcu_read_unlock();
				put_page(page);
				head_skb = page_to_skb(vi, rq, xdp_page,
845
						       offset, len, PAGE_SIZE);
846 847
				return head_skb;
			}
J
John Fastabend 已提交
848 849
			break;
		case XDP_TX:
850
			stats->xdp_tx++;
851 852 853
			xdpf = convert_to_xdp_frame(&xdp);
			if (unlikely(!xdpf))
				goto err_xdp;
854 855
			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
			if (unlikely(err < 0)) {
856
				trace_xdp_exception(vi->dev, xdp_prog, act);
857 858 859 860
				if (unlikely(xdp_page != page))
					put_page(xdp_page);
				goto err_xdp;
			}
861
			*xdp_xmit |= VIRTIO_XDP_TX;
862
			if (unlikely(xdp_page != page))
863
				put_page(page);
J
John Fastabend 已提交
864 865
			rcu_read_unlock();
			goto xdp_xmit;
866
		case XDP_REDIRECT:
867
			stats->xdp_redirects++;
868 869 870 871 872 873
			err = xdp_do_redirect(dev, &xdp, xdp_prog);
			if (err) {
				if (unlikely(xdp_page != page))
					put_page(xdp_page);
				goto err_xdp;
			}
874
			*xdp_xmit |= VIRTIO_XDP_REDIR;
875
			if (unlikely(xdp_page != page))
876
				put_page(page);
877 878
			rcu_read_unlock();
			goto xdp_xmit;
J
John Fastabend 已提交
879
		default:
880
			bpf_warn_invalid_xdp_action(act);
881
			/* fall through */
882 883
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
884
			/* fall through */
885
		case XDP_DROP:
886 887
			if (unlikely(xdp_page != page))
				__free_pages(xdp_page, 0);
J
John Fastabend 已提交
888
			goto err_xdp;
J
John Fastabend 已提交
889
		}
J
John Fastabend 已提交
890 891
	}
	rcu_read_unlock();
892

893 894
	truesize = mergeable_ctx_to_truesize(ctx);
	if (unlikely(len > truesize)) {
895
		pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
896 897 898 899
			 dev->name, len, (unsigned long)ctx);
		dev->stats.rx_length_errors++;
		goto err_skb;
	}
900

J
John Fastabend 已提交
901 902
	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
	curr_skb = head_skb;
903

904 905
	if (unlikely(!curr_skb))
		goto err_skb;
906
	while (--num_buf) {
907 908
		int num_skb_frags;

909
		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
910
		if (unlikely(!buf)) {
911
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
M
Michael S. Tsirkin 已提交
912
				 dev->name, num_buf,
913 914
				 virtio16_to_cpu(vi->vdev,
						 hdr->num_buffers));
915 916
			dev->stats.rx_length_errors++;
			goto err_buf;
917
		}
918

919
		stats->bytes += len;
920
		page = virt_to_head_page(buf);
921 922 923

		truesize = mergeable_ctx_to_truesize(ctx);
		if (unlikely(len > truesize)) {
924
			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
925 926 927 928
				 dev->name, len, (unsigned long)ctx);
			dev->stats.rx_length_errors++;
			goto err_skb;
		}
929 930

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
931 932
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
933 934 935

			if (unlikely(!nskb))
				goto err_skb;
936 937 938 939 940 941 942 943 944 945 946
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
947
			head_skb->truesize += truesize;
948
		}
949
		offset = buf - page_address(page);
950 951 952
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
953
					     len, truesize);
954 955
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
956
					offset, len, truesize);
957
		}
958 959
	}

J
Johannes Berg 已提交
960
	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
961 962
	return head_skb;

J
John Fastabend 已提交
963 964
err_xdp:
	rcu_read_unlock();
965
	stats->xdp_drops++;
966 967
err_skb:
	put_page(page);
968
	while (num_buf-- > 1) {
969 970
		buf = virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!buf)) {
971 972 973 974 975
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
976
		stats->bytes += len;
977
		page = virt_to_head_page(buf);
978
		put_page(page);
979
	}
980
err_buf:
981
	stats->drops++;
982
	dev_kfree_skb(head_skb);
J
John Fastabend 已提交
983
xdp_xmit:
984
	return NULL;
985 986
}

987 988
static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
			void *buf, unsigned int len, void **ctx,
989
			unsigned int *xdp_xmit,
990
			struct virtnet_rq_stats *stats)
991
{
992
	struct net_device *dev = vi->dev;
993
	struct sk_buff *skb;
994
	struct virtio_net_hdr_mrg_rxbuf *hdr;
995

996
	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
997 998
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
999
		if (vi->mergeable_rx_bufs) {
1000
			put_page(virt_to_head_page(buf));
1001
		} else if (vi->big_packets) {
1002
			give_pages(rq, buf);
1003
		} else {
1004
			put_page(virt_to_head_page(buf));
1005
		}
1006
		return;
1007
	}
1008

1009
	if (vi->mergeable_rx_bufs)
1010
		skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit,
1011
					stats);
1012
	else if (vi->big_packets)
1013
		skb = receive_big(dev, vi, rq, buf, len, stats);
1014
	else
1015
		skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats);
1016 1017

	if (unlikely(!skb))
1018
		return;
1019

1020
	hdr = skb_vnet_hdr(skb);
1021

1022
	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
1023
		skb->ip_summed = CHECKSUM_UNNECESSARY;
R
Rusty Russell 已提交
1024

1025 1026 1027 1028 1029 1030
	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
				  virtio_is_little_endian(vi->vdev))) {
		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
				     dev->name, hdr->hdr.gso_type,
				     hdr->hdr.gso_size);
		goto frame_err;
R
Rusty Russell 已提交
1031 1032
	}

1033 1034 1035 1036
	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

E
Eric Dumazet 已提交
1037
	napi_gro_receive(&rq->napi, skb);
1038
	return;
R
Rusty Russell 已提交
1039 1040 1041 1042 1043 1044

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
}

1045 1046 1047 1048 1049
/* Unlike mergeable buffers, all buffers are allocated to the
 * same size, except for the headroom. For this reason we do
 * not need to use  mergeable_len_to_ctx here - it is enough
 * to store the headroom as the context ignoring the truesize.
 */
M
Michael S. Tsirkin 已提交
1050 1051
static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
			     gfp_t gfp)
R
Rusty Russell 已提交
1052
{
1053 1054
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
1055
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
1056
	void *ctx = (void *)(unsigned long)xdp_headroom;
1057
	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
1058
	int err;
1059

1060 1061 1062
	len = SKB_DATA_ALIGN(len) +
	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
1063
		return -ENOMEM;
R
Rusty Russell 已提交
1064

1065 1066 1067 1068 1069
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
		    vi->hdr_len + GOOD_PACKET_LEN);
1070
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
1071
	if (err < 0)
1072
		put_page(virt_to_head_page(buf));
1073 1074
	return err;
}
1075

1076 1077
static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
			   gfp_t gfp)
1078 1079 1080 1081 1082
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

1083 1084
	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);

1085
	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
1086
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
1087
		first = get_a_page(rq, gfp);
1088 1089
		if (!first) {
			if (list)
1090
				give_pages(rq, list);
1091
			return -ENOMEM;
1092
		}
1093
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
1094

1095 1096 1097 1098
		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}
R
Rusty Russell 已提交
1099

1100
	first = get_a_page(rq, gfp);
1101
	if (!first) {
1102
		give_pages(rq, list);
1103 1104 1105 1106
		return -ENOMEM;
	}
	p = page_address(first);

1107
	/* rq->sg[0], rq->sg[1] share the same page */
1108 1109
	/* a separated rq->sg[0] for header - required in case !any_header_sg */
	sg_set_buf(&rq->sg[0], p, vi->hdr_len);
1110

1111
	/* rq->sg[1] for data packet, from offset */
1112
	offset = sizeof(struct padded_vnet_hdr);
1113
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
1114 1115 1116

	/* chain first in list head */
	first->private = (unsigned long)list;
1117 1118
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
1119
	if (err < 0)
1120
		give_pages(rq, first);
1121 1122

	return err;
R
Rusty Russell 已提交
1123 1124
}

1125
static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
1126 1127
					  struct ewma_pkt_len *avg_pkt_len,
					  unsigned int room)
1128
{
1129
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1130 1131
	unsigned int len;

1132 1133 1134 1135
	if (room)
		return PAGE_SIZE - room;

	len = hdr_len +	clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
1136
				rq->min_buf_len, PAGE_SIZE - hdr_len);
1137

1138
	return ALIGN(len, L1_CACHE_BYTES);
1139 1140
}

1141 1142
static int add_recvbuf_mergeable(struct virtnet_info *vi,
				 struct receive_queue *rq, gfp_t gfp)
1143
{
1144
	struct page_frag *alloc_frag = &rq->alloc_frag;
1145
	unsigned int headroom = virtnet_get_headroom(vi);
1146 1147
	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
1148
	char *buf;
1149
	void *ctx;
1150
	int err;
1151
	unsigned int len, hole;
1152

1153 1154 1155 1156 1157 1158
	/* Extra tailroom is needed to satisfy XDP's assumption. This
	 * means rx frags coalescing won't work, but consider we've
	 * disabled GSO for XDP, it won't be a big issue.
	 */
	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
1159
		return -ENOMEM;
1160

1161
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
1162
	buf += headroom; /* advance address leaving hole at front of pkt */
1163
	get_page(alloc_frag->page);
1164
	alloc_frag->offset += len + room;
1165
	hole = alloc_frag->size - alloc_frag->offset;
1166
	if (hole < len + room) {
1167 1168
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
1169
		 * the current buffer.
1170
		 */
1171 1172 1173
		len += hole;
		alloc_frag->offset += hole;
	}
1174

1175
	sg_init_one(rq->sg, buf, len);
1176
	ctx = mergeable_len_to_ctx(len, headroom);
1177
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
1178
	if (err < 0)
1179
		put_page(virt_to_head_page(buf));
1180

1181 1182
	return err;
}
1183

1184 1185 1186 1187 1188 1189 1190
/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
M
Michael S. Tsirkin 已提交
1191 1192
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
1193 1194
{
	int err;
1195
	bool oom;
1196

1197 1198
	do {
		if (vi->mergeable_rx_bufs)
1199
			err = add_recvbuf_mergeable(vi, rq, gfp);
1200
		else if (vi->big_packets)
1201
			err = add_recvbuf_big(vi, rq, gfp);
1202
		else
M
Michael S. Tsirkin 已提交
1203
			err = add_recvbuf_small(vi, rq, gfp);
1204

1205
		oom = err == -ENOMEM;
1206
		if (err)
1207
			break;
1208
	} while (rq->vq->num_free);
T
Toshiaki Makita 已提交
1209 1210
	if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) {
		u64_stats_update_begin(&rq->stats.syncp);
1211
		rq->stats.kicks++;
T
Toshiaki Makita 已提交
1212 1213 1214
		u64_stats_update_end(&rq->stats.syncp);
	}

1215
	return !oom;
1216 1217
}

1218
static void skb_recv_done(struct virtqueue *rvq)
R
Rusty Russell 已提交
1219 1220
{
	struct virtnet_info *vi = rvq->vdev->priv;
J
Jason Wang 已提交
1221
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
1222

1223
	virtqueue_napi_schedule(&rq->napi, rvq);
R
Rusty Russell 已提交
1224 1225
}

1226
static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
1227
{
1228
	napi_enable(napi);
1229 1230

	/* If all buffers were filled by other side before we napi_enabled, we
1231 1232 1233 1234 1235 1236
	 * won't get another interrupt, so process any outstanding packets now.
	 * Call local_bh_enable after to trigger softIRQ processing.
	 */
	local_bh_disable();
	virtqueue_napi_schedule(napi, vq);
	local_bh_enable();
1237 1238
}

W
Willem de Bruijn 已提交
1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256
static void virtnet_napi_tx_enable(struct virtnet_info *vi,
				   struct virtqueue *vq,
				   struct napi_struct *napi)
{
	if (!napi->weight)
		return;

	/* Tx napi touches cachelines on the cpu handling tx interrupts. Only
	 * enable the feature if this is likely affine with the transmit path.
	 */
	if (!vi->affinity_hint_set) {
		napi->weight = 0;
		return;
	}

	return virtnet_napi_enable(vq, napi);
}

1257 1258 1259 1260 1261 1262
static void virtnet_napi_tx_disable(struct napi_struct *napi)
{
	if (napi->weight)
		napi_disable(napi);
}

1263 1264
static void refill_work(struct work_struct *work)
{
1265 1266
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
1267
	bool still_empty;
J
Jason Wang 已提交
1268 1269
	int i;

1270
	for (i = 0; i < vi->curr_queue_pairs; i++) {
J
Jason Wang 已提交
1271
		struct receive_queue *rq = &vi->rq[i];
1272

J
Jason Wang 已提交
1273
		napi_disable(&rq->napi);
M
Michael S. Tsirkin 已提交
1274
		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
1275
		virtnet_napi_enable(rq->vq, &rq->napi);
1276

J
Jason Wang 已提交
1277 1278 1279 1280 1281 1282
		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
1283 1284
}

1285 1286
static int virtnet_receive(struct receive_queue *rq, int budget,
			   unsigned int *xdp_xmit)
R
Rusty Russell 已提交
1287
{
1288
	struct virtnet_info *vi = rq->vq->vdev->priv;
1289
	struct virtnet_rq_stats stats = {};
1290
	unsigned int len;
1291
	void *buf;
1292
	int i;
R
Rusty Russell 已提交
1293

1294
	if (!vi->big_packets || vi->mergeable_rx_bufs) {
1295 1296
		void *ctx;

1297
		while (stats.packets < budget &&
1298
		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
1299
			receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
1300
			stats.packets++;
1301 1302
		}
	} else {
1303
		while (stats.packets < budget &&
1304
		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
1305
			receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
1306
			stats.packets++;
1307
		}
R
Rusty Russell 已提交
1308 1309
	}

1310
	if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
M
Michael S. Tsirkin 已提交
1311
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
1312
			schedule_delayed_work(&vi->refill, 0);
1313
	}
R
Rusty Russell 已提交
1314

T
Toshiaki Makita 已提交
1315
	u64_stats_update_begin(&rq->stats.syncp);
1316 1317 1318 1319
	for (i = 0; i < VIRTNET_RQ_STATS_LEN; i++) {
		size_t offset = virtnet_rq_stats_desc[i].offset;
		u64 *item;

1320 1321
		item = (u64 *)((u8 *)&rq->stats + offset);
		*item += *(u64 *)((u8 *)&stats + offset);
1322
	}
T
Toshiaki Makita 已提交
1323
	u64_stats_update_end(&rq->stats.syncp);
J
Jason Wang 已提交
1324

1325
	return stats.packets;
1326 1327
}

1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340
static void free_old_xmit_skbs(struct send_queue *sq)
{
	struct sk_buff *skb;
	unsigned int len;
	unsigned int packets = 0;
	unsigned int bytes = 0;

	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		pr_debug("Sent skb %p\n", skb);

		bytes += skb->len;
		packets++;

1341
		dev_consume_skb_any(skb);
1342 1343 1344 1345 1346 1347 1348 1349
	}

	/* Avoid overhead when no packets have been processed
	 * happens when called speculatively from start_xmit.
	 */
	if (!packets)
		return;

T
Toshiaki Makita 已提交
1350 1351 1352 1353
	u64_stats_update_begin(&sq->stats.syncp);
	sq->stats.bytes += bytes;
	sq->stats.packets += packets;
	u64_stats_update_end(&sq->stats.syncp);
1354 1355
}

1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374
static void virtnet_poll_cleantx(struct receive_queue *rq)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	unsigned int index = vq2rxq(rq->vq);
	struct send_queue *sq = &vi->sq[index];
	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);

	if (!sq->napi.weight)
		return;

	if (__netif_tx_trylock(txq)) {
		free_old_xmit_skbs(sq);
		__netif_tx_unlock(txq);
	}

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);
}

1375 1376 1377 1378
static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
1379 1380
	struct virtnet_info *vi = rq->vq->vdev->priv;
	struct send_queue *sq;
1381
	unsigned int received;
1382
	unsigned int xdp_xmit = 0;
1383

1384 1385
	virtnet_poll_cleantx(rq);

J
Jason Wang 已提交
1386
	received = virtnet_receive(rq, budget, &xdp_xmit);
1387

1388
	/* Out of packets? */
1389 1390
	if (received < budget)
		virtqueue_napi_complete(napi, rq->vq, received);
R
Rusty Russell 已提交
1391

1392 1393 1394 1395
	if (xdp_xmit & VIRTIO_XDP_REDIR)
		xdp_do_flush_map();

	if (xdp_xmit & VIRTIO_XDP_TX) {
1396
		sq = virtnet_xdp_sq(vi);
T
Toshiaki Makita 已提交
1397 1398 1399 1400 1401
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
			u64_stats_update_begin(&sq->stats.syncp);
			sq->stats.kicks++;
			u64_stats_update_end(&sq->stats.syncp);
		}
1402
	}
J
Jason Wang 已提交
1403

R
Rusty Russell 已提交
1404 1405 1406
	return received;
}

J
Jason Wang 已提交
1407 1408 1409
static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
1410
	int i, err;
J
Jason Wang 已提交
1411

1412 1413 1414
	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
M
Michael S. Tsirkin 已提交
1415
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1416
				schedule_delayed_work(&vi->refill, 0);
1417 1418 1419 1420 1421

		err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i);
		if (err < 0)
			return err;

1422 1423 1424 1425 1426 1427 1428
		err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
						 MEM_TYPE_PAGE_SHARED, NULL);
		if (err < 0) {
			xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
			return err;
		}

1429
		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
W
Willem de Bruijn 已提交
1430
		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
J
Jason Wang 已提交
1431 1432 1433 1434 1435
	}

	return 0;
}

W
Willem de Bruijn 已提交
1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453
static int virtnet_poll_tx(struct napi_struct *napi, int budget)
{
	struct send_queue *sq = container_of(napi, struct send_queue, napi);
	struct virtnet_info *vi = sq->vq->vdev->priv;
	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, vq2txq(sq->vq));

	__netif_tx_lock(txq, raw_smp_processor_id());
	free_old_xmit_skbs(sq);
	__netif_tx_unlock(txq);

	virtqueue_napi_complete(napi, sq->vq, 0);

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);

	return 0;
}

1454
static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
R
Rusty Russell 已提交
1455
{
1456
	struct virtio_net_hdr_mrg_rxbuf *hdr;
R
Rusty Russell 已提交
1457
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
1458
	struct virtnet_info *vi = sq->vq->vdev->priv;
1459
	int num_sg;
1460
	unsigned hdr_len = vi->hdr_len;
1461
	bool can_push;
R
Rusty Russell 已提交
1462

J
Johannes Berg 已提交
1463
	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
1464 1465 1466 1467 1468 1469 1470

	can_push = vi->any_header_sg &&
		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
1471
		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
1472 1473
	else
		hdr = skb_vnet_hdr(skb);
R
Rusty Russell 已提交
1474

1475
	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
1476 1477
				    virtio_is_little_endian(vi->vdev), false,
				    0))
1478
		BUG();
R
Rusty Russell 已提交
1479

1480
	if (vi->mergeable_rx_bufs)
1481
		hdr->num_buffers = 0;
1482

1483
	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
1484 1485 1486
	if (can_push) {
		__skb_push(skb, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
1487 1488
		if (unlikely(num_sg < 0))
			return num_sg;
1489 1490 1491 1492
		/* Pull header back to avoid skew in tx bytes calculations. */
		__skb_pull(skb, hdr_len);
	} else {
		sg_set_buf(sq->sg, hdr, hdr_len);
1493 1494 1495 1496
		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
		if (unlikely(num_sg < 0))
			return num_sg;
		num_sg++;
1497
	}
1498
	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
1499 1500
}

1501
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
1502 1503
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1504 1505
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
1506
	int err;
1507 1508
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
	bool kick = !skb->xmit_more;
W
Willem de Bruijn 已提交
1509
	bool use_napi = sq->napi.weight;
1510 1511

	/* Free up any pending old buffers before queueing new ones. */
1512
	free_old_xmit_skbs(sq);
1513

1514 1515 1516
	if (use_napi && kick)
		virtqueue_enable_cb_delayed(sq->vq);

1517 1518 1519
	/* timestamp packet in software */
	skb_tx_timestamp(skb);

1520
	/* Try to transmit */
1521
	err = xmit_skb(sq, skb);
1522

1523
	/* This should not happen! */
1524
	if (unlikely(err)) {
1525 1526 1527
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
1528
				 "Unexpected TXQ (%d) queue failure: %d\n", qnum, err);
1529
		dev->stats.tx_dropped++;
1530
		dev_kfree_skb_any(skb);
1531
		return NETDEV_TX_OK;
R
Rusty Russell 已提交
1532
	}
1533

1534
	/* Don't wait up for transmitted skbs to be freed. */
W
Willem de Bruijn 已提交
1535 1536 1537 1538
	if (!use_napi) {
		skb_orphan(skb);
		nf_reset(skb);
	}
1539

1540 1541 1542 1543 1544 1545 1546 1547 1548
	/* If running out of space, stop queue to avoid getting packets that we
	 * are then unable to transmit.
	 * An alternative would be to force queuing layer to requeue the skb by
	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
	 * returned in a normal path of operation: it means that driver is not
	 * maintaining the TX queue stop/start state properly, and causes
	 * the stack to do a non-trivial amount of useless work.
	 * Since most packets only take 1 or 2 ring slots, stopping the queue
	 * early means 16 slots are typically wasted.
1549
	 */
1550
	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1551
		netif_stop_subqueue(dev, qnum);
W
Willem de Bruijn 已提交
1552 1553
		if (!use_napi &&
		    unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
1554
			/* More just got used, free them then recheck. */
1555 1556
			free_old_xmit_skbs(sq);
			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1557
				netif_start_subqueue(dev, qnum);
1558
				virtqueue_disable_cb(sq->vq);
1559 1560
			}
		}
1561
	}
1562

T
Toshiaki Makita 已提交
1563 1564 1565 1566 1567 1568 1569
	if (kick || netif_xmit_stopped(txq)) {
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
			u64_stats_update_begin(&sq->stats.syncp);
			sq->stats.kicks++;
			u64_stats_update_end(&sq->stats.syncp);
		}
	}
R
Rusty Russell 已提交
1570

1571
	return NETDEV_TX_OK;
1572 1573
}

1574 1575 1576
/*
 * Send command via the control virtqueue and check status.  Commands
 * supported by the hypervisor, as indicated by feature bits, should
S
stephen hemminger 已提交
1577
 * never fail unless improperly formatted.
1578 1579
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
1580
				 struct scatterlist *out)
1581
{
1582
	struct scatterlist *sgs[4], hdr, stat;
1583
	unsigned out_num = 0, tmp;
1584 1585

	/* Caller should know better */
1586
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
1587

1588 1589 1590
	vi->ctrl->status = ~0;
	vi->ctrl->hdr.class = class;
	vi->ctrl->hdr.cmd = cmd;
1591
	/* Add header */
1592
	sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr));
1593
	sgs[out_num++] = &hdr;
1594

1595 1596
	if (out)
		sgs[out_num++] = out;
1597

1598
	/* Add return status. */
1599
	sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status));
1600
	sgs[out_num] = &stat;
1601

1602
	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
1603
	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
1604

1605
	if (unlikely(!virtqueue_kick(vi->cvq)))
1606
		return vi->ctrl->status == VIRTIO_NET_OK;
1607 1608 1609 1610

	/* Spin for a response, the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
1611 1612
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
1613 1614
		cpu_relax();

1615
	return vi->ctrl->status == VIRTIO_NET_OK;
1616 1617
}

1618 1619 1620 1621
static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
1622
	int ret;
1623
	struct sockaddr *addr;
1624
	struct scatterlist sg;
1625

1626 1627 1628
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
		return -EOPNOTSUPP;

1629
	addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
1630 1631 1632 1633
	if (!addr)
		return -ENOMEM;

	ret = eth_prepare_mac_addr_change(dev, addr);
1634
	if (ret)
1635
		goto out;
1636

1637 1638 1639
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
		sg_init_one(&sg, addr->sa_data, dev->addr_len);
		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1640
					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
1641 1642
			dev_warn(&vdev->dev,
				 "Failed to set mac address by vq command.\n");
1643 1644
			ret = -EINVAL;
			goto out;
1645
		}
1646 1647
	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1648 1649 1650 1651 1652 1653 1654
		unsigned int i;

		/* Naturally, this has an atomicity problem. */
		for (i = 0; i < dev->addr_len; i++)
			virtio_cwrite8(vdev,
				       offsetof(struct virtio_net_config, mac) +
				       i, addr->sa_data[i]);
1655 1656 1657
	}

	eth_commit_mac_addr_change(dev, p);
1658
	ret = 0;
1659

1660 1661 1662
out:
	kfree(addr);
	return ret;
1663 1664
}

1665 1666
static void virtnet_stats(struct net_device *dev,
			  struct rtnl_link_stats64 *tot)
1667 1668 1669
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int start;
T
Toshiaki Makita 已提交
1670
	int i;
1671

T
Toshiaki Makita 已提交
1672
	for (i = 0; i < vi->max_queue_pairs; i++) {
1673
		u64 tpackets, tbytes, rpackets, rbytes, rdrops;
T
Toshiaki Makita 已提交
1674 1675
		struct receive_queue *rq = &vi->rq[i];
		struct send_queue *sq = &vi->sq[i];
1676 1677

		do {
T
Toshiaki Makita 已提交
1678 1679 1680 1681
			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
			tpackets = sq->stats.packets;
			tbytes   = sq->stats.bytes;
		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
1682 1683

		do {
T
Toshiaki Makita 已提交
1684
			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
1685 1686 1687
			rpackets = rq->stats.packets;
			rbytes   = rq->stats.bytes;
			rdrops   = rq->stats.drops;
T
Toshiaki Makita 已提交
1688
		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
1689 1690 1691 1692 1693

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes   += rbytes;
		tot->tx_bytes   += tbytes;
1694
		tot->rx_dropped += rdrops;
1695 1696 1697
	}

	tot->tx_dropped = dev->stats.tx_dropped;
1698
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
1699 1700 1701 1702
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
}

1703 1704 1705 1706
static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
1707
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
1708 1709 1710 1711
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}

1712
static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
J
Jason Wang 已提交
1713 1714 1715 1716 1717 1718 1719
{
	struct scatterlist sg;
	struct net_device *dev = vi->dev;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

1720 1721
	vi->ctrl->mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
	sg_init_one(&sg, &vi->ctrl->mq, sizeof(vi->ctrl->mq));
J
Jason Wang 已提交
1722 1723

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
1724
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
J
Jason Wang 已提交
1725 1726 1727
		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
1728
	} else {
J
Jason Wang 已提交
1729
		vi->curr_queue_pairs = queue_pairs;
1730 1731 1732
		/* virtnet_open() will refill when device is going to up. */
		if (dev->flags & IFF_UP)
			schedule_delayed_work(&vi->refill, 0);
1733
	}
J
Jason Wang 已提交
1734 1735 1736 1737

	return 0;
}

1738 1739 1740 1741 1742 1743 1744 1745 1746 1747
static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	int err;

	rtnl_lock();
	err = _virtnet_set_queues(vi, queue_pairs);
	rtnl_unlock();
	return err;
}

static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);

	for (i = 0; i < vi->max_queue_pairs; i++) {
		xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
		napi_disable(&vi->rq[i].napi);
		virtnet_napi_tx_disable(&vi->sq[i].napi);
	}

	return 0;
}

static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg[2];
	struct virtio_net_ctrl_mac *mac_data;
	struct netdev_hw_addr *ha;
	int uc_count;
	int mc_count;
	void *buf;
	int i;

	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

	vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
	vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);

	sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
			 vi->ctrl->promisc ? "en" : "dis");

	sg_init_one(sg, &vi->ctrl->allmulti, sizeof(vi->ctrl->allmulti));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
			 vi->ctrl->allmulti ? "en" : "dis");

	uc_count = netdev_uc_count(dev);
	mc_count = netdev_mc_count(dev);
	/* MAC filter - use one buffer for both lists */
	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
	mac_data = buf;
	if (!buf)
		return;

	sg_init_table(sg, 2);

	/* Store the unicast list and count in the front of the buffer */
	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
	i = 0;
	netdev_for_each_uc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[0], mac_data,
		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));

	/* multicast list and count fill the end */
	mac_data = (void *)&mac_data->macs[uc_count][0];

	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
	i = 0;
	netdev_for_each_mc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[1], mac_data,
		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");

	kfree(buf);
}

static int virtnet_vlan_rx_add_vid(struct net_device *dev,
				   __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
	return 0;
}

static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
				    __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
	return 0;
}

static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu)
{
	int i;

	if (vi->affinity_hint_set) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtqueue_set_affinity(vi->rq[i].vq, NULL);
			virtqueue_set_affinity(vi->sq[i].vq, NULL);
		}

		vi->affinity_hint_set = false;
	}
}
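
/* Spread the online CPUs over the active queue pairs in contiguous groups:
 * each pair gets num_cpu / curr_queue_pairs CPUs and the first
 * (num_cpu % curr_queue_pairs) pairs get one extra.  For example
 * (hypothetical numbers), 8 online CPUs and 3 queue pairs give groups of
 * 3, 3 and 2 CPUs.
 */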

static void virtnet_set_affinity(struct virtnet_info *vi)
{
	cpumask_var_t mask;
	int stragglers;
	int group_size;
	int i, j, cpu;
	int num_cpu;
	int stride;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
		virtnet_clean_affinity(vi, -1);
		return;
	}

	num_cpu = num_online_cpus();
	stride = max_t(int, num_cpu / vi->curr_queue_pairs, 1);
	stragglers = num_cpu >= vi->curr_queue_pairs ?
			num_cpu % vi->curr_queue_pairs :
			0;
	cpu = cpumask_next(-1, cpu_online_mask);

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		group_size = stride + (i < stragglers ? 1 : 0);

		for (j = 0; j < group_size; j++) {
			cpumask_set_cpu(cpu, mask);
			cpu = cpumask_next_wrap(cpu, cpu_online_mask,
						nr_cpu_ids, false);
		}
		virtqueue_set_affinity(vi->rq[i].vq, mask);
		virtqueue_set_affinity(vi->sq[i].vq, mask);
		__netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, false);
		cpumask_clear(mask);
	}

	vi->affinity_hint_set = true;
	free_cpumask_var(mask);
}

static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node_dead);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);

	virtnet_clean_affinity(vi, cpu);
	return 0;
}

static enum cpuhp_state virtionet_online;

static int virtnet_cpu_notif_add(struct virtnet_info *vi)
{
	int ret;

	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
	if (ret)
		return ret;
	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					       &vi->node_dead);
	if (!ret)
		return ret;
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	return ret;
}

static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
{
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					    &vi->node_dead);
}

static void virtnet_get_ringparam(struct net_device *dev,
				struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}


static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));

}

/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
		return -EINVAL;

	/* For now we don't support modifying channels while XDP is loaded.
	 * Also, when XDP is loaded all RX queues have XDP programs, so we
	 * only need to check a single RX queue.
	 */
	if (vi->rq[0].xdp_prog)
		return -EINVAL;

	get_online_cpus();
	err = _virtnet_set_queues(vi, queue_pairs);
	if (!err) {
		netif_set_real_num_tx_queues(dev, queue_pairs);
		netif_set_real_num_rx_queues(dev, queue_pairs);

		virtnet_set_affinity(vi);
	}
	put_online_cpus();

	return err;
}

static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
	struct virtnet_info *vi = netdev_priv(dev);
	char *p = (char *)data;
	unsigned int i, j;

	switch (stringset) {
	case ETH_SS_STATS:
		for (i = 0; i < vi->curr_queue_pairs; i++) {
			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
				snprintf(p, ETH_GSTRING_LEN, "rx_queue_%u_%s",
					 i, virtnet_rq_stats_desc[j].desc);
				p += ETH_GSTRING_LEN;
			}
		}

		for (i = 0; i < vi->curr_queue_pairs; i++) {
			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) {
				snprintf(p, ETH_GSTRING_LEN, "tx_queue_%u_%s",
					 i, virtnet_sq_stats_desc[j].desc);
				p += ETH_GSTRING_LEN;
			}
		}
		break;
	}
}

static int virtnet_get_sset_count(struct net_device *dev, int sset)
{
	struct virtnet_info *vi = netdev_priv(dev);

	switch (sset) {
	case ETH_SS_STATS:
		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
					       VIRTNET_SQ_STATS_LEN);
	default:
		return -EOPNOTSUPP;
	}
}

static void virtnet_get_ethtool_stats(struct net_device *dev,
				      struct ethtool_stats *stats, u64 *data)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int idx = 0, start, i, j;
	const u8 *stats_base;
	size_t offset;

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct receive_queue *rq = &vi->rq[i];

		stats_base = (u8 *)&rq->stats;
		do {
			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
				offset = virtnet_rq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
		idx += VIRTNET_RQ_STATS_LEN;
	}

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct send_queue *sq = &vi->sq[i];

		stats_base = (u8 *)&sq->stats;
		do {
			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) {
				offset = virtnet_sq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
		idx += VIRTNET_SQ_STATS_LEN;
	}
}

static void virtnet_get_channels(struct net_device *dev,
				 struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);

	channels->combined_count = vi->curr_queue_pairs;
	channels->max_combined = vi->max_queue_pairs;
	channels->max_other = 0;
	channels->rx_count = 0;
	channels->tx_count = 0;
	channels->other_count = 0;
}

/* Check if the user is trying to change anything besides speed/duplex */
static bool
virtnet_validate_ethtool_cmd(const struct ethtool_link_ksettings *cmd)
{
	struct ethtool_link_ksettings diff1 = *cmd;
	struct ethtool_link_ksettings diff2 = {};

	/* cmd is always set so we need to clear it, validate the port type
	 * and also without autonegotiation we can ignore advertising
	 */
	diff1.base.speed = 0;
	diff2.base.port = PORT_OTHER;
	ethtool_link_ksettings_zero_link_mode(&diff1, advertising);
	diff1.base.duplex = 0;
	diff1.base.cmd = 0;
	diff1.base.link_mode_masks_nwords = 0;

	return !memcmp(&diff1.base, &diff2.base, sizeof(diff1.base)) &&
		bitmap_empty(diff1.link_modes.supported,
			     __ETHTOOL_LINK_MODE_MASK_NBITS) &&
		bitmap_empty(diff1.link_modes.advertising,
			     __ETHTOOL_LINK_MODE_MASK_NBITS) &&
		bitmap_empty(diff1.link_modes.lp_advertising,
			     __ETHTOOL_LINK_MODE_MASK_NBITS);
}

static int virtnet_set_link_ksettings(struct net_device *dev,
				      const struct ethtool_link_ksettings *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u32 speed;

	speed = cmd->base.speed;
	/* don't allow custom speed and duplex */
	if (!ethtool_validate_speed(speed) ||
	    !ethtool_validate_duplex(cmd->base.duplex) ||
	    !virtnet_validate_ethtool_cmd(cmd))
		return -EINVAL;
	vi->speed = speed;
	vi->duplex = cmd->base.duplex;

	return 0;
}

static int virtnet_get_link_ksettings(struct net_device *dev,
				      struct ethtool_link_ksettings *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);

	cmd->base.speed = vi->speed;
	cmd->base.duplex = vi->duplex;
	cmd->base.port = PORT_OTHER;

	return 0;
}
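
/* Only tx-frames values 0 and 1 are meaningful in the coalescing handler
 * below: 0 gives the TX NAPI a zero weight (disabling NAPI-driven TX
 * completion), 1 restores the default weight.  Any other coalescing change
 * is rejected, and the weight can only be flipped while the interface is
 * down.
 */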

static int virtnet_set_coalesce(struct net_device *dev,
				struct ethtool_coalesce *ec)
{
	struct ethtool_coalesce ec_default = {
		.cmd = ETHTOOL_SCOALESCE,
		.rx_max_coalesced_frames = 1,
	};
	struct virtnet_info *vi = netdev_priv(dev);
	int i, napi_weight;

	if (ec->tx_max_coalesced_frames > 1)
		return -EINVAL;

	ec_default.tx_max_coalesced_frames = ec->tx_max_coalesced_frames;
	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;

	/* disallow changes to fields not explicitly tested above */
	if (memcmp(ec, &ec_default, sizeof(ec_default)))
		return -EINVAL;

	if (napi_weight ^ vi->sq[0].napi.weight) {
		if (dev->flags & IFF_UP)
			return -EBUSY;
		for (i = 0; i < vi->max_queue_pairs; i++)
			vi->sq[i].napi.weight = napi_weight;
	}

	return 0;
}

static int virtnet_get_coalesce(struct net_device *dev,
				struct ethtool_coalesce *ec)
{
	struct ethtool_coalesce ec_default = {
		.cmd = ETHTOOL_GCOALESCE,
		.rx_max_coalesced_frames = 1,
	};
	struct virtnet_info *vi = netdev_priv(dev);

	memcpy(ec, &ec_default, sizeof(ec_default));

	if (vi->sq[0].napi.weight)
		ec->tx_max_coalesced_frames = 1;

	return 0;
}

static void virtnet_init_settings(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	vi->speed = SPEED_UNKNOWN;
	vi->duplex = DUPLEX_UNKNOWN;
}

static void virtnet_update_settings(struct virtnet_info *vi)
{
	u32 speed;
	u8 duplex;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
		return;

	speed = virtio_cread32(vi->vdev, offsetof(struct virtio_net_config,
						  speed));
	if (ethtool_validate_speed(speed))
		vi->speed = speed;
	duplex = virtio_cread8(vi->vdev, offsetof(struct virtio_net_config,
						  duplex));
	if (ethtool_validate_duplex(duplex))
		vi->duplex = duplex;
}

static const struct ethtool_ops virtnet_ethtool_ops = {
	.get_drvinfo = virtnet_get_drvinfo,
	.get_link = ethtool_op_get_link,
	.get_ringparam = virtnet_get_ringparam,
	.get_strings = virtnet_get_strings,
	.get_sset_count = virtnet_get_sset_count,
	.get_ethtool_stats = virtnet_get_ethtool_stats,
	.set_channels = virtnet_set_channels,
	.get_channels = virtnet_get_channels,
	.get_ts_info = ethtool_op_get_ts_info,
	.get_link_ksettings = virtnet_get_link_ksettings,
	.set_link_ksettings = virtnet_set_link_ksettings,
	.set_coalesce = virtnet_set_coalesce,
	.get_coalesce = virtnet_get_coalesce,
};

static void virtnet_freeze_down(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int i;

	/* Make sure no work handler is accessing the device */
	flush_work(&vi->config_work);

	netif_tx_lock_bh(vi->dev);
	netif_device_detach(vi->dev);
	netif_tx_unlock_bh(vi->dev);
	cancel_delayed_work_sync(&vi->refill);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			napi_disable(&vi->rq[i].napi);
			virtnet_napi_tx_disable(&vi->sq[i].napi);
		}
	}
}

static int init_vqs(struct virtnet_info *vi);

static int virtnet_restore_up(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err, i;

	err = init_vqs(vi);
	if (err)
		return err;

	virtio_device_ready(vdev);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->curr_queue_pairs; i++)
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
	}

	netif_tx_lock_bh(vi->dev);
	netif_device_attach(vi->dev);
	netif_tx_unlock_bh(vi->dev);
	return err;
}

static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
{
	struct scatterlist sg;
	vi->ctrl->offloads = cpu_to_virtio64(vi->vdev, offloads);

	sg_init_one(&sg, &vi->ctrl->offloads, sizeof(vi->ctrl->offloads));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
				  VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
		dev_warn(&vi->dev->dev, "Failed to set guest offloads.\n");
		return -EINVAL;
	}

	return 0;
}
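
/* XDP programs generally cannot deal with GSO packets or rely on offloaded
 * checksums, so the guest offloads are cleared while a program is attached
 * and restored once it is removed (see virtnet_xdp_set()).
 */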

static int virtnet_clear_guest_offloads(struct virtnet_info *vi)
{
	u64 offloads = 0;

	if (!vi->guest_offloads)
		return 0;

	return virtnet_set_guest_offloads(vi, offloads);
}

static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
{
	u64 offloads = vi->guest_offloads;

	if (!vi->guest_offloads)
		return 0;

	return virtnet_set_guest_offloads(vi, offloads);
}

static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			   struct netlink_ext_ack *extack)
{
	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
	struct virtnet_info *vi = netdev_priv(dev);
	struct bpf_prog *old_prog;
	u16 xdp_qp = 0, curr_qp;
	int i, err;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
	    && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	        virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	        virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))) {
		NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO/CSUM, disable LRO/CSUM first");
		return -EOPNOTSUPP;
	}

	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
		NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
		return -EINVAL;
	}

	if (dev->mtu > max_sz) {
		NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
		return -EINVAL;
	}
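
	/* One extra TX queue per possible CPU is reserved for XDP_TX below,
	 * so that XDP transmissions can run per-CPU without contending with
	 * the queues used by the regular stack.
	 */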

	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
	if (prog)
		xdp_qp = nr_cpu_ids;

	/* XDP requires extra queues for XDP_TX */
	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
		NL_SET_ERR_MSG_MOD(extack, "Too few free TX rings available");
		netdev_warn(dev, "request %i queues but max is %i\n",
			    curr_qp + xdp_qp, vi->max_queue_pairs);
		return -ENOMEM;
	}

	if (prog) {
		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
		if (IS_ERR(prog))
			return PTR_ERR(prog);
	}

	/* Make sure NAPI is not using any XDP TX queues for RX. */
	if (netif_running(dev))
		for (i = 0; i < vi->max_queue_pairs; i++)
			napi_disable(&vi->rq[i].napi);

	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
	err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
	if (err)
		goto err;
	vi->xdp_queue_pairs = xdp_qp;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
		if (i == 0) {
			if (!old_prog)
				virtnet_clear_guest_offloads(vi);
			if (!prog)
				virtnet_restore_guest_offloads(vi);
		}
		if (old_prog)
			bpf_prog_put(old_prog);
		if (netif_running(dev))
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
	}

	return 0;

err:
	for (i = 0; i < vi->max_queue_pairs; i++)
		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
	if (prog)
		bpf_prog_sub(prog, vi->max_queue_pairs - 1);
	return err;
}

static u32 virtnet_xdp_query(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	const struct bpf_prog *xdp_prog;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		xdp_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		if (xdp_prog)
			return xdp_prog->aux->id;
	}
	return 0;
}

static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
	case XDP_QUERY_PROG:
		xdp->prog_id = virtnet_xdp_query(dev);
		return 0;
	default:
		return -EINVAL;
	}
}

static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
				      size_t len)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int ret;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
		return -EOPNOTSUPP;

	ret = snprintf(buf, len, "sby");
	if (ret >= len)
		return -EOPNOTSUPP;

	return 0;
}

static const struct net_device_ops virtnet_netdev = {
	.ndo_open            = virtnet_open,
	.ndo_stop            = virtnet_close,
	.ndo_start_xmit      = start_xmit,
	.ndo_validate_addr   = eth_validate_addr,
	.ndo_set_mac_address = virtnet_set_mac_address,
	.ndo_set_rx_mode     = virtnet_set_rx_mode,
	.ndo_get_stats64     = virtnet_stats,
	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
	.ndo_bpf		= virtnet_xdp,
	.ndo_xdp_xmit		= virtnet_xdp_xmit,
	.ndo_features_check	= passthru_features_check,
	.ndo_get_phys_port_name	= virtnet_get_phys_port_name,
};
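
/* Deferred handler for config-change interrupts: it acks an announce
 * (VIRTIO_NET_S_ANNOUNCE) request and propagates the device's link state to
 * the network core.
 */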

static void virtnet_config_changed_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, config_work);
	u16 v;

	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
				 struct virtio_net_config, status, &v) < 0)
		return;

	if (v & VIRTIO_NET_S_ANNOUNCE) {
		netdev_notify_peers(vi->dev);
		virtnet_ack_link_announce(vi);
	}

	/* Ignore unknown (future) status bits */
	v &= VIRTIO_NET_S_LINK_UP;

	if (vi->status == v)
		return;

	vi->status = v;

	if (vi->status & VIRTIO_NET_S_LINK_UP) {
		virtnet_update_settings(vi);
		netif_carrier_on(vi->dev);
		netif_tx_wake_all_queues(vi->dev);
	} else {
		netif_carrier_off(vi->dev);
		netif_tx_stop_all_queues(vi->dev);
	}
}

static void virtnet_config_changed(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	schedule_work(&vi->config_work);
}

static void virtnet_free_queues(struct virtnet_info *vi)
{
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		napi_hash_del(&vi->rq[i].napi);
		netif_napi_del(&vi->rq[i].napi);
		netif_napi_del(&vi->sq[i].napi);
	}

	/* We called napi_hash_del() before netif_napi_del(),
	 * we need to respect an RCU grace period before freeing vi->rq
	 */
	synchronize_net();

	kfree(vi->rq);
	kfree(vi->sq);
	kfree(vi->ctrl);
}

static void _free_receive_bufs(struct virtnet_info *vi)
{
	struct bpf_prog *old_prog;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		while (vi->rq[i].pages)
			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);

		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
		if (old_prog)
			bpf_prog_put(old_prog);
	}
}

static void free_receive_bufs(struct virtnet_info *vi)
{
	rtnl_lock();
	_free_receive_bufs(vi);
	rtnl_unlock();
}

static void free_receive_page_frags(struct virtnet_info *vi)
{
	int i;
	for (i = 0; i < vi->max_queue_pairs; i++)
		if (vi->rq[i].alloc_frag.page)
			put_page(vi->rq[i].alloc_frag.page);
}
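
/* The last xdp_queue_pairs queue pairs are dedicated to XDP_TX; buffers
 * queued on those send queues are raw pages (XDP frames) rather than
 * sk_buffs, so free_unused_bufs() releases them with put_page() instead of
 * dev_kfree_skb().
 */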

static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
{
	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
		return false;
	else if (q < vi->curr_queue_pairs)
		return true;
	else
		return false;
}

static void free_unused_bufs(struct virtnet_info *vi)
{
	void *buf;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->sq[i].vq;
		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (!is_xdp_raw_buffer_queue(vi, i))
				dev_kfree_skb(buf);
			else
				put_page(virt_to_head_page(buf));
		}
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->rq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (vi->mergeable_rx_bufs) {
				put_page(virt_to_head_page(buf));
			} else if (vi->big_packets) {
				give_pages(&vi->rq[i], buf);
			} else {
				put_page(virt_to_head_page(buf));
			}
		}
	}
}

static void virtnet_del_vqs(struct virtnet_info *vi)
{
	struct virtio_device *vdev = vi->vdev;

	virtnet_clean_affinity(vi, -1);

	vdev->config->del_vqs(vdev);

	virtnet_free_queues(vi);
}

/* How large should a single buffer be so a queue full of these can fit at
 * least one full packet?
 * Logic below assumes the mergeable buffer header is used.
 */
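/* Worked example (hypothetical numbers): with a 1500-byte MTU, the 12-byte
 * mergeable header and a 256-entry ring, buf_len = 12 + 14 + 4 + 1500 = 1530
 * and min_buf_len = DIV_ROUND_UP(1530, 256) = 6, so the result is clamped up
 * to GOOD_PACKET_LEN (128).
 */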
static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
{
	const unsigned int hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int rq_size = virtqueue_get_vring_size(vq);
	unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu;
	unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len;
	unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size);

	return max(max(min_buf_len, hdr_len) - hdr_len,
		   (unsigned int)GOOD_PACKET_LEN);
}

static int virtnet_find_vqs(struct virtnet_info *vi)
{
	vq_callback_t **callbacks;
	struct virtqueue **vqs;
	int ret = -ENOMEM;
	int i, total_vqs;
	const char **names;
	bool *ctx;

	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
	 * possible control vq.
	 */
	total_vqs = vi->max_queue_pairs * 2 +
		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
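	/* e.g. a hypothetical 4 queue pairs plus a control vq -> 4 * 2 + 1 = 9 */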

	/* Allocate space for find_vqs parameters */
	vqs = kcalloc(total_vqs, sizeof(*vqs), GFP_KERNEL);
	if (!vqs)
		goto err_vq;
	callbacks = kmalloc_array(total_vqs, sizeof(*callbacks), GFP_KERNEL);
	if (!callbacks)
		goto err_callback;
	names = kmalloc_array(total_vqs, sizeof(*names), GFP_KERNEL);
	if (!names)
		goto err_names;
	if (!vi->big_packets || vi->mergeable_rx_bufs) {
		ctx = kcalloc(total_vqs, sizeof(*ctx), GFP_KERNEL);
		if (!ctx)
			goto err_ctx;
	} else {
		ctx = NULL;
	}

	/* Parameters for control virtqueue, if any */
	if (vi->has_cvq) {
		callbacks[total_vqs - 1] = NULL;
		names[total_vqs - 1] = "control";
	}

	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
		if (ctx)
			ctx[rxq2vq(i)] = true;
	}

	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
					 names, ctx, NULL);
	if (ret)
		goto err_find;

	if (vi->has_cvq) {
		vi->cvq = vqs[total_vqs - 1];
		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].vq = vqs[rxq2vq(i)];
		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
		vi->sq[i].vq = vqs[txq2vq(i)];
	}

	/* On success we fall through the labels below, which only free the
	 * temporary parameter arrays; ret is 0 here.
	 */

err_find:
	kfree(ctx);
err_ctx:
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;
}

static int virtnet_alloc_queues(struct virtnet_info *vi)
{
	int i;

	vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL);
	if (!vi->ctrl)
		goto err_ctrl;
	vi->sq = kcalloc(vi->max_queue_pairs, sizeof(*vi->sq), GFP_KERNEL);
	if (!vi->sq)
		goto err_sq;
	vi->rq = kcalloc(vi->max_queue_pairs, sizeof(*vi->rq), GFP_KERNEL);
	if (!vi->rq)
		goto err_rq;

	INIT_DELAYED_WORK(&vi->refill, refill_work);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].pages = NULL;
		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
			       napi_weight);
		netif_tx_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx,
				  napi_tx ? napi_weight : 0);

		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));

		u64_stats_init(&vi->rq[i].stats.syncp);
		u64_stats_init(&vi->sq[i].stats.syncp);
	}

	return 0;

err_rq:
	kfree(vi->sq);
err_sq:
	kfree(vi->ctrl);
err_ctrl:
	return -ENOMEM;
}

static int init_vqs(struct virtnet_info *vi)
{
	int ret;

	/* Allocate send & receive queues */
	ret = virtnet_alloc_queues(vi);
	if (ret)
		goto err;

	ret = virtnet_find_vqs(vi);
	if (ret)
		goto err_free;

	get_online_cpus();
	virtnet_set_affinity(vi);
	put_online_cpus();

	return 0;

err_free:
	virtnet_free_queues(vi);
err:
	return ret;
}

#ifdef CONFIG_SYSFS
static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
		char *buf)
{
	struct virtnet_info *vi = netdev_priv(queue->dev);
	unsigned int queue_index = get_netdev_rx_queue_index(queue);
	unsigned int headroom = virtnet_get_headroom(vi);
	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
	struct ewma_pkt_len *avg;

	BUG_ON(queue_index >= vi->max_queue_pairs);
	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
	return sprintf(buf, "%u\n",
		       get_mergeable_buf_len(&vi->rq[queue_index], avg,
				       SKB_DATA_ALIGN(headroom + tailroom)));
}

static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
	__ATTR_RO(mergeable_rx_buffer_size);

static struct attribute *virtio_net_mrg_rx_attrs[] = {
	&mergeable_rx_buffer_size_attribute.attr,
	NULL
};

static const struct attribute_group virtio_net_mrg_rx_group = {
	.name = "virtio_net",
	.attrs = virtio_net_mrg_rx_attrs
};
#endif

static bool virtnet_fail_on_feature(struct virtio_device *vdev,
				    unsigned int fbit,
				    const char *fname, const char *dname)
{
	if (!virtio_has_feature(vdev, fbit))
		return false;

	dev_err(&vdev->dev, "device advertises feature %s but not %s",
		fname, dname);

	return true;
}

#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)

static bool virtnet_validate_features(struct virtio_device *vdev)
{
	if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
	    (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
			     "VIRTIO_NET_F_CTRL_VQ"))) {
		return false;
	}

	return true;
}

#define MIN_MTU ETH_MIN_MTU
#define MAX_MTU ETH_MAX_MTU

static int virtnet_validate(struct virtio_device *vdev)
{
	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

	if (!virtnet_validate_features(vdev))
		return -EINVAL;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		int mtu = virtio_cread16(vdev,
					 offsetof(struct virtio_net_config,
						  mtu));
		if (mtu < MIN_MTU)
			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
	}

	return 0;
}

static int virtnet_probe(struct virtio_device *vdev)
{
	int i, err = -ENOMEM;
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;
	int mtu;

	/* Find if host supports multiqueue virtio_net device */
	err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
				   struct virtio_net_config,
				   max_virtqueue_pairs, &max_queue_pairs);

	/* We need at least 2 queues */
	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;

	/* Allocate ourselves a network device with room for our info */
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
	dev->netdev_ops = &virtnet_netdev;
	dev->features = NETIF_F_HIGHDMA;

	dev->ethtool_ops = &virtnet_ethtool_ops;
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
		/* This opens up the world of extra features. */
		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
		if (csum)
			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
			dev->hw_features |= NETIF_F_TSO
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
		/* Individual feature bits: what can host handle? */
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;

		dev->features |= NETIF_F_GSO_ROBUST;

		if (gso)
			dev->features |= dev->hw_features & NETIF_F_ALL_TSO;
		/* (!csum && gso) case will be fixed by register_netdev() */
	}
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
		dev->features |= NETIF_F_RXCSUM;

	dev->vlan_features = dev->features;

	/* MTU range: 68 - 65535 */
	dev->min_mtu = MIN_MTU;
	dev->max_mtu = MAX_MTU;

	/* Configuration may specify what MAC to use.  Otherwise random. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
		virtio_cread_bytes(vdev,
				   offsetof(struct virtio_net_config, mac),
				   dev->dev_addr, dev->addr_len);
	else
		eth_hw_addr_random(dev);

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
	vdev->priv = vi;

	INIT_WORK(&vi->config_work, virtnet_config_changed_work);

	/* If we can receive ANY GSO packets, we must allocate large ones. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
		vi->big_packets = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		vi->hdr_len = sizeof(struct virtio_net_hdr);

	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->any_header_sg = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		mtu = virtio_cread16(vdev,
				     offsetof(struct virtio_net_config,
					      mtu));
		if (mtu < dev->min_mtu) {
			/* Should never trigger: MTU was previously validated
			 * in virtnet_validate.
			 */
			dev_err(&vdev->dev, "device MTU appears to have changed "
				"it is now %d < %d", mtu, dev->min_mtu);
			goto free;
		}

		dev->mtu = mtu;
		dev->max_mtu = mtu;

		/* TODO: size buffers correctly in this case. */
		if (dev->mtu > ETH_DATA_LEN)
			vi->big_packets = true;
	}

	if (vi->any_header_sg)
		dev->needed_headroom = vi->hdr_len;

	/* Enable multiqueue by default */
	if (num_online_cpus() >= max_queue_pairs)
		vi->curr_queue_pairs = max_queue_pairs;
	else
		vi->curr_queue_pairs = num_online_cpus();
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
	err = init_vqs(vi);
	if (err)
		goto free;

#ifdef CONFIG_SYSFS
	if (vi->mergeable_rx_bufs)
		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
#endif
	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);

	virtnet_init_settings(dev);

	if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
		vi->failover = net_failover_create(vi->dev);
		if (IS_ERR(vi->failover)) {
			err = PTR_ERR(vi->failover);
			goto free_vqs;
		}
	}

	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
		goto free_failover;
	}

	virtio_device_ready(vdev);

	err = virtnet_cpu_notif_add(vi);
	if (err) {
		pr_debug("virtio_net: registering cpu notifier failed\n");
		goto free_unregister_netdev;
	}

	virtnet_set_queues(vi, vi->curr_queue_pairs);

	/* Assume link up if device can't report link status,
	   otherwise get link status from config. */
	netif_carrier_off(dev);
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		schedule_work(&vi->config_work);
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
		virtnet_update_settings(vi);
		netif_carrier_on(dev);
	}

	for (i = 0; i < ARRAY_SIZE(guest_offloads); i++)
		if (virtio_has_feature(vi->vdev, guest_offloads[i]))
			set_bit(guest_offloads[i], &vi->guest_offloads);

	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
		 dev->name, max_queue_pairs);

	return 0;

free_unregister_netdev:
	vi->vdev->config->reset(vdev);

	unregister_netdev(dev);
free_failover:
	net_failover_destroy(vi->failover);
free_vqs:
	cancel_delayed_work_sync(&vi->refill);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
free:
	free_netdev(dev);
	return err;
}

static void remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);

	/* Free unused buffers in both send and recv, if any. */
	free_unused_bufs(vi);

	free_receive_bufs(vi);

	free_receive_page_frags(vi);

	virtnet_del_vqs(vi);
}

static void virtnet_remove(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device. */
	flush_work(&vi->config_work);

	unregister_netdev(vi->dev);

	net_failover_destroy(vi->failover);

	remove_vq_common(vi);

	free_netdev(vi->dev);
}

static __maybe_unused int virtnet_freeze(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);
	virtnet_freeze_down(vdev);
	remove_vq_common(vi);

	return 0;
}

static __maybe_unused int virtnet_restore(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err;

	err = virtnet_restore_up(vdev);
	if (err)
		return err;
	virtnet_set_queues(vi, vi->curr_queue_pairs);

	err = virtnet_cpu_notif_add(vi);
	if (err)
		return err;

	return 0;
}

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

#define VIRTNET_FEATURES \
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
	VIRTIO_NET_F_MAC, \
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
	VIRTIO_NET_F_CTRL_MAC_ADDR, \
	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
	VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY

static unsigned int features[] = {
	VIRTNET_FEATURES,
};

static unsigned int features_legacy[] = {
	VIRTNET_FEATURES,
	VIRTIO_NET_F_GSO,
	VIRTIO_F_ANY_LAYOUT,
};

static struct virtio_driver virtio_net_driver = {
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
	.feature_table_legacy = features_legacy,
	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
	.validate =	virtnet_validate,
	.probe =	virtnet_probe,
	.remove =	virtnet_remove,
	.config_changed = virtnet_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze =	virtnet_freeze,
	.restore =	virtnet_restore,
#endif
};

static __init int virtio_net_driver_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
				      virtnet_cpu_online,
				      virtnet_cpu_down_prep);
	if (ret < 0)
		goto out;
	virtionet_online = ret;
	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
				      NULL, virtnet_cpu_dead);
	if (ret)
		goto err_dead;

	ret = register_virtio_driver(&virtio_net_driver);
	if (ret)
		goto err_virtio;
	return 0;
err_virtio:
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
err_dead:
	cpuhp_remove_multi_state(virtionet_online);
out:
	return ret;
}
module_init(virtio_net_driver_init);

static __exit void virtio_net_driver_exit(void)
{
	unregister_virtio_driver(&virtio_net_driver);
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
	cpuhp_remove_multi_state(virtionet_online);
}
module_exit(virtio_net_driver_exit);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");