/* A network driver using virtio.
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/scatterlist.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/average.h>
#include <net/route.h>

static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);

static bool csum = true, gso = true, napi_tx;
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);
module_param(napi_tx, bool, 0644);

/* FIXME: MTU in config. */
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
#define GOOD_COPY_LEN	128

#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
#define VIRTIO_XDP_HEADROOM 256

/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
 */
DECLARE_EWMA(pkt_len, 0, 64)

#define VIRTNET_DRIVER_VERSION "1.0.0"

struct virtnet_stats {
	struct u64_stats_sync tx_syncp;
	struct u64_stats_sync rx_syncp;
	u64 tx_bytes;
	u64 tx_packets;

	u64 rx_bytes;
	u64 rx_packets;
};

/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send_queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of the send queue: output.$index */
	char name[40];

	struct napi_struct napi;
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

	struct napi_struct napi;

	struct bpf_prog __rcu *xdp_prog;

	/* Chain pages by the private ptr. */
	struct page *pages;

	/* Average packet length for mergeable receive buffers. */
	struct ewma_pkt_len mrg_avg_pkt_len;

	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Min single buffer size for mergeable buffers case. */
	unsigned int min_buf_len;

	/* Name of this receive queue: input.$index */
	char name[40];
};

struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
	struct send_queue *sq;
	struct receive_queue *rq;
	unsigned int status;

	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

	/* # of XDP queue pairs currently used by the driver */
	u16 xdp_queue_pairs;

	/* I like... big packets and I cannot lie! */
	bool big_packets;

	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

	/* Has control virtqueue */
	bool has_cvq;

	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

	/* Packet virtio header size */
	u8 hdr_len;

	/* Active statistics */
	struct virtnet_stats __percpu *stats;

	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

	/* Work struct for config space updates */
	struct work_struct config_work;

	/* Is the affinity hint set for virtqueues? */
	bool affinity_hint_set;

	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;

	/* Control VQ buffers: protected by the rtnl lock */
	struct virtio_net_ctrl_hdr ctrl_hdr;
	virtio_net_ctrl_ack ctrl_status;
	struct virtio_net_ctrl_mq ctrl_mq;
	u8 ctrl_promisc;
	u8 ctrl_allmulti;
	u16 ctrl_vid;

	/* Ethtool settings */
	u8 duplex;
	u32 speed;
};

struct padded_vnet_hdr {
	struct virtio_net_hdr_mrg_rxbuf hdr;
	/*
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header.
	 */
	char padding[4];
};

/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
	return (vq->index - 1) / 2;
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
	return vq->index / 2;
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}

static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
{
	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
}

/*
 * private is used to chain pages for big packets; put the whole
 * most recently used list at the beginning for reuse
 */
static void give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end;

	/* Find end of list, sew whole thing into vi->rq.pages. */
	for (end = page; end->private; end = (struct page *)end->private);
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}

static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
{
	struct page *p = rq->pages;

	if (p) {
		rq->pages = (struct page *)p->private;
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
		p = alloc_page(gfp_mask);
	return p;
}

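/* NAPI scheduling helpers: virtqueue interrupts are suppressed while a poll
 * is pending and re-enabled only once napi_complete_done() succeeds and no
 * new buffers arrived in the meantime.
 */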
static void virtqueue_napi_schedule(struct napi_struct *napi,
				    struct virtqueue *vq)
{
	if (napi_schedule_prep(napi)) {
		virtqueue_disable_cb(vq);
		__napi_schedule(napi);
	}
}

static void virtqueue_napi_complete(struct napi_struct *napi,
				    struct virtqueue *vq, int processed)
{
	int opaque;

	opaque = virtqueue_enable_cb_prepare(vq);
	if (napi_complete_done(napi, processed) &&
	    unlikely(virtqueue_poll(vq, opaque)))
		virtqueue_napi_schedule(napi, vq);
}

static void skb_xmit_done(struct virtqueue *vq)
{
	struct virtnet_info *vi = vq->vdev->priv;
	struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;

	/* Suppress further interrupts. */
	virtqueue_disable_cb(vq);

	if (napi->weight)
		virtqueue_napi_schedule(napi, vq);
	else
		/* We were probably waiting for more output buffers. */
		netif_wake_subqueue(vi->dev, vq2txq(vq));
}

/* Called from bottom half context */
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
				   struct receive_queue *rq,
				   struct page *page, unsigned int offset,
				   unsigned int len, unsigned int truesize)
{
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	unsigned int copy, hdr_len, hdr_padded_len;
	char *p;

	p = page_address(page) + offset;

	/* copy small packet so we can reuse these pages for small data */
	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
	if (unlikely(!skb))
		return NULL;

	hdr = skb_vnet_hdr(skb);

	hdr_len = vi->hdr_len;
	if (vi->mergeable_rx_bufs)
		hdr_padded_len = sizeof(*hdr);
	else
		hdr_padded_len = sizeof(struct padded_vnet_hdr);

	memcpy(hdr, p, hdr_len);

	len -= hdr_len;
	offset += hdr_padded_len;
	p += hdr_padded_len;

	copy = len;
	if (copy > skb_tailroom(skb))
		copy = skb_tailroom(skb);
	memcpy(skb_put(skb, copy), p, copy);

	len -= copy;
	offset += copy;

	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
			put_page(page);
		return skb;
	}

	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
		dev_kfree_skb(skb);
		return NULL;
	}
	BUG_ON(offset >= PAGE_SIZE);
	while (len) {
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
		page = (struct page *)page->private;
		offset = 0;
	}

	if (page)
		give_pages(rq, page);

	return skb;
}

static bool virtnet_xdp_xmit(struct virtnet_info *vi,
			     struct receive_queue *rq,
			     struct xdp_buff *xdp)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	unsigned int len;
	struct send_queue *sq;
	unsigned int qp;
	void *xdp_sent;
	int err;

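	/* XDP transmits use the queue pairs reserved past the ones the stack
	 * uses, indexed by the current CPU so each CPU gets its own XDP
	 * transmit queue.
	 */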
	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
	sq = &vi->sq[qp];

	/* Free up any pending old buffers before queueing new ones. */
	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		struct page *sent_page = virt_to_head_page(xdp_sent);

		put_page(sent_page);
	}

	xdp->data -= vi->hdr_len;
	/* Zero header and leave csum up to XDP layers */
	hdr = xdp->data;
	memset(hdr, 0, vi->hdr_len);

	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);

	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp->data, GFP_ATOMIC);
	if (unlikely(err)) {
		struct page *page = virt_to_head_page(xdp->data);

		put_page(page);
		return false;
	}

	virtqueue_kick(sq->vq);
	return true;
}

static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
{
	return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
}

static struct sk_buff *receive_small(struct net_device *dev,
				     struct virtnet_info *vi,
				     struct receive_queue *rq,
				     void *buf, unsigned int len)
{
	struct sk_buff *skb;
	struct bpf_prog *xdp_prog;
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
	unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
	unsigned int headroom = vi->hdr_len + header_offset;
	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	unsigned int delta = 0;
	len -= vi->hdr_len;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
		struct xdp_buff xdp;
		void *orig_data;
		u32 act;

		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
			goto err_xdp;

		xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
		xdp.data = xdp.data_hard_start + xdp_headroom;
		xdp.data_end = xdp.data + len;
		orig_data = xdp.data;
		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			/* Recalculate length in case bpf program changed it */
			delta = orig_data - xdp.data;
			break;
		case XDP_TX:
			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
				trace_xdp_exception(vi->dev, xdp_prog, act);
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
		case XDP_DROP:
			goto err_xdp;
		}
	}
	rcu_read_unlock();

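	/* For small packets the receive buffer itself becomes the skb head:
	 * build_skb() reuses the page fragment, and the headroom/length are
	 * adjusted by whatever offset (delta) an XDP program applied above.
	 */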
	skb = build_skb(buf, buflen);
	if (!skb) {
		put_page(virt_to_head_page(buf));
		goto err;
	}
	skb_reserve(skb, headroom - delta);
	skb_put(skb, len + delta);
	if (!delta) {
		buf += header_offset;
		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
	} /* keep zeroed vnet hdr since packet was changed by bpf */

err:
	return skb;

err_xdp:
	rcu_read_unlock();
	dev->stats.rx_dropped++;
	put_page(virt_to_head_page(buf));
xdp_xmit:
	return NULL;
}

static struct sk_buff *receive_big(struct net_device *dev,
				   struct virtnet_info *vi,
				   struct receive_queue *rq,
				   void *buf,
				   unsigned int len)
{
	struct page *page = buf;
	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);

	if (unlikely(!skb))
		goto err;

	return skb;

err:
	dev->stats.rx_dropped++;
	give_pages(rq, page);
	return NULL;
}

/* The conditions to enable XDP should preclude the underlying device from
 * sending packets across multiple buffers (num_buf > 1). However per spec
 * it does not appear to be illegal to do so but rather just against convention.
 * So in order to avoid making a system unresponsive the packets are pushed
 * into a page and the XDP program is run. This will be extremely slow and we
 * push a warning to the user to fix this as soon as possible. Fixing this may
 * require resolving the underlying hardware to determine why multiple buffers
 * are being received or simply loading the XDP program in the ingress stack
 * after the skb is built because there is no advantage to running it here
 * anymore.
 */
static struct page *xdp_linearize_page(struct receive_queue *rq,
				       u16 *num_buf,
				       struct page *p,
				       int offset,
				       unsigned int *len)
{
	struct page *page = alloc_page(GFP_ATOMIC);
	unsigned int page_off = VIRTIO_XDP_HEADROOM;

	if (!page)
		return NULL;

	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
	page_off += *len;

	while (--*num_buf) {
		unsigned int buflen;
		void *buf;
		int off;

		buf = virtqueue_get_buf(rq->vq, &buflen);
		if (unlikely(!buf))
			goto err_buf;

		p = virt_to_head_page(buf);
		off = buf - page_address(p);

		/* guard against a misconfigured or uncooperative backend that
		 * is sending packets larger than the MTU.
		 */
		if ((page_off + buflen) > PAGE_SIZE) {
			put_page(p);
			goto err_buf;
		}

		memcpy(page_address(page) + page_off,
		       page_address(p) + off, buflen);
		page_off += buflen;
		put_page(p);
	}

	/* Headroom does not contribute to packet length */
	*len = page_off - VIRTIO_XDP_HEADROOM;
	return page;
err_buf:
	__free_pages(page, 0);
	return NULL;
}

static struct sk_buff *receive_mergeable(struct net_device *dev,
					 struct virtnet_info *vi,
					 struct receive_queue *rq,
					 void *buf,
					 void *ctx,
					 unsigned int len)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
	struct sk_buff *head_skb, *curr_skb;
	struct bpf_prog *xdp_prog;
	unsigned int truesize;

	head_skb = NULL;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct page *xdp_page;
		struct xdp_buff xdp;
		void *data;
		u32 act;

		/* This happens when rx buffer size is underestimated */
		if (unlikely(num_buf > 1)) {
			/* linearize data for XDP */
			xdp_page = xdp_linearize_page(rq, &num_buf,
						      page, offset, &len);
			if (!xdp_page)
				goto err_xdp;
			offset = VIRTIO_XDP_HEADROOM;
		} else {
			xdp_page = page;
		}

		/* Transient failure which in theory could occur if
		 * in-flight packets from before XDP was enabled reach
		 * the receive path after XDP is loaded. In practice I
		 * was not able to create this condition.
		 */
		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;

		/* Allow consuming headroom but reserve enough space to push
		 * the descriptor on if we get an XDP_TX return code.
		 */
		data = page_address(xdp_page) + offset;
		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
		xdp.data = data + vi->hdr_len;
		xdp.data_end = xdp.data + (len - vi->hdr_len);
		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			/* recalculate offset to account for any header
			 * adjustments. Note other cases do not build an
			 * skb and avoid using offset
			 */
			offset = xdp.data -
					page_address(xdp_page) - vi->hdr_len;

			/* We can only create skb based on xdp_page. */
			if (unlikely(xdp_page != page)) {
				rcu_read_unlock();
				put_page(page);
				head_skb = page_to_skb(vi, rq, xdp_page,
						       offset, len, PAGE_SIZE);
				ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
				return head_skb;
			}
			break;
		case XDP_TX:
			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
				trace_xdp_exception(vi->dev, xdp_prog, act);
			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
			if (unlikely(xdp_page != page))
				goto err_xdp;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
		case XDP_DROP:
			if (unlikely(xdp_page != page))
				__free_pages(xdp_page, 0);
			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	if (unlikely(len > (unsigned long)ctx)) {
		pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
			 dev->name, len, (unsigned long)ctx);
		dev->stats.rx_length_errors++;
		goto err_skb;
	}
	truesize = (unsigned long)ctx;
	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
	curr_skb = head_skb;

	if (unlikely(!curr_skb))
		goto err_skb;
	while (--num_buf) {
		int num_skb_frags;

		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
		if (unlikely(!ctx)) {
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
				 dev->name, num_buf,
				 virtio16_to_cpu(vi->vdev,
						 hdr->num_buffers));
			dev->stats.rx_length_errors++;
			goto err_buf;
		}

		page = virt_to_head_page(buf);
		if (unlikely(len > (unsigned long)ctx)) {
			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
				 dev->name, len, (unsigned long)ctx);
			dev->stats.rx_length_errors++;
			goto err_skb;
		}
		truesize = (unsigned long)ctx;

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);

			if (unlikely(!nskb))
				goto err_skb;
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
			head_skb->truesize += truesize;
		}
		offset = buf - page_address(page);
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
					     len, truesize);
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
					offset, len, truesize);
		}
	}

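	/* Feed the packet-size estimator used when refilling mergeable
	 * receive buffers (see get_mergeable_buf_len()).
	 */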
	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
	return head_skb;

err_xdp:
	rcu_read_unlock();
err_skb:
	put_page(page);
	while (--num_buf) {
		buf = virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!buf)) {
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
		page = virt_to_head_page(buf);
		put_page(page);
	}
err_buf:
	dev->stats.rx_dropped++;
	dev_kfree_skb(head_skb);
xdp_xmit:
	return NULL;
}

static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
732
		       void *buf, unsigned int len, void **ctx)
733
{
734
	struct net_device *dev = vi->dev;
735
	struct sk_buff *skb;
736
	struct virtio_net_hdr_mrg_rxbuf *hdr;
J
Jason Wang 已提交
737
	int ret;
738

739
	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
740 741
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
742
		if (vi->mergeable_rx_bufs) {
743
			put_page(virt_to_head_page(buf));
744
		} else if (vi->big_packets) {
745
			give_pages(rq, buf);
746
		} else {
747
			put_page(virt_to_head_page(buf));
748
		}
J
Jason Wang 已提交
749
		return 0;
750
	}
751

752
	if (vi->mergeable_rx_bufs)
753
		skb = receive_mergeable(dev, vi, rq, buf, ctx, len);
754
	else if (vi->big_packets)
M
Michael S. Tsirkin 已提交
755
		skb = receive_big(dev, vi, rq, buf, len);
756
	else
757
		skb = receive_small(dev, vi, rq, buf, len);
758 759

	if (unlikely(!skb))
J
Jason Wang 已提交
760
		return 0;
761

762
	hdr = skb_vnet_hdr(skb);
763

J
Jason Wang 已提交
764
	ret = skb->len;
R
Rusty Russell 已提交
765

766
	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
767
		skb->ip_summed = CHECKSUM_UNNECESSARY;
R
Rusty Russell 已提交
768

769 770 771 772 773 774
	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
				  virtio_is_little_endian(vi->vdev))) {
		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
				     dev->name, hdr->hdr.gso_type,
				     hdr->hdr.gso_size);
		goto frame_err;
R
Rusty Russell 已提交
775 776
	}

777 778 779 780
	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

E
Eric Dumazet 已提交
781
	napi_gro_receive(&rq->napi, skb);
J
Jason Wang 已提交
782
	return ret;
R
Rusty Russell 已提交
783 784 785 786

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
J
Jason Wang 已提交
787
	return 0;
R
Rusty Russell 已提交
788 789
}

M
Michael S. Tsirkin 已提交
790 791
static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
			     gfp_t gfp)
R
Rusty Russell 已提交
792
{
793 794
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
795
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
796
	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
797
	int err;
798

799 800 801
	len = SKB_DATA_ALIGN(len) +
	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
802
		return -ENOMEM;
R
Rusty Russell 已提交
803

804 805 806 807 808 809
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
		    vi->hdr_len + GOOD_PACKET_LEN);
	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
810
	if (err < 0)
811
		put_page(virt_to_head_page(buf));
812

813 814
	return err;
}
815

816 817
static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
			   gfp_t gfp)
818 819 820 821 822
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

823 824
	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);

825
	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
826
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
827
		first = get_a_page(rq, gfp);
828 829
		if (!first) {
			if (list)
830
				give_pages(rq, list);
831
			return -ENOMEM;
832
		}
833
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
834

835 836 837 838
		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}
R
Rusty Russell 已提交
839

840
	first = get_a_page(rq, gfp);
841
	if (!first) {
842
		give_pages(rq, list);
843 844 845 846
		return -ENOMEM;
	}
	p = page_address(first);

847
	/* rq->sg[0], rq->sg[1] share the same page */
848 849
	/* a separated rq->sg[0] for header - required in case !any_header_sg */
	sg_set_buf(&rq->sg[0], p, vi->hdr_len);
850

851
	/* rq->sg[1] for data packet, from offset */
852
	offset = sizeof(struct padded_vnet_hdr);
853
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
854 855 856

	/* chain first in list head */
	first->private = (unsigned long)list;
857 858
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
859
	if (err < 0)
860
		give_pages(rq, first);
861 862

	return err;
R
Rusty Russell 已提交
863 864
}

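/* Size the next mergeable receive buffer from the EWMA of recent packet
 * lengths, clamped between the queue's minimum buffer length and PAGE_SIZE,
 * then rounded up to a cache-line multiple.
 */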
static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
					  struct ewma_pkt_len *avg_pkt_len)
867
{
868
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
869 870
	unsigned int len;

J
Johannes Berg 已提交
871
	len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
872
				rq->min_buf_len - hdr_len, PAGE_SIZE - hdr_len);
873
	return ALIGN(len, L1_CACHE_BYTES);
874 875
}

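/* Post a single mergeable receive buffer. The chosen length (the buffer's
 * truesize) is passed to the device through the opaque ctx argument so that
 * receive_mergeable() can sanity-check the length the device reports.
 */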
static int add_recvbuf_mergeable(struct virtnet_info *vi,
				 struct receive_queue *rq, gfp_t gfp)
878
{
879
	struct page_frag *alloc_frag = &rq->alloc_frag;
880
	unsigned int headroom = virtnet_get_headroom(vi);
881
	char *buf;
882
	void *ctx;
883
	int err;
884
	unsigned int len, hole;
885

886
	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len);
887
	if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp)))
888
		return -ENOMEM;
889

890
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
891
	buf += headroom; /* advance address leaving hole at front of pkt */
892
	ctx = (void *)(unsigned long)len;
893
	get_page(alloc_frag->page);
894
	alloc_frag->offset += len + headroom;
895
	hole = alloc_frag->size - alloc_frag->offset;
896
	if (hole < len + headroom) {
897 898 899 900 901
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
		 * the current buffer. This extra space is not included in
		 * the truesize stored in ctx.
		 */
902 903 904
		len += hole;
		alloc_frag->offset += hole;
	}
905

906
	sg_init_one(rq->sg, buf, len);
907
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
908
	if (err < 0)
909
		put_page(virt_to_head_page(buf));
910

911 912
	return err;
}
913

914 915 916 917 918 919 920
/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
M
Michael S. Tsirkin 已提交
921 922
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
923 924
{
	int err;
925
	bool oom;
926

927
	gfp |= __GFP_COLD;
928 929
	do {
		if (vi->mergeable_rx_bufs)
930
			err = add_recvbuf_mergeable(vi, rq, gfp);
931
		else if (vi->big_packets)
932
			err = add_recvbuf_big(vi, rq, gfp);
933
		else
M
Michael S. Tsirkin 已提交
934
			err = add_recvbuf_small(vi, rq, gfp);
935

936
		oom = err == -ENOMEM;
937
		if (err)
938
			break;
939
	} while (rq->vq->num_free);
940
	virtqueue_kick(rq->vq);
941
	return !oom;
942 943
}

944
static void skb_recv_done(struct virtqueue *rvq)
R
Rusty Russell 已提交
945 946
{
	struct virtnet_info *vi = rvq->vdev->priv;
J
Jason Wang 已提交
947
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
948

949
	virtqueue_napi_schedule(&rq->napi, rvq);
R
Rusty Russell 已提交
950 951
}

952
static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
953
{
954
	napi_enable(napi);
955 956

	/* If all buffers were filled by other side before we napi_enabled, we
957 958 959 960 961 962
	 * won't get another interrupt, so process any outstanding packets now.
	 * Call local_bh_enable after to trigger softIRQ processing.
	 */
	local_bh_disable();
	virtqueue_napi_schedule(napi, vq);
	local_bh_enable();
963 964
}

W
Willem de Bruijn 已提交
965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982
static void virtnet_napi_tx_enable(struct virtnet_info *vi,
				   struct virtqueue *vq,
				   struct napi_struct *napi)
{
	if (!napi->weight)
		return;

	/* Tx napi touches cachelines on the cpu handling tx interrupts. Only
	 * enable the feature if this is likely affine with the transmit path.
	 */
	if (!vi->affinity_hint_set) {
		napi->weight = 0;
		return;
	}

	return virtnet_napi_enable(vq, napi);
}

983 984 985 986 987 988
static void virtnet_napi_tx_disable(struct napi_struct *napi)
{
	if (napi->weight)
		napi_disable(napi);
}

989 990
static void refill_work(struct work_struct *work)
{
991 992
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
993
	bool still_empty;
J
Jason Wang 已提交
994 995
	int i;

996
	for (i = 0; i < vi->curr_queue_pairs; i++) {
J
Jason Wang 已提交
997
		struct receive_queue *rq = &vi->rq[i];
998

J
Jason Wang 已提交
999
		napi_disable(&rq->napi);
M
Michael S. Tsirkin 已提交
1000
		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
1001
		virtnet_napi_enable(rq->vq, &rq->napi);
1002

J
Jason Wang 已提交
1003 1004 1005 1006 1007 1008
		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
1009 1010
}

1011
static int virtnet_receive(struct receive_queue *rq, int budget)
R
Rusty Russell 已提交
1012
{
1013
	struct virtnet_info *vi = rq->vq->vdev->priv;
J
Jason Wang 已提交
1014
	unsigned int len, received = 0, bytes = 0;
1015
	void *buf;
J
Jason Wang 已提交
1016
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
R
Rusty Russell 已提交
1017

1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031
	if (vi->mergeable_rx_bufs) {
		void *ctx;

		while (received < budget &&
		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
			bytes += receive_buf(vi, rq, buf, len, ctx);
			received++;
		}
	} else {
		while (received < budget &&
		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
			bytes += receive_buf(vi, rq, buf, len, NULL);
			received++;
		}
R
Rusty Russell 已提交
1032 1033
	}

1034
	if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
M
Michael S. Tsirkin 已提交
1035
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
1036
			schedule_delayed_work(&vi->refill, 0);
1037
	}
R
Rusty Russell 已提交
1038

J
Jason Wang 已提交
1039 1040 1041 1042 1043
	u64_stats_update_begin(&stats->rx_syncp);
	stats->rx_bytes += bytes;
	stats->rx_packets += received;
	u64_stats_update_end(&stats->rx_syncp);

1044 1045 1046
	return received;
}

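/* Reclaim completed transmit buffers and fold their byte/packet counts into
 * the per-cpu stats. Called from start_xmit and from the NAPI poll paths.
 */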
static void free_old_xmit_skbs(struct send_queue *sq)
{
	struct sk_buff *skb;
	unsigned int len;
	struct virtnet_info *vi = sq->vq->vdev->priv;
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
	unsigned int packets = 0;
	unsigned int bytes = 0;

	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		pr_debug("Sent skb %p\n", skb);

		bytes += skb->len;
		packets++;

		dev_kfree_skb_any(skb);
	}

	/* Avoid overhead when no packets have been processed
	 * happens when called speculatively from start_xmit.
	 */
	if (!packets)
		return;

	u64_stats_update_begin(&stats->tx_syncp);
	stats->tx_bytes += bytes;
	stats->tx_packets += packets;
	u64_stats_update_end(&stats->tx_syncp);
}

1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095
static void virtnet_poll_cleantx(struct receive_queue *rq)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	unsigned int index = vq2rxq(rq->vq);
	struct send_queue *sq = &vi->sq[index];
	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);

	if (!sq->napi.weight)
		return;

	if (__netif_tx_trylock(txq)) {
		free_old_xmit_skbs(sq);
		__netif_tx_unlock(txq);
	}

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);
}

1096 1097 1098 1099
static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
1100
	unsigned int received;
1101

1102 1103
	virtnet_poll_cleantx(rq);

1104
	received = virtnet_receive(rq, budget);
1105

1106
	/* Out of packets? */
1107 1108
	if (received < budget)
		virtqueue_napi_complete(napi, rq->vq, received);
R
Rusty Russell 已提交
1109 1110 1111 1112

	return received;
}

J
Jason Wang 已提交
1113 1114 1115 1116 1117
static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

1118 1119 1120
	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
M
Michael S. Tsirkin 已提交
1121
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1122
				schedule_delayed_work(&vi->refill, 0);
1123
		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
W
Willem de Bruijn 已提交
1124
		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
J
Jason Wang 已提交
1125 1126 1127 1128 1129
	}

	return 0;
}

W
Willem de Bruijn 已提交
1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147
static int virtnet_poll_tx(struct napi_struct *napi, int budget)
{
	struct send_queue *sq = container_of(napi, struct send_queue, napi);
	struct virtnet_info *vi = sq->vq->vdev->priv;
	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, vq2txq(sq->vq));

	__netif_tx_lock(txq, raw_smp_processor_id());
	free_old_xmit_skbs(sq);
	__netif_tx_unlock(txq);

	virtqueue_napi_complete(napi, sq->vq, 0);

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);

	return 0;
}

1148
static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
R
Rusty Russell 已提交
1149
{
1150
	struct virtio_net_hdr_mrg_rxbuf *hdr;
R
Rusty Russell 已提交
1151
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
1152
	struct virtnet_info *vi = sq->vq->vdev->priv;
1153
	unsigned num_sg;
1154
	unsigned hdr_len = vi->hdr_len;
1155
	bool can_push;
R
Rusty Russell 已提交
1156

J
Johannes Berg 已提交
1157
	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
1158 1159 1160 1161 1162 1163 1164

	can_push = vi->any_header_sg &&
		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
1165
		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
1166 1167
	else
		hdr = skb_vnet_hdr(skb);
R
Rusty Russell 已提交
1168

1169
	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
1170
				    virtio_is_little_endian(vi->vdev), false))
1171
		BUG();
R
Rusty Russell 已提交
1172

1173
	if (vi->mergeable_rx_bufs)
1174
		hdr->num_buffers = 0;
1175

1176
	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
1177 1178 1179 1180 1181 1182 1183 1184 1185
	if (can_push) {
		__skb_push(skb, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
		/* Pull header back to avoid skew in tx bytes calculations. */
		__skb_pull(skb, hdr_len);
	} else {
		sg_set_buf(sq->sg, hdr, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
	}
1186
	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
1187 1188
}

1189
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
1190 1191
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1192 1193
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
1194
	int err;
1195 1196
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
	bool kick = !skb->xmit_more;
W
Willem de Bruijn 已提交
1197
	bool use_napi = sq->napi.weight;
1198 1199

	/* Free up any pending old buffers before queueing new ones. */
1200
	free_old_xmit_skbs(sq);
1201

1202 1203 1204
	if (use_napi && kick)
		virtqueue_enable_cb_delayed(sq->vq);

1205 1206 1207
	/* timestamp packet in software */
	skb_tx_timestamp(skb);

1208
	/* Try to transmit */
1209
	err = xmit_skb(sq, skb);
1210

1211
	/* This should not happen! */
1212
	if (unlikely(err)) {
1213 1214 1215
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
1216
				 "Unexpected TXQ (%d) queue failure: %d\n", qnum, err);
1217
		dev->stats.tx_dropped++;
1218
		dev_kfree_skb_any(skb);
1219
		return NETDEV_TX_OK;
R
Rusty Russell 已提交
1220
	}
1221

1222
	/* Don't wait up for transmitted skbs to be freed. */
W
Willem de Bruijn 已提交
1223 1224 1225 1226
	if (!use_napi) {
		skb_orphan(skb);
		nf_reset(skb);
	}
1227

1228 1229 1230 1231 1232 1233 1234 1235 1236
	/* If running out of space, stop queue to avoid getting packets that we
	 * are then unable to transmit.
	 * An alternative would be to force queuing layer to requeue the skb by
	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
	 * returned in a normal path of operation: it means that driver is not
	 * maintaining the TX queue stop/start state properly, and causes
	 * the stack to do a non-trivial amount of useless work.
	 * Since most packets only take 1 or 2 ring slots, stopping the queue
	 * early means 16 slots are typically wasted.
1237
	 */
1238
	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1239
		netif_stop_subqueue(dev, qnum);
W
Willem de Bruijn 已提交
1240 1241
		if (!use_napi &&
		    unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
1242
			/* More just got used, free them then recheck. */
1243 1244
			free_old_xmit_skbs(sq);
			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1245
				netif_start_subqueue(dev, qnum);
1246
				virtqueue_disable_cb(sq->vq);
1247 1248
			}
		}
1249
	}
1250

1251
	if (kick || netif_xmit_stopped(txq))
1252
		virtqueue_kick(sq->vq);
R
Rusty Russell 已提交
1253

1254
	return NETDEV_TX_OK;
1255 1256
}

1257 1258 1259
/*
 * Send command via the control virtqueue and check status.  Commands
 * supported by the hypervisor, as indicated by feature bits, should
S
stephen hemminger 已提交
1260
 * never fail unless improperly formatted.
1261 1262
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
1263
				 struct scatterlist *out)
1264
{
1265
	struct scatterlist *sgs[4], hdr, stat;
1266
	unsigned out_num = 0, tmp;
1267 1268

	/* Caller should know better */
1269
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
1270

1271 1272 1273
	vi->ctrl_status = ~0;
	vi->ctrl_hdr.class = class;
	vi->ctrl_hdr.cmd = cmd;
1274
	/* Add header */
1275
	sg_init_one(&hdr, &vi->ctrl_hdr, sizeof(vi->ctrl_hdr));
1276
	sgs[out_num++] = &hdr;
1277

1278 1279
	if (out)
		sgs[out_num++] = out;
1280

1281
	/* Add return status. */
1282
	sg_init_one(&stat, &vi->ctrl_status, sizeof(vi->ctrl_status));
1283
	sgs[out_num] = &stat;
1284

1285
	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
1286
	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
1287

1288
	if (unlikely(!virtqueue_kick(vi->cvq)))
1289
		return vi->ctrl_status == VIRTIO_NET_OK;
1290 1291 1292 1293

	/* Spin for a response, the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
1294 1295
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
1296 1297
		cpu_relax();

1298
	return vi->ctrl_status == VIRTIO_NET_OK;
1299 1300
}

1301 1302 1303 1304
static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
1305
	int ret;
1306
	struct sockaddr *addr;
1307
	struct scatterlist sg;
1308

1309
	addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
1310 1311 1312 1313
	if (!addr)
		return -ENOMEM;

	ret = eth_prepare_mac_addr_change(dev, addr);
1314
	if (ret)
1315
		goto out;
1316

1317 1318 1319
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
		sg_init_one(&sg, addr->sa_data, dev->addr_len);
		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1320
					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
1321 1322
			dev_warn(&vdev->dev,
				 "Failed to set mac address by vq command.\n");
1323 1324
			ret = -EINVAL;
			goto out;
1325
		}
1326 1327
	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1328 1329 1330 1331 1332 1333 1334
		unsigned int i;

		/* Naturally, this has an atomicity problem. */
		for (i = 0; i < dev->addr_len; i++)
			virtio_cwrite8(vdev,
				       offsetof(struct virtio_net_config, mac) +
				       i, addr->sa_data[i]);
1335 1336 1337
	}

	eth_commit_mac_addr_change(dev, p);
1338
	ret = 0;
1339

1340 1341 1342
out:
	kfree(addr);
	return ret;
1343 1344
}

1345 1346
static void virtnet_stats(struct net_device *dev,
			  struct rtnl_link_stats64 *tot)
1347 1348 1349 1350 1351 1352
{
	struct virtnet_info *vi = netdev_priv(dev);
	int cpu;
	unsigned int start;

	for_each_possible_cpu(cpu) {
E
Eric Dumazet 已提交
1353
		struct virtnet_stats *stats = per_cpu_ptr(vi->stats, cpu);
1354 1355 1356
		u64 tpackets, tbytes, rpackets, rbytes;

		do {
1357
			start = u64_stats_fetch_begin_irq(&stats->tx_syncp);
1358 1359
			tpackets = stats->tx_packets;
			tbytes   = stats->tx_bytes;
1360
		} while (u64_stats_fetch_retry_irq(&stats->tx_syncp, start));
1361 1362

		do {
1363
			start = u64_stats_fetch_begin_irq(&stats->rx_syncp);
1364 1365
			rpackets = stats->rx_packets;
			rbytes   = stats->rx_bytes;
1366
		} while (u64_stats_fetch_retry_irq(&stats->rx_syncp, start));
1367 1368 1369 1370 1371 1372 1373 1374

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes   += rbytes;
		tot->tx_bytes   += tbytes;
	}

	tot->tx_dropped = dev->stats.tx_dropped;
1375
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
1376 1377 1378 1379 1380
	tot->rx_dropped = dev->stats.rx_dropped;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
}

1381 1382 1383 1384
#ifdef CONFIG_NET_POLL_CONTROLLER
static void virtnet_netpoll(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1385
	int i;
1386

J
Jason Wang 已提交
1387 1388
	for (i = 0; i < vi->curr_queue_pairs; i++)
		napi_schedule(&vi->rq[i].napi);
1389 1390 1391
}
#endif

1392 1393 1394 1395
static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
1396
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
1397 1398 1399 1400
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}

1401
static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
J
Jason Wang 已提交
1402 1403 1404 1405 1406 1407 1408
{
	struct scatterlist sg;
	struct net_device *dev = vi->dev;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

1409 1410
	vi->ctrl_mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
	sg_init_one(&sg, &vi->ctrl_mq, sizeof(vi->ctrl_mq));
J
Jason Wang 已提交
1411 1412

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
1413
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
J
Jason Wang 已提交
1414 1415 1416
		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
1417
	} else {
J
Jason Wang 已提交
1418
		vi->curr_queue_pairs = queue_pairs;
1419 1420 1421
		/* virtnet_open() will refill when device is going to up. */
		if (dev->flags & IFF_UP)
			schedule_delayed_work(&vi->refill, 0);
1422
	}
J
Jason Wang 已提交
1423 1424 1425 1426

	return 0;
}

1427 1428 1429 1430 1431 1432 1433 1434 1435 1436
static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	int err;

	rtnl_lock();
	err = _virtnet_set_queues(vi, queue_pairs);
	rtnl_unlock();
	return err;
}

R
Rusty Russell 已提交
1437 1438 1439
static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1440
	int i;
R
Rusty Russell 已提交
1441

1442 1443
	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);
J
Jason Wang 已提交
1444

W
Willem de Bruijn 已提交
1445
	for (i = 0; i < vi->max_queue_pairs; i++) {
J
Jason Wang 已提交
1446
		napi_disable(&vi->rq[i].napi);
1447
		virtnet_napi_tx_disable(&vi->sq[i].napi);
W
Willem de Bruijn 已提交
1448
	}
R
Rusty Russell 已提交
1449 1450 1451 1452

	return 0;
}

1453 1454 1455
static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
1456 1457
	struct scatterlist sg[2];
	struct virtio_net_ctrl_mac *mac_data;
J
Jiri Pirko 已提交
1458
	struct netdev_hw_addr *ha;
1459
	int uc_count;
1460
	int mc_count;
1461 1462
	void *buf;
	int i;
1463

S
stephen hemminger 已提交
1464
	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
1465 1466 1467
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

1468 1469
	vi->ctrl_promisc = ((dev->flags & IFF_PROMISC) != 0);
	vi->ctrl_allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
1470

1471
	sg_init_one(sg, &vi->ctrl_promisc, sizeof(vi->ctrl_promisc));
1472 1473

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
1474
				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
1475
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
1476
			 vi->ctrl_promisc ? "en" : "dis");
1477

1478
	sg_init_one(sg, &vi->ctrl_allmulti, sizeof(vi->ctrl_allmulti));
1479 1480

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
1481
				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
1482
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
1483
			 vi->ctrl_allmulti ? "en" : "dis");
1484

1485
	uc_count = netdev_uc_count(dev);
1486
	mc_count = netdev_mc_count(dev);
1487
	/* MAC filter - use one buffer for both lists */
1488 1489 1490
	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
	mac_data = buf;
1491
	if (!buf)
1492 1493
		return;

1494 1495
	sg_init_table(sg, 2);

1496
	/* Store the unicast list and count in the front of the buffer */
M
Michael S. Tsirkin 已提交
1497
	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
J
Jiri Pirko 已提交
1498
	i = 0;
1499
	netdev_for_each_uc_addr(ha, dev)
J
Jiri Pirko 已提交
1500
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
1501 1502

	sg_set_buf(&sg[0], mac_data,
1503
		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));
1504 1505

	/* multicast list and count fill the end */
1506
	mac_data = (void *)&mac_data->macs[uc_count][0];
1507

M
Michael S. Tsirkin 已提交
1508
	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
1509
	i = 0;
1510 1511
	netdev_for_each_mc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
1512 1513

	sg_set_buf(&sg[1], mac_data,
1514
		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));
1515 1516

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1517
				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
1518
		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");
1519 1520

	kfree(buf);
1521 1522
}

1523 1524
static int virtnet_vlan_rx_add_vid(struct net_device *dev,
				   __be16 proto, u16 vid)
1525 1526 1527 1528
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

1529 1530
	vi->ctrl_vid = vid;
	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));
1531 1532

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1533
				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
1534
		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
1535
	return 0;
1536 1537
}

1538 1539
static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
				    __be16 proto, u16 vid)
1540 1541 1542 1543
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

1544 1545
	vi->ctrl_vid = vid;
	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));
1546 1547

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1548
				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
1549
		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
1550
	return 0;
1551 1552
}

1553
static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu)
J
Jason Wang 已提交
1554 1555 1556
{
	int i;

1557 1558
	if (vi->affinity_hint_set) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
1559 1560 1561 1562
			virtqueue_set_affinity(vi->rq[i].vq, -1);
			virtqueue_set_affinity(vi->sq[i].vq, -1);
		}

1563 1564 1565
		vi->affinity_hint_set = false;
	}
}
1566

1567 1568 1569 1570
static void virtnet_set_affinity(struct virtnet_info *vi)
{
	int i;
	int cpu;
J
Jason Wang 已提交
1571 1572 1573 1574 1575

	/* In multiqueue mode, when the number of cpu is equal to the number of
	 * queue pairs, we let the queue pairs to be private to one cpu by
	 * setting the affinity hint to eliminate the contention.
	 */
1576 1577 1578 1579
	if (vi->curr_queue_pairs == 1 ||
	    vi->max_queue_pairs != num_online_cpus()) {
		virtnet_clean_affinity(vi, -1);
		return;
J
Jason Wang 已提交
1580 1581
	}

1582 1583
	i = 0;
	for_each_online_cpu(cpu) {
J
Jason Wang 已提交
1584 1585
		virtqueue_set_affinity(vi->rq[i].vq, cpu);
		virtqueue_set_affinity(vi->sq[i].vq, cpu);
1586
		netif_set_xps_queue(vi->dev, cpumask_of(cpu), i);
1587
		i++;
J
Jason Wang 已提交
1588 1589
	}

1590
	vi->affinity_hint_set = true;
J
Jason Wang 已提交
1591 1592
}

1593
static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
1594
{
1595 1596 1597 1598 1599
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);
	virtnet_set_affinity(vi);
	return 0;
}
1600

1601 1602 1603 1604 1605 1606 1607
static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node_dead);
	virtnet_set_affinity(vi);
	return 0;
}
1608

1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639
static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);

	virtnet_clean_affinity(vi, cpu);
	return 0;
}

static enum cpuhp_state virtionet_online;

static int virtnet_cpu_notif_add(struct virtnet_info *vi)
{
	int ret;

	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
	if (ret)
		return ret;
	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					       &vi->node_dead);
	if (!ret)
		return ret;
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	return ret;
}

static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
{
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					    &vi->node_dead);
J
Jason Wang 已提交
1640 1641
}

R
Rick Jones 已提交
1642 1643 1644 1645 1646
static void virtnet_get_ringparam(struct net_device *dev,
				struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

J
Jason Wang 已提交
1647 1648
	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
R
Rick Jones 已提交
1649 1650 1651 1652
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}

1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665

static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));

}

1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679
/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

1680
	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
1681 1682
		return -EINVAL;

J
John Fastabend 已提交
1683 1684 1685 1686 1687 1688 1689
	/* For now we don't support modifying channels while XDP is loaded
	 * also when XDP is loaded all RX queues have XDP programs so we only
	 * need to check a single RX queue.
	 */
	if (vi->rq[0].xdp_prog)
		return -EINVAL;

1690
	get_online_cpus();
1691
	err = _virtnet_set_queues(vi, queue_pairs);
1692 1693 1694 1695
	if (!err) {
		netif_set_real_num_tx_queues(dev, queue_pairs);
		netif_set_real_num_rx_queues(dev, queue_pairs);

1696
		virtnet_set_affinity(vi);
1697
	}
1698
	put_online_cpus();
1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715

	return err;
}

static void virtnet_get_channels(struct net_device *dev,
				 struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);

	channels->combined_count = vi->curr_queue_pairs;
	channels->max_combined = vi->max_queue_pairs;
	channels->max_other = 0;
	channels->rx_count = 0;
	channels->tx_count = 0;
	channels->other_count = 0;
}

1716
/* Check if the user is trying to change anything besides speed/duplex */
1717 1718
static bool
virtnet_validate_ethtool_cmd(const struct ethtool_link_ksettings *cmd)
1719
{
1720 1721
	struct ethtool_link_ksettings diff1 = *cmd;
	struct ethtool_link_ksettings diff2 = {};
1722

1723 1724 1725
	/* cmd is always set so we need to clear it, validate the port type
	 * and also without autonegotiation we can ignore advertising
	 */
1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739
	diff1.base.speed = 0;
	diff2.base.port = PORT_OTHER;
	ethtool_link_ksettings_zero_link_mode(&diff1, advertising);
	diff1.base.duplex = 0;
	diff1.base.cmd = 0;
	diff1.base.link_mode_masks_nwords = 0;

	return !memcmp(&diff1.base, &diff2.base, sizeof(diff1.base)) &&
		bitmap_empty(diff1.link_modes.supported,
			     __ETHTOOL_LINK_MODE_MASK_NBITS) &&
		bitmap_empty(diff1.link_modes.advertising,
			     __ETHTOOL_LINK_MODE_MASK_NBITS) &&
		bitmap_empty(diff1.link_modes.lp_advertising,
			     __ETHTOOL_LINK_MODE_MASK_NBITS);
}

static int virtnet_set_link_ksettings(struct net_device *dev,
				      const struct ethtool_link_ksettings *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u32 speed;

	speed = cmd->base.speed;
	/* don't allow custom speed and duplex */
	if (!ethtool_validate_speed(speed) ||
	    !ethtool_validate_duplex(cmd->base.duplex) ||
	    !virtnet_validate_ethtool_cmd(cmd))
		return -EINVAL;
	vi->speed = speed;
	vi->duplex = cmd->base.duplex;

	return 0;
}

static int virtnet_get_link_ksettings(struct net_device *dev,
				      struct ethtool_link_ksettings *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);

	cmd->base.speed = vi->speed;
	cmd->base.duplex = vi->duplex;
	cmd->base.port = PORT_OTHER;

	return 0;
}

static void virtnet_init_settings(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	vi->speed = SPEED_UNKNOWN;
	vi->duplex = DUPLEX_UNKNOWN;
}

static const struct ethtool_ops virtnet_ethtool_ops = {
	.get_drvinfo = virtnet_get_drvinfo,
	.get_link = ethtool_op_get_link,
	.get_ringparam = virtnet_get_ringparam,
	.set_channels = virtnet_set_channels,
	.get_channels = virtnet_get_channels,
	.get_ts_info = ethtool_op_get_ts_info,
	.get_link_ksettings = virtnet_get_link_ksettings,
	.set_link_ksettings = virtnet_set_link_ksettings,
};

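/* Quiesce the device before suspend or reset: flush config work, detach the
 * netdev, stop ring refills and disable NAPI so the virtqueues can be torn
 * down safely.
 */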
static void virtnet_freeze_down(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int i;

	/* Make sure no work handler is accessing the device */
	flush_work(&vi->config_work);

	netif_device_detach(vi->dev);
	cancel_delayed_work_sync(&vi->refill);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			napi_disable(&vi->rq[i].napi);
			virtnet_napi_tx_disable(&vi->sq[i].napi);
		}
	}
}

static int init_vqs(struct virtnet_info *vi);
static void _remove_vq_common(struct virtnet_info *vi);

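/* Counterpart of virtnet_freeze_down(): recreate the virtqueues, refill the
 * receive rings and re-enable NAPI before re-attaching the netdev.
 */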
static int virtnet_restore_up(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err, i;

	err = init_vqs(vi);
	if (err)
		return err;

	virtio_device_ready(vdev);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->curr_queue_pairs; i++)
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
	}

	netif_device_attach(vi->dev);
	return err;
}

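/* Fully reset the device so the ring layout can change (used when the number
 * of XDP queue pairs changes): tear down the vqs, renegotiate features and
 * bring everything back up.
 */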
static int virtnet_reset(struct virtnet_info *vi, int curr_qp, int xdp_qp)
{
	struct virtio_device *dev = vi->vdev;
	int ret;

	virtio_config_disable(dev);
	dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED;
	virtnet_freeze_down(dev);
	_remove_vq_common(vi);

	virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);

	ret = virtio_finalize_features(dev);
	if (ret)
		goto err;

	vi->xdp_queue_pairs = xdp_qp;
	ret = virtnet_restore_up(dev);
	if (ret)
		goto err;
	ret = _virtnet_set_queues(vi, curr_qp);
	if (ret)
		goto err;

	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
	virtio_config_enable(dev);
	return 0;
err:
	virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
	return ret;
}

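/* Attach or detach an XDP program. Extra queue pairs are reserved for
 * XDP_TX, and a full device reset is required when the number of XDP queue
 * pairs (and therefore the buffer headroom) changes.
 */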
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			   struct netlink_ext_ack *extack)
{
	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
	struct virtnet_info *vi = netdev_priv(dev);
	struct bpf_prog *old_prog;
	u16 xdp_qp = 0, curr_qp;
	int i, err;

	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) {
		NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO, disable LRO first");
		return -EOPNOTSUPP;
	}

	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
		NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
		return -EINVAL;
	}

	if (dev->mtu > max_sz) {
		NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
		return -EINVAL;
	}

	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
	if (prog)
		xdp_qp = nr_cpu_ids;

	/* XDP requires extra queues for XDP_TX */
	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
		NL_SET_ERR_MSG_MOD(extack, "Too few free TX rings available");
		netdev_warn(dev, "request %i queues but max is %i\n",
			    curr_qp + xdp_qp, vi->max_queue_pairs);
		return -ENOMEM;
	}

	if (prog) {
		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
		if (IS_ERR(prog))
			return PTR_ERR(prog);
	}

	/* Changing the headroom in buffers is a disruptive operation because
	 * existing buffers must be flushed and reallocated. This happens when
	 * an XDP program is initially added, or when XDP is disabled by
	 * removing the program, since either changes the number of XDP queues.
	 */
	if (vi->xdp_queue_pairs != xdp_qp) {
		err = virtnet_reset(vi, curr_qp + xdp_qp, xdp_qp);
		if (err) {
			dev_warn(&dev->dev, "XDP reset failure.\n");
			goto virtio_reset_err;
		}
	}

	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);

	for (i = 0; i < vi->max_queue_pairs; i++) {
		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
		if (old_prog)
			bpf_prog_put(old_prog);
	}

	return 0;

virtio_reset_err:
	/* On reset error, do our best to unwind the in-flight XDP changes and
	 * return the error up to user space for resolution. The underlying
	 * reset hung on us, so there is not much we can do here.
	 */
	if (prog)
		bpf_prog_sub(prog, vi->max_queue_pairs - 1);
	return err;
}

static bool virtnet_xdp_query(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (vi->rq[i].xdp_prog)
			return true;
	}
	return false;
}

static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
	case XDP_QUERY_PROG:
		xdp->prog_attached = virtnet_xdp_query(dev);
		return 0;
	default:
		return -EINVAL;
	}
}

static const struct net_device_ops virtnet_netdev = {
	.ndo_open            = virtnet_open,
	.ndo_stop   	     = virtnet_close,
	.ndo_start_xmit      = start_xmit,
	.ndo_validate_addr   = eth_validate_addr,
	.ndo_set_mac_address = virtnet_set_mac_address,
	.ndo_set_rx_mode     = virtnet_set_rx_mode,
	.ndo_get_stats64     = virtnet_stats,
	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = virtnet_netpoll,
#endif
	.ndo_xdp		= virtnet_xdp,
	.ndo_features_check	= passthru_features_check,
};

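/* Deferred handler for config space interrupts: process link announce
 * requests and propagate link state changes to the networking core.
 */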
static void virtnet_config_changed_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, config_work);
	u16 v;

	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
				 struct virtio_net_config, status, &v) < 0)
		return;

	if (v & VIRTIO_NET_S_ANNOUNCE) {
		netdev_notify_peers(vi->dev);
		virtnet_ack_link_announce(vi);
	}

	/* Ignore unknown (future) status bits */
	v &= VIRTIO_NET_S_LINK_UP;

	if (vi->status == v)
		return;

	vi->status = v;

	if (vi->status & VIRTIO_NET_S_LINK_UP) {
		netif_carrier_on(vi->dev);
		netif_tx_wake_all_queues(vi->dev);
	} else {
		netif_carrier_off(vi->dev);
		netif_tx_stop_all_queues(vi->dev);
	}
}

static void virtnet_config_changed(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	schedule_work(&vi->config_work);
}

static void virtnet_free_queues(struct virtnet_info *vi)
{
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		napi_hash_del(&vi->rq[i].napi);
		netif_napi_del(&vi->rq[i].napi);
		netif_napi_del(&vi->sq[i].napi);
	}

	/* We called napi_hash_del() before netif_napi_del(),
	 * we need to respect an RCU grace period before freeing vi->rq
	 */
	synchronize_net();

	kfree(vi->rq);
	kfree(vi->sq);
}

static void _free_receive_bufs(struct virtnet_info *vi)
{
	struct bpf_prog *old_prog;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		while (vi->rq[i].pages)
			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);

		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
		if (old_prog)
			bpf_prog_put(old_prog);
	}
}

static void free_receive_bufs(struct virtnet_info *vi)
{
	rtnl_lock();
	_free_receive_bufs(vi);
	rtnl_unlock();
}

static void free_receive_page_frags(struct virtnet_info *vi)
{
	int i;
	for (i = 0; i < vi->max_queue_pairs; i++)
		if (vi->rq[i].alloc_frag.page)
			put_page(vi->rq[i].alloc_frag.page);
}

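/* The last xdp_queue_pairs TX queues are reserved for XDP_TX and carry raw
 * buffers (pages) rather than skbs.
 */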
static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
{
	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
		return false;
	else if (q < vi->curr_queue_pairs)
		return true;
	else
		return false;
}

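
/* Reclaim any buffers still sitting in the virtqueues before they are
 * destroyed. XDP TX queues hold raw pages rather than skbs; receive buffers
 * are returned as pages.
 */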
static void free_unused_bufs(struct virtnet_info *vi)
{
	void *buf;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->sq[i].vq;
		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (!is_xdp_raw_buffer_queue(vi, i))
				dev_kfree_skb(buf);
			else
				put_page(virt_to_head_page(buf));
		}
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->rq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (vi->mergeable_rx_bufs) {
				put_page(virt_to_head_page(buf));
			} else if (vi->big_packets) {
				give_pages(&vi->rq[i], buf);
			} else {
				put_page(virt_to_head_page(buf));
			}
		}
	}
}

static void virtnet_del_vqs(struct virtnet_info *vi)
{
	struct virtio_device *vdev = vi->vdev;

	virtnet_clean_affinity(vi, -1);

	vdev->config->del_vqs(vdev);

	virtnet_free_queues(vi);
}

/* How large should a single buffer be so a queue full of these can fit at
 * least one full packet?
 * Logic below assumes the mergeable buffer header is used.
 */
static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
{
	const unsigned int hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int rq_size = virtqueue_get_vring_size(vq);
	unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu;
	unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len;
	unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size);

	return max(min_buf_len, hdr_len);
}

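/* Create the virtqueues: one RX/TX pair per queue pair plus the control
 * virtqueue if the device has one, and derive the minimum receive buffer
 * length from the resulting ring sizes.
 */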
static int virtnet_find_vqs(struct virtnet_info *vi)
{
	vq_callback_t **callbacks;
	struct virtqueue **vqs;
	int ret = -ENOMEM;
	int i, total_vqs;
	const char **names;
	bool *ctx;

	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
	 * possible control vq.
	 */
	total_vqs = vi->max_queue_pairs * 2 +
		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);

	/* Allocate space for find_vqs parameters */
	vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
	if (!vqs)
		goto err_vq;
	callbacks = kmalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
	if (!callbacks)
		goto err_callback;
	names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);
	if (!names)
		goto err_names;
	if (vi->mergeable_rx_bufs) {
		ctx = kzalloc(total_vqs * sizeof(*ctx), GFP_KERNEL);
		if (!ctx)
			goto err_ctx;
	} else {
		ctx = NULL;
	}

	/* Parameters for control virtqueue, if any */
	if (vi->has_cvq) {
		callbacks[total_vqs - 1] = NULL;
		names[total_vqs - 1] = "control";
	}

	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
		if (ctx)
			ctx[rxq2vq(i)] = true;
	}

	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
					 names, ctx, NULL);
	if (ret)
		goto err_find;

	if (vi->has_cvq) {
		vi->cvq = vqs[total_vqs - 1];
		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].vq = vqs[rxq2vq(i)];
		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
		vi->sq[i].vq = vqs[txq2vq(i)];
	}

	kfree(names);
	kfree(callbacks);
	kfree(vqs);

	return 0;

err_find:
	kfree(ctx);
err_ctx:
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;
}

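/* Allocate the per-queue send_queue/receive_queue arrays and register their
 * NAPI instances.
 */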
static int virtnet_alloc_queues(struct virtnet_info *vi)
{
	int i;

	vi->sq = kzalloc(sizeof(*vi->sq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->sq)
		goto err_sq;
	vi->rq = kzalloc(sizeof(*vi->rq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->rq)
		goto err_rq;

	INIT_DELAYED_WORK(&vi->refill, refill_work);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].pages = NULL;
		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
			       napi_weight);
		netif_tx_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx,
				  napi_tx ? napi_weight : 0);

		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
	}

	return 0;

err_rq:
	kfree(vi->sq);
err_sq:
	return -ENOMEM;
}

static int init_vqs(struct virtnet_info *vi)
{
	int ret;

	/* Allocate send & receive queues */
	ret = virtnet_alloc_queues(vi);
	if (ret)
		goto err;

	ret = virtnet_find_vqs(vi);
	if (ret)
		goto err_free;

	get_online_cpus();
	virtnet_set_affinity(vi);
	put_online_cpus();

	return 0;

err_free:
	virtnet_free_queues(vi);
err:
	return ret;
}

#ifdef CONFIG_SYSFS
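/* sysfs attribute: report the current estimate of the mergeable receive
 * buffer size for a given RX queue.
 */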
static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
		struct rx_queue_attribute *attribute, char *buf)
{
	struct virtnet_info *vi = netdev_priv(queue->dev);
	unsigned int queue_index = get_netdev_rx_queue_index(queue);
	struct ewma_pkt_len *avg;

	BUG_ON(queue_index >= vi->max_queue_pairs);
	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
	return sprintf(buf, "%u\n",
		       get_mergeable_buf_len(&vi->rq[queue_index], avg));
}

static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
	__ATTR_RO(mergeable_rx_buffer_size);

static struct attribute *virtio_net_mrg_rx_attrs[] = {
	&mergeable_rx_buffer_size_attribute.attr,
	NULL
};

static const struct attribute_group virtio_net_mrg_rx_group = {
	.name = "virtio_net",
	.attrs = virtio_net_mrg_rx_attrs
};
#endif

static bool virtnet_fail_on_feature(struct virtio_device *vdev,
				    unsigned int fbit,
				    const char *fname, const char *dname)
{
	if (!virtio_has_feature(vdev, fbit))
		return false;

	dev_err(&vdev->dev, "device advertises feature %s but not %s",
		fname, dname);

	return true;
}

#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)

static bool virtnet_validate_features(struct virtio_device *vdev)
{
	if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
	    (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
			     "VIRTIO_NET_F_CTRL_VQ"))) {
		return false;
	}

	return true;
}

#define MIN_MTU ETH_MIN_MTU
#define MAX_MTU ETH_MAX_MTU

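/* Called before features are finalized: reject inconsistent feature
 * combinations and clear VIRTIO_NET_F_MTU if the advertised MTU is unusable.
 */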
static int virtnet_validate(struct virtio_device *vdev)
{
	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

	if (!virtnet_validate_features(vdev))
		return -EINVAL;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		int mtu = virtio_cread16(vdev,
					 offsetof(struct virtio_net_config,
						  mtu));
		if (mtu < MIN_MTU)
			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
	}

	return 0;
}

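/* Probe: size the queue pairs from the device config, set up the net_device
 * (features, MTU, MAC address), create the virtqueues and register the
 * interface.
 */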
static int virtnet_probe(struct virtio_device *vdev)
{
	int i, err;
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;
	int mtu;

	/* Find if host supports multiqueue virtio_net device */
	err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
				   struct virtio_net_config,
				   max_virtqueue_pairs, &max_queue_pairs);

	/* We need at least 2 queues */
	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;

	/* Allocate ourselves a network device with room for our info */
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
	dev->netdev_ops = &virtnet_netdev;
	dev->features = NETIF_F_HIGHDMA;

	dev->ethtool_ops = &virtnet_ethtool_ops;
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
		/* This opens up the world of extra features. */
		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
		if (csum)
			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
			dev->hw_features |= NETIF_F_TSO | NETIF_F_UFO
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
		/* Individual feature bits: what can host handle? */
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UFO))
			dev->hw_features |= NETIF_F_UFO;

		dev->features |= NETIF_F_GSO_ROBUST;

		if (gso)
			dev->features |= dev->hw_features & (NETIF_F_ALL_TSO|NETIF_F_UFO);
		/* (!csum && gso) case will be fixed by register_netdev() */
	}
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
		dev->features |= NETIF_F_RXCSUM;

	dev->vlan_features = dev->features;

	/* MTU range: 68 - 65535 */
	dev->min_mtu = MIN_MTU;
	dev->max_mtu = MAX_MTU;

	/* Configuration may specify what MAC to use.  Otherwise random. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
		virtio_cread_bytes(vdev,
				   offsetof(struct virtio_net_config, mac),
				   dev->dev_addr, dev->addr_len);
	else
		eth_hw_addr_random(dev);

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
	vdev->priv = vi;
	vi->stats = alloc_percpu(struct virtnet_stats);
	err = -ENOMEM;
	if (vi->stats == NULL)
		goto free;

	for_each_possible_cpu(i) {
		struct virtnet_stats *virtnet_stats;
		virtnet_stats = per_cpu_ptr(vi->stats, i);
		u64_stats_init(&virtnet_stats->tx_syncp);
		u64_stats_init(&virtnet_stats->rx_syncp);
	}

	INIT_WORK(&vi->config_work, virtnet_config_changed_work);

	/* If we can receive ANY GSO packets, we must allocate large ones. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
		vi->big_packets = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		vi->hdr_len = sizeof(struct virtio_net_hdr);

	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->any_header_sg = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		mtu = virtio_cread16(vdev,
				     offsetof(struct virtio_net_config,
					      mtu));
		if (mtu < dev->min_mtu) {
			/* Should never trigger: MTU was previously validated
			 * in virtnet_validate.
			 */
			dev_err(&vdev->dev, "device MTU appears to have changed "
				"it is now %d < %d", mtu, dev->min_mtu);
			goto free_stats;
		}

		dev->mtu = mtu;
		dev->max_mtu = mtu;

		/* TODO: size buffers correctly in this case. */
		if (dev->mtu > ETH_DATA_LEN)
			vi->big_packets = true;
	}

	if (vi->any_header_sg)
		dev->needed_headroom = vi->hdr_len;

	/* Enable multiqueue by default */
	if (num_online_cpus() >= max_queue_pairs)
		vi->curr_queue_pairs = max_queue_pairs;
	else
		vi->curr_queue_pairs = num_online_cpus();
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
	err = init_vqs(vi);
	if (err)
		goto free_stats;

#ifdef CONFIG_SYSFS
	if (vi->mergeable_rx_bufs)
		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
#endif
	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);

	virtnet_init_settings(dev);

	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
		goto free_vqs;
	}

	virtio_device_ready(vdev);

	err = virtnet_cpu_notif_add(vi);
	if (err) {
		pr_debug("virtio_net: registering cpu notifier failed\n");
		goto free_unregister_netdev;
	}

	virtnet_set_queues(vi, vi->curr_queue_pairs);

	/* Assume link up if device can't report link status,
	   otherwise get link status from config. */
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		netif_carrier_off(dev);
		schedule_work(&vi->config_work);
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
		netif_carrier_on(dev);
	}

	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
		 dev->name, max_queue_pairs);

	return 0;

free_unregister_netdev:
	vi->vdev->config->reset(vdev);

	unregister_netdev(dev);
free_vqs:
	cancel_delayed_work_sync(&vi->refill);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
free_stats:
	free_percpu(vi->stats);
free:
	free_netdev(dev);
	return err;
}

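/* Reset the device and release the virtqueues without touching netdev
 * registration; used by the XDP reset path.
 */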
static void _remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);
	free_unused_bufs(vi);
	_free_receive_bufs(vi);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
}

static void remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);

	/* Free unused buffers in both send and recv, if any. */
	free_unused_bufs(vi);

	free_receive_bufs(vi);

	free_receive_page_frags(vi);

	virtnet_del_vqs(vi);
}

static void virtnet_remove(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device. */
	flush_work(&vi->config_work);

	unregister_netdev(vi->dev);

	remove_vq_common(vi);

	free_percpu(vi->stats);
	free_netdev(vi->dev);
}

#ifdef CONFIG_PM_SLEEP
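/* PM: on suspend quiesce the device and free its virtqueues; they are
 * recreated on resume.
 */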
static int virtnet_freeze(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);
	virtnet_freeze_down(vdev);
	remove_vq_common(vi);

	return 0;
}

static int virtnet_restore(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err;

	err = virtnet_restore_up(vdev);
	if (err)
		return err;
	virtnet_set_queues(vi, vi->curr_queue_pairs);

	err = virtnet_cpu_notif_add(vi);
	if (err)
		return err;

	return 0;
}
#endif

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

#define VIRTNET_FEATURES \
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
	VIRTIO_NET_F_MAC, \
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
	VIRTIO_NET_F_CTRL_MAC_ADDR, \
	VIRTIO_NET_F_MTU

static unsigned int features[] = {
	VIRTNET_FEATURES,
};

static unsigned int features_legacy[] = {
	VIRTNET_FEATURES,
	VIRTIO_NET_F_GSO,
	VIRTIO_F_ANY_LAYOUT,
};

static struct virtio_driver virtio_net_driver = {
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
	.feature_table_legacy = features_legacy,
	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
	.validate =	virtnet_validate,
	.probe =	virtnet_probe,
	.remove =	virtnet_remove,
	.config_changed = virtnet_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze =	virtnet_freeze,
	.restore =	virtnet_restore,
#endif
};

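/* Register the CPU hotplug callbacks used for queue affinity before
 * registering the virtio driver itself.
 */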
static __init int virtio_net_driver_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
				      virtnet_cpu_online,
				      virtnet_cpu_down_prep);
	if (ret < 0)
		goto out;
	virtionet_online = ret;
	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
				      NULL, virtnet_cpu_dead);
	if (ret)
		goto err_dead;

	ret = register_virtio_driver(&virtio_net_driver);
	if (ret)
		goto err_virtio;
	return 0;
err_virtio:
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
err_dead:
	cpuhp_remove_multi_state(virtionet_online);
out:
	return ret;
}
module_init(virtio_net_driver_init);

static __exit void virtio_net_driver_exit(void)
{
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
	cpuhp_remove_multi_state(virtionet_online);
	unregister_virtio_driver(&virtio_net_driver);
}
module_exit(virtio_net_driver_exit);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");