/* A network driver using virtio.
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
21
#include <linux/ethtool.h>
R
Rusty Russell 已提交
22 23 24
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
J
John Fastabend 已提交
25
#include <linux/bpf.h>
26
#include <linux/bpf_trace.h>
R
Rusty Russell 已提交
27
#include <linux/scatterlist.h>
28
#include <linux/if_vlan.h>
29
#include <linux/slab.h>
30
#include <linux/cpu.h>
31
#include <linux/average.h>
32
#include <net/route.h>
R
Rusty Russell 已提交
33

34
static int napi_weight = NAPI_POLL_WEIGHT;
35 36
module_param(napi_weight, int, 0444);

W
Willem de Bruijn 已提交
37
static bool csum = true, gso = true, napi_tx;
R
Rusty Russell 已提交
38 39
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);
W
Willem de Bruijn 已提交
40
module_param(napi_tx, bool, 0644);
R
Rusty Russell 已提交
41

R
Rusty Russell 已提交
42
/* FIXME: MTU in config. */
43
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
44
#define GOOD_COPY_LEN	128
R
Rusty Russell 已提交
45

46 47
#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

48 49 50
/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
#define VIRTIO_XDP_HEADROOM 256

J
Johannes Berg 已提交
51 52 53 54
/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
55
 */
56
DECLARE_EWMA(pkt_len, 0, 64)
57

58
#define VIRTNET_DRIVER_VERSION "1.0.0"
59

60
struct virtnet_stats {
61 62
	struct u64_stats_sync tx_syncp;
	struct u64_stats_sync rx_syncp;
63 64 65 66 67 68 69
	u64 tx_bytes;
	u64 tx_packets;

	u64 rx_bytes;
	u64 rx_packets;
};

70 71 72 73 74 75 76
/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send _queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];
J
Jason Wang 已提交
77 78 79

	/* Name of the send queue: output.$index */
	char name[40];
W
Willem de Bruijn 已提交
80 81

	struct napi_struct napi;
82 83 84 85 86 87 88
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

R
Rusty Russell 已提交
89 90
	struct napi_struct napi;

J
John Fastabend 已提交
91 92
	struct bpf_prog __rcu *xdp_prog;

93 94 95
	/* Chain pages by the private ptr. */
	struct page *pages;

96
	/* Average packet length for mergeable receive buffers. */
J
Johannes Berg 已提交
97
	struct ewma_pkt_len mrg_avg_pkt_len;
98

99 100 101
	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

102 103
	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];
J
Jason Wang 已提交
104

105 106 107
	/* Min single buffer size for mergeable buffers case. */
	unsigned int min_buf_len;

J
Jason Wang 已提交
108 109
	/* Name of this receive queue: input.$index */
	char name[40];
110 111 112 113 114 115
};

struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
J
Jason Wang 已提交
116 117
	struct send_queue *sq;
	struct receive_queue *rq;
118 119
	unsigned int status;

J
Jason Wang 已提交
120 121 122 123 124 125
	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

126 127 128
	/* # of XDP queue pairs currently used by the driver */
	u16 xdp_queue_pairs;

129 130 131
	/* I like... big packets and I cannot lie! */
	bool big_packets;

132 133 134
	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

J
Jason Wang 已提交
135 136 137
	/* Has control virtqueue */
	bool has_cvq;

138 139 140
	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

141 142 143
	/* Packet virtio header size */
	u8 hdr_len;

144 145 146
	/* Active statistics */
	struct virtnet_stats __percpu *stats;

147 148 149
	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

150 151 152
	/* Work struct for config space updates */
	struct work_struct config_work;

J
Jason Wang 已提交
153 154
	/* Does the affinity hint is set for virtqueues? */
	bool affinity_hint_set;
155

156 157 158
	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;
159 160 161 162

	/* Control VQ buffers: protected by the rtnl lock */
	struct virtio_net_ctrl_hdr ctrl_hdr;
	virtio_net_ctrl_ack ctrl_status;
163
	struct virtio_net_ctrl_mq ctrl_mq;
164 165
	u8 ctrl_promisc;
	u8 ctrl_allmulti;
166
	u16 ctrl_vid;
167 168 169 170

	/* Ethtool settings */
	u8 duplex;
	u32 speed;
R
Rusty Russell 已提交
171 172
};

173
struct padded_vnet_hdr {
174
	struct virtio_net_hdr_mrg_rxbuf hdr;
175
	/*
176 177 178
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header.
179
	 */
180
	char padding[4];
181 182
};

J
Jason Wang 已提交
183 184 185 186 187
/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
188
	return (vq->index - 1) / 2;
J
Jason Wang 已提交
189 190 191 192 193 194 195 196 197
}

/* Map kernel tx queue number N to its virtqueue index 2N + 1. */
static int txq2vq(int txq)
{
	int vq_index = 2 * txq;

	vq_index += 1;
	return vq_index;
}

static int vq2rxq(struct virtqueue *vq)
{
198
	return vq->index / 2;
J
Jason Wang 已提交
199 200 201 202 203 204 205
}

/* Map kernel rx queue number N to its virtqueue index 2N. */
static int rxq2vq(int rxq)
{
	int vq_index = 2 * rxq;

	return vq_index;
}

206
static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
R
Rusty Russell 已提交
207
{
208
	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
R
Rusty Russell 已提交
209 210
}

211 212 213 214
/*
 * private is used to chain pages for big packets, put the whole
 * most recent used list in the beginning for reuse
 */
215
static void give_pages(struct receive_queue *rq, struct page *page)
216
{
217
	struct page *end;
218

219
	/* Find end of list, sew whole thing into vi->rq.pages. */
220
	for (end = page; end->private; end = (struct page *)end->private);
221 222
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
223 224
}

225
/* Take one page from the queue's recycled-page chain (rq->pages), or
 * allocate a fresh one with @gfp_mask when the chain is empty.
 * Returns NULL only if the chain is empty and alloc_page() fails.
 */
static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
{
	struct page *p = rq->pages;

	if (p) {
		rq->pages = (struct page *)p->private;
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else {
		p = alloc_page(gfp_mask);
	}
	return p;
}

238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257
/* Schedule @napi if it is not already scheduled, disabling further
 * callbacks on @vq first.
 */
static void virtqueue_napi_schedule(struct napi_struct *napi,
				    struct virtqueue *vq)
{
	if (!napi_schedule_prep(napi))
		return;

	virtqueue_disable_cb(vq);
	__napi_schedule(napi);
}

/* Finish a NAPI poll round on @vq: prepare to re-enable callbacks, complete
 * NAPI, and reschedule if virtqueue_poll() reports buffers arrived meanwhile.
 */
static void virtqueue_napi_complete(struct napi_struct *napi,
				    struct virtqueue *vq, int processed)
{
	int opaque = virtqueue_enable_cb_prepare(vq);

	if (!napi_complete_done(napi, processed))
		return;

	if (unlikely(virtqueue_poll(vq, opaque)))
		virtqueue_napi_schedule(napi, vq);
}

258
static void skb_xmit_done(struct virtqueue *vq)
R
Rusty Russell 已提交
259
{
260
	struct virtnet_info *vi = vq->vdev->priv;
W
Willem de Bruijn 已提交
261
	struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;
R
Rusty Russell 已提交
262

263
	/* Suppress further interrupts. */
264
	virtqueue_disable_cb(vq);
265

W
Willem de Bruijn 已提交
266 267 268 269 270
	if (napi->weight)
		virtqueue_napi_schedule(napi, vq);
	else
		/* We were probably waiting for more output buffers. */
		netif_wake_subqueue(vi->dev, vq2txq(vq));
R
Rusty Russell 已提交
271 272
}

273
/* Called from bottom half context */
M
Michael S. Tsirkin 已提交
274 275
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
				   struct receive_queue *rq,
276 277
				   struct page *page, unsigned int offset,
				   unsigned int len, unsigned int truesize)
278 279
{
	struct sk_buff *skb;
280
	struct virtio_net_hdr_mrg_rxbuf *hdr;
281
	unsigned int copy, hdr_len, hdr_padded_len;
282
	char *p;
283

284
	p = page_address(page) + offset;
285

286
	/* copy small packet so we can reuse these pages for small data */
287
	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
288 289
	if (unlikely(!skb))
		return NULL;
290

291
	hdr = skb_vnet_hdr(skb);
292

293 294 295 296
	hdr_len = vi->hdr_len;
	if (vi->mergeable_rx_bufs)
		hdr_padded_len = sizeof *hdr;
	else
297
		hdr_padded_len = sizeof(struct padded_vnet_hdr);
298

299
	memcpy(hdr, p, hdr_len);
300

301
	len -= hdr_len;
302 303
	offset += hdr_padded_len;
	p += hdr_padded_len;
304

305 306 307
	copy = len;
	if (copy > skb_tailroom(skb))
		copy = skb_tailroom(skb);
308
	skb_put_data(skb, p, copy);
309

310 311
	len -= copy;
	offset += copy;
312

313 314 315 316 317 318 319 320
	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
			put_page(page);
		return skb;
	}

321 322 323 324 325 326 327
	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
328
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
329 330 331
		dev_kfree_skb(skb);
		return NULL;
	}
332
	BUG_ON(offset >= PAGE_SIZE);
333
	while (len) {
334 335 336 337
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
338 339 340
		page = (struct page *)page->private;
		offset = 0;
	}
341

342
	if (page)
343
		give_pages(rq, page);
344

345 346
	return skb;
}
347

348
static bool virtnet_xdp_xmit(struct virtnet_info *vi,
J
John Fastabend 已提交
349
			     struct receive_queue *rq,
350
			     struct xdp_buff *xdp)
J
John Fastabend 已提交
351 352
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
353
	unsigned int len;
354 355
	struct send_queue *sq;
	unsigned int qp;
J
John Fastabend 已提交
356 357 358
	void *xdp_sent;
	int err;

359 360 361
	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
	sq = &vi->sq[qp];

J
John Fastabend 已提交
362 363
	/* Free up any pending old buffers before queueing new ones. */
	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
364
		struct page *sent_page = virt_to_head_page(xdp_sent);
365

366
		put_page(sent_page);
367
	}
J
John Fastabend 已提交
368

369 370 371 372
	xdp->data -= vi->hdr_len;
	/* Zero header and leave csum up to XDP layers */
	hdr = xdp->data;
	memset(hdr, 0, vi->hdr_len);
373

374
	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
375

376
	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp->data, GFP_ATOMIC);
J
John Fastabend 已提交
377
	if (unlikely(err)) {
378
		struct page *page = virt_to_head_page(xdp->data);
379

380
		put_page(page);
381
		return false;
J
John Fastabend 已提交
382 383 384
	}

	virtqueue_kick(sq->vq);
385
	return true;
J
John Fastabend 已提交
386 387
}

388 389 390 391 392
/* Headroom to reserve in front of receive buffers: VIRTIO_XDP_HEADROOM when
 * any XDP queue pairs are in use, otherwise none.
 */
static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
{
	if (vi->xdp_queue_pairs)
		return VIRTIO_XDP_HEADROOM;

	return 0;
}

393 394 395 396
static struct sk_buff *receive_small(struct net_device *dev,
				     struct virtnet_info *vi,
				     struct receive_queue *rq,
				     void *buf, unsigned int len)
397
{
398
	struct sk_buff *skb;
399
	struct bpf_prog *xdp_prog;
400 401 402 403 404 405
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
	unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
	unsigned int headroom = vi->hdr_len + header_offset;
	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	unsigned int delta = 0;
406
	len -= vi->hdr_len;
407

408 409 410
	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
411
		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
412
		struct xdp_buff xdp;
413
		void *orig_data;
414 415 416 417
		u32 act;

		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
			goto err_xdp;
418

419 420
		xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
		xdp.data = xdp.data_hard_start + xdp_headroom;
421
		xdp.data_end = xdp.data + len;
422
		orig_data = xdp.data;
423 424
		act = bpf_prog_run_xdp(xdp_prog, &xdp);

425 426
		switch (act) {
		case XDP_PASS:
427
			/* Recalculate length in case bpf program changed it */
428
			delta = orig_data - xdp.data;
429 430
			break;
		case XDP_TX:
431
			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
432
				trace_xdp_exception(vi->dev, xdp_prog, act);
433 434 435
			rcu_read_unlock();
			goto xdp_xmit;
		default:
436 437 438 439
			bpf_warn_invalid_xdp_action(act);
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
		case XDP_DROP:
440 441 442 443 444
			goto err_xdp;
		}
	}
	rcu_read_unlock();

445 446 447 448 449 450 451 452 453 454 455 456 457
	skb = build_skb(buf, buflen);
	if (!skb) {
		put_page(virt_to_head_page(buf));
		goto err;
	}
	skb_reserve(skb, headroom - delta);
	skb_put(skb, len + delta);
	if (!delta) {
		buf += header_offset;
		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
	} /* keep zeroed vnet hdr since packet was changed by bpf */

err:
458
	return skb;
459 460 461 462

err_xdp:
	rcu_read_unlock();
	dev->stats.rx_dropped++;
463
	put_page(virt_to_head_page(buf));
464 465
xdp_xmit:
	return NULL;
466 467 468
}

static struct sk_buff *receive_big(struct net_device *dev,
M
Michael S. Tsirkin 已提交
469
				   struct virtnet_info *vi,
470 471 472 473 474
				   struct receive_queue *rq,
				   void *buf,
				   unsigned int len)
{
	struct page *page = buf;
475
	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
J
John Fastabend 已提交
476

477 478 479 480 481 482 483 484 485 486 487
	if (unlikely(!skb))
		goto err;

	return skb;

err:
	dev->stats.rx_dropped++;
	give_pages(rq, page);
	return NULL;
}

488 489 490 491 492 493 494 495 496 497 498 499
/* The conditions to enable XDP should preclude the underlying device from
 * sending packets across multiple buffers (num_buf > 1). However per spec
 * it does not appear to be illegal to do so but rather just against convention.
 * So in order to avoid making a system unresponsive the packets are pushed
 * into a page and the XDP program is run. This will be extremely slow and we
 * push a warning to the user to fix this as soon as possible. Fixing this may
 * require resolving the underlying hardware to determine why multiple buffers
 * are being received or simply loading the XDP program in the ingress stack
 * after the skb is built because there is no advantage to running it here
 * anymore.
 */
static struct page *xdp_linearize_page(struct receive_queue *rq,
500
				       u16 *num_buf,
501 502 503 504 505
				       struct page *p,
				       int offset,
				       unsigned int *len)
{
	struct page *page = alloc_page(GFP_ATOMIC);
506
	unsigned int page_off = VIRTIO_XDP_HEADROOM;
507 508 509 510 511 512 513

	if (!page)
		return NULL;

	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
	page_off += *len;

514
	while (--*num_buf) {
515 516 517 518
		unsigned int buflen;
		void *buf;
		int off;

519 520
		buf = virtqueue_get_buf(rq->vq, &buflen);
		if (unlikely(!buf))
521 522
			goto err_buf;

523 524 525
		p = virt_to_head_page(buf);
		off = buf - page_address(p);

526 527 528
		/* guard against a misconfigured or uncooperative backend that
		 * is sending packet larger than the MTU.
		 */
529 530
		if ((page_off + buflen) > PAGE_SIZE) {
			put_page(p);
531
			goto err_buf;
532
		}
533 534 535 536

		memcpy(page_address(page) + page_off,
		       page_address(p) + off, buflen);
		page_off += buflen;
537
		put_page(p);
538 539
	}

540 541
	/* Headroom does not contribute to packet length */
	*len = page_off - VIRTIO_XDP_HEADROOM;
542 543 544 545 546 547
	return page;
err_buf:
	__free_pages(page, 0);
	return NULL;
}

548
static struct sk_buff *receive_mergeable(struct net_device *dev,
M
Michael S. Tsirkin 已提交
549
					 struct virtnet_info *vi,
550
					 struct receive_queue *rq,
551 552
					 void *buf,
					 void *ctx,
553
					 unsigned int len)
554
{
555 556
	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
557 558
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
J
John Fastabend 已提交
559 560 561 562
	struct sk_buff *head_skb, *curr_skb;
	struct bpf_prog *xdp_prog;
	unsigned int truesize;

J
John Fastabend 已提交
563 564
	head_skb = NULL;

J
John Fastabend 已提交
565 566 567
	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
568
		struct page *xdp_page;
569 570
		struct xdp_buff xdp;
		void *data;
J
John Fastabend 已提交
571 572
		u32 act;

573
		/* This happens when rx buffer size is underestimated */
J
John Fastabend 已提交
574
		if (unlikely(num_buf > 1)) {
575
			/* linearize data for XDP */
576
			xdp_page = xdp_linearize_page(rq, &num_buf,
577 578 579
						      page, offset, &len);
			if (!xdp_page)
				goto err_xdp;
580
			offset = VIRTIO_XDP_HEADROOM;
581 582
		} else {
			xdp_page = page;
J
John Fastabend 已提交
583 584 585 586 587 588 589
		}

		/* Transient failure which in theory could occur if
		 * in-flight packets from before XDP was enabled reach
		 * the receive path after XDP is loaded. In practice I
		 * was not able to create this condition.
		 */
590
		if (unlikely(hdr->hdr.gso_type))
J
John Fastabend 已提交
591 592
			goto err_xdp;

593 594 595
		/* Allow consuming headroom but reserve enough space to push
		 * the descriptor on if we get an XDP_TX return code.
		 */
596
		data = page_address(xdp_page) + offset;
597
		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
598 599 600 601
		xdp.data = data + vi->hdr_len;
		xdp.data_end = xdp.data + (len - vi->hdr_len);
		act = bpf_prog_run_xdp(xdp_prog, &xdp);

J
John Fastabend 已提交
602 603
		switch (act) {
		case XDP_PASS:
604 605 606 607 608 609 610
			/* recalculate offset to account for any header
			 * adjustments. Note other cases do not build an
			 * skb and avoid using offset
			 */
			offset = xdp.data -
					page_address(xdp_page) - vi->hdr_len;

611 612 613 614 615
			/* We can only create skb based on xdp_page. */
			if (unlikely(xdp_page != page)) {
				rcu_read_unlock();
				put_page(page);
				head_skb = page_to_skb(vi, rq, xdp_page,
616
						       offset, len, PAGE_SIZE);
617
				ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
618 619
				return head_skb;
			}
J
John Fastabend 已提交
620 621
			break;
		case XDP_TX:
622
			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
623
				trace_xdp_exception(vi->dev, xdp_prog, act);
624
			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
625 626
			if (unlikely(xdp_page != page))
				goto err_xdp;
J
John Fastabend 已提交
627 628 629
			rcu_read_unlock();
			goto xdp_xmit;
		default:
630 631 632 633
			bpf_warn_invalid_xdp_action(act);
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
		case XDP_DROP:
634 635
			if (unlikely(xdp_page != page))
				__free_pages(xdp_page, 0);
636
			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
J
John Fastabend 已提交
637
			goto err_xdp;
J
John Fastabend 已提交
638
		}
J
John Fastabend 已提交
639 640
	}
	rcu_read_unlock();
641

642
	if (unlikely(len > (unsigned long)ctx)) {
643
		pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
644 645 646 647 648
			 dev->name, len, (unsigned long)ctx);
		dev->stats.rx_length_errors++;
		goto err_skb;
	}
	truesize = (unsigned long)ctx;
J
John Fastabend 已提交
649 650
	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
	curr_skb = head_skb;
651

652 653
	if (unlikely(!curr_skb))
		goto err_skb;
654
	while (--num_buf) {
655 656
		int num_skb_frags;

657
		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
658
		if (unlikely(!ctx)) {
659
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
M
Michael S. Tsirkin 已提交
660
				 dev->name, num_buf,
661 662
				 virtio16_to_cpu(vi->vdev,
						 hdr->num_buffers));
663 664
			dev->stats.rx_length_errors++;
			goto err_buf;
665
		}
666 667

		page = virt_to_head_page(buf);
668
		if (unlikely(len > (unsigned long)ctx)) {
669
			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
670 671 672 673 674
				 dev->name, len, (unsigned long)ctx);
			dev->stats.rx_length_errors++;
			goto err_skb;
		}
		truesize = (unsigned long)ctx;
675 676

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
677 678
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
679 680 681

			if (unlikely(!nskb))
				goto err_skb;
682 683 684 685 686 687 688 689 690 691 692
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
693
			head_skb->truesize += truesize;
694
		}
695
		offset = buf - page_address(page);
696 697 698
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
699
					     len, truesize);
700 701
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
702
					offset, len, truesize);
703
		}
704 705
	}

J
Johannes Berg 已提交
706
	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
707 708
	return head_skb;

J
John Fastabend 已提交
709 710
err_xdp:
	rcu_read_unlock();
711 712 713
err_skb:
	put_page(page);
	while (--num_buf) {
714 715
		buf = virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!buf)) {
716 717 718 719 720
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
721
		page = virt_to_head_page(buf);
722
		put_page(page);
723
	}
724 725 726
err_buf:
	dev->stats.rx_dropped++;
	dev_kfree_skb(head_skb);
J
John Fastabend 已提交
727
xdp_xmit:
728
	return NULL;
729 730
}

J
Jason Wang 已提交
731
static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
732
		       void *buf, unsigned int len, void **ctx)
733
{
734
	struct net_device *dev = vi->dev;
735
	struct sk_buff *skb;
736
	struct virtio_net_hdr_mrg_rxbuf *hdr;
J
Jason Wang 已提交
737
	int ret;
738

739
	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
740 741
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
742
		if (vi->mergeable_rx_bufs) {
743
			put_page(virt_to_head_page(buf));
744
		} else if (vi->big_packets) {
745
			give_pages(rq, buf);
746
		} else {
747
			put_page(virt_to_head_page(buf));
748
		}
J
Jason Wang 已提交
749
		return 0;
750
	}
751

752
	if (vi->mergeable_rx_bufs)
753
		skb = receive_mergeable(dev, vi, rq, buf, ctx, len);
754
	else if (vi->big_packets)
M
Michael S. Tsirkin 已提交
755
		skb = receive_big(dev, vi, rq, buf, len);
756
	else
757
		skb = receive_small(dev, vi, rq, buf, len);
758 759

	if (unlikely(!skb))
J
Jason Wang 已提交
760
		return 0;
761

762
	hdr = skb_vnet_hdr(skb);
763

J
Jason Wang 已提交
764
	ret = skb->len;
R
Rusty Russell 已提交
765

766
	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
767
		skb->ip_summed = CHECKSUM_UNNECESSARY;
R
Rusty Russell 已提交
768

769 770 771 772 773 774
	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
				  virtio_is_little_endian(vi->vdev))) {
		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
				     dev->name, hdr->hdr.gso_type,
				     hdr->hdr.gso_size);
		goto frame_err;
R
Rusty Russell 已提交
775 776
	}

777 778 779 780
	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

E
Eric Dumazet 已提交
781
	napi_gro_receive(&rq->napi, skb);
J
Jason Wang 已提交
782
	return ret;
R
Rusty Russell 已提交
783 784 785 786

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
J
Jason Wang 已提交
787
	return 0;
R
Rusty Russell 已提交
788 789
}

M
Michael S. Tsirkin 已提交
790 791
/* Post one small receive buffer carved from the queue's page frag.
 *
 * The buffer reserves VIRTNET_RX_PAD plus the XDP headroom in front and
 * room for struct skb_shared_info behind; only the header + GOOD_PACKET_LEN
 * region is exposed to the device.
 *
 * Returns 0 on success, -ENOMEM if the page frag cannot be refilled, or the
 * error from virtqueue_add_inbuf() (the page reference is dropped then).
 */
static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
			     gfp_t gfp)
{
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
	int err;

	len = SKB_DATA_ALIGN(len) +
	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
		    vi->hdr_len + GOOD_PACKET_LEN);
	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
	if (err < 0)
		put_page(virt_to_head_page(buf));

	return err;
}
815

816 817
/* Post one big-packet receive buffer made of MAX_SKB_FRAGS + 1 pages.
 *
 * The pages are chained through page->private in the same order as the
 * scatterlist entries. The first page carries the header in sg[0] and data
 * in sg[1]; sg[2..MAX_SKB_FRAGS + 1] each cover one full page.
 *
 * Returns 0 on success, -ENOMEM if a page cannot be obtained, or the error
 * from virtqueue_add_inbuf(). On failure all pages go back to the pool.
 */
static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
			   gfp_t gfp)
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);

	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
		first = get_a_page(rq, gfp);
		if (!first) {
			if (list)
				give_pages(rq, list);
			return -ENOMEM;
		}
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);

		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}

	first = get_a_page(rq, gfp);
	if (!first) {
		give_pages(rq, list);
		return -ENOMEM;
	}
	p = page_address(first);

	/* rq->sg[0], rq->sg[1] share the same page */
	/* a separated rq->sg[0] for header - required in case !any_header_sg */
	sg_set_buf(&rq->sg[0], p, vi->hdr_len);

	/* rq->sg[1] for data packet, from offset */
	offset = sizeof(struct padded_vnet_hdr);
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);

	/* chain first in list head */
	first->private = (unsigned long)list;
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
	if (err < 0)
		give_pages(rq, first);

	return err;
}

865 866
static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
					  struct ewma_pkt_len *avg_pkt_len)
867
{
868
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
869 870
	unsigned int len;

J
Johannes Berg 已提交
871
	len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
872
				rq->min_buf_len, PAGE_SIZE - hdr_len);
873
	return ALIGN(len, L1_CACHE_BYTES);
874 875
}

876 877
/* Post one mergeable receive buffer carved from the queue's page frag.
 *
 * The buffer length comes from get_mergeable_buf_len(); that length is also
 * stored as the buffer's context so the receive path can recover it as the
 * truesize. XDP headroom, if any, is left in front of the buffer.
 *
 * Returns 0 on success, -ENOMEM if the page frag cannot be refilled, or the
 * error from virtqueue_add_inbuf_ctx() (the page reference is dropped then).
 */
static int add_recvbuf_mergeable(struct virtnet_info *vi,
				 struct receive_queue *rq, gfp_t gfp)
{
	struct page_frag *alloc_frag = &rq->alloc_frag;
	unsigned int headroom = virtnet_get_headroom(vi);
	char *buf;
	void *ctx;
	int err;
	unsigned int len, hole;

	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len);
	if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	buf += headroom; /* advance address leaving hole at front of pkt */
	ctx = (void *)(unsigned long)len;
	get_page(alloc_frag->page);
	alloc_frag->offset += len + headroom;
	hole = alloc_frag->size - alloc_frag->offset;
	if (hole < len + headroom) {
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
		 * the current buffer. This extra space is not included in
		 * the truesize stored in ctx.
		 */
		len += hole;
		alloc_frag->offset += hole;
	}

	sg_init_one(rq->sg, buf, len);
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
	if (err < 0)
		put_page(virt_to_head_page(buf));

	return err;
}
913

914 915 916 917 918 919 920
/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
M
Michael S. Tsirkin 已提交
921 922
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
923 924
{
	int err;
925
	bool oom;
926

927
	gfp |= __GFP_COLD;
928 929
	do {
		if (vi->mergeable_rx_bufs)
930
			err = add_recvbuf_mergeable(vi, rq, gfp);
931
		else if (vi->big_packets)
932
			err = add_recvbuf_big(vi, rq, gfp);
933
		else
M
Michael S. Tsirkin 已提交
934
			err = add_recvbuf_small(vi, rq, gfp);
935

936
		oom = err == -ENOMEM;
937
		if (err)
938
			break;
939
	} while (rq->vq->num_free);
940
	virtqueue_kick(rq->vq);
941
	return !oom;
942 943
}

944
static void skb_recv_done(struct virtqueue *rvq)
R
Rusty Russell 已提交
945 946
{
	struct virtnet_info *vi = rvq->vdev->priv;
J
Jason Wang 已提交
947
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
948

949
	virtqueue_napi_schedule(&rq->napi, rvq);
R
Rusty Russell 已提交
950 951
}

952
/* Enable @napi and immediately schedule it once for @vq. */
static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
{
	napi_enable(napi);

	/* If all buffers were filled by other side before we napi_enabled, we
	 * won't get another interrupt, so process any outstanding packets now.
	 * Call local_bh_enable after to trigger softIRQ processing.
	 */
	local_bh_disable();
	virtqueue_napi_schedule(napi, vq);
	local_bh_enable();
}

W
Willem de Bruijn 已提交
965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982
/* Enable TX NAPI for @vq if it is in use (non-zero weight).
 *
 * When no affinity hint is set, TX NAPI is turned off for this queue by
 * zeroing the weight instead of enabling it.
 */
static void virtnet_napi_tx_enable(struct virtnet_info *vi,
				   struct virtqueue *vq,
				   struct napi_struct *napi)
{
	if (!napi->weight)
		return;

	/* Tx napi touches cachelines on the cpu handling tx interrupts. Only
	 * enable the feature if this is likely affine with the transmit path.
	 */
	if (!vi->affinity_hint_set) {
		napi->weight = 0;
		return;
	}

	/* Plain call: a void function must not return an expression. */
	virtnet_napi_enable(vq, napi);
}

983 984 985 986 987 988
/* Disable TX NAPI; a zero weight means it was never enabled, so do nothing. */
static void virtnet_napi_tx_disable(struct napi_struct *napi)
{
	if (!napi->weight)
		return;

	napi_disable(napi);
}

989 990
static void refill_work(struct work_struct *work)
{
991 992
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
993
	bool still_empty;
J
Jason Wang 已提交
994 995
	int i;

996
	for (i = 0; i < vi->curr_queue_pairs; i++) {
J
Jason Wang 已提交
997
		struct receive_queue *rq = &vi->rq[i];
998

J
Jason Wang 已提交
999
		napi_disable(&rq->napi);
M
Michael S. Tsirkin 已提交
1000
		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
1001
		virtnet_napi_enable(rq->vq, &rq->napi);
1002

J
Jason Wang 已提交
1003 1004 1005 1006 1007 1008
		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
1009 1010
}

1011
static int virtnet_receive(struct receive_queue *rq, int budget)
R
Rusty Russell 已提交
1012
{
1013
	struct virtnet_info *vi = rq->vq->vdev->priv;
J
Jason Wang 已提交
1014
	unsigned int len, received = 0, bytes = 0;
1015
	void *buf;
J
Jason Wang 已提交
1016
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
R
Rusty Russell 已提交
1017

1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031
	if (vi->mergeable_rx_bufs) {
		void *ctx;

		while (received < budget &&
		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
			bytes += receive_buf(vi, rq, buf, len, ctx);
			received++;
		}
	} else {
		while (received < budget &&
		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
			bytes += receive_buf(vi, rq, buf, len, NULL);
			received++;
		}
R
Rusty Russell 已提交
1032 1033
	}

1034
	if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
M
Michael S. Tsirkin 已提交
1035
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
1036
			schedule_delayed_work(&vi->refill, 0);
1037
	}
R
Rusty Russell 已提交
1038

J
Jason Wang 已提交
1039 1040 1041 1042 1043
	u64_stats_update_begin(&stats->rx_syncp);
	stats->rx_bytes += bytes;
	stats->rx_packets += received;
	u64_stats_update_end(&stats->rx_syncp);

1044 1045 1046
	return received;
}

1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076
/* Reclaim every skb the device has finished with on send queue @sq, free
 * it, and add the totals to the per-cpu tx counters.
 */
static void free_old_xmit_skbs(struct send_queue *sq)
{
	struct virtnet_info *vi = sq->vq->vdev->priv;
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
	unsigned int freed_pkts = 0;
	unsigned int freed_bytes = 0;
	unsigned int len;
	struct sk_buff *skb;

	for (;;) {
		skb = virtqueue_get_buf(sq->vq, &len);
		if (!skb)
			break;

		pr_debug("Sent skb %p\n", skb);

		freed_bytes += skb->len;
		freed_pkts++;

		dev_kfree_skb_any(skb);
	}

	/* Skip the stats update when nothing was reclaimed, which happens
	 * when this is called speculatively from start_xmit.
	 */
	if (freed_pkts) {
		u64_stats_update_begin(&stats->tx_syncp);
		stats->tx_bytes += freed_bytes;
		stats->tx_packets += freed_pkts;
		u64_stats_update_end(&stats->tx_syncp);
	}
}

1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095
/* From the rx poll path, reclaim completed tx buffers on the send queue
 * that has the same index as @rq. Does nothing when that send queue has a
 * zero napi weight. The reclaim only happens if the tx lock can be taken
 * without waiting; the queue is woken once at least 2 + MAX_SKB_FRAGS
 * descriptors are free.
 */
static void virtnet_poll_cleantx(struct receive_queue *rq)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	unsigned int qidx = vq2rxq(rq->vq);
	struct send_queue *sq = &vi->sq[qidx];
	struct netdev_queue *txq;

	if (!sq->napi.weight)
		return;

	txq = netdev_get_tx_queue(vi->dev, qidx);

	if (__netif_tx_trylock(txq)) {
		free_old_xmit_skbs(sq);
		__netif_tx_unlock(txq);
	}

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);
}

1096 1097 1098 1099
static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
1100
	unsigned int received;
1101

1102 1103
	virtnet_poll_cleantx(rq);

1104
	received = virtnet_receive(rq, budget);
1105

1106
	/* Out of packets? */
1107 1108
	if (received < budget)
		virtqueue_napi_complete(napi, rq->vq, received);
R
Rusty Russell 已提交
1109 1110 1111 1112

	return received;
}

J
Jason Wang 已提交
1113 1114 1115 1116 1117
static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

1118 1119 1120
	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
M
Michael S. Tsirkin 已提交
1121
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1122
				schedule_delayed_work(&vi->refill, 0);
1123
		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
W
Willem de Bruijn 已提交
1124
		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
J
Jason Wang 已提交
1125 1126 1127 1128 1129
	}

	return 0;
}

W
Willem de Bruijn 已提交
1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147
/* NAPI poll callback for a send queue.
 *
 * Reclaims completed tx buffers under the tx queue lock, completes napi
 * with a work count of 0, and wakes the tx queue once at least
 * 2 + MAX_SKB_FRAGS descriptors are free. Always returns 0.
 */
static int virtnet_poll_tx(struct napi_struct *napi, int budget)
{
	struct send_queue *sq = container_of(napi, struct send_queue, napi);
	struct virtnet_info *vi = sq->vq->vdev->priv;
	unsigned int qidx = vq2txq(sq->vq);
	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, qidx);

	__netif_tx_lock(txq, raw_smp_processor_id());
	free_old_xmit_skbs(sq);
	__netif_tx_unlock(txq);

	virtqueue_napi_complete(napi, sq->vq, 0);

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);

	return 0;
}

1148
static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
R
Rusty Russell 已提交
1149
{
1150
	struct virtio_net_hdr_mrg_rxbuf *hdr;
R
Rusty Russell 已提交
1151
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
1152
	struct virtnet_info *vi = sq->vq->vdev->priv;
1153
	int num_sg;
1154
	unsigned hdr_len = vi->hdr_len;
1155
	bool can_push;
R
Rusty Russell 已提交
1156

J
Johannes Berg 已提交
1157
	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
1158 1159 1160 1161 1162 1163 1164

	can_push = vi->any_header_sg &&
		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
1165
		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
1166 1167
	else
		hdr = skb_vnet_hdr(skb);
R
Rusty Russell 已提交
1168

1169
	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
1170
				    virtio_is_little_endian(vi->vdev), false))
1171
		BUG();
R
Rusty Russell 已提交
1172

1173
	if (vi->mergeable_rx_bufs)
1174
		hdr->num_buffers = 0;
1175

1176
	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
1177 1178 1179
	if (can_push) {
		__skb_push(skb, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
1180 1181
		if (unlikely(num_sg < 0))
			return num_sg;
1182 1183 1184 1185
		/* Pull header back to avoid skew in tx bytes calculations. */
		__skb_pull(skb, hdr_len);
	} else {
		sg_set_buf(sq->sg, hdr, hdr_len);
1186 1187 1188 1189
		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
		if (unlikely(num_sg < 0))
			return num_sg;
		num_sg++;
1190
	}
1191
	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
1192 1193
}

1194
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
1195 1196
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1197 1198
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
1199
	int err;
1200 1201
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
	bool kick = !skb->xmit_more;
W
Willem de Bruijn 已提交
1202
	bool use_napi = sq->napi.weight;
1203 1204

	/* Free up any pending old buffers before queueing new ones. */
1205
	free_old_xmit_skbs(sq);
1206

1207 1208 1209
	if (use_napi && kick)
		virtqueue_enable_cb_delayed(sq->vq);

1210 1211 1212
	/* timestamp packet in software */
	skb_tx_timestamp(skb);

1213
	/* Try to transmit */
1214
	err = xmit_skb(sq, skb);
1215

1216
	/* This should not happen! */
1217
	if (unlikely(err)) {
1218 1219 1220
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
1221
				 "Unexpected TXQ (%d) queue failure: %d\n", qnum, err);
1222
		dev->stats.tx_dropped++;
1223
		dev_kfree_skb_any(skb);
1224
		return NETDEV_TX_OK;
R
Rusty Russell 已提交
1225
	}
1226

1227
	/* Don't wait up for transmitted skbs to be freed. */
W
Willem de Bruijn 已提交
1228 1229 1230 1231
	if (!use_napi) {
		skb_orphan(skb);
		nf_reset(skb);
	}
1232

1233 1234 1235 1236 1237 1238 1239 1240 1241
	/* If running out of space, stop queue to avoid getting packets that we
	 * are then unable to transmit.
	 * An alternative would be to force queuing layer to requeue the skb by
	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
	 * returned in a normal path of operation: it means that driver is not
	 * maintaining the TX queue stop/start state properly, and causes
	 * the stack to do a non-trivial amount of useless work.
	 * Since most packets only take 1 or 2 ring slots, stopping the queue
	 * early means 16 slots are typically wasted.
1242
	 */
1243
	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1244
		netif_stop_subqueue(dev, qnum);
W
Willem de Bruijn 已提交
1245 1246
		if (!use_napi &&
		    unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
1247
			/* More just got used, free them then recheck. */
1248 1249
			free_old_xmit_skbs(sq);
			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1250
				netif_start_subqueue(dev, qnum);
1251
				virtqueue_disable_cb(sq->vq);
1252 1253
			}
		}
1254
	}
1255

1256
	if (kick || netif_xmit_stopped(txq))
1257
		virtqueue_kick(sq->vq);
R
Rusty Russell 已提交
1258

1259
	return NETDEV_TX_OK;
1260 1261
}

1262 1263 1264
/*
 * Send command via the control virtqueue and check status.  Commands
 * supported by the hypervisor, as indicated by feature bits, should
 * never fail unless improperly formatted.
 *
 * @vi:    device to send the command to
 * @class: command class, stored in the control header
 * @cmd:   command within @class, stored in the control header
 * @out:   optional scatterlist with command data, may be NULL
 *
 * Returns true only if the status read back equals VIRTIO_NET_OK. Returns
 * false when the request could not be added to the control virtqueue.
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
				 struct scatterlist *out)
{
	struct scatterlist *sgs[4], hdr, stat;
	unsigned out_num = 0, tmp;
	int ret;

	/* Caller should know better */
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));

	/* Preset the status to a non-OK value; it is only VIRTIO_NET_OK
	 * afterwards if it was overwritten with that value.
	 */
	vi->ctrl_status = ~0;
	vi->ctrl_hdr.class = class;
	vi->ctrl_hdr.cmd = cmd;
	/* Add header */
	sg_init_one(&hdr, &vi->ctrl_hdr, sizeof(vi->ctrl_hdr));
	sgs[out_num++] = &hdr;

	if (out)
		sgs[out_num++] = out;

	/* Add return status. */
	sg_init_one(&stat, &vi->ctrl_status, sizeof(vi->ctrl_status));
	sgs[out_num] = &stat;

	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
	ret = virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
	if (ret < 0) {
		/* Nothing was queued, so there is no completion to wait for;
		 * spinning below would never see a used buffer.
		 */
		dev_warn(&vi->vdev->dev,
			 "Failed to add sgs for command vq: %d\n", ret);
		return false;
	}

	if (unlikely(!virtqueue_kick(vi->cvq)))
		return vi->ctrl_status == VIRTIO_NET_OK;

	/* Spin for a response, the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
		cpu_relax();

	return vi->ctrl_status == VIRTIO_NET_OK;
}

1306 1307 1308 1309
static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
1310
	int ret;
1311
	struct sockaddr *addr;
1312
	struct scatterlist sg;
1313

1314
	addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
1315 1316 1317 1318
	if (!addr)
		return -ENOMEM;

	ret = eth_prepare_mac_addr_change(dev, addr);
1319
	if (ret)
1320
		goto out;
1321

1322 1323 1324
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
		sg_init_one(&sg, addr->sa_data, dev->addr_len);
		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1325
					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
1326 1327
			dev_warn(&vdev->dev,
				 "Failed to set mac address by vq command.\n");
1328 1329
			ret = -EINVAL;
			goto out;
1330
		}
1331 1332
	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1333 1334 1335 1336 1337 1338 1339
		unsigned int i;

		/* Naturally, this has an atomicity problem. */
		for (i = 0; i < dev->addr_len; i++)
			virtio_cwrite8(vdev,
				       offsetof(struct virtio_net_config, mac) +
				       i, addr->sa_data[i]);
1340 1341 1342
	}

	eth_commit_mac_addr_change(dev, p);
1343
	ret = 0;
1344

1345 1346 1347
out:
	kfree(addr);
	return ret;
1348 1349
}

1350 1351
static void virtnet_stats(struct net_device *dev,
			  struct rtnl_link_stats64 *tot)
1352 1353 1354 1355 1356 1357
{
	struct virtnet_info *vi = netdev_priv(dev);
	int cpu;
	unsigned int start;

	for_each_possible_cpu(cpu) {
E
Eric Dumazet 已提交
1358
		struct virtnet_stats *stats = per_cpu_ptr(vi->stats, cpu);
1359 1360 1361
		u64 tpackets, tbytes, rpackets, rbytes;

		do {
1362
			start = u64_stats_fetch_begin_irq(&stats->tx_syncp);
1363 1364
			tpackets = stats->tx_packets;
			tbytes   = stats->tx_bytes;
1365
		} while (u64_stats_fetch_retry_irq(&stats->tx_syncp, start));
1366 1367

		do {
1368
			start = u64_stats_fetch_begin_irq(&stats->rx_syncp);
1369 1370
			rpackets = stats->rx_packets;
			rbytes   = stats->rx_bytes;
1371
		} while (u64_stats_fetch_retry_irq(&stats->rx_syncp, start));
1372 1373 1374 1375 1376 1377 1378 1379

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes   += rbytes;
		tot->tx_bytes   += tbytes;
	}

	tot->tx_dropped = dev->stats.tx_dropped;
1380
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
1381 1382 1383 1384 1385
	tot->rx_dropped = dev->stats.rx_dropped;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
}

1386 1387 1388 1389
#ifdef CONFIG_NET_POLL_CONTROLLER
/* ndo_poll_controller handler: schedule rx napi on every active queue. */
static void virtnet_netpoll(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int qp;

	for (qp = 0; qp < vi->curr_queue_pairs; qp++)
		napi_schedule(&vi->rq[qp].napi);
}
#endif

1397 1398 1399 1400
static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
1401
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
1402 1403 1404 1405
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}

1406
/* Tell the device to use @queue_pairs queue pairs.
 *
 * Does nothing and returns 0 when there is no control virtqueue or
 * VIRTIO_NET_F_MQ is not available. On success curr_queue_pairs is updated
 * and, if the interface is up, the refill work is scheduled.
 *
 * Returns 0 on success or -EINVAL if the control command fails.
 */
static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	struct net_device *dev = vi->dev;
	struct scatterlist sg;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

	vi->ctrl_mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
	sg_init_one(&sg, &vi->ctrl_mq, sizeof(vi->ctrl_mq));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
	}

	vi->curr_queue_pairs = queue_pairs;
	/* virtnet_open() will refill when device is going to up. */
	if (dev->flags & IFF_UP)
		schedule_delayed_work(&vi->refill, 0);

	return 0;
}

1432 1433 1434 1435 1436 1437 1438 1439 1440 1441
/* Wrapper around _virtnet_set_queues() that takes the rtnl lock for the
 * duration of the call and passes its return value through.
 */
static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	int ret;

	rtnl_lock();
	ret = _virtnet_set_queues(vi, queue_pairs);
	rtnl_unlock();

	return ret;
}

R
Rusty Russell 已提交
1442 1443 1444
/* ndo_stop handler: cancel the refill work, then disable rx and tx napi on
 * all max_queue_pairs queue pairs. Always returns 0.
 */
static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int qp;

	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);

	for (qp = 0; qp < vi->max_queue_pairs; qp++) {
		napi_disable(&vi->rq[qp].napi);
		virtnet_napi_tx_disable(&vi->sq[qp].napi);
	}

	return 0;
}

1458 1459 1460
static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
1461 1462
	struct scatterlist sg[2];
	struct virtio_net_ctrl_mac *mac_data;
J
Jiri Pirko 已提交
1463
	struct netdev_hw_addr *ha;
1464
	int uc_count;
1465
	int mc_count;
1466 1467
	void *buf;
	int i;
1468

S
stephen hemminger 已提交
1469
	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
1470 1471 1472
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

1473 1474
	vi->ctrl_promisc = ((dev->flags & IFF_PROMISC) != 0);
	vi->ctrl_allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
1475

1476
	sg_init_one(sg, &vi->ctrl_promisc, sizeof(vi->ctrl_promisc));
1477 1478

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
1479
				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
1480
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
1481
			 vi->ctrl_promisc ? "en" : "dis");
1482

1483
	sg_init_one(sg, &vi->ctrl_allmulti, sizeof(vi->ctrl_allmulti));
1484 1485

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
1486
				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
1487
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
1488
			 vi->ctrl_allmulti ? "en" : "dis");
1489

1490
	uc_count = netdev_uc_count(dev);
1491
	mc_count = netdev_mc_count(dev);
1492
	/* MAC filter - use one buffer for both lists */
1493 1494 1495
	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
	mac_data = buf;
1496
	if (!buf)
1497 1498
		return;

1499 1500
	sg_init_table(sg, 2);

1501
	/* Store the unicast list and count in the front of the buffer */
M
Michael S. Tsirkin 已提交
1502
	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
J
Jiri Pirko 已提交
1503
	i = 0;
1504
	netdev_for_each_uc_addr(ha, dev)
J
Jiri Pirko 已提交
1505
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
1506 1507

	sg_set_buf(&sg[0], mac_data,
1508
		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));
1509 1510

	/* multicast list and count fill the end */
1511
	mac_data = (void *)&mac_data->macs[uc_count][0];
1512

M
Michael S. Tsirkin 已提交
1513
	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
1514
	i = 0;
1515 1516
	netdev_for_each_mc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
1517 1518

	sg_set_buf(&sg[1], mac_data,
1519
		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));
1520 1521

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1522
				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
1523
		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");
1524 1525

	kfree(buf);
1526 1527
}

1528 1529
/* ndo_vlan_rx_add_vid handler: send VIRTIO_NET_CTRL_VLAN_ADD for @vid.
 * A failed command is logged; the function returns 0 either way.
 */
static int virtnet_vlan_rx_add_vid(struct net_device *dev,
				   __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;
	bool ok;

	vi->ctrl_vid = vid;
	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));

	ok = virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_ADD, &sg);
	if (!ok)
		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);

	return 0;
}

1543 1544
/* ndo_vlan_rx_kill_vid handler: send VIRTIO_NET_CTRL_VLAN_DEL for @vid.
 * A failed command is logged; the function returns 0 either way.
 */
static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
				    __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;
	bool ok;

	vi->ctrl_vid = vid;
	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));

	ok = virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_DEL, &sg);
	if (!ok)
		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);

	return 0;
}

1558
static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu)
J
Jason Wang 已提交
1559 1560 1561
{
	int i;

1562 1563
	if (vi->affinity_hint_set) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
1564 1565 1566 1567
			virtqueue_set_affinity(vi->rq[i].vq, -1);
			virtqueue_set_affinity(vi->sq[i].vq, -1);
		}

1568 1569 1570
		vi->affinity_hint_set = false;
	}
}
1571

1572 1573 1574 1575
static void virtnet_set_affinity(struct virtnet_info *vi)
{
	int i;
	int cpu;
J
Jason Wang 已提交
1576 1577 1578 1579 1580

	/* In multiqueue mode, when the number of cpu is equal to the number of
	 * queue pairs, we let the queue pairs to be private to one cpu by
	 * setting the affinity hint to eliminate the contention.
	 */
1581 1582 1583 1584
	if (vi->curr_queue_pairs == 1 ||
	    vi->max_queue_pairs != num_online_cpus()) {
		virtnet_clean_affinity(vi, -1);
		return;
J
Jason Wang 已提交
1585 1586
	}

1587 1588
	i = 0;
	for_each_online_cpu(cpu) {
J
Jason Wang 已提交
1589 1590
		virtqueue_set_affinity(vi->rq[i].vq, cpu);
		virtqueue_set_affinity(vi->sq[i].vq, cpu);
1591
		netif_set_xps_queue(vi->dev, cpumask_of(cpu), i);
1592
		i++;
J
Jason Wang 已提交
1593 1594
	}

1595
	vi->affinity_hint_set = true;
J
Jason Wang 已提交
1596 1597
}

1598
static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
1599
{
1600 1601 1602 1603 1604
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);
	virtnet_set_affinity(vi);
	return 0;
}
1605

1606 1607 1608 1609 1610 1611 1612
/* cpu hotplug callback: recompute the queue affinity for the device that
 * embeds @node as its "node_dead" member. Always returns 0.
 */
static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi;

	vi = hlist_entry_safe(node, struct virtnet_info, node_dead);
	virtnet_set_affinity(vi);

	return 0;
}
1613

1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644
/* cpu hotplug callback: clear the queue affinity of the device that embeds
 * @node as its "node" member. Always returns 0.
 */
static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi;

	vi = hlist_entry_safe(node, struct virtnet_info, node);
	virtnet_clean_affinity(vi, cpu);

	return 0;
}

/* cpuhp state under which each device's "node" instance is added and
 * removed; its value is not assigned in this part of the file.
 */
static enum cpuhp_state virtionet_online;

/* Register @vi with both cpu hotplug states: virtionet_online via its
 * "node" member and CPUHP_VIRT_NET_DEAD via "node_dead". If the second
 * registration fails, the first one is undone.
 *
 * Returns 0 on success or the error from the failing registration.
 */
static int virtnet_cpu_notif_add(struct virtnet_info *vi)
{
	int ret;

	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
	if (ret)
		return ret;

	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					       &vi->node_dead);
	if (ret)
		cpuhp_state_remove_instance_nocalls(virtionet_online,
						    &vi->node);

	return ret;
}

/* Remove @vi from both cpu hotplug states it is registered with. */
static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
{
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					    &vi->node_dead);
}

R
Rick Jones 已提交
1647 1648 1649 1650 1651
/* ethtool get_ringparam handler: report the vring sizes of the first rx
 * and tx queues, using the same value for the maximum and the current
 * setting.
 */
static void virtnet_get_ringparam(struct net_device *dev,
				struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);

	/* Current sizes are reported equal to the maxima. */
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}

1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670

/* ethtool get_drvinfo handler: fill in driver name, driver version and the
 * bus name of the underlying virtio device.
 */
static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));
}

1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684
/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

1685
	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
1686 1687
		return -EINVAL;

J
John Fastabend 已提交
1688 1689 1690 1691 1692 1693 1694
	/* For now we don't support modifying channels while XDP is loaded
	 * also when XDP is loaded all RX queues have XDP programs so we only
	 * need to check a single RX queue.
	 */
	if (vi->rq[0].xdp_prog)
		return -EINVAL;

1695
	get_online_cpus();
1696
	err = _virtnet_set_queues(vi, queue_pairs);
1697 1698 1699 1700
	if (!err) {
		netif_set_real_num_tx_queues(dev, queue_pairs);
		netif_set_real_num_rx_queues(dev, queue_pairs);

1701
		virtnet_set_affinity(vi);
1702
	}
1703
	put_online_cpus();
1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720

	return err;
}

/* ethtool get_channels handler: report the current and maximum number of
 * combined channels; all other counts are reported as 0.
 */
static void virtnet_get_channels(struct net_device *dev,
				 struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);

	channels->max_combined = vi->max_queue_pairs;
	channels->combined_count = vi->curr_queue_pairs;

	channels->max_other = 0;
	channels->other_count = 0;
	channels->rx_count = 0;
	channels->tx_count = 0;
}

1721
/* Check if the user is trying to change anything besides speed/duplex */
1722 1723
static bool
virtnet_validate_ethtool_cmd(const struct ethtool_link_ksettings *cmd)
1724
{
1725 1726
	struct ethtool_link_ksettings diff1 = *cmd;
	struct ethtool_link_ksettings diff2 = {};
1727

1728 1729 1730
	/* cmd is always set so we need to clear it, validate the port type
	 * and also without autonegotiation we can ignore advertising
	 */
1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744
	diff1.base.speed = 0;
	diff2.base.port = PORT_OTHER;
	ethtool_link_ksettings_zero_link_mode(&diff1, advertising);
	diff1.base.duplex = 0;
	diff1.base.cmd = 0;
	diff1.base.link_mode_masks_nwords = 0;

	return !memcmp(&diff1.base, &diff2.base, sizeof(diff1.base)) &&
		bitmap_empty(diff1.link_modes.supported,
			     __ETHTOOL_LINK_MODE_MASK_NBITS) &&
		bitmap_empty(diff1.link_modes.advertising,
			     __ETHTOOL_LINK_MODE_MASK_NBITS) &&
		bitmap_empty(diff1.link_modes.lp_advertising,
			     __ETHTOOL_LINK_MODE_MASK_NBITS);
1745 1746
}

1747 1748
static int virtnet_set_link_ksettings(struct net_device *dev,
				      const struct ethtool_link_ksettings *cmd)
1749 1750 1751 1752
{
	struct virtnet_info *vi = netdev_priv(dev);
	u32 speed;

1753
	speed = cmd->base.speed;
1754 1755
	/* don't allow custom speed and duplex */
	if (!ethtool_validate_speed(speed) ||
1756
	    !ethtool_validate_duplex(cmd->base.duplex) ||
1757 1758 1759
	    !virtnet_validate_ethtool_cmd(cmd))
		return -EINVAL;
	vi->speed = speed;
1760
	vi->duplex = cmd->base.duplex;
1761 1762 1763 1764

	return 0;
}

1765 1766
static int virtnet_get_link_ksettings(struct net_device *dev,
				      struct ethtool_link_ksettings *cmd)
1767 1768 1769
{
	struct virtnet_info *vi = netdev_priv(dev);

1770 1771 1772
	cmd->base.speed = vi->speed;
	cmd->base.duplex = vi->duplex;
	cmd->base.port = PORT_OTHER;
1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784

	return 0;
}

/* Initialise the stored link settings to "unknown" speed and duplex. */
static void virtnet_init_settings(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	vi->duplex = DUPLEX_UNKNOWN;
	vi->speed = SPEED_UNKNOWN;
}

1785
static const struct ethtool_ops virtnet_ethtool_ops = {
1786
	.get_drvinfo = virtnet_get_drvinfo,
1787
	.get_link = ethtool_op_get_link,
R
Rick Jones 已提交
1788
	.get_ringparam = virtnet_get_ringparam,
1789 1790
	.set_channels = virtnet_set_channels,
	.get_channels = virtnet_get_channels,
1791
	.get_ts_info = ethtool_op_get_ts_info,
1792 1793
	.get_link_ksettings = virtnet_get_link_ksettings,
	.set_link_ksettings = virtnet_set_link_ksettings,
1794 1795
};

1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807
/* Quiesce the device: flush the config work, detach the net device, cancel
 * the refill work and, if the interface is running, disable rx and tx napi
 * on all max_queue_pairs queue pairs.
 */
static void virtnet_freeze_down(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int qp;

	/* Make sure no work handler is accessing the device */
	flush_work(&vi->config_work);

	netif_device_detach(vi->dev);
	cancel_delayed_work_sync(&vi->refill);

	if (!netif_running(vi->dev))
		return;

	for (qp = 0; qp < vi->max_queue_pairs; qp++) {
		napi_disable(&vi->rq[qp].napi);
		virtnet_napi_tx_disable(&vi->sq[qp].napi);
	}
}

static int init_vqs(struct virtnet_info *vi);
1816
static void _remove_vq_common(struct virtnet_info *vi);
1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833

/* Counterpart of virtnet_freeze_down(): set up the virtqueues again, mark
 * the device ready and, if the interface is running, refill the active
 * receive queues and re-enable napi before re-attaching the net device.
 *
 * Returns 0 on success or the error from init_vqs().
 */
static int virtnet_restore_up(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err, qp;

	err = init_vqs(vi);
	if (err)
		return err;

	virtio_device_ready(vdev);

	if (netif_running(vi->dev)) {
		/* Only the queues currently in use get buffers... */
		for (qp = 0; qp < vi->curr_queue_pairs; qp++) {
			if (!try_fill_recv(vi, &vi->rq[qp], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);
		}

		/* ...but napi is enabled on every queue pair. */
		for (qp = 0; qp < vi->max_queue_pairs; qp++) {
			virtnet_napi_enable(vi->rq[qp].vq, &vi->rq[qp].napi);
			virtnet_napi_tx_enable(vi, vi->sq[qp].vq,
					       &vi->sq[qp].napi);
		}
	}

	netif_device_attach(vi->dev);
	return err;
}

1845
static int virtnet_reset(struct virtnet_info *vi, int curr_qp, int xdp_qp)
1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861
{
	struct virtio_device *dev = vi->vdev;
	int ret;

	virtio_config_disable(dev);
	dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED;
	virtnet_freeze_down(dev);
	_remove_vq_common(vi);

	virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);

	ret = virtio_finalize_features(dev);
	if (ret)
		goto err;

1862
	vi->xdp_queue_pairs = xdp_qp;
1863 1864 1865
	ret = virtnet_restore_up(dev);
	if (ret)
		goto err;
1866
	ret = _virtnet_set_queues(vi, curr_qp);
1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877
	if (ret)
		goto err;

	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
	virtio_config_enable(dev);
	return 0;
err:
	virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
	return ret;
}

1878 1879
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			   struct netlink_ext_ack *extack)
J
John Fastabend 已提交
1880 1881 1882 1883
{
	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
	struct virtnet_info *vi = netdev_priv(dev);
	struct bpf_prog *old_prog;
1884
	u16 xdp_qp = 0, curr_qp;
1885
	int i, err;
J
John Fastabend 已提交
1886 1887

	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
1888 1889 1890
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) {
1891
		NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO, disable LRO first");
J
John Fastabend 已提交
1892 1893 1894 1895
		return -EOPNOTSUPP;
	}

	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
1896
		NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
J
John Fastabend 已提交
1897 1898 1899 1900
		return -EINVAL;
	}

	if (dev->mtu > max_sz) {
1901
		NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
J
John Fastabend 已提交
1902 1903 1904 1905
		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
		return -EINVAL;
	}

1906 1907 1908 1909 1910 1911
	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
	if (prog)
		xdp_qp = nr_cpu_ids;

	/* XDP requires extra queues for XDP_TX */
	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
1912
		NL_SET_ERR_MSG_MOD(extack, "Too few free TX rings available");
1913 1914 1915 1916 1917
		netdev_warn(dev, "request %i queues but max is %i\n",
			    curr_qp + xdp_qp, vi->max_queue_pairs);
		return -ENOMEM;
	}

1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929
	if (prog) {
		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
		if (IS_ERR(prog))
			return PTR_ERR(prog);
	}

	/* Changing the headroom in buffers is a disruptive operation because
	 * existing buffers must be flushed and reallocated. This will happen
	 * when a xdp program is initially added or xdp is disabled by removing
	 * the xdp program resulting in number of XDP queues changing.
	 */
	if (vi->xdp_queue_pairs != xdp_qp) {
1930 1931 1932
		err = virtnet_reset(vi, curr_qp + xdp_qp, xdp_qp);
		if (err) {
			dev_warn(&dev->dev, "XDP reset failure.\n");
1933
			goto virtio_reset_err;
1934
		}
J
John Fastabend 已提交
1935 1936
	}

1937 1938
	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);

J
John Fastabend 已提交
1939 1940 1941 1942 1943 1944 1945 1946
	for (i = 0; i < vi->max_queue_pairs; i++) {
		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
		if (old_prog)
			bpf_prog_put(old_prog);
	}

	return 0;
1947 1948 1949 1950 1951 1952 1953 1954 1955

virtio_reset_err:
	/* On reset error do our best to unwind XDP changes inflight and return
	 * error up to user space for resolution. The underlying reset hung on
	 * us so not much we can do here.
	 */
	if (prog)
		bpf_prog_sub(prog, vi->max_queue_pairs - 1);
	return err;
J
John Fastabend 已提交
1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973
}

static bool virtnet_xdp_query(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (vi->rq[i].xdp_prog)
			return true;
	}
	return false;
}

static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
1974
		return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
J
John Fastabend 已提交
1975 1976 1977 1978 1979 1980 1981 1982
	case XDP_QUERY_PROG:
		xdp->prog_attached = virtnet_xdp_query(dev);
		return 0;
	default:
		return -EINVAL;
	}
}

1983 1984 1985 1986 1987
static const struct net_device_ops virtnet_netdev = {
	.ndo_open            = virtnet_open,
	.ndo_stop   	     = virtnet_close,
	.ndo_start_xmit      = start_xmit,
	.ndo_validate_addr   = eth_validate_addr,
1988
	.ndo_set_mac_address = virtnet_set_mac_address,
1989
	.ndo_set_rx_mode     = virtnet_set_rx_mode,
1990
	.ndo_get_stats64     = virtnet_stats,
1991 1992
	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
1993 1994
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = virtnet_netpoll,
J
Jason Wang 已提交
1995
#endif
J
John Fastabend 已提交
1996
	.ndo_xdp		= virtnet_xdp,
1997
	.ndo_features_check	= passthru_features_check,
1998 1999
};

2000
static void virtnet_config_changed_work(struct work_struct *work)
2001
{
2002 2003
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, config_work);
2004 2005
	u16 v;

2006 2007
	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
				 struct virtio_net_config, status, &v) < 0)
M
Michael S. Tsirkin 已提交
2008
		return;
2009 2010

	if (v & VIRTIO_NET_S_ANNOUNCE) {
2011
		netdev_notify_peers(vi->dev);
2012 2013
		virtnet_ack_link_announce(vi);
	}
2014 2015 2016 2017 2018

	/* Ignore unknown (future) status bits */
	v &= VIRTIO_NET_S_LINK_UP;

	if (vi->status == v)
M
Michael S. Tsirkin 已提交
2019
		return;
2020 2021 2022 2023 2024

	vi->status = v;

	if (vi->status & VIRTIO_NET_S_LINK_UP) {
		netif_carrier_on(vi->dev);
J
Jason Wang 已提交
2025
		netif_tx_wake_all_queues(vi->dev);
2026 2027
	} else {
		netif_carrier_off(vi->dev);
J
Jason Wang 已提交
2028
		netif_tx_stop_all_queues(vi->dev);
2029 2030 2031 2032 2033 2034 2035
	}
}

static void virtnet_config_changed(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

2036
	schedule_work(&vi->config_work);
2037 2038
}

J
Jason Wang 已提交
2039 2040
/* Remove the NAPI contexts of every queue pair and free the rq/sq arrays. */
static void virtnet_free_queues(struct virtnet_info *vi)
{
	int qp;

	for (qp = 0; qp < vi->max_queue_pairs; qp++) {
		napi_hash_del(&vi->rq[qp].napi);
		netif_napi_del(&vi->rq[qp].napi);
		netif_napi_del(&vi->sq[qp].napi);
	}

	/* napi_hash_del() was called ahead of netif_napi_del(), so an RCU
	 * grace period has to elapse before vi->rq may be freed.
	 */
	synchronize_net();

	kfree(vi->rq);
	kfree(vi->sq);
}

2058
static void _free_receive_bufs(struct virtnet_info *vi)
J
Jason Wang 已提交
2059
{
J
John Fastabend 已提交
2060
	struct bpf_prog *old_prog;
J
Jason Wang 已提交
2061 2062 2063 2064 2065
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		while (vi->rq[i].pages)
			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
J
John Fastabend 已提交
2066 2067 2068 2069 2070

		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
		if (old_prog)
			bpf_prog_put(old_prog);
J
Jason Wang 已提交
2071
	}
2072 2073 2074 2075 2076 2077
}

/* Locked wrapper: run _free_receive_bufs() with RTNL held. */
static void free_receive_bufs(struct virtnet_info *vi)
{
	rtnl_lock();

	_free_receive_bufs(vi);

	rtnl_unlock();
}

2081 2082 2083 2084 2085 2086 2087 2088
/* Drop the page reference held by each receive queue's alloc_frag, if any. */
static void free_receive_page_frags(struct virtnet_info *vi)
{
	int qp;

	for (qp = 0; qp < vi->max_queue_pairs; qp++) {
		if (!vi->rq[qp].alloc_frag.page)
			continue;

		put_page(vi->rq[qp].alloc_frag.page);
	}
}

2089
static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
J
John Fastabend 已提交
2090 2091 2092 2093 2094 2095 2096 2097 2098
{
	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
		return false;
	else if (q < vi->curr_queue_pairs)
		return true;
	else
		return false;
}

J
Jason Wang 已提交
2099 2100 2101 2102 2103 2104 2105
/* Detach and release every buffer still sitting unused in the send and
 * receive virtqueues.
 */
static void free_unused_bufs(struct virtnet_info *vi)
{
	struct virtqueue *vq;
	void *buf;
	int qp;

	/* Send side: XDP queues hold raw pages, all others hold skbs. */
	for (qp = 0; qp < vi->max_queue_pairs; qp++) {
		vq = vi->sq[qp].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (is_xdp_raw_buffer_queue(vi, qp))
				put_page(virt_to_head_page(buf));
			else
				dev_kfree_skb(buf);
		}
	}

	/* Receive side: only the big-packets mode without mergeable buffers
	 * hands buffers back through give_pages(); every other mode drops a
	 * page reference.
	 */
	for (qp = 0; qp < vi->max_queue_pairs; qp++) {
		vq = vi->rq[qp].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (!vi->mergeable_rx_bufs && vi->big_packets)
				give_pages(&vi->rq[qp], buf);
			else
				put_page(virt_to_head_page(buf));
		}
	}
}

2129 2130 2131 2132
/* Tear down the virtqueues: clear the queue affinity hints, have the
 * transport delete the vqs, then free the driver's queue arrays.
 */
static void virtnet_del_vqs(struct virtnet_info *vi)
{
	virtnet_clean_affinity(vi, -1);

	vi->vdev->config->del_vqs(vi->vdev);

	virtnet_free_queues(vi);
}

2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151
/* How large should a single buffer be so a queue full of these can fit at
 * least one full packet?
 * Logic below assumes the mergeable buffer header is used.
 */
static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
{
	const unsigned int hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int rq_size = virtqueue_get_vring_size(vq);
	unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu;
	unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len;
	unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size);

2152 2153
	return max(max(min_buf_len, hdr_len) - hdr_len,
		   (unsigned int)GOOD_PACKET_LEN);
2154 2155
}

J
Jason Wang 已提交
2156
static int virtnet_find_vqs(struct virtnet_info *vi)
2157
{
J
Jason Wang 已提交
2158 2159 2160 2161 2162
	vq_callback_t **callbacks;
	struct virtqueue **vqs;
	int ret = -ENOMEM;
	int i, total_vqs;
	const char **names;
2163
	bool *ctx;
J
Jason Wang 已提交
2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181

	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
	 * possible control vq.
	 */
	total_vqs = vi->max_queue_pairs * 2 +
		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);

	/* Allocate space for find_vqs parameters */
	vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
	if (!vqs)
		goto err_vq;
	callbacks = kmalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
	if (!callbacks)
		goto err_callback;
	names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);
	if (!names)
		goto err_names;
2182 2183 2184 2185 2186 2187 2188
	if (vi->mergeable_rx_bufs) {
		ctx = kzalloc(total_vqs * sizeof(*ctx), GFP_KERNEL);
		if (!ctx)
			goto err_ctx;
	} else {
		ctx = NULL;
	}
J
Jason Wang 已提交
2189 2190 2191 2192 2193 2194

	/* Parameters for control virtqueue, if any */
	if (vi->has_cvq) {
		callbacks[total_vqs - 1] = NULL;
		names[total_vqs - 1] = "control";
	}
2195

J
Jason Wang 已提交
2196 2197 2198 2199 2200 2201 2202 2203
	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
2204 2205
		if (ctx)
			ctx[rxq2vq(i)] = true;
J
Jason Wang 已提交
2206
	}
2207

J
Jason Wang 已提交
2208
	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
2209
					 names, ctx, NULL);
J
Jason Wang 已提交
2210 2211
	if (ret)
		goto err_find;
2212

J
Jason Wang 已提交
2213 2214
	if (vi->has_cvq) {
		vi->cvq = vqs[total_vqs - 1];
2215
		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
2216
			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
2217
	}
J
Jason Wang 已提交
2218 2219 2220

	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].vq = vqs[rxq2vq(i)];
2221
		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
J
Jason Wang 已提交
2222 2223 2224 2225 2226 2227 2228
		vi->sq[i].vq = vqs[txq2vq(i)];
	}

	kfree(names);
	kfree(callbacks);
	kfree(vqs);

2229
	return 0;
J
Jason Wang 已提交
2230 2231

err_find:
2232 2233
	kfree(ctx);
err_ctx:
J
Jason Wang 已提交
2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;
}

static int virtnet_alloc_queues(struct virtnet_info *vi)
{
	int i;

	vi->sq = kzalloc(sizeof(*vi->sq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->sq)
		goto err_sq;
	vi->rq = kzalloc(sizeof(*vi->rq) * vi->max_queue_pairs, GFP_KERNEL);
2251
	if (!vi->rq)
J
Jason Wang 已提交
2252 2253 2254 2255 2256 2257 2258
		goto err_rq;

	INIT_DELAYED_WORK(&vi->refill, refill_work);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].pages = NULL;
		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
			       napi_weight);
2259 2260
		netif_tx_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx,
				  napi_tx ? napi_weight : 0);
J
Jason Wang 已提交
2261 2262

		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
J
Johannes Berg 已提交
2263
		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
J
Jason Wang 已提交
2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287
		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
	}

	return 0;

err_rq:
	kfree(vi->sq);
err_sq:
	return -ENOMEM;
}

/* Allocate the queue arrays, discover the virtqueues and set the queue
 * affinity. Returns 0 on success or the error of the failing step; the
 * queue arrays are freed again if virtqueue discovery fails.
 */
static int init_vqs(struct virtnet_info *vi)
{
	int err;

	/* Allocate send & receive queues */
	err = virtnet_alloc_queues(vi);
	if (err)
		return err;

	err = virtnet_find_vqs(vi);
	if (err) {
		virtnet_free_queues(vi);
		return err;
	}

	get_online_cpus();
	virtnet_set_affinity(vi);
	put_online_cpus();

	return 0;
}

2300 2301 2302 2303 2304 2305
#ifdef CONFIG_SYSFS
/* sysfs show handler: print the current mergeable receive buffer length of
 * the given RX queue, as computed by get_mergeable_buf_len().
 */
static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
		struct rx_queue_attribute *attribute, char *buf)
{
	struct virtnet_info *vi = netdev_priv(queue->dev);
	unsigned int idx = get_netdev_rx_queue_index(queue);
	struct ewma_pkt_len *avg_pkt_len;

	BUG_ON(idx >= vi->max_queue_pairs);

	avg_pkt_len = &vi->rq[idx].mrg_avg_pkt_len;

	return sprintf(buf, "%u\n",
		       get_mergeable_buf_len(&vi->rq[idx], avg_pkt_len));
}

/* Read-only per-RX-queue attribute backed by mergeable_rx_buffer_size_show(). */
static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
	__ATTR_RO(mergeable_rx_buffer_size);

/* NULL-terminated list of the attributes placed in the group below. */
static struct attribute *virtio_net_mrg_rx_attrs[] = {
	&mergeable_rx_buffer_size_attribute.attr,
	NULL
};

/* Attribute group named "virtio_net"; virtnet_probe() assigns it to
 * dev->sysfs_rx_queue_group when mergeable receive buffers are in use.
 */
static const struct attribute_group virtio_net_mrg_rx_group = {
	.name = "virtio_net",
	.attrs = virtio_net_mrg_rx_attrs
};
#endif

2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361
/* Check for a feature bit that must not be offered without its dependency.
 *
 * Returns true, after logging an error naming @fname and the missing
 * @dname, when the device offers @fbit; returns false otherwise.
 */
static bool virtnet_fail_on_feature(struct virtio_device *vdev,
				    unsigned int fbit,
				    const char *fname, const char *dname)
{
	bool offered = virtio_has_feature(vdev, fbit);

	if (offered)
		dev_err(&vdev->dev, "device advertises feature %s but not %s",
			fname, dname);

	return offered;
}

/* Wrapper around virtnet_fail_on_feature() that stringifies @fbit so the
 * error message shows the feature's symbolic name; @dbit is the name of the
 * required feature, passed as a string.
 */
#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)

/* Reject feature combinations that need the control virtqueue when the
 * device does not offer VIRTIO_NET_F_CTRL_VQ.
 *
 * Returns true if the offered feature set is consistent, false (after the
 * first offending feature has been logged) otherwise.
 */
static bool virtnet_validate_features(struct virtio_device *vdev)
{
	/* With a control vq every dependent feature is acceptable. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		return true;

	if (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
			    "VIRTIO_NET_F_CTRL_VQ"))
		return false;

	if (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
			    "VIRTIO_NET_F_CTRL_VQ"))
		return false;

	if (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
			    "VIRTIO_NET_F_CTRL_VQ"))
		return false;

	if (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ"))
		return false;

	if (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
			    "VIRTIO_NET_F_CTRL_VQ"))
		return false;

	return true;
}

2362 2363 2364
/* MTU bounds applied to the net_device in virtnet_probe(); MIN_MTU is also
 * the threshold virtnet_validate() uses to reject a device-supplied MTU.
 */
#define MIN_MTU ETH_MIN_MTU
#define MAX_MTU ETH_MAX_MTU

2365
static int virtnet_validate(struct virtio_device *vdev)
R
Rusty Russell 已提交
2366
{
2367 2368 2369 2370 2371 2372
	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

2373 2374 2375
	if (!virtnet_validate_features(vdev))
		return -EINVAL;

2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		int mtu = virtio_cread16(vdev,
					 offsetof(struct virtio_net_config,
						  mtu));
		if (mtu < MIN_MTU)
			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
	}

	return 0;
}

static int virtnet_probe(struct virtio_device *vdev)
{
	int i, err;
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;
	int mtu;

J
Jason Wang 已提交
2395
	/* Find if host supports multiqueue virtio_net device */
2396 2397 2398
	err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
				   struct virtio_net_config,
				   max_virtqueue_pairs, &max_queue_pairs);
J
Jason Wang 已提交
2399 2400 2401 2402 2403 2404

	/* We need at least 2 queue's */
	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;
R
Rusty Russell 已提交
2405 2406

	/* Allocate ourselves a network device with room for our info */
J
Jason Wang 已提交
2407
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
R
Rusty Russell 已提交
2408 2409 2410 2411
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
2412
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
2413
	dev->netdev_ops = &virtnet_netdev;
R
Rusty Russell 已提交
2414
	dev->features = NETIF_F_HIGHDMA;
2415

2416
	dev->ethtool_ops = &virtnet_ethtool_ops;
R
Rusty Russell 已提交
2417 2418 2419
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
2420
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
R
Rusty Russell 已提交
2421
		/* This opens up the world of extra features. */
J
Jason Wang 已提交
2422
		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
2423
		if (csum)
J
Jason Wang 已提交
2424
			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;
2425 2426

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
2427
			dev->hw_features |= NETIF_F_TSO | NETIF_F_UFO
R
Rusty Russell 已提交
2428 2429
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
2430
		/* Individual feature bits: what can host handle? */
2431 2432 2433 2434 2435 2436
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;
2437 2438
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UFO))
			dev->hw_features |= NETIF_F_UFO;
2439

2440 2441
		dev->features |= NETIF_F_GSO_ROBUST;

2442
		if (gso)
2443
			dev->features |= dev->hw_features & (NETIF_F_ALL_TSO|NETIF_F_UFO);
2444
		/* (!csum && gso) case will be fixed by register_netdev() */
R
Rusty Russell 已提交
2445
	}
2446 2447
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
		dev->features |= NETIF_F_RXCSUM;
R
Rusty Russell 已提交
2448

2449 2450
	dev->vlan_features = dev->features;

2451 2452 2453 2454
	/* MTU range: 68 - 65535 */
	dev->min_mtu = MIN_MTU;
	dev->max_mtu = MAX_MTU;

R
Rusty Russell 已提交
2455
	/* Configuration may specify what MAC to use.  Otherwise random. */
2456 2457 2458 2459 2460
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
		virtio_cread_bytes(vdev,
				   offsetof(struct virtio_net_config, mac),
				   dev->dev_addr, dev->addr_len);
	else
2461
		eth_hw_addr_random(dev);
R
Rusty Russell 已提交
2462 2463 2464 2465 2466

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
2467
	vdev->priv = vi;
2468 2469 2470 2471 2472
	vi->stats = alloc_percpu(struct virtnet_stats);
	err = -ENOMEM;
	if (vi->stats == NULL)
		goto free;

2473 2474 2475 2476 2477 2478 2479
	for_each_possible_cpu(i) {
		struct virtnet_stats *virtnet_stats;
		virtnet_stats = per_cpu_ptr(vi->stats, i);
		u64_stats_init(&virtnet_stats->tx_syncp);
		u64_stats_init(&virtnet_stats->rx_syncp);
	}

2480
	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
R
Rusty Russell 已提交
2481

2482
	/* If we can receive ANY GSO packets, we must allocate large ones. */
2483 2484
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
2485 2486
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
2487 2488
		vi->big_packets = true;

2489 2490 2491
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

2492 2493
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
2494 2495 2496 2497
		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		vi->hdr_len = sizeof(struct virtio_net_hdr);

2498 2499
	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
2500 2501
		vi->any_header_sg = true;

J
Jason Wang 已提交
2502 2503 2504
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

2505 2506 2507 2508
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		mtu = virtio_cread16(vdev,
				     offsetof(struct virtio_net_config,
					      mtu));
2509
		if (mtu < dev->min_mtu) {
2510 2511 2512 2513 2514 2515
			/* Should never trigger: MTU was previously validated
			 * in virtnet_validate.
			 */
			dev_err(&vdev->dev, "device MTU appears to have changed "
				"it is now %d < %d", mtu, dev->min_mtu);
			goto free_stats;
2516
		}
2517

2518 2519 2520
		dev->mtu = mtu;
		dev->max_mtu = mtu;

2521 2522 2523
		/* TODO: size buffers correctly in this case. */
		if (dev->mtu > ETH_DATA_LEN)
			vi->big_packets = true;
2524 2525
	}

2526 2527
	if (vi->any_header_sg)
		dev->needed_headroom = vi->hdr_len;
2528

2529 2530 2531 2532 2533
	/* Enable multiqueue by default */
	if (num_online_cpus() >= max_queue_pairs)
		vi->curr_queue_pairs = max_queue_pairs;
	else
		vi->curr_queue_pairs = num_online_cpus();
J
Jason Wang 已提交
2534 2535 2536
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
2537
	err = init_vqs(vi);
2538
	if (err)
2539
		goto free_stats;
R
Rusty Russell 已提交
2540

2541 2542 2543 2544
#ifdef CONFIG_SYSFS
	if (vi->mergeable_rx_bufs)
		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
#endif
2545 2546
	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);
J
Jason Wang 已提交
2547

2548 2549
	virtnet_init_settings(dev);

R
Rusty Russell 已提交
2550 2551 2552
	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
2553
		goto free_vqs;
R
Rusty Russell 已提交
2554
	}
2555

M
Michael S. Tsirkin 已提交
2556 2557
	virtio_device_ready(vdev);

2558
	err = virtnet_cpu_notif_add(vi);
2559 2560
	if (err) {
		pr_debug("virtio_net: registering cpu notifier failed\n");
2561
		goto free_unregister_netdev;
2562 2563
	}

2564
	virtnet_set_queues(vi, vi->curr_queue_pairs);
2565

J
Jason Wang 已提交
2566 2567 2568 2569
	/* Assume link up if device can't report link status,
	   otherwise get link status from config. */
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		netif_carrier_off(dev);
2570
		schedule_work(&vi->config_work);
J
Jason Wang 已提交
2571 2572 2573 2574
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
		netif_carrier_on(dev);
	}
2575

J
Jason Wang 已提交
2576 2577 2578
	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
		 dev->name, max_queue_pairs);

R
Rusty Russell 已提交
2579 2580
	return 0;

2581
free_unregister_netdev:
2582 2583
	vi->vdev->config->reset(vdev);

2584
	unregister_netdev(dev);
2585
free_vqs:
J
Jason Wang 已提交
2586
	cancel_delayed_work_sync(&vi->refill);
2587
	free_receive_page_frags(vi);
2588
	virtnet_del_vqs(vi);
2589 2590
free_stats:
	free_percpu(vi->stats);
R
Rusty Russell 已提交
2591 2592 2593 2594 2595
free:
	free_netdev(dev);
	return err;
}

2596 2597 2598 2599 2600 2601 2602 2603 2604
static void _remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);
	free_unused_bufs(vi);
	_free_receive_bufs(vi);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
}

2605
static void remove_vq_common(struct virtnet_info *vi)
R
Rusty Russell 已提交
2606
{
2607
	vi->vdev->config->reset(vi->vdev);
S
Shirley Ma 已提交
2608 2609

	/* Free unused buffers in both send and recv, if any. */
2610
	free_unused_bufs(vi);
2611

J
Jason Wang 已提交
2612
	free_receive_bufs(vi);
2613

2614 2615
	free_receive_page_frags(vi);

J
Jason Wang 已提交
2616
	virtnet_del_vqs(vi);
2617 2618
}

2619
static void virtnet_remove(struct virtio_device *vdev)
2620 2621 2622
{
	struct virtnet_info *vi = vdev->priv;

2623
	virtnet_cpu_notif_remove(vi);
2624

2625 2626
	/* Make sure no work handler is accessing the device. */
	flush_work(&vi->config_work);
2627

2628 2629 2630
	unregister_netdev(vi->dev);

	remove_vq_common(vi);
2631

2632
	free_percpu(vi->stats);
2633
	free_netdev(vi->dev);
R
Rusty Russell 已提交
2634 2635
}

2636
#ifdef CONFIG_PM_SLEEP
2637 2638 2639 2640
/* PM freeze callback: drop the CPU hotplug instance, quiesce the interface
 * via virtnet_freeze_down() and tear the virtqueues down. Always returns 0.
 */
static int virtnet_freeze(struct virtio_device *vdev)
{
	struct virtnet_info *info = vdev->priv;

	virtnet_cpu_notif_remove(info);
	virtnet_freeze_down(vdev);
	remove_vq_common(info);

	return 0;
}

static int virtnet_restore(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
2651
	int err;
2652

2653
	err = virtnet_restore_up(vdev);
2654 2655
	if (err)
		return err;
J
Jason Wang 已提交
2656 2657
	virtnet_set_queues(vi, vi->curr_queue_pairs);

2658
	err = virtnet_cpu_notif_add(vi);
2659 2660 2661
	if (err)
		return err;

2662 2663 2664 2665
	return 0;
}
#endif

R
Rusty Russell 已提交
2666 2667 2668 2669 2670
/* Devices this driver binds to: VIRTIO_ID_NET with any vendor id. The
 * zeroed entry terminates the table.
 */
static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682
/* Feature bits common to both feature tables below (features[] and
 * features_legacy[]). Expands to a comma-separated list without a trailing
 * comma.
 */
#define VIRTNET_FEATURES \
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
	VIRTIO_NET_F_MAC, \
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
	VIRTIO_NET_F_CTRL_MAC_ADDR, \
	VIRTIO_NET_F_MTU

2683
static unsigned int features[] = {
2684 2685 2686 2687 2688 2689
	VIRTNET_FEATURES,
};

static unsigned int features_legacy[] = {
	VIRTNET_FEATURES,
	VIRTIO_NET_F_GSO,
2690
	VIRTIO_F_ANY_LAYOUT,
2691 2692
};

2693
static struct virtio_driver virtio_net_driver = {
2694 2695
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
2696 2697
	.feature_table_legacy = features_legacy,
	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
R
Rusty Russell 已提交
2698 2699 2700
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
2701
	.validate =	virtnet_validate,
R
Rusty Russell 已提交
2702
	.probe =	virtnet_probe,
2703
	.remove =	virtnet_remove,
2704
	.config_changed = virtnet_config_changed,
2705
#ifdef CONFIG_PM_SLEEP
2706 2707 2708
	.freeze =	virtnet_freeze,
	.restore =	virtnet_restore,
#endif
R
Rusty Russell 已提交
2709 2710
};

2711 2712 2713 2714
static __init int virtio_net_driver_init(void)
{
	int ret;

T
Thomas Gleixner 已提交
2715
	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
2716 2717 2718 2719 2720
				      virtnet_cpu_online,
				      virtnet_cpu_down_prep);
	if (ret < 0)
		goto out;
	virtionet_online = ret;
T
Thomas Gleixner 已提交
2721
	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745
				      NULL, virtnet_cpu_dead);
	if (ret)
		goto err_dead;

        ret = register_virtio_driver(&virtio_net_driver);
	if (ret)
		goto err_virtio;
	return 0;
err_virtio:
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
err_dead:
	cpuhp_remove_multi_state(virtionet_online);
out:
	return ret;
}
module_init(virtio_net_driver_init);

/* Module exit: undo virtio_net_driver_init() in reverse order.
 *
 * The driver is unregistered first so that every device's remove callback,
 * which drops its CPU hotplug instance via virtnet_cpu_notif_remove(), runs
 * while the multi-instance states still exist. Only then are the states
 * themselves removed.
 */
static __exit void virtio_net_driver_exit(void)
{
	unregister_virtio_driver(&virtio_net_driver);
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
	cpuhp_remove_multi_state(virtionet_online);
}
module_exit(virtio_net_driver_exit);
R
Rusty Russell 已提交
2746 2747 2748 2749

/* Module metadata: export id_table as the virtio device table and declare
 * the module description and license.
 */
MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");