/* A network driver using virtio.
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
#include <linux/bpf.h>
#include <linux/scatterlist.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/average.h>
#include <net/busy_poll.h>

static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);

static bool csum = true, gso = true;
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);

/* FIXME: MTU in config. */
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
#define GOOD_COPY_LEN	128

/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
 */
DECLARE_EWMA(pkt_len, 1, 64)

/* With mergeable buffers we align buffer address and use the low bits to
 * encode its true size. Buffer size is up to 1 page so we need to align to
 * square root of page size to ensure we reserve enough bits to encode the true
 * size.
 */
#define MERGEABLE_BUFFER_MIN_ALIGN_SHIFT ((PAGE_SHIFT + 1) / 2)

/* Minimum alignment for mergeable packet buffers. */
#define MERGEABLE_BUFFER_ALIGN max(L1_CACHE_BYTES, \
				   1 << MERGEABLE_BUFFER_MIN_ALIGN_SHIFT)

#define VIRTNET_DRIVER_VERSION "1.0.0"

struct virtnet_stats {
	struct u64_stats_sync tx_syncp;
	struct u64_stats_sync rx_syncp;
	u64 tx_bytes;
	u64 tx_packets;

	u64 rx_bytes;
	u64 rx_packets;
};

/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send_queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of the send queue: output.$index */
	char name[40];
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

	struct napi_struct napi;

	struct bpf_prog __rcu *xdp_prog;

	/* Chain pages by the private ptr. */
	struct page *pages;

	/* Average packet length for mergeable receive buffers. */
	struct ewma_pkt_len mrg_avg_pkt_len;

	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of this receive queue: input.$index */
	char name[40];
};

struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
	struct send_queue *sq;
	struct receive_queue *rq;
	unsigned int status;

	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

	/* # of XDP queue pairs currently used by the driver */
	u16 xdp_queue_pairs;

	/* I like... big packets and I cannot lie! */
	bool big_packets;

	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

	/* Has control virtqueue */
	bool has_cvq;

	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

	/* Packet virtio header size */
	u8 hdr_len;

	/* Active statistics */
	struct virtnet_stats __percpu *stats;

	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

	/* Work struct for config space updates */
	struct work_struct config_work;

	/* Is the affinity hint set for virtqueues? */
	bool affinity_hint_set;

	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;

	/* Control VQ buffers: protected by the rtnl lock */
	struct virtio_net_ctrl_hdr ctrl_hdr;
	virtio_net_ctrl_ack ctrl_status;
	struct virtio_net_ctrl_mq ctrl_mq;
	u8 ctrl_promisc;
	u8 ctrl_allmulti;
	u16 ctrl_vid;

	/* Ethtool settings */
	u8 duplex;
	u32 speed;
};

struct padded_vnet_hdr {
	struct virtio_net_hdr_mrg_rxbuf hdr;
	/*
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header.
	 */
	char padding[4];
};
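
/* Note: sizeof(struct virtio_net_hdr_mrg_rxbuf) is 12 bytes (the 10-byte
 * virtio_net_hdr plus the 16-bit num_buffers field), so the 4 bytes of
 * padding above round the header area up to the 16-byte boundary the
 * comment refers to.
 */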

/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
	return (vq->index - 1) / 2;
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
	return vq->index / 2;
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}
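
/* Example of the mapping above with two queue pairs:
 *   vq 0 -> rx0, vq 1 -> tx0, vq 2 -> rx1, vq 3 -> tx1, vq 4 -> cvq
 * so vq2txq() on vq 3 gives (3 - 1) / 2 = 1 and rxq2vq(1) = 2.
 */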

static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
{
	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
}

/*
 * private is used to chain pages for big packets, putting the whole
 * most recently used list at the beginning for reuse
 */
static void give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end;

	/* Find end of list, sew whole thing into vi->rq.pages. */
	for (end = page; end->private; end = (struct page *)end->private);
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}

static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
{
	struct page *p = rq->pages;

	if (p) {
		rq->pages = (struct page *)p->private;
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
		p = alloc_page(gfp_mask);
	return p;
}

static void skb_xmit_done(struct virtqueue *vq)
{
	struct virtnet_info *vi = vq->vdev->priv;

	/* Suppress further interrupts. */
	virtqueue_disable_cb(vq);

	/* We were probably waiting for more output buffers. */
	netif_wake_subqueue(vi->dev, vq2txq(vq));
}

static unsigned int mergeable_ctx_to_buf_truesize(unsigned long mrg_ctx)
{
	unsigned int truesize = mrg_ctx & (MERGEABLE_BUFFER_ALIGN - 1);
	return (truesize + 1) * MERGEABLE_BUFFER_ALIGN;
}

static void *mergeable_ctx_to_buf_address(unsigned long mrg_ctx)
{
	return (void *)(mrg_ctx & -MERGEABLE_BUFFER_ALIGN);
}

static unsigned long mergeable_buf_to_ctx(void *buf, unsigned int truesize)
{
	unsigned int size = truesize / MERGEABLE_BUFFER_ALIGN;
	return (unsigned long)buf | (size - 1);
}
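
/* Worked example of the encoding above, assuming 4 KiB pages and 64-byte
 * cache lines: MERGEABLE_BUFFER_MIN_ALIGN_SHIFT = (12 + 1) / 2 = 6, so
 * MERGEABLE_BUFFER_ALIGN = max(64, 1 << 6) = 64.  A 64-byte-aligned buffer
 * at address A with truesize 1536 is stored as ctx = A | (1536 / 64 - 1)
 * = A | 23, and decoded as address = ctx & ~63, truesize = (23 + 1) * 64.
 */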

/* Called from bottom half context */
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
				   struct receive_queue *rq,
				   struct page *page, unsigned int offset,
				   unsigned int len, unsigned int truesize)
{
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	unsigned int copy, hdr_len, hdr_padded_len;
	char *p;

	p = page_address(page) + offset;

	/* copy small packet so we can reuse these pages for small data */
	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
	if (unlikely(!skb))
		return NULL;

	hdr = skb_vnet_hdr(skb);

	hdr_len = vi->hdr_len;
	if (vi->mergeable_rx_bufs)
		hdr_padded_len = sizeof *hdr;
	else
		hdr_padded_len = sizeof(struct padded_vnet_hdr);

	memcpy(hdr, p, hdr_len);

	len -= hdr_len;
	offset += hdr_padded_len;
	p += hdr_padded_len;

	copy = len;
	if (copy > skb_tailroom(skb))
		copy = skb_tailroom(skb);
	memcpy(skb_put(skb, copy), p, copy);

	len -= copy;
	offset += copy;

	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
			put_page(page);
		return skb;
	}

	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
		dev_kfree_skb(skb);
		return NULL;
	}
	BUG_ON(offset >= PAGE_SIZE);
	while (len) {
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
		page = (struct page *)page->private;
		offset = 0;
	}

	if (page)
		give_pages(rq, page);

	return skb;
}

static void virtnet_xdp_xmit(struct virtnet_info *vi,
			     struct receive_queue *rq,
			     struct send_queue *sq,
			     struct xdp_buff *xdp,
			     void *data)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	unsigned int num_sg, len;
	void *xdp_sent;
	int err;

	/* Free up any pending old buffers before queueing new ones. */
	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		if (vi->mergeable_rx_bufs) {
			struct page *sent_page = virt_to_head_page(xdp_sent);

			put_page(sent_page);
		} else { /* small buffer */
			struct sk_buff *skb = xdp_sent;

			kfree_skb(skb);
		}
	}

	if (vi->mergeable_rx_bufs) {
		/* Zero header and leave csum up to XDP layers */
		hdr = xdp->data;
		memset(hdr, 0, vi->hdr_len);

		num_sg = 1;
		sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
	} else { /* small buffer */
		struct sk_buff *skb = data;

		/* Zero header and leave csum up to XDP layers */
		hdr = skb_vnet_hdr(skb);
		memset(hdr, 0, vi->hdr_len);

		num_sg = 2;
		sg_init_table(sq->sg, 2);
		sg_set_buf(sq->sg, hdr, vi->hdr_len);
		skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
	}
	err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
				   data, GFP_ATOMIC);
	if (unlikely(err)) {
		if (vi->mergeable_rx_bufs) {
			struct page *page = virt_to_head_page(xdp->data);

			put_page(page);
		} else /* small buffer */
			kfree_skb(data);
		return; /* On error abort to avoid unnecessary kick */
	}

	virtqueue_kick(sq->vq);
}

static u32 do_xdp_prog(struct virtnet_info *vi,
		       struct receive_queue *rq,
		       struct bpf_prog *xdp_prog,
		       void *data, int len)
{
	int hdr_padded_len;
	struct xdp_buff xdp;
	void *buf;
	unsigned int qp;
	u32 act;

	if (vi->mergeable_rx_bufs) {
		hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
		xdp.data = data + hdr_padded_len;
		xdp.data_end = xdp.data + (len - vi->hdr_len);
		buf = data;
	} else { /* small buffers */
		struct sk_buff *skb = data;

		xdp.data = skb->data;
		xdp.data_end = xdp.data + len;
		buf = skb->data;
	}

	act = bpf_prog_run_xdp(xdp_prog, &xdp);
	switch (act) {
	case XDP_PASS:
		return XDP_PASS;
	case XDP_TX:
		qp = vi->curr_queue_pairs -
			vi->xdp_queue_pairs +
			smp_processor_id();
		xdp.data = buf;
		virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp, data);
		return XDP_TX;
	default:
		bpf_warn_invalid_xdp_action(act);
	case XDP_ABORTED:
	case XDP_DROP:
		return XDP_DROP;
	}
}
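
/* Note on the XDP_TX queue selection above: when a program is attached,
 * virtnet_xdp_set() below reserves one extra queue pair per possible CPU,
 * so with e.g. two CPUs and two pre-existing queue pairs curr_queue_pairs
 * becomes 4 and xdp_queue_pairs 2; CPU 1 then transmits on
 * sq[4 - 2 + 1] = sq[3], giving each CPU its own XDP transmit queue.
 */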

static struct sk_buff *receive_small(struct net_device *dev,
				     struct virtnet_info *vi,
				     struct receive_queue *rq,
				     void *buf, unsigned int len)
{
	struct sk_buff *skb = buf;
	struct bpf_prog *xdp_prog;

	len -= vi->hdr_len;
	skb_trim(skb, len);

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
		u32 act;

		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
			goto err_xdp;
		act = do_xdp_prog(vi, rq, xdp_prog, skb, len);
		switch (act) {
		case XDP_PASS:
			break;
		case XDP_TX:
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_DROP:
		default:
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	return skb;

err_xdp:
	rcu_read_unlock();
	dev->stats.rx_dropped++;
	kfree_skb(skb);
xdp_xmit:
	return NULL;
}

static struct sk_buff *receive_big(struct net_device *dev,
				   struct virtnet_info *vi,
				   struct receive_queue *rq,
				   void *buf,
				   unsigned int len)
{
	struct page *page = buf;
	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);

	if (unlikely(!skb))
		goto err;

	return skb;

err:
	dev->stats.rx_dropped++;
	give_pages(rq, page);
	return NULL;
}

/* The conditions to enable XDP should preclude the underlying device from
 * sending packets across multiple buffers (num_buf > 1). However per spec
 * it does not appear to be illegal to do so but rather just against convention.
 * So in order to avoid making a system unresponsive the packets are pushed
 * into a page and the XDP program is run. This will be extremely slow and we
 * push a warning to the user to fix this as soon as possible. Fixing this may
 * require resolving the underlying hardware to determine why multiple buffers
 * are being received or simply loading the XDP program in the ingress stack
 * after the skb is built because there is no advantage to running it here
 * anymore.
 */
static struct page *xdp_linearize_page(struct receive_queue *rq,
				       u16 *num_buf,
				       struct page *p,
				       int offset,
				       unsigned int *len)
{
	struct page *page = alloc_page(GFP_ATOMIC);
	unsigned int page_off = 0;

	if (!page)
		return NULL;

	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
	page_off += *len;

	while (--*num_buf) {
		unsigned int buflen;
		unsigned long ctx;
		void *buf;
		int off;

		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &buflen);
		if (unlikely(!ctx))
			goto err_buf;

		buf = mergeable_ctx_to_buf_address(ctx);
		p = virt_to_head_page(buf);
		off = buf - page_address(p);

		/* guard against a misconfigured or uncooperative backend that
		 * is sending packet larger than the MTU.
		 */
		if ((page_off + buflen) > PAGE_SIZE) {
			put_page(p);
			goto err_buf;
		}

		memcpy(page_address(page) + page_off,
		       page_address(p) + off, buflen);
		page_off += buflen;
		put_page(p);
	}

	*len = page_off;
	return page;
err_buf:
	__free_pages(page, 0);
	return NULL;
}

static struct sk_buff *receive_mergeable(struct net_device *dev,
					 struct virtnet_info *vi,
					 struct receive_queue *rq,
					 unsigned long ctx,
					 unsigned int len)
{
	void *buf = mergeable_ctx_to_buf_address(ctx);
	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
	struct sk_buff *head_skb, *curr_skb;
	struct bpf_prog *xdp_prog;
	unsigned int truesize;

	head_skb = NULL;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct page *xdp_page;
		u32 act;

		/* This happens when rx buffer size is underestimated */
		if (unlikely(num_buf > 1)) {
			/* linearize data for XDP */
			xdp_page = xdp_linearize_page(rq, &num_buf,
						      page, offset, &len);
			if (!xdp_page)
				goto err_xdp;
			offset = 0;
		} else {
			xdp_page = page;
		}

		/* Transient failure which in theory could occur if
		 * in-flight packets from before XDP was enabled reach
		 * the receive path after XDP is loaded. In practice I
		 * was not able to create this condition.
		 */
		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;

		act = do_xdp_prog(vi, rq, xdp_prog,
				  page_address(xdp_page) + offset, len);
		switch (act) {
		case XDP_PASS:
			/* We can only create skb based on xdp_page. */
			if (unlikely(xdp_page != page)) {
				rcu_read_unlock();
				put_page(page);
				head_skb = page_to_skb(vi, rq, xdp_page,
						       0, len, PAGE_SIZE);
				ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
				return head_skb;
			}
			break;
		case XDP_TX:
			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
			if (unlikely(xdp_page != page))
				goto err_xdp;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_DROP:
		default:
			if (unlikely(xdp_page != page))
				__free_pages(xdp_page, 0);
			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
	curr_skb = head_skb;

	if (unlikely(!curr_skb))
		goto err_skb;
	while (--num_buf) {
		int num_skb_frags;

		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!ctx)) {
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
				 dev->name, num_buf,
				 virtio16_to_cpu(vi->vdev,
						 hdr->num_buffers));
			dev->stats.rx_length_errors++;
			goto err_buf;
		}

		buf = mergeable_ctx_to_buf_address(ctx);
		page = virt_to_head_page(buf);

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);

			if (unlikely(!nskb))
				goto err_skb;
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
		truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
			head_skb->truesize += truesize;
		}
		offset = buf - page_address(page);
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
					     len, truesize);
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
					offset, len, truesize);
		}
	}

	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
	return head_skb;

err_xdp:
	rcu_read_unlock();
err_skb:
	put_page(page);
	while (--num_buf) {
		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!ctx)) {
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
		page = virt_to_head_page(mergeable_ctx_to_buf_address(ctx));
		put_page(page);
	}
err_buf:
	dev->stats.rx_dropped++;
	dev_kfree_skb(head_skb);
xdp_xmit:
	return NULL;
}

static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
			void *buf, unsigned int len)
{
	struct net_device *dev = vi->dev;
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;

	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
		if (vi->mergeable_rx_bufs) {
			unsigned long ctx = (unsigned long)buf;
			void *base = mergeable_ctx_to_buf_address(ctx);
			put_page(virt_to_head_page(base));
		} else if (vi->big_packets) {
			give_pages(rq, buf);
		} else {
			dev_kfree_skb(buf);
		}
		return;
	}

	if (vi->mergeable_rx_bufs)
		skb = receive_mergeable(dev, vi, rq, (unsigned long)buf, len);
	else if (vi->big_packets)
		skb = receive_big(dev, vi, rq, buf, len);
	else
		skb = receive_small(dev, vi, rq, buf, len);

	if (unlikely(!skb))
		return;

	hdr = skb_vnet_hdr(skb);

	u64_stats_update_begin(&stats->rx_syncp);
	stats->rx_bytes += skb->len;
	stats->rx_packets++;
	u64_stats_update_end(&stats->rx_syncp);

	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
		skb->ip_summed = CHECKSUM_UNNECESSARY;

	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
				  virtio_is_little_endian(vi->vdev))) {
		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
				     dev->name, hdr->hdr.gso_type,
				     hdr->hdr.gso_size);
		goto frame_err;
	}

	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

	napi_gro_receive(&rq->napi, skb);
	return;

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
}

static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
			     gfp_t gfp)
{
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	int err;

	skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp);
	if (unlikely(!skb))
		return -ENOMEM;

	skb_put(skb, GOOD_PACKET_LEN);

	hdr = skb_vnet_hdr(skb);
	sg_init_table(rq->sg, 2);
	sg_set_buf(rq->sg, hdr, vi->hdr_len);
	skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);

	err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
	if (err < 0)
		dev_kfree_skb(skb);

	return err;
}

static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
			   gfp_t gfp)
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);

	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
		first = get_a_page(rq, gfp);
		if (!first) {
			if (list)
				give_pages(rq, list);
			return -ENOMEM;
		}
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);

		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}

	first = get_a_page(rq, gfp);
	if (!first) {
		give_pages(rq, list);
		return -ENOMEM;
	}
	p = page_address(first);

	/* rq->sg[0], rq->sg[1] share the same page */
	/* a separated rq->sg[0] for header - required in case !any_header_sg */
	sg_set_buf(&rq->sg[0], p, vi->hdr_len);

	/* rq->sg[1] for data packet, from offset */
	offset = sizeof(struct padded_vnet_hdr);
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);

	/* chain first in list head */
	first->private = (unsigned long)list;
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
	if (err < 0)
		give_pages(rq, first);

	return err;
}

static unsigned int get_mergeable_buf_len(struct ewma_pkt_len *avg_pkt_len)
{
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int len;

	len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
			GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
	return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
}
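
/* Worked example for the sizing above, assuming 4 KiB pages: with the
 * 12-byte mergeable-buffer header and an EWMA average of ~1500 bytes,
 * len = 12 + clamp(1500, 1518, 4084) = 12 + 1518 = 1530, which ALIGN()
 * rounds up to 1536 given the 64-byte MERGEABLE_BUFFER_ALIGN assumed in
 * the example above.
 */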

static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
{
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
	unsigned long ctx;
	int err;
	unsigned int len, hole;

	len = get_mergeable_buf_len(&rq->mrg_avg_pkt_len);
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	ctx = mergeable_buf_to_ctx(buf, len);
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	hole = alloc_frag->size - alloc_frag->offset;
	if (hole < len) {
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
		 * the current buffer. This extra space is not included in
		 * the truesize stored in ctx.
		 */
		len += hole;
		alloc_frag->offset += hole;
	}

	sg_init_one(rq->sg, buf, len);
	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, (void *)ctx, gfp);
	if (err < 0)
		put_page(virt_to_head_page(buf));

	return err;
}

/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
{
	int err;
	bool oom;

	gfp |= __GFP_COLD;
	do {
		if (vi->mergeable_rx_bufs)
			err = add_recvbuf_mergeable(rq, gfp);
		else if (vi->big_packets)
			err = add_recvbuf_big(vi, rq, gfp);
		else
			err = add_recvbuf_small(vi, rq, gfp);

		oom = err == -ENOMEM;
		if (err)
			break;
	} while (rq->vq->num_free);
	virtqueue_kick(rq->vq);
	return !oom;
}

static void skb_recv_done(struct virtqueue *rvq)
{
	struct virtnet_info *vi = rvq->vdev->priv;
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];

	/* Schedule NAPI, Suppress further interrupts if successful. */
	if (napi_schedule_prep(&rq->napi)) {
		virtqueue_disable_cb(rvq);
		__napi_schedule(&rq->napi);
	}
}

static void virtnet_napi_enable(struct receive_queue *rq)
{
	napi_enable(&rq->napi);

	/* If all buffers were filled by other side before we napi_enabled, we
	 * won't get another interrupt, so process any outstanding packets
	 * now.  virtnet_poll wants to re-enable the queue, so we disable here.
	 * We synchronize against interrupts via NAPI_STATE_SCHED */
	if (napi_schedule_prep(&rq->napi)) {
		virtqueue_disable_cb(rq->vq);
		local_bh_disable();
		__napi_schedule(&rq->napi);
		local_bh_enable();
	}
}

static void refill_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
	bool still_empty;
	int i;

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct receive_queue *rq = &vi->rq[i];

		napi_disable(&rq->napi);
		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
		virtnet_napi_enable(rq);

		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
}

static int virtnet_receive(struct receive_queue *rq, int budget)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	unsigned int len, received = 0;
	void *buf;

	while (received < budget &&
	       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
		receive_buf(vi, rq, buf, len);
		received++;
	}

	if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
			schedule_delayed_work(&vi->refill, 0);
	}

	return received;
}

static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
	unsigned int r, received;

	received = virtnet_receive(rq, budget);

	/* Out of packets? */
	if (received < budget) {
		r = virtqueue_enable_cb_prepare(rq->vq);
		napi_complete_done(napi, received);
		if (unlikely(virtqueue_poll(rq->vq, r)) &&
		    napi_schedule_prep(napi)) {
			virtqueue_disable_cb(rq->vq);
			__napi_schedule(napi);
		}
	}

	return received;
}

#ifdef CONFIG_NET_RX_BUSY_POLL
/* must be called with local_bh_disable()d */
static int virtnet_busy_poll(struct napi_struct *napi)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
	struct virtnet_info *vi = rq->vq->vdev->priv;
	int r, received = 0, budget = 4;

	if (!(vi->status & VIRTIO_NET_S_LINK_UP))
		return LL_FLUSH_FAILED;

	if (!napi_schedule_prep(napi))
		return LL_FLUSH_BUSY;

	virtqueue_disable_cb(rq->vq);

again:
	received += virtnet_receive(rq, budget);

	r = virtqueue_enable_cb_prepare(rq->vq);
	clear_bit(NAPI_STATE_SCHED, &napi->state);
	if (unlikely(virtqueue_poll(rq->vq, r)) &&
	    napi_schedule_prep(napi)) {
		virtqueue_disable_cb(rq->vq);
		if (received < budget) {
			budget -= received;
			goto again;
		} else {
			__napi_schedule(napi);
		}
	}

	return received;
}
#endif	/* CONFIG_NET_RX_BUSY_POLL */

static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);
		virtnet_napi_enable(&vi->rq[i]);
	}

	return 0;
}

static void free_old_xmit_skbs(struct send_queue *sq)
{
	struct sk_buff *skb;
	unsigned int len;
	struct virtnet_info *vi = sq->vq->vdev->priv;
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);

	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		pr_debug("Sent skb %p\n", skb);

		u64_stats_update_begin(&stats->tx_syncp);
		stats->tx_bytes += skb->len;
		stats->tx_packets++;
		u64_stats_update_end(&stats->tx_syncp);

		dev_kfree_skb_any(skb);
	}
}

static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
	struct virtnet_info *vi = sq->vq->vdev->priv;
	unsigned num_sg;
	unsigned hdr_len = vi->hdr_len;
	bool can_push;

	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);

	can_push = vi->any_header_sg &&
		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
	else
		hdr = skb_vnet_hdr(skb);

	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
				    virtio_is_little_endian(vi->vdev), false))
		BUG();

	if (vi->mergeable_rx_bufs)
		hdr->num_buffers = 0;

	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
	if (can_push) {
		__skb_push(skb, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
		/* Pull header back to avoid skew in tx bytes calculations. */
		__skb_pull(skb, hdr_len);
	} else {
		sg_set_buf(sq->sg, hdr, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
	}
	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
}
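
/* Note on the two paths above: when can_push is true the virtio header is
 * built in the skb's headroom, so a single skb_to_sgvec() call describes
 * header and data (nr_frags + 1 entries); otherwise the header stays in
 * skb->cb (see skb_vnet_hdr()) and needs its own sg entry, hence
 * nr_frags + 2.
 */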

static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
	int err;
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
	bool kick = !skb->xmit_more;

	/* Free up any pending old buffers before queueing new ones. */
	free_old_xmit_skbs(sq);

	/* timestamp packet in software */
	skb_tx_timestamp(skb);

	/* Try to transmit */
	err = xmit_skb(sq, skb);

	/* This should not happen! */
	if (unlikely(err)) {
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
				 "Unexpected TXQ (%d) queue failure: %d\n", qnum, err);
		dev->stats.tx_dropped++;
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}

	/* Don't wait up for transmitted skbs to be freed. */
	skb_orphan(skb);
	nf_reset(skb);

	/* If running out of space, stop queue to avoid getting packets that we
	 * are then unable to transmit.
	 * An alternative would be to force queuing layer to requeue the skb by
	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
	 * returned in a normal path of operation: it means that driver is not
	 * maintaining the TX queue stop/start state properly, and causes
	 * the stack to do a non-trivial amount of useless work.
	 * Since most packets only take 1 or 2 ring slots, stopping the queue
	 * early means 16 slots are typically wasted.
1176
	 */
1177
	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
J
1179
		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
1180
			/* More just got used, free them then recheck. */
1181 1182
			free_old_xmit_skbs(sq);
			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
J
1184
				virtqueue_disable_cb(sq->vq);
1185 1186
			}
		}
1187
	}
1188

1189
	if (kick || netif_xmit_stopped(txq))
1190
		virtqueue_kick(sq->vq);
R
1192
	return NETDEV_TX_OK;
1193 1194
}

1195 1196 1197
/*
 * Send command via the control virtqueue and check status.  Commands
 * supported by the hypervisor, as indicated by feature bits, should
S
stephen hemminger 已提交
1198
 * never fail unless improperly formatted.
1199 1200
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
1201
				 struct scatterlist *out)
1202
{
1203
	struct scatterlist *sgs[4], hdr, stat;
1204
	unsigned out_num = 0, tmp;
1205 1206

	/* Caller should know better */
1207
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
1208

1209 1210 1211
	vi->ctrl_status = ~0;
	vi->ctrl_hdr.class = class;
	vi->ctrl_hdr.cmd = cmd;
1212
	/* Add header */
1213
	sg_init_one(&hdr, &vi->ctrl_hdr, sizeof(vi->ctrl_hdr));
1214
	sgs[out_num++] = &hdr;
1215

1216 1217
	if (out)
		sgs[out_num++] = out;
1218

1219
	/* Add return status. */
1220
	sg_init_one(&stat, &vi->ctrl_status, sizeof(vi->ctrl_status));
1221
	sgs[out_num] = &stat;
1222

1223
	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
1224
	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
1225

1226
	if (unlikely(!virtqueue_kick(vi->cvq)))
1227
		return vi->ctrl_status == VIRTIO_NET_OK;
1228 1229 1230 1231

	/* Spin for a response, the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
1232 1233
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
1234 1235
		cpu_relax();

1236
	return vi->ctrl_status == VIRTIO_NET_OK;
1237 1238
}

1239 1240 1241 1242
static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
1243
	int ret;
1244
	struct sockaddr *addr;
1245
	struct scatterlist sg;
1246

1247 1248 1249 1250 1251 1252
	addr = kmalloc(sizeof(*addr), GFP_KERNEL);
	if (!addr)
		return -ENOMEM;
	memcpy(addr, p, sizeof(*addr));

	ret = eth_prepare_mac_addr_change(dev, addr);
1253
	if (ret)
1254
		goto out;
1255

1256 1257 1258
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
		sg_init_one(&sg, addr->sa_data, dev->addr_len);
		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1259
					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
1260 1261
			dev_warn(&vdev->dev,
				 "Failed to set mac address by vq command.\n");
1262 1263
			ret = -EINVAL;
			goto out;
1264
		}
1265 1266
	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1267 1268 1269 1270 1271 1272 1273
		unsigned int i;

		/* Naturally, this has an atomicity problem. */
		for (i = 0; i < dev->addr_len; i++)
			virtio_cwrite8(vdev,
				       offsetof(struct virtio_net_config, mac) +
				       i, addr->sa_data[i]);
1274 1275 1276
	}

	eth_commit_mac_addr_change(dev, p);
1277
	ret = 0;
1278

1279 1280 1281
out:
	kfree(addr);
	return ret;
1282 1283
}

1284 1285 1286 1287 1288 1289 1290 1291
static struct rtnl_link_stats64 *virtnet_stats(struct net_device *dev,
					       struct rtnl_link_stats64 *tot)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int cpu;
	unsigned int start;

	for_each_possible_cpu(cpu) {
E
1293 1294 1295
		u64 tpackets, tbytes, rpackets, rbytes;

		do {
1296
			start = u64_stats_fetch_begin_irq(&stats->tx_syncp);
1297 1298
			tpackets = stats->tx_packets;
			tbytes   = stats->tx_bytes;
1299
		} while (u64_stats_fetch_retry_irq(&stats->tx_syncp, start));
1300 1301

		do {
1302
			start = u64_stats_fetch_begin_irq(&stats->rx_syncp);
1303 1304
			rpackets = stats->rx_packets;
			rbytes   = stats->rx_bytes;
1305
		} while (u64_stats_fetch_retry_irq(&stats->rx_syncp, start));
1306 1307 1308 1309 1310 1311 1312 1313

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes   += rbytes;
		tot->tx_bytes   += tbytes;
	}

	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->rx_dropped = dev->stats.rx_dropped;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;

	return tot;
}

#ifdef CONFIG_NET_POLL_CONTROLLER
static void virtnet_netpoll(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->curr_queue_pairs; i++)
		napi_schedule(&vi->rq[i].napi);
}
#endif

static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}

static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	struct scatterlist sg;
	struct net_device *dev = vi->dev;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

	vi->ctrl_mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
	sg_init_one(&sg, &vi->ctrl_mq, sizeof(vi->ctrl_mq));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
		dev_warn(&dev->dev, "Failed to set the number of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
	} else {
		vi->curr_queue_pairs = queue_pairs;
		/* virtnet_open() will refill when device is going to up. */
		if (dev->flags & IFF_UP)
			schedule_delayed_work(&vi->refill, 0);
	}

	return 0;
}

static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);

	for (i = 0; i < vi->max_queue_pairs; i++)
		napi_disable(&vi->rq[i].napi);

	return 0;
}

static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg[2];
	struct virtio_net_ctrl_mac *mac_data;
	struct netdev_hw_addr *ha;
	int uc_count;
	int mc_count;
	void *buf;
	int i;

	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

	vi->ctrl_promisc = ((dev->flags & IFF_PROMISC) != 0);
	vi->ctrl_allmulti = ((dev->flags & IFF_ALLMULTI) != 0);

	sg_init_one(sg, &vi->ctrl_promisc, sizeof(vi->ctrl_promisc));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
			 vi->ctrl_promisc ? "en" : "dis");

	sg_init_one(sg, &vi->ctrl_allmulti, sizeof(vi->ctrl_allmulti));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
			 vi->ctrl_allmulti ? "en" : "dis");

	uc_count = netdev_uc_count(dev);
	mc_count = netdev_mc_count(dev);
	/* MAC filter - use one buffer for both lists */
	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
	mac_data = buf;
	if (!buf)
		return;

	sg_init_table(sg, 2);

	/* Store the unicast list and count in the front of the buffer */
	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
	i = 0;
	netdev_for_each_uc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[0], mac_data,
		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));

	/* multicast list and count fill the end */
	mac_data = (void *)&mac_data->macs[uc_count][0];

	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
	i = 0;
	netdev_for_each_mc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[1], mac_data,
		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");

	kfree(buf);
}

static int virtnet_vlan_rx_add_vid(struct net_device *dev,
				   __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl_vid = vid;
	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
	return 0;
}

static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
				    __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl_vid = vid;
	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
	return 0;
}

static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu)
{
	int i;

	if (vi->affinity_hint_set) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtqueue_set_affinity(vi->rq[i].vq, -1);
			virtqueue_set_affinity(vi->sq[i].vq, -1);
		}

		vi->affinity_hint_set = false;
	}
}

static void virtnet_set_affinity(struct virtnet_info *vi)
{
	int i;
	int cpu;

	/* In multiqueue mode, when the number of cpu is equal to the number of
	 * queue pairs, we let the queue pairs to be private to one cpu by
	 * setting the affinity hint to eliminate the contention.
	 */
	if (vi->curr_queue_pairs == 1 ||
	    vi->max_queue_pairs != num_online_cpus()) {
		virtnet_clean_affinity(vi, -1);
		return;
	}

	i = 0;
	for_each_online_cpu(cpu) {
		virtqueue_set_affinity(vi->rq[i].vq, cpu);
		virtqueue_set_affinity(vi->sq[i].vq, cpu);
		netif_set_xps_queue(vi->dev, cpumask_of(cpu), i);
		i++;
	}

	vi->affinity_hint_set = true;
}

static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node_dead);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);

	virtnet_clean_affinity(vi, cpu);
	return 0;
}

static enum cpuhp_state virtionet_online;

static int virtnet_cpu_notif_add(struct virtnet_info *vi)
{
	int ret;

	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
	if (ret)
		return ret;
	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					       &vi->node_dead);
	if (!ret)
		return ret;
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	return ret;
}

static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
{
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					    &vi->node_dead);
}

static void virtnet_get_ringparam(struct net_device *dev,
				struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}


static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));

}

/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
		return -EINVAL;

	/* For now we don't support modifying channels while XDP is loaded
	 * also when XDP is loaded all RX queues have XDP programs so we only
	 * need to check a single RX queue.
	 */
	if (vi->rq[0].xdp_prog)
		return -EINVAL;

	get_online_cpus();
	err = virtnet_set_queues(vi, queue_pairs);
	if (!err) {
		netif_set_real_num_tx_queues(dev, queue_pairs);
		netif_set_real_num_rx_queues(dev, queue_pairs);

		virtnet_set_affinity(vi);
	}
	put_online_cpus();

	return err;
}

static void virtnet_get_channels(struct net_device *dev,
				 struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);

	channels->combined_count = vi->curr_queue_pairs;
	channels->max_combined = vi->max_queue_pairs;
	channels->max_other = 0;
	channels->rx_count = 0;
	channels->tx_count = 0;
	channels->other_count = 0;
}

/* Check if the user is trying to change anything besides speed/duplex */
static bool virtnet_validate_ethtool_cmd(const struct ethtool_cmd *cmd)
{
	struct ethtool_cmd diff1 = *cmd;
	struct ethtool_cmd diff2 = {};

	/* cmd is always set so we need to clear it, validate the port type
	 * and also without autonegotiation we can ignore advertising
	 */
	ethtool_cmd_speed_set(&diff1, 0);
	diff2.port = PORT_OTHER;
	diff1.advertising = 0;
	diff1.duplex = 0;
	diff1.cmd = 0;

	return !memcmp(&diff1, &diff2, sizeof(diff1));
}

static int virtnet_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u32 speed;

	speed = ethtool_cmd_speed(cmd);
	/* don't allow custom speed and duplex */
	if (!ethtool_validate_speed(speed) ||
	    !ethtool_validate_duplex(cmd->duplex) ||
	    !virtnet_validate_ethtool_cmd(cmd))
		return -EINVAL;
	vi->speed = speed;
	vi->duplex = cmd->duplex;

	return 0;
}

static int virtnet_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ethtool_cmd_speed_set(cmd, vi->speed);
	cmd->duplex = vi->duplex;
	cmd->port = PORT_OTHER;

	return 0;
}

static void virtnet_init_settings(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	vi->speed = SPEED_UNKNOWN;
	vi->duplex = DUPLEX_UNKNOWN;
}

static const struct ethtool_ops virtnet_ethtool_ops = {
	.get_drvinfo = virtnet_get_drvinfo,
	.get_link = ethtool_op_get_link,
	.get_ringparam = virtnet_get_ringparam,
	.set_channels = virtnet_set_channels,
	.get_channels = virtnet_get_channels,
	.get_ts_info = ethtool_op_get_ts_info,
	.get_settings = virtnet_get_settings,
	.set_settings = virtnet_set_settings,
};

static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
{
	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
	struct virtnet_info *vi = netdev_priv(dev);
	struct bpf_prog *old_prog;
	u16 xdp_qp = 0, curr_qp;
	int i, err;

	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) {
		netdev_warn(dev, "can't set XDP while host is implementing LRO, disable LRO first\n");
		return -EOPNOTSUPP;
	}

	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
		netdev_warn(dev, "XDP expects header/data in single page, any_header_sg required\n");
		return -EINVAL;
	}

	if (dev->mtu > max_sz) {
		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
		return -EINVAL;
	}

	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
	if (prog)
		xdp_qp = nr_cpu_ids;

	/* XDP requires extra queues for XDP_TX */
	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
		netdev_warn(dev, "request %i queues but max is %i\n",
			    curr_qp + xdp_qp, vi->max_queue_pairs);
		return -ENOMEM;
	}

	err = virtnet_set_queues(vi, curr_qp + xdp_qp);
	if (err) {
		dev_warn(&dev->dev, "XDP Device queue allocation failure.\n");
		return err;
	}

	if (prog) {
		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
1755 1756
		if (IS_ERR(prog)) {
			virtnet_set_queues(vi, curr_qp);
J
John Fastabend 已提交
1757
			return PTR_ERR(prog);
1758
		}
J
John Fastabend 已提交
1759 1760
	}

1761 1762 1763
	vi->xdp_queue_pairs = xdp_qp;
	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);

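	/* Publish the new program on every receive queue; each queue holds
	 * its own reference to it.
	 */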
	for (i = 0; i < vi->max_queue_pairs; i++) {
		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
		if (old_prog)
			bpf_prog_put(old_prog);
	}

	return 0;
}

static bool virtnet_xdp_query(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (vi->rq[i].xdp_prog)
			return true;
	}
	return false;
}

static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return virtnet_xdp_set(dev, xdp->prog);
	case XDP_QUERY_PROG:
		xdp->prog_attached = virtnet_xdp_query(dev);
		return 0;
	default:
		return -EINVAL;
	}
}

static const struct net_device_ops virtnet_netdev = {
	.ndo_open            = virtnet_open,
	.ndo_stop   	     = virtnet_close,
	.ndo_start_xmit      = start_xmit,
	.ndo_validate_addr   = eth_validate_addr,
	.ndo_set_mac_address = virtnet_set_mac_address,
	.ndo_set_rx_mode     = virtnet_set_rx_mode,
	.ndo_get_stats64     = virtnet_stats,
	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = virtnet_netpoll,
#endif
#ifdef CONFIG_NET_RX_BUSY_POLL
	.ndo_busy_poll		= virtnet_busy_poll,
#endif
	.ndo_xdp		= virtnet_xdp,
};

static void virtnet_config_changed_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, config_work);
	u16 v;

	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
				 struct virtio_net_config, status, &v) < 0)
		return;

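	/* The host requests an announcement (e.g. after migration): notify
	 * peers with gratuitous packets and acknowledge the request.
	 */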
	if (v & VIRTIO_NET_S_ANNOUNCE) {
		netdev_notify_peers(vi->dev);
		virtnet_ack_link_announce(vi);
	}

	/* Ignore unknown (future) status bits */
	v &= VIRTIO_NET_S_LINK_UP;

	if (vi->status == v)
		return;

	vi->status = v;

	if (vi->status & VIRTIO_NET_S_LINK_UP) {
		netif_carrier_on(vi->dev);
		netif_tx_wake_all_queues(vi->dev);
	} else {
		netif_carrier_off(vi->dev);
		netif_tx_stop_all_queues(vi->dev);
	}
}

static void virtnet_config_changed(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	schedule_work(&vi->config_work);
}

static void virtnet_free_queues(struct virtnet_info *vi)
{
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		napi_hash_del(&vi->rq[i].napi);
		netif_napi_del(&vi->rq[i].napi);
	}

	/* We called napi_hash_del() before netif_napi_del(),
	 * we need to respect an RCU grace period before freeing vi->rq
	 */
	synchronize_net();

	kfree(vi->rq);
	kfree(vi->sq);
}

static void free_receive_bufs(struct virtnet_info *vi)
{
	struct bpf_prog *old_prog;
	int i;

	rtnl_lock();
	for (i = 0; i < vi->max_queue_pairs; i++) {
		while (vi->rq[i].pages)
			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);

		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
		if (old_prog)
			bpf_prog_put(old_prog);
	}
	rtnl_unlock();
}

static void free_receive_page_frags(struct virtnet_info *vi)
{
	int i;
	for (i = 0; i < vi->max_queue_pairs; i++)
		if (vi->rq[i].alloc_frag.page)
			put_page(vi->rq[i].alloc_frag.page);
}

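/* The last xdp_queue_pairs entries of the active TX queues are the ones
 * reserved for XDP_TX.
 */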
static bool is_xdp_queue(struct virtnet_info *vi, int q)
{
	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
		return false;
	else if (q < vi->curr_queue_pairs)
		return true;
	else
		return false;
}

static void free_unused_bufs(struct virtnet_info *vi)
{
	void *buf;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->sq[i].vq;
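		/* XDP TX queues hold raw pages rather than skbs, so free any
		 * leftover buffers accordingly.
		 */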
		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (!is_xdp_queue(vi, i))
				dev_kfree_skb(buf);
			else
				put_page(virt_to_head_page(buf));
		}
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->rq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (vi->mergeable_rx_bufs) {
				unsigned long ctx = (unsigned long)buf;
				void *base = mergeable_ctx_to_buf_address(ctx);
				put_page(virt_to_head_page(base));
			} else if (vi->big_packets) {
				give_pages(&vi->rq[i], buf);
			} else {
				dev_kfree_skb(buf);
			}
		}
	}
}

static void virtnet_del_vqs(struct virtnet_info *vi)
{
	struct virtio_device *vdev = vi->vdev;

	virtnet_clean_affinity(vi, -1);

	vdev->config->del_vqs(vdev);

	virtnet_free_queues(vi);
}

static int virtnet_find_vqs(struct virtnet_info *vi)
{
	vq_callback_t **callbacks;
	struct virtqueue **vqs;
	int ret = -ENOMEM;
	int i, total_vqs;
	const char **names;

	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
	 * possible control vq.
	 */
	total_vqs = vi->max_queue_pairs * 2 +
		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);

	/* Allocate space for find_vqs parameters */
	vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
	if (!vqs)
		goto err_vq;
	callbacks = kmalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
	if (!callbacks)
		goto err_callback;
	names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);
	if (!names)
		goto err_names;

	/* Parameters for control virtqueue, if any */
	if (vi->has_cvq) {
		callbacks[total_vqs - 1] = NULL;
		names[total_vqs - 1] = "control";
	}

	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
	}

	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
					 names);
	if (ret)
		goto err_find;

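	/* The control virtqueue, when present, is always the last one */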
	if (vi->has_cvq) {
		vi->cvq = vqs[total_vqs - 1];
		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].vq = vqs[rxq2vq(i)];
		vi->sq[i].vq = vqs[txq2vq(i)];
	}

	kfree(names);
	kfree(callbacks);
	kfree(vqs);

	return 0;

err_find:
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;
}

static int virtnet_alloc_queues(struct virtnet_info *vi)
{
	int i;

	vi->sq = kzalloc(sizeof(*vi->sq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->sq)
		goto err_sq;
	vi->rq = kzalloc(sizeof(*vi->rq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->rq)
		goto err_rq;

	INIT_DELAYED_WORK(&vi->refill, refill_work);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].pages = NULL;
		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
			       napi_weight);

		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
	}

	return 0;

err_rq:
	kfree(vi->sq);
err_sq:
	return -ENOMEM;
}

static int init_vqs(struct virtnet_info *vi)
{
	int ret;

	/* Allocate send & receive queues */
	ret = virtnet_alloc_queues(vi);
	if (ret)
		goto err;

	ret = virtnet_find_vqs(vi);
	if (ret)
		goto err_free;

	get_online_cpus();
	virtnet_set_affinity(vi);
	put_online_cpus();

	return 0;

err_free:
	virtnet_free_queues(vi);
err:
	return ret;
}

#ifdef CONFIG_SYSFS
static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
		struct rx_queue_attribute *attribute, char *buf)
{
	struct virtnet_info *vi = netdev_priv(queue->dev);
	unsigned int queue_index = get_netdev_rx_queue_index(queue);
	struct ewma_pkt_len *avg;

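	/* Report the buffer size the EWMA of packet length currently predicts
	 * for this queue's refills.
	 */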
	BUG_ON(queue_index >= vi->max_queue_pairs);
	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
	return sprintf(buf, "%u\n", get_mergeable_buf_len(avg));
}

static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
	__ATTR_RO(mergeable_rx_buffer_size);

static struct attribute *virtio_net_mrg_rx_attrs[] = {
	&mergeable_rx_buffer_size_attribute.attr,
	NULL
};

static const struct attribute_group virtio_net_mrg_rx_group = {
	.name = "virtio_net",
	.attrs = virtio_net_mrg_rx_attrs
};
#endif

static bool virtnet_fail_on_feature(struct virtio_device *vdev,
				    unsigned int fbit,
				    const char *fname, const char *dname)
{
	if (!virtio_has_feature(vdev, fbit))
		return false;

	dev_err(&vdev->dev, "device advertises feature %s but not %s",
		fname, dname);

	return true;
}

#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)

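/* Reject devices that advertise features depending on the control virtqueue
 * without advertising VIRTIO_NET_F_CTRL_VQ itself.
 */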
static bool virtnet_validate_features(struct virtio_device *vdev)
{
	if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
	    (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
			     "VIRTIO_NET_F_CTRL_VQ"))) {
		return false;
	}

	return true;
}

#define MIN_MTU ETH_MIN_MTU
#define MAX_MTU ETH_MAX_MTU

static int virtnet_probe(struct virtio_device *vdev)
{
	int i, err;
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;
	int mtu;

	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

	if (!virtnet_validate_features(vdev))
		return -EINVAL;

	/* Find if host supports multiqueue virtio_net device */
	err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
				   struct virtio_net_config,
				   max_virtqueue_pairs, &max_queue_pairs);

	/* We need at least 2 queue's */
	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;

	/* Allocate ourselves a network device with room for our info */
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
	dev->netdev_ops = &virtnet_netdev;
	dev->features = NETIF_F_HIGHDMA;

	dev->ethtool_ops = &virtnet_ethtool_ops;
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
		/* This opens up the world of extra features. */
		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
		if (csum)
			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
			dev->hw_features |= NETIF_F_TSO | NETIF_F_UFO
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
		/* Individual feature bits: what can host handle? */
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UFO))
			dev->hw_features |= NETIF_F_UFO;

		dev->features |= NETIF_F_GSO_ROBUST;

		if (gso)
			dev->features |= dev->hw_features & (NETIF_F_ALL_TSO|NETIF_F_UFO);
		/* (!csum && gso) case will be fixed by register_netdev() */
	}
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
		dev->features |= NETIF_F_RXCSUM;

	dev->vlan_features = dev->features;

	/* MTU range: 68 - 65535 */
	dev->min_mtu = MIN_MTU;
	dev->max_mtu = MAX_MTU;

	/* Configuration may specify what MAC to use.  Otherwise random. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
		virtio_cread_bytes(vdev,
				   offsetof(struct virtio_net_config, mac),
				   dev->dev_addr, dev->addr_len);
	else
		eth_hw_addr_random(dev);

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
	vdev->priv = vi;
	vi->stats = alloc_percpu(struct virtnet_stats);
	err = -ENOMEM;
	if (vi->stats == NULL)
		goto free;

	for_each_possible_cpu(i) {
		struct virtnet_stats *virtnet_stats;
		virtnet_stats = per_cpu_ptr(vi->stats, i);
		u64_stats_init(&virtnet_stats->tx_syncp);
		u64_stats_init(&virtnet_stats->rx_syncp);
	}

	INIT_WORK(&vi->config_work, virtnet_config_changed_work);

	/* If we can receive ANY GSO packets, we must allocate large ones. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
		vi->big_packets = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		vi->hdr_len = sizeof(struct virtio_net_hdr);

	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->any_header_sg = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		mtu = virtio_cread16(vdev,
				     offsetof(struct virtio_net_config,
					      mtu));
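		/* An out-of-range device MTU is ignored: clearing the feature
		 * bit keeps the default MTU instead of failing the probe.
		 */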
		if (mtu < dev->min_mtu) {
			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
		} else {
			dev->mtu = mtu;
			dev->max_mtu = mtu;
		}
	}

	if (vi->any_header_sg)
		dev->needed_headroom = vi->hdr_len;

	/* Enable multiqueue by default */
	if (num_online_cpus() >= max_queue_pairs)
		vi->curr_queue_pairs = max_queue_pairs;
	else
		vi->curr_queue_pairs = num_online_cpus();
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
	err = init_vqs(vi);
	if (err)
		goto free_stats;

#ifdef CONFIG_SYSFS
	if (vi->mergeable_rx_bufs)
		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
#endif
	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);

	virtnet_init_settings(dev);

	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
		goto free_vqs;
	}

	virtio_device_ready(vdev);

	err = virtnet_cpu_notif_add(vi);
	if (err) {
		pr_debug("virtio_net: registering cpu notifier failed\n");
		goto free_unregister_netdev;
	}

	rtnl_lock();
	virtnet_set_queues(vi, vi->curr_queue_pairs);
	rtnl_unlock();

	/* Assume link up if device can't report link status,
	 * otherwise get link status from config.
	 */
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		netif_carrier_off(dev);
		schedule_work(&vi->config_work);
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
		netif_carrier_on(dev);
	}

	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
		 dev->name, max_queue_pairs);

	return 0;

free_unregister_netdev:
	vi->vdev->config->reset(vdev);

	unregister_netdev(dev);
free_vqs:
	cancel_delayed_work_sync(&vi->refill);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
free_stats:
	free_percpu(vi->stats);
free:
	free_netdev(dev);
	return err;
}

static void remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);

	/* Free unused buffers in both send and recv, if any. */
	free_unused_bufs(vi);

	free_receive_bufs(vi);

	free_receive_page_frags(vi);

	virtnet_del_vqs(vi);
}

static void virtnet_remove(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device. */
	flush_work(&vi->config_work);

	unregister_netdev(vi->dev);

	remove_vq_common(vi);

	free_percpu(vi->stats);
	free_netdev(vi->dev);
}

#ifdef CONFIG_PM_SLEEP
static int virtnet_freeze(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int i;

	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device */
	flush_work(&vi->config_work);

	netif_device_detach(vi->dev);
	cancel_delayed_work_sync(&vi->refill);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++)
			napi_disable(&vi->rq[i].napi);
	}

	remove_vq_common(vi);

	return 0;
}

static int virtnet_restore(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err, i;

	err = init_vqs(vi);
	if (err)
		return err;

	virtio_device_ready(vdev);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->curr_queue_pairs; i++)
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		for (i = 0; i < vi->max_queue_pairs; i++)
			virtnet_napi_enable(&vi->rq[i]);
	}

	netif_device_attach(vi->dev);

	rtnl_lock();
	virtnet_set_queues(vi, vi->curr_queue_pairs);
	rtnl_unlock();

	err = virtnet_cpu_notif_add(vi);
	if (err)
		return err;

	return 0;
}
#endif

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

#define VIRTNET_FEATURES \
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
	VIRTIO_NET_F_MAC, \
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
	VIRTIO_NET_F_CTRL_MAC_ADDR, \
	VIRTIO_NET_F_MTU

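/* Modern (VIRTIO_F_VERSION_1) devices negotiate from 'features'; legacy
 * devices may additionally offer GSO and ANY_LAYOUT.
 */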
static unsigned int features[] = {
	VIRTNET_FEATURES,
};

static unsigned int features_legacy[] = {
	VIRTNET_FEATURES,
	VIRTIO_NET_F_GSO,
	VIRTIO_F_ANY_LAYOUT,
};

static struct virtio_driver virtio_net_driver = {
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
	.feature_table_legacy = features_legacy,
	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
	.probe =	virtnet_probe,
	.remove =	virtnet_remove,
	.config_changed = virtnet_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze =	virtnet_freeze,
	.restore =	virtnet_restore,
#endif
};

static __init int virtio_net_driver_init(void)
{
	int ret;

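	/* Register CPU hotplug states used to keep virtqueue affinity in sync
	 * as CPUs come and go.
	 */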
	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
				      virtnet_cpu_online,
				      virtnet_cpu_down_prep);
	if (ret < 0)
		goto out;
	virtionet_online = ret;
	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
				      NULL, virtnet_cpu_dead);
	if (ret)
		goto err_dead;

	ret = register_virtio_driver(&virtio_net_driver);
	if (ret)
		goto err_virtio;
	return 0;
err_virtio:
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
err_dead:
	cpuhp_remove_multi_state(virtionet_online);
out:
	return ret;
}
module_init(virtio_net_driver_init);

static __exit void virtio_net_driver_exit(void)
{
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
	cpuhp_remove_multi_state(virtionet_online);
	unregister_virtio_driver(&virtio_net_driver);
}
module_exit(virtio_net_driver_exit);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");