/* A network driver using virtio.
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
#include <linux/bpf.h>
#include <linux/scatterlist.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/average.h>
#include <net/busy_poll.h>

static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);

static bool csum = true, gso = true;
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);

/* FIXME: MTU in config. */
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
#define GOOD_COPY_LEN	128

/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
 */
DECLARE_EWMA(pkt_len, 1, 64)

/* Minimum alignment for mergeable packet buffers. */
#define MERGEABLE_BUFFER_ALIGN max(L1_CACHE_BYTES, 256)

#define VIRTNET_DRIVER_VERSION "1.0.0"

struct virtnet_stats {
	struct u64_stats_sync tx_syncp;
	struct u64_stats_sync rx_syncp;
	u64 tx_bytes;
	u64 tx_packets;

	u64 rx_bytes;
	u64 rx_packets;
};

/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send _queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of the send queue: output.$index */
	char name[40];
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

	struct napi_struct napi;

	struct bpf_prog __rcu *xdp_prog;

	/* Chain pages by the private ptr. */
	struct page *pages;

	/* Average packet length for mergeable receive buffers. */
	struct ewma_pkt_len mrg_avg_pkt_len;

	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of this receive queue: input.$index */
	char name[40];
};

struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
	struct send_queue *sq;
	struct receive_queue *rq;
	unsigned int status;

	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

	/* # of XDP queue pairs currently used by the driver */
	u16 xdp_queue_pairs;

	/* I like... big packets and I cannot lie! */
	bool big_packets;

	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

	/* Has control virtqueue */
	bool has_cvq;

	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

	/* Packet virtio header size */
	u8 hdr_len;

	/* Active statistics */
	struct virtnet_stats __percpu *stats;

	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

	/* Work struct for config space updates */
	struct work_struct config_work;

	/* Is the affinity hint set for virtqueues? */
	bool affinity_hint_set;

	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;

	/* Control VQ buffers: protected by the rtnl lock */
	struct virtio_net_ctrl_hdr ctrl_hdr;
	virtio_net_ctrl_ack ctrl_status;
	struct virtio_net_ctrl_mq ctrl_mq;
	u8 ctrl_promisc;
	u8 ctrl_allmulti;
	u16 ctrl_vid;

	/* Ethtool settings */
	u8 duplex;
	u32 speed;
};

struct padded_vnet_hdr {
	struct virtio_net_hdr_mrg_rxbuf hdr;
	/*
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header.
	 */
	char padding[4];
};

/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
	return (vq->index - 1) / 2;
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
	return vq->index / 2;
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}

static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
{
	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
}

/*
 * private is used to chain pages for big packets, put the whole
 * most recent used list in the beginning for reuse
 */
static void give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end;

	/* Find end of list, sew whole thing into vi->rq.pages. */
	for (end = page; end->private; end = (struct page *)end->private);
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}

static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
{
	struct page *p = rq->pages;

	if (p) {
		rq->pages = (struct page *)p->private;
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
		p = alloc_page(gfp_mask);
	return p;
}

static void skb_xmit_done(struct virtqueue *vq)
{
	struct virtnet_info *vi = vq->vdev->priv;

	/* Suppress further interrupts. */
	virtqueue_disable_cb(vq);

	/* We were probably waiting for more output buffers. */
	netif_wake_subqueue(vi->dev, vq2txq(vq));
}

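/* Mergeable receive buffers are posted to the virtqueue as a "context"
 * cookie rather than a plain pointer: every buffer address is aligned to
 * MERGEABLE_BUFFER_ALIGN, so the low bits of the pointer are free to carry
 * (truesize / MERGEABLE_BUFFER_ALIGN) - 1.  The helpers below pack and
 * unpack that encoding.
 */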
static unsigned int mergeable_ctx_to_buf_truesize(unsigned long mrg_ctx)
{
	unsigned int truesize = mrg_ctx & (MERGEABLE_BUFFER_ALIGN - 1);
	return (truesize + 1) * MERGEABLE_BUFFER_ALIGN;
}

static void *mergeable_ctx_to_buf_address(unsigned long mrg_ctx)
{
	return (void *)(mrg_ctx & -MERGEABLE_BUFFER_ALIGN);

}

static unsigned long mergeable_buf_to_ctx(void *buf, unsigned int truesize)
{
	unsigned int size = truesize / MERGEABLE_BUFFER_ALIGN;
	return (unsigned long)buf | (size - 1);
}

/* Called from bottom half context */
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
				   struct receive_queue *rq,
				   struct page *page, unsigned int offset,
				   unsigned int len, unsigned int truesize)
{
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	unsigned int copy, hdr_len, hdr_padded_len;
	char *p;

	p = page_address(page) + offset;

	/* copy small packet so we can reuse these pages for small data */
	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
	if (unlikely(!skb))
		return NULL;

	hdr = skb_vnet_hdr(skb);

	hdr_len = vi->hdr_len;
	if (vi->mergeable_rx_bufs)
		hdr_padded_len = sizeof *hdr;
	else
		hdr_padded_len = sizeof(struct padded_vnet_hdr);

	memcpy(hdr, p, hdr_len);

	len -= hdr_len;
	offset += hdr_padded_len;
	p += hdr_padded_len;

	copy = len;
	if (copy > skb_tailroom(skb))
		copy = skb_tailroom(skb);
	memcpy(skb_put(skb, copy), p, copy);

	len -= copy;
	offset += copy;

	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
			put_page(page);
		return skb;
	}

	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
		dev_kfree_skb(skb);
		return NULL;
	}
	BUG_ON(offset >= PAGE_SIZE);
	while (len) {
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
		page = (struct page *)page->private;
		offset = 0;
	}

	if (page)
		give_pages(rq, page);

	return skb;
}

static void virtnet_xdp_xmit(struct virtnet_info *vi,
			     struct receive_queue *rq,
			     struct send_queue *sq,
			     struct xdp_buff *xdp)
{
	struct page *page = virt_to_head_page(xdp->data);
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	unsigned int num_sg, len;
	void *xdp_sent;
	int err;

	/* Free up any pending old buffers before queueing new ones. */
	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		struct page *sent_page = virt_to_head_page(xdp_sent);
		put_page(sent_page);
	}

	/* Zero header and leave csum up to XDP layers */
	hdr = xdp->data;
	memset(hdr, 0, vi->hdr_len);

	num_sg = 1;
	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
	err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
				   xdp->data, GFP_ATOMIC);
	if (unlikely(err)) {
		put_page(page);
		return; // On error abort to avoid unnecessary kick
	}

	virtqueue_kick(sq->vq);
}

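/* Run the attached XDP program on one received buffer and map its verdict
 * onto the driver's actions: XDP_PASS hands the frame to the normal stack,
 * XDP_TX retransmits it on this CPU's dedicated XDP send queue, and
 * anything else is treated as a drop.
 */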
static u32 do_xdp_prog(struct virtnet_info *vi,
		       struct receive_queue *rq,
		       struct bpf_prog *xdp_prog,
		       struct page *page, int offset, int len)
{
	int hdr_padded_len;
	struct xdp_buff xdp;
	unsigned int qp;
	u32 act;
	u8 *buf;

	buf = page_address(page) + offset;

	if (vi->mergeable_rx_bufs)
		hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		hdr_padded_len = sizeof(struct padded_vnet_hdr);

	xdp.data = buf + hdr_padded_len;
	xdp.data_end = xdp.data + (len - vi->hdr_len);

	act = bpf_prog_run_xdp(xdp_prog, &xdp);
	switch (act) {
	case XDP_PASS:
		return XDP_PASS;
	case XDP_TX:
		qp = vi->curr_queue_pairs -
			vi->xdp_queue_pairs +
			smp_processor_id();
		xdp.data = buf + (vi->mergeable_rx_bufs ? 0 : 4);
		virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp);
		return XDP_TX;
	default:
		bpf_warn_invalid_xdp_action(act);
	case XDP_ABORTED:
	case XDP_DROP:
		return XDP_DROP;
	}
}

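/* For small packets the skb itself was posted as the receive buffer and the
 * virtio-net header was received into skb->cb via a separate sg entry, so
 * all that is left to do is trim the skb to the data length.
 */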
static struct sk_buff *receive_small(struct virtnet_info *vi, void *buf, unsigned int len)
{
	struct sk_buff * skb = buf;

	len -= vi->hdr_len;
	skb_trim(skb, len);

	return skb;
}

static struct sk_buff *receive_big(struct net_device *dev,
				   struct virtnet_info *vi,
				   struct receive_queue *rq,
				   void *buf,
				   unsigned int len)
{
	struct page *page = buf;
	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);

	if (unlikely(!skb))
		goto err;

	return skb;

err:
	dev->stats.rx_dropped++;
	give_pages(rq, page);
	return NULL;
}

/* The conditions to enable XDP should preclude the underlying device from
 * sending packets across multiple buffers (num_buf > 1). However per spec
 * it does not appear to be illegal to do so but rather just against convention.
 * So in order to avoid making a system unresponsive the packets are pushed
 * into a page and the XDP program is run. This will be extremely slow and we
 * push a warning to the user to fix this as soon as possible. Fixing this may
 * require resolving the underlying hardware to determine why multiple buffers
 * are being received or simply loading the XDP program in the ingress stack
 * after the skb is built because there is no advantage to running it here
 * anymore.
 */
static struct page *xdp_linearize_page(struct receive_queue *rq,
				       u16 *num_buf,
				       struct page *p,
				       int offset,
				       unsigned int *len)
{
	struct page *page = alloc_page(GFP_ATOMIC);
	unsigned int page_off = 0;

	if (!page)
		return NULL;

	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
	page_off += *len;

	while (--*num_buf) {
		unsigned int buflen;
		unsigned long ctx;
		void *buf;
		int off;

		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &buflen);
		if (unlikely(!ctx))
			goto err_buf;

		buf = mergeable_ctx_to_buf_address(ctx);
		p = virt_to_head_page(buf);
		off = buf - page_address(p);

		/* guard against a misconfigured or uncooperative backend that
		 * is sending packet larger than the MTU.
		 */
		if ((page_off + buflen) > PAGE_SIZE) {
			put_page(p);
			goto err_buf;
		}

		memcpy(page_address(page) + page_off,
		       page_address(p) + off, buflen);
		page_off += buflen;
		put_page(p);
	}

	*len = page_off;
	return page;
err_buf:
	__free_pages(page, 0);
	return NULL;
}

static struct sk_buff *receive_mergeable(struct net_device *dev,
					 struct virtnet_info *vi,
					 struct receive_queue *rq,
					 unsigned long ctx,
					 unsigned int len)
{
	void *buf = mergeable_ctx_to_buf_address(ctx);
	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
	struct sk_buff *head_skb, *curr_skb;
	struct bpf_prog *xdp_prog;
	unsigned int truesize;

	head_skb = NULL;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct page *xdp_page;
		u32 act;

		/* This happens when rx buffer size is underestimated */
		if (unlikely(num_buf > 1)) {
			/* linearize data for XDP */
			xdp_page = xdp_linearize_page(rq, &num_buf,
						      page, offset, &len);
			if (!xdp_page)
				goto err_xdp;
			offset = 0;
		} else {
			xdp_page = page;
		}

		/* Transient failure which in theory could occur if
		 * in-flight packets from before XDP was enabled reach
		 * the receive path after XDP is loaded. In practice I
		 * was not able to create this condition.
		 */
		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;

		act = do_xdp_prog(vi, rq, xdp_prog, xdp_page, offset, len);
		switch (act) {
		case XDP_PASS:
			/* We can only create skb based on xdp_page. */
			if (unlikely(xdp_page != page)) {
				rcu_read_unlock();
				put_page(page);
				head_skb = page_to_skb(vi, rq, xdp_page,
						       0, len, PAGE_SIZE);
				ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
				return head_skb;
			}
			break;
		case XDP_TX:
			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
			if (unlikely(xdp_page != page))
				goto err_xdp;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_DROP:
		default:
			if (unlikely(xdp_page != page))
				__free_pages(xdp_page, 0);
			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
	curr_skb = head_skb;

	if (unlikely(!curr_skb))
		goto err_skb;
	while (--num_buf) {
		int num_skb_frags;

		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!ctx)) {
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
				 dev->name, num_buf,
				 virtio16_to_cpu(vi->vdev,
						 hdr->num_buffers));
			dev->stats.rx_length_errors++;
			goto err_buf;
		}

		buf = mergeable_ctx_to_buf_address(ctx);
		page = virt_to_head_page(buf);

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);

			if (unlikely(!nskb))
				goto err_skb;
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
		truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
			head_skb->truesize += truesize;
		}
		offset = buf - page_address(page);
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
					     len, truesize);
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
					offset, len, truesize);
		}
	}

	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
	return head_skb;

err_xdp:
	rcu_read_unlock();
err_skb:
	put_page(page);
	while (--num_buf) {
		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!ctx)) {
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
		page = virt_to_head_page(mergeable_ctx_to_buf_address(ctx));
		put_page(page);
	}
err_buf:
	dev->stats.rx_dropped++;
	dev_kfree_skb(head_skb);
xdp_xmit:
	return NULL;
}

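/* Dispatch one completed receive buffer to the right handler for the
 * negotiated receive mode (mergeable buffers, big packets, or a plain skb),
 * update the per-cpu counters and hand the resulting skb to GRO.
 */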
static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
			void *buf, unsigned int len)
{
	struct net_device *dev = vi->dev;
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;

	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
		if (vi->mergeable_rx_bufs) {
			unsigned long ctx = (unsigned long)buf;
			void *base = mergeable_ctx_to_buf_address(ctx);
			put_page(virt_to_head_page(base));
		} else if (vi->big_packets) {
			give_pages(rq, buf);
		} else {
			dev_kfree_skb(buf);
		}
		return;
	}

	if (vi->mergeable_rx_bufs)
		skb = receive_mergeable(dev, vi, rq, (unsigned long)buf, len);
	else if (vi->big_packets)
		skb = receive_big(dev, vi, rq, buf, len);
	else
		skb = receive_small(vi, buf, len);

	if (unlikely(!skb))
		return;

	hdr = skb_vnet_hdr(skb);

	u64_stats_update_begin(&stats->rx_syncp);
	stats->rx_bytes += skb->len;
	stats->rx_packets++;
	u64_stats_update_end(&stats->rx_syncp);

	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
		skb->ip_summed = CHECKSUM_UNNECESSARY;

	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
				  virtio_is_little_endian(vi->vdev))) {
		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
				     dev->name, hdr->hdr.gso_type,
				     hdr->hdr.gso_size);
		goto frame_err;
	}

	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

	napi_gro_receive(&rq->napi, skb);
	return;

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
}

static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
			     gfp_t gfp)
{
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	int err;

	skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp);
	if (unlikely(!skb))
		return -ENOMEM;

	skb_put(skb, GOOD_PACKET_LEN);

	hdr = skb_vnet_hdr(skb);
	sg_init_table(rq->sg, 2);
	sg_set_buf(rq->sg, hdr, vi->hdr_len);
	skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);

	err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
	if (err < 0)
		dev_kfree_skb(skb);

	return err;
}

static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
			   gfp_t gfp)
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);

	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
		first = get_a_page(rq, gfp);
		if (!first) {
			if (list)
				give_pages(rq, list);
			return -ENOMEM;
		}
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);

		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}

	first = get_a_page(rq, gfp);
	if (!first) {
		give_pages(rq, list);
		return -ENOMEM;
	}
	p = page_address(first);

	/* rq->sg[0], rq->sg[1] share the same page */
	/* a separated rq->sg[0] for header - required in case !any_header_sg */
	sg_set_buf(&rq->sg[0], p, vi->hdr_len);

	/* rq->sg[1] for data packet, from offset */
	offset = sizeof(struct padded_vnet_hdr);
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);

	/* chain first in list head */
	first->private = (unsigned long)list;
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
	if (err < 0)
		give_pages(rq, first);

	return err;
}

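/* Size the next mergeable receive buffer from the EWMA of recent packet
 * lengths, clamped between GOOD_PACKET_LEN and one page, and rounded up to
 * MERGEABLE_BUFFER_ALIGN so the buffer address stays encodable in a
 * mergeable buffer context.
 */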
static unsigned int get_mergeable_buf_len(struct ewma_pkt_len *avg_pkt_len)
{
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int len;

	len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
			GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
	return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
}

static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
{
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
	unsigned long ctx;
	int err;
	unsigned int len, hole;

	len = get_mergeable_buf_len(&rq->mrg_avg_pkt_len);
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	ctx = mergeable_buf_to_ctx(buf, len);
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	hole = alloc_frag->size - alloc_frag->offset;
	if (hole < len) {
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
		 * the current buffer. This extra space is not included in
		 * the truesize stored in ctx.
		 */
		len += hole;
		alloc_frag->offset += hole;
	}

	sg_init_one(rq->sg, buf, len);
	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, (void *)ctx, gfp);
	if (err < 0)
		put_page(virt_to_head_page(buf));

	return err;
}

/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
{
	int err;
	bool oom;

	gfp |= __GFP_COLD;
	do {
		if (vi->mergeable_rx_bufs)
			err = add_recvbuf_mergeable(rq, gfp);
		else if (vi->big_packets)
			err = add_recvbuf_big(vi, rq, gfp);
		else
			err = add_recvbuf_small(vi, rq, gfp);

		oom = err == -ENOMEM;
		if (err)
			break;
	} while (rq->vq->num_free);
	virtqueue_kick(rq->vq);
	return !oom;
}

static void skb_recv_done(struct virtqueue *rvq)
{
	struct virtnet_info *vi = rvq->vdev->priv;
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];

	/* Schedule NAPI, Suppress further interrupts if successful. */
	if (napi_schedule_prep(&rq->napi)) {
		virtqueue_disable_cb(rvq);
		__napi_schedule(&rq->napi);
	}
}

static void virtnet_napi_enable(struct receive_queue *rq)
{
	napi_enable(&rq->napi);

	/* If all buffers were filled by other side before we napi_enabled, we
	 * won't get another interrupt, so process any outstanding packets
	 * now.  virtnet_poll wants to re-enable the queue, so we disable here.
	 * We synchronize against interrupts via NAPI_STATE_SCHED */
	if (napi_schedule_prep(&rq->napi)) {
		virtqueue_disable_cb(rq->vq);
		local_bh_disable();
		__napi_schedule(&rq->napi);
		local_bh_enable();
	}
}

static void refill_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
	bool still_empty;
	int i;

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct receive_queue *rq = &vi->rq[i];

		napi_disable(&rq->napi);
		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
		virtnet_napi_enable(rq);

		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
}

static int virtnet_receive(struct receive_queue *rq, int budget)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	unsigned int len, received = 0;
	void *buf;

	while (received < budget &&
	       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
		receive_buf(vi, rq, buf, len);
		received++;
	}

	if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
			schedule_delayed_work(&vi->refill, 0);
	}

	return received;
}

static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
	unsigned int r, received;

	received = virtnet_receive(rq, budget);

	/* Out of packets? */
	if (received < budget) {
		r = virtqueue_enable_cb_prepare(rq->vq);
		napi_complete_done(napi, received);
		if (unlikely(virtqueue_poll(rq->vq, r)) &&
		    napi_schedule_prep(napi)) {
			virtqueue_disable_cb(rq->vq);
			__napi_schedule(napi);
		}
	}

	return received;
}

#ifdef CONFIG_NET_RX_BUSY_POLL
/* must be called with local_bh_disable()d */
static int virtnet_busy_poll(struct napi_struct *napi)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
	struct virtnet_info *vi = rq->vq->vdev->priv;
	int r, received = 0, budget = 4;

	if (!(vi->status & VIRTIO_NET_S_LINK_UP))
		return LL_FLUSH_FAILED;

	if (!napi_schedule_prep(napi))
		return LL_FLUSH_BUSY;

	virtqueue_disable_cb(rq->vq);

again:
	received += virtnet_receive(rq, budget);

	r = virtqueue_enable_cb_prepare(rq->vq);
	clear_bit(NAPI_STATE_SCHED, &napi->state);
	if (unlikely(virtqueue_poll(rq->vq, r)) &&
	    napi_schedule_prep(napi)) {
		virtqueue_disable_cb(rq->vq);
		if (received < budget) {
			budget -= received;
			goto again;
		} else {
			__napi_schedule(napi);
		}
	}

	return received;
}
#endif	/* CONFIG_NET_RX_BUSY_POLL */

static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);
		virtnet_napi_enable(&vi->rq[i]);
	}

	return 0;
}

static void free_old_xmit_skbs(struct send_queue *sq)
{
	struct sk_buff *skb;
	unsigned int len;
	struct virtnet_info *vi = sq->vq->vdev->priv;
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);

	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		pr_debug("Sent skb %p\n", skb);

		u64_stats_update_begin(&stats->tx_syncp);
		stats->tx_bytes += skb->len;
		stats->tx_packets++;
		u64_stats_update_end(&stats->tx_syncp);

		dev_kfree_skb_any(skb);
	}
}

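/* Build the scatterlist for one outgoing skb.  When the device accepts any
 * header/data layout (any_header_sg) and there is room, the virtio-net
 * header is pushed into the skb's headroom so header and data go out as one
 * buffer; otherwise the header kept in skb->cb gets its own sg entry.
 */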
static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
	struct virtnet_info *vi = sq->vq->vdev->priv;
	unsigned num_sg;
	unsigned hdr_len = vi->hdr_len;
	bool can_push;

	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);

	can_push = vi->any_header_sg &&
		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
	else
		hdr = skb_vnet_hdr(skb);

	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
				    virtio_is_little_endian(vi->vdev)))
		BUG();

	if (vi->mergeable_rx_bufs)
		hdr->num_buffers = 0;

	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
	if (can_push) {
		__skb_push(skb, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
		/* Pull header back to avoid skew in tx bytes calculations. */
		__skb_pull(skb, hdr_len);
	} else {
		sg_set_buf(sq->sg, hdr, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
	}
	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
}

static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
	int err;
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
	bool kick = !skb->xmit_more;

	/* Free up any pending old buffers before queueing new ones. */
	free_old_xmit_skbs(sq);

	/* timestamp packet in software */
	skb_tx_timestamp(skb);

	/* Try to transmit */
	err = xmit_skb(sq, skb);

	/* This should not happen! */
	if (unlikely(err)) {
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
				 "Unexpected TXQ (%d) queue failure: %d\n", qnum, err);
		dev->stats.tx_dropped++;
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}

	/* Don't wait up for transmitted skbs to be freed. */
	skb_orphan(skb);
	nf_reset(skb);

	/* If running out of space, stop queue to avoid getting packets that we
	 * are then unable to transmit.
	 * An alternative would be to force queuing layer to requeue the skb by
	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
	 * returned in a normal path of operation: it means that driver is not
	 * maintaining the TX queue stop/start state properly, and causes
	 * the stack to do a non-trivial amount of useless work.
	 * Since most packets only take 1 or 2 ring slots, stopping the queue
	 * early means 16 slots are typically wasted.
	 */
	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
		netif_stop_subqueue(dev, qnum);
		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
			/* More just got used, free them then recheck. */
			free_old_xmit_skbs(sq);
			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
				netif_start_subqueue(dev, qnum);
				virtqueue_disable_cb(sq->vq);
			}
		}
	}

	if (kick || netif_xmit_stopped(txq))
		virtqueue_kick(sq->vq);

	return NETDEV_TX_OK;
}

/*
 * Send command via the control virtqueue and check status.  Commands
 * supported by the hypervisor, as indicated by feature bits, should
 * never fail unless improperly formatted.
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
				 struct scatterlist *out)
{
	struct scatterlist *sgs[4], hdr, stat;
	unsigned out_num = 0, tmp;

	/* Caller should know better */
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));

	vi->ctrl_status = ~0;
	vi->ctrl_hdr.class = class;
	vi->ctrl_hdr.cmd = cmd;
	/* Add header */
	sg_init_one(&hdr, &vi->ctrl_hdr, sizeof(vi->ctrl_hdr));
	sgs[out_num++] = &hdr;

	if (out)
		sgs[out_num++] = out;

	/* Add return status. */
	sg_init_one(&stat, &vi->ctrl_status, sizeof(vi->ctrl_status));
	sgs[out_num] = &stat;

	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);

	if (unlikely(!virtqueue_kick(vi->cvq)))
		return vi->ctrl_status == VIRTIO_NET_OK;

	/* Spin for a response, the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
		cpu_relax();

	return vi->ctrl_status == VIRTIO_NET_OK;
}

static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
	int ret;
	struct sockaddr *addr;
	struct scatterlist sg;

	addr = kmalloc(sizeof(*addr), GFP_KERNEL);
	if (!addr)
		return -ENOMEM;
	memcpy(addr, p, sizeof(*addr));

	ret = eth_prepare_mac_addr_change(dev, addr);
	if (ret)
		goto out;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
		sg_init_one(&sg, addr->sa_data, dev->addr_len);
		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
			dev_warn(&vdev->dev,
				 "Failed to set mac address by vq command.\n");
			ret = -EINVAL;
			goto out;
		}
	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
		unsigned int i;

		/* Naturally, this has an atomicity problem. */
		for (i = 0; i < dev->addr_len; i++)
			virtio_cwrite8(vdev,
				       offsetof(struct virtio_net_config, mac) +
				       i, addr->sa_data[i]);
	}

	eth_commit_mac_addr_change(dev, p);
	ret = 0;

out:
	kfree(addr);
	return ret;
}

static struct rtnl_link_stats64 *virtnet_stats(struct net_device *dev,
					       struct rtnl_link_stats64 *tot)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int cpu;
	unsigned int start;

	for_each_possible_cpu(cpu) {
		struct virtnet_stats *stats = per_cpu_ptr(vi->stats, cpu);
		u64 tpackets, tbytes, rpackets, rbytes;

		do {
			start = u64_stats_fetch_begin_irq(&stats->tx_syncp);
			tpackets = stats->tx_packets;
			tbytes   = stats->tx_bytes;
		} while (u64_stats_fetch_retry_irq(&stats->tx_syncp, start));

		do {
			start = u64_stats_fetch_begin_irq(&stats->rx_syncp);
			rpackets = stats->rx_packets;
			rbytes   = stats->rx_bytes;
		} while (u64_stats_fetch_retry_irq(&stats->rx_syncp, start));

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes   += rbytes;
		tot->tx_bytes   += tbytes;
	}

	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->rx_dropped = dev->stats.rx_dropped;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;

	return tot;
}

#ifdef CONFIG_NET_POLL_CONTROLLER
static void virtnet_netpoll(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->curr_queue_pairs; i++)
		napi_schedule(&vi->rq[i].napi);
}
#endif

static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}

static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	struct scatterlist sg;
	struct net_device *dev = vi->dev;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

	vi->ctrl_mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
	sg_init_one(&sg, &vi->ctrl_mq, sizeof(vi->ctrl_mq));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
	} else {
		vi->curr_queue_pairs = queue_pairs;
		/* virtnet_open() will refill when device is going to up. */
		if (dev->flags & IFF_UP)
			schedule_delayed_work(&vi->refill, 0);
	}

	return 0;
}

static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);

	for (i = 0; i < vi->max_queue_pairs; i++)
		napi_disable(&vi->rq[i].napi);

	return 0;
}

static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg[2];
	struct virtio_net_ctrl_mac *mac_data;
	struct netdev_hw_addr *ha;
	int uc_count;
	int mc_count;
	void *buf;
	int i;

	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

	vi->ctrl_promisc = ((dev->flags & IFF_PROMISC) != 0);
	vi->ctrl_allmulti = ((dev->flags & IFF_ALLMULTI) != 0);

	sg_init_one(sg, &vi->ctrl_promisc, sizeof(vi->ctrl_promisc));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
			 vi->ctrl_promisc ? "en" : "dis");

	sg_init_one(sg, &vi->ctrl_allmulti, sizeof(vi->ctrl_allmulti));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
			 vi->ctrl_allmulti ? "en" : "dis");

	uc_count = netdev_uc_count(dev);
	mc_count = netdev_mc_count(dev);
	/* MAC filter - use one buffer for both lists */
	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
	mac_data = buf;
	if (!buf)
		return;

	sg_init_table(sg, 2);

	/* Store the unicast list and count in the front of the buffer */
	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
	i = 0;
	netdev_for_each_uc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[0], mac_data,
		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));

	/* multicast list and count fill the end */
	mac_data = (void *)&mac_data->macs[uc_count][0];

	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
	i = 0;
	netdev_for_each_mc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[1], mac_data,
		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");

	kfree(buf);
}

static int virtnet_vlan_rx_add_vid(struct net_device *dev,
				   __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl_vid = vid;
	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
	return 0;
}

static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
				    __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl_vid = vid;
	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
	return 0;
}

static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu)
{
	int i;

	if (vi->affinity_hint_set) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtqueue_set_affinity(vi->rq[i].vq, -1);
			virtqueue_set_affinity(vi->sq[i].vq, -1);
		}

		vi->affinity_hint_set = false;
	}
}

static void virtnet_set_affinity(struct virtnet_info *vi)
{
	int i;
	int cpu;

	/* In multiqueue mode, when the number of cpus is equal to the number
	 * of queue pairs, we let each queue pair be private to one cpu by
	 * setting the affinity hint to eliminate the contention.
	 */
	if (vi->curr_queue_pairs == 1 ||
	    vi->max_queue_pairs != num_online_cpus()) {
		virtnet_clean_affinity(vi, -1);
		return;
	}

	i = 0;
	for_each_online_cpu(cpu) {
		virtqueue_set_affinity(vi->rq[i].vq, cpu);
		virtqueue_set_affinity(vi->sq[i].vq, cpu);
		netif_set_xps_queue(vi->dev, cpumask_of(cpu), i);
		i++;
	}

	vi->affinity_hint_set = true;
}

static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node_dead);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);

	virtnet_clean_affinity(vi, cpu);
	return 0;
}

static enum cpuhp_state virtionet_online;

static int virtnet_cpu_notif_add(struct virtnet_info *vi)
{
	int ret;

	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
	if (ret)
		return ret;
	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					       &vi->node_dead);
	if (!ret)
		return ret;
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	return ret;
}

static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
{
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					    &vi->node_dead);
}

static void virtnet_get_ringparam(struct net_device *dev,
				struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}


static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));

}

/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
		return -EINVAL;

	/* For now we don't support modifying channels while XDP is loaded
	 * also when XDP is loaded all RX queues have XDP programs so we only
	 * need to check a single RX queue.
	 */
	if (vi->rq[0].xdp_prog)
		return -EINVAL;

	get_online_cpus();
	err = virtnet_set_queues(vi, queue_pairs);
	if (!err) {
		netif_set_real_num_tx_queues(dev, queue_pairs);
		netif_set_real_num_rx_queues(dev, queue_pairs);

		virtnet_set_affinity(vi);
	}
	put_online_cpus();

	return err;
}

static void virtnet_get_channels(struct net_device *dev,
				 struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);

	channels->combined_count = vi->curr_queue_pairs;
	channels->max_combined = vi->max_queue_pairs;
	channels->max_other = 0;
	channels->rx_count = 0;
	channels->tx_count = 0;
	channels->other_count = 0;
}

/* Check if the user is trying to change anything besides speed/duplex */
static bool virtnet_validate_ethtool_cmd(const struct ethtool_cmd *cmd)
{
	struct ethtool_cmd diff1 = *cmd;
	struct ethtool_cmd diff2 = {};

	/* cmd is always set so we need to clear it, validate the port type
	 * and also without autonegotiation we can ignore advertising
	 */
	ethtool_cmd_speed_set(&diff1, 0);
	diff2.port = PORT_OTHER;
	diff1.advertising = 0;
	diff1.duplex = 0;
	diff1.cmd = 0;

	return !memcmp(&diff1, &diff2, sizeof(diff1));
}

static int virtnet_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u32 speed;

	speed = ethtool_cmd_speed(cmd);
	/* don't allow custom speed and duplex */
	if (!ethtool_validate_speed(speed) ||
	    !ethtool_validate_duplex(cmd->duplex) ||
	    !virtnet_validate_ethtool_cmd(cmd))
		return -EINVAL;
	vi->speed = speed;
	vi->duplex = cmd->duplex;

	return 0;
}

static int virtnet_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ethtool_cmd_speed_set(cmd, vi->speed);
	cmd->duplex = vi->duplex;
	cmd->port = PORT_OTHER;

	return 0;
}

static void virtnet_init_settings(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	vi->speed = SPEED_UNKNOWN;
	vi->duplex = DUPLEX_UNKNOWN;
}

static const struct ethtool_ops virtnet_ethtool_ops = {
	.get_drvinfo = virtnet_get_drvinfo,
	.get_link = ethtool_op_get_link,
	.get_ringparam = virtnet_get_ringparam,
	.set_channels = virtnet_set_channels,
	.get_channels = virtnet_get_channels,
	.get_ts_info = ethtool_op_get_ts_info,
	.get_settings = virtnet_get_settings,
	.set_settings = virtnet_set_settings,
};

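/* Attach or detach an XDP program.  The device must not be doing LRO-style
 * receive offloads (the program only ever sees one linear buffer), headers
 * and data must share a page, the MTU has to fit in a single page, and one
 * extra queue pair per possible CPU is reserved so XDP_TX never contends
 * with the regular transmit path.
 */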
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
{
	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
	struct virtnet_info *vi = netdev_priv(dev);
	struct bpf_prog *old_prog;
	u16 xdp_qp = 0, curr_qp;
	int i, err;

	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) {
		netdev_warn(dev, "can't set XDP while host is implementing LRO, disable LRO first\n");
		return -EOPNOTSUPP;
	}

	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
		netdev_warn(dev, "XDP expects header/data in single page, any_header_sg required\n");
		return -EINVAL;
	}

	if (dev->mtu > max_sz) {
		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
		return -EINVAL;
	}

	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
	if (prog)
		xdp_qp = nr_cpu_ids;

	/* XDP requires extra queues for XDP_TX */
	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
		netdev_warn(dev, "request %i queues but max is %i\n",
			    curr_qp + xdp_qp, vi->max_queue_pairs);
		return -ENOMEM;
	}

	err = virtnet_set_queues(vi, curr_qp + xdp_qp);
	if (err) {
		dev_warn(&dev->dev, "XDP Device queue allocation failure.\n");
		return err;
	}

	if (prog) {
		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
		if (IS_ERR(prog)) {
			virtnet_set_queues(vi, curr_qp);
			return PTR_ERR(prog);
		}
	}

	vi->xdp_queue_pairs = xdp_qp;
	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);

	for (i = 0; i < vi->max_queue_pairs; i++) {
		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
		if (old_prog)
			bpf_prog_put(old_prog);
	}

	return 0;
}

static bool virtnet_xdp_query(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (vi->rq[i].xdp_prog)
			return true;
	}
	return false;
}

static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return virtnet_xdp_set(dev, xdp->prog);
	case XDP_QUERY_PROG:
		xdp->prog_attached = virtnet_xdp_query(dev);
		return 0;
	default:
		return -EINVAL;
	}
}

static const struct net_device_ops virtnet_netdev = {
	.ndo_open            = virtnet_open,
	.ndo_stop   	     = virtnet_close,
	.ndo_start_xmit      = start_xmit,
	.ndo_validate_addr   = eth_validate_addr,
	.ndo_set_mac_address = virtnet_set_mac_address,
	.ndo_set_rx_mode     = virtnet_set_rx_mode,
	.ndo_get_stats64     = virtnet_stats,
	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = virtnet_netpoll,
#endif
#ifdef CONFIG_NET_RX_BUSY_POLL
	.ndo_busy_poll		= virtnet_busy_poll,
#endif
	.ndo_xdp		= virtnet_xdp,
};

static void virtnet_config_changed_work(struct work_struct *work)
{
1750 1751
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, config_work);
1752 1753
	u16 v;

1754 1755
	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
				 struct virtio_net_config, status, &v) < 0)
M
Michael S. Tsirkin 已提交
1756
		return;
1757 1758

	if (v & VIRTIO_NET_S_ANNOUNCE) {
1759
		netdev_notify_peers(vi->dev);
1760 1761
		virtnet_ack_link_announce(vi);
	}
1762 1763 1764 1765 1766

	/* Ignore unknown (future) status bits */
	v &= VIRTIO_NET_S_LINK_UP;

	if (vi->status == v)
M
Michael S. Tsirkin 已提交
1767
		return;
1768 1769 1770 1771 1772

	vi->status = v;

	if (vi->status & VIRTIO_NET_S_LINK_UP) {
		netif_carrier_on(vi->dev);
J
Jason Wang 已提交
1773
		netif_tx_wake_all_queues(vi->dev);
1774 1775
	} else {
		netif_carrier_off(vi->dev);
J
Jason Wang 已提交
1776
		netif_tx_stop_all_queues(vi->dev);
1777 1778 1779 1780 1781 1782 1783
	}
}

static void virtnet_config_changed(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

1784
	schedule_work(&vi->config_work);
1785 1786
}

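/* Tear down the per-queue NAPI contexts and free the send/receive queue
 * arrays allocated by virtnet_alloc_queues().
 */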
static void virtnet_free_queues(struct virtnet_info *vi)
{
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		napi_hash_del(&vi->rq[i].napi);
		netif_napi_del(&vi->rq[i].napi);
	}

	/* We called napi_hash_del() before netif_napi_del(),
	 * we need to respect an RCU grace period before freeing vi->rq
	 */
	synchronize_net();

	kfree(vi->rq);
	kfree(vi->sq);
}

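/* Free the page chains cached for big-packet receive buffers and drop any
 * XDP program references still held by the receive queues.
 */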
static void free_receive_bufs(struct virtnet_info *vi)
{
	struct bpf_prog *old_prog;
	int i;

	rtnl_lock();
	for (i = 0; i < vi->max_queue_pairs; i++) {
		while (vi->rq[i].pages)
			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);

		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
		if (old_prog)
			bpf_prog_put(old_prog);
	}
	rtnl_unlock();
}

static void free_receive_page_frags(struct virtnet_info *vi)
{
	int i;
	for (i = 0; i < vi->max_queue_pairs; i++)
		if (vi->rq[i].alloc_frag.page)
			put_page(vi->rq[i].alloc_frag.page);
}

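/* When XDP is attached, the last xdp_queue_pairs of the currently enabled
 * queue pairs are reserved for XDP_TX; this helper tells the ranges apart.
 */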
static bool is_xdp_queue(struct virtnet_info *vi, int q)
{
	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
		return false;
	else if (q < vi->curr_queue_pairs)
		return true;
	else
		return false;
}

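/* Detach and free buffers still left in the virtqueues: skbs (or XDP packet
 * pages) on the send side, mergeable/big/small buffers on the receive side.
 */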
static void free_unused_bufs(struct virtnet_info *vi)
{
	void *buf;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->sq[i].vq;
		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (!is_xdp_queue(vi, i))
				dev_kfree_skb(buf);
			else
				put_page(virt_to_head_page(buf));
		}
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->rq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (vi->mergeable_rx_bufs) {
				unsigned long ctx = (unsigned long)buf;
				void *base = mergeable_ctx_to_buf_address(ctx);
				put_page(virt_to_head_page(base));
			} else if (vi->big_packets) {
				give_pages(&vi->rq[i], buf);
			} else {
				dev_kfree_skb(buf);
			}
		}
	}
}

static void virtnet_del_vqs(struct virtnet_info *vi)
{
	struct virtio_device *vdev = vi->vdev;

	virtnet_clean_affinity(vi, -1);

	vdev->config->del_vqs(vdev);

	virtnet_free_queues(vi);
}

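/* Ask the transport for all virtqueues in one go. The index layout assumed by
 * rxq2vq()/txq2vq() is rx0, tx0, rx1, tx1, ..., with the control virtqueue
 * (if negotiated) as the last entry.
 */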
static int virtnet_find_vqs(struct virtnet_info *vi)
{
	vq_callback_t **callbacks;
	struct virtqueue **vqs;
	int ret = -ENOMEM;
	int i, total_vqs;
	const char **names;

	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
	 * possible control vq.
	 */
	total_vqs = vi->max_queue_pairs * 2 +
		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);

	/* Allocate space for find_vqs parameters */
	vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
	if (!vqs)
		goto err_vq;
	callbacks = kmalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
	if (!callbacks)
		goto err_callback;
	names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);
	if (!names)
		goto err_names;

	/* Parameters for control virtqueue, if any */
	if (vi->has_cvq) {
		callbacks[total_vqs - 1] = NULL;
		names[total_vqs - 1] = "control";
	}

	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
	}

	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
					 names);
	if (ret)
		goto err_find;

	if (vi->has_cvq) {
		vi->cvq = vqs[total_vqs - 1];
		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].vq = vqs[rxq2vq(i)];
		vi->sq[i].vq = vqs[txq2vq(i)];
	}

	kfree(names);
	kfree(callbacks);
	kfree(vqs);

	return 0;

err_find:
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;
}

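/* Allocate the send_queue/receive_queue bookkeeping and initialize the NAPI
 * context, scatterlists and packet-length EWMA for each queue pair.
 */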
static int virtnet_alloc_queues(struct virtnet_info *vi)
{
	int i;

	vi->sq = kzalloc(sizeof(*vi->sq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->sq)
		goto err_sq;
	vi->rq = kzalloc(sizeof(*vi->rq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->rq)
		goto err_rq;

	INIT_DELAYED_WORK(&vi->refill, refill_work);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].pages = NULL;
		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
			       napi_weight);

		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
	}

	return 0;

err_rq:
	kfree(vi->sq);
err_sq:
	return -ENOMEM;
}

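/* Allocate the queue bookkeeping, create the virtqueues, and set the queue
 * affinity while CPU hotplug is held off by get_online_cpus().
 */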
static int init_vqs(struct virtnet_info *vi)
{
	int ret;

	/* Allocate send & receive queues */
	ret = virtnet_alloc_queues(vi);
	if (ret)
		goto err;

	ret = virtnet_find_vqs(vi);
	if (ret)
		goto err_free;

	get_online_cpus();
	virtnet_set_affinity(vi);
	put_online_cpus();

	return 0;

err_free:
	virtnet_free_queues(vi);
err:
	return ret;
}

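/* With mergeable receive buffers, each RX queue exposes its EWMA-based buffer
 * size estimate via sysfs; for illustration, the attribute ends up under a
 * path of the form
 * /sys/class/net/<iface>/queues/rx-<n>/virtio_net/mergeable_rx_buffer_size.
 */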
#ifdef CONFIG_SYSFS
static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
		struct rx_queue_attribute *attribute, char *buf)
{
	struct virtnet_info *vi = netdev_priv(queue->dev);
	unsigned int queue_index = get_netdev_rx_queue_index(queue);
	struct ewma_pkt_len *avg;

	BUG_ON(queue_index >= vi->max_queue_pairs);
	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
	return sprintf(buf, "%u\n", get_mergeable_buf_len(avg));
}

static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
	__ATTR_RO(mergeable_rx_buffer_size);

static struct attribute *virtio_net_mrg_rx_attrs[] = {
	&mergeable_rx_buffer_size_attribute.attr,
	NULL
};

static const struct attribute_group virtio_net_mrg_rx_group = {
	.name = "virtio_net",
	.attrs = virtio_net_mrg_rx_attrs
};
#endif

static bool virtnet_fail_on_feature(struct virtio_device *vdev,
				    unsigned int fbit,
				    const char *fname, const char *dname)
{
	if (!virtio_has_feature(vdev, fbit))
		return false;

	dev_err(&vdev->dev, "device advertises feature %s but not %s",
		fname, dname);

	return true;
}

#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)

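/* Sanity-check the advertised features: anything driven through the control
 * virtqueue must be accompanied by VIRTIO_NET_F_CTRL_VQ itself.
 */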
static bool virtnet_validate_features(struct virtio_device *vdev)
{
	if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
	    (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
			     "VIRTIO_NET_F_CTRL_VQ"))) {
		return false;
	}

	return true;
}

#define MIN_MTU ETH_MIN_MTU
#define MAX_MTU ETH_MAX_MTU

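/* Probe a new virtio-net device: validate features, allocate the netdev and
 * queues, register with the networking core and bring the link state up to
 * date.
 */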
static int virtnet_probe(struct virtio_device *vdev)
{
	int i, err;
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;
	int mtu;

	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

	if (!virtnet_validate_features(vdev))
		return -EINVAL;

	/* Find if host supports multiqueue virtio_net device */
	err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
				   struct virtio_net_config,
				   max_virtqueue_pairs, &max_queue_pairs);

	/* We need at least 2 queues */
	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;

	/* Allocate ourselves a network device with room for our info */
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
	dev->netdev_ops = &virtnet_netdev;
	dev->features = NETIF_F_HIGHDMA;

	dev->ethtool_ops = &virtnet_ethtool_ops;
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
2119
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
R
Rusty Russell 已提交
2120
		/* This opens up the world of extra features. */
J
Jason Wang 已提交
2121
		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
2122
		if (csum)
J
Jason Wang 已提交
2123
			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;
2124 2125

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
2126
			dev->hw_features |= NETIF_F_TSO | NETIF_F_UFO
R
Rusty Russell 已提交
2127 2128
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
2129
		/* Individual feature bits: what can host handle? */
2130 2131 2132 2133 2134 2135
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;
2136 2137
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UFO))
			dev->hw_features |= NETIF_F_UFO;
2138

2139 2140
		dev->features |= NETIF_F_GSO_ROBUST;

2141
		if (gso)
2142
			dev->features |= dev->hw_features & (NETIF_F_ALL_TSO|NETIF_F_UFO);
2143
		/* (!csum && gso) case will be fixed by register_netdev() */
R
Rusty Russell 已提交
2144
	}
2145 2146
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
		dev->features |= NETIF_F_RXCSUM;
R
Rusty Russell 已提交
2147

2148 2149
	dev->vlan_features = dev->features;

2150 2151 2152 2153
	/* MTU range: 68 - 65535 */
	dev->min_mtu = MIN_MTU;
	dev->max_mtu = MAX_MTU;

R
Rusty Russell 已提交
2154
	/* Configuration may specify what MAC to use.  Otherwise random. */
2155 2156 2157 2158 2159
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
		virtio_cread_bytes(vdev,
				   offsetof(struct virtio_net_config, mac),
				   dev->dev_addr, dev->addr_len);
	else
2160
		eth_hw_addr_random(dev);
R
Rusty Russell 已提交
2161 2162 2163 2164 2165

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
2166
	vdev->priv = vi;
2167 2168 2169 2170 2171
	vi->stats = alloc_percpu(struct virtnet_stats);
	err = -ENOMEM;
	if (vi->stats == NULL)
		goto free;

2172 2173 2174 2175 2176 2177 2178
	for_each_possible_cpu(i) {
		struct virtnet_stats *virtnet_stats;
		virtnet_stats = per_cpu_ptr(vi->stats, i);
		u64_stats_init(&virtnet_stats->tx_syncp);
		u64_stats_init(&virtnet_stats->rx_syncp);
	}

2179
	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
R
Rusty Russell 已提交
2180

	/* If we can receive ANY GSO packets, we must allocate large ones. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
		vi->big_packets = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		vi->hdr_len = sizeof(struct virtio_net_hdr);

	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->any_header_sg = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		mtu = virtio_cread16(vdev,
				     offsetof(struct virtio_net_config,
					      mtu));
		if (mtu < dev->min_mtu) {
			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
		} else {
			dev->mtu = mtu;
			dev->max_mtu = mtu;
		}
	}

	if (vi->any_header_sg)
		dev->needed_headroom = vi->hdr_len;

	/* Enable multiqueue by default */
	if (num_online_cpus() >= max_queue_pairs)
		vi->curr_queue_pairs = max_queue_pairs;
	else
		vi->curr_queue_pairs = num_online_cpus();
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
	err = init_vqs(vi);
	if (err)
		goto free_stats;

#ifdef CONFIG_SYSFS
	if (vi->mergeable_rx_bufs)
		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
#endif
	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);

	virtnet_init_settings(dev);

	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
		goto free_vqs;
	}

	virtio_device_ready(vdev);

	err = virtnet_cpu_notif_add(vi);
	if (err) {
		pr_debug("virtio_net: registering cpu notifier failed\n");
		goto free_unregister_netdev;
	}

	rtnl_lock();
	virtnet_set_queues(vi, vi->curr_queue_pairs);
	rtnl_unlock();

	/* Assume link up if device can't report link status,
	   otherwise get link status from config. */
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		netif_carrier_off(dev);
		schedule_work(&vi->config_work);
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
		netif_carrier_on(dev);
	}

	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
		 dev->name, max_queue_pairs);

	return 0;

free_unregister_netdev:
	vi->vdev->config->reset(vdev);

	unregister_netdev(dev);
free_vqs:
	cancel_delayed_work_sync(&vi->refill);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
free_stats:
	free_percpu(vi->stats);
free:
	free_netdev(dev);
	return err;
}

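/* Reset the device and release all buffers and virtqueue resources; shared by
 * the remove and freeze paths.
 */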
static void remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);

	/* Free unused buffers in both send and recv, if any. */
	free_unused_bufs(vi);

	free_receive_bufs(vi);

	free_receive_page_frags(vi);

	virtnet_del_vqs(vi);
}

static void virtnet_remove(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device. */
	flush_work(&vi->config_work);

	unregister_netdev(vi->dev);

	remove_vq_common(vi);

	free_percpu(vi->stats);
	free_netdev(vi->dev);
}

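/* Suspend/resume support: freeze quiesces NAPI and tears the virtqueues down
 * via remove_vq_common(); restore rebuilds them and refills the receive rings.
 */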
#ifdef CONFIG_PM_SLEEP
static int virtnet_freeze(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int i;

	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device */
	flush_work(&vi->config_work);

	netif_device_detach(vi->dev);
	cancel_delayed_work_sync(&vi->refill);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++)
			napi_disable(&vi->rq[i].napi);
	}

	remove_vq_common(vi);

	return 0;
}

static int virtnet_restore(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err, i;

	err = init_vqs(vi);
	if (err)
		return err;

	virtio_device_ready(vdev);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->curr_queue_pairs; i++)
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		for (i = 0; i < vi->max_queue_pairs; i++)
			virtnet_napi_enable(&vi->rq[i]);
	}

	netif_device_attach(vi->dev);

	rtnl_lock();
	virtnet_set_queues(vi, vi->curr_queue_pairs);
	rtnl_unlock();

	err = virtnet_cpu_notif_add(vi);
	if (err)
		return err;

	return 0;
}
#endif

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

#define VIRTNET_FEATURES \
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
	VIRTIO_NET_F_MAC, \
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
	VIRTIO_NET_F_CTRL_MAC_ADDR, \
	VIRTIO_NET_F_MTU

static unsigned int features[] = {
	VIRTNET_FEATURES,
};

static unsigned int features_legacy[] = {
	VIRTNET_FEATURES,
	VIRTIO_NET_F_GSO,
	VIRTIO_F_ANY_LAYOUT,
};

static struct virtio_driver virtio_net_driver = {
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
	.feature_table_legacy = features_legacy,
	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
	.probe =	virtnet_probe,
	.remove =	virtnet_remove,
	.config_changed = virtnet_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze =	virtnet_freeze,
	.restore =	virtnet_restore,
#endif
};

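/* Module init/exit: register the CPU hotplug states used for queue affinity
 * handling, then the virtio driver itself, unwinding in reverse on failure.
 */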
static __init int virtio_net_driver_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "AP_VIRT_NET_ONLINE",
				      virtnet_cpu_online,
				      virtnet_cpu_down_prep);
	if (ret < 0)
		goto out;
	virtionet_online = ret;
	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "VIRT_NET_DEAD",
				      NULL, virtnet_cpu_dead);
	if (ret)
		goto err_dead;

	ret = register_virtio_driver(&virtio_net_driver);
	if (ret)
		goto err_virtio;
	return 0;
err_virtio:
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
err_dead:
	cpuhp_remove_multi_state(virtionet_online);
out:
	return ret;
}
module_init(virtio_net_driver_init);

static __exit void virtio_net_driver_exit(void)
{
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
	cpuhp_remove_multi_state(virtionet_online);
	unregister_virtio_driver(&virtio_net_driver);
}
module_exit(virtio_net_driver_exit);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");