/* A network driver using virtio.
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
#include <linux/bpf.h>
#include <linux/scatterlist.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/average.h>
#include <net/busy_poll.h>

static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);

static bool csum = true, gso = true;
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);

/* FIXME: MTU in config. */
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
#define GOOD_COPY_LEN	128

/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
 */
DECLARE_EWMA(pkt_len, 1, 64)

/* Minimum alignment for mergeable packet buffers. */
#define MERGEABLE_BUFFER_ALIGN max(L1_CACHE_BYTES, 256)

#define VIRTNET_DRIVER_VERSION "1.0.0"

struct virtnet_stats {
	struct u64_stats_sync tx_syncp;
	struct u64_stats_sync rx_syncp;
	u64 tx_bytes;
	u64 tx_packets;

	u64 rx_bytes;
	u64 rx_packets;
};

/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send_queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of the send queue: output.$index */
	char name[40];
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

	struct napi_struct napi;

	struct bpf_prog __rcu *xdp_prog;

	/* Chain pages by the private ptr. */
	struct page *pages;

	/* Average packet length for mergeable receive buffers. */
	struct ewma_pkt_len mrg_avg_pkt_len;

	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of this receive queue: input.$index */
	char name[40];
};

struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
	struct send_queue *sq;
	struct receive_queue *rq;
	unsigned int status;

	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

	/* # of XDP queue pairs currently used by the driver */
	u16 xdp_queue_pairs;

	/* I like... big packets and I cannot lie! */
	bool big_packets;

	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

	/* Has control virtqueue */
	bool has_cvq;

	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

	/* Packet virtio header size */
	u8 hdr_len;

	/* Active statistics */
	struct virtnet_stats __percpu *stats;

	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

	/* Work struct for config space updates */
	struct work_struct config_work;

	/* Is the affinity hint set for virtqueues? */
	bool affinity_hint_set;

	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;

	/* Control VQ buffers: protected by the rtnl lock */
	struct virtio_net_ctrl_hdr ctrl_hdr;
	virtio_net_ctrl_ack ctrl_status;
	struct virtio_net_ctrl_mq ctrl_mq;
	u8 ctrl_promisc;
	u8 ctrl_allmulti;
	u16 ctrl_vid;

	/* Ethtool settings */
	u8 duplex;
	u32 speed;
};

struct padded_vnet_hdr {
	struct virtio_net_hdr_mrg_rxbuf hdr;
	/*
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header.
	 */
	char padding[4];
};

/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
	return (vq->index - 1) / 2;
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
	return vq->index / 2;
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}

static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
{
	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
}

/*
 * private is used to chain pages for big packets; put the whole
 * most recently used list at the beginning for reuse
 */
static void give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end;

	/* Find end of list, sew whole thing into vi->rq.pages. */
	for (end = page; end->private; end = (struct page *)end->private);
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}

static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
{
	struct page *p = rq->pages;

	if (p) {
		rq->pages = (struct page *)p->private;
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
		p = alloc_page(gfp_mask);
	return p;
}

static void skb_xmit_done(struct virtqueue *vq)
{
	struct virtnet_info *vi = vq->vdev->priv;

	/* Suppress further interrupts. */
	virtqueue_disable_cb(vq);

	/* We were probably waiting for more output buffers. */
	netif_wake_subqueue(vi->dev, vq2txq(vq));
}

static unsigned int mergeable_ctx_to_buf_truesize(unsigned long mrg_ctx)
{
	unsigned int truesize = mrg_ctx & (MERGEABLE_BUFFER_ALIGN - 1);
	return (truesize + 1) * MERGEABLE_BUFFER_ALIGN;
}

static void *mergeable_ctx_to_buf_address(unsigned long mrg_ctx)
{
	return (void *)(mrg_ctx & -MERGEABLE_BUFFER_ALIGN);

}

static unsigned long mergeable_buf_to_ctx(void *buf, unsigned int truesize)
{
	unsigned int size = truesize / MERGEABLE_BUFFER_ALIGN;
	return (unsigned long)buf | (size - 1);
}
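/* Note on the mergeable-buffer context encoding above: receive buffers are
 * MERGEABLE_BUFFER_ALIGN-aligned and their truesize is a multiple of that
 * alignment, so one unsigned long can carry both the buffer address and
 * (truesize / MERGEABLE_BUFFER_ALIGN - 1) in the low bits.  For example, with
 * 256-byte alignment a buffer at ...f000 with truesize 1536 is encoded as
 * ...f005, and the two helpers above recover the address and the truesize.
 */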

/* Called from bottom half context */
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
				   struct receive_queue *rq,
				   struct page *page, unsigned int offset,
				   unsigned int len, unsigned int truesize)
{
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	unsigned int copy, hdr_len, hdr_padded_len;
	char *p;

	p = page_address(page) + offset;

	/* copy small packet so we can reuse these pages for small data */
	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
	if (unlikely(!skb))
		return NULL;

	hdr = skb_vnet_hdr(skb);

	hdr_len = vi->hdr_len;
	if (vi->mergeable_rx_bufs)
		hdr_padded_len = sizeof *hdr;
	else
		hdr_padded_len = sizeof(struct padded_vnet_hdr);

	memcpy(hdr, p, hdr_len);

	len -= hdr_len;
	offset += hdr_padded_len;
	p += hdr_padded_len;

	copy = len;
	if (copy > skb_tailroom(skb))
		copy = skb_tailroom(skb);
	memcpy(skb_put(skb, copy), p, copy);

	len -= copy;
	offset += copy;

	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
			put_page(page);
		return skb;
	}

	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
		dev_kfree_skb(skb);
		return NULL;
	}
	BUG_ON(offset >= PAGE_SIZE);
	while (len) {
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
		page = (struct page *)page->private;
		offset = 0;
	}

	if (page)
		give_pages(rq, page);

	return skb;
}
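
/* A note on page_to_skb() above: up to GOOD_COPY_LEN bytes of the packet are
 * copied into the skb linear area so that small packets give their page back
 * for reuse right away; any remaining data stays in the page(s) and is
 * attached to the skb as fragments instead of being copied.
 */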

static void virtnet_xdp_xmit(struct virtnet_info *vi,
			     struct receive_queue *rq,
			     struct send_queue *sq,
			     struct xdp_buff *xdp)
{
	struct page *page = virt_to_head_page(xdp->data);
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	unsigned int num_sg, len;
	void *xdp_sent;
	int err;

	/* Free up any pending old buffers before queueing new ones. */
	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		struct page *sent_page = virt_to_head_page(xdp_sent);

		if (vi->mergeable_rx_bufs)
			put_page(sent_page);
		else
			give_pages(rq, sent_page);
	}

	/* Zero header and leave csum up to XDP layers */
	hdr = xdp->data;
	memset(hdr, 0, vi->hdr_len);

	num_sg = 1;
	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
	err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
				   xdp->data, GFP_ATOMIC);
	if (unlikely(err)) {
		if (vi->mergeable_rx_bufs)
			put_page(page);
		else
			give_pages(rq, page);
		return; /* On error abort to avoid unnecessary kick */
	} else if (!vi->mergeable_rx_bufs) {
		/* If not mergeable bufs must be big packets so cleanup pages */
		give_pages(rq, (struct page *)page->private);
		page->private = 0;
	}

	virtqueue_kick(sq->vq);
}

static u32 do_xdp_prog(struct virtnet_info *vi,
		       struct receive_queue *rq,
		       struct bpf_prog *xdp_prog,
		       struct page *page, int offset, int len)
{
	int hdr_padded_len;
	struct xdp_buff xdp;
	unsigned int qp;
	u32 act;
	u8 *buf;

	buf = page_address(page) + offset;

	if (vi->mergeable_rx_bufs)
		hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		hdr_padded_len = sizeof(struct padded_vnet_hdr);

	xdp.data = buf + hdr_padded_len;
	xdp.data_end = xdp.data + (len - vi->hdr_len);

	act = bpf_prog_run_xdp(xdp_prog, &xdp);
	switch (act) {
	case XDP_PASS:
		return XDP_PASS;
	case XDP_TX:
		qp = vi->curr_queue_pairs -
			vi->xdp_queue_pairs +
			smp_processor_id();
		xdp.data = buf + (vi->mergeable_rx_bufs ? 0 : 4);
		virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp);
		return XDP_TX;
	default:
		bpf_warn_invalid_xdp_action(act);
	case XDP_ABORTED:
	case XDP_DROP:
		return XDP_DROP;
	}
}
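
/* do_xdp_prog() above runs the attached BPF program on a received buffer.
 * For XDP_TX the transmit queue is picked as
 *	curr_queue_pairs - xdp_queue_pairs + smp_processor_id()
 * i.e. one of the extra queue pairs reserved when the program was attached,
 * indexed per CPU, so XDP transmissions do not contend with the queues used
 * by the regular ndo_start_xmit() path.
 */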

static struct sk_buff *receive_small(struct virtnet_info *vi, void *buf, unsigned int len)
{
	struct sk_buff * skb = buf;

	len -= vi->hdr_len;
	skb_trim(skb, len);

	return skb;
}

static struct sk_buff *receive_big(struct net_device *dev,
				   struct virtnet_info *vi,
				   struct receive_queue *rq,
				   void *buf,
				   unsigned int len)
{
	struct bpf_prog *xdp_prog;
	struct page *page = buf;
	struct sk_buff *skb;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
		u32 act;

		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;
		act = do_xdp_prog(vi, rq, xdp_prog, page, 0, len);
		switch (act) {
		case XDP_PASS:
			break;
		case XDP_TX:
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_DROP:
		default:
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
	if (unlikely(!skb))
		goto err;

	return skb;

err_xdp:
	rcu_read_unlock();
err:
	dev->stats.rx_dropped++;
	give_pages(rq, page);
xdp_xmit:
	return NULL;
}

/* The conditions to enable XDP should preclude the underlying device from
 * sending packets across multiple buffers (num_buf > 1). However, per the
 * spec it does not appear to be illegal to do so, just against convention.
 * So, to avoid making the system unresponsive, such packets are linearized
 * into a single page and the XDP program is run on that copy. This is
 * extremely slow, and a warning is pushed to the user to fix it as soon as
 * possible. Fixing it may require working out why the underlying hardware
 * delivers multiple buffers, or simply loading the XDP program in the
 * ingress stack after the skb is built, since there is no advantage to
 * running it here anymore.
 */
static struct page *xdp_linearize_page(struct receive_queue *rq,
486
				       u16 *num_buf,
487 488 489 490 491 492 493 494 495 496 497 498 499
				       struct page *p,
				       int offset,
				       unsigned int *len)
{
	struct page *page = alloc_page(GFP_ATOMIC);
	unsigned int page_off = 0;

	if (!page)
		return NULL;

	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
	page_off += *len;

500
	while (--*num_buf) {
501 502 503 504 505 506 507 508 509
		unsigned int buflen;
		unsigned long ctx;
		void *buf;
		int off;

		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &buflen);
		if (unlikely(!ctx))
			goto err_buf;

510 511 512 513
		buf = mergeable_ctx_to_buf_address(ctx);
		p = virt_to_head_page(buf);
		off = buf - page_address(p);

514 515 516
		/* guard against a misconfigured or uncooperative backend that
		 * is sending packet larger than the MTU.
		 */
517 518
		if ((page_off + buflen) > PAGE_SIZE) {
			put_page(p);
519
			goto err_buf;
520
		}
521 522 523 524

		memcpy(page_address(page) + page_off,
		       page_address(p) + off, buflen);
		page_off += buflen;
525
		put_page(p);
526 527 528 529 530 531 532 533 534
	}

	*len = page_off;
	return page;
err_buf:
	__free_pages(page, 0);
	return NULL;
}

535
static struct sk_buff *receive_mergeable(struct net_device *dev,
M
Michael S. Tsirkin 已提交
536
					 struct virtnet_info *vi,
537
					 struct receive_queue *rq,
538
					 unsigned long ctx,
539
					 unsigned int len)
540
{
541
	void *buf = mergeable_ctx_to_buf_address(ctx);
542 543
	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
544 545
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
J
John Fastabend 已提交
546 547 548 549
	struct sk_buff *head_skb, *curr_skb;
	struct bpf_prog *xdp_prog;
	unsigned int truesize;

J
John Fastabend 已提交
550 551
	head_skb = NULL;

J
John Fastabend 已提交
552 553 554
	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
555
		struct page *xdp_page;
J
John Fastabend 已提交
556 557
		u32 act;

558
		/* This happens when rx buffer size is underestimated */
J
John Fastabend 已提交
559
		if (unlikely(num_buf > 1)) {
560
			/* linearize data for XDP */
561
			xdp_page = xdp_linearize_page(rq, &num_buf,
562 563 564 565 566 567
						      page, offset, &len);
			if (!xdp_page)
				goto err_xdp;
			offset = 0;
		} else {
			xdp_page = page;
J
John Fastabend 已提交
568 569 570 571 572 573 574
		}

		/* Transient failure which in theory could occur if
		 * in-flight packets from before XDP was enabled reach
		 * the receive path after XDP is loaded. In practice I
		 * was not able to create this condition.
		 */
575
		if (unlikely(hdr->hdr.gso_type))
J
John Fastabend 已提交
576 577
			goto err_xdp;

578
		act = do_xdp_prog(vi, rq, xdp_prog, xdp_page, offset, len);
J
John Fastabend 已提交
579 580
		switch (act) {
		case XDP_PASS:
581 582 583 584 585 586
			/* We can only create skb based on xdp_page. */
			if (unlikely(xdp_page != page)) {
				rcu_read_unlock();
				put_page(page);
				head_skb = page_to_skb(vi, rq, xdp_page,
						       0, len, PAGE_SIZE);
587
				ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
588 589
				return head_skb;
			}
J
John Fastabend 已提交
590 591
			break;
		case XDP_TX:
592
			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
593 594
			if (unlikely(xdp_page != page))
				goto err_xdp;
J
John Fastabend 已提交
595 596 597 598
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_DROP:
		default:
599 600
			if (unlikely(xdp_page != page))
				__free_pages(xdp_page, 0);
601
			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
J
John Fastabend 已提交
602
			goto err_xdp;
J
John Fastabend 已提交
603
		}
J
John Fastabend 已提交
604 605
	}
	rcu_read_unlock();
606

J
John Fastabend 已提交
607 608 609
	truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
	curr_skb = head_skb;
610

611 612
	if (unlikely(!curr_skb))
		goto err_skb;
613
	while (--num_buf) {
614 615
		int num_skb_frags;

616 617
		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!ctx)) {
618
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
M
Michael S. Tsirkin 已提交
619
				 dev->name, num_buf,
620 621
				 virtio16_to_cpu(vi->vdev,
						 hdr->num_buffers));
622 623
			dev->stats.rx_length_errors++;
			goto err_buf;
624
		}
625

626
		buf = mergeable_ctx_to_buf_address(ctx);
627 628 629
		page = virt_to_head_page(buf);

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
630 631
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
632 633 634

			if (unlikely(!nskb))
				goto err_skb;
635 636 637 638 639 640 641 642
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
643
		truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
644 645 646
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
647
			head_skb->truesize += truesize;
648
		}
649
		offset = buf - page_address(page);
650 651 652
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
653
					     len, truesize);
654 655
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
656
					offset, len, truesize);
657
		}
658 659
	}

J
Johannes Berg 已提交
660
	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
661 662
	return head_skb;

J
John Fastabend 已提交
663 664
err_xdp:
	rcu_read_unlock();
665 666 667
err_skb:
	put_page(page);
	while (--num_buf) {
668 669
		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!ctx)) {
670 671 672 673 674
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
675
		page = virt_to_head_page(mergeable_ctx_to_buf_address(ctx));
676
		put_page(page);
677
	}
678 679 680
err_buf:
	dev->stats.rx_dropped++;
	dev_kfree_skb(head_skb);
J
John Fastabend 已提交
681
xdp_xmit:
682
	return NULL;
683 684
}

M
Michael S. Tsirkin 已提交
685 686
static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
			void *buf, unsigned int len)
687
{
688
	struct net_device *dev = vi->dev;
E
Eric Dumazet 已提交
689
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
690
	struct sk_buff *skb;
691
	struct virtio_net_hdr_mrg_rxbuf *hdr;
692

693
	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
694 695
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
696 697 698 699 700
		if (vi->mergeable_rx_bufs) {
			unsigned long ctx = (unsigned long)buf;
			void *base = mergeable_ctx_to_buf_address(ctx);
			put_page(virt_to_head_page(base));
		} else if (vi->big_packets) {
701
			give_pages(rq, buf);
702
		} else {
703
			dev_kfree_skb(buf);
704
		}
705 706
		return;
	}
707

708
	if (vi->mergeable_rx_bufs)
M
Michael S. Tsirkin 已提交
709
		skb = receive_mergeable(dev, vi, rq, (unsigned long)buf, len);
710
	else if (vi->big_packets)
M
Michael S. Tsirkin 已提交
711
		skb = receive_big(dev, vi, rq, buf, len);
712
	else
713
		skb = receive_small(vi, buf, len);
714 715 716

	if (unlikely(!skb))
		return;
717

718
	hdr = skb_vnet_hdr(skb);
719

720
	u64_stats_update_begin(&stats->rx_syncp);
721 722
	stats->rx_bytes += skb->len;
	stats->rx_packets++;
723
	u64_stats_update_end(&stats->rx_syncp);
R
Rusty Russell 已提交
724

725
	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
726
		skb->ip_summed = CHECKSUM_UNNECESSARY;
R
Rusty Russell 已提交
727

728 729 730 731 732 733
	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
				  virtio_is_little_endian(vi->vdev))) {
		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
				     dev->name, hdr->hdr.gso_type,
				     hdr->hdr.gso_size);
		goto frame_err;
R
Rusty Russell 已提交
734 735
	}

736 737 738 739
	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

E
Eric Dumazet 已提交
740
	napi_gro_receive(&rq->napi, skb);
R
Rusty Russell 已提交
741 742 743 744 745 746 747
	return;

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
}

M
Michael S. Tsirkin 已提交
748 749
static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
			     gfp_t gfp)
R
Rusty Russell 已提交
750 751
{
	struct sk_buff *skb;
752
	struct virtio_net_hdr_mrg_rxbuf *hdr;
753
	int err;
754

755
	skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp);
756 757
	if (unlikely(!skb))
		return -ENOMEM;
R
Rusty Russell 已提交
758

759
	skb_put(skb, GOOD_PACKET_LEN);
760

761
	hdr = skb_vnet_hdr(skb);
762
	sg_init_table(rq->sg, 2);
763
	sg_set_buf(rq->sg, hdr, vi->hdr_len);
764
	skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
765

766
	err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
767 768
	if (err < 0)
		dev_kfree_skb(skb);
769

770 771
	return err;
}
772

773 774
static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
			   gfp_t gfp)
775 776 777 778 779
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

780 781
	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);

782
	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
783
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
784
		first = get_a_page(rq, gfp);
785 786
		if (!first) {
			if (list)
787
				give_pages(rq, list);
788
			return -ENOMEM;
789
		}
790
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
791

792 793 794 795
		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}
R
Rusty Russell 已提交
796

797
	first = get_a_page(rq, gfp);
798
	if (!first) {
799
		give_pages(rq, list);
800 801 802 803
		return -ENOMEM;
	}
	p = page_address(first);

804
	/* rq->sg[0], rq->sg[1] share the same page */
805 806
	/* a separated rq->sg[0] for header - required in case !any_header_sg */
	sg_set_buf(&rq->sg[0], p, vi->hdr_len);
807

808
	/* rq->sg[1] for data packet, from offset */
809
	offset = sizeof(struct padded_vnet_hdr);
810
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
811 812 813

	/* chain first in list head */
	first->private = (unsigned long)list;
814 815
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
816
	if (err < 0)
817
		give_pages(rq, first);
818 819

	return err;
R
Rusty Russell 已提交
820 821
}

J
Johannes Berg 已提交
822
static unsigned int get_mergeable_buf_len(struct ewma_pkt_len *avg_pkt_len)
823
{
824
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
825 826
	unsigned int len;

J
Johannes Berg 已提交
827
	len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
828 829 830 831 832 833
			GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
	return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
}
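
/* Example for get_mergeable_buf_len(), assuming 4 KiB pages: the mergeable
 * header is 12 bytes and the EWMA average is clamped to at least
 * GOOD_PACKET_LEN (1518), so an average of ~1400 bytes gives 12 + 1518 = 1530,
 * which ALIGN() rounds up to a 1536-byte buffer; the upper clamp guarantees
 * the buffer never exceeds one page.
 */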

static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
{
834 835
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
836
	unsigned long ctx;
837
	int err;
838
	unsigned int len, hole;
839

840
	len = get_mergeable_buf_len(&rq->mrg_avg_pkt_len);
841
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
842
		return -ENOMEM;
843

844
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
845
	ctx = mergeable_buf_to_ctx(buf, len);
846 847 848
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	hole = alloc_frag->size - alloc_frag->offset;
849 850 851 852 853 854
	if (hole < len) {
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
		 * the current buffer. This extra space is not included in
		 * the truesize stored in ctx.
		 */
855 856 857
		len += hole;
		alloc_frag->offset += hole;
	}
858

859
	sg_init_one(rq->sg, buf, len);
860
	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, (void *)ctx, gfp);
861
	if (err < 0)
862
		put_page(virt_to_head_page(buf));
863

864 865
	return err;
}
866

867 868 869 870 871 872 873
/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
M
Michael S. Tsirkin 已提交
874 875
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
876 877
{
	int err;
878
	bool oom;
879

880
	gfp |= __GFP_COLD;
881 882
	do {
		if (vi->mergeable_rx_bufs)
883
			err = add_recvbuf_mergeable(rq, gfp);
884
		else if (vi->big_packets)
885
			err = add_recvbuf_big(vi, rq, gfp);
886
		else
M
Michael S. Tsirkin 已提交
887
			err = add_recvbuf_small(vi, rq, gfp);
888

889
		oom = err == -ENOMEM;
890
		if (err)
891
			break;
892
	} while (rq->vq->num_free);
893
	virtqueue_kick(rq->vq);
894
	return !oom;
895 896
}

897
static void skb_recv_done(struct virtqueue *rvq)
R
Rusty Russell 已提交
898 899
{
	struct virtnet_info *vi = rvq->vdev->priv;
J
Jason Wang 已提交
900
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
901

902
	/* Schedule NAPI, Suppress further interrupts if successful. */
903
	if (napi_schedule_prep(&rq->napi)) {
904
		virtqueue_disable_cb(rvq);
905
		__napi_schedule(&rq->napi);
906
	}
R
Rusty Russell 已提交
907 908
}

909
static void virtnet_napi_enable(struct receive_queue *rq)
910
{
911
	napi_enable(&rq->napi);
912 913 914 915 916

	/* If all buffers were filled by other side before we napi_enabled, we
	 * won't get another interrupt, so process any outstanding packets
	 * now.  virtnet_poll wants to re-enable the queue, so we disable here.
	 * We synchronize against interrupts via NAPI_STATE_SCHED */
917 918
	if (napi_schedule_prep(&rq->napi)) {
		virtqueue_disable_cb(rq->vq);
919
		local_bh_disable();
920
		__napi_schedule(&rq->napi);
921
		local_bh_enable();
922 923 924
	}
}

925 926
static void refill_work(struct work_struct *work)
{
927 928
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
929
	bool still_empty;
J
Jason Wang 已提交
930 931
	int i;

932
	for (i = 0; i < vi->curr_queue_pairs; i++) {
J
Jason Wang 已提交
933
		struct receive_queue *rq = &vi->rq[i];
934

J
Jason Wang 已提交
935
		napi_disable(&rq->napi);
M
Michael S. Tsirkin 已提交
936
		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
J
Jason Wang 已提交
937
		virtnet_napi_enable(rq);
938

J
Jason Wang 已提交
939 940 941 942 943 944
		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
945 946
}

947
static int virtnet_receive(struct receive_queue *rq, int budget)
R
Rusty Russell 已提交
948
{
949
	struct virtnet_info *vi = rq->vq->vdev->priv;
950
	unsigned int len, received = 0;
951
	void *buf;
R
Rusty Russell 已提交
952 953

	while (received < budget &&
954
	       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
M
Michael S. Tsirkin 已提交
955
		receive_buf(vi, rq, buf, len);
R
Rusty Russell 已提交
956 957 958
		received++;
	}

959
	if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
M
Michael S. Tsirkin 已提交
960
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
961
			schedule_delayed_work(&vi->refill, 0);
962
	}
R
Rusty Russell 已提交
963

964 965 966 967 968 969 970
	return received;
}

static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
971
	unsigned int r, received;
972

973
	received = virtnet_receive(rq, budget);
974

975 976
	/* Out of packets? */
	if (received < budget) {
977
		r = virtqueue_enable_cb_prepare(rq->vq);
E
Eric Dumazet 已提交
978
		napi_complete_done(napi, received);
979
		if (unlikely(virtqueue_poll(rq->vq, r)) &&
980
		    napi_schedule_prep(napi)) {
981
			virtqueue_disable_cb(rq->vq);
982
			__napi_schedule(napi);
983
		}
R
Rusty Russell 已提交
984 985 986 987 988
	}

	return received;
}

J
Jason Wang 已提交
989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025
#ifdef CONFIG_NET_RX_BUSY_POLL
/* must be called with local_bh_disable()d */
static int virtnet_busy_poll(struct napi_struct *napi)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
	struct virtnet_info *vi = rq->vq->vdev->priv;
	int r, received = 0, budget = 4;

	if (!(vi->status & VIRTIO_NET_S_LINK_UP))
		return LL_FLUSH_FAILED;

	if (!napi_schedule_prep(napi))
		return LL_FLUSH_BUSY;

	virtqueue_disable_cb(rq->vq);

again:
	received += virtnet_receive(rq, budget);

	r = virtqueue_enable_cb_prepare(rq->vq);
	clear_bit(NAPI_STATE_SCHED, &napi->state);
	if (unlikely(virtqueue_poll(rq->vq, r)) &&
	    napi_schedule_prep(napi)) {
		virtqueue_disable_cb(rq->vq);
		if (received < budget) {
			budget -= received;
			goto again;
		} else {
			__napi_schedule(napi);
		}
	}

	return received;
}
#endif	/* CONFIG_NET_RX_BUSY_POLL */

J
Jason Wang 已提交
1026 1027 1028 1029 1030
static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

1031 1032 1033
	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
M
Michael S. Tsirkin 已提交
1034
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1035
				schedule_delayed_work(&vi->refill, 0);
J
Jason Wang 已提交
1036 1037 1038 1039 1040 1041
		virtnet_napi_enable(&vi->rq[i]);
	}

	return 0;
}

1042
static void free_old_xmit_skbs(struct send_queue *sq)
R
Rusty Russell 已提交
1043 1044
{
	struct sk_buff *skb;
1045
	unsigned int len;
1046
	struct virtnet_info *vi = sq->vq->vdev->priv;
E
Eric Dumazet 已提交
1047
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
R
Rusty Russell 已提交
1048

1049
	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
R
Rusty Russell 已提交
1050
		pr_debug("Sent skb %p\n", skb);
1051

1052
		u64_stats_update_begin(&stats->tx_syncp);
1053 1054
		stats->tx_bytes += skb->len;
		stats->tx_packets++;
1055
		u64_stats_update_end(&stats->tx_syncp);
1056

1057
		dev_kfree_skb_any(skb);
R
Rusty Russell 已提交
1058 1059 1060
	}
}

1061
static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
R
Rusty Russell 已提交
1062
{
1063
	struct virtio_net_hdr_mrg_rxbuf *hdr;
R
Rusty Russell 已提交
1064
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
1065
	struct virtnet_info *vi = sq->vq->vdev->priv;
1066
	unsigned num_sg;
1067
	unsigned hdr_len = vi->hdr_len;
1068
	bool can_push;
R
Rusty Russell 已提交
1069

J
Johannes Berg 已提交
1070
	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
1071 1072 1073 1074 1075 1076 1077

	can_push = vi->any_header_sg &&
		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
1078
		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
1079 1080
	else
		hdr = skb_vnet_hdr(skb);
R
Rusty Russell 已提交
1081

1082 1083 1084
	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
				    virtio_is_little_endian(vi->vdev)))
		BUG();
R
Rusty Russell 已提交
1085

1086
	if (vi->mergeable_rx_bufs)
1087
		hdr->num_buffers = 0;
1088

1089
	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
1090 1091 1092 1093 1094 1095 1096 1097 1098
	if (can_push) {
		__skb_push(skb, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
		/* Pull header back to avoid skew in tx bytes calculations. */
		__skb_pull(skb, hdr_len);
	} else {
		sg_set_buf(sq->sg, hdr, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
	}
1099
	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
1100 1101
}
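
/* xmit_skb() above uses one of two scatterlist layouts: when the device
 * accepts any header/data split (any_header_sg) and the skb has enough
 * aligned headroom, the virtio header is pushed into the skb head and one
 * buffer covers header plus data; otherwise the header kept in skb->cb gets
 * its own scatterlist entry in front of the packet data.
 */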

1102
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
1103 1104
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1105 1106
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
1107
	int err;
1108 1109
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
	bool kick = !skb->xmit_more;
1110 1111

	/* Free up any pending old buffers before queueing new ones. */
1112
	free_old_xmit_skbs(sq);
1113

1114 1115 1116
	/* timestamp packet in software */
	skb_tx_timestamp(skb);

1117
	/* Try to transmit */
1118
	err = xmit_skb(sq, skb);
1119

1120
	/* This should not happen! */
1121
	if (unlikely(err)) {
1122 1123 1124
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
1125
				 "Unexpected TXQ (%d) queue failure: %d\n", qnum, err);
1126
		dev->stats.tx_dropped++;
1127
		dev_kfree_skb_any(skb);
1128
		return NETDEV_TX_OK;
R
Rusty Russell 已提交
1129
	}
1130

1131 1132 1133 1134
	/* Don't wait up for transmitted skbs to be freed. */
	skb_orphan(skb);
	nf_reset(skb);

1135 1136 1137 1138 1139 1140 1141 1142 1143
	/* If running out of space, stop queue to avoid getting packets that we
	 * are then unable to transmit.
	 * An alternative would be to force queuing layer to requeue the skb by
	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
	 * returned in a normal path of operation: it means that driver is not
	 * maintaining the TX queue stop/start state properly, and causes
	 * the stack to do a non-trivial amount of useless work.
	 * Since most packets only take 1 or 2 ring slots, stopping the queue
	 * early means 16 slots are typically wasted.
1144
	 */
1145
	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1146
		netif_stop_subqueue(dev, qnum);
1147
		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
1148
			/* More just got used, free them then recheck. */
1149 1150
			free_old_xmit_skbs(sq);
			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1151
				netif_start_subqueue(dev, qnum);
1152
				virtqueue_disable_cb(sq->vq);
1153 1154
			}
		}
1155
	}
1156

1157
	if (kick || netif_xmit_stopped(txq))
1158
		virtqueue_kick(sq->vq);
R
Rusty Russell 已提交
1159

1160
	return NETDEV_TX_OK;
1161 1162
}

1163 1164 1165
/*
 * Send command via the control virtqueue and check status.  Commands
 * supported by the hypervisor, as indicated by feature bits, should
S
stephen hemminger 已提交
1166
 * never fail unless improperly formatted.
1167 1168
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
1169
				 struct scatterlist *out)
1170
{
1171
	struct scatterlist *sgs[4], hdr, stat;
1172
	unsigned out_num = 0, tmp;
1173 1174

	/* Caller should know better */
1175
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
1176

1177 1178 1179
	vi->ctrl_status = ~0;
	vi->ctrl_hdr.class = class;
	vi->ctrl_hdr.cmd = cmd;
1180
	/* Add header */
1181
	sg_init_one(&hdr, &vi->ctrl_hdr, sizeof(vi->ctrl_hdr));
1182
	sgs[out_num++] = &hdr;
1183

1184 1185
	if (out)
		sgs[out_num++] = out;
1186

1187
	/* Add return status. */
1188
	sg_init_one(&stat, &vi->ctrl_status, sizeof(vi->ctrl_status));
1189
	sgs[out_num] = &stat;
1190

1191
	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
1192
	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
1193

1194
	if (unlikely(!virtqueue_kick(vi->cvq)))
1195
		return vi->ctrl_status == VIRTIO_NET_OK;
1196 1197 1198 1199

	/* Spin for a response, the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
1200 1201
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
1202 1203
		cpu_relax();

1204
	return vi->ctrl_status == VIRTIO_NET_OK;
1205 1206
}
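
/* virtnet_send_command() above builds each control request as up to three
 * scatterlist entries: the command header, an optional caller-supplied "out"
 * buffer, and a one-byte status that the device writes back.  The caller then
 * busy-waits for completion, which is acceptable since these are rare,
 * rtnl-protected configuration operations.
 */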

1207 1208 1209 1210
static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
1211
	int ret;
1212
	struct sockaddr *addr;
1213
	struct scatterlist sg;
1214

1215 1216 1217 1218 1219 1220
	addr = kmalloc(sizeof(*addr), GFP_KERNEL);
	if (!addr)
		return -ENOMEM;
	memcpy(addr, p, sizeof(*addr));

	ret = eth_prepare_mac_addr_change(dev, addr);
1221
	if (ret)
1222
		goto out;
1223

1224 1225 1226
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
		sg_init_one(&sg, addr->sa_data, dev->addr_len);
		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1227
					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
1228 1229
			dev_warn(&vdev->dev,
				 "Failed to set mac address by vq command.\n");
1230 1231
			ret = -EINVAL;
			goto out;
1232
		}
1233 1234
	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1235 1236 1237 1238 1239 1240 1241
		unsigned int i;

		/* Naturally, this has an atomicity problem. */
		for (i = 0; i < dev->addr_len; i++)
			virtio_cwrite8(vdev,
				       offsetof(struct virtio_net_config, mac) +
				       i, addr->sa_data[i]);
1242 1243 1244
	}

	eth_commit_mac_addr_change(dev, p);
1245
	ret = 0;
1246

1247 1248 1249
out:
	kfree(addr);
	return ret;
1250 1251
}

1252 1253 1254 1255 1256 1257 1258 1259
static struct rtnl_link_stats64 *virtnet_stats(struct net_device *dev,
					       struct rtnl_link_stats64 *tot)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int cpu;
	unsigned int start;

	for_each_possible_cpu(cpu) {
E
Eric Dumazet 已提交
1260
		struct virtnet_stats *stats = per_cpu_ptr(vi->stats, cpu);
1261 1262 1263
		u64 tpackets, tbytes, rpackets, rbytes;

		do {
1264
			start = u64_stats_fetch_begin_irq(&stats->tx_syncp);
1265 1266
			tpackets = stats->tx_packets;
			tbytes   = stats->tx_bytes;
1267
		} while (u64_stats_fetch_retry_irq(&stats->tx_syncp, start));
1268 1269

		do {
1270
			start = u64_stats_fetch_begin_irq(&stats->rx_syncp);
1271 1272
			rpackets = stats->rx_packets;
			rbytes   = stats->rx_bytes;
1273
		} while (u64_stats_fetch_retry_irq(&stats->rx_syncp, start));
1274 1275 1276 1277 1278 1279 1280 1281

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes   += rbytes;
		tot->tx_bytes   += tbytes;
	}

	tot->tx_dropped = dev->stats.tx_dropped;
1282
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
1283 1284 1285 1286 1287 1288 1289
	tot->rx_dropped = dev->stats.rx_dropped;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;

	return tot;
}

1290 1291 1292 1293
#ifdef CONFIG_NET_POLL_CONTROLLER
static void virtnet_netpoll(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1294
	int i;
1295

J
Jason Wang 已提交
1296 1297
	for (i = 0; i < vi->curr_queue_pairs; i++)
		napi_schedule(&vi->rq[i].napi);
1298 1299 1300
}
#endif

1301 1302 1303 1304
static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
1305
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
1306 1307 1308 1309
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}

J
Jason Wang 已提交
1310 1311 1312 1313 1314 1315 1316 1317
static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	struct scatterlist sg;
	struct net_device *dev = vi->dev;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

1318 1319
	vi->ctrl_mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
	sg_init_one(&sg, &vi->ctrl_mq, sizeof(vi->ctrl_mq));
J
Jason Wang 已提交
1320 1321

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
1322
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
J
Jason Wang 已提交
1323 1324 1325
		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
1326
	} else {
J
Jason Wang 已提交
1327
		vi->curr_queue_pairs = queue_pairs;
1328 1329 1330
		/* virtnet_open() will refill when device is going to up. */
		if (dev->flags & IFF_UP)
			schedule_delayed_work(&vi->refill, 0);
1331
	}
J
Jason Wang 已提交
1332 1333 1334 1335

	return 0;
}

R
Rusty Russell 已提交
1336 1337 1338
static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1339
	int i;
R
Rusty Russell 已提交
1340

1341 1342
	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);
J
Jason Wang 已提交
1343 1344 1345

	for (i = 0; i < vi->max_queue_pairs; i++)
		napi_disable(&vi->rq[i].napi);
R
Rusty Russell 已提交
1346 1347 1348 1349

	return 0;
}

1350 1351 1352
static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
1353 1354
	struct scatterlist sg[2];
	struct virtio_net_ctrl_mac *mac_data;
J
Jiri Pirko 已提交
1355
	struct netdev_hw_addr *ha;
1356
	int uc_count;
1357
	int mc_count;
1358 1359
	void *buf;
	int i;
1360

S
stephen hemminger 已提交
1361
	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
1362 1363 1364
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

1365 1366
	vi->ctrl_promisc = ((dev->flags & IFF_PROMISC) != 0);
	vi->ctrl_allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
1367

1368
	sg_init_one(sg, &vi->ctrl_promisc, sizeof(vi->ctrl_promisc));
1369 1370

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
1371
				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
1372
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
1373
			 vi->ctrl_promisc ? "en" : "dis");
1374

1375
	sg_init_one(sg, &vi->ctrl_allmulti, sizeof(vi->ctrl_allmulti));
1376 1377

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
1378
				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
1379
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
1380
			 vi->ctrl_allmulti ? "en" : "dis");
1381

1382
	uc_count = netdev_uc_count(dev);
1383
	mc_count = netdev_mc_count(dev);
1384
	/* MAC filter - use one buffer for both lists */
1385 1386 1387
	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
	mac_data = buf;
1388
	if (!buf)
1389 1390
		return;

1391 1392
	sg_init_table(sg, 2);

1393
	/* Store the unicast list and count in the front of the buffer */
M
Michael S. Tsirkin 已提交
1394
	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
J
Jiri Pirko 已提交
1395
	i = 0;
1396
	netdev_for_each_uc_addr(ha, dev)
J
Jiri Pirko 已提交
1397
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
1398 1399

	sg_set_buf(&sg[0], mac_data,
1400
		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));
1401 1402

	/* multicast list and count fill the end */
1403
	mac_data = (void *)&mac_data->macs[uc_count][0];
1404

M
Michael S. Tsirkin 已提交
1405
	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
1406
	i = 0;
1407 1408
	netdev_for_each_mc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
1409 1410

	sg_set_buf(&sg[1], mac_data,
1411
		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));
1412 1413

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1414
				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
1415
		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");
1416 1417

	kfree(buf);
1418 1419
}

1420 1421
static int virtnet_vlan_rx_add_vid(struct net_device *dev,
				   __be16 proto, u16 vid)
1422 1423 1424 1425
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

1426 1427
	vi->ctrl_vid = vid;
	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));
1428 1429

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1430
				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
1431
		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
1432
	return 0;
1433 1434
}

1435 1436
static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
				    __be16 proto, u16 vid)
1437 1438 1439 1440
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

1441 1442
	vi->ctrl_vid = vid;
	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));
1443 1444

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1445
				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
1446
		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
1447
	return 0;
1448 1449
}

1450
static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu)
J
Jason Wang 已提交
1451 1452 1453
{
	int i;

1454 1455
	if (vi->affinity_hint_set) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
1456 1457 1458 1459
			virtqueue_set_affinity(vi->rq[i].vq, -1);
			virtqueue_set_affinity(vi->sq[i].vq, -1);
		}

1460 1461 1462
		vi->affinity_hint_set = false;
	}
}
1463

1464 1465 1466 1467
static void virtnet_set_affinity(struct virtnet_info *vi)
{
	int i;
	int cpu;
J
Jason Wang 已提交
1468 1469 1470 1471 1472

	/* In multiqueue mode, when the number of CPUs is equal to the number of
	 * queue pairs, we let each queue pair be private to one CPU by
	 * setting the affinity hint to eliminate the contention.
	 */
1473 1474 1475 1476
	if (vi->curr_queue_pairs == 1 ||
	    vi->max_queue_pairs != num_online_cpus()) {
		virtnet_clean_affinity(vi, -1);
		return;
J
Jason Wang 已提交
1477 1478
	}

1479 1480
	i = 0;
	for_each_online_cpu(cpu) {
J
Jason Wang 已提交
1481 1482
		virtqueue_set_affinity(vi->rq[i].vq, cpu);
		virtqueue_set_affinity(vi->sq[i].vq, cpu);
1483
		netif_set_xps_queue(vi->dev, cpumask_of(cpu), i);
1484
		i++;
J
Jason Wang 已提交
1485 1486
	}

1487
	vi->affinity_hint_set = true;
J
Jason Wang 已提交
1488 1489
}

1490
static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
1491
{
1492 1493 1494 1495 1496
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);
	virtnet_set_affinity(vi);
	return 0;
}
1497

1498 1499 1500 1501 1502 1503 1504
static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node_dead);
	virtnet_set_affinity(vi);
	return 0;
}
1505

1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536
static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);

	virtnet_clean_affinity(vi, cpu);
	return 0;
}

static enum cpuhp_state virtionet_online;

static int virtnet_cpu_notif_add(struct virtnet_info *vi)
{
	int ret;

	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
	if (ret)
		return ret;
	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					       &vi->node_dead);
	if (!ret)
		return ret;
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	return ret;
}

static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
{
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					    &vi->node_dead);
J
Jason Wang 已提交
1537 1538
}

R
Rick Jones 已提交
1539 1540 1541 1542 1543
static void virtnet_get_ringparam(struct net_device *dev,
				struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

J
Jason Wang 已提交
1544 1545
	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
R
Rick Jones 已提交
1546 1547 1548 1549
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}

1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562

static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));

}

1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576
/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

1577
	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
1578 1579
		return -EINVAL;

J
John Fastabend 已提交
1580 1581 1582 1583 1584 1585 1586
	/* For now we don't support modifying channels while XDP is loaded.
	 * Also, when XDP is loaded all RX queues have XDP programs, so we only
	 * need to check a single RX queue.
	 */
	if (vi->rq[0].xdp_prog)
		return -EINVAL;

1587
	get_online_cpus();
1588 1589 1590 1591 1592
	err = virtnet_set_queues(vi, queue_pairs);
	if (!err) {
		netif_set_real_num_tx_queues(dev, queue_pairs);
		netif_set_real_num_rx_queues(dev, queue_pairs);

1593
		virtnet_set_affinity(vi);
1594
	}
1595
	put_online_cpus();
1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612

	return err;
}

static void virtnet_get_channels(struct net_device *dev,
				 struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);

	channels->combined_count = vi->curr_queue_pairs;
	channels->max_combined = vi->max_queue_pairs;
	channels->max_other = 0;
	channels->rx_count = 0;
	channels->tx_count = 0;
	channels->other_count = 0;
}

1613 1614 1615 1616 1617 1618
/* Check if the user is trying to change anything besides speed/duplex */
static bool virtnet_validate_ethtool_cmd(const struct ethtool_cmd *cmd)
{
	struct ethtool_cmd diff1 = *cmd;
	struct ethtool_cmd diff2 = {};

1619 1620 1621
	/* cmd is always set, so we need to clear it; validate the port type,
	 * and without autonegotiation we can also ignore advertising
	 */
1622
	ethtool_cmd_speed_set(&diff1, 0);
1623
	diff2.port = PORT_OTHER;
1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666
	diff1.advertising = 0;
	diff1.duplex = 0;
	diff1.cmd = 0;

	return !memcmp(&diff1, &diff2, sizeof(diff1));
}

static int virtnet_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u32 speed;

	speed = ethtool_cmd_speed(cmd);
	/* don't allow custom speed and duplex */
	if (!ethtool_validate_speed(speed) ||
	    !ethtool_validate_duplex(cmd->duplex) ||
	    !virtnet_validate_ethtool_cmd(cmd))
		return -EINVAL;
	vi->speed = speed;
	vi->duplex = cmd->duplex;

	return 0;
}

static int virtnet_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ethtool_cmd_speed_set(cmd, vi->speed);
	cmd->duplex = vi->duplex;
	cmd->port = PORT_OTHER;

	return 0;
}

static void virtnet_init_settings(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	vi->speed = SPEED_UNKNOWN;
	vi->duplex = DUPLEX_UNKNOWN;
}

1667
static const struct ethtool_ops virtnet_ethtool_ops = {
1668
	.get_drvinfo = virtnet_get_drvinfo,
1669
	.get_link = ethtool_op_get_link,
R
Rick Jones 已提交
1670
	.get_ringparam = virtnet_get_ringparam,
1671 1672
	.set_channels = virtnet_set_channels,
	.get_channels = virtnet_get_channels,
1673
	.get_ts_info = ethtool_op_get_ts_info,
1674 1675
	.get_settings = virtnet_get_settings,
	.set_settings = virtnet_set_settings,
1676 1677
};

J
John Fastabend 已提交
1678 1679 1680 1681 1682
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
{
	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
	struct virtnet_info *vi = netdev_priv(dev);
	struct bpf_prog *old_prog;
1683 1684
	u16 xdp_qp = 0, curr_qp;
	int i, err;
J
John Fastabend 已提交
1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701

	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6)) {
		netdev_warn(dev, "can't set XDP while host is implementing LRO, disable LRO first\n");
		return -EOPNOTSUPP;
	}

	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
		netdev_warn(dev, "XDP expects header/data in single page, any_header_sg required\n");
		return -EINVAL;
	}

	if (dev->mtu > max_sz) {
		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
		return -EINVAL;
	}

1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718
	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
	if (prog)
		xdp_qp = nr_cpu_ids;

	/* XDP requires extra queues for XDP_TX */
	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
		netdev_warn(dev, "request %i queues but max is %i\n",
			    curr_qp + xdp_qp, vi->max_queue_pairs);
		return -ENOMEM;
	}

	err = virtnet_set_queues(vi, curr_qp + xdp_qp);
	if (err) {
		dev_warn(&dev->dev, "XDP Device queue allocation failure.\n");
		return err;
	}

J
John Fastabend 已提交
1719 1720
	if (prog) {
		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
1721 1722
		if (IS_ERR(prog)) {
			virtnet_set_queues(vi, curr_qp);
J
John Fastabend 已提交
1723
			return PTR_ERR(prog);
1724
		}
J
John Fastabend 已提交
1725 1726
	}

1727 1728 1729
	vi->xdp_queue_pairs = xdp_qp;
	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);

J
John Fastabend 已提交
1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764
	for (i = 0; i < vi->max_queue_pairs; i++) {
		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
		if (old_prog)
			bpf_prog_put(old_prog);
	}

	return 0;
}

static bool virtnet_xdp_query(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (vi->rq[i].xdp_prog)
			return true;
	}
	return false;
}

static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return virtnet_xdp_set(dev, xdp->prog);
	case XDP_QUERY_PROG:
		xdp->prog_attached = virtnet_xdp_query(dev);
		return 0;
	default:
		return -EINVAL;
	}
}

static const struct net_device_ops virtnet_netdev = {
	.ndo_open            = virtnet_open,
	.ndo_stop            = virtnet_close,
	.ndo_start_xmit      = start_xmit,
	.ndo_validate_addr   = eth_validate_addr,
	.ndo_set_mac_address = virtnet_set_mac_address,
	.ndo_set_rx_mode     = virtnet_set_rx_mode,
	.ndo_get_stats64     = virtnet_stats,
	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = virtnet_netpoll,
#endif
#ifdef CONFIG_NET_RX_BUSY_POLL
	.ndo_busy_poll		= virtnet_busy_poll,
#endif
	.ndo_xdp		= virtnet_xdp,
};

static void virtnet_config_changed_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, config_work);
	u16 v;

	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
				 struct virtio_net_config, status, &v) < 0)
		return;

	if (v & VIRTIO_NET_S_ANNOUNCE) {
		netdev_notify_peers(vi->dev);
		virtnet_ack_link_announce(vi);
	}

	/* Ignore unknown (future) status bits */
	v &= VIRTIO_NET_S_LINK_UP;

	if (vi->status == v)
		return;

	vi->status = v;

	if (vi->status & VIRTIO_NET_S_LINK_UP) {
		netif_carrier_on(vi->dev);
		netif_tx_wake_all_queues(vi->dev);
	} else {
		netif_carrier_off(vi->dev);
		netif_tx_stop_all_queues(vi->dev);
	}
}

static void virtnet_config_changed(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	schedule_work(&vi->config_work);
}

static void virtnet_free_queues(struct virtnet_info *vi)
{
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		napi_hash_del(&vi->rq[i].napi);
		netif_napi_del(&vi->rq[i].napi);
	}

	/* We called napi_hash_del() before netif_napi_del(), so we must
	 * respect an RCU grace period before freeing vi->rq.
	 */
	synchronize_net();

	kfree(vi->rq);
	kfree(vi->sq);
}

static void free_receive_bufs(struct virtnet_info *vi)
{
	struct bpf_prog *old_prog;
	int i;

	rtnl_lock();
	for (i = 0; i < vi->max_queue_pairs; i++) {
		while (vi->rq[i].pages)
			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);

		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
		if (old_prog)
			bpf_prog_put(old_prog);
	}
	rtnl_unlock();
}

static void free_receive_page_frags(struct virtnet_info *vi)
{
	int i;
	for (i = 0; i < vi->max_queue_pairs; i++)
		if (vi->rq[i].alloc_frag.page)
			put_page(vi->rq[i].alloc_frag.page);
}

static bool is_xdp_queue(struct virtnet_info *vi, int q)
{
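	/* XDP_TX send queues occupy the tail of the active range: indices in
	 * [curr_queue_pairs - xdp_queue_pairs, curr_queue_pairs) belong to
	 * XDP, anything below that range is a normal stack-owned send queue.
	 */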
	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
		return false;
	else if (q < vi->curr_queue_pairs)
		return true;
	else
		return false;
}

static void free_unused_bufs(struct virtnet_info *vi)
{
	void *buf;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->sq[i].vq;
		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (!is_xdp_queue(vi, i))
				dev_kfree_skb(buf);
			else
				put_page(virt_to_head_page(buf));
		}
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->rq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
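			/* For mergeable buffers the ring does not hold a raw
			 * pointer but an encoded context; recover the buffer
			 * address before dropping the page reference.
			 */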
			if (vi->mergeable_rx_bufs) {
				unsigned long ctx = (unsigned long)buf;
				void *base = mergeable_ctx_to_buf_address(ctx);
				put_page(virt_to_head_page(base));
			} else if (vi->big_packets) {
				give_pages(&vi->rq[i], buf);
			} else {
				dev_kfree_skb(buf);
			}
		}
	}
}

static void virtnet_del_vqs(struct virtnet_info *vi)
{
	struct virtio_device *vdev = vi->vdev;

	virtnet_clean_affinity(vi, -1);

	vdev->config->del_vqs(vdev);

	virtnet_free_queues(vi);
}

static int virtnet_find_vqs(struct virtnet_info *vi)
{
	vq_callback_t **callbacks;
	struct virtqueue **vqs;
	int ret = -ENOMEM;
	int i, total_vqs;
	const char **names;

	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
	 * possible control vq.
	 */
	total_vqs = vi->max_queue_pairs * 2 +
		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
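	/* E.g. with max_queue_pairs == 4 and a control queue this gives
	 * total_vqs == 9: vqs 0/1 back rx0/tx0, vqs 2/3 back rx1/tx1, and so
	 * on (see rxq2vq()/txq2vq()), with the control virtqueue last.
	 */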

	/* Allocate space for find_vqs parameters */
	vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
	if (!vqs)
		goto err_vq;
	callbacks = kmalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
	if (!callbacks)
		goto err_callback;
	names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);
	if (!names)
		goto err_names;

	/* Parameters for control virtqueue, if any */
	if (vi->has_cvq) {
		callbacks[total_vqs - 1] = NULL;
		names[total_vqs - 1] = "control";
	}

	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
	}

	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
					 names);
	if (ret)
		goto err_find;

	if (vi->has_cvq) {
		vi->cvq = vqs[total_vqs - 1];
		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].vq = vqs[rxq2vq(i)];
		vi->sq[i].vq = vqs[txq2vq(i)];
	}

	kfree(names);
	kfree(callbacks);
	kfree(vqs);

	return 0;

err_find:
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;
}

static int virtnet_alloc_queues(struct virtnet_info *vi)
{
	int i;

	vi->sq = kzalloc(sizeof(*vi->sq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->sq)
		goto err_sq;
	vi->rq = kzalloc(sizeof(*vi->rq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->rq)
		goto err_rq;

	INIT_DELAYED_WORK(&vi->refill, refill_work);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].pages = NULL;
		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
			       napi_weight);

		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
	}

	return 0;

err_rq:
	kfree(vi->sq);
err_sq:
	return -ENOMEM;
}

static int init_vqs(struct virtnet_info *vi)
{
	int ret;

	/* Allocate send & receive queues */
	ret = virtnet_alloc_queues(vi);
	if (ret)
		goto err;

	ret = virtnet_find_vqs(vi);
	if (ret)
		goto err_free;

	get_online_cpus();
	virtnet_set_affinity(vi);
	put_online_cpus();
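	/* virtnet_set_affinity() above spreads queue interrupts across the
	 * online CPUs, which is why it runs under get/put_online_cpus() to
	 * keep that set stable against hotplug.
	 */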

	return 0;

err_free:
	virtnet_free_queues(vi);
err:
	return ret;
}

#ifdef CONFIG_SYSFS
static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
		struct rx_queue_attribute *attribute, char *buf)
{
	struct virtnet_info *vi = netdev_priv(queue->dev);
	unsigned int queue_index = get_netdev_rx_queue_index(queue);
	struct ewma_pkt_len *avg;

	BUG_ON(queue_index >= vi->max_queue_pairs);
	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
	return sprintf(buf, "%u\n", get_mergeable_buf_len(avg));
}
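/* The attribute below is grouped under "virtio_net", so it should surface as
 * /sys/class/net/<dev>/queues/rx-<n>/virtio_net/mergeable_rx_buffer_size,
 * reporting get_mergeable_buf_len() for that queue's running average packet
 * length.
 */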

static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
	__ATTR_RO(mergeable_rx_buffer_size);

static struct attribute *virtio_net_mrg_rx_attrs[] = {
	&mergeable_rx_buffer_size_attribute.attr,
	NULL
};

static const struct attribute_group virtio_net_mrg_rx_group = {
	.name = "virtio_net",
	.attrs = virtio_net_mrg_rx_attrs
};
#endif

static bool virtnet_fail_on_feature(struct virtio_device *vdev,
				    unsigned int fbit,
				    const char *fname, const char *dname)
{
	if (!virtio_has_feature(vdev, fbit))
		return false;

	dev_err(&vdev->dev, "device advertises feature %s but not %s",
		fname, dname);

	return true;
}

#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)

static bool virtnet_validate_features(struct virtio_device *vdev)
{
	if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
	    (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
			     "VIRTIO_NET_F_CTRL_VQ"))) {
		return false;
	}

	return true;
}

#define MIN_MTU ETH_MIN_MTU
#define MAX_MTU ETH_MAX_MTU

static int virtnet_probe(struct virtio_device *vdev)
{
	int i, err;
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;
	int mtu;

	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

	if (!virtnet_validate_features(vdev))
		return -EINVAL;

	/* Find if host supports multiqueue virtio_net device */
	err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
				   struct virtio_net_config,
				   max_virtqueue_pairs, &max_queue_pairs);

	/* We need at least 2 queues */
	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;

	/* Allocate ourselves a network device with room for our info */
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
	dev->netdev_ops = &virtnet_netdev;
	dev->features = NETIF_F_HIGHDMA;

	dev->ethtool_ops = &virtnet_ethtool_ops;
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
		/* This opens up the world of extra features. */
		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
		if (csum)
			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
			dev->hw_features |= NETIF_F_TSO | NETIF_F_UFO
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
		/* Individual feature bits: what can host handle? */
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UFO))
			dev->hw_features |= NETIF_F_UFO;

		dev->features |= NETIF_F_GSO_ROBUST;

		if (gso)
			dev->features |= dev->hw_features & (NETIF_F_ALL_TSO|NETIF_F_UFO);
		/* (!csum && gso) case will be fixed by register_netdev() */
	}
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
		dev->features |= NETIF_F_RXCSUM;

	dev->vlan_features = dev->features;

	/* MTU range: 68 - 65535 */
	dev->min_mtu = MIN_MTU;
	dev->max_mtu = MAX_MTU;

	/* Configuration may specify what MAC to use.  Otherwise random. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
		virtio_cread_bytes(vdev,
				   offsetof(struct virtio_net_config, mac),
				   dev->dev_addr, dev->addr_len);
	else
		eth_hw_addr_random(dev);

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
	vdev->priv = vi;
	vi->stats = alloc_percpu(struct virtnet_stats);
	err = -ENOMEM;
	if (vi->stats == NULL)
		goto free;

	for_each_possible_cpu(i) {
		struct virtnet_stats *virtnet_stats;
		virtnet_stats = per_cpu_ptr(vi->stats, i);
		u64_stats_init(&virtnet_stats->tx_syncp);
		u64_stats_init(&virtnet_stats->rx_syncp);
	}

	INIT_WORK(&vi->config_work, virtnet_config_changed_work);

	/* If we can receive ANY GSO packets, we must allocate large ones. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
		vi->big_packets = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		vi->hdr_len = sizeof(struct virtio_net_hdr);
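	/* The mergeable/VERSION_1 layout (struct virtio_net_hdr_mrg_rxbuf)
	 * carries an extra num_buffers count, hence the larger hdr_len in
	 * that case.
	 */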

	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->any_header_sg = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		mtu = virtio_cread16(vdev,
				     offsetof(struct virtio_net_config,
					      mtu));
		if (mtu < dev->min_mtu) {
			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
		} else {
			dev->mtu = mtu;
			dev->max_mtu = mtu;
		}
	}

	if (vi->any_header_sg)
		dev->needed_headroom = vi->hdr_len;

	/* Enable multiqueue by default */
	if (num_online_cpus() >= max_queue_pairs)
		vi->curr_queue_pairs = max_queue_pairs;
	else
		vi->curr_queue_pairs = num_online_cpus();
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
	err = init_vqs(vi);
	if (err)
		goto free_stats;

#ifdef CONFIG_SYSFS
	if (vi->mergeable_rx_bufs)
		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
#endif
	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);

	virtnet_init_settings(dev);

	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
		goto free_vqs;
	}

	virtio_device_ready(vdev);

	err = virtnet_cpu_notif_add(vi);
	if (err) {
		pr_debug("virtio_net: registering cpu notifier failed\n");
		goto free_unregister_netdev;
	}

	rtnl_lock();
	virtnet_set_queues(vi, vi->curr_queue_pairs);
	rtnl_unlock();

	/* Assume link up if device can't report link status,
	 * otherwise get link status from config.
	 */
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		netif_carrier_off(dev);
		schedule_work(&vi->config_work);
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
		netif_carrier_on(dev);
	}

	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
		 dev->name, max_queue_pairs);

	return 0;

free_unregister_netdev:
	vi->vdev->config->reset(vdev);

	unregister_netdev(dev);
free_vqs:
	cancel_delayed_work_sync(&vi->refill);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
free_stats:
	free_percpu(vi->stats);
free:
	free_netdev(dev);
	return err;
}

static void remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);

	/* Free unused buffers in both send and recv, if any. */
	free_unused_bufs(vi);

	free_receive_bufs(vi);

	free_receive_page_frags(vi);

	virtnet_del_vqs(vi);
}

static void virtnet_remove(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device. */
	flush_work(&vi->config_work);

	unregister_netdev(vi->dev);

	remove_vq_common(vi);

	free_percpu(vi->stats);
	free_netdev(vi->dev);
}

#ifdef CONFIG_PM_SLEEP
static int virtnet_freeze(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int i;

	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device */
	flush_work(&vi->config_work);

	netif_device_detach(vi->dev);
	cancel_delayed_work_sync(&vi->refill);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++)
			napi_disable(&vi->rq[i].napi);
	}

	remove_vq_common(vi);

	return 0;
}

static int virtnet_restore(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err, i;

	err = init_vqs(vi);
	if (err)
		return err;

	virtio_device_ready(vdev);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->curr_queue_pairs; i++)
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		for (i = 0; i < vi->max_queue_pairs; i++)
			virtnet_napi_enable(&vi->rq[i]);
	}

	netif_device_attach(vi->dev);

	rtnl_lock();
	virtnet_set_queues(vi, vi->curr_queue_pairs);
	rtnl_unlock();

	err = virtnet_cpu_notif_add(vi);
	if (err)
		return err;

	return 0;
}
#endif

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

#define VIRTNET_FEATURES \
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
	VIRTIO_NET_F_MAC, \
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
	VIRTIO_NET_F_CTRL_MAC_ADDR, \
	VIRTIO_NET_F_MTU

static unsigned int features[] = {
	VIRTNET_FEATURES,
};

static unsigned int features_legacy[] = {
	VIRTNET_FEATURES,
	VIRTIO_NET_F_GSO,
	VIRTIO_F_ANY_LAYOUT,
};
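/* Both tables share VIRTNET_FEATURES; the legacy table, used only when the
 * device does not negotiate VIRTIO 1.0, additionally offers VIRTIO_NET_F_GSO
 * and VIRTIO_F_ANY_LAYOUT.
 */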

static struct virtio_driver virtio_net_driver = {
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
	.feature_table_legacy = features_legacy,
	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
	.probe =	virtnet_probe,
	.remove =	virtnet_remove,
	.config_changed = virtnet_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze =	virtnet_freeze,
	.restore =	virtnet_restore,
#endif
};

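/* Module init registers two CPU hotplug multi-instance states before the
 * virtio driver itself: a dynamically allocated online state (kept in
 * virtionet_online, serviced by virtnet_cpu_online/virtnet_cpu_down_prep)
 * and CPUHP_VIRT_NET_DEAD (serviced by virtnet_cpu_dead). On failure the
 * states registered so far are removed again in reverse order.
 */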
static __init int virtio_net_driver_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "AP_VIRT_NET_ONLINE",
				      virtnet_cpu_online,
				      virtnet_cpu_down_prep);
	if (ret < 0)
		goto out;
	virtionet_online = ret;
	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "VIRT_NET_DEAD",
				      NULL, virtnet_cpu_dead);
	if (ret)
		goto err_dead;

	ret = register_virtio_driver(&virtio_net_driver);
	if (ret)
		goto err_virtio;
	return 0;
err_virtio:
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
err_dead:
	cpuhp_remove_multi_state(virtionet_online);
out:
	return ret;
}
module_init(virtio_net_driver_init);

static __exit void virtio_net_driver_exit(void)
{
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
	cpuhp_remove_multi_state(virtionet_online);
	unregister_virtio_driver(&virtio_net_driver);
}
module_exit(virtio_net_driver_exit);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");