/* A network driver using virtio.
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/scatterlist.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/average.h>

static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);

static bool csum = true, gso = true;
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);

/* FIXME: MTU in config. */
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
#define GOOD_COPY_LEN	128
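/* On Ethernet, GOOD_PACKET_LEN works out to 14 + 4 + 1500 = 1518 bytes, i.e.
 * room for a full-sized frame plus a VLAN tag.  GOOD_COPY_LEN is roughly how
 * much of a received packet is copied into the skb head (see page_to_skb());
 * anything beyond that is attached as page fragments.
 */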

/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
 */
DECLARE_EWMA(pkt_len, 1, 64)
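/* The trailing (1, 64) arguments to DECLARE_EWMA appear to select the
 * average's fixed-point precision and the reciprocal of its weight, so a new
 * sample should only move the average by about 1/64th of the difference.
 */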
50

/* With mergeable buffers we align buffer address and use the low bits to
 * encode its true size. Buffer size is up to 1 page so we need to align to
 * square root of page size to ensure we reserve enough bits to encode the true
 * size.
 */
#define MERGEABLE_BUFFER_MIN_ALIGN_SHIFT ((PAGE_SHIFT + 1) / 2)

/* Minimum alignment for mergeable packet buffers. */
#define MERGEABLE_BUFFER_ALIGN max(L1_CACHE_BYTES, \
				   1 << MERGEABLE_BUFFER_MIN_ALIGN_SHIFT)
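/* For example, with 4 KiB pages (PAGE_SHIFT == 12) the shift above is
 * (12 + 1) / 2 == 6, so mergeable buffers are aligned to at least 64 bytes
 * and the low bits of a buffer address are free to carry the true size.
 */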

#define VIRTNET_DRIVER_VERSION "1.0.0"

struct virtnet_stats {
	struct u64_stats_sync tx_syncp;
	struct u64_stats_sync rx_syncp;
	u64 tx_bytes;
	u64 tx_packets;

	u64 rx_bytes;
	u64 rx_packets;
};
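/* The two u64_stats_sync members above let virtnet_stats() read each 64-bit
 * counter pair consistently (fetch_begin/fetch_retry) even on 32-bit systems
 * where the updates are not atomic.
 */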

/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of the send queue: output.$index */
	char name[40];
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

	struct napi_struct napi;

	struct bpf_prog __rcu *xdp_prog;

	/* Chain pages by the private ptr. */
	struct page *pages;

	/* Average packet length for mergeable receive buffers. */
	struct ewma_pkt_len mrg_avg_pkt_len;

	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of this receive queue: input.$index */
	char name[40];
};

struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
	struct send_queue *sq;
	struct receive_queue *rq;
	unsigned int status;

	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

	/* # of XDP queue pairs currently used by the driver */
	u16 xdp_queue_pairs;

	/* I like... big packets and I cannot lie! */
	bool big_packets;

	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

	/* Has control virtqueue */
	bool has_cvq;

	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

	/* Packet virtio header size */
	u8 hdr_len;

	/* Active statistics */
	struct virtnet_stats __percpu *stats;

	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

	/* Work struct for config space updates */
	struct work_struct config_work;

	/* Is the affinity hint set for the virtqueues? */
	bool affinity_hint_set;

	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;

	/* Control VQ buffers: protected by the rtnl lock */
	struct virtio_net_ctrl_hdr ctrl_hdr;
	virtio_net_ctrl_ack ctrl_status;
	struct virtio_net_ctrl_mq ctrl_mq;
	u8 ctrl_promisc;
	u8 ctrl_allmulti;
	u16 ctrl_vid;

	/* Ethtool settings */
	u8 duplex;
	u32 speed;
};

struct padded_vnet_hdr {
	struct virtio_net_hdr_mrg_rxbuf hdr;
	/*
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header.
	 */
	char padding[4];
};
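/* sizeof(struct virtio_net_hdr_mrg_rxbuf) is 12 bytes, so the 4 bytes of
 * padding round the header buffer up to 16 bytes, which is what keeps the
 * following data sg entry 16-byte aligned.
 */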

/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
	return (vq->index - 1) / 2;
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
	return vq->index / 2;
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}
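/* Worked example: with two queue pairs, virtqueue index 3 is tx1 since
 * vq2txq() computes (3 - 1) / 2 == 1, and txq2vq(1) == 3 is its inverse;
 * virtqueue 4 would then be the control queue.
 */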

static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
{
	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
}

/*
 * private is used to chain pages for big packets, put the whole
 * most recent used list in the beginning for reuse
 */
static void give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end;

	/* Find end of list, sew whole thing into vi->rq.pages. */
	for (end = page; end->private; end = (struct page *)end->private);
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}

static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
{
	struct page *p = rq->pages;

	if (p) {
		rq->pages = (struct page *)p->private;
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
		p = alloc_page(gfp_mask);
	return p;
}

static void skb_xmit_done(struct virtqueue *vq)
{
	struct virtnet_info *vi = vq->vdev->priv;

	/* Suppress further interrupts. */
	virtqueue_disable_cb(vq);

	/* We were probably waiting for more output buffers. */
	netif_wake_subqueue(vi->dev, vq2txq(vq));
}

static unsigned int mergeable_ctx_to_buf_truesize(unsigned long mrg_ctx)
{
	unsigned int truesize = mrg_ctx & (MERGEABLE_BUFFER_ALIGN - 1);
	return (truesize + 1) * MERGEABLE_BUFFER_ALIGN;
}

static void *mergeable_ctx_to_buf_address(unsigned long mrg_ctx)
{
	return (void *)(mrg_ctx & -MERGEABLE_BUFFER_ALIGN);

}

static unsigned long mergeable_buf_to_ctx(void *buf, unsigned int truesize)
{
	unsigned int size = truesize / MERGEABLE_BUFFER_ALIGN;
	return (unsigned long)buf | (size - 1);
}
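/* Worked example, assuming MERGEABLE_BUFFER_ALIGN is 64: a buffer with
 * truesize 1536 is encoded as 1536 / 64 - 1 == 23 in the low bits of the
 * context, mergeable_ctx_to_buf_truesize() recovers (23 + 1) * 64 == 1536,
 * and masking with -64 restores the aligned buffer address.
 */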

/* Called from bottom half context */
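/* Build an skb from a received page: the virtio-net header plus roughly
 * GOOD_COPY_LEN bytes of data are copied into a freshly allocated skb and
 * anything left over is attached as page fragments (chained pages in the
 * big-packets case).
 */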
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
				   struct receive_queue *rq,
				   struct page *page, unsigned int offset,
				   unsigned int len, unsigned int truesize)
{
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	unsigned int copy, hdr_len, hdr_padded_len;
	char *p;

	p = page_address(page) + offset;

	/* copy small packet so we can reuse these pages for small data */
	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
	if (unlikely(!skb))
		return NULL;

	hdr = skb_vnet_hdr(skb);

	hdr_len = vi->hdr_len;
	if (vi->mergeable_rx_bufs)
		hdr_padded_len = sizeof *hdr;
	else
		hdr_padded_len = sizeof(struct padded_vnet_hdr);

	memcpy(hdr, p, hdr_len);

	len -= hdr_len;
	offset += hdr_padded_len;
	p += hdr_padded_len;

	copy = len;
	if (copy > skb_tailroom(skb))
		copy = skb_tailroom(skb);
	memcpy(skb_put(skb, copy), p, copy);

	len -= copy;
	offset += copy;

	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
			put_page(page);
		return skb;
	}

	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
		dev_kfree_skb(skb);
		return NULL;
	}
	BUG_ON(offset >= PAGE_SIZE);
	while (len) {
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
		page = (struct page *)page->private;
		offset = 0;
	}

	if (page)
		give_pages(rq, page);

	return skb;
}

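/* Transmit a frame that an XDP program returned XDP_TX for.  Completed XDP
 * transmissions are reclaimed first, then the frame is queued on one of the
 * send queues reserved for XDP and the virtqueue is kicked.
 */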
static bool virtnet_xdp_xmit(struct virtnet_info *vi,
			     struct receive_queue *rq,
			     struct send_queue *sq,
			     struct xdp_buff *xdp,
			     void *data)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	unsigned int num_sg, len;
	void *xdp_sent;
	int err;

	/* Free up any pending old buffers before queueing new ones. */
	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		if (vi->mergeable_rx_bufs) {
			struct page *sent_page = virt_to_head_page(xdp_sent);

			put_page(sent_page);
		} else { /* small buffer */
			struct sk_buff *skb = xdp_sent;

			kfree_skb(skb);
		}
	}

	if (vi->mergeable_rx_bufs) {
		/* Zero header and leave csum up to XDP layers */
		hdr = xdp->data;
		memset(hdr, 0, vi->hdr_len);

		num_sg = 1;
		sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
	} else { /* small buffer */
		struct sk_buff *skb = data;

		/* Zero header and leave csum up to XDP layers */
		hdr = skb_vnet_hdr(skb);
		memset(hdr, 0, vi->hdr_len);

		num_sg = 2;
		sg_init_table(sq->sg, 2);
		sg_set_buf(sq->sg, hdr, vi->hdr_len);
		skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
	}
	err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
				   data, GFP_ATOMIC);
	if (unlikely(err)) {
		if (vi->mergeable_rx_bufs) {
			struct page *page = virt_to_head_page(xdp->data);

			put_page(page);
		} else /* small buffer */
			kfree_skb(data);
		/* On error abort to avoid unnecessary kick */
		return false;
	}

	virtqueue_kick(sq->vq);
	return true;
}

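/* Run the attached XDP program over one received buffer.  XDP_TX frames are
 * sent out here via virtnet_xdp_xmit(); for every other verdict the caller
 * acts on the returned code (pass the buffer up the stack or drop it).
 */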
static u32 do_xdp_prog(struct virtnet_info *vi,
		       struct receive_queue *rq,
		       struct bpf_prog *xdp_prog,
		       void *data, int len)
{
	int hdr_padded_len;
	struct xdp_buff xdp;
	void *buf;
	unsigned int qp;
	u32 act;

	if (vi->mergeable_rx_bufs) {
		hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
		xdp.data = data + hdr_padded_len;
		xdp.data_end = xdp.data + (len - vi->hdr_len);
		buf = data;
	} else { /* small buffers */
		struct sk_buff *skb = data;

		xdp.data = skb->data;
		xdp.data_end = xdp.data + len;
		buf = skb->data;
	}

	act = bpf_prog_run_xdp(xdp_prog, &xdp);
	switch (act) {
	case XDP_PASS:
		return XDP_PASS;
	case XDP_TX:
		qp = vi->curr_queue_pairs -
			vi->xdp_queue_pairs +
			smp_processor_id();
		xdp.data = buf;
		if (unlikely(!virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp,
					       data)))
			trace_xdp_exception(vi->dev, xdp_prog, act);
		return XDP_TX;
	default:
		bpf_warn_invalid_xdp_action(act);
	case XDP_ABORTED:
		trace_xdp_exception(vi->dev, xdp_prog, act);
	case XDP_DROP:
		return XDP_DROP;
	}
}

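/* Receive path for the default small-buffer layout: the buffer handed back by
 * the virtqueue is already an skb, so it is trimmed to the length reported by
 * the device and then run through any attached XDP program.
 */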
static struct sk_buff *receive_small(struct net_device *dev,
				     struct virtnet_info *vi,
				     struct receive_queue *rq,
				     void *buf, unsigned int len)
{
	struct sk_buff * skb = buf;
	struct bpf_prog *xdp_prog;

	len -= vi->hdr_len;
	skb_trim(skb, len);

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
		u32 act;

		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
			goto err_xdp;
		act = do_xdp_prog(vi, rq, xdp_prog, skb, len);
		switch (act) {
		case XDP_PASS:
			break;
		case XDP_TX:
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_DROP:
		default:
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	return skb;

err_xdp:
	rcu_read_unlock();
	dev->stats.rx_dropped++;
	kfree_skb(skb);
xdp_xmit:
	return NULL;
}

static struct sk_buff *receive_big(struct net_device *dev,
				   struct virtnet_info *vi,
				   struct receive_queue *rq,
				   void *buf,
				   unsigned int len)
{
	struct page *page = buf;
	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);

	if (unlikely(!skb))
		goto err;

	return skb;

err:
	dev->stats.rx_dropped++;
	give_pages(rq, page);
	return NULL;
}

/* The conditions to enable XDP should preclude the underlying device from
 * sending packets across multiple buffers (num_buf > 1). However per spec
 * it does not appear to be illegal to do so but rather just against convention.
 * So in order to avoid making a system unresponsive the packets are pushed
 * into a page and the XDP program is run. This will be extremely slow and we
 * push a warning to the user to fix this as soon as possible. Fixing this may
 * require resolving the underlying hardware to determine why multiple buffers
 * are being received or simply loading the XDP program in the ingress stack
 * after the skb is built because there is no advantage to running it here
 * anymore.
 */
static struct page *xdp_linearize_page(struct receive_queue *rq,
				       u16 *num_buf,
				       struct page *p,
				       int offset,
				       unsigned int *len)
{
	struct page *page = alloc_page(GFP_ATOMIC);
	unsigned int page_off = 0;

	if (!page)
		return NULL;

	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
	page_off += *len;

	while (--*num_buf) {
		unsigned int buflen;
		unsigned long ctx;
		void *buf;
		int off;

		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &buflen);
		if (unlikely(!ctx))
			goto err_buf;

		buf = mergeable_ctx_to_buf_address(ctx);
		p = virt_to_head_page(buf);
		off = buf - page_address(p);

		/* guard against a misconfigured or uncooperative backend that
		 * is sending packet larger than the MTU.
		 */
		if ((page_off + buflen) > PAGE_SIZE) {
			put_page(p);
			goto err_buf;
		}

		memcpy(page_address(page) + page_off,
		       page_address(p) + off, buflen);
		page_off += buflen;
		put_page(p);
	}

	*len = page_off;
	return page;
err_buf:
	__free_pages(page, 0);
	return NULL;
}

571
static struct sk_buff *receive_mergeable(struct net_device *dev,
M
Michael S. Tsirkin 已提交
572
					 struct virtnet_info *vi,
573
					 struct receive_queue *rq,
574
					 unsigned long ctx,
575
					 unsigned int len)
576
{
577
	void *buf = mergeable_ctx_to_buf_address(ctx);
578 579
	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
580 581
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
J
John Fastabend 已提交
582 583 584 585
	struct sk_buff *head_skb, *curr_skb;
	struct bpf_prog *xdp_prog;
	unsigned int truesize;

J
John Fastabend 已提交
586 587
	head_skb = NULL;

J
John Fastabend 已提交
588 589 590
	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
591
		struct page *xdp_page;
J
John Fastabend 已提交
592 593
		u32 act;

594
		/* This happens when rx buffer size is underestimated */
J
John Fastabend 已提交
595
		if (unlikely(num_buf > 1)) {
596
			/* linearize data for XDP */
597
			xdp_page = xdp_linearize_page(rq, &num_buf,
598 599 600 601 602 603
						      page, offset, &len);
			if (!xdp_page)
				goto err_xdp;
			offset = 0;
		} else {
			xdp_page = page;
J
John Fastabend 已提交
604 605 606 607 608 609 610
		}

		/* Transient failure which in theory could occur if
		 * in-flight packets from before XDP was enabled reach
		 * the receive path after XDP is loaded. In practice I
		 * was not able to create this condition.
		 */
611
		if (unlikely(hdr->hdr.gso_type))
J
John Fastabend 已提交
612 613
			goto err_xdp;

614 615
		act = do_xdp_prog(vi, rq, xdp_prog,
				  page_address(xdp_page) + offset, len);
J
John Fastabend 已提交
616 617
		switch (act) {
		case XDP_PASS:
618 619 620 621 622 623
			/* We can only create skb based on xdp_page. */
			if (unlikely(xdp_page != page)) {
				rcu_read_unlock();
				put_page(page);
				head_skb = page_to_skb(vi, rq, xdp_page,
						       0, len, PAGE_SIZE);
624
				ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
625 626
				return head_skb;
			}
J
John Fastabend 已提交
627 628
			break;
		case XDP_TX:
629
			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
630 631
			if (unlikely(xdp_page != page))
				goto err_xdp;
J
John Fastabend 已提交
632 633 634 635
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_DROP:
		default:
636 637
			if (unlikely(xdp_page != page))
				__free_pages(xdp_page, 0);
638
			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
J
John Fastabend 已提交
639
			goto err_xdp;
J
John Fastabend 已提交
640
		}
J
John Fastabend 已提交
641 642
	}
	rcu_read_unlock();
643

J
John Fastabend 已提交
644 645 646
	truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
	curr_skb = head_skb;
647

648 649
	if (unlikely(!curr_skb))
		goto err_skb;
650
	while (--num_buf) {
651 652
		int num_skb_frags;

653 654
		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!ctx)) {
655
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
M
Michael S. Tsirkin 已提交
656
				 dev->name, num_buf,
657 658
				 virtio16_to_cpu(vi->vdev,
						 hdr->num_buffers));
659 660
			dev->stats.rx_length_errors++;
			goto err_buf;
661
		}
662

663
		buf = mergeable_ctx_to_buf_address(ctx);
664 665 666
		page = virt_to_head_page(buf);

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
667 668
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
669 670 671

			if (unlikely(!nskb))
				goto err_skb;
672 673 674 675 676 677 678 679
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
680
		truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
681 682 683
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
684
			head_skb->truesize += truesize;
685
		}
686
		offset = buf - page_address(page);
687 688 689
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
690
					     len, truesize);
691 692
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
693
					offset, len, truesize);
694
		}
695 696
	}

J
Johannes Berg 已提交
697
	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
698 699
	return head_skb;

J
John Fastabend 已提交
700 701
err_xdp:
	rcu_read_unlock();
702 703 704
err_skb:
	put_page(page);
	while (--num_buf) {
705 706
		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!ctx)) {
707 708 709 710 711
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
712
		page = virt_to_head_page(mergeable_ctx_to_buf_address(ctx));
713
		put_page(page);
714
	}
715 716 717
err_buf:
	dev->stats.rx_dropped++;
	dev_kfree_skb(head_skb);
J
John Fastabend 已提交
718
xdp_xmit:
719
	return NULL;
720 721
}

M
Michael S. Tsirkin 已提交
722 723
static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
			void *buf, unsigned int len)
724
{
725
	struct net_device *dev = vi->dev;
E
Eric Dumazet 已提交
726
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
727
	struct sk_buff *skb;
728
	struct virtio_net_hdr_mrg_rxbuf *hdr;
729

730
	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
731 732
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
733 734 735 736 737
		if (vi->mergeable_rx_bufs) {
			unsigned long ctx = (unsigned long)buf;
			void *base = mergeable_ctx_to_buf_address(ctx);
			put_page(virt_to_head_page(base));
		} else if (vi->big_packets) {
738
			give_pages(rq, buf);
739
		} else {
740
			dev_kfree_skb(buf);
741
		}
742 743
		return;
	}
744

745
	if (vi->mergeable_rx_bufs)
M
Michael S. Tsirkin 已提交
746
		skb = receive_mergeable(dev, vi, rq, (unsigned long)buf, len);
747
	else if (vi->big_packets)
M
Michael S. Tsirkin 已提交
748
		skb = receive_big(dev, vi, rq, buf, len);
749
	else
750
		skb = receive_small(dev, vi, rq, buf, len);
751 752 753

	if (unlikely(!skb))
		return;
754

755
	hdr = skb_vnet_hdr(skb);
756

757
	u64_stats_update_begin(&stats->rx_syncp);
758 759
	stats->rx_bytes += skb->len;
	stats->rx_packets++;
760
	u64_stats_update_end(&stats->rx_syncp);
R
Rusty Russell 已提交
761

762
	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
763
		skb->ip_summed = CHECKSUM_UNNECESSARY;
R
Rusty Russell 已提交
764

765 766 767 768 769 770
	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
				  virtio_is_little_endian(vi->vdev))) {
		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
				     dev->name, hdr->hdr.gso_type,
				     hdr->hdr.gso_size);
		goto frame_err;
R
Rusty Russell 已提交
771 772
	}

773 774 775 776
	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

E
Eric Dumazet 已提交
777
	napi_gro_receive(&rq->napi, skb);
R
Rusty Russell 已提交
778 779 780 781 782 783 784
	return;

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
}

M
Michael S. Tsirkin 已提交
785 786
static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
			     gfp_t gfp)
R
Rusty Russell 已提交
787 788
{
	struct sk_buff *skb;
789
	struct virtio_net_hdr_mrg_rxbuf *hdr;
790
	int err;
791

792
	skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp);
793 794
	if (unlikely(!skb))
		return -ENOMEM;
R
Rusty Russell 已提交
795

796
	skb_put(skb, GOOD_PACKET_LEN);
797

798
	hdr = skb_vnet_hdr(skb);
799
	sg_init_table(rq->sg, 2);
800
	sg_set_buf(rq->sg, hdr, vi->hdr_len);
801
	skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
802

803
	err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
804 805
	if (err < 0)
		dev_kfree_skb(skb);
806

807 808
	return err;
}
809

810 811
static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
			   gfp_t gfp)
812 813 814 815 816
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

817 818
	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);

819
	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
820
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
821
		first = get_a_page(rq, gfp);
822 823
		if (!first) {
			if (list)
824
				give_pages(rq, list);
825
			return -ENOMEM;
826
		}
827
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
828

829 830 831 832
		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}
R
Rusty Russell 已提交
833

834
	first = get_a_page(rq, gfp);
835
	if (!first) {
836
		give_pages(rq, list);
837 838 839 840
		return -ENOMEM;
	}
	p = page_address(first);

841
	/* rq->sg[0], rq->sg[1] share the same page */
842 843
	/* a separated rq->sg[0] for header - required in case !any_header_sg */
	sg_set_buf(&rq->sg[0], p, vi->hdr_len);
844

845
	/* rq->sg[1] for data packet, from offset */
846
	offset = sizeof(struct padded_vnet_hdr);
847
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
848 849 850

	/* chain first in list head */
	first->private = (unsigned long)list;
851 852
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
853
	if (err < 0)
854
		give_pages(rq, first);
855 856

	return err;
R
Rusty Russell 已提交
857 858
}

J
Johannes Berg 已提交
859
static unsigned int get_mergeable_buf_len(struct ewma_pkt_len *avg_pkt_len)
860
{
861
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
862 863
	unsigned int len;

J
Johannes Berg 已提交
864
	len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
865 866 867 868 869 870
			GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
	return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
}

static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
{
871 872
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
873
	unsigned long ctx;
874
	int err;
875
	unsigned int len, hole;
876

877
	len = get_mergeable_buf_len(&rq->mrg_avg_pkt_len);
878
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
879
		return -ENOMEM;
880

881
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
882
	ctx = mergeable_buf_to_ctx(buf, len);
883 884 885
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	hole = alloc_frag->size - alloc_frag->offset;
886 887 888 889 890 891
	if (hole < len) {
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
		 * the current buffer. This extra space is not included in
		 * the truesize stored in ctx.
		 */
892 893 894
		len += hole;
		alloc_frag->offset += hole;
	}
895

896
	sg_init_one(rq->sg, buf, len);
897
	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, (void *)ctx, gfp);
898
	if (err < 0)
899
		put_page(virt_to_head_page(buf));
900

901 902
	return err;
}
903

904 905 906 907 908 909 910
/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
M
Michael S. Tsirkin 已提交
911 912
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
913 914
{
	int err;
915
	bool oom;
916

917
	gfp |= __GFP_COLD;
918 919
	do {
		if (vi->mergeable_rx_bufs)
920
			err = add_recvbuf_mergeable(rq, gfp);
921
		else if (vi->big_packets)
922
			err = add_recvbuf_big(vi, rq, gfp);
923
		else
M
Michael S. Tsirkin 已提交
924
			err = add_recvbuf_small(vi, rq, gfp);
925

926
		oom = err == -ENOMEM;
927
		if (err)
928
			break;
929
	} while (rq->vq->num_free);
930
	virtqueue_kick(rq->vq);
931
	return !oom;
932 933
}

934
static void skb_recv_done(struct virtqueue *rvq)
R
Rusty Russell 已提交
935 936
{
	struct virtnet_info *vi = rvq->vdev->priv;
J
Jason Wang 已提交
937
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
938

939
	/* Schedule NAPI, Suppress further interrupts if successful. */
940
	if (napi_schedule_prep(&rq->napi)) {
941
		virtqueue_disable_cb(rvq);
942
		__napi_schedule(&rq->napi);
943
	}
R
Rusty Russell 已提交
944 945
}

946
static void virtnet_napi_enable(struct receive_queue *rq)
947
{
948
	napi_enable(&rq->napi);
949 950 951 952 953

	/* If all buffers were filled by other side before we napi_enabled, we
	 * won't get another interrupt, so process any outstanding packets
	 * now.  virtnet_poll wants re-enable the queue, so we disable here.
	 * We synchronize against interrupts via NAPI_STATE_SCHED */
954 955
	if (napi_schedule_prep(&rq->napi)) {
		virtqueue_disable_cb(rq->vq);
956
		local_bh_disable();
957
		__napi_schedule(&rq->napi);
958
		local_bh_enable();
959 960 961
	}
}

962 963
static void refill_work(struct work_struct *work)
{
964 965
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
966
	bool still_empty;
J
Jason Wang 已提交
967 968
	int i;

969
	for (i = 0; i < vi->curr_queue_pairs; i++) {
J
Jason Wang 已提交
970
		struct receive_queue *rq = &vi->rq[i];
971

J
Jason Wang 已提交
972
		napi_disable(&rq->napi);
M
Michael S. Tsirkin 已提交
973
		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
J
Jason Wang 已提交
974
		virtnet_napi_enable(rq);
975

J
Jason Wang 已提交
976 977 978 979 980 981
		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
982 983
}

984
static int virtnet_receive(struct receive_queue *rq, int budget)
R
Rusty Russell 已提交
985
{
986
	struct virtnet_info *vi = rq->vq->vdev->priv;
987
	unsigned int len, received = 0;
988
	void *buf;
R
Rusty Russell 已提交
989 990

	while (received < budget &&
991
	       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
M
Michael S. Tsirkin 已提交
992
		receive_buf(vi, rq, buf, len);
R
Rusty Russell 已提交
993 994 995
		received++;
	}

996
	if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
M
Michael S. Tsirkin 已提交
997
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
998
			schedule_delayed_work(&vi->refill, 0);
999
	}
R
Rusty Russell 已提交
1000

1001 1002 1003 1004 1005 1006 1007
	return received;
}

static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
1008
	unsigned int r, received;
1009

1010
	received = virtnet_receive(rq, budget);
1011

1012 1013
	/* Out of packets? */
	if (received < budget) {
1014
		r = virtqueue_enable_cb_prepare(rq->vq);
1015 1016 1017 1018 1019 1020
		if (napi_complete_done(napi, received)) {
			if (unlikely(virtqueue_poll(rq->vq, r)) &&
			    napi_schedule_prep(napi)) {
				virtqueue_disable_cb(rq->vq);
				__napi_schedule(napi);
			}
1021
		}
R
Rusty Russell 已提交
1022 1023 1024 1025 1026
	}

	return received;
}

J
Jason Wang 已提交
1027 1028 1029 1030 1031
static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

1032 1033 1034
	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
M
Michael S. Tsirkin 已提交
1035
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1036
				schedule_delayed_work(&vi->refill, 0);
J
Jason Wang 已提交
1037 1038 1039 1040 1041 1042
		virtnet_napi_enable(&vi->rq[i]);
	}

	return 0;
}

1043
static void free_old_xmit_skbs(struct send_queue *sq)
R
Rusty Russell 已提交
1044 1045
{
	struct sk_buff *skb;
1046
	unsigned int len;
1047
	struct virtnet_info *vi = sq->vq->vdev->priv;
E
Eric Dumazet 已提交
1048
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
R
Rusty Russell 已提交
1049

1050
	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
R
Rusty Russell 已提交
1051
		pr_debug("Sent skb %p\n", skb);
1052

1053
		u64_stats_update_begin(&stats->tx_syncp);
1054 1055
		stats->tx_bytes += skb->len;
		stats->tx_packets++;
1056
		u64_stats_update_end(&stats->tx_syncp);
1057

1058
		dev_kfree_skb_any(skb);
R
Rusty Russell 已提交
1059 1060 1061
	}
}

1062
static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
R
Rusty Russell 已提交
1063
{
1064
	struct virtio_net_hdr_mrg_rxbuf *hdr;
R
Rusty Russell 已提交
1065
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
1066
	struct virtnet_info *vi = sq->vq->vdev->priv;
1067
	unsigned num_sg;
1068
	unsigned hdr_len = vi->hdr_len;
1069
	bool can_push;
R
Rusty Russell 已提交
1070

J
Johannes Berg 已提交
1071
	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
1072 1073 1074 1075 1076 1077 1078

	can_push = vi->any_header_sg &&
		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
1079
		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
1080 1081
	else
		hdr = skb_vnet_hdr(skb);
R
Rusty Russell 已提交
1082

1083
	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
1084
				    virtio_is_little_endian(vi->vdev), false))
1085
		BUG();
R
Rusty Russell 已提交
1086

1087
	if (vi->mergeable_rx_bufs)
1088
		hdr->num_buffers = 0;
1089

1090
	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
1091 1092 1093 1094 1095 1096 1097 1098 1099
	if (can_push) {
		__skb_push(skb, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
		/* Pull header back to avoid skew in tx bytes calculations. */
		__skb_pull(skb, hdr_len);
	} else {
		sg_set_buf(sq->sg, hdr, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
	}
1100
	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
1101 1102
}

1103
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
1104 1105
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1106 1107
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
1108
	int err;
1109 1110
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
	bool kick = !skb->xmit_more;
1111 1112

	/* Free up any pending old buffers before queueing new ones. */
1113
	free_old_xmit_skbs(sq);
1114

1115 1116 1117
	/* timestamp packet in software */
	skb_tx_timestamp(skb);

1118
	/* Try to transmit */
1119
	err = xmit_skb(sq, skb);
1120

1121
	/* This should not happen! */
1122
	if (unlikely(err)) {
1123 1124 1125
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
1126
				 "Unexpected TXQ (%d) queue failure: %d\n", qnum, err);
1127
		dev->stats.tx_dropped++;
1128
		dev_kfree_skb_any(skb);
1129
		return NETDEV_TX_OK;
R
Rusty Russell 已提交
1130
	}
1131

1132 1133 1134 1135
	/* Don't wait up for transmitted skbs to be freed. */
	skb_orphan(skb);
	nf_reset(skb);

1136 1137 1138 1139 1140 1141 1142 1143 1144
	/* If running out of space, stop queue to avoid getting packets that we
	 * are then unable to transmit.
	 * An alternative would be to force queuing layer to requeue the skb by
	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
	 * returned in a normal path of operation: it means that driver is not
	 * maintaining the TX queue stop/start state properly, and causes
	 * the stack to do a non-trivial amount of useless work.
	 * Since most packets only take 1 or 2 ring slots, stopping the queue
	 * early means 16 slots are typically wasted.
1145
	 */
1146
	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1147
		netif_stop_subqueue(dev, qnum);
1148
		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
1149
			/* More just got used, free them then recheck. */
1150 1151
			free_old_xmit_skbs(sq);
			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1152
				netif_start_subqueue(dev, qnum);
1153
				virtqueue_disable_cb(sq->vq);
1154 1155
			}
		}
1156
	}
1157

1158
	if (kick || netif_xmit_stopped(txq))
1159
		virtqueue_kick(sq->vq);
R
Rusty Russell 已提交
1160

1161
	return NETDEV_TX_OK;
1162 1163
}

1164 1165 1166
/*
 * Send command via the control virtqueue and check status.  Commands
 * supported by the hypervisor, as indicated by feature bits, should
S
stephen hemminger 已提交
1167
 * never fail unless improperly formatted.
1168 1169
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
1170
				 struct scatterlist *out)
1171
{
1172
	struct scatterlist *sgs[4], hdr, stat;
1173
	unsigned out_num = 0, tmp;
1174 1175

	/* Caller should know better */
1176
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
1177

1178 1179 1180
	vi->ctrl_status = ~0;
	vi->ctrl_hdr.class = class;
	vi->ctrl_hdr.cmd = cmd;
1181
	/* Add header */
1182
	sg_init_one(&hdr, &vi->ctrl_hdr, sizeof(vi->ctrl_hdr));
1183
	sgs[out_num++] = &hdr;
1184

1185 1186
	if (out)
		sgs[out_num++] = out;
1187

1188
	/* Add return status. */
1189
	sg_init_one(&stat, &vi->ctrl_status, sizeof(vi->ctrl_status));
1190
	sgs[out_num] = &stat;
1191

1192
	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
1193
	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
1194

1195
	if (unlikely(!virtqueue_kick(vi->cvq)))
1196
		return vi->ctrl_status == VIRTIO_NET_OK;
1197 1198 1199 1200

	/* Spin for a response, the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
1201 1202
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
1203 1204
		cpu_relax();

1205
	return vi->ctrl_status == VIRTIO_NET_OK;
1206 1207
}

1208 1209 1210 1211
static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
1212
	int ret;
1213
	struct sockaddr *addr;
1214
	struct scatterlist sg;
1215

1216
	addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
1217 1218 1219 1220
	if (!addr)
		return -ENOMEM;

	ret = eth_prepare_mac_addr_change(dev, addr);
1221
	if (ret)
1222
		goto out;
1223

1224 1225 1226
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
		sg_init_one(&sg, addr->sa_data, dev->addr_len);
		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1227
					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
1228 1229
			dev_warn(&vdev->dev,
				 "Failed to set mac address by vq command.\n");
1230 1231
			ret = -EINVAL;
			goto out;
1232
		}
1233 1234
	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1235 1236 1237 1238 1239 1240 1241
		unsigned int i;

		/* Naturally, this has an atomicity problem. */
		for (i = 0; i < dev->addr_len; i++)
			virtio_cwrite8(vdev,
				       offsetof(struct virtio_net_config, mac) +
				       i, addr->sa_data[i]);
1242 1243 1244
	}

	eth_commit_mac_addr_change(dev, p);
1245
	ret = 0;
1246

1247 1248 1249
out:
	kfree(addr);
	return ret;
1250 1251
}

1252 1253
static void virtnet_stats(struct net_device *dev,
			  struct rtnl_link_stats64 *tot)
1254 1255 1256 1257 1258 1259
{
	struct virtnet_info *vi = netdev_priv(dev);
	int cpu;
	unsigned int start;

	for_each_possible_cpu(cpu) {
E
Eric Dumazet 已提交
1260
		struct virtnet_stats *stats = per_cpu_ptr(vi->stats, cpu);
1261 1262 1263
		u64 tpackets, tbytes, rpackets, rbytes;

		do {
1264
			start = u64_stats_fetch_begin_irq(&stats->tx_syncp);
1265 1266
			tpackets = stats->tx_packets;
			tbytes   = stats->tx_bytes;
1267
		} while (u64_stats_fetch_retry_irq(&stats->tx_syncp, start));
1268 1269

		do {
1270
			start = u64_stats_fetch_begin_irq(&stats->rx_syncp);
1271 1272
			rpackets = stats->rx_packets;
			rbytes   = stats->rx_bytes;
1273
		} while (u64_stats_fetch_retry_irq(&stats->rx_syncp, start));
1274 1275 1276 1277 1278 1279 1280 1281

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes   += rbytes;
		tot->tx_bytes   += tbytes;
	}

	tot->tx_dropped = dev->stats.tx_dropped;
1282
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
1283 1284 1285 1286 1287
	tot->rx_dropped = dev->stats.rx_dropped;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
}

1288 1289 1290 1291
#ifdef CONFIG_NET_POLL_CONTROLLER
static void virtnet_netpoll(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1292
	int i;
1293

J
Jason Wang 已提交
1294 1295
	for (i = 0; i < vi->curr_queue_pairs; i++)
		napi_schedule(&vi->rq[i].napi);
1296 1297 1298
}
#endif

1299 1300 1301 1302
static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
1303
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
1304 1305 1306 1307
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}

1308
static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
J
Jason Wang 已提交
1309 1310 1311 1312 1313 1314 1315
{
	struct scatterlist sg;
	struct net_device *dev = vi->dev;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

1316 1317
	vi->ctrl_mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
	sg_init_one(&sg, &vi->ctrl_mq, sizeof(vi->ctrl_mq));
J
Jason Wang 已提交
1318 1319

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
1320
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
J
Jason Wang 已提交
1321 1322 1323
		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
1324
	} else {
J
Jason Wang 已提交
1325
		vi->curr_queue_pairs = queue_pairs;
1326 1327 1328
		/* virtnet_open() will refill when device is going to up. */
		if (dev->flags & IFF_UP)
			schedule_delayed_work(&vi->refill, 0);
1329
	}
J
Jason Wang 已提交
1330 1331 1332 1333

	return 0;
}

1334 1335 1336 1337 1338 1339 1340 1341 1342 1343
static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	int err;

	rtnl_lock();
	err = _virtnet_set_queues(vi, queue_pairs);
	rtnl_unlock();
	return err;
}

R
Rusty Russell 已提交
1344 1345 1346
static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1347
	int i;
R
Rusty Russell 已提交
1348

1349 1350
	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);
J
Jason Wang 已提交
1351 1352 1353

	for (i = 0; i < vi->max_queue_pairs; i++)
		napi_disable(&vi->rq[i].napi);
R
Rusty Russell 已提交
1354 1355 1356 1357

	return 0;
}

1358 1359 1360
static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
1361 1362
	struct scatterlist sg[2];
	struct virtio_net_ctrl_mac *mac_data;
J
Jiri Pirko 已提交
1363
	struct netdev_hw_addr *ha;
1364
	int uc_count;
1365
	int mc_count;
1366 1367
	void *buf;
	int i;
1368

S
stephen hemminger 已提交
1369
	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
1370 1371 1372
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

1373 1374
	vi->ctrl_promisc = ((dev->flags & IFF_PROMISC) != 0);
	vi->ctrl_allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
1375

1376
	sg_init_one(sg, &vi->ctrl_promisc, sizeof(vi->ctrl_promisc));
1377 1378

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
1379
				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
1380
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
1381
			 vi->ctrl_promisc ? "en" : "dis");
1382

1383
	sg_init_one(sg, &vi->ctrl_allmulti, sizeof(vi->ctrl_allmulti));
1384 1385

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
1386
				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
1387
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
1388
			 vi->ctrl_allmulti ? "en" : "dis");
1389

1390
	uc_count = netdev_uc_count(dev);
1391
	mc_count = netdev_mc_count(dev);
1392
	/* MAC filter - use one buffer for both lists */
1393 1394 1395
	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
	mac_data = buf;
1396
	if (!buf)
1397 1398
		return;

1399 1400
	sg_init_table(sg, 2);

1401
	/* Store the unicast list and count in the front of the buffer */
M
Michael S. Tsirkin 已提交
1402
	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
J
Jiri Pirko 已提交
1403
	i = 0;
1404
	netdev_for_each_uc_addr(ha, dev)
J
Jiri Pirko 已提交
1405
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
1406 1407

	sg_set_buf(&sg[0], mac_data,
1408
		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));
1409 1410

	/* multicast list and count fill the end */
1411
	mac_data = (void *)&mac_data->macs[uc_count][0];
1412

M
Michael S. Tsirkin 已提交
1413
	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
1414
	i = 0;
1415 1416
	netdev_for_each_mc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
1417 1418

	sg_set_buf(&sg[1], mac_data,
1419
		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));
1420 1421

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1422
				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
1423
		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");
1424 1425

	kfree(buf);
1426 1427
}

1428 1429
static int virtnet_vlan_rx_add_vid(struct net_device *dev,
				   __be16 proto, u16 vid)
1430 1431 1432 1433
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

1434 1435
	vi->ctrl_vid = vid;
	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));
1436 1437

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1438
				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
1439
		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
1440
	return 0;
1441 1442
}

1443 1444
static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
				    __be16 proto, u16 vid)
1445 1446 1447 1448
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

1449 1450
	vi->ctrl_vid = vid;
	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));
1451 1452

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1453
				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
1454
		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
1455
	return 0;
1456 1457
}

1458
static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu)
J
Jason Wang 已提交
1459 1460 1461
{
	int i;

1462 1463
	if (vi->affinity_hint_set) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
1464 1465 1466 1467
			virtqueue_set_affinity(vi->rq[i].vq, -1);
			virtqueue_set_affinity(vi->sq[i].vq, -1);
		}

1468 1469 1470
		vi->affinity_hint_set = false;
	}
}
1471

1472 1473 1474 1475
static void virtnet_set_affinity(struct virtnet_info *vi)
{
	int i;
	int cpu;
J
Jason Wang 已提交
1476 1477 1478 1479 1480

	/* In multiqueue mode, when the number of cpu is equal to the number of
	 * queue pairs, we let the queue pairs to be private to one cpu by
	 * setting the affinity hint to eliminate the contention.
	 */
1481 1482 1483 1484
	if (vi->curr_queue_pairs == 1 ||
	    vi->max_queue_pairs != num_online_cpus()) {
		virtnet_clean_affinity(vi, -1);
		return;
J
Jason Wang 已提交
1485 1486
	}

1487 1488
	i = 0;
	for_each_online_cpu(cpu) {
J
Jason Wang 已提交
1489 1490
		virtqueue_set_affinity(vi->rq[i].vq, cpu);
		virtqueue_set_affinity(vi->sq[i].vq, cpu);
1491
		netif_set_xps_queue(vi->dev, cpumask_of(cpu), i);
1492
		i++;
J
Jason Wang 已提交
1493 1494
	}

1495
	vi->affinity_hint_set = true;
J
Jason Wang 已提交
1496 1497
}

1498
static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
1499
{
1500 1501 1502 1503 1504
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);
	virtnet_set_affinity(vi);
	return 0;
}
1505

1506 1507 1508 1509 1510 1511 1512
static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node_dead);
	virtnet_set_affinity(vi);
	return 0;
}
1513

1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544
static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);

	virtnet_clean_affinity(vi, cpu);
	return 0;
}

static enum cpuhp_state virtionet_online;

static int virtnet_cpu_notif_add(struct virtnet_info *vi)
{
	int ret;

	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
	if (ret)
		return ret;
	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					       &vi->node_dead);
	if (!ret)
		return ret;
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	return ret;
}

static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
{
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					    &vi->node_dead);
J
Jason Wang 已提交
1545 1546
}

R
Rick Jones 已提交
1547 1548 1549 1550 1551
static void virtnet_get_ringparam(struct net_device *dev,
				struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

J
Jason Wang 已提交
1552 1553
	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
R
Rick Jones 已提交
1554 1555 1556 1557
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}

1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570

static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));

}

1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584
/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

1585
	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
1586 1587
		return -EINVAL;

J
John Fastabend 已提交
1588 1589 1590 1591 1592 1593 1594
	/* For now we don't support modifying channels while XDP is loaded
	 * also when XDP is loaded all RX queues have XDP programs so we only
	 * need to check a single RX queue.
	 */
	if (vi->rq[0].xdp_prog)
		return -EINVAL;

1595
	get_online_cpus();
1596
	err = _virtnet_set_queues(vi, queue_pairs);
1597 1598 1599 1600
	if (!err) {
		netif_set_real_num_tx_queues(dev, queue_pairs);
		netif_set_real_num_rx_queues(dev, queue_pairs);

1601
		virtnet_set_affinity(vi);
1602
	}
1603
	put_online_cpus();
1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620

	return err;
}

static void virtnet_get_channels(struct net_device *dev,
				 struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);

	channels->combined_count = vi->curr_queue_pairs;
	channels->max_combined = vi->max_queue_pairs;
	channels->max_other = 0;
	channels->rx_count = 0;
	channels->tx_count = 0;
	channels->other_count = 0;
}

1621 1622 1623 1624 1625 1626
/* Check if the user is trying to change anything besides speed/duplex */
static bool virtnet_validate_ethtool_cmd(const struct ethtool_cmd *cmd)
{
	struct ethtool_cmd diff1 = *cmd;
	struct ethtool_cmd diff2 = {};

1627 1628 1629
	/* cmd is always set so we need to clear it, validate the port type
	 * and also without autonegotiation we can ignore advertising
	 */
1630
	ethtool_cmd_speed_set(&diff1, 0);
1631
	diff2.port = PORT_OTHER;
1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674
	diff1.advertising = 0;
	diff1.duplex = 0;
	diff1.cmd = 0;

	return !memcmp(&diff1, &diff2, sizeof(diff1));
}

static int virtnet_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u32 speed;

	speed = ethtool_cmd_speed(cmd);
	/* don't allow custom speed and duplex */
	if (!ethtool_validate_speed(speed) ||
	    !ethtool_validate_duplex(cmd->duplex) ||
	    !virtnet_validate_ethtool_cmd(cmd))
		return -EINVAL;
	vi->speed = speed;
	vi->duplex = cmd->duplex;

	return 0;
}

static int virtnet_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ethtool_cmd_speed_set(cmd, vi->speed);
	cmd->duplex = vi->duplex;
	cmd->port = PORT_OTHER;

	return 0;
}

static void virtnet_init_settings(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	vi->speed = SPEED_UNKNOWN;
	vi->duplex = DUPLEX_UNKNOWN;
}

1675
static const struct ethtool_ops virtnet_ethtool_ops = {
1676
	.get_drvinfo = virtnet_get_drvinfo,
1677
	.get_link = ethtool_op_get_link,
R
Rick Jones 已提交
1678
	.get_ringparam = virtnet_get_ringparam,
1679 1680
	.set_channels = virtnet_set_channels,
	.get_channels = virtnet_get_channels,
1681
	.get_ts_info = ethtool_op_get_ts_info,
1682 1683
	.get_settings = virtnet_get_settings,
	.set_settings = virtnet_set_settings,
1684 1685
};

static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
{
	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
	struct virtnet_info *vi = netdev_priv(dev);
	struct bpf_prog *old_prog;
	u16 xdp_qp = 0, curr_qp;
	int i, err;

	if (prog && prog->xdp_adjust_head) {
		netdev_warn(dev, "Does not support bpf_xdp_adjust_head()\n");
		return -EOPNOTSUPP;
	}

	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) {
		netdev_warn(dev, "can't set XDP while host is implementing LRO, disable LRO first\n");
		return -EOPNOTSUPP;
	}

	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
		netdev_warn(dev, "XDP expects header/data in single page, any_header_sg required\n");
		return -EINVAL;
	}

	if (dev->mtu > max_sz) {
		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
		return -EINVAL;
	}

	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
	if (prog)
		xdp_qp = nr_cpu_ids;

	/* XDP requires extra queues for XDP_TX */
	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
		netdev_warn(dev, "request %i queues but max is %i\n",
			    curr_qp + xdp_qp, vi->max_queue_pairs);
		return -ENOMEM;
	}

	err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
	if (err) {
		dev_warn(&dev->dev, "XDP Device queue allocation failure.\n");
		return err;
	}

	if (prog) {
		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
		if (IS_ERR(prog)) {
			_virtnet_set_queues(vi, curr_qp);
			return PTR_ERR(prog);
		}
	}

	vi->xdp_queue_pairs = xdp_qp;
	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);

	for (i = 0; i < vi->max_queue_pairs; i++) {
		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
		if (old_prog)
			bpf_prog_put(old_prog);
	}

	return 0;
}

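/* Return true if any receive queue currently has an XDP program attached. */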
static bool virtnet_xdp_query(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (vi->rq[i].xdp_prog)
			return true;
	}
	return false;
}

static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return virtnet_xdp_set(dev, xdp->prog);
	case XDP_QUERY_PROG:
		xdp->prog_attached = virtnet_xdp_query(dev);
		return 0;
	default:
		return -EINVAL;
	}
}

static const struct net_device_ops virtnet_netdev = {
	.ndo_open            = virtnet_open,
	.ndo_stop            = virtnet_close,
	.ndo_start_xmit      = start_xmit,
	.ndo_validate_addr   = eth_validate_addr,
	.ndo_set_mac_address = virtnet_set_mac_address,
	.ndo_set_rx_mode     = virtnet_set_rx_mode,
	.ndo_get_stats64     = virtnet_stats,
	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = virtnet_netpoll,
#endif
	.ndo_xdp		= virtnet_xdp,
};

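/* Deferred handler for config-change interrupts: acknowledge link
 * announcements and propagate the VIRTIO_NET_S_LINK_UP bit to the
 * net device's carrier state.
 */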
static void virtnet_config_changed_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, config_work);
	u16 v;

	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
				 struct virtio_net_config, status, &v) < 0)
		return;

	if (v & VIRTIO_NET_S_ANNOUNCE) {
		netdev_notify_peers(vi->dev);
		virtnet_ack_link_announce(vi);
	}

	/* Ignore unknown (future) status bits */
	v &= VIRTIO_NET_S_LINK_UP;

	if (vi->status == v)
		return;

	vi->status = v;

	if (vi->status & VIRTIO_NET_S_LINK_UP) {
		netif_carrier_on(vi->dev);
		netif_tx_wake_all_queues(vi->dev);
	} else {
		netif_carrier_off(vi->dev);
		netif_tx_stop_all_queues(vi->dev);
	}
}

static void virtnet_config_changed(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	schedule_work(&vi->config_work);
}

static void virtnet_free_queues(struct virtnet_info *vi)
{
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		napi_hash_del(&vi->rq[i].napi);
		netif_napi_del(&vi->rq[i].napi);
	}

	/* We called napi_hash_del() before netif_napi_del(),
	 * we need to respect an RCU grace period before freeing vi->rq
	 */
	synchronize_net();

	kfree(vi->rq);
	kfree(vi->sq);
}

static void _free_receive_bufs(struct virtnet_info *vi)
{
	struct bpf_prog *old_prog;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		while (vi->rq[i].pages)
			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);

		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
		if (old_prog)
			bpf_prog_put(old_prog);
	}
}

static void free_receive_bufs(struct virtnet_info *vi)
{
	rtnl_lock();
	_free_receive_bufs(vi);
	rtnl_unlock();
}

static void free_receive_page_frags(struct virtnet_info *vi)
{
	int i;
	for (i = 0; i < vi->max_queue_pairs; i++)
		if (vi->rq[i].alloc_frag.page)
			put_page(vi->rq[i].alloc_frag.page);
}

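/* The last xdp_queue_pairs send queues are used for XDP_TX and hold raw
 * page buffers rather than skbs, so their unused buffers must be released
 * with put_page() instead of dev_kfree_skb().
 */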
static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
{
	/* For small receive mode always use kfree_skb variants */
	if (!vi->mergeable_rx_bufs)
		return false;

	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
		return false;
	else if (q < vi->curr_queue_pairs)
		return true;
	else
		return false;
}

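/* Free any buffers still left in the send and receive virtqueues when the
 * device is being torn down.
 */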
static void free_unused_bufs(struct virtnet_info *vi)
{
	void *buf;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->sq[i].vq;
		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (!is_xdp_raw_buffer_queue(vi, i))
				dev_kfree_skb(buf);
			else
				put_page(virt_to_head_page(buf));
		}
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->rq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (vi->mergeable_rx_bufs) {
				unsigned long ctx = (unsigned long)buf;
				void *base = mergeable_ctx_to_buf_address(ctx);
				put_page(virt_to_head_page(base));
			} else if (vi->big_packets) {
				give_pages(&vi->rq[i], buf);
			} else {
				dev_kfree_skb(buf);
			}
		}
	}
}

static void virtnet_del_vqs(struct virtnet_info *vi)
{
	struct virtio_device *vdev = vi->vdev;

	virtnet_clean_affinity(vi, -1);

	vdev->config->del_vqs(vdev);

	virtnet_free_queues(vi);
}

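/* Create the virtqueues: one RX/TX pair per queue pair plus an optional
 * control virtqueue, then wire them up to the per-queue structures.
 */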
static int virtnet_find_vqs(struct virtnet_info *vi)
{
	vq_callback_t **callbacks;
	struct virtqueue **vqs;
	int ret = -ENOMEM;
	int i, total_vqs;
	const char **names;

	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
	 * possible control vq.
	 */
	total_vqs = vi->max_queue_pairs * 2 +
		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);

	/* Allocate space for find_vqs parameters */
	vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
	if (!vqs)
		goto err_vq;
	callbacks = kmalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
	if (!callbacks)
		goto err_callback;
	names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);
	if (!names)
		goto err_names;

	/* Parameters for control virtqueue, if any */
	if (vi->has_cvq) {
		callbacks[total_vqs - 1] = NULL;
		names[total_vqs - 1] = "control";
	}

	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
	}

	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
					 names);
	if (ret)
		goto err_find;

	if (vi->has_cvq) {
		vi->cvq = vqs[total_vqs - 1];
		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].vq = vqs[rxq2vq(i)];
		vi->sq[i].vq = vqs[txq2vq(i)];
	}

	kfree(names);
	kfree(callbacks);
	kfree(vqs);

	return 0;

err_find:
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;
}

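/* Allocate the send_queue/receive_queue arrays and initialize NAPI, the
 * packet-length EWMA and the scatterlists for each queue pair.
 */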
static int virtnet_alloc_queues(struct virtnet_info *vi)
{
	int i;

	vi->sq = kzalloc(sizeof(*vi->sq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->sq)
		goto err_sq;
	vi->rq = kzalloc(sizeof(*vi->rq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->rq)
		goto err_rq;

	INIT_DELAYED_WORK(&vi->refill, refill_work);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].pages = NULL;
		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
			       napi_weight);

		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
	}

	return 0;

err_rq:
	kfree(vi->sq);
err_sq:
	return -ENOMEM;
}

static int init_vqs(struct virtnet_info *vi)
{
	int ret;

	/* Allocate send & receive queues */
	ret = virtnet_alloc_queues(vi);
	if (ret)
		goto err;

	ret = virtnet_find_vqs(vi);
	if (ret)
		goto err_free;


	get_online_cpus();
	virtnet_set_affinity(vi);
	put_online_cpus();

	return 0;

err_free:
	virtnet_free_queues(vi);
err:
	return ret;
}

#ifdef CONFIG_SYSFS
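/* Per-rx-queue sysfs attribute exposing the current EWMA-based estimate of
 * the mergeable receive buffer size.
 */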
static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
		struct rx_queue_attribute *attribute, char *buf)
{
	struct virtnet_info *vi = netdev_priv(queue->dev);
	unsigned int queue_index = get_netdev_rx_queue_index(queue);
	struct ewma_pkt_len *avg;

	BUG_ON(queue_index >= vi->max_queue_pairs);
	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
	return sprintf(buf, "%u\n", get_mergeable_buf_len(avg));
}

static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
	__ATTR_RO(mergeable_rx_buffer_size);

static struct attribute *virtio_net_mrg_rx_attrs[] = {
	&mergeable_rx_buffer_size_attribute.attr,
	NULL
};

static const struct attribute_group virtio_net_mrg_rx_group = {
	.name = "virtio_net",
	.attrs = virtio_net_mrg_rx_attrs
};
#endif

static bool virtnet_fail_on_feature(struct virtio_device *vdev,
				    unsigned int fbit,
				    const char *fname, const char *dname)
{
	if (!virtio_has_feature(vdev, fbit))
		return false;

	dev_err(&vdev->dev, "device advertises feature %s but not %s",
		fname, dname);

	return true;
}

#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)

static bool virtnet_validate_features(struct virtio_device *vdev)
{
	if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
	    (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
			     "VIRTIO_NET_F_CTRL_VQ"))) {
		return false;
	}

	return true;
}


#define MIN_MTU ETH_MIN_MTU
#define MAX_MTU ETH_MAX_MTU

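/* Probe a new virtio-net device: negotiate features, allocate the net
 * device and virtqueues, then register with the networking core.
 */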
static int virtnet_probe(struct virtio_device *vdev)
{
	int i, err;
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;
	int mtu;

	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}


	if (!virtnet_validate_features(vdev))
		return -EINVAL;

	/* Find if host supports multiqueue virtio_net device */
	err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
				   struct virtio_net_config,
				   max_virtqueue_pairs, &max_queue_pairs);

	/* We need at least 2 queues */
	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;

	/* Allocate ourselves a network device with room for our info */
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
	dev->netdev_ops = &virtnet_netdev;
	dev->features = NETIF_F_HIGHDMA;

	dev->ethtool_ops = &virtnet_ethtool_ops;
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
		/* This opens up the world of extra features. */
		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
		if (csum)
			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
			dev->hw_features |= NETIF_F_TSO | NETIF_F_UFO
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
		/* Individual feature bits: what can host handle? */
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UFO))
			dev->hw_features |= NETIF_F_UFO;

		dev->features |= NETIF_F_GSO_ROBUST;

		if (gso)
			dev->features |= dev->hw_features & (NETIF_F_ALL_TSO|NETIF_F_UFO);
		/* (!csum && gso) case will be fixed by register_netdev() */
	}
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
		dev->features |= NETIF_F_RXCSUM;

	dev->vlan_features = dev->features;

	/* MTU range: 68 - 65535 */
	dev->min_mtu = MIN_MTU;
	dev->max_mtu = MAX_MTU;

	/* Configuration may specify what MAC to use.  Otherwise random. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
		virtio_cread_bytes(vdev,
				   offsetof(struct virtio_net_config, mac),
				   dev->dev_addr, dev->addr_len);
	else
		eth_hw_addr_random(dev);

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
	vdev->priv = vi;
	vi->stats = alloc_percpu(struct virtnet_stats);
	err = -ENOMEM;
	if (vi->stats == NULL)
		goto free;

	for_each_possible_cpu(i) {
		struct virtnet_stats *virtnet_stats;
		virtnet_stats = per_cpu_ptr(vi->stats, i);
		u64_stats_init(&virtnet_stats->tx_syncp);
		u64_stats_init(&virtnet_stats->rx_syncp);
	}

	INIT_WORK(&vi->config_work, virtnet_config_changed_work);

	/* If we can receive ANY GSO packets, we must allocate large ones. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
		vi->big_packets = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		vi->hdr_len = sizeof(struct virtio_net_hdr);

	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->any_header_sg = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		mtu = virtio_cread16(vdev,
				     offsetof(struct virtio_net_config,
					      mtu));
		if (mtu < dev->min_mtu) {
			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
		} else {
			dev->mtu = mtu;
			dev->max_mtu = mtu;
		}
	}

	if (vi->any_header_sg)
		dev->needed_headroom = vi->hdr_len;

	/* Enable multiqueue by default */
	if (num_online_cpus() >= max_queue_pairs)
		vi->curr_queue_pairs = max_queue_pairs;
	else
		vi->curr_queue_pairs = num_online_cpus();
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
	err = init_vqs(vi);
	if (err)
		goto free_stats;

#ifdef CONFIG_SYSFS
	if (vi->mergeable_rx_bufs)
		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
#endif
	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);

	virtnet_init_settings(dev);

	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
		goto free_vqs;
	}

	virtio_device_ready(vdev);

	err = virtnet_cpu_notif_add(vi);
	if (err) {
		pr_debug("virtio_net: registering cpu notifier failed\n");
		goto free_unregister_netdev;
	}

	virtnet_set_queues(vi, vi->curr_queue_pairs);

	/* Assume link up if device can't report link status,
	   otherwise get link status from config. */
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		netif_carrier_off(dev);
		schedule_work(&vi->config_work);
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
		netif_carrier_on(dev);
	}

	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
		 dev->name, max_queue_pairs);

	return 0;

free_unregister_netdev:
	vi->vdev->config->reset(vdev);

	unregister_netdev(dev);
free_vqs:
	cancel_delayed_work_sync(&vi->refill);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
free_stats:
	free_percpu(vi->stats);
free:
	free_netdev(dev);
	return err;
}

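/* Reset the device and release virtqueue resources; shared by the remove
 * and freeze paths.
 */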
static void remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);

	/* Free unused buffers in both send and recv, if any. */
	free_unused_bufs(vi);

	free_receive_bufs(vi);

	free_receive_page_frags(vi);

	virtnet_del_vqs(vi);
}

static void virtnet_remove(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;


	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device. */
	flush_work(&vi->config_work);

	unregister_netdev(vi->dev);

	remove_vq_common(vi);

	free_percpu(vi->stats);
	free_netdev(vi->dev);
}

#ifdef CONFIG_PM_SLEEP
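/* Quiesce the device before suspend: detach the netdev, stop NAPI and the
 * refill work, and tear down the virtqueues.
 */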
static int virtnet_freeze(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int i;

	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device */
	flush_work(&vi->config_work);

	netif_device_detach(vi->dev);
	cancel_delayed_work_sync(&vi->refill);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++)
			napi_disable(&vi->rq[i].napi);
	}

	remove_vq_common(vi);

	return 0;
}

static int virtnet_restore(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err, i;

	err = init_vqs(vi);
	if (err)
		return err;

	virtio_device_ready(vdev);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->curr_queue_pairs; i++)
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		for (i = 0; i < vi->max_queue_pairs; i++)
			virtnet_napi_enable(&vi->rq[i]);
	}

	netif_device_attach(vi->dev);

	virtnet_set_queues(vi, vi->curr_queue_pairs);

	err = virtnet_cpu_notif_add(vi);
	if (err)
		return err;

	return 0;
}
#endif

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

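/* Feature bits advertised to the transport; the legacy table below
 * additionally carries VIRTIO_NET_F_GSO and VIRTIO_F_ANY_LAYOUT for
 * pre-1.0 devices.
 */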
#define VIRTNET_FEATURES \
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
	VIRTIO_NET_F_MAC, \
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
	VIRTIO_NET_F_CTRL_MAC_ADDR, \
	VIRTIO_NET_F_MTU

static unsigned int features[] = {
	VIRTNET_FEATURES,
};

static unsigned int features_legacy[] = {
	VIRTNET_FEATURES,
	VIRTIO_NET_F_GSO,
	VIRTIO_F_ANY_LAYOUT,
};

static struct virtio_driver virtio_net_driver = {
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
	.feature_table_legacy = features_legacy,
	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
	.probe =	virtnet_probe,
	.remove =	virtnet_remove,
	.config_changed = virtnet_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze =	virtnet_freeze,
	.restore =	virtnet_restore,
#endif
};

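/* Register the CPU hotplug callbacks used for queue affinity before the
 * driver itself, and unwind them if driver registration fails.
 */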
static __init int virtio_net_driver_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
				      virtnet_cpu_online,
				      virtnet_cpu_down_prep);
	if (ret < 0)
		goto out;
	virtionet_online = ret;
	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
				      NULL, virtnet_cpu_dead);
	if (ret)
		goto err_dead;

	ret = register_virtio_driver(&virtio_net_driver);
	if (ret)
		goto err_virtio;
	return 0;
err_virtio:
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
err_dead:
	cpuhp_remove_multi_state(virtionet_online);
out:
	return ret;
}
module_init(virtio_net_driver_init);

static __exit void virtio_net_driver_exit(void)
{
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
	cpuhp_remove_multi_state(virtionet_online);
	unregister_virtio_driver(&virtio_net_driver);
}
module_exit(virtio_net_driver_exit);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");