virtio_net.c 66.4 KB
Newer Older
1
/* A network driver using virtio.
R
Rusty Russell 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14 15
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
16
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
R
Rusty Russell 已提交
17 18 19 20
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
21
#include <linux/ethtool.h>
R
Rusty Russell 已提交
22 23 24
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
J
John Fastabend 已提交
25
#include <linux/bpf.h>
26
#include <linux/bpf_trace.h>
R
Rusty Russell 已提交
27
#include <linux/scatterlist.h>
28
#include <linux/if_vlan.h>
29
#include <linux/slab.h>
30
#include <linux/cpu.h>
31
#include <linux/average.h>
R
Rusty Russell 已提交
32

33
static int napi_weight = NAPI_POLL_WEIGHT;
34 35
module_param(napi_weight, int, 0444);

36
static bool csum = true, gso = true;
R
Rusty Russell 已提交
37 38 39
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);

R
Rusty Russell 已提交
40
/* FIXME: MTU in config. */
41
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
42
#define GOOD_COPY_LEN	128
R
Rusty Russell 已提交
43

44 45
#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

46 47 48
/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
#define VIRTIO_XDP_HEADROOM 256

J
Johannes Berg 已提交
49 50 51 52
/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
53
 */
54
DECLARE_EWMA(pkt_len, 0, 64)
55

56 57 58 59 60 61 62
/* With mergeable buffers we align buffer address and use the low bits to
 * encode its true size. Buffer size is up to 1 page so we need to align to
 * square root of page size to ensure we reserve enough bits to encode the true
 * size.
 */
#define MERGEABLE_BUFFER_MIN_ALIGN_SHIFT ((PAGE_SHIFT + 1) / 2)

63
/* Minimum alignment for mergeable packet buffers. */
64 65
#define MERGEABLE_BUFFER_ALIGN max(L1_CACHE_BYTES, \
				   1 << MERGEABLE_BUFFER_MIN_ALIGN_SHIFT)
66

67
#define VIRTNET_DRIVER_VERSION "1.0.0"
68

69
struct virtnet_stats {
70 71
	struct u64_stats_sync tx_syncp;
	struct u64_stats_sync rx_syncp;
72 73 74 75 76 77 78
	u64 tx_bytes;
	u64 tx_packets;

	u64 rx_bytes;
	u64 rx_packets;
};

79 80 81 82 83 84 85
/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send _queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];
J
Jason Wang 已提交
86 87 88

	/* Name of the send queue: output.$index */
	char name[40];
89 90 91 92 93 94 95
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

R
Rusty Russell 已提交
96 97
	struct napi_struct napi;

J
John Fastabend 已提交
98 99
	struct bpf_prog __rcu *xdp_prog;

100 101 102
	/* Chain pages by the private ptr. */
	struct page *pages;

103
	/* Average packet length for mergeable receive buffers. */
J
Johannes Berg 已提交
104
	struct ewma_pkt_len mrg_avg_pkt_len;
105

106 107 108
	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

109 110
	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];
J
Jason Wang 已提交
111 112 113

	/* Name of this receive queue: input.$index */
	char name[40];
114 115 116 117 118 119
};

struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
J
Jason Wang 已提交
120 121
	struct send_queue *sq;
	struct receive_queue *rq;
122 123
	unsigned int status;

J
Jason Wang 已提交
124 125 126 127 128 129
	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

130 131 132
	/* # of XDP queue pairs currently used by the driver */
	u16 xdp_queue_pairs;

133 134 135
	/* I like... big packets and I cannot lie! */
	bool big_packets;

136 137 138
	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

J
Jason Wang 已提交
139 140 141
	/* Has control virtqueue */
	bool has_cvq;

142 143 144
	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

145 146 147
	/* Packet virtio header size */
	u8 hdr_len;

148 149 150
	/* Active statistics */
	struct virtnet_stats __percpu *stats;

151 152 153
	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

154 155 156
	/* Work struct for config space updates */
	struct work_struct config_work;

J
Jason Wang 已提交
157 158
	/* Does the affinity hint is set for virtqueues? */
	bool affinity_hint_set;
159

160 161 162
	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;
163 164 165 166

	/* Control VQ buffers: protected by the rtnl lock */
	struct virtio_net_ctrl_hdr ctrl_hdr;
	virtio_net_ctrl_ack ctrl_status;
167
	struct virtio_net_ctrl_mq ctrl_mq;
168 169
	u8 ctrl_promisc;
	u8 ctrl_allmulti;
170
	u16 ctrl_vid;
171 172 173 174

	/* Ethtool settings */
	u8 duplex;
	u32 speed;
R
Rusty Russell 已提交
175 176
};

177
struct padded_vnet_hdr {
178
	struct virtio_net_hdr_mrg_rxbuf hdr;
179
	/*
180 181 182
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header.
183
	 */
184
	char padding[4];
185 186
};

J
Jason Wang 已提交
187 188 189 190 191
/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
192
	return (vq->index - 1) / 2;
J
Jason Wang 已提交
193 194 195 196 197 198 199 200 201
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
202
	return vq->index / 2;
J
Jason Wang 已提交
203 204 205 206 207 208 209
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}

210
static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
R
Rusty Russell 已提交
211
{
212
	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
R
Rusty Russell 已提交
213 214
}

215 216 217 218
/*
 * private is used to chain pages for big packets, put the whole
 * most recent used list in the beginning for reuse
 */
219
static void give_pages(struct receive_queue *rq, struct page *page)
220
{
221
	struct page *end;
222

223
	/* Find end of list, sew whole thing into vi->rq.pages. */
224
	for (end = page; end->private; end = (struct page *)end->private);
225 226
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
227 228
}

229
static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
230
{
231
	struct page *p = rq->pages;
232

233
	if (p) {
234
		rq->pages = (struct page *)p->private;
235 236 237
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
238 239 240 241
		p = alloc_page(gfp_mask);
	return p;
}

242
static void skb_xmit_done(struct virtqueue *vq)
R
Rusty Russell 已提交
243
{
244
	struct virtnet_info *vi = vq->vdev->priv;
R
Rusty Russell 已提交
245

246
	/* Suppress further interrupts. */
247
	virtqueue_disable_cb(vq);
248

249
	/* We were probably waiting for more output buffers. */
J
Jason Wang 已提交
250
	netif_wake_subqueue(vi->dev, vq2txq(vq));
R
Rusty Russell 已提交
251 252
}

253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
static unsigned int mergeable_ctx_to_buf_truesize(unsigned long mrg_ctx)
{
	unsigned int truesize = mrg_ctx & (MERGEABLE_BUFFER_ALIGN - 1);
	return (truesize + 1) * MERGEABLE_BUFFER_ALIGN;
}

static void *mergeable_ctx_to_buf_address(unsigned long mrg_ctx)
{
	return (void *)(mrg_ctx & -MERGEABLE_BUFFER_ALIGN);

}

static unsigned long mergeable_buf_to_ctx(void *buf, unsigned int truesize)
{
	unsigned int size = truesize / MERGEABLE_BUFFER_ALIGN;
	return (unsigned long)buf | (size - 1);
}

271
/* Called from bottom half context */
M
Michael S. Tsirkin 已提交
272 273
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
				   struct receive_queue *rq,
274 275
				   struct page *page, unsigned int offset,
				   unsigned int len, unsigned int truesize)
276 277
{
	struct sk_buff *skb;
278
	struct virtio_net_hdr_mrg_rxbuf *hdr;
279
	unsigned int copy, hdr_len, hdr_padded_len;
280
	char *p;
281

282
	p = page_address(page) + offset;
283

284
	/* copy small packet so we can reuse these pages for small data */
285
	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
286 287
	if (unlikely(!skb))
		return NULL;
288

289
	hdr = skb_vnet_hdr(skb);
290

291 292 293 294
	hdr_len = vi->hdr_len;
	if (vi->mergeable_rx_bufs)
		hdr_padded_len = sizeof *hdr;
	else
295
		hdr_padded_len = sizeof(struct padded_vnet_hdr);
296

297
	memcpy(hdr, p, hdr_len);
298

299
	len -= hdr_len;
300 301
	offset += hdr_padded_len;
	p += hdr_padded_len;
302

303 304 305 306
	copy = len;
	if (copy > skb_tailroom(skb))
		copy = skb_tailroom(skb);
	memcpy(skb_put(skb, copy), p, copy);
307

308 309
	len -= copy;
	offset += copy;
310

311 312 313 314 315 316 317 318
	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
			put_page(page);
		return skb;
	}

319 320 321 322 323 324 325
	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
326
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
327 328 329
		dev_kfree_skb(skb);
		return NULL;
	}
330
	BUG_ON(offset >= PAGE_SIZE);
331
	while (len) {
332 333 334 335
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
336 337 338
		page = (struct page *)page->private;
		offset = 0;
	}
339

340
	if (page)
341
		give_pages(rq, page);
342

343 344
	return skb;
}
345

346
static bool virtnet_xdp_xmit(struct virtnet_info *vi,
J
John Fastabend 已提交
347
			     struct receive_queue *rq,
348
			     struct xdp_buff *xdp)
J
John Fastabend 已提交
349 350
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
351
	unsigned int len;
352 353
	struct send_queue *sq;
	unsigned int qp;
J
John Fastabend 已提交
354 355 356
	void *xdp_sent;
	int err;

357 358 359
	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
	sq = &vi->sq[qp];

J
John Fastabend 已提交
360 361
	/* Free up any pending old buffers before queueing new ones. */
	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
362
		struct page *sent_page = virt_to_head_page(xdp_sent);
363

364
		put_page(sent_page);
365
	}
J
John Fastabend 已提交
366

367 368 369 370
	xdp->data -= vi->hdr_len;
	/* Zero header and leave csum up to XDP layers */
	hdr = xdp->data;
	memset(hdr, 0, vi->hdr_len);
371

372
	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
373

374
	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp->data, GFP_ATOMIC);
J
John Fastabend 已提交
375
	if (unlikely(err)) {
376
		struct page *page = virt_to_head_page(xdp->data);
377

378
		put_page(page);
379
		return false;
J
John Fastabend 已提交
380 381 382
	}

	virtqueue_kick(sq->vq);
383
	return true;
J
John Fastabend 已提交
384 385
}

386 387 388 389 390
static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
{
	return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
}

391 392 393 394
static struct sk_buff *receive_small(struct net_device *dev,
				     struct virtnet_info *vi,
				     struct receive_queue *rq,
				     void *buf, unsigned int len)
395
{
396
	struct sk_buff *skb;
397
	struct bpf_prog *xdp_prog;
398 399 400 401 402 403
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
	unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
	unsigned int headroom = vi->hdr_len + header_offset;
	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	unsigned int delta = 0;
404
	len -= vi->hdr_len;
405

406 407 408
	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
409
		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
410
		struct xdp_buff xdp;
411
		void *orig_data;
412 413 414 415
		u32 act;

		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
			goto err_xdp;
416

417 418
		xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
		xdp.data = xdp.data_hard_start + xdp_headroom;
419
		xdp.data_end = xdp.data + len;
420
		orig_data = xdp.data;
421 422
		act = bpf_prog_run_xdp(xdp_prog, &xdp);

423 424
		switch (act) {
		case XDP_PASS:
425
			/* Recalculate length in case bpf program changed it */
426
			delta = orig_data - xdp.data;
427 428
			break;
		case XDP_TX:
429
			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
430
				trace_xdp_exception(vi->dev, xdp_prog, act);
431 432 433
			rcu_read_unlock();
			goto xdp_xmit;
		default:
434 435 436 437
			bpf_warn_invalid_xdp_action(act);
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
		case XDP_DROP:
438 439 440 441 442
			goto err_xdp;
		}
	}
	rcu_read_unlock();

443 444 445 446 447 448 449 450 451 452 453 454 455
	skb = build_skb(buf, buflen);
	if (!skb) {
		put_page(virt_to_head_page(buf));
		goto err;
	}
	skb_reserve(skb, headroom - delta);
	skb_put(skb, len + delta);
	if (!delta) {
		buf += header_offset;
		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
	} /* keep zeroed vnet hdr since packet was changed by bpf */

err:
456
	return skb;
457 458 459 460

err_xdp:
	rcu_read_unlock();
	dev->stats.rx_dropped++;
461
	put_page(virt_to_head_page(buf));
462 463
xdp_xmit:
	return NULL;
464 465 466
}

static struct sk_buff *receive_big(struct net_device *dev,
M
Michael S. Tsirkin 已提交
467
				   struct virtnet_info *vi,
468 469 470 471 472
				   struct receive_queue *rq,
				   void *buf,
				   unsigned int len)
{
	struct page *page = buf;
473
	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
J
John Fastabend 已提交
474

475 476 477 478 479 480 481 482 483 484 485
	if (unlikely(!skb))
		goto err;

	return skb;

err:
	dev->stats.rx_dropped++;
	give_pages(rq, page);
	return NULL;
}

486 487 488 489 490 491 492 493 494 495 496 497
/* The conditions to enable XDP should preclude the underlying device from
 * sending packets across multiple buffers (num_buf > 1). However per spec
 * it does not appear to be illegal to do so but rather just against convention.
 * So in order to avoid making a system unresponsive the packets are pushed
 * into a page and the XDP program is run. This will be extremely slow and we
 * push a warning to the user to fix this as soon as possible. Fixing this may
 * require resolving the underlying hardware to determine why multiple buffers
 * are being received or simply loading the XDP program in the ingress stack
 * after the skb is built because there is no advantage to running it here
 * anymore.
 */
static struct page *xdp_linearize_page(struct receive_queue *rq,
498
				       u16 *num_buf,
499 500 501 502 503
				       struct page *p,
				       int offset,
				       unsigned int *len)
{
	struct page *page = alloc_page(GFP_ATOMIC);
504
	unsigned int page_off = VIRTIO_XDP_HEADROOM;
505 506 507 508 509 510 511

	if (!page)
		return NULL;

	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
	page_off += *len;

512
	while (--*num_buf) {
513 514 515 516 517 518 519 520 521
		unsigned int buflen;
		unsigned long ctx;
		void *buf;
		int off;

		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &buflen);
		if (unlikely(!ctx))
			goto err_buf;

522 523 524 525
		buf = mergeable_ctx_to_buf_address(ctx);
		p = virt_to_head_page(buf);
		off = buf - page_address(p);

526 527 528
		/* guard against a misconfigured or uncooperative backend that
		 * is sending packet larger than the MTU.
		 */
529 530
		if ((page_off + buflen) > PAGE_SIZE) {
			put_page(p);
531
			goto err_buf;
532
		}
533 534 535 536

		memcpy(page_address(page) + page_off,
		       page_address(p) + off, buflen);
		page_off += buflen;
537
		put_page(p);
538 539
	}

540 541
	/* Headroom does not contribute to packet length */
	*len = page_off - VIRTIO_XDP_HEADROOM;
542 543 544 545 546 547
	return page;
err_buf:
	__free_pages(page, 0);
	return NULL;
}

548
static struct sk_buff *receive_mergeable(struct net_device *dev,
M
Michael S. Tsirkin 已提交
549
					 struct virtnet_info *vi,
550
					 struct receive_queue *rq,
551
					 unsigned long ctx,
552
					 unsigned int len)
553
{
554
	void *buf = mergeable_ctx_to_buf_address(ctx);
555 556
	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
557 558
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
J
John Fastabend 已提交
559 560 561 562
	struct sk_buff *head_skb, *curr_skb;
	struct bpf_prog *xdp_prog;
	unsigned int truesize;

J
John Fastabend 已提交
563 564
	head_skb = NULL;

J
John Fastabend 已提交
565 566 567
	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
568
		struct page *xdp_page;
569 570
		struct xdp_buff xdp;
		void *data;
J
John Fastabend 已提交
571 572
		u32 act;

573
		/* This happens when rx buffer size is underestimated */
J
John Fastabend 已提交
574
		if (unlikely(num_buf > 1)) {
575
			/* linearize data for XDP */
576
			xdp_page = xdp_linearize_page(rq, &num_buf,
577 578 579
						      page, offset, &len);
			if (!xdp_page)
				goto err_xdp;
580
			offset = VIRTIO_XDP_HEADROOM;
581 582
		} else {
			xdp_page = page;
J
John Fastabend 已提交
583 584 585 586 587 588 589
		}

		/* Transient failure which in theory could occur if
		 * in-flight packets from before XDP was enabled reach
		 * the receive path after XDP is loaded. In practice I
		 * was not able to create this condition.
		 */
590
		if (unlikely(hdr->hdr.gso_type))
J
John Fastabend 已提交
591 592
			goto err_xdp;

593 594 595
		/* Allow consuming headroom but reserve enough space to push
		 * the descriptor on if we get an XDP_TX return code.
		 */
596
		data = page_address(xdp_page) + offset;
597
		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
598 599 600 601
		xdp.data = data + vi->hdr_len;
		xdp.data_end = xdp.data + (len - vi->hdr_len);
		act = bpf_prog_run_xdp(xdp_prog, &xdp);

J
John Fastabend 已提交
602 603
		switch (act) {
		case XDP_PASS:
604 605 606 607 608 609 610
			/* recalculate offset to account for any header
			 * adjustments. Note other cases do not build an
			 * skb and avoid using offset
			 */
			offset = xdp.data -
					page_address(xdp_page) - vi->hdr_len;

611 612 613 614 615
			/* We can only create skb based on xdp_page. */
			if (unlikely(xdp_page != page)) {
				rcu_read_unlock();
				put_page(page);
				head_skb = page_to_skb(vi, rq, xdp_page,
616
						       offset, len, PAGE_SIZE);
617
				ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
618 619
				return head_skb;
			}
J
John Fastabend 已提交
620 621
			break;
		case XDP_TX:
622
			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
623
				trace_xdp_exception(vi->dev, xdp_prog, act);
624
			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
625 626
			if (unlikely(xdp_page != page))
				goto err_xdp;
J
John Fastabend 已提交
627 628 629
			rcu_read_unlock();
			goto xdp_xmit;
		default:
630 631 632 633
			bpf_warn_invalid_xdp_action(act);
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
		case XDP_DROP:
634 635
			if (unlikely(xdp_page != page))
				__free_pages(xdp_page, 0);
636
			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
J
John Fastabend 已提交
637
			goto err_xdp;
J
John Fastabend 已提交
638
		}
J
John Fastabend 已提交
639 640
	}
	rcu_read_unlock();
641

J
John Fastabend 已提交
642 643 644
	truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
	curr_skb = head_skb;
645

646 647
	if (unlikely(!curr_skb))
		goto err_skb;
648
	while (--num_buf) {
649 650
		int num_skb_frags;

651 652
		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!ctx)) {
653
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
M
Michael S. Tsirkin 已提交
654
				 dev->name, num_buf,
655 656
				 virtio16_to_cpu(vi->vdev,
						 hdr->num_buffers));
657 658
			dev->stats.rx_length_errors++;
			goto err_buf;
659
		}
660

661
		buf = mergeable_ctx_to_buf_address(ctx);
662 663 664
		page = virt_to_head_page(buf);

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
665 666
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
667 668 669

			if (unlikely(!nskb))
				goto err_skb;
670 671 672 673 674 675 676 677
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
678
		truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
679 680 681
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
682
			head_skb->truesize += truesize;
683
		}
684
		offset = buf - page_address(page);
685 686 687
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
688
					     len, truesize);
689 690
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
691
					offset, len, truesize);
692
		}
693 694
	}

J
Johannes Berg 已提交
695
	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
696 697
	return head_skb;

J
John Fastabend 已提交
698 699
err_xdp:
	rcu_read_unlock();
700 701 702
err_skb:
	put_page(page);
	while (--num_buf) {
703 704
		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!ctx)) {
705 706 707 708 709
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
710
		page = virt_to_head_page(mergeable_ctx_to_buf_address(ctx));
711
		put_page(page);
712
	}
713 714 715
err_buf:
	dev->stats.rx_dropped++;
	dev_kfree_skb(head_skb);
J
John Fastabend 已提交
716
xdp_xmit:
717
	return NULL;
718 719
}

J
Jason Wang 已提交
720 721
static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
		       void *buf, unsigned int len)
722
{
723
	struct net_device *dev = vi->dev;
724
	struct sk_buff *skb;
725
	struct virtio_net_hdr_mrg_rxbuf *hdr;
J
Jason Wang 已提交
726
	int ret;
727

728
	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
729 730
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
731 732 733 734 735
		if (vi->mergeable_rx_bufs) {
			unsigned long ctx = (unsigned long)buf;
			void *base = mergeable_ctx_to_buf_address(ctx);
			put_page(virt_to_head_page(base));
		} else if (vi->big_packets) {
736
			give_pages(rq, buf);
737
		} else {
738
			put_page(virt_to_head_page(buf));
739
		}
J
Jason Wang 已提交
740
		return 0;
741
	}
742

743
	if (vi->mergeable_rx_bufs)
M
Michael S. Tsirkin 已提交
744
		skb = receive_mergeable(dev, vi, rq, (unsigned long)buf, len);
745
	else if (vi->big_packets)
M
Michael S. Tsirkin 已提交
746
		skb = receive_big(dev, vi, rq, buf, len);
747
	else
748
		skb = receive_small(dev, vi, rq, buf, len);
749 750

	if (unlikely(!skb))
J
Jason Wang 已提交
751
		return 0;
752

753
	hdr = skb_vnet_hdr(skb);
754

J
Jason Wang 已提交
755
	ret = skb->len;
R
Rusty Russell 已提交
756

757
	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
758
		skb->ip_summed = CHECKSUM_UNNECESSARY;
R
Rusty Russell 已提交
759

760 761 762 763 764 765
	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
				  virtio_is_little_endian(vi->vdev))) {
		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
				     dev->name, hdr->hdr.gso_type,
				     hdr->hdr.gso_size);
		goto frame_err;
R
Rusty Russell 已提交
766 767
	}

768 769 770 771
	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

E
Eric Dumazet 已提交
772
	napi_gro_receive(&rq->napi, skb);
J
Jason Wang 已提交
773
	return ret;
R
Rusty Russell 已提交
774 775 776 777

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
J
Jason Wang 已提交
778
	return 0;
R
Rusty Russell 已提交
779 780
}

M
Michael S. Tsirkin 已提交
781 782
static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
			     gfp_t gfp)
R
Rusty Russell 已提交
783
{
784 785
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
786
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
787
	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
788
	int err;
789

790 791 792
	len = SKB_DATA_ALIGN(len) +
	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
793
		return -ENOMEM;
R
Rusty Russell 已提交
794

795 796 797 798 799 800
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
		    vi->hdr_len + GOOD_PACKET_LEN);
	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
801
	if (err < 0)
802
		put_page(virt_to_head_page(buf));
803

804 805
	return err;
}
806

807 808
static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
			   gfp_t gfp)
809 810 811 812 813
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

814 815
	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);

816
	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
817
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
818
		first = get_a_page(rq, gfp);
819 820
		if (!first) {
			if (list)
821
				give_pages(rq, list);
822
			return -ENOMEM;
823
		}
824
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
825

826 827 828 829
		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}
R
Rusty Russell 已提交
830

831
	first = get_a_page(rq, gfp);
832
	if (!first) {
833
		give_pages(rq, list);
834 835 836 837
		return -ENOMEM;
	}
	p = page_address(first);

838
	/* rq->sg[0], rq->sg[1] share the same page */
839 840
	/* a separated rq->sg[0] for header - required in case !any_header_sg */
	sg_set_buf(&rq->sg[0], p, vi->hdr_len);
841

842
	/* rq->sg[1] for data packet, from offset */
843
	offset = sizeof(struct padded_vnet_hdr);
844
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
845 846 847

	/* chain first in list head */
	first->private = (unsigned long)list;
848 849
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
850
	if (err < 0)
851
		give_pages(rq, first);
852 853

	return err;
R
Rusty Russell 已提交
854 855
}

J
Johannes Berg 已提交
856
static unsigned int get_mergeable_buf_len(struct ewma_pkt_len *avg_pkt_len)
857
{
858
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
859 860
	unsigned int len;

J
Johannes Berg 已提交
861
	len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
862 863 864 865
			GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
	return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
}

866 867
static int add_recvbuf_mergeable(struct virtnet_info *vi,
				 struct receive_queue *rq, gfp_t gfp)
868
{
869
	struct page_frag *alloc_frag = &rq->alloc_frag;
870
	unsigned int headroom = virtnet_get_headroom(vi);
871
	char *buf;
872
	unsigned long ctx;
873
	int err;
874
	unsigned int len, hole;
875

876
	len = get_mergeable_buf_len(&rq->mrg_avg_pkt_len);
877
	if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp)))
878
		return -ENOMEM;
879

880
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
881
	buf += headroom; /* advance address leaving hole at front of pkt */
882
	ctx = mergeable_buf_to_ctx(buf, len);
883
	get_page(alloc_frag->page);
884
	alloc_frag->offset += len + headroom;
885
	hole = alloc_frag->size - alloc_frag->offset;
886
	if (hole < len + headroom) {
887 888 889 890 891
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
		 * the current buffer. This extra space is not included in
		 * the truesize stored in ctx.
		 */
892 893 894
		len += hole;
		alloc_frag->offset += hole;
	}
895

896
	sg_init_one(rq->sg, buf, len);
897
	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, (void *)ctx, gfp);
898
	if (err < 0)
899
		put_page(virt_to_head_page(buf));
900

901 902
	return err;
}
903

904 905 906 907 908 909 910
/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
M
Michael S. Tsirkin 已提交
911 912
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
913 914
{
	int err;
915
	bool oom;
916

917
	gfp |= __GFP_COLD;
918 919
	do {
		if (vi->mergeable_rx_bufs)
920
			err = add_recvbuf_mergeable(vi, rq, gfp);
921
		else if (vi->big_packets)
922
			err = add_recvbuf_big(vi, rq, gfp);
923
		else
M
Michael S. Tsirkin 已提交
924
			err = add_recvbuf_small(vi, rq, gfp);
925

926
		oom = err == -ENOMEM;
927
		if (err)
928
			break;
929
	} while (rq->vq->num_free);
930
	virtqueue_kick(rq->vq);
931
	return !oom;
932 933
}

934
static void skb_recv_done(struct virtqueue *rvq)
R
Rusty Russell 已提交
935 936
{
	struct virtnet_info *vi = rvq->vdev->priv;
J
Jason Wang 已提交
937
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
938

939
	/* Schedule NAPI, Suppress further interrupts if successful. */
940
	if (napi_schedule_prep(&rq->napi)) {
941
		virtqueue_disable_cb(rvq);
942
		__napi_schedule(&rq->napi);
943
	}
R
Rusty Russell 已提交
944 945
}

946
static void virtnet_napi_enable(struct receive_queue *rq)
947
{
948
	napi_enable(&rq->napi);
949 950 951 952 953

	/* If all buffers were filled by other side before we napi_enabled, we
	 * won't get another interrupt, so process any outstanding packets
	 * now.  virtnet_poll wants re-enable the queue, so we disable here.
	 * We synchronize against interrupts via NAPI_STATE_SCHED */
954 955
	if (napi_schedule_prep(&rq->napi)) {
		virtqueue_disable_cb(rq->vq);
956
		local_bh_disable();
957
		__napi_schedule(&rq->napi);
958
		local_bh_enable();
959 960 961
	}
}

962 963
static void refill_work(struct work_struct *work)
{
964 965
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
966
	bool still_empty;
J
Jason Wang 已提交
967 968
	int i;

969
	for (i = 0; i < vi->curr_queue_pairs; i++) {
J
Jason Wang 已提交
970
		struct receive_queue *rq = &vi->rq[i];
971

J
Jason Wang 已提交
972
		napi_disable(&rq->napi);
M
Michael S. Tsirkin 已提交
973
		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
J
Jason Wang 已提交
974
		virtnet_napi_enable(rq);
975

J
Jason Wang 已提交
976 977 978 979 980 981
		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
982 983
}

984
static int virtnet_receive(struct receive_queue *rq, int budget)
R
Rusty Russell 已提交
985
{
986
	struct virtnet_info *vi = rq->vq->vdev->priv;
J
Jason Wang 已提交
987
	unsigned int len, received = 0, bytes = 0;
988
	void *buf;
J
Jason Wang 已提交
989
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
R
Rusty Russell 已提交
990 991

	while (received < budget &&
992
	       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
J
Jason Wang 已提交
993
		bytes += receive_buf(vi, rq, buf, len);
R
Rusty Russell 已提交
994 995 996
		received++;
	}

997
	if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
M
Michael S. Tsirkin 已提交
998
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
999
			schedule_delayed_work(&vi->refill, 0);
1000
	}
R
Rusty Russell 已提交
1001

J
Jason Wang 已提交
1002 1003 1004 1005 1006
	u64_stats_update_begin(&stats->rx_syncp);
	stats->rx_bytes += bytes;
	stats->rx_packets += received;
	u64_stats_update_end(&stats->rx_syncp);

1007 1008 1009 1010 1011 1012 1013
	return received;
}

static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
1014
	unsigned int r, received;
1015

1016
	received = virtnet_receive(rq, budget);
1017

1018 1019
	/* Out of packets? */
	if (received < budget) {
1020
		r = virtqueue_enable_cb_prepare(rq->vq);
1021 1022 1023 1024 1025 1026
		if (napi_complete_done(napi, received)) {
			if (unlikely(virtqueue_poll(rq->vq, r)) &&
			    napi_schedule_prep(napi)) {
				virtqueue_disable_cb(rq->vq);
				__napi_schedule(napi);
			}
1027
		}
R
Rusty Russell 已提交
1028 1029 1030 1031 1032
	}

	return received;
}

J
Jason Wang 已提交
1033 1034 1035 1036 1037
static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

1038 1039 1040
	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
M
Michael S. Tsirkin 已提交
1041
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1042
				schedule_delayed_work(&vi->refill, 0);
J
Jason Wang 已提交
1043 1044 1045 1046 1047 1048
		virtnet_napi_enable(&vi->rq[i]);
	}

	return 0;
}

1049
static void free_old_xmit_skbs(struct send_queue *sq)
R
Rusty Russell 已提交
1050 1051
{
	struct sk_buff *skb;
1052
	unsigned int len;
1053
	struct virtnet_info *vi = sq->vq->vdev->priv;
E
Eric Dumazet 已提交
1054
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
J
Jason Wang 已提交
1055 1056
	unsigned int packets = 0;
	unsigned int bytes = 0;
R
Rusty Russell 已提交
1057

1058
	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
R
Rusty Russell 已提交
1059
		pr_debug("Sent skb %p\n", skb);
1060

J
Jason Wang 已提交
1061 1062
		bytes += skb->len;
		packets++;
1063

1064
		dev_kfree_skb_any(skb);
R
Rusty Russell 已提交
1065
	}
J
Jason Wang 已提交
1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076

	/* Avoid overhead when no packets have been processed
	 * happens when called speculatively from start_xmit.
	 */
	if (!packets)
		return;

	u64_stats_update_begin(&stats->tx_syncp);
	stats->tx_bytes += bytes;
	stats->tx_packets += packets;
	u64_stats_update_end(&stats->tx_syncp);
R
Rusty Russell 已提交
1077 1078
}

1079
static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
R
Rusty Russell 已提交
1080
{
1081
	struct virtio_net_hdr_mrg_rxbuf *hdr;
R
Rusty Russell 已提交
1082
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
1083
	struct virtnet_info *vi = sq->vq->vdev->priv;
1084
	unsigned num_sg;
1085
	unsigned hdr_len = vi->hdr_len;
1086
	bool can_push;
R
Rusty Russell 已提交
1087

J
Johannes Berg 已提交
1088
	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
1089 1090 1091 1092 1093 1094 1095

	can_push = vi->any_header_sg &&
		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
1096
		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
1097 1098
	else
		hdr = skb_vnet_hdr(skb);
R
Rusty Russell 已提交
1099

1100
	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
1101
				    virtio_is_little_endian(vi->vdev), false))
1102
		BUG();
R
Rusty Russell 已提交
1103

1104
	if (vi->mergeable_rx_bufs)
1105
		hdr->num_buffers = 0;
1106

1107
	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
1108 1109 1110 1111 1112 1113 1114 1115 1116
	if (can_push) {
		__skb_push(skb, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
		/* Pull header back to avoid skew in tx bytes calculations. */
		__skb_pull(skb, hdr_len);
	} else {
		sg_set_buf(sq->sg, hdr, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
	}
1117
	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
1118 1119
}

1120
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
1121 1122
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1123 1124
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
1125
	int err;
1126 1127
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
	bool kick = !skb->xmit_more;
1128 1129

	/* Free up any pending old buffers before queueing new ones. */
1130
	free_old_xmit_skbs(sq);
1131

1132 1133 1134
	/* timestamp packet in software */
	skb_tx_timestamp(skb);

1135
	/* Try to transmit */
1136
	err = xmit_skb(sq, skb);
1137

1138
	/* This should not happen! */
1139
	if (unlikely(err)) {
1140 1141 1142
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
1143
				 "Unexpected TXQ (%d) queue failure: %d\n", qnum, err);
1144
		dev->stats.tx_dropped++;
1145
		dev_kfree_skb_any(skb);
1146
		return NETDEV_TX_OK;
R
Rusty Russell 已提交
1147
	}
1148

1149 1150 1151 1152
	/* Don't wait up for transmitted skbs to be freed. */
	skb_orphan(skb);
	nf_reset(skb);

1153 1154 1155 1156 1157 1158 1159 1160 1161
	/* If running out of space, stop queue to avoid getting packets that we
	 * are then unable to transmit.
	 * An alternative would be to force queuing layer to requeue the skb by
	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
	 * returned in a normal path of operation: it means that driver is not
	 * maintaining the TX queue stop/start state properly, and causes
	 * the stack to do a non-trivial amount of useless work.
	 * Since most packets only take 1 or 2 ring slots, stopping the queue
	 * early means 16 slots are typically wasted.
1162
	 */
1163
	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1164
		netif_stop_subqueue(dev, qnum);
1165
		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
1166
			/* More just got used, free them then recheck. */
1167 1168
			free_old_xmit_skbs(sq);
			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1169
				netif_start_subqueue(dev, qnum);
1170
				virtqueue_disable_cb(sq->vq);
1171 1172
			}
		}
1173
	}
1174

1175
	if (kick || netif_xmit_stopped(txq))
1176
		virtqueue_kick(sq->vq);
R
Rusty Russell 已提交
1177

1178
	return NETDEV_TX_OK;
1179 1180
}

1181 1182 1183
/*
 * Send command via the control virtqueue and check status.  Commands
 * supported by the hypervisor, as indicated by feature bits, should
S
stephen hemminger 已提交
1184
 * never fail unless improperly formatted.
1185 1186
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
1187
				 struct scatterlist *out)
1188
{
1189
	struct scatterlist *sgs[4], hdr, stat;
1190
	unsigned out_num = 0, tmp;
1191 1192

	/* Caller should know better */
1193
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
1194

1195 1196 1197
	vi->ctrl_status = ~0;
	vi->ctrl_hdr.class = class;
	vi->ctrl_hdr.cmd = cmd;
1198
	/* Add header */
1199
	sg_init_one(&hdr, &vi->ctrl_hdr, sizeof(vi->ctrl_hdr));
1200
	sgs[out_num++] = &hdr;
1201

1202 1203
	if (out)
		sgs[out_num++] = out;
1204

1205
	/* Add return status. */
1206
	sg_init_one(&stat, &vi->ctrl_status, sizeof(vi->ctrl_status));
1207
	sgs[out_num] = &stat;
1208

1209
	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
1210
	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
1211

1212
	if (unlikely(!virtqueue_kick(vi->cvq)))
1213
		return vi->ctrl_status == VIRTIO_NET_OK;
1214 1215 1216 1217

	/* Spin for a response, the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
1218 1219
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
1220 1221
		cpu_relax();

1222
	return vi->ctrl_status == VIRTIO_NET_OK;
1223 1224
}

1225 1226 1227 1228
static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
1229
	int ret;
1230
	struct sockaddr *addr;
1231
	struct scatterlist sg;
1232

1233
	addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
1234 1235 1236 1237
	if (!addr)
		return -ENOMEM;

	ret = eth_prepare_mac_addr_change(dev, addr);
1238
	if (ret)
1239
		goto out;
1240

1241 1242 1243
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
		sg_init_one(&sg, addr->sa_data, dev->addr_len);
		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1244
					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
1245 1246
			dev_warn(&vdev->dev,
				 "Failed to set mac address by vq command.\n");
1247 1248
			ret = -EINVAL;
			goto out;
1249
		}
1250 1251
	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1252 1253 1254 1255 1256 1257 1258
		unsigned int i;

		/* Naturally, this has an atomicity problem. */
		for (i = 0; i < dev->addr_len; i++)
			virtio_cwrite8(vdev,
				       offsetof(struct virtio_net_config, mac) +
				       i, addr->sa_data[i]);
1259 1260 1261
	}

	eth_commit_mac_addr_change(dev, p);
1262
	ret = 0;
1263

1264 1265 1266
out:
	kfree(addr);
	return ret;
1267 1268
}

1269 1270
static void virtnet_stats(struct net_device *dev,
			  struct rtnl_link_stats64 *tot)
1271 1272 1273 1274 1275 1276
{
	struct virtnet_info *vi = netdev_priv(dev);
	int cpu;
	unsigned int start;

	for_each_possible_cpu(cpu) {
E
Eric Dumazet 已提交
1277
		struct virtnet_stats *stats = per_cpu_ptr(vi->stats, cpu);
1278 1279 1280
		u64 tpackets, tbytes, rpackets, rbytes;

		do {
1281
			start = u64_stats_fetch_begin_irq(&stats->tx_syncp);
1282 1283
			tpackets = stats->tx_packets;
			tbytes   = stats->tx_bytes;
1284
		} while (u64_stats_fetch_retry_irq(&stats->tx_syncp, start));
1285 1286

		do {
1287
			start = u64_stats_fetch_begin_irq(&stats->rx_syncp);
1288 1289
			rpackets = stats->rx_packets;
			rbytes   = stats->rx_bytes;
1290
		} while (u64_stats_fetch_retry_irq(&stats->rx_syncp, start));
1291 1292 1293 1294 1295 1296 1297 1298

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes   += rbytes;
		tot->tx_bytes   += tbytes;
	}

	tot->tx_dropped = dev->stats.tx_dropped;
1299
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
1300 1301 1302 1303 1304
	tot->rx_dropped = dev->stats.rx_dropped;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
}

1305 1306 1307 1308
#ifdef CONFIG_NET_POLL_CONTROLLER
static void virtnet_netpoll(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1309
	int i;
1310

J
Jason Wang 已提交
1311 1312
	for (i = 0; i < vi->curr_queue_pairs; i++)
		napi_schedule(&vi->rq[i].napi);
1313 1314 1315
}
#endif

1316 1317 1318 1319
static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
1320
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
1321 1322 1323 1324
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}

1325
static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
J
Jason Wang 已提交
1326 1327 1328 1329 1330 1331 1332
{
	struct scatterlist sg;
	struct net_device *dev = vi->dev;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

1333 1334
	vi->ctrl_mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
	sg_init_one(&sg, &vi->ctrl_mq, sizeof(vi->ctrl_mq));
J
Jason Wang 已提交
1335 1336

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
1337
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
J
Jason Wang 已提交
1338 1339 1340
		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
1341
	} else {
J
Jason Wang 已提交
1342
		vi->curr_queue_pairs = queue_pairs;
1343 1344 1345
		/* virtnet_open() will refill when device is going to up. */
		if (dev->flags & IFF_UP)
			schedule_delayed_work(&vi->refill, 0);
1346
	}
J
Jason Wang 已提交
1347 1348 1349 1350

	return 0;
}

1351 1352 1353 1354 1355 1356 1357 1358 1359 1360
static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	int err;

	rtnl_lock();
	err = _virtnet_set_queues(vi, queue_pairs);
	rtnl_unlock();
	return err;
}

R
Rusty Russell 已提交
1361 1362 1363
static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1364
	int i;
R
Rusty Russell 已提交
1365

1366 1367
	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);
J
Jason Wang 已提交
1368 1369 1370

	for (i = 0; i < vi->max_queue_pairs; i++)
		napi_disable(&vi->rq[i].napi);
R
Rusty Russell 已提交
1371 1372 1373 1374

	return 0;
}

1375 1376 1377
static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
1378 1379
	struct scatterlist sg[2];
	struct virtio_net_ctrl_mac *mac_data;
J
Jiri Pirko 已提交
1380
	struct netdev_hw_addr *ha;
1381
	int uc_count;
1382
	int mc_count;
1383 1384
	void *buf;
	int i;
1385

S
stephen hemminger 已提交
1386
	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
1387 1388 1389
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

1390 1391
	vi->ctrl_promisc = ((dev->flags & IFF_PROMISC) != 0);
	vi->ctrl_allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
1392

1393
	sg_init_one(sg, &vi->ctrl_promisc, sizeof(vi->ctrl_promisc));
1394 1395

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
1396
				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
1397
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
1398
			 vi->ctrl_promisc ? "en" : "dis");
1399

1400
	sg_init_one(sg, &vi->ctrl_allmulti, sizeof(vi->ctrl_allmulti));
1401 1402

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
1403
				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
1404
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
1405
			 vi->ctrl_allmulti ? "en" : "dis");
1406

1407
	uc_count = netdev_uc_count(dev);
1408
	mc_count = netdev_mc_count(dev);
1409
	/* MAC filter - use one buffer for both lists */
1410 1411 1412
	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
	mac_data = buf;
1413
	if (!buf)
1414 1415
		return;

1416 1417
	sg_init_table(sg, 2);

1418
	/* Store the unicast list and count in the front of the buffer */
M
Michael S. Tsirkin 已提交
1419
	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
J
Jiri Pirko 已提交
1420
	i = 0;
1421
	netdev_for_each_uc_addr(ha, dev)
J
Jiri Pirko 已提交
1422
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
1423 1424

	sg_set_buf(&sg[0], mac_data,
1425
		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));
1426 1427

	/* multicast list and count fill the end */
1428
	mac_data = (void *)&mac_data->macs[uc_count][0];
1429

M
Michael S. Tsirkin 已提交
1430
	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
1431
	i = 0;
1432 1433
	netdev_for_each_mc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
1434 1435

	sg_set_buf(&sg[1], mac_data,
1436
		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));
1437 1438

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1439
				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
1440
		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");
1441 1442

	kfree(buf);
1443 1444
}

1445 1446
static int virtnet_vlan_rx_add_vid(struct net_device *dev,
				   __be16 proto, u16 vid)
1447 1448 1449 1450
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

1451 1452
	vi->ctrl_vid = vid;
	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));
1453 1454

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1455
				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
1456
		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
1457
	return 0;
1458 1459
}

1460 1461
static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
				    __be16 proto, u16 vid)
1462 1463 1464 1465
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

1466 1467
	vi->ctrl_vid = vid;
	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));
1468 1469

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1470
				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
1471
		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
1472
	return 0;
1473 1474
}

1475
static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu)
J
Jason Wang 已提交
1476 1477 1478
{
	int i;

1479 1480
	if (vi->affinity_hint_set) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
1481 1482 1483 1484
			virtqueue_set_affinity(vi->rq[i].vq, -1);
			virtqueue_set_affinity(vi->sq[i].vq, -1);
		}

1485 1486 1487
		vi->affinity_hint_set = false;
	}
}
1488

1489 1490 1491 1492
static void virtnet_set_affinity(struct virtnet_info *vi)
{
	int i;
	int cpu;
J
Jason Wang 已提交
1493 1494 1495 1496 1497

	/* In multiqueue mode, when the number of cpu is equal to the number of
	 * queue pairs, we let the queue pairs to be private to one cpu by
	 * setting the affinity hint to eliminate the contention.
	 */
1498 1499 1500 1501
	if (vi->curr_queue_pairs == 1 ||
	    vi->max_queue_pairs != num_online_cpus()) {
		virtnet_clean_affinity(vi, -1);
		return;
J
Jason Wang 已提交
1502 1503
	}

1504 1505
	i = 0;
	for_each_online_cpu(cpu) {
J
Jason Wang 已提交
1506 1507
		virtqueue_set_affinity(vi->rq[i].vq, cpu);
		virtqueue_set_affinity(vi->sq[i].vq, cpu);
1508
		netif_set_xps_queue(vi->dev, cpumask_of(cpu), i);
1509
		i++;
J
Jason Wang 已提交
1510 1511
	}

1512
	vi->affinity_hint_set = true;
J
Jason Wang 已提交
1513 1514
}

1515
static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
1516
{
1517 1518 1519 1520 1521
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);
	virtnet_set_affinity(vi);
	return 0;
}
1522

1523 1524 1525 1526 1527 1528 1529
static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node_dead);
	virtnet_set_affinity(vi);
	return 0;
}
1530

1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561
static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);

	virtnet_clean_affinity(vi, cpu);
	return 0;
}

static enum cpuhp_state virtionet_online;

static int virtnet_cpu_notif_add(struct virtnet_info *vi)
{
	int ret;

	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
	if (ret)
		return ret;
	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					       &vi->node_dead);
	if (!ret)
		return ret;
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	return ret;
}

static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
{
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					    &vi->node_dead);
J
Jason Wang 已提交
1562 1563
}

R
Rick Jones 已提交
1564 1565 1566 1567 1568
static void virtnet_get_ringparam(struct net_device *dev,
				struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

J
Jason Wang 已提交
1569 1570
	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
R
Rick Jones 已提交
1571 1572 1573 1574
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}

1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587

static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));

}

1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601
/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

1602
	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
1603 1604
		return -EINVAL;

J
John Fastabend 已提交
1605 1606 1607 1608 1609 1610 1611
	/* For now we don't support modifying channels while XDP is loaded
	 * also when XDP is loaded all RX queues have XDP programs so we only
	 * need to check a single RX queue.
	 */
	if (vi->rq[0].xdp_prog)
		return -EINVAL;

1612
	get_online_cpus();
1613
	err = _virtnet_set_queues(vi, queue_pairs);
1614 1615 1616 1617
	if (!err) {
		netif_set_real_num_tx_queues(dev, queue_pairs);
		netif_set_real_num_rx_queues(dev, queue_pairs);

1618
		virtnet_set_affinity(vi);
1619
	}
1620
	put_online_cpus();
1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637

	return err;
}

static void virtnet_get_channels(struct net_device *dev,
				 struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);

	channels->combined_count = vi->curr_queue_pairs;
	channels->max_combined = vi->max_queue_pairs;
	channels->max_other = 0;
	channels->rx_count = 0;
	channels->tx_count = 0;
	channels->other_count = 0;
}

1638 1639 1640 1641 1642 1643
/* Check if the user is trying to change anything besides speed/duplex */
static bool virtnet_validate_ethtool_cmd(const struct ethtool_cmd *cmd)
{
	struct ethtool_cmd diff1 = *cmd;
	struct ethtool_cmd diff2 = {};

1644 1645 1646
	/* cmd is always set so we need to clear it, validate the port type
	 * and also without autonegotiation we can ignore advertising
	 */
1647
	ethtool_cmd_speed_set(&diff1, 0);
1648
	diff2.port = PORT_OTHER;
1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691
	diff1.advertising = 0;
	diff1.duplex = 0;
	diff1.cmd = 0;

	return !memcmp(&diff1, &diff2, sizeof(diff1));
}

static int virtnet_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u32 speed;

	speed = ethtool_cmd_speed(cmd);
	/* don't allow custom speed and duplex */
	if (!ethtool_validate_speed(speed) ||
	    !ethtool_validate_duplex(cmd->duplex) ||
	    !virtnet_validate_ethtool_cmd(cmd))
		return -EINVAL;
	vi->speed = speed;
	vi->duplex = cmd->duplex;

	return 0;
}

static int virtnet_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ethtool_cmd_speed_set(cmd, vi->speed);
	cmd->duplex = vi->duplex;
	cmd->port = PORT_OTHER;

	return 0;
}

static void virtnet_init_settings(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	vi->speed = SPEED_UNKNOWN;
	vi->duplex = DUPLEX_UNKNOWN;
}

1692
static const struct ethtool_ops virtnet_ethtool_ops = {
1693
	.get_drvinfo = virtnet_get_drvinfo,
1694
	.get_link = ethtool_op_get_link,
R
Rick Jones 已提交
1695
	.get_ringparam = virtnet_get_ringparam,
1696 1697
	.set_channels = virtnet_set_channels,
	.get_channels = virtnet_get_channels,
1698
	.get_ts_info = ethtool_op_get_ts_info,
1699 1700
	.get_settings = virtnet_get_settings,
	.set_settings = virtnet_set_settings,
1701 1702
};

1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720
static void virtnet_freeze_down(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int i;

	/* Make sure no work handler is accessing the device */
	flush_work(&vi->config_work);

	netif_device_detach(vi->dev);
	cancel_delayed_work_sync(&vi->refill);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++)
			napi_disable(&vi->rq[i].napi);
	}
}

static int init_vqs(struct virtnet_info *vi);
1721
static void _remove_vq_common(struct virtnet_info *vi);
1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746

static int virtnet_restore_up(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err, i;

	err = init_vqs(vi);
	if (err)
		return err;

	virtio_device_ready(vdev);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->curr_queue_pairs; i++)
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		for (i = 0; i < vi->max_queue_pairs; i++)
			virtnet_napi_enable(&vi->rq[i]);
	}

	netif_device_attach(vi->dev);
	return err;
}

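/* Full device reset used when the number of XDP queues changes: tear down
 * the virtqueues, renegotiate features, then bring the device back up with
 * curr_qp queue pairs, xdp_qp of which are reserved for XDP_TX.
 */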
static int virtnet_reset(struct virtnet_info *vi, int curr_qp, int xdp_qp)
{
	struct virtio_device *dev = vi->vdev;
	int ret;

	virtio_config_disable(dev);
	dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED;
	virtnet_freeze_down(dev);
	_remove_vq_common(vi);

	dev->config->reset(dev);
	virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);

	ret = virtio_finalize_features(dev);
	if (ret)
		goto err;

	vi->xdp_queue_pairs = xdp_qp;

	ret = virtnet_restore_up(dev);
	if (ret)
		goto err;

	ret = _virtnet_set_queues(vi, curr_qp);
	if (ret)
		goto err;

	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
	virtio_config_enable(dev);
	return 0;
err:
	virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
	return ret;
}

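/* Attach or detach an XDP program.  Guest receive offloads (host-side LRO)
 * and oversized MTUs are rejected up front, and one extra TX queue per CPU
 * is reserved so XDP_TX never contends with the regular stack queues.
 */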
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
{
	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
	struct virtnet_info *vi = netdev_priv(dev);
	struct bpf_prog *old_prog;
	u16 xdp_qp = 0, curr_qp;
	int i, err;

	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) {
		netdev_warn(dev, "can't set XDP while host is implementing LRO, disable LRO first\n");
		return -EOPNOTSUPP;
	}

	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
		netdev_warn(dev, "XDP expects header/data in single page, any_header_sg required\n");
		return -EINVAL;
	}

	if (dev->mtu > max_sz) {
		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
		return -EINVAL;
	}

	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
	if (prog)
		xdp_qp = nr_cpu_ids;

	/* XDP requires extra queues for XDP_TX */
	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
		netdev_warn(dev, "request %i queues but max is %i\n",
			    curr_qp + xdp_qp, vi->max_queue_pairs);
		return -ENOMEM;
	}

	if (prog) {
		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
		if (IS_ERR(prog))
			return PTR_ERR(prog);
	}

	/* Changing the headroom in buffers is a disruptive operation because
	 * existing buffers must be flushed and reallocated. This will happen
	 * when a xdp program is initially added or xdp is disabled by removing
	 * the xdp program resulting in number of XDP queues changing.
	 */
	if (vi->xdp_queue_pairs != xdp_qp) {
		err = virtnet_reset(vi, curr_qp + xdp_qp, xdp_qp);
		if (err) {
			dev_warn(&dev->dev, "XDP reset failure.\n");
			goto virtio_reset_err;
		}
	}

	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);

	for (i = 0; i < vi->max_queue_pairs; i++) {
		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
		if (old_prog)
			bpf_prog_put(old_prog);
	}

	return 0;

virtio_reset_err:
	/* On reset error do our best to unwind XDP changes inflight and return
	 * error up to user space for resolution. The underlying reset hung on
	 * us so not much we can do here.
	 */
	if (prog)
		bpf_prog_sub(prog, vi->max_queue_pairs - 1);
	return err;
}

static bool virtnet_xdp_query(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (vi->rq[i].xdp_prog)
			return true;
	}
	return false;
}

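/* ndo_xdp entry point: dispatches program setup and attachment queries. */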
static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return virtnet_xdp_set(dev, xdp->prog);
	case XDP_QUERY_PROG:
		xdp->prog_attached = virtnet_xdp_query(dev);
		return 0;
	default:
		return -EINVAL;
	}
}

static const struct net_device_ops virtnet_netdev = {
	.ndo_open            = virtnet_open,
	.ndo_stop   	     = virtnet_close,
	.ndo_start_xmit      = start_xmit,
	.ndo_validate_addr   = eth_validate_addr,
	.ndo_set_mac_address = virtnet_set_mac_address,
	.ndo_set_rx_mode     = virtnet_set_rx_mode,
	.ndo_get_stats64     = virtnet_stats,
	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = virtnet_netpoll,
#endif
	.ndo_xdp		= virtnet_xdp,
};

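/* Deferred handler for config-space interrupts: acknowledge link announce
 * requests and propagate link state changes to the networking stack.
 */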
static void virtnet_config_changed_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, config_work);
	u16 v;

	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
				 struct virtio_net_config, status, &v) < 0)
		return;

	if (v & VIRTIO_NET_S_ANNOUNCE) {
		netdev_notify_peers(vi->dev);
		virtnet_ack_link_announce(vi);
	}

	/* Ignore unknown (future) status bits */
	v &= VIRTIO_NET_S_LINK_UP;

	if (vi->status == v)
		return;

	vi->status = v;

	if (vi->status & VIRTIO_NET_S_LINK_UP) {
		netif_carrier_on(vi->dev);
		netif_tx_wake_all_queues(vi->dev);
	} else {
		netif_carrier_off(vi->dev);
		netif_tx_stop_all_queues(vi->dev);
	}
}

static void virtnet_config_changed(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	schedule_work(&vi->config_work);
}

static void virtnet_free_queues(struct virtnet_info *vi)
{
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		napi_hash_del(&vi->rq[i].napi);
		netif_napi_del(&vi->rq[i].napi);
	}

	/* We called napi_hash_del() before netif_napi_del(),
	 * we need to respect an RCU grace period before freeing vi->rq
	 */
	synchronize_net();

	kfree(vi->rq);
	kfree(vi->sq);
}

static void _free_receive_bufs(struct virtnet_info *vi)
{
	struct bpf_prog *old_prog;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		while (vi->rq[i].pages)
			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);

		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
		if (old_prog)
			bpf_prog_put(old_prog);
	}
}

static void free_receive_bufs(struct virtnet_info *vi)
{
	rtnl_lock();
	_free_receive_bufs(vi);
	rtnl_unlock();
}

static void free_receive_page_frags(struct virtnet_info *vi)
{
	int i;
	for (i = 0; i < vi->max_queue_pairs; i++)
		if (vi->rq[i].alloc_frag.page)
			put_page(vi->rq[i].alloc_frag.page);
}

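/* TX queues at the top of the active range are dedicated to XDP_TX and carry
 * raw pages rather than sk_buffs, so their unused buffers need put_page()
 * instead of dev_kfree_skb().
 */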
static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
{
	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
		return false;
	else if (q < vi->curr_queue_pairs)
		return true;
	else
		return false;
}

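/* Drain buffers the device never consumed from every virtqueue,
 * distinguishing sk_buffs, XDP pages, mergeable buffers and big-packet
 * page chains so each is released the right way.
 */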
static void free_unused_bufs(struct virtnet_info *vi)
{
	void *buf;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->sq[i].vq;
		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (!is_xdp_raw_buffer_queue(vi, i))
				dev_kfree_skb(buf);
			else
				put_page(virt_to_head_page(buf));
		}
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->rq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (vi->mergeable_rx_bufs) {
				unsigned long ctx = (unsigned long)buf;
				void *base = mergeable_ctx_to_buf_address(ctx);
				put_page(virt_to_head_page(base));
			} else if (vi->big_packets) {
				give_pages(&vi->rq[i], buf);
			} else {
				put_page(virt_to_head_page(buf));
			}
		}
	}
}

static void virtnet_del_vqs(struct virtnet_info *vi)
{
	struct virtio_device *vdev = vi->vdev;

	virtnet_clean_affinity(vi, -1);

	vdev->config->del_vqs(vdev);

	virtnet_free_queues(vi);
}

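/* Ask the transport for all virtqueues in one call: one RX/TX pair per
 * queue pair, plus the control virtqueue when VIRTIO_NET_F_CTRL_VQ is set.
 */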
static int virtnet_find_vqs(struct virtnet_info *vi)
{
	vq_callback_t **callbacks;
	struct virtqueue **vqs;
	int ret = -ENOMEM;
	int i, total_vqs;
	const char **names;

	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
	 * possible control vq.
	 */
	total_vqs = vi->max_queue_pairs * 2 +
		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);

	/* Allocate space for find_vqs parameters */
	vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
	if (!vqs)
		goto err_vq;
	callbacks = kmalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
	if (!callbacks)
		goto err_callback;
	names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);
	if (!names)
		goto err_names;

	/* Parameters for control virtqueue, if any */
	if (vi->has_cvq) {
		callbacks[total_vqs - 1] = NULL;
		names[total_vqs - 1] = "control";
	}

	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
	}

	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
					 names, NULL);
	if (ret)
		goto err_find;

	if (vi->has_cvq) {
		vi->cvq = vqs[total_vqs - 1];
		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].vq = vqs[rxq2vq(i)];
		vi->sq[i].vq = vqs[txq2vq(i)];
	}

	kfree(names);
	kfree(callbacks);
	kfree(vqs);

	return 0;

err_find:
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;
}

static int virtnet_alloc_queues(struct virtnet_info *vi)
{
	int i;

	vi->sq = kzalloc(sizeof(*vi->sq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->sq)
		goto err_sq;
	vi->rq = kzalloc(sizeof(*vi->rq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->rq)
		goto err_rq;

	INIT_DELAYED_WORK(&vi->refill, refill_work);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].pages = NULL;
		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
			       napi_weight);

		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
	}

	return 0;

err_rq:
	kfree(vi->sq);
err_sq:
	return -ENOMEM;
}

static int init_vqs(struct virtnet_info *vi)
{
	int ret;

	/* Allocate send & receive queues */
	ret = virtnet_alloc_queues(vi);
	if (ret)
		goto err;

	ret = virtnet_find_vqs(vi);
	if (ret)
		goto err_free;

	get_online_cpus();
	virtnet_set_affinity(vi);
	put_online_cpus();

	return 0;

err_free:
	virtnet_free_queues(vi);
err:
	return ret;
}

#ifdef CONFIG_SYSFS
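/* Expose the EWMA-estimated mergeable buffer size of each RX queue through
 * the per-queue "virtio_net" sysfs group.
 */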
static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
		struct rx_queue_attribute *attribute, char *buf)
{
	struct virtnet_info *vi = netdev_priv(queue->dev);
	unsigned int queue_index = get_netdev_rx_queue_index(queue);
	struct ewma_pkt_len *avg;

	BUG_ON(queue_index >= vi->max_queue_pairs);
	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
	return sprintf(buf, "%u\n", get_mergeable_buf_len(avg));
}

static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
	__ATTR_RO(mergeable_rx_buffer_size);

static struct attribute *virtio_net_mrg_rx_attrs[] = {
	&mergeable_rx_buffer_size_attribute.attr,
	NULL
};

static const struct attribute_group virtio_net_mrg_rx_group = {
	.name = "virtio_net",
	.attrs = virtio_net_mrg_rx_attrs
};
#endif

static bool virtnet_fail_on_feature(struct virtio_device *vdev,
				    unsigned int fbit,
				    const char *fname, const char *dname)
{
	if (!virtio_has_feature(vdev, fbit))
		return false;

	dev_err(&vdev->dev, "device advertises feature %s but not %s",
		fname, dname);

	return true;
}

#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)

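/* Several control-plane features depend on the control virtqueue; reject a
 * device that advertises them without VIRTIO_NET_F_CTRL_VQ.
 */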
static bool virtnet_validate_features(struct virtio_device *vdev)
{
	if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
	    (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
			     "VIRTIO_NET_F_CTRL_VQ"))) {
		return false;
	}

	return true;
}

#define MIN_MTU ETH_MIN_MTU
#define MAX_MTU ETH_MAX_MTU

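/* Probe: validate the feature set, allocate the netdev and queue state,
 * pick up MAC/MTU/offload configuration from config space, then register
 * the device and start the config-change machinery.
 */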
static int virtnet_probe(struct virtio_device *vdev)
{
	int i, err;
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;
	int mtu;

	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

	if (!virtnet_validate_features(vdev))
		return -EINVAL;

	/* Find if host supports multiqueue virtio_net device */
	err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
				   struct virtio_net_config,
				   max_virtqueue_pairs, &max_queue_pairs);

	/* We need at least 2 queues */
	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;

	/* Allocate ourselves a network device with room for our info */
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
	dev->netdev_ops = &virtnet_netdev;
	dev->features = NETIF_F_HIGHDMA;

	dev->ethtool_ops = &virtnet_ethtool_ops;
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
		/* This opens up the world of extra features. */
		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
		if (csum)
			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
			dev->hw_features |= NETIF_F_TSO | NETIF_F_UFO
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
		/* Individual feature bits: what can host handle? */
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UFO))
			dev->hw_features |= NETIF_F_UFO;

		dev->features |= NETIF_F_GSO_ROBUST;

		if (gso)
			dev->features |= dev->hw_features & (NETIF_F_ALL_TSO|NETIF_F_UFO);
		/* (!csum && gso) case will be fixed by register_netdev() */
	}
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
		dev->features |= NETIF_F_RXCSUM;

	dev->vlan_features = dev->features;

	/* MTU range: 68 - 65535 */
	dev->min_mtu = MIN_MTU;
	dev->max_mtu = MAX_MTU;

	/* Configuration may specify what MAC to use.  Otherwise random. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
		virtio_cread_bytes(vdev,
				   offsetof(struct virtio_net_config, mac),
				   dev->dev_addr, dev->addr_len);
	else
		eth_hw_addr_random(dev);

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
	vdev->priv = vi;
	vi->stats = alloc_percpu(struct virtnet_stats);
	err = -ENOMEM;
	if (vi->stats == NULL)
		goto free;

	for_each_possible_cpu(i) {
		struct virtnet_stats *virtnet_stats;
		virtnet_stats = per_cpu_ptr(vi->stats, i);
		u64_stats_init(&virtnet_stats->tx_syncp);
		u64_stats_init(&virtnet_stats->rx_syncp);
	}

	INIT_WORK(&vi->config_work, virtnet_config_changed_work);

	/* If we can receive ANY GSO packets, we must allocate large ones. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
		vi->big_packets = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		vi->hdr_len = sizeof(struct virtio_net_hdr);

	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->any_header_sg = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		mtu = virtio_cread16(vdev,
				     offsetof(struct virtio_net_config,
					      mtu));
		if (mtu < dev->min_mtu) {
			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
		} else {
			dev->mtu = mtu;
			dev->max_mtu = mtu;
		}

		/* TODO: size buffers correctly in this case. */
		if (dev->mtu > ETH_DATA_LEN)
			vi->big_packets = true;
	}

	if (vi->any_header_sg)
		dev->needed_headroom = vi->hdr_len;

	/* Enable multiqueue by default */
	if (num_online_cpus() >= max_queue_pairs)
		vi->curr_queue_pairs = max_queue_pairs;
	else
		vi->curr_queue_pairs = num_online_cpus();
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
	err = init_vqs(vi);
	if (err)
		goto free_stats;

#ifdef CONFIG_SYSFS
	if (vi->mergeable_rx_bufs)
		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
#endif
	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);

	virtnet_init_settings(dev);

	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
		goto free_vqs;
	}

	virtio_device_ready(vdev);

	err = virtnet_cpu_notif_add(vi);
	if (err) {
		pr_debug("virtio_net: registering cpu notifier failed\n");
		goto free_unregister_netdev;
	}

	virtnet_set_queues(vi, vi->curr_queue_pairs);

	/* Assume link up if device can't report link status,
	   otherwise get link status from config. */
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		netif_carrier_off(dev);
		schedule_work(&vi->config_work);
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
		netif_carrier_on(dev);
	}

	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
		 dev->name, max_queue_pairs);

	return 0;

free_unregister_netdev:
	vi->vdev->config->reset(vdev);

	unregister_netdev(dev);
free_vqs:
	cancel_delayed_work_sync(&vi->refill);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
free_stats:
	free_percpu(vi->stats);
free:
	free_netdev(dev);
	return err;
}

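/* Teardown used by the XDP reset path, where the caller already holds RTNL
 * and the netdev stays registered.
 */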
static void _remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);
	free_unused_bufs(vi);
	_free_receive_bufs(vi);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
}

static void remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);

	/* Free unused buffers in both send and recv, if any. */
	free_unused_bufs(vi);

	free_receive_bufs(vi);

	free_receive_page_frags(vi);

	virtnet_del_vqs(vi);
}

static void virtnet_remove(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device. */
	flush_work(&vi->config_work);

	unregister_netdev(vi->dev);

	remove_vq_common(vi);

	free_percpu(vi->stats);
	free_netdev(vi->dev);
}

#ifdef CONFIG_PM_SLEEP
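/* PM hooks: on freeze, quiesce the device and drop the virtqueues; on
 * restore, rebuild them and re-register the CPU hotplug notifier.
 */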
static int virtnet_freeze(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);
	virtnet_freeze_down(vdev);
	remove_vq_common(vi);

	return 0;
}

static int virtnet_restore(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err;

	err = virtnet_restore_up(vdev);
	if (err)
		return err;
	virtnet_set_queues(vi, vi->curr_queue_pairs);

	err = virtnet_cpu_notif_add(vi);
	if (err)
		return err;

	return 0;
}
#endif

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

#define VIRTNET_FEATURES \
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
	VIRTIO_NET_F_MAC, \
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
	VIRTIO_NET_F_CTRL_MAC_ADDR, \
	VIRTIO_NET_F_MTU

static unsigned int features[] = {
	VIRTNET_FEATURES,
};

static unsigned int features_legacy[] = {
	VIRTNET_FEATURES,
	VIRTIO_NET_F_GSO,
	VIRTIO_F_ANY_LAYOUT,
};

static struct virtio_driver virtio_net_driver = {
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
	.feature_table_legacy = features_legacy,
	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
	.probe =	virtnet_probe,
	.remove =	virtnet_remove,
	.config_changed = virtnet_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze =	virtnet_freeze,
	.restore =	virtnet_restore,
#endif
};

static __init int virtio_net_driver_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
				      virtnet_cpu_online,
				      virtnet_cpu_down_prep);
	if (ret < 0)
		goto out;
	virtionet_online = ret;
	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
				      NULL, virtnet_cpu_dead);
	if (ret)
		goto err_dead;

	ret = register_virtio_driver(&virtio_net_driver);
	if (ret)
		goto err_virtio;
	return 0;
err_virtio:
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
err_dead:
	cpuhp_remove_multi_state(virtionet_online);
out:
	return ret;
}
module_init(virtio_net_driver_init);

static __exit void virtio_net_driver_exit(void)
{
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
	cpuhp_remove_multi_state(virtionet_online);
	unregister_virtio_driver(&virtio_net_driver);
}
module_exit(virtio_net_driver_exit);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");