/* A network driver using virtio.
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/scatterlist.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/average.h>

static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);

static bool csum = true, gso = true;
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);

/* FIXME: MTU in config. */
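/* GOOD_PACKET_LEN works out to a 14-byte Ethernet header + 4-byte VLAN tag
 * + 1500-byte payload = 1518 bytes.
 */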
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
#define GOOD_COPY_LEN	128

#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
#define VIRTIO_XDP_HEADROOM 256
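/* 256 bytes matches XDP_PACKET_HEADROOM, leaving an XDP program room to grow
 * the frame towards the front with bpf_xdp_adjust_head().
 */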

/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
 */
DECLARE_EWMA(pkt_len, 0, 64)
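/* A weight of 1/64 means each new packet-length sample moves the average by
 * roughly 1/64th of its distance from the current value, so a short burst of
 * unusual sizes barely shifts it.
 */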

/* With mergeable buffers we align buffer address and use the low bits to
 * encode its true size. Buffer size is up to 1 page so we need to align to
 * square root of page size to ensure we reserve enough bits to encode the true
 * size.
 */
#define MERGEABLE_BUFFER_MIN_ALIGN_SHIFT ((PAGE_SHIFT + 1) / 2)
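/* Worked example, assuming 4 KiB pages (PAGE_SHIFT == 12): the shift is
 * (12 + 1) / 2 = 6, i.e. at least 64-byte alignment, which leaves enough low
 * bits to encode a truesize of up to 64 * 64 = 4096 bytes (one full page).
 */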

/* Minimum alignment for mergeable packet buffers. */
#define MERGEABLE_BUFFER_ALIGN max(L1_CACHE_BYTES, \
				   1 << MERGEABLE_BUFFER_MIN_ALIGN_SHIFT)

#define VIRTNET_DRIVER_VERSION "1.0.0"

struct virtnet_stats {
	struct u64_stats_sync tx_syncp;
	struct u64_stats_sync rx_syncp;
	u64 tx_bytes;
	u64 tx_packets;

	u64 rx_bytes;
	u64 rx_packets;
};

/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send_queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of the send queue: output.$index */
	char name[40];
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

	struct napi_struct napi;

	struct bpf_prog __rcu *xdp_prog;

	/* Chain pages by the private ptr. */
	struct page *pages;

	/* Average packet length for mergeable receive buffers. */
	struct ewma_pkt_len mrg_avg_pkt_len;

	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of this receive queue: input.$index */
	char name[40];
};

struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
	struct send_queue *sq;
	struct receive_queue *rq;
	unsigned int status;

	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

	/* # of XDP queue pairs currently used by the driver */
	u16 xdp_queue_pairs;

	/* I like... big packets and I cannot lie! */
	bool big_packets;

	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

	/* Has control virtqueue */
	bool has_cvq;

	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

	/* Packet virtio header size */
	u8 hdr_len;

	/* Active statistics */
	struct virtnet_stats __percpu *stats;

	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

	/* Work struct for config space updates */
	struct work_struct config_work;

	/* Is the affinity hint set for the virtqueues? */
	bool affinity_hint_set;

	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;

	/* Control VQ buffers: protected by the rtnl lock */
	struct virtio_net_ctrl_hdr ctrl_hdr;
	virtio_net_ctrl_ack ctrl_status;
	struct virtio_net_ctrl_mq ctrl_mq;
	u8 ctrl_promisc;
	u8 ctrl_allmulti;
	u16 ctrl_vid;

	/* Ethtool settings */
	u8 duplex;
	u32 speed;
};

struct padded_vnet_hdr {
	struct virtio_net_hdr_mrg_rxbuf hdr;
	/*
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header: the 12-byte virtio_net_hdr_mrg_rxbuf plus 4 bytes
	 * of padding rounds the header area up to 16 bytes.
	 */
	char padding[4];
};

/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
	return (vq->index - 1) / 2;
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
	return vq->index / 2;
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}

static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
{
	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
}

/*
 * private is used to chain pages for big packets; put the whole
 * most recently used list at the front for reuse
 */
static void give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end;

	/* Find end of list, sew whole thing into vi->rq.pages. */
	for (end = page; end->private; end = (struct page *)end->private);
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}

static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
{
	struct page *p = rq->pages;

	if (p) {
		rq->pages = (struct page *)p->private;
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
		p = alloc_page(gfp_mask);
	return p;
}

static void virtqueue_napi_schedule(struct napi_struct *napi,
				    struct virtqueue *vq)
{
	if (napi_schedule_prep(napi)) {
		virtqueue_disable_cb(vq);
		__napi_schedule(napi);
	}
}

static void virtqueue_napi_complete(struct napi_struct *napi,
				    struct virtqueue *vq, int processed)
{
	int opaque;

	opaque = virtqueue_enable_cb_prepare(vq);
	if (napi_complete_done(napi, processed) &&
	    unlikely(virtqueue_poll(vq, opaque)))
		virtqueue_napi_schedule(napi, vq);
}

static void skb_xmit_done(struct virtqueue *vq)
{
	struct virtnet_info *vi = vq->vdev->priv;

	/* Suppress further interrupts. */
	virtqueue_disable_cb(vq);

	/* We were probably waiting for more output buffers. */
	netif_wake_subqueue(vi->dev, vq2txq(vq));
}

static unsigned int mergeable_ctx_to_buf_truesize(unsigned long mrg_ctx)
{
	unsigned int truesize = mrg_ctx & (MERGEABLE_BUFFER_ALIGN - 1);
	return (truesize + 1) * MERGEABLE_BUFFER_ALIGN;
}

static void *mergeable_ctx_to_buf_address(unsigned long mrg_ctx)
{
	return (void *)(mrg_ctx & -MERGEABLE_BUFFER_ALIGN);

}

static unsigned long mergeable_buf_to_ctx(void *buf, unsigned int truesize)
{
	unsigned int size = truesize / MERGEABLE_BUFFER_ALIGN;
	return (unsigned long)buf | (size - 1);
}
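
/* Round-trip example, assuming MERGEABLE_BUFFER_ALIGN == 64: for an aligned
 * buffer with truesize 1536, mergeable_buf_to_ctx() stores 1536 / 64 - 1 = 23
 * in the low bits; mergeable_ctx_to_buf_truesize() recovers (23 + 1) * 64 =
 * 1536, and mergeable_ctx_to_buf_address() masks the low bits off to recover
 * the buffer address.
 */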

/* Called from bottom half context */
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
				   struct receive_queue *rq,
				   struct page *page, unsigned int offset,
				   unsigned int len, unsigned int truesize)
{
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	unsigned int copy, hdr_len, hdr_padded_len;
	char *p;

	p = page_address(page) + offset;

	/* copy small packet so we can reuse these pages for small data */
	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
	if (unlikely(!skb))
		return NULL;

	hdr = skb_vnet_hdr(skb);

	hdr_len = vi->hdr_len;
	if (vi->mergeable_rx_bufs)
		hdr_padded_len = sizeof *hdr;
	else
		hdr_padded_len = sizeof(struct padded_vnet_hdr);

	memcpy(hdr, p, hdr_len);

	len -= hdr_len;
	offset += hdr_padded_len;
	p += hdr_padded_len;

	copy = len;
	if (copy > skb_tailroom(skb))
		copy = skb_tailroom(skb);
	memcpy(skb_put(skb, copy), p, copy);

	len -= copy;
	offset += copy;

	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
			put_page(page);
		return skb;
	}

	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
		dev_kfree_skb(skb);
		return NULL;
	}
	BUG_ON(offset >= PAGE_SIZE);
	while (len) {
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
		page = (struct page *)page->private;
		offset = 0;
	}

	if (page)
		give_pages(rq, page);

	return skb;
}

static bool virtnet_xdp_xmit(struct virtnet_info *vi,
			     struct receive_queue *rq,
			     struct xdp_buff *xdp)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	unsigned int len;
	struct send_queue *sq;
	unsigned int qp;
	void *xdp_sent;
	int err;

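	/* XDP_TX transmits on the extra per-cpu send queues reserved when the
	 * XDP program was attached: vi->curr_queue_pairs - vi->xdp_queue_pairs
	 * is the first XDP-only queue, and each CPU uses its own, so no lock
	 * is needed here.
	 */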
	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
	sq = &vi->sq[qp];

	/* Free up any pending old buffers before queueing new ones. */
	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		struct page *sent_page = virt_to_head_page(xdp_sent);

		put_page(sent_page);
	}

	xdp->data -= vi->hdr_len;
	/* Zero header and leave csum up to XDP layers */
	hdr = xdp->data;
	memset(hdr, 0, vi->hdr_len);

	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);

	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp->data, GFP_ATOMIC);
	if (unlikely(err)) {
		struct page *page = virt_to_head_page(xdp->data);

		put_page(page);
		return false;
	}

	virtqueue_kick(sq->vq);
	return true;
}

static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
{
	return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
}

static struct sk_buff *receive_small(struct net_device *dev,
				     struct virtnet_info *vi,
				     struct receive_queue *rq,
				     void *buf, unsigned int len)
{
	struct sk_buff *skb;
	struct bpf_prog *xdp_prog;
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
	unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
	unsigned int headroom = vi->hdr_len + header_offset;
	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	unsigned int delta = 0;
	len -= vi->hdr_len;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
		struct xdp_buff xdp;
		void *orig_data;
		u32 act;

		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
			goto err_xdp;

		xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
		xdp.data = xdp.data_hard_start + xdp_headroom;
		xdp.data_end = xdp.data + len;
		orig_data = xdp.data;
		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			/* Recalculate length in case bpf program changed it */
			delta = orig_data - xdp.data;
			break;
		case XDP_TX:
			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
				trace_xdp_exception(vi->dev, xdp_prog, act);
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
		case XDP_DROP:
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	skb = build_skb(buf, buflen);
	if (!skb) {
		put_page(virt_to_head_page(buf));
		goto err;
	}
	skb_reserve(skb, headroom - delta);
	skb_put(skb, len + delta);
	if (!delta) {
		buf += header_offset;
		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
	} /* keep zeroed vnet hdr since packet was changed by bpf */

err:
	return skb;

err_xdp:
	rcu_read_unlock();
	dev->stats.rx_dropped++;
	put_page(virt_to_head_page(buf));
xdp_xmit:
	return NULL;
}

static struct sk_buff *receive_big(struct net_device *dev,
				   struct virtnet_info *vi,
				   struct receive_queue *rq,
				   void *buf,
				   unsigned int len)
{
	struct page *page = buf;
	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);

	if (unlikely(!skb))
		goto err;

	return skb;

err:
	dev->stats.rx_dropped++;
	give_pages(rq, page);
	return NULL;
}

/* The conditions to enable XDP should preclude the underlying device from
 * sending packets across multiple buffers (num_buf > 1). However per spec
 * it does not appear to be illegal to do so but rather just against convention.
 * So in order to avoid making a system unresponsive the packets are pushed
 * into a page and the XDP program is run. This will be extremely slow and we
 * push a warning to the user to fix this as soon as possible. Fixing this may
 * require resolving the underlying hardware to determine why multiple buffers
 * are being received or simply loading the XDP program in the ingress stack
 * after the skb is built because there is no advantage to running it here
 * anymore.
 */
static struct page *xdp_linearize_page(struct receive_queue *rq,
				       u16 *num_buf,
				       struct page *p,
				       int offset,
				       unsigned int *len)
{
	struct page *page = alloc_page(GFP_ATOMIC);
	unsigned int page_off = VIRTIO_XDP_HEADROOM;

	if (!page)
		return NULL;

	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
	page_off += *len;

	while (--*num_buf) {
		unsigned int buflen;
		unsigned long ctx;
		void *buf;
		int off;

		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &buflen);
		if (unlikely(!ctx))
			goto err_buf;

		buf = mergeable_ctx_to_buf_address(ctx);
		p = virt_to_head_page(buf);
		off = buf - page_address(p);

		/* guard against a misconfigured or uncooperative backend that
		 * is sending packet larger than the MTU.
		 */
		if ((page_off + buflen) > PAGE_SIZE) {
			put_page(p);
			goto err_buf;
		}

		memcpy(page_address(page) + page_off,
		       page_address(p) + off, buflen);
		page_off += buflen;
		put_page(p);
	}

	/* Headroom does not contribute to packet length */
	*len = page_off - VIRTIO_XDP_HEADROOM;
	return page;
err_buf:
	__free_pages(page, 0);
	return NULL;
}

static struct sk_buff *receive_mergeable(struct net_device *dev,
					 struct virtnet_info *vi,
					 struct receive_queue *rq,
					 unsigned long ctx,
					 unsigned int len)
{
	void *buf = mergeable_ctx_to_buf_address(ctx);
	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
	struct sk_buff *head_skb, *curr_skb;
	struct bpf_prog *xdp_prog;
	unsigned int truesize;

	head_skb = NULL;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct page *xdp_page;
		struct xdp_buff xdp;
		void *data;
		u32 act;

		/* This happens when rx buffer size is underestimated */
		if (unlikely(num_buf > 1)) {
			/* linearize data for XDP */
			xdp_page = xdp_linearize_page(rq, &num_buf,
						      page, offset, &len);
			if (!xdp_page)
				goto err_xdp;
			offset = VIRTIO_XDP_HEADROOM;
		} else {
			xdp_page = page;
		}

		/* Transient failure which in theory could occur if
		 * in-flight packets from before XDP was enabled reach
		 * the receive path after XDP is loaded. In practice I
		 * was not able to create this condition.
		 */
		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;

		/* Allow consuming headroom but reserve enough space to push
		 * the descriptor on if we get an XDP_TX return code.
		 */
		data = page_address(xdp_page) + offset;
		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
		xdp.data = data + vi->hdr_len;
		xdp.data_end = xdp.data + (len - vi->hdr_len);
		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			/* recalculate offset to account for any header
			 * adjustments. Note other cases do not build an
			 * skb and avoid using offset
			 */
			offset = xdp.data -
					page_address(xdp_page) - vi->hdr_len;

			/* We can only create skb based on xdp_page. */
			if (unlikely(xdp_page != page)) {
				rcu_read_unlock();
				put_page(page);
				head_skb = page_to_skb(vi, rq, xdp_page,
						       offset, len, PAGE_SIZE);
				ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
				return head_skb;
			}
			break;
		case XDP_TX:
			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
				trace_xdp_exception(vi->dev, xdp_prog, act);
			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
			if (unlikely(xdp_page != page))
				goto err_xdp;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
		case XDP_DROP:
			if (unlikely(xdp_page != page))
				__free_pages(xdp_page, 0);
			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
	curr_skb = head_skb;

	if (unlikely(!curr_skb))
		goto err_skb;
	while (--num_buf) {
		int num_skb_frags;

		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!ctx)) {
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
				 dev->name, num_buf,
				 virtio16_to_cpu(vi->vdev,
						 hdr->num_buffers));
			dev->stats.rx_length_errors++;
			goto err_buf;
		}

		buf = mergeable_ctx_to_buf_address(ctx);
		page = virt_to_head_page(buf);

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);

			if (unlikely(!nskb))
				goto err_skb;
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
		truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
			head_skb->truesize += truesize;
		}
		offset = buf - page_address(page);
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
					     len, truesize);
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
					offset, len, truesize);
		}
	}

	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
	return head_skb;

err_xdp:
	rcu_read_unlock();
err_skb:
	put_page(page);
	while (--num_buf) {
		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!ctx)) {
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
		page = virt_to_head_page(mergeable_ctx_to_buf_address(ctx));
		put_page(page);
	}
err_buf:
	dev->stats.rx_dropped++;
	dev_kfree_skb(head_skb);
xdp_xmit:
	return NULL;
}

static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
		       void *buf, unsigned int len)
742
{
743
	struct net_device *dev = vi->dev;
744
	struct sk_buff *skb;
745
	struct virtio_net_hdr_mrg_rxbuf *hdr;
J
Jason Wang 已提交
746
	int ret;
747

748
	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
749 750
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
751 752 753 754 755
		if (vi->mergeable_rx_bufs) {
			unsigned long ctx = (unsigned long)buf;
			void *base = mergeable_ctx_to_buf_address(ctx);
			put_page(virt_to_head_page(base));
		} else if (vi->big_packets) {
756
			give_pages(rq, buf);
757
		} else {
758
			put_page(virt_to_head_page(buf));
759
		}
J
Jason Wang 已提交
760
		return 0;
761
	}
762

763
	if (vi->mergeable_rx_bufs)
M
Michael S. Tsirkin 已提交
764
		skb = receive_mergeable(dev, vi, rq, (unsigned long)buf, len);
765
	else if (vi->big_packets)
M
Michael S. Tsirkin 已提交
766
		skb = receive_big(dev, vi, rq, buf, len);
767
	else
768
		skb = receive_small(dev, vi, rq, buf, len);
769 770

	if (unlikely(!skb))
J
Jason Wang 已提交
771
		return 0;
772

773
	hdr = skb_vnet_hdr(skb);
774

J
Jason Wang 已提交
775
	ret = skb->len;
R
Rusty Russell 已提交
776

777
	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
778
		skb->ip_summed = CHECKSUM_UNNECESSARY;
R
Rusty Russell 已提交
779

780 781 782 783 784 785
	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
				  virtio_is_little_endian(vi->vdev))) {
		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
				     dev->name, hdr->hdr.gso_type,
				     hdr->hdr.gso_size);
		goto frame_err;
R
Rusty Russell 已提交
786 787
	}

788 789 790 791
	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

E
Eric Dumazet 已提交
792
	napi_gro_receive(&rq->napi, skb);
J
Jason Wang 已提交
793
	return ret;
R
Rusty Russell 已提交
794 795 796 797

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
J
Jason Wang 已提交
798
	return 0;
R
Rusty Russell 已提交
799 800
}

M
Michael S. Tsirkin 已提交
801 802
static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
			     gfp_t gfp)
R
Rusty Russell 已提交
803
{
804 805
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
806
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
807
	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
808
	int err;
809

810 811 812
	len = SKB_DATA_ALIGN(len) +
	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
813
		return -ENOMEM;
R
Rusty Russell 已提交
814

815 816 817 818 819 820
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
		    vi->hdr_len + GOOD_PACKET_LEN);
	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
821
	if (err < 0)
822
		put_page(virt_to_head_page(buf));
823

824 825
	return err;
}
826

827 828
static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
			   gfp_t gfp)
829 830 831 832 833
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

834 835
	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);

836
	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
837
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
838
		first = get_a_page(rq, gfp);
839 840
		if (!first) {
			if (list)
841
				give_pages(rq, list);
842
			return -ENOMEM;
843
		}
844
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
845

846 847 848 849
		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}
R
Rusty Russell 已提交
850

851
	first = get_a_page(rq, gfp);
852
	if (!first) {
853
		give_pages(rq, list);
854 855 856 857
		return -ENOMEM;
	}
	p = page_address(first);

858
	/* rq->sg[0], rq->sg[1] share the same page */
859 860
	/* a separated rq->sg[0] for header - required in case !any_header_sg */
	sg_set_buf(&rq->sg[0], p, vi->hdr_len);
861

862
	/* rq->sg[1] for data packet, from offset */
863
	offset = sizeof(struct padded_vnet_hdr);
864
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
865 866 867

	/* chain first in list head */
	first->private = (unsigned long)list;
868 869
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
870
	if (err < 0)
871
		give_pages(rq, first);
872 873

	return err;
R
Rusty Russell 已提交
874 875
}

J
Johannes Berg 已提交
876
static unsigned int get_mergeable_buf_len(struct ewma_pkt_len *avg_pkt_len)
877
{
878
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
879 880
	unsigned int len;

J
Johannes Berg 已提交
881
	len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
882 883 884 885
			GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
	return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
}
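
/* Example, assuming a 64-byte MERGEABLE_BUFFER_ALIGN: an average packet
 * length of 1000 bytes is clamped up to GOOD_PACKET_LEN (1518), the 12-byte
 * mergeable header brings it to 1530, and ALIGN() rounds the buffer size up
 * to 1536 bytes.
 */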

886 887
static int add_recvbuf_mergeable(struct virtnet_info *vi,
				 struct receive_queue *rq, gfp_t gfp)
888
{
889
	struct page_frag *alloc_frag = &rq->alloc_frag;
890
	unsigned int headroom = virtnet_get_headroom(vi);
891
	char *buf;
892
	unsigned long ctx;
893
	int err;
894
	unsigned int len, hole;
895

896
	len = get_mergeable_buf_len(&rq->mrg_avg_pkt_len);
897
	if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp)))
898
		return -ENOMEM;
899

900
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
901
	buf += headroom; /* advance address leaving hole at front of pkt */
902
	ctx = mergeable_buf_to_ctx(buf, len);
903
	get_page(alloc_frag->page);
904
	alloc_frag->offset += len + headroom;
905
	hole = alloc_frag->size - alloc_frag->offset;
906
	if (hole < len + headroom) {
907 908 909 910 911
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
		 * the current buffer. This extra space is not included in
		 * the truesize stored in ctx.
		 */
912 913 914
		len += hole;
		alloc_frag->offset += hole;
	}
915

916
	sg_init_one(rq->sg, buf, len);
917
	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, (void *)ctx, gfp);
918
	if (err < 0)
919
		put_page(virt_to_head_page(buf));
920

921 922
	return err;
}
923

924 925 926 927 928 929 930
/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
M
Michael S. Tsirkin 已提交
931 932
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
933 934
{
	int err;
935
	bool oom;
936

937
	gfp |= __GFP_COLD;
938 939
	do {
		if (vi->mergeable_rx_bufs)
940
			err = add_recvbuf_mergeable(vi, rq, gfp);
941
		else if (vi->big_packets)
942
			err = add_recvbuf_big(vi, rq, gfp);
943
		else
M
Michael S. Tsirkin 已提交
944
			err = add_recvbuf_small(vi, rq, gfp);
945

946
		oom = err == -ENOMEM;
947
		if (err)
948
			break;
949
	} while (rq->vq->num_free);
950
	virtqueue_kick(rq->vq);
951
	return !oom;
952 953
}

954
static void skb_recv_done(struct virtqueue *rvq)
R
Rusty Russell 已提交
955 956
{
	struct virtnet_info *vi = rvq->vdev->priv;
J
Jason Wang 已提交
957
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
958

959
	virtqueue_napi_schedule(&rq->napi, rvq);
R
Rusty Russell 已提交
960 961
}

962
static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
963
{
964
	napi_enable(napi);
965 966

	/* If all buffers were filled by other side before we napi_enabled, we
967 968 969 970 971 972
	 * won't get another interrupt, so process any outstanding packets now.
	 * Call local_bh_enable after to trigger softIRQ processing.
	 */
	local_bh_disable();
	virtqueue_napi_schedule(napi, vq);
	local_bh_enable();
973 974
}

975 976
static void refill_work(struct work_struct *work)
{
977 978
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
979
	bool still_empty;
J
Jason Wang 已提交
980 981
	int i;

982
	for (i = 0; i < vi->curr_queue_pairs; i++) {
J
Jason Wang 已提交
983
		struct receive_queue *rq = &vi->rq[i];
984

J
Jason Wang 已提交
985
		napi_disable(&rq->napi);
M
Michael S. Tsirkin 已提交
986
		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
987
		virtnet_napi_enable(rq->vq, &rq->napi);
988

J
Jason Wang 已提交
989 990 991 992 993 994
		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
995 996
}

997
static int virtnet_receive(struct receive_queue *rq, int budget)
R
Rusty Russell 已提交
998
{
999
	struct virtnet_info *vi = rq->vq->vdev->priv;
J
Jason Wang 已提交
1000
	unsigned int len, received = 0, bytes = 0;
1001
	void *buf;
J
Jason Wang 已提交
1002
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
R
Rusty Russell 已提交
1003 1004

	while (received < budget &&
1005
	       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
J
Jason Wang 已提交
1006
		bytes += receive_buf(vi, rq, buf, len);
R
Rusty Russell 已提交
1007 1008 1009
		received++;
	}

1010
	if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
M
Michael S. Tsirkin 已提交
1011
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
1012
			schedule_delayed_work(&vi->refill, 0);
1013
	}
R
Rusty Russell 已提交
1014

J
Jason Wang 已提交
1015 1016 1017 1018 1019
	u64_stats_update_begin(&stats->rx_syncp);
	stats->rx_bytes += bytes;
	stats->rx_packets += received;
	u64_stats_update_end(&stats->rx_syncp);

1020 1021 1022 1023 1024 1025 1026
	return received;
}

static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
1027
	unsigned int received;
1028

1029
	received = virtnet_receive(rq, budget);
1030

1031
	/* Out of packets? */
1032 1033
	if (received < budget)
		virtqueue_napi_complete(napi, rq->vq, received);
R
Rusty Russell 已提交
1034 1035 1036 1037

	return received;
}

J
Jason Wang 已提交
1038 1039 1040 1041 1042
static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

1043 1044 1045
	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
M
Michael S. Tsirkin 已提交
1046
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1047
				schedule_delayed_work(&vi->refill, 0);
1048
		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
J
Jason Wang 已提交
1049 1050 1051 1052 1053
	}

	return 0;
}

1054
static void free_old_xmit_skbs(struct send_queue *sq)
R
Rusty Russell 已提交
1055 1056
{
	struct sk_buff *skb;
1057
	unsigned int len;
1058
	struct virtnet_info *vi = sq->vq->vdev->priv;
E
Eric Dumazet 已提交
1059
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
J
Jason Wang 已提交
1060 1061
	unsigned int packets = 0;
	unsigned int bytes = 0;
R
Rusty Russell 已提交
1062

1063
	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
R
Rusty Russell 已提交
1064
		pr_debug("Sent skb %p\n", skb);
1065

J
Jason Wang 已提交
1066 1067
		bytes += skb->len;
		packets++;
1068

1069
		dev_kfree_skb_any(skb);
R
Rusty Russell 已提交
1070
	}
J
Jason Wang 已提交
1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081

	/* Avoid overhead when no packets have been processed;
	 * this happens when called speculatively from start_xmit.
	 */
	if (!packets)
		return;

	u64_stats_update_begin(&stats->tx_syncp);
	stats->tx_bytes += bytes;
	stats->tx_packets += packets;
	u64_stats_update_end(&stats->tx_syncp);
R
Rusty Russell 已提交
1082 1083
}

1084
static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
R
Rusty Russell 已提交
1085
{
1086
	struct virtio_net_hdr_mrg_rxbuf *hdr;
R
Rusty Russell 已提交
1087
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
1088
	struct virtnet_info *vi = sq->vq->vdev->priv;
1089
	unsigned num_sg;
1090
	unsigned hdr_len = vi->hdr_len;
1091
	bool can_push;
R
Rusty Russell 已提交
1092

J
Johannes Berg 已提交
1093
	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
1094 1095 1096 1097 1098 1099 1100

	can_push = vi->any_header_sg &&
		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
1101
		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
1102 1103
	else
		hdr = skb_vnet_hdr(skb);
R
Rusty Russell 已提交
1104

1105
	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
1106
				    virtio_is_little_endian(vi->vdev), false))
1107
		BUG();
R
Rusty Russell 已提交
1108

1109
	if (vi->mergeable_rx_bufs)
1110
		hdr->num_buffers = 0;
1111

1112
	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
1113 1114 1115 1116 1117 1118 1119 1120 1121
	if (can_push) {
		__skb_push(skb, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
		/* Pull header back to avoid skew in tx bytes calculations. */
		__skb_pull(skb, hdr_len);
	} else {
		sg_set_buf(sq->sg, hdr, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
	}
1122
	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
1123 1124
}

1125
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
1126 1127
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1128 1129
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
1130
	int err;
1131 1132
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
	bool kick = !skb->xmit_more;
1133 1134

	/* Free up any pending old buffers before queueing new ones. */
1135
	free_old_xmit_skbs(sq);
1136

1137 1138 1139
	/* timestamp packet in software */
	skb_tx_timestamp(skb);

1140
	/* Try to transmit */
1141
	err = xmit_skb(sq, skb);
1142

1143
	/* This should not happen! */
1144
	if (unlikely(err)) {
1145 1146 1147
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
1148
				 "Unexpected TXQ (%d) queue failure: %d\n", qnum, err);
1149
		dev->stats.tx_dropped++;
1150
		dev_kfree_skb_any(skb);
1151
		return NETDEV_TX_OK;
R
Rusty Russell 已提交
1152
	}
1153

1154 1155 1156 1157
	/* Don't wait up for transmitted skbs to be freed. */
	skb_orphan(skb);
	nf_reset(skb);

1158 1159 1160 1161 1162 1163 1164 1165 1166
	/* If running out of space, stop queue to avoid getting packets that we
	 * are then unable to transmit.
	 * An alternative would be to force queuing layer to requeue the skb by
	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
	 * returned in a normal path of operation: it means that driver is not
	 * maintaining the TX queue stop/start state properly, and causes
	 * the stack to do a non-trivial amount of useless work.
	 * Since most packets only take 1 or 2 ring slots, stopping the queue
	 * early means 16 slots are typically wasted.
1167
	 */
1168
	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1169
		netif_stop_subqueue(dev, qnum);
1170
		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
1171
			/* More just got used, free them then recheck. */
1172 1173
			free_old_xmit_skbs(sq);
			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1174
				netif_start_subqueue(dev, qnum);
1175
				virtqueue_disable_cb(sq->vq);
1176 1177
			}
		}
1178
	}
1179

1180
	if (kick || netif_xmit_stopped(txq))
1181
		virtqueue_kick(sq->vq);
R
Rusty Russell 已提交
1182

1183
	return NETDEV_TX_OK;
1184 1185
}

1186 1187 1188
/*
 * Send command via the control virtqueue and check status.  Commands
 * supported by the hypervisor, as indicated by feature bits, should
S
stephen hemminger 已提交
1189
 * never fail unless improperly formatted.
1190 1191
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
1192
				 struct scatterlist *out)
1193
{
1194
	struct scatterlist *sgs[4], hdr, stat;
1195
	unsigned out_num = 0, tmp;
1196 1197

	/* Caller should know better */
1198
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
1199

1200 1201 1202
	vi->ctrl_status = ~0;
	vi->ctrl_hdr.class = class;
	vi->ctrl_hdr.cmd = cmd;
1203
	/* Add header */
1204
	sg_init_one(&hdr, &vi->ctrl_hdr, sizeof(vi->ctrl_hdr));
1205
	sgs[out_num++] = &hdr;
1206

1207 1208
	if (out)
		sgs[out_num++] = out;
1209

1210
	/* Add return status. */
1211
	sg_init_one(&stat, &vi->ctrl_status, sizeof(vi->ctrl_status));
1212
	sgs[out_num] = &stat;
1213

1214
	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
1215
	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
1216

1217
	if (unlikely(!virtqueue_kick(vi->cvq)))
1218
		return vi->ctrl_status == VIRTIO_NET_OK;
1219 1220 1221 1222

	/* Spin for a response, the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
1223 1224
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
1225 1226
		cpu_relax();

1227
	return vi->ctrl_status == VIRTIO_NET_OK;
1228 1229
}

1230 1231 1232 1233
static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
1234
	int ret;
1235
	struct sockaddr *addr;
1236
	struct scatterlist sg;
1237

1238
	addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
1239 1240 1241 1242
	if (!addr)
		return -ENOMEM;

	ret = eth_prepare_mac_addr_change(dev, addr);
1243
	if (ret)
1244
		goto out;
1245

1246 1247 1248
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
		sg_init_one(&sg, addr->sa_data, dev->addr_len);
		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1249
					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
1250 1251
			dev_warn(&vdev->dev,
				 "Failed to set mac address by vq command.\n");
1252 1253
			ret = -EINVAL;
			goto out;
1254
		}
1255 1256
	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1257 1258 1259 1260 1261 1262 1263
		unsigned int i;

		/* Naturally, this has an atomicity problem. */
		for (i = 0; i < dev->addr_len; i++)
			virtio_cwrite8(vdev,
				       offsetof(struct virtio_net_config, mac) +
				       i, addr->sa_data[i]);
1264 1265 1266
	}

	eth_commit_mac_addr_change(dev, p);
1267
	ret = 0;
1268

1269 1270 1271
out:
	kfree(addr);
	return ret;
1272 1273
}

1274 1275
static void virtnet_stats(struct net_device *dev,
			  struct rtnl_link_stats64 *tot)
1276 1277 1278 1279 1280 1281
{
	struct virtnet_info *vi = netdev_priv(dev);
	int cpu;
	unsigned int start;

	for_each_possible_cpu(cpu) {
E
Eric Dumazet 已提交
1282
		struct virtnet_stats *stats = per_cpu_ptr(vi->stats, cpu);
1283 1284 1285
		u64 tpackets, tbytes, rpackets, rbytes;

		do {
1286
			start = u64_stats_fetch_begin_irq(&stats->tx_syncp);
1287 1288
			tpackets = stats->tx_packets;
			tbytes   = stats->tx_bytes;
1289
		} while (u64_stats_fetch_retry_irq(&stats->tx_syncp, start));
1290 1291

		do {
1292
			start = u64_stats_fetch_begin_irq(&stats->rx_syncp);
1293 1294
			rpackets = stats->rx_packets;
			rbytes   = stats->rx_bytes;
1295
		} while (u64_stats_fetch_retry_irq(&stats->rx_syncp, start));
1296 1297 1298 1299 1300 1301 1302 1303

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes   += rbytes;
		tot->tx_bytes   += tbytes;
	}

	tot->tx_dropped = dev->stats.tx_dropped;
1304
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
1305 1306 1307 1308 1309
	tot->rx_dropped = dev->stats.rx_dropped;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
}

1310 1311 1312 1313
#ifdef CONFIG_NET_POLL_CONTROLLER
static void virtnet_netpoll(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1314
	int i;
1315

J
Jason Wang 已提交
1316 1317
	for (i = 0; i < vi->curr_queue_pairs; i++)
		napi_schedule(&vi->rq[i].napi);
1318 1319 1320
}
#endif

1321 1322 1323 1324
static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
1325
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
1326 1327 1328 1329
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}

1330
static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
J
Jason Wang 已提交
1331 1332 1333 1334 1335 1336 1337
{
	struct scatterlist sg;
	struct net_device *dev = vi->dev;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

1338 1339
	vi->ctrl_mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
	sg_init_one(&sg, &vi->ctrl_mq, sizeof(vi->ctrl_mq));
J
Jason Wang 已提交
1340 1341

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
1342
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
J
Jason Wang 已提交
1343 1344 1345
		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
1346
	} else {
J
Jason Wang 已提交
1347
		vi->curr_queue_pairs = queue_pairs;
1348 1349 1350
		/* virtnet_open() will refill when device is going to up. */
		if (dev->flags & IFF_UP)
			schedule_delayed_work(&vi->refill, 0);
1351
	}
J
Jason Wang 已提交
1352 1353 1354 1355

	return 0;
}

1356 1357 1358 1359 1360 1361 1362 1363 1364 1365
static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	int err;

	rtnl_lock();
	err = _virtnet_set_queues(vi, queue_pairs);
	rtnl_unlock();
	return err;
}

R
Rusty Russell 已提交
1366 1367 1368
static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1369
	int i;
R
Rusty Russell 已提交
1370

1371 1372
	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);
J
Jason Wang 已提交
1373 1374 1375

	for (i = 0; i < vi->max_queue_pairs; i++)
		napi_disable(&vi->rq[i].napi);
R
Rusty Russell 已提交
1376 1377 1378 1379

	return 0;
}

1380 1381 1382
static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
1383 1384
	struct scatterlist sg[2];
	struct virtio_net_ctrl_mac *mac_data;
J
Jiri Pirko 已提交
1385
	struct netdev_hw_addr *ha;
1386
	int uc_count;
1387
	int mc_count;
1388 1389
	void *buf;
	int i;
1390

S
stephen hemminger 已提交
1391
	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
1392 1393 1394
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

1395 1396
	vi->ctrl_promisc = ((dev->flags & IFF_PROMISC) != 0);
	vi->ctrl_allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
1397

1398
	sg_init_one(sg, &vi->ctrl_promisc, sizeof(vi->ctrl_promisc));
1399 1400

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
1401
				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
1402
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
1403
			 vi->ctrl_promisc ? "en" : "dis");
1404

1405
	sg_init_one(sg, &vi->ctrl_allmulti, sizeof(vi->ctrl_allmulti));
1406 1407

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
1408
				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
1409
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
1410
			 vi->ctrl_allmulti ? "en" : "dis");
1411

1412
	uc_count = netdev_uc_count(dev);
1413
	mc_count = netdev_mc_count(dev);
1414
	/* MAC filter - use one buffer for both lists */
1415 1416 1417
	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
	mac_data = buf;
1418
	if (!buf)
1419 1420
		return;

1421 1422
	sg_init_table(sg, 2);

1423
	/* Store the unicast list and count in the front of the buffer */
M
Michael S. Tsirkin 已提交
1424
	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
J
Jiri Pirko 已提交
1425
	i = 0;
1426
	netdev_for_each_uc_addr(ha, dev)
J
Jiri Pirko 已提交
1427
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
1428 1429

	sg_set_buf(&sg[0], mac_data,
1430
		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));
1431 1432

	/* multicast list and count fill the end */
1433
	mac_data = (void *)&mac_data->macs[uc_count][0];
1434

M
Michael S. Tsirkin 已提交
1435
	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
1436
	i = 0;
1437 1438
	netdev_for_each_mc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
1439 1440

	sg_set_buf(&sg[1], mac_data,
1441
		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));
1442 1443

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1444
				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
1445
		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");
1446 1447

	kfree(buf);
1448 1449
}

1450 1451
static int virtnet_vlan_rx_add_vid(struct net_device *dev,
				   __be16 proto, u16 vid)
1452 1453 1454 1455
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

1456 1457
	vi->ctrl_vid = vid;
	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));
1458 1459

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1460
				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
1461
		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
1462
	return 0;
1463 1464
}

1465 1466
static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
				    __be16 proto, u16 vid)
1467 1468 1469 1470
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

1471 1472
	vi->ctrl_vid = vid;
	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));
1473 1474

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1475
				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
1476
		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
1477
	return 0;
1478 1479
}

1480
static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu)
J
Jason Wang 已提交
1481 1482 1483
{
	int i;

1484 1485
	if (vi->affinity_hint_set) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
1486 1487 1488 1489
			virtqueue_set_affinity(vi->rq[i].vq, -1);
			virtqueue_set_affinity(vi->sq[i].vq, -1);
		}

1490 1491 1492
		vi->affinity_hint_set = false;
	}
}
1493

1494 1495 1496 1497
static void virtnet_set_affinity(struct virtnet_info *vi)
{
	int i;
	int cpu;
J
Jason Wang 已提交
1498 1499 1500 1501 1502

	/* In multiqueue mode, when the number of cpu is equal to the number of
	 * queue pairs, we let the queue pairs to be private to one cpu by
	 * setting the affinity hint to eliminate the contention.
	 */
1503 1504 1505 1506
	if (vi->curr_queue_pairs == 1 ||
	    vi->max_queue_pairs != num_online_cpus()) {
		virtnet_clean_affinity(vi, -1);
		return;
J
Jason Wang 已提交
1507 1508
	}

1509 1510
	i = 0;
	for_each_online_cpu(cpu) {
J
Jason Wang 已提交
1511 1512
		virtqueue_set_affinity(vi->rq[i].vq, cpu);
		virtqueue_set_affinity(vi->sq[i].vq, cpu);
1513
		netif_set_xps_queue(vi->dev, cpumask_of(cpu), i);
1514
		i++;
J
Jason Wang 已提交
1515 1516
	}

1517
	vi->affinity_hint_set = true;
J
Jason Wang 已提交
1518 1519
}

1520
static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
1521
{
1522 1523 1524 1525 1526
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);
	virtnet_set_affinity(vi);
	return 0;
}
1527

1528 1529 1530 1531 1532 1533 1534
static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node_dead);
	virtnet_set_affinity(vi);
	return 0;
}
1535

1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566
static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);

	virtnet_clean_affinity(vi, cpu);
	return 0;
}

static enum cpuhp_state virtionet_online;

static int virtnet_cpu_notif_add(struct virtnet_info *vi)
{
	int ret;

	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
	if (ret)
		return ret;
	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					       &vi->node_dead);
	if (!ret)
		return ret;
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	return ret;
}

static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
{
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					    &vi->node_dead);
J
Jason Wang 已提交
1567 1568
}

R
Rick Jones 已提交
1569 1570 1571 1572 1573
static void virtnet_get_ringparam(struct net_device *dev,
				struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

J
Jason Wang 已提交
1574 1575
	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
R
Rick Jones 已提交
1576 1577 1578 1579
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}

1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592

static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));

}

1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606
/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

1607
	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
1608 1609
		return -EINVAL;

J
John Fastabend 已提交
1610 1611 1612 1613 1614 1615 1616
	/* For now we don't support modifying channels while XDP is loaded
	 * also when XDP is loaded all RX queues have XDP programs so we only
	 * need to check a single RX queue.
	 */
	if (vi->rq[0].xdp_prog)
		return -EINVAL;

1617
	get_online_cpus();
1618
	err = _virtnet_set_queues(vi, queue_pairs);
1619 1620 1621 1622
	if (!err) {
		netif_set_real_num_tx_queues(dev, queue_pairs);
		netif_set_real_num_rx_queues(dev, queue_pairs);

1623
		virtnet_set_affinity(vi);
1624
	}
1625
	put_online_cpus();
1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642

	return err;
}

static void virtnet_get_channels(struct net_device *dev,
				 struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);

	channels->combined_count = vi->curr_queue_pairs;
	channels->max_combined = vi->max_queue_pairs;
	channels->max_other = 0;
	channels->rx_count = 0;
	channels->tx_count = 0;
	channels->other_count = 0;
}

1643
/* Check if the user is trying to change anything besides speed/duplex */
1644 1645
static bool
virtnet_validate_ethtool_cmd(const struct ethtool_link_ksettings *cmd)
1646
{
1647 1648
	struct ethtool_link_ksettings diff1 = *cmd;
	struct ethtool_link_ksettings diff2 = {};
1649

1650 1651 1652
	/* cmd is always set so we need to clear it, validate the port type
	 * and also without autonegotiation we can ignore advertising
	 */
1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666
	diff1.base.speed = 0;
	diff2.base.port = PORT_OTHER;
	ethtool_link_ksettings_zero_link_mode(&diff1, advertising);
	diff1.base.duplex = 0;
	diff1.base.cmd = 0;
	diff1.base.link_mode_masks_nwords = 0;

	return !memcmp(&diff1.base, &diff2.base, sizeof(diff1.base)) &&
		bitmap_empty(diff1.link_modes.supported,
			     __ETHTOOL_LINK_MODE_MASK_NBITS) &&
		bitmap_empty(diff1.link_modes.advertising,
			     __ETHTOOL_LINK_MODE_MASK_NBITS) &&
		bitmap_empty(diff1.link_modes.lp_advertising,
			     __ETHTOOL_LINK_MODE_MASK_NBITS);
1667 1668
}

1669 1670
static int virtnet_set_link_ksettings(struct net_device *dev,
				      const struct ethtool_link_ksettings *cmd)
1671 1672 1673 1674
{
	struct virtnet_info *vi = netdev_priv(dev);
	u32 speed;

1675
	speed = cmd->base.speed;
1676 1677
	/* don't allow custom speed and duplex */
	if (!ethtool_validate_speed(speed) ||
1678
	    !ethtool_validate_duplex(cmd->base.duplex) ||
1679 1680 1681
	    !virtnet_validate_ethtool_cmd(cmd))
		return -EINVAL;
	vi->speed = speed;
1682
	vi->duplex = cmd->base.duplex;
1683 1684 1685 1686

	return 0;
}

1687 1688
static int virtnet_get_link_ksettings(struct net_device *dev,
				      struct ethtool_link_ksettings *cmd)
1689 1690 1691
{
	struct virtnet_info *vi = netdev_priv(dev);

1692 1693 1694
	cmd->base.speed = vi->speed;
	cmd->base.duplex = vi->duplex;
	cmd->base.port = PORT_OTHER;
1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706

	return 0;
}

static void virtnet_init_settings(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	vi->speed = SPEED_UNKNOWN;
	vi->duplex = DUPLEX_UNKNOWN;
}

1707
static const struct ethtool_ops virtnet_ethtool_ops = {
1708
	.get_drvinfo = virtnet_get_drvinfo,
1709
	.get_link = ethtool_op_get_link,
R
Rick Jones 已提交
1710
	.get_ringparam = virtnet_get_ringparam,
1711 1712
	.set_channels = virtnet_set_channels,
	.get_channels = virtnet_get_channels,
1713
	.get_ts_info = ethtool_op_get_ts_info,
1714 1715
	.get_link_ksettings = virtnet_get_link_ksettings,
	.set_link_ksettings = virtnet_set_link_ksettings,
1716 1717
};

1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735
static void virtnet_freeze_down(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int i;

	/* Make sure no work handler is accessing the device */
	flush_work(&vi->config_work);

	netif_device_detach(vi->dev);
	cancel_delayed_work_sync(&vi->refill);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++)
			napi_disable(&vi->rq[i].napi);
	}
}

static int init_vqs(struct virtnet_info *vi);
static void _remove_vq_common(struct virtnet_info *vi);

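/* Counterpart of virtnet_freeze_down(): recreate the virtqueues, refill
 * the RX rings, re-enable NAPI and reattach the netdev.
 */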
static int virtnet_restore_up(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err, i;

	err = init_vqs(vi);
	if (err)
		return err;

	virtio_device_ready(vdev);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->curr_queue_pairs; i++)
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		for (i = 0; i < vi->max_queue_pairs; i++)
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
	}

	netif_device_attach(vi->dev);
	return err;
}

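/* Fully reset and re-initialize the device so that a new xdp_queue_pairs
 * value (and the matching receive buffer headroom) takes effect.
 */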
static int virtnet_reset(struct virtnet_info *vi, int curr_qp, int xdp_qp)
{
	struct virtio_device *dev = vi->vdev;
	int ret;

	virtio_config_disable(dev);
	dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED;
	virtnet_freeze_down(dev);
	_remove_vq_common(vi);

	dev->config->reset(dev);
	virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);

	ret = virtio_finalize_features(dev);
	if (ret)
		goto err;

	vi->xdp_queue_pairs = xdp_qp;
	ret = virtnet_restore_up(dev);
	if (ret)
		goto err;
	ret = _virtnet_set_queues(vi, curr_qp);
	if (ret)
		goto err;

	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
	virtio_config_enable(dev);
	return 0;
err:
	virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
	return ret;
}

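/* Attach or detach an XDP program. XDP_TX needs a dedicated TX queue per
 * CPU, so the device is reset whenever the number of XDP queues changes.
 */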
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
{
	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
	struct virtnet_info *vi = netdev_priv(dev);
	struct bpf_prog *old_prog;
	u16 xdp_qp = 0, curr_qp;
	int i, err;

	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) {
		netdev_warn(dev, "can't set XDP while host is implementing LRO, disable LRO first\n");
		return -EOPNOTSUPP;
	}

	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
		netdev_warn(dev, "XDP expects header/data in single page, any_header_sg required\n");
		return -EINVAL;
	}

	if (dev->mtu > max_sz) {
		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
		return -EINVAL;
	}

	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
	if (prog)
		xdp_qp = nr_cpu_ids;

	/* XDP requires extra queues for XDP_TX */
	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
		netdev_warn(dev, "request %i queues but max is %i\n",
			    curr_qp + xdp_qp, vi->max_queue_pairs);
		return -ENOMEM;
	}

	if (prog) {
		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
		if (IS_ERR(prog))
			return PTR_ERR(prog);
	}

	/* Changing the headroom in buffers is a disruptive operation because
	 * existing buffers must be flushed and reallocated. This will happen
	 * when a xdp program is initially added or xdp is disabled by removing
	 * the xdp program resulting in number of XDP queues changing.
	 */
	if (vi->xdp_queue_pairs != xdp_qp) {
		err = virtnet_reset(vi, curr_qp + xdp_qp, xdp_qp);
		if (err) {
			dev_warn(&dev->dev, "XDP reset failure.\n");
			goto virtio_reset_err;
		}
	}

	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);

	for (i = 0; i < vi->max_queue_pairs; i++) {
		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
		if (old_prog)
			bpf_prog_put(old_prog);
	}

	return 0;

virtio_reset_err:
	/* On reset error do our best to unwind XDP changes inflight and return
	 * error up to user space for resolution. The underlying reset hung on
	 * us so not much we can do here.
	 */
	if (prog)
		bpf_prog_sub(prog, vi->max_queue_pairs - 1);
	return err;
}

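/* Report whether an XDP program is currently attached to any receive queue. */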
static bool virtnet_xdp_query(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (vi->rq[i].xdp_prog)
			return true;
	}
	return false;
}

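/* ndo_xdp hook: dispatch XDP program setup and query commands. */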
static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return virtnet_xdp_set(dev, xdp->prog);
	case XDP_QUERY_PROG:
		xdp->prog_attached = virtnet_xdp_query(dev);
		return 0;
	default:
		return -EINVAL;
	}
}

static const struct net_device_ops virtnet_netdev = {
	.ndo_open            = virtnet_open,
	.ndo_stop   	     = virtnet_close,
	.ndo_start_xmit      = start_xmit,
	.ndo_validate_addr   = eth_validate_addr,
	.ndo_set_mac_address = virtnet_set_mac_address,
	.ndo_set_rx_mode     = virtnet_set_rx_mode,
	.ndo_get_stats64     = virtnet_stats,
	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = virtnet_netpoll,
#endif
	.ndo_xdp		= virtnet_xdp,
};

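/* Config-change work handler: acknowledge link announcements and update
 * the carrier state from the device status field.
 */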
static void virtnet_config_changed_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, config_work);
	u16 v;

	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
				 struct virtio_net_config, status, &v) < 0)
		return;

	if (v & VIRTIO_NET_S_ANNOUNCE) {
		netdev_notify_peers(vi->dev);
		virtnet_ack_link_announce(vi);
	}

	/* Ignore unknown (future) status bits */
	v &= VIRTIO_NET_S_LINK_UP;

	if (vi->status == v)
		return;

	vi->status = v;

	if (vi->status & VIRTIO_NET_S_LINK_UP) {
		netif_carrier_on(vi->dev);
		netif_tx_wake_all_queues(vi->dev);
	} else {
		netif_carrier_off(vi->dev);
		netif_tx_stop_all_queues(vi->dev);
	}
}

static void virtnet_config_changed(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	schedule_work(&vi->config_work);
}

static void virtnet_free_queues(struct virtnet_info *vi)
{
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		napi_hash_del(&vi->rq[i].napi);
		netif_napi_del(&vi->rq[i].napi);
	}

	/* We called napi_hash_del() before netif_napi_del(),
	 * we need to respect an RCU grace period before freeing vi->rq
	 */
	synchronize_net();

	kfree(vi->rq);
	kfree(vi->sq);
}

static void _free_receive_bufs(struct virtnet_info *vi)
{
	struct bpf_prog *old_prog;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		while (vi->rq[i].pages)
			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);

		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
		if (old_prog)
			bpf_prog_put(old_prog);
	}
}

static void free_receive_bufs(struct virtnet_info *vi)
{
	rtnl_lock();
	_free_receive_bufs(vi);
	rtnl_unlock();
}

static void free_receive_page_frags(struct virtnet_info *vi)
{
	int i;
	for (i = 0; i < vi->max_queue_pairs; i++)
		if (vi->rq[i].alloc_frag.page)
			put_page(vi->rq[i].alloc_frag.page);
}

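/* The last xdp_queue_pairs of the active queue pairs are reserved for
 * XDP_TX and carry raw buffers rather than skbs.
 */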
static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
{
	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
		return false;
	else if (q < vi->curr_queue_pairs)
		return true;
	else
		return false;
}

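/* Return any buffers still sitting in the send and receive virtqueues
 * before the queues are torn down.
 */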
static void free_unused_bufs(struct virtnet_info *vi)
{
	void *buf;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->sq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (!is_xdp_raw_buffer_queue(vi, i))
				dev_kfree_skb(buf);
			else
				put_page(virt_to_head_page(buf));
		}
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->rq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (vi->mergeable_rx_bufs) {
				unsigned long ctx = (unsigned long)buf;
				void *base = mergeable_ctx_to_buf_address(ctx);
				put_page(virt_to_head_page(base));
			} else if (vi->big_packets) {
				give_pages(&vi->rq[i], buf);
			} else {
				put_page(virt_to_head_page(buf));
			}
		}
	}
}

static void virtnet_del_vqs(struct virtnet_info *vi)
{
	struct virtio_device *vdev = vi->vdev;

	virtnet_clean_affinity(vi, -1);

	vdev->config->del_vqs(vdev);

	virtnet_free_queues(vi);
}

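/* Create all virtqueues (RX/TX pairs plus the optional control vq) and
 * attach them to their per-queue structures.
 */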
static int virtnet_find_vqs(struct virtnet_info *vi)
{
	vq_callback_t **callbacks;
	struct virtqueue **vqs;
	int ret = -ENOMEM;
	int i, total_vqs;
	const char **names;

	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
	 * possible control vq.
	 */
	total_vqs = vi->max_queue_pairs * 2 +
		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);

	/* Allocate space for find_vqs parameters */
	vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
	if (!vqs)
		goto err_vq;
	callbacks = kmalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
	if (!callbacks)
		goto err_callback;
	names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);
	if (!names)
		goto err_names;

	/* Parameters for control virtqueue, if any */
	if (vi->has_cvq) {
		callbacks[total_vqs - 1] = NULL;
		names[total_vqs - 1] = "control";
	}

	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
	}

	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
					 names, NULL);
	if (ret)
		goto err_find;

	if (vi->has_cvq) {
		vi->cvq = vqs[total_vqs - 1];
		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].vq = vqs[rxq2vq(i)];
		vi->sq[i].vq = vqs[txq2vq(i)];
	}

	kfree(names);
	kfree(callbacks);
	kfree(vqs);

	return 0;

err_find:
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;
}

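/* Allocate the send_queue/receive_queue arrays and initialize per-queue
 * NAPI, scatterlists and packet-length EWMA state.
 */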
static int virtnet_alloc_queues(struct virtnet_info *vi)
{
	int i;

	vi->sq = kzalloc(sizeof(*vi->sq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->sq)
		goto err_sq;
	vi->rq = kzalloc(sizeof(*vi->rq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->rq)
		goto err_rq;

	INIT_DELAYED_WORK(&vi->refill, refill_work);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].pages = NULL;
		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
			       napi_weight);

		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
	}

	return 0;

err_rq:
	kfree(vi->sq);
err_sq:
	return -ENOMEM;
}

static int init_vqs(struct virtnet_info *vi)
{
	int ret;

	/* Allocate send & receive queues */
	ret = virtnet_alloc_queues(vi);
	if (ret)
		goto err;

	ret = virtnet_find_vqs(vi);
	if (ret)
		goto err_free;

	get_online_cpus();
	virtnet_set_affinity(vi);
	put_online_cpus();

	return 0;

err_free:
	virtnet_free_queues(vi);
err:
	return ret;
}

#ifdef CONFIG_SYSFS
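/* sysfs attribute: report the EWMA-based mergeable receive buffer size
 * currently used for refilling the given RX queue.
 */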
static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
		struct rx_queue_attribute *attribute, char *buf)
{
	struct virtnet_info *vi = netdev_priv(queue->dev);
	unsigned int queue_index = get_netdev_rx_queue_index(queue);
	struct ewma_pkt_len *avg;

	BUG_ON(queue_index >= vi->max_queue_pairs);
	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
	return sprintf(buf, "%u\n", get_mergeable_buf_len(avg));
}

static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
	__ATTR_RO(mergeable_rx_buffer_size);

static struct attribute *virtio_net_mrg_rx_attrs[] = {
	&mergeable_rx_buffer_size_attribute.attr,
	NULL
};

static const struct attribute_group virtio_net_mrg_rx_group = {
	.name = "virtio_net",
	.attrs = virtio_net_mrg_rx_attrs
};
#endif

static bool virtnet_fail_on_feature(struct virtio_device *vdev,
				    unsigned int fbit,
				    const char *fname, const char *dname)
{
	if (!virtio_has_feature(vdev, fbit))
		return false;

	dev_err(&vdev->dev, "device advertises feature %s but not %s",
		fname, dname);

	return true;
}

#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)

static bool virtnet_validate_features(struct virtio_device *vdev)
{
	if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
	    (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
			     "VIRTIO_NET_F_CTRL_VQ"))) {
		return false;
	}

	return true;
}

#define MIN_MTU ETH_MIN_MTU
#define MAX_MTU ETH_MAX_MTU

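/* Validate device configuration before probe: require config space access
 * and clear VIRTIO_NET_F_MTU if the advertised MTU is unusable.
 */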
static int virtnet_validate(struct virtio_device *vdev)
{
	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

	if (!virtnet_validate_features(vdev))
		return -EINVAL;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		int mtu = virtio_cread16(vdev,
					 offsetof(struct virtio_net_config,
						  mtu));
		if (mtu < MIN_MTU)
			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
	}

	return 0;
}

static int virtnet_probe(struct virtio_device *vdev)
{
	int i, err;
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;
	int mtu;

	/* Find if host supports multiqueue virtio_net device */
	err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
				   struct virtio_net_config,
				   max_virtqueue_pairs, &max_queue_pairs);

	/* We need at least 2 queues */
	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;

	/* Allocate ourselves a network device with room for our info */
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
	dev->netdev_ops = &virtnet_netdev;
	dev->features = NETIF_F_HIGHDMA;

	dev->ethtool_ops = &virtnet_ethtool_ops;
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
		/* This opens up the world of extra features. */
		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
		if (csum)
			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
			dev->hw_features |= NETIF_F_TSO | NETIF_F_UFO
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
		/* Individual feature bits: what can host handle? */
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UFO))
			dev->hw_features |= NETIF_F_UFO;

		dev->features |= NETIF_F_GSO_ROBUST;

		if (gso)
			dev->features |= dev->hw_features & (NETIF_F_ALL_TSO|NETIF_F_UFO);
		/* (!csum && gso) case will be fixed by register_netdev() */
	}
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
		dev->features |= NETIF_F_RXCSUM;

	dev->vlan_features = dev->features;

	/* MTU range: 68 - 65535 */
	dev->min_mtu = MIN_MTU;
	dev->max_mtu = MAX_MTU;

	/* Configuration may specify what MAC to use.  Otherwise random. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
		virtio_cread_bytes(vdev,
				   offsetof(struct virtio_net_config, mac),
				   dev->dev_addr, dev->addr_len);
	else
		eth_hw_addr_random(dev);

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
	vdev->priv = vi;
	vi->stats = alloc_percpu(struct virtnet_stats);
	err = -ENOMEM;
	if (vi->stats == NULL)
		goto free;

	for_each_possible_cpu(i) {
		struct virtnet_stats *virtnet_stats;
		virtnet_stats = per_cpu_ptr(vi->stats, i);
		u64_stats_init(&virtnet_stats->tx_syncp);
		u64_stats_init(&virtnet_stats->rx_syncp);
	}

	INIT_WORK(&vi->config_work, virtnet_config_changed_work);

	/* If we can receive ANY GSO packets, we must allocate large ones. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
		vi->big_packets = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		vi->hdr_len = sizeof(struct virtio_net_hdr);

	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->any_header_sg = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		mtu = virtio_cread16(vdev,
				     offsetof(struct virtio_net_config,
					      mtu));
		if (mtu < dev->min_mtu) {
			/* Should never trigger: MTU was previously validated
			 * in virtnet_validate.
			 */
			dev_err(&vdev->dev, "device MTU appears to have changed "
				"it is now %d < %d", mtu, dev->min_mtu);
			goto free_stats;
		}

		dev->mtu = mtu;
		dev->max_mtu = mtu;

		/* TODO: size buffers correctly in this case. */
		if (dev->mtu > ETH_DATA_LEN)
			vi->big_packets = true;
	}

	if (vi->any_header_sg)
		dev->needed_headroom = vi->hdr_len;

	/* Enable multiqueue by default */
	if (num_online_cpus() >= max_queue_pairs)
		vi->curr_queue_pairs = max_queue_pairs;
	else
		vi->curr_queue_pairs = num_online_cpus();
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
	err = init_vqs(vi);
	if (err)
		goto free_stats;

#ifdef CONFIG_SYSFS
	if (vi->mergeable_rx_bufs)
		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
#endif
	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);

	virtnet_init_settings(dev);

	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
		goto free_vqs;
	}

	virtio_device_ready(vdev);

	err = virtnet_cpu_notif_add(vi);
	if (err) {
		pr_debug("virtio_net: registering cpu notifier failed\n");
		goto free_unregister_netdev;
	}

	virtnet_set_queues(vi, vi->curr_queue_pairs);

	/* Assume link up if device can't report link status,
	   otherwise get link status from config. */
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		netif_carrier_off(dev);
		schedule_work(&vi->config_work);
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
		netif_carrier_on(dev);
	}

	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
		 dev->name, max_queue_pairs);

	return 0;

free_unregister_netdev:
	vi->vdev->config->reset(vdev);

	unregister_netdev(dev);
free_vqs:
	cancel_delayed_work_sync(&vi->refill);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
free_stats:
	free_percpu(vi->stats);
free:
	free_netdev(dev);
	return err;
}

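/* Tear down virtqueues and free buffers for the XDP reset path, where the
 * caller already holds the rtnl lock.
 */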
static void _remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);
	free_unused_bufs(vi);
	_free_receive_bufs(vi);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
}

static void remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);

	/* Free unused buffers in both send and recv, if any. */
	free_unused_bufs(vi);

	free_receive_bufs(vi);

	free_receive_page_frags(vi);

	virtnet_del_vqs(vi);
}

static void virtnet_remove(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device. */
	flush_work(&vi->config_work);

	unregister_netdev(vi->dev);

	remove_vq_common(vi);

	free_percpu(vi->stats);
	free_netdev(vi->dev);
}

#ifdef CONFIG_PM_SLEEP
static int virtnet_freeze(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);
	virtnet_freeze_down(vdev);
	remove_vq_common(vi);

	return 0;
}

static int virtnet_restore(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err;

	err = virtnet_restore_up(vdev);
	if (err)
		return err;
	virtnet_set_queues(vi, vi->curr_queue_pairs);

	err = virtnet_cpu_notif_add(vi);
	if (err)
		return err;

	return 0;
}
#endif

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

#define VIRTNET_FEATURES \
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
	VIRTIO_NET_F_MAC, \
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
	VIRTIO_NET_F_CTRL_MAC_ADDR, \
	VIRTIO_NET_F_MTU

static unsigned int features[] = {
	VIRTNET_FEATURES,
};

static unsigned int features_legacy[] = {
	VIRTNET_FEATURES,
	VIRTIO_NET_F_GSO,
	VIRTIO_F_ANY_LAYOUT,
};

static struct virtio_driver virtio_net_driver = {
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
	.feature_table_legacy = features_legacy,
	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
	.validate =	virtnet_validate,
	.probe =	virtnet_probe,
	.remove =	virtnet_remove,
	.config_changed = virtnet_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze =	virtnet_freeze,
	.restore =	virtnet_restore,
#endif
};

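/* Module init: register CPU hotplug callbacks before the virtio driver,
 * unwinding them if driver registration fails.
 */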
static __init int virtio_net_driver_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
				      virtnet_cpu_online,
				      virtnet_cpu_down_prep);
	if (ret < 0)
		goto out;
	virtionet_online = ret;
	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
				      NULL, virtnet_cpu_dead);
	if (ret)
		goto err_dead;

	ret = register_virtio_driver(&virtio_net_driver);
	if (ret)
		goto err_virtio;
	return 0;
err_virtio:
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
err_dead:
	cpuhp_remove_multi_state(virtionet_online);
out:
	return ret;
}
module_init(virtio_net_driver_init);

static __exit void virtio_net_driver_exit(void)
{
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
	cpuhp_remove_multi_state(virtionet_online);
	unregister_virtio_driver(&virtio_net_driver);
}
module_exit(virtio_net_driver_exit);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");