/* A network driver using virtio.
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/scatterlist.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/average.h>
#include <net/route.h>

static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);

static bool csum = true, gso = true;
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);

/* FIXME: MTU in config. */
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
#define GOOD_COPY_LEN	128

#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
#define VIRTIO_XDP_HEADROOM 256

/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
 */
DECLARE_EWMA(pkt_len, 0, 64)

#define VIRTNET_DRIVER_VERSION "1.0.0"

struct virtnet_stats {
	struct u64_stats_sync tx_syncp;
	struct u64_stats_sync rx_syncp;
	u64 tx_bytes;
	u64 tx_packets;

	u64 rx_bytes;
	u64 rx_packets;
};

/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of the send queue: output.$index */
	char name[40];
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

	struct napi_struct napi;

	struct bpf_prog __rcu *xdp_prog;

	/* Chain pages by the private ptr. */
	struct page *pages;

	/* Average packet length for mergeable receive buffers. */
	struct ewma_pkt_len mrg_avg_pkt_len;

	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Min single buffer size for mergeable buffers case. */
	unsigned int min_buf_len;

	/* Name of this receive queue: input.$index */
	char name[40];
};

struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
	struct send_queue *sq;
	struct receive_queue *rq;
	unsigned int status;

	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

	/* # of XDP queue pairs currently used by the driver */
	u16 xdp_queue_pairs;

	/* I like... big packets and I cannot lie! */
	bool big_packets;

	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

	/* Has control virtqueue */
	bool has_cvq;

	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

	/* Packet virtio header size */
	u8 hdr_len;

	/* Active statistics */
	struct virtnet_stats __percpu *stats;

	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

	/* Work struct for config space updates */
	struct work_struct config_work;

	/* Is the affinity hint set for the virtqueues? */
	bool affinity_hint_set;

	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;

	/* Control VQ buffers: protected by the rtnl lock */
	struct virtio_net_ctrl_hdr ctrl_hdr;
	virtio_net_ctrl_ack ctrl_status;
	struct virtio_net_ctrl_mq ctrl_mq;
	u8 ctrl_promisc;
	u8 ctrl_allmulti;
	u16 ctrl_vid;

	/* Ethtool settings */
	u8 duplex;
	u32 speed;
};

struct padded_vnet_hdr {
	struct virtio_net_hdr_mrg_rxbuf hdr;
	/*
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header.
	 */
	char padding[4];
};

/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
	return (vq->index - 1) / 2;
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
	return vq->index / 2;
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}

static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
{
	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
}

/*
 * private is used to chain pages for big packets, put the whole
 * most recent used list in the beginning for reuse
 */
static void give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end;

	/* Find end of list, sew whole thing into vi->rq.pages. */
	for (end = page; end->private; end = (struct page *)end->private);
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}

static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
{
	struct page *p = rq->pages;

	if (p) {
		rq->pages = (struct page *)p->private;
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
		p = alloc_page(gfp_mask);
	return p;
}

static void skb_xmit_done(struct virtqueue *vq)
{
	struct virtnet_info *vi = vq->vdev->priv;

	/* Suppress further interrupts. */
	virtqueue_disable_cb(vq);

	/* We were probably waiting for more output buffers. */
	netif_wake_subqueue(vi->dev, vq2txq(vq));
}

/* Called from bottom half context */
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
				   struct receive_queue *rq,
				   struct page *page, unsigned int offset,
				   unsigned int len, unsigned int truesize)
{
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	unsigned int copy, hdr_len, hdr_padded_len;
	char *p;

	p = page_address(page) + offset;

	/* copy small packet so we can reuse these pages for small data */
	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
	if (unlikely(!skb))
		return NULL;

	hdr = skb_vnet_hdr(skb);

	hdr_len = vi->hdr_len;
	if (vi->mergeable_rx_bufs)
		hdr_padded_len = sizeof *hdr;
	else
		hdr_padded_len = sizeof(struct padded_vnet_hdr);

	memcpy(hdr, p, hdr_len);

	len -= hdr_len;
	offset += hdr_padded_len;
	p += hdr_padded_len;

	copy = len;
	if (copy > skb_tailroom(skb))
		copy = skb_tailroom(skb);
	memcpy(skb_put(skb, copy), p, copy);

	len -= copy;
	offset += copy;

	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
			put_page(page);
		return skb;
	}

	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
		dev_kfree_skb(skb);
		return NULL;
	}
	BUG_ON(offset >= PAGE_SIZE);
	while (len) {
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
		page = (struct page *)page->private;
		offset = 0;
	}

	if (page)
		give_pages(rq, page);

	return skb;
}

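/* Transmit an XDP buffer on the XDP TX queue reserved for this CPU.
 * The virtio header in front of the packet is zeroed, leaving checksum
 * and GSO handling to the XDP layers. Returns false (after dropping a
 * page reference) if the buffer could not be added to the virtqueue.
 */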
static bool virtnet_xdp_xmit(struct virtnet_info *vi,
			     struct receive_queue *rq,
			     struct xdp_buff *xdp)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	unsigned int len;
	struct send_queue *sq;
	unsigned int qp;
	void *xdp_sent;
	int err;

	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
	sq = &vi->sq[qp];

	/* Free up any pending old buffers before queueing new ones. */
	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		struct page *sent_page = virt_to_head_page(xdp_sent);

		put_page(sent_page);
	}

	xdp->data -= vi->hdr_len;
	/* Zero header and leave csum up to XDP layers */
	hdr = xdp->data;
	memset(hdr, 0, vi->hdr_len);

	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);

	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp->data, GFP_ATOMIC);
	if (unlikely(err)) {
		struct page *page = virt_to_head_page(xdp->data);

		put_page(page);
		return false;
	}

	virtqueue_kick(sq->vq);
	return true;
}

static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
{
	return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
}

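/* Receive path for small packets: the buffer is a page fragment that
 * already contains the virtio header and headroom. An attached XDP
 * program may consume (XDP_TX/XDP_DROP) or adjust the packet before an
 * skb is built around the existing buffer with build_skb().
 */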
static struct sk_buff *receive_small(struct net_device *dev,
				     struct virtnet_info *vi,
				     struct receive_queue *rq,
				     void *buf, unsigned int len)
{
	struct sk_buff *skb;
	struct bpf_prog *xdp_prog;
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
	unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
	unsigned int headroom = vi->hdr_len + header_offset;
	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	unsigned int delta = 0;
	len -= vi->hdr_len;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
		struct xdp_buff xdp;
		void *orig_data;
		u32 act;

		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
			goto err_xdp;

		xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
		xdp.data = xdp.data_hard_start + xdp_headroom;
		xdp.data_end = xdp.data + len;
		orig_data = xdp.data;
		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			/* Recalculate length in case bpf program changed it */
			delta = orig_data - xdp.data;
			break;
		case XDP_TX:
			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
				trace_xdp_exception(vi->dev, xdp_prog, act);
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
		case XDP_DROP:
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	skb = build_skb(buf, buflen);
	if (!skb) {
		put_page(virt_to_head_page(buf));
		goto err;
	}
	skb_reserve(skb, headroom - delta);
	skb_put(skb, len + delta);
	if (!delta) {
		buf += header_offset;
		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
	} /* keep zeroed vnet hdr since packet was changed by bpf */

err:
	return skb;

err_xdp:
	rcu_read_unlock();
	dev->stats.rx_dropped++;
	put_page(virt_to_head_page(buf));
xdp_xmit:
	return NULL;
}

static struct sk_buff *receive_big(struct net_device *dev,
				   struct virtnet_info *vi,
				   struct receive_queue *rq,
				   void *buf,
				   unsigned int len)
{
	struct page *page = buf;
	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);

	if (unlikely(!skb))
		goto err;

	return skb;

err:
	dev->stats.rx_dropped++;
	give_pages(rq, page);
	return NULL;
}

/* The conditions to enable XDP should preclude the underlying device from
 * sending packets across multiple buffers (num_buf > 1). However per spec
 * it does not appear to be illegal to do so but rather just against convention.
 * So in order to avoid making a system unresponsive the packets are pushed
 * into a page and the XDP program is run. This will be extremely slow and we
 * push a warning to the user to fix this as soon as possible. Fixing this may
 * require resolving the underlying hardware to determine why multiple buffers
 * are being received or simply loading the XDP program in the ingress stack
 * after the skb is built because there is no advantage to running it here
 * anymore.
 */
static struct page *xdp_linearize_page(struct receive_queue *rq,
				       u16 *num_buf,
				       struct page *p,
				       int offset,
				       unsigned int *len)
{
	struct page *page = alloc_page(GFP_ATOMIC);
	unsigned int page_off = VIRTIO_XDP_HEADROOM;

	if (!page)
		return NULL;

	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
	page_off += *len;

	while (--*num_buf) {
		unsigned int buflen;
		void *buf;
		int off;

		buf = virtqueue_get_buf(rq->vq, &buflen);
		if (unlikely(!buf))
			goto err_buf;

		p = virt_to_head_page(buf);
		off = buf - page_address(p);

		/* guard against a misconfigured or uncooperative backend that
		 * is sending a packet larger than the MTU.
		 */
		if ((page_off + buflen) > PAGE_SIZE) {
			put_page(p);
			goto err_buf;
		}

		memcpy(page_address(page) + page_off,
		       page_address(p) + off, buflen);
		page_off += buflen;
		put_page(p);
	}

	/* Headroom does not contribute to packet length */
	*len = page_off - VIRTIO_XDP_HEADROOM;
	return page;
err_buf:
	__free_pages(page, 0);
	return NULL;
}

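/* Receive path for mergeable buffers: the packet may span num_buffers
 * descriptors, which are collected into a single skb (as frags or a
 * frag_list). If an XDP program is attached, multi-buffer packets are
 * first linearized via xdp_linearize_page(). The buffer's truesize is
 * carried in ctx by the buffer allocation code.
 */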
static struct sk_buff *receive_mergeable(struct net_device *dev,
					 struct virtnet_info *vi,
					 struct receive_queue *rq,
					 void *buf,
					 void *ctx,
					 unsigned int len)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
	struct sk_buff *head_skb, *curr_skb;
	struct bpf_prog *xdp_prog;
	unsigned int truesize;

	head_skb = NULL;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct page *xdp_page;
		struct xdp_buff xdp;
		void *data;
		u32 act;

		/* This happens when rx buffer size is underestimated */
		if (unlikely(num_buf > 1)) {
			/* linearize data for XDP */
			xdp_page = xdp_linearize_page(rq, &num_buf,
						      page, offset, &len);
			if (!xdp_page)
				goto err_xdp;
			offset = VIRTIO_XDP_HEADROOM;
		} else {
			xdp_page = page;
		}

		/* Transient failure which in theory could occur if
		 * in-flight packets from before XDP was enabled reach
		 * the receive path after XDP is loaded. In practice I
		 * was not able to create this condition.
		 */
		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;

		/* Allow consuming headroom but reserve enough space to push
		 * the descriptor on if we get an XDP_TX return code.
		 */
		data = page_address(xdp_page) + offset;
		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
		xdp.data = data + vi->hdr_len;
		xdp.data_end = xdp.data + (len - vi->hdr_len);
		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			/* recalculate offset to account for any header
			 * adjustments. Note other cases do not build an
			 * skb and avoid using offset
			 */
			offset = xdp.data -
					page_address(xdp_page) - vi->hdr_len;

			/* We can only create skb based on xdp_page. */
			if (unlikely(xdp_page != page)) {
				rcu_read_unlock();
				put_page(page);
				head_skb = page_to_skb(vi, rq, xdp_page,
						       offset, len, PAGE_SIZE);
				ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
				return head_skb;
			}
			break;
		case XDP_TX:
			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
				trace_xdp_exception(vi->dev, xdp_prog, act);
			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
			if (unlikely(xdp_page != page))
				goto err_xdp;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
		case XDP_DROP:
			if (unlikely(xdp_page != page))
				__free_pages(xdp_page, 0);
			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	if (unlikely(len > (unsigned long)ctx)) {
		pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
			 dev->name, len, (unsigned long)ctx);
		dev->stats.rx_length_errors++;
		goto err_skb;
	}
	truesize = (unsigned long)ctx;
	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
	curr_skb = head_skb;

	if (unlikely(!curr_skb))
		goto err_skb;
	while (--num_buf) {
		int num_skb_frags;

		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
		if (unlikely(!ctx)) {
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
				 dev->name, num_buf,
				 virtio16_to_cpu(vi->vdev,
						 hdr->num_buffers));
			dev->stats.rx_length_errors++;
			goto err_buf;
		}

		page = virt_to_head_page(buf);
		if (unlikely(len > (unsigned long)ctx)) {
			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
				 dev->name, len, (unsigned long)ctx);
			dev->stats.rx_length_errors++;
			goto err_skb;
		}
		truesize = (unsigned long)ctx;

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);

			if (unlikely(!nskb))
				goto err_skb;
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
			head_skb->truesize += truesize;
		}
		offset = buf - page_address(page);
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
					     len, truesize);
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
					offset, len, truesize);
		}
	}

	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
	return head_skb;

err_xdp:
	rcu_read_unlock();
err_skb:
	put_page(page);
	while (--num_buf) {
		buf = virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!buf)) {
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
		page = virt_to_head_page(buf);
		put_page(page);
	}
err_buf:
	dev->stats.rx_dropped++;
	dev_kfree_skb(head_skb);
xdp_xmit:
	return NULL;
}

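/* Process one completed receive buffer: dispatch to the small, big or
 * mergeable receive path, fix up checksum/GSO metadata from the virtio
 * header and hand the skb to GRO. Returns the number of bytes received
 * (0 on error) so the caller can update its stats.
 */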
static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
		       void *buf, unsigned int len, void **ctx)
{
	struct net_device *dev = vi->dev;
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	int ret;

	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
		if (vi->mergeable_rx_bufs) {
			put_page(virt_to_head_page(buf));
		} else if (vi->big_packets) {
			give_pages(rq, buf);
		} else {
			put_page(virt_to_head_page(buf));
		}
		return 0;
	}

	if (vi->mergeable_rx_bufs)
		skb = receive_mergeable(dev, vi, rq, buf, ctx, len);
	else if (vi->big_packets)
		skb = receive_big(dev, vi, rq, buf, len);
	else
		skb = receive_small(dev, vi, rq, buf, len);

	if (unlikely(!skb))
		return 0;

	hdr = skb_vnet_hdr(skb);

	ret = skb->len;

	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
		skb->ip_summed = CHECKSUM_UNNECESSARY;

	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
				  virtio_is_little_endian(vi->vdev))) {
		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
				     dev->name, hdr->hdr.gso_type,
				     hdr->hdr.gso_size);
		goto frame_err;
	}

	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

	napi_gro_receive(&rq->napi, skb);
	return ret;

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
	return 0;
}

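/* Post a single receive buffer sized for a small packet, carved out of
 * the queue's page frag allocator; room for the virtio header, padding
 * and any XDP headroom is included in front of the packet data.
 */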
static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
			     gfp_t gfp)
{
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
	int err;

	len = SKB_DATA_ALIGN(len) +
	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
		    vi->hdr_len + GOOD_PACKET_LEN);
	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
	if (err < 0)
		put_page(virt_to_head_page(buf));

	return err;
}

static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
			   gfp_t gfp)
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);

	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
		first = get_a_page(rq, gfp);
		if (!first) {
			if (list)
				give_pages(rq, list);
			return -ENOMEM;
		}
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);

		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}

	first = get_a_page(rq, gfp);
	if (!first) {
		give_pages(rq, list);
		return -ENOMEM;
	}
	p = page_address(first);

	/* rq->sg[0], rq->sg[1] share the same page */
	/* a separated rq->sg[0] for header - required in case !any_header_sg */
	sg_set_buf(&rq->sg[0], p, vi->hdr_len);

	/* rq->sg[1] for data packet, from offset */
	offset = sizeof(struct padded_vnet_hdr);
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);

	/* chain first in list head */
	first->private = (unsigned long)list;
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
	if (err < 0)
		give_pages(rq, first);

	return err;
}

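/* Pick a buffer length for mergeable receive buffers based on the EWMA
 * packet size, clamped between the queue's minimum buffer size and
 * PAGE_SIZE, and aligned to L1_CACHE_BYTES.
 */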
static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
					  struct ewma_pkt_len *avg_pkt_len)
{
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int len;

	len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
				rq->min_buf_len - hdr_len, PAGE_SIZE - hdr_len);
	return ALIGN(len, L1_CACHE_BYTES);
}

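/* Post a receive buffer for the mergeable case. The buffer comes from
 * the queue's page frag allocator, is preceded by any XDP headroom, and
 * its truesize is stashed in the opaque ctx passed to the virtqueue.
 */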
static int add_recvbuf_mergeable(struct virtnet_info *vi,
				 struct receive_queue *rq, gfp_t gfp)
{
	struct page_frag *alloc_frag = &rq->alloc_frag;
	unsigned int headroom = virtnet_get_headroom(vi);
	char *buf;
	void *ctx;
	int err;
	unsigned int len, hole;

	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len);
	if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	buf += headroom; /* advance address leaving hole at front of pkt */
	ctx = (void *)(unsigned long)len;
	get_page(alloc_frag->page);
	alloc_frag->offset += len + headroom;
	hole = alloc_frag->size - alloc_frag->offset;
	if (hole < len + headroom) {
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
		 * the current buffer. This extra space is not included in
		 * the truesize stored in ctx.
		 */
		len += hole;
		alloc_frag->offset += hole;
	}

	sg_init_one(rq->sg, buf, len);
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
	if (err < 0)
		put_page(virt_to_head_page(buf));

	return err;
}

/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
{
	int err;
	bool oom;

	gfp |= __GFP_COLD;
	do {
		if (vi->mergeable_rx_bufs)
			err = add_recvbuf_mergeable(vi, rq, gfp);
		else if (vi->big_packets)
			err = add_recvbuf_big(vi, rq, gfp);
		else
			err = add_recvbuf_small(vi, rq, gfp);

		oom = err == -ENOMEM;
		if (err)
			break;
	} while (rq->vq->num_free);
	virtqueue_kick(rq->vq);
	return !oom;
}

static void skb_recv_done(struct virtqueue *rvq)
{
	struct virtnet_info *vi = rvq->vdev->priv;
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];

	/* Schedule NAPI, suppress further interrupts if successful. */
	if (napi_schedule_prep(&rq->napi)) {
		virtqueue_disable_cb(rvq);
		__napi_schedule(&rq->napi);
	}
}

static void virtnet_napi_enable(struct receive_queue *rq)
{
	napi_enable(&rq->napi);

	/* If all buffers were filled by other side before we napi_enabled, we
	 * won't get another interrupt, so process any outstanding packets
	 * now.  virtnet_poll wants to re-enable the queue, so we disable here.
	 * We synchronize against interrupts via NAPI_STATE_SCHED */
	if (napi_schedule_prep(&rq->napi)) {
		virtqueue_disable_cb(rq->vq);
		local_bh_disable();
		__napi_schedule(&rq->napi);
		local_bh_enable();
	}
}

static void refill_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
	bool still_empty;
	int i;

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct receive_queue *rq = &vi->rq[i];

		napi_disable(&rq->napi);
		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
		virtnet_napi_enable(rq);

		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
}

static int virtnet_receive(struct receive_queue *rq, int budget)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	unsigned int len, received = 0, bytes = 0;
	void *buf;
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);

	if (vi->mergeable_rx_bufs) {
		void *ctx;

		while (received < budget &&
		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
			bytes += receive_buf(vi, rq, buf, len, ctx);
			received++;
		}
	} else {
		while (received < budget &&
		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
			bytes += receive_buf(vi, rq, buf, len, NULL);
			received++;
		}
	}

	if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
			schedule_delayed_work(&vi->refill, 0);
	}

	u64_stats_update_begin(&stats->rx_syncp);
	stats->rx_bytes += bytes;
	stats->rx_packets += received;
	u64_stats_update_end(&stats->rx_syncp);

	return received;
}

static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
	unsigned int r, received;

	received = virtnet_receive(rq, budget);

	/* Out of packets? */
	if (received < budget) {
		r = virtqueue_enable_cb_prepare(rq->vq);
		if (napi_complete_done(napi, received)) {
			if (unlikely(virtqueue_poll(rq->vq, r)) &&
			    napi_schedule_prep(napi)) {
				virtqueue_disable_cb(rq->vq);
				__napi_schedule(napi);
			}
		}
	}

	return received;
}

static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);
		virtnet_napi_enable(&vi->rq[i]);
	}

	return 0;
}

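/* Reclaim completed transmit buffers from the send virtqueue and fold
 * their byte/packet counts into the per-cpu stats.
 */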
static void free_old_xmit_skbs(struct send_queue *sq)
{
	struct sk_buff *skb;
	unsigned int len;
	struct virtnet_info *vi = sq->vq->vdev->priv;
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
	unsigned int packets = 0;
	unsigned int bytes = 0;

	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		pr_debug("Sent skb %p\n", skb);

		bytes += skb->len;
		packets++;

		dev_kfree_skb_any(skb);
	}

	/* Avoid overhead when no packets have been processed; this
	 * happens when called speculatively from start_xmit.
	 */
	if (!packets)
		return;

	u64_stats_update_begin(&stats->tx_syncp);
	stats->tx_bytes += bytes;
	stats->tx_packets += packets;
	u64_stats_update_end(&stats->tx_syncp);
}

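/* Build the scatterlist for one skb and add it to the send virtqueue.
 * When the device accepts any header/data layout and there is headroom,
 * the virtio header is pushed in front of the linear data to save a
 * descriptor; otherwise it is placed in a separate sg entry.
 */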
static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
	struct virtnet_info *vi = sq->vq->vdev->priv;
	unsigned num_sg;
	unsigned hdr_len = vi->hdr_len;
	bool can_push;

	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);

	can_push = vi->any_header_sg &&
		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
	else
		hdr = skb_vnet_hdr(skb);

	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
				    virtio_is_little_endian(vi->vdev), false))
		BUG();

	if (vi->mergeable_rx_bufs)
		hdr->num_buffers = 0;

	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
	if (can_push) {
		__skb_push(skb, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
		/* Pull header back to avoid skew in tx bytes calculations. */
		__skb_pull(skb, hdr_len);
	} else {
		sg_set_buf(sq->sg, hdr, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
	}
	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
}

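/* ndo_start_xmit handler: reclaim completed buffers, queue the skb and
 * stop the subqueue when the ring is unlikely to fit another maximally
 * fragmented packet. The virtqueue is only kicked when xmit_more is not
 * set or the queue has been stopped.
 */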
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
	int err;
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
	bool kick = !skb->xmit_more;

	/* Free up any pending old buffers before queueing new ones. */
	free_old_xmit_skbs(sq);

	/* timestamp packet in software */
	skb_tx_timestamp(skb);

	/* Try to transmit */
	err = xmit_skb(sq, skb);

	/* This should not happen! */
	if (unlikely(err)) {
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
				 "Unexpected TXQ (%d) queue failure: %d\n", qnum, err);
		dev->stats.tx_dropped++;
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}

	/* Don't wait up for transmitted skbs to be freed. */
	skb_orphan(skb);
	nf_reset(skb);

	/* If running out of space, stop queue to avoid getting packets that we
	 * are then unable to transmit.
	 * An alternative would be to force queuing layer to requeue the skb by
	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
	 * returned in a normal path of operation: it means that driver is not
	 * maintaining the TX queue stop/start state properly, and causes
	 * the stack to do a non-trivial amount of useless work.
	 * Since most packets only take 1 or 2 ring slots, stopping the queue
	 * early means 16 slots are typically wasted.
	 */
	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
		netif_stop_subqueue(dev, qnum);
		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
			/* More just got used, free them then recheck. */
			free_old_xmit_skbs(sq);
			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
				netif_start_subqueue(dev, qnum);
				virtqueue_disable_cb(sq->vq);
			}
		}
	}

	if (kick || netif_xmit_stopped(txq))
		virtqueue_kick(sq->vq);

	return NETDEV_TX_OK;
}

/*
 * Send command via the control virtqueue and check status.  Commands
 * supported by the hypervisor, as indicated by feature bits, should
 * never fail unless improperly formatted.
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
				 struct scatterlist *out)
{
	struct scatterlist *sgs[4], hdr, stat;
	unsigned out_num = 0, tmp;

	/* Caller should know better */
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));

	vi->ctrl_status = ~0;
	vi->ctrl_hdr.class = class;
	vi->ctrl_hdr.cmd = cmd;
	/* Add header */
	sg_init_one(&hdr, &vi->ctrl_hdr, sizeof(vi->ctrl_hdr));
	sgs[out_num++] = &hdr;

	if (out)
		sgs[out_num++] = out;

	/* Add return status. */
	sg_init_one(&stat, &vi->ctrl_status, sizeof(vi->ctrl_status));
	sgs[out_num] = &stat;

	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);

	if (unlikely(!virtqueue_kick(vi->cvq)))
		return vi->ctrl_status == VIRTIO_NET_OK;

	/* Spin for a response, the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
		cpu_relax();

	return vi->ctrl_status == VIRTIO_NET_OK;
}

static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
	int ret;
	struct sockaddr *addr;
	struct scatterlist sg;

	addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
	if (!addr)
		return -ENOMEM;

	ret = eth_prepare_mac_addr_change(dev, addr);
	if (ret)
		goto out;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
		sg_init_one(&sg, addr->sa_data, dev->addr_len);
		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
			dev_warn(&vdev->dev,
				 "Failed to set mac address by vq command.\n");
			ret = -EINVAL;
			goto out;
		}
	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
		unsigned int i;

		/* Naturally, this has an atomicity problem. */
		for (i = 0; i < dev->addr_len; i++)
			virtio_cwrite8(vdev,
				       offsetof(struct virtio_net_config, mac) +
				       i, addr->sa_data[i]);
	}

	eth_commit_mac_addr_change(dev, p);
	ret = 0;

out:
	kfree(addr);
	return ret;
}

static void virtnet_stats(struct net_device *dev,
			  struct rtnl_link_stats64 *tot)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int cpu;
	unsigned int start;

	for_each_possible_cpu(cpu) {
		struct virtnet_stats *stats = per_cpu_ptr(vi->stats, cpu);
		u64 tpackets, tbytes, rpackets, rbytes;

		do {
			start = u64_stats_fetch_begin_irq(&stats->tx_syncp);
			tpackets = stats->tx_packets;
			tbytes   = stats->tx_bytes;
		} while (u64_stats_fetch_retry_irq(&stats->tx_syncp, start));

		do {
			start = u64_stats_fetch_begin_irq(&stats->rx_syncp);
			rpackets = stats->rx_packets;
			rbytes   = stats->rx_bytes;
		} while (u64_stats_fetch_retry_irq(&stats->rx_syncp, start));

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes   += rbytes;
		tot->tx_bytes   += tbytes;
	}

	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->rx_dropped = dev->stats.rx_dropped;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
}

#ifdef CONFIG_NET_POLL_CONTROLLER
static void virtnet_netpoll(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->curr_queue_pairs; i++)
		napi_schedule(&vi->rq[i].napi);
}
#endif

static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}

static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	struct scatterlist sg;
	struct net_device *dev = vi->dev;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

	vi->ctrl_mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
	sg_init_one(&sg, &vi->ctrl_mq, sizeof(vi->ctrl_mq));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
	} else {
		vi->curr_queue_pairs = queue_pairs;
		/* virtnet_open() will refill when the device goes up. */
		if (dev->flags & IFF_UP)
			schedule_delayed_work(&vi->refill, 0);
	}

	return 0;
}

static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	int err;

	rtnl_lock();
	err = _virtnet_set_queues(vi, queue_pairs);
	rtnl_unlock();
	return err;
}

static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);

	for (i = 0; i < vi->max_queue_pairs; i++)
		napi_disable(&vi->rq[i].napi);

	return 0;
}

static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg[2];
	struct virtio_net_ctrl_mac *mac_data;
	struct netdev_hw_addr *ha;
	int uc_count;
	int mc_count;
	void *buf;
	int i;

	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

	vi->ctrl_promisc = ((dev->flags & IFF_PROMISC) != 0);
	vi->ctrl_allmulti = ((dev->flags & IFF_ALLMULTI) != 0);

	sg_init_one(sg, &vi->ctrl_promisc, sizeof(vi->ctrl_promisc));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
			 vi->ctrl_promisc ? "en" : "dis");

	sg_init_one(sg, &vi->ctrl_allmulti, sizeof(vi->ctrl_allmulti));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
			 vi->ctrl_allmulti ? "en" : "dis");

	uc_count = netdev_uc_count(dev);
	mc_count = netdev_mc_count(dev);
	/* MAC filter - use one buffer for both lists */
	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
	mac_data = buf;
	if (!buf)
		return;

	sg_init_table(sg, 2);

	/* Store the unicast list and count in the front of the buffer */
	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
	i = 0;
	netdev_for_each_uc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[0], mac_data,
		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));

	/* multicast list and count fill the end */
	mac_data = (void *)&mac_data->macs[uc_count][0];

	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
	i = 0;
	netdev_for_each_mc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[1], mac_data,
		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");

	kfree(buf);
}

static int virtnet_vlan_rx_add_vid(struct net_device *dev,
				   __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl_vid = vid;
	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
	return 0;
}

static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
				    __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl_vid = vid;
	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
	return 0;
}

static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu)
{
	int i;

	if (vi->affinity_hint_set) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtqueue_set_affinity(vi->rq[i].vq, -1);
			virtqueue_set_affinity(vi->sq[i].vq, -1);
		}

		vi->affinity_hint_set = false;
	}
}

static void virtnet_set_affinity(struct virtnet_info *vi)
{
	int i;
	int cpu;

	/* In multiqueue mode, when the number of CPUs is equal to the number
	 * of queue pairs, we let each queue pair be private to one cpu by
	 * setting the affinity hint to eliminate the contention.
	 */
	if (vi->curr_queue_pairs == 1 ||
	    vi->max_queue_pairs != num_online_cpus()) {
		virtnet_clean_affinity(vi, -1);
		return;
	}

	i = 0;
	for_each_online_cpu(cpu) {
		virtqueue_set_affinity(vi->rq[i].vq, cpu);
		virtqueue_set_affinity(vi->sq[i].vq, cpu);
		netif_set_xps_queue(vi->dev, cpumask_of(cpu), i);
		i++;
	}

	vi->affinity_hint_set = true;
}

static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node_dead);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);

	virtnet_clean_affinity(vi, cpu);
	return 0;
}

static enum cpuhp_state virtionet_online;

static int virtnet_cpu_notif_add(struct virtnet_info *vi)
{
	int ret;

	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
	if (ret)
		return ret;
	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					       &vi->node_dead);
	if (!ret)
		return ret;
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	return ret;
}

static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
{
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					    &vi->node_dead);
}

static void virtnet_get_ringparam(struct net_device *dev,
				struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}

static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));
}

/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
		return -EINVAL;

	/* For now we don't support modifying channels while XDP is loaded
	 * also when XDP is loaded all RX queues have XDP programs so we only
	 * need to check a single RX queue.
	 */
	if (vi->rq[0].xdp_prog)
		return -EINVAL;

	get_online_cpus();
	err = _virtnet_set_queues(vi, queue_pairs);
	if (!err) {
		netif_set_real_num_tx_queues(dev, queue_pairs);
		netif_set_real_num_rx_queues(dev, queue_pairs);

		virtnet_set_affinity(vi);
	}
	put_online_cpus();

	return err;
}

static void virtnet_get_channels(struct net_device *dev,
				 struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);

	channels->combined_count = vi->curr_queue_pairs;
	channels->max_combined = vi->max_queue_pairs;
	channels->max_other = 0;
	channels->rx_count = 0;
	channels->tx_count = 0;
	channels->other_count = 0;
}

/* Check if the user is trying to change anything besides speed/duplex */
static bool virtnet_validate_ethtool_cmd(const struct ethtool_cmd *cmd)
{
	struct ethtool_cmd diff1 = *cmd;
	struct ethtool_cmd diff2 = {};

	/* cmd is always set so we need to clear it, validate the port type
	 * and also without autonegotiation we can ignore advertising
	 */
	ethtool_cmd_speed_set(&diff1, 0);
	diff2.port = PORT_OTHER;
	diff1.advertising = 0;
	diff1.duplex = 0;
	diff1.cmd = 0;

	return !memcmp(&diff1, &diff2, sizeof(diff1));
}

static int virtnet_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u32 speed;

	speed = ethtool_cmd_speed(cmd);
	/* don't allow custom speed and duplex */
	if (!ethtool_validate_speed(speed) ||
	    !ethtool_validate_duplex(cmd->duplex) ||
	    !virtnet_validate_ethtool_cmd(cmd))
		return -EINVAL;
	vi->speed = speed;
	vi->duplex = cmd->duplex;

	return 0;
}

static int virtnet_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ethtool_cmd_speed_set(cmd, vi->speed);
	cmd->duplex = vi->duplex;
	cmd->port = PORT_OTHER;

	return 0;
}

static void virtnet_init_settings(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	vi->speed = SPEED_UNKNOWN;
	vi->duplex = DUPLEX_UNKNOWN;
}

static const struct ethtool_ops virtnet_ethtool_ops = {
	.get_drvinfo = virtnet_get_drvinfo,
	.get_link = ethtool_op_get_link,
	.get_ringparam = virtnet_get_ringparam,
	.set_channels = virtnet_set_channels,
	.get_channels = virtnet_get_channels,
	.get_ts_info = ethtool_op_get_ts_info,
	.get_settings = virtnet_get_settings,
	.set_settings = virtnet_set_settings,
};

static void virtnet_freeze_down(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int i;

	/* Make sure no work handler is accessing the device */
	flush_work(&vi->config_work);

	netif_device_detach(vi->dev);
	cancel_delayed_work_sync(&vi->refill);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++)
			napi_disable(&vi->rq[i].napi);
	}
}

static int init_vqs(struct virtnet_info *vi);
static void _remove_vq_common(struct virtnet_info *vi);

static int virtnet_restore_up(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err, i;

	err = init_vqs(vi);
	if (err)
		return err;

	virtio_device_ready(vdev);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->curr_queue_pairs; i++)
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		for (i = 0; i < vi->max_queue_pairs; i++)
			virtnet_napi_enable(&vi->rq[i]);
	}

	netif_device_attach(vi->dev);
	return err;
}

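/* Tear down and re-initialize the device (freeze + virtqueue removal,
 * feature renegotiation, restore) so that the number of XDP queue pairs
 * and the receive buffer headroom can change. Called from the XDP setup
 * path, which runs under the rtnl lock.
 */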
static int virtnet_reset(struct virtnet_info *vi, int curr_qp, int xdp_qp)
1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756
{
	struct virtio_device *dev = vi->vdev;
	int ret;

	virtio_config_disable(dev);
	dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED;
	virtnet_freeze_down(dev);
	_remove_vq_common(vi);

	virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);

	ret = virtio_finalize_features(dev);
	if (ret)
		goto err;

	vi->xdp_queue_pairs = xdp_qp;
	ret = virtnet_restore_up(dev);
	if (ret)
		goto err;
	ret = _virtnet_set_queues(vi, curr_qp);
	if (ret)
		goto err;

	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
	virtio_config_enable(dev);
	return 0;
err:
	virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
	return ret;
}

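/* Attach or detach an XDP program. Setups XDP cannot handle (host LRO
 * offloads, scattered headers, an MTU too large for a single page) are
 * rejected, one extra TX queue per CPU is reserved for XDP_TX, and the
 * device is reset if the number of XDP queue pairs changes before the
 * program pointers are swapped under RCU.
 */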
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
{
	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
	struct virtnet_info *vi = netdev_priv(dev);
	struct bpf_prog *old_prog;
	u16 xdp_qp = 0, curr_qp;
	int i, err;

	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) {
		netdev_warn(dev, "can't set XDP while host is implementing LRO, disable LRO first\n");
		return -EOPNOTSUPP;
	}

	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
		netdev_warn(dev, "XDP expects header/data in single page, any_header_sg required\n");
		return -EINVAL;
	}

	if (dev->mtu > max_sz) {
		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
		return -EINVAL;
	}

	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
	if (prog)
		xdp_qp = nr_cpu_ids;

	/* XDP requires extra queues for XDP_TX */
	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
		netdev_warn(dev, "request %i queues but max is %i\n",
			    curr_qp + xdp_qp, vi->max_queue_pairs);
		return -ENOMEM;
	}

	if (prog) {
		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
		if (IS_ERR(prog))
			return PTR_ERR(prog);
	}

	/* Changing the headroom in buffers is a disruptive operation because
	 * existing buffers must be flushed and reallocated. This will happen
	 * when a xdp program is initially added or xdp is disabled by removing
	 * the xdp program resulting in number of XDP queues changing.
	 */
	if (vi->xdp_queue_pairs != xdp_qp) {
		err = virtnet_reset(vi, curr_qp + xdp_qp, xdp_qp);
		if (err) {
			dev_warn(&dev->dev, "XDP reset failure.\n");
			goto virtio_reset_err;
		}
	}

	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);

	for (i = 0; i < vi->max_queue_pairs; i++) {
		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
		if (old_prog)
			bpf_prog_put(old_prog);
	}

	return 0;

virtio_reset_err:
	/* On reset error do our best to unwind XDP changes in flight and return
	 * error up to user space for resolution. The underlying reset hung on
	 * us so not much we can do here.
	 */
	if (prog)
		bpf_prog_sub(prog, vi->max_queue_pairs - 1);
	return err;
}

static bool virtnet_xdp_query(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (vi->rq[i].xdp_prog)
			return true;
	}
	return false;
}

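/* ndo_xdp entry point: dispatch XDP setup and query commands from the stack. */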
static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return virtnet_xdp_set(dev, xdp->prog);
	case XDP_QUERY_PROG:
		xdp->prog_attached = virtnet_xdp_query(dev);
		return 0;
	default:
		return -EINVAL;
	}
}

static const struct net_device_ops virtnet_netdev = {
	.ndo_open            = virtnet_open,
	.ndo_stop            = virtnet_close,
	.ndo_start_xmit      = start_xmit,
	.ndo_validate_addr   = eth_validate_addr,
	.ndo_set_mac_address = virtnet_set_mac_address,
	.ndo_set_rx_mode     = virtnet_set_rx_mode,
	.ndo_get_stats64     = virtnet_stats,
	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = virtnet_netpoll,
#endif
	.ndo_xdp             = virtnet_xdp,
};

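/* Work handler for config space updates: handle link announce requests and
 * propagate link up/down changes to the networking core.
 */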
static void virtnet_config_changed_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, config_work);
	u16 v;

	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
				 struct virtio_net_config, status, &v) < 0)
		return;

	if (v & VIRTIO_NET_S_ANNOUNCE) {
		netdev_notify_peers(vi->dev);
		virtnet_ack_link_announce(vi);
	}

	/* Ignore unknown (future) status bits */
	v &= VIRTIO_NET_S_LINK_UP;

	if (vi->status == v)
		return;

	vi->status = v;

	if (vi->status & VIRTIO_NET_S_LINK_UP) {
		netif_carrier_on(vi->dev);
		netif_tx_wake_all_queues(vi->dev);
	} else {
		netif_carrier_off(vi->dev);
		netif_tx_stop_all_queues(vi->dev);
	}
}

static void virtnet_config_changed(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	schedule_work(&vi->config_work);
}

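/* Tear down the per-queue NAPI contexts and free the rq/sq arrays. */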
static void virtnet_free_queues(struct virtnet_info *vi)
{
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		napi_hash_del(&vi->rq[i].napi);
		netif_napi_del(&vi->rq[i].napi);
	}

	/* We called napi_hash_del() before netif_napi_del(),
	 * we need to respect an RCU grace period before freeing vi->rq
	 */
	synchronize_net();

	kfree(vi->rq);
	kfree(vi->sq);
}

static void _free_receive_bufs(struct virtnet_info *vi)
{
	struct bpf_prog *old_prog;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		while (vi->rq[i].pages)
			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);

		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
		if (old_prog)
			bpf_prog_put(old_prog);
	}
}

static void free_receive_bufs(struct virtnet_info *vi)
{
	rtnl_lock();
	_free_receive_bufs(vi);
	rtnl_unlock();
}

static void free_receive_page_frags(struct virtnet_info *vi)
{
	int i;
	for (i = 0; i < vi->max_queue_pairs; i++)
		if (vi->rq[i].alloc_frag.page)
			put_page(vi->rq[i].alloc_frag.page);
}

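/* The last xdp_queue_pairs of the currently active TX queues are used for
 * XDP_TX and carry raw buffers rather than skbs.
 */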
static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
{
	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
		return false;
	else if (q < vi->curr_queue_pairs)
		return true;
	else
		return false;
}

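/* Reclaim buffers the device still holds after a reset: XDP TX queues hold
 * raw pages, other TX queues hold skbs, and receive buffers are returned
 * according to the buffer mode in use.
 */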
static void free_unused_bufs(struct virtnet_info *vi)
{
	void *buf;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->sq[i].vq;
		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (!is_xdp_raw_buffer_queue(vi, i))
				dev_kfree_skb(buf);
			else
				put_page(virt_to_head_page(buf));
		}
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->rq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (vi->mergeable_rx_bufs) {
				put_page(virt_to_head_page(buf));
			} else if (vi->big_packets) {
				give_pages(&vi->rq[i], buf);
			} else {
				put_page(virt_to_head_page(buf));
			}
		}
	}
}

static void virtnet_del_vqs(struct virtnet_info *vi)
{
	struct virtio_device *vdev = vi->vdev;

	virtnet_clean_affinity(vi, -1);

	vdev->config->del_vqs(vdev);

	virtnet_free_queues(vi);
}

/* How large should a single buffer be so a queue full of these can fit at
 * least one full packet?
 * Logic below assumes the mergeable buffer header is used.
 */
static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
{
	const unsigned int hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int rq_size = virtqueue_get_vring_size(vq);
	unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu;
	unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len;
	unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size);

	return max(min_buf_len, hdr_len);
}

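/* Build the find_vqs() parameter arrays, name every queue, request
 * per-queue RX context when mergeable buffers are in use, and bind the
 * returned virtqueues (including the optional control vq) to rq/sq.
 */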
static int virtnet_find_vqs(struct virtnet_info *vi)
{
	vq_callback_t **callbacks;
	struct virtqueue **vqs;
	int ret = -ENOMEM;
	int i, total_vqs;
	const char **names;
	bool *ctx;

	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
	 * possible control vq.
	 */
	total_vqs = vi->max_queue_pairs * 2 +
		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);

	/* Allocate space for find_vqs parameters */
	vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
	if (!vqs)
		goto err_vq;
	callbacks = kmalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
	if (!callbacks)
		goto err_callback;
	names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);
	if (!names)
		goto err_names;
	if (vi->mergeable_rx_bufs) {
		ctx = kzalloc(total_vqs * sizeof(*ctx), GFP_KERNEL);
		if (!ctx)
			goto err_ctx;
	} else {
		ctx = NULL;
	}

	/* Parameters for control virtqueue, if any */
	if (vi->has_cvq) {
		callbacks[total_vqs - 1] = NULL;
		names[total_vqs - 1] = "control";
	}

	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
		if (ctx)
			ctx[rxq2vq(i)] = true;
	}

	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
					 names, ctx, NULL);
	if (ret)
		goto err_find;

	if (vi->has_cvq) {
		vi->cvq = vqs[total_vqs - 1];
		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].vq = vqs[rxq2vq(i)];
		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
		vi->sq[i].vq = vqs[txq2vq(i)];
	}

	kfree(names);
	kfree(callbacks);
	kfree(vqs);

	return 0;

err_find:
	kfree(ctx);
err_ctx:
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;
}

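/* Allocate the send/receive queue arrays and initialize NAPI, the
 * scatterlists and the packet length EWMA for each queue pair.
 */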
static int virtnet_alloc_queues(struct virtnet_info *vi)
{
	int i;

	vi->sq = kzalloc(sizeof(*vi->sq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->sq)
		goto err_sq;
	vi->rq = kzalloc(sizeof(*vi->rq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->rq)
		goto err_rq;

	INIT_DELAYED_WORK(&vi->refill, refill_work);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].pages = NULL;
		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
			       napi_weight);

		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
J
		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
	}

	return 0;

err_rq:
	kfree(vi->sq);
err_sq:
	return -ENOMEM;
}

static int init_vqs(struct virtnet_info *vi)
{
	int ret;

	/* Allocate send & receive queues */
	ret = virtnet_alloc_queues(vi);
	if (ret)
		goto err;

	ret = virtnet_find_vqs(vi);
	if (ret)
		goto err_free;

	get_online_cpus();
	virtnet_set_affinity(vi);
	put_online_cpus();

	return 0;

err_free:
	virtnet_free_queues(vi);
err:
	return ret;
}

#ifdef CONFIG_SYSFS
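/* Per-RX-queue sysfs attribute: report the buffer size that will be used
 * for mergeable receive buffers, derived from the EWMA of packet lengths.
 */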
static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
		struct rx_queue_attribute *attribute, char *buf)
{
	struct virtnet_info *vi = netdev_priv(queue->dev);
	unsigned int queue_index = get_netdev_rx_queue_index(queue);
	struct ewma_pkt_len *avg;

	BUG_ON(queue_index >= vi->max_queue_pairs);
	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
	return sprintf(buf, "%u\n",
		       get_mergeable_buf_len(&vi->rq[queue_index], avg));
}

static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
	__ATTR_RO(mergeable_rx_buffer_size);

static struct attribute *virtio_net_mrg_rx_attrs[] = {
	&mergeable_rx_buffer_size_attribute.attr,
	NULL
};

static const struct attribute_group virtio_net_mrg_rx_group = {
	.name = "virtio_net",
	.attrs = virtio_net_mrg_rx_attrs
};
#endif

static bool virtnet_fail_on_feature(struct virtio_device *vdev,
				    unsigned int fbit,
				    const char *fname, const char *dname)
{
	if (!virtio_has_feature(vdev, fbit))
		return false;

	dev_err(&vdev->dev, "device advertises feature %s but not %s",
		fname, dname);

	return true;
}

#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)

static bool virtnet_validate_features(struct virtio_device *vdev)
{
	if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
	    (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
			     "VIRTIO_NET_F_CTRL_VQ"))) {
		return false;
	}

	return true;
}

#define MIN_MTU ETH_MIN_MTU
#define MAX_MTU ETH_MAX_MTU

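/* Early validation, called before features are finalized: fail if config
 * space cannot be read and clear VIRTIO_NET_F_MTU if the advertised MTU
 * is below the legal minimum.
 */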
static int virtnet_validate(struct virtio_device *vdev)
{
	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

	if (!virtnet_validate_features(vdev))
		return -EINVAL;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		int mtu = virtio_cread16(vdev,
					 offsetof(struct virtio_net_config,
						  mtu));
		if (mtu < MIN_MTU)
			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
	}

	return 0;
}

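/* Probe: allocate the netdev, translate negotiated virtio features into
 * netdev features, set up per-CPU stats and the virtqueues, register the
 * device and bring the link up (or wait for a status update).
 */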
static int virtnet_probe(struct virtio_device *vdev)
{
	int i, err;
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;
	int mtu;

	/* Find if host supports multiqueue virtio_net device */
	err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
				   struct virtio_net_config,
				   max_virtqueue_pairs, &max_queue_pairs);

	/* We need at least 2 queue's */
	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;

	/* Allocate ourselves a network device with room for our info */
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
	dev->netdev_ops = &virtnet_netdev;
	dev->features = NETIF_F_HIGHDMA;

	dev->ethtool_ops = &virtnet_ethtool_ops;
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
		/* This opens up the world of extra features. */
		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
		if (csum)
			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
			dev->hw_features |= NETIF_F_TSO | NETIF_F_UFO
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
		/* Individual feature bits: what can host handle? */
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UFO))
			dev->hw_features |= NETIF_F_UFO;

		dev->features |= NETIF_F_GSO_ROBUST;

		if (gso)
			dev->features |= dev->hw_features & (NETIF_F_ALL_TSO|NETIF_F_UFO);
		/* (!csum && gso) case will be fixed by register_netdev() */
	}
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
		dev->features |= NETIF_F_RXCSUM;

	dev->vlan_features = dev->features;

	/* MTU range: 68 - 65535 */
	dev->min_mtu = MIN_MTU;
	dev->max_mtu = MAX_MTU;

	/* Configuration may specify what MAC to use.  Otherwise random. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
		virtio_cread_bytes(vdev,
				   offsetof(struct virtio_net_config, mac),
				   dev->dev_addr, dev->addr_len);
	else
		eth_hw_addr_random(dev);

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
	vdev->priv = vi;
	vi->stats = alloc_percpu(struct virtnet_stats);
	err = -ENOMEM;
	if (vi->stats == NULL)
		goto free;

	for_each_possible_cpu(i) {
		struct virtnet_stats *virtnet_stats;
		virtnet_stats = per_cpu_ptr(vi->stats, i);
		u64_stats_init(&virtnet_stats->tx_syncp);
		u64_stats_init(&virtnet_stats->rx_syncp);
	}

	INIT_WORK(&vi->config_work, virtnet_config_changed_work);

	/* If we can receive ANY GSO packets, we must allocate large ones. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
		vi->big_packets = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		vi->hdr_len = sizeof(struct virtio_net_hdr);

	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->any_header_sg = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		mtu = virtio_cread16(vdev,
				     offsetof(struct virtio_net_config,
					      mtu));
		if (mtu < dev->min_mtu) {
			/* Should never trigger: MTU was previously validated
			 * in virtnet_validate.
			 */
			dev_err(&vdev->dev, "device MTU appears to have changed "
				"it is now %d < %d", mtu, dev->min_mtu);
			goto free_stats;
		}

		dev->mtu = mtu;
		dev->max_mtu = mtu;

		/* TODO: size buffers correctly in this case. */
		if (dev->mtu > ETH_DATA_LEN)
			vi->big_packets = true;
	}

	if (vi->any_header_sg)
		dev->needed_headroom = vi->hdr_len;

	/* Enable multiqueue by default */
	if (num_online_cpus() >= max_queue_pairs)
		vi->curr_queue_pairs = max_queue_pairs;
	else
		vi->curr_queue_pairs = num_online_cpus();
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
	err = init_vqs(vi);
	if (err)
		goto free_stats;

#ifdef CONFIG_SYSFS
	if (vi->mergeable_rx_bufs)
		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
#endif
	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);

	virtnet_init_settings(dev);

	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
		goto free_vqs;
	}

	virtio_device_ready(vdev);

	err = virtnet_cpu_notif_add(vi);
	if (err) {
		pr_debug("virtio_net: registering cpu notifier failed\n");
		goto free_unregister_netdev;
	}

	virtnet_set_queues(vi, vi->curr_queue_pairs);

	/* Assume link up if device can't report link status,
	   otherwise get link status from config. */
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		netif_carrier_off(dev);
		schedule_work(&vi->config_work);
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
		netif_carrier_on(dev);
	}

	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
		 dev->name, max_queue_pairs);

	return 0;

free_unregister_netdev:
	vi->vdev->config->reset(vdev);

	unregister_netdev(dev);
free_vqs:
	cancel_delayed_work_sync(&vi->refill);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
free_stats:
	free_percpu(vi->stats);
free:
	free_netdev(dev);
	return err;
}

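/* Tear down the virtqueues without taking rtnl; used on the XDP reset
 * path, where the caller already holds it.
 */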
static void _remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);
	free_unused_bufs(vi);
	_free_receive_bufs(vi);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
}

static void remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);

	/* Free unused buffers in both send and recv, if any. */
	free_unused_bufs(vi);

	free_receive_bufs(vi);

	free_receive_page_frags(vi);

	virtnet_del_vqs(vi);
}

static void virtnet_remove(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device. */
	flush_work(&vi->config_work);

	unregister_netdev(vi->dev);

	remove_vq_common(vi);

	free_percpu(vi->stats);
	free_netdev(vi->dev);
}

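/* PM hooks: tear everything down on freeze and rebuild it on restore. */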
#ifdef CONFIG_PM_SLEEP
static int virtnet_freeze(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);
	virtnet_freeze_down(vdev);
	remove_vq_common(vi);

	return 0;
}

static int virtnet_restore(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err;

	err = virtnet_restore_up(vdev);
	if (err)
		return err;
	virtnet_set_queues(vi, vi->curr_queue_pairs);

	err = virtnet_cpu_notif_add(vi);
	if (err)
		return err;

	return 0;
}
#endif

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

#define VIRTNET_FEATURES \
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
	VIRTIO_NET_F_MAC, \
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
	VIRTIO_NET_F_CTRL_MAC_ADDR, \
	VIRTIO_NET_F_MTU

static unsigned int features[] = {
	VIRTNET_FEATURES,
};

static unsigned int features_legacy[] = {
	VIRTNET_FEATURES,
	VIRTIO_NET_F_GSO,
	VIRTIO_F_ANY_LAYOUT,
};

static struct virtio_driver virtio_net_driver = {
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
	.feature_table_legacy = features_legacy,
	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
	.validate =	virtnet_validate,
	.probe =	virtnet_probe,
	.remove =	virtnet_remove,
	.config_changed = virtnet_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze =	virtnet_freeze,
	.restore =	virtnet_restore,
#endif
};

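/* Register the CPU hotplug states used for queue affinity before
 * registering the virtio driver itself.
 */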
static __init int virtio_net_driver_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
				      virtnet_cpu_online,
				      virtnet_cpu_down_prep);
	if (ret < 0)
		goto out;
	virtionet_online = ret;
	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
				      NULL, virtnet_cpu_dead);
	if (ret)
		goto err_dead;

	ret = register_virtio_driver(&virtio_net_driver);
	if (ret)
		goto err_virtio;
	return 0;
err_virtio:
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
err_dead:
	cpuhp_remove_multi_state(virtionet_online);
out:
	return ret;
}
module_init(virtio_net_driver_init);

static __exit void virtio_net_driver_exit(void)
{
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
	cpuhp_remove_multi_state(virtionet_online);
	unregister_virtio_driver(&virtio_net_driver);
}
module_exit(virtio_net_driver_exit);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");