// SPDX-License-Identifier: GPL-2.0-or-later
/* A network driver using virtio.
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/scatterlist.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/average.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <net/route.h>
#include <net/xdp.h>
#include <net/net_failover.h>

static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);

static bool csum = true, gso = true, napi_tx = true;
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);
module_param(napi_tx, bool, 0644);

/* FIXME: MTU in config. */
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
#define GOOD_COPY_LEN	128

#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
#define VIRTIO_XDP_HEADROOM 256

/* Separating two types of XDP xmit */
#define VIRTIO_XDP_TX		BIT(0)
#define VIRTIO_XDP_REDIR	BIT(1)

#define VIRTIO_XDP_FLAG	BIT(0)

/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
 */
DECLARE_EWMA(pkt_len, 0, 64)

#define VIRTNET_DRIVER_VERSION "1.0.0"

static const unsigned long guest_offloads[] = {
	VIRTIO_NET_F_GUEST_TSO4,
	VIRTIO_NET_F_GUEST_TSO6,
	VIRTIO_NET_F_GUEST_ECN,
	VIRTIO_NET_F_GUEST_UFO,
	VIRTIO_NET_F_GUEST_CSUM
};

#define GUEST_OFFLOAD_LRO_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
				(1ULL << VIRTIO_NET_F_GUEST_ECN)  | \
				(1ULL << VIRTIO_NET_F_GUEST_UFO))

struct virtnet_stat_desc {
	char desc[ETH_GSTRING_LEN];
	size_t offset;
};

struct virtnet_sq_stats {
	struct u64_stats_sync syncp;
	u64 packets;
	u64 bytes;
	u64 xdp_tx;
	u64 xdp_tx_drops;
	u64 kicks;
};

struct virtnet_rq_stats {
	struct u64_stats_sync syncp;
	u64 packets;
	u64 bytes;
	u64 drops;
	u64 xdp_packets;
	u64 xdp_tx;
	u64 xdp_redirects;
	u64 xdp_drops;
	u64 kicks;
};

#define VIRTNET_SQ_STAT(m)	offsetof(struct virtnet_sq_stats, m)
#define VIRTNET_RQ_STAT(m)	offsetof(struct virtnet_rq_stats, m)

static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
	{ "packets",		VIRTNET_SQ_STAT(packets) },
	{ "bytes",		VIRTNET_SQ_STAT(bytes) },
	{ "xdp_tx",		VIRTNET_SQ_STAT(xdp_tx) },
	{ "xdp_tx_drops",	VIRTNET_SQ_STAT(xdp_tx_drops) },
	{ "kicks",		VIRTNET_SQ_STAT(kicks) },
};

static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
	{ "packets",		VIRTNET_RQ_STAT(packets) },
	{ "bytes",		VIRTNET_RQ_STAT(bytes) },
	{ "drops",		VIRTNET_RQ_STAT(drops) },
	{ "xdp_packets",	VIRTNET_RQ_STAT(xdp_packets) },
	{ "xdp_tx",		VIRTNET_RQ_STAT(xdp_tx) },
	{ "xdp_redirects",	VIRTNET_RQ_STAT(xdp_redirects) },
	{ "xdp_drops",		VIRTNET_RQ_STAT(xdp_drops) },
	{ "kicks",		VIRTNET_RQ_STAT(kicks) },
};

#define VIRTNET_SQ_STATS_LEN	ARRAY_SIZE(virtnet_sq_stats_desc)
#define VIRTNET_RQ_STATS_LEN	ARRAY_SIZE(virtnet_rq_stats_desc)

/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send_queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of the send queue: output.$index */
	char name[40];

	struct virtnet_sq_stats stats;

	struct napi_struct napi;
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

	struct napi_struct napi;

	struct bpf_prog __rcu *xdp_prog;

	struct virtnet_rq_stats stats;

	/* Chain pages by the private ptr. */
	struct page *pages;

	/* Average packet length for mergeable receive buffers. */
	struct ewma_pkt_len mrg_avg_pkt_len;

	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Min single buffer size for mergeable buffers case. */
	unsigned int min_buf_len;

	/* Name of this receive queue: input.$index */
	char name[40];

	struct xdp_rxq_info xdp_rxq;
};

/* Control VQ buffers: protected by the rtnl lock */
struct control_buf {
	struct virtio_net_ctrl_hdr hdr;
	virtio_net_ctrl_ack status;
	struct virtio_net_ctrl_mq mq;
	u8 promisc;
	u8 allmulti;
	__virtio16 vid;
	__virtio64 offloads;
};

struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
	struct send_queue *sq;
	struct receive_queue *rq;
	unsigned int status;

	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

	/* # of XDP queue pairs currently used by the driver */
	u16 xdp_queue_pairs;

	/* xdp_queue_pairs may be 0, when xdp is already loaded. So add this. */
	bool xdp_enabled;

	/* I like... big packets and I cannot lie! */
	bool big_packets;

	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

	/* Has control virtqueue */
	bool has_cvq;

	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

	/* Packet virtio header size */
	u8 hdr_len;

	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

	/* Work struct for config space updates */
	struct work_struct config_work;

	/* Is the affinity hint set for virtqueues? */
	bool affinity_hint_set;

	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;

	struct control_buf *ctrl;

	/* Ethtool settings */
	u8 duplex;
	u32 speed;

	unsigned long guest_offloads;
	unsigned long guest_offloads_capable;

	/* failover when STANDBY feature enabled */
	struct failover *failover;
};

struct padded_vnet_hdr {
	struct virtio_net_hdr_mrg_rxbuf hdr;
	/*
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header.
	 */
	char padding[4];
};

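/* Note: tokens queued on a send virtqueue are either sk_buffs or xdp_frames.
 * The helpers below tag bit 0 (VIRTIO_XDP_FLAG) of the stored pointer to tell
 * them apart on completion, which works because both pointer types have
 * bit 0 clear.
 */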
static bool is_xdp_frame(void *ptr)
{
	return (unsigned long)ptr & VIRTIO_XDP_FLAG;
}

static void *xdp_to_ptr(struct xdp_frame *ptr)
{
	return (void *)((unsigned long)ptr | VIRTIO_XDP_FLAG);
}

static struct xdp_frame *ptr_to_xdp(void *ptr)
{
	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
}

/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
	return (vq->index - 1) / 2;
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
	return vq->index / 2;
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}

static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
{
	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
}

/*
 * private is used to chain pages for big packets; put the whole
 * most recently used list at the beginning for reuse
 */
static void give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end;

	/* Find end of list, sew whole thing into vi->rq.pages. */
	for (end = page; end->private; end = (struct page *)end->private);
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}

static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
{
	struct page *p = rq->pages;

	if (p) {
		rq->pages = (struct page *)p->private;
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
		p = alloc_page(gfp_mask);
	return p;
}

static void virtqueue_napi_schedule(struct napi_struct *napi,
				    struct virtqueue *vq)
{
	if (napi_schedule_prep(napi)) {
		virtqueue_disable_cb(vq);
		__napi_schedule(napi);
	}
}

static void virtqueue_napi_complete(struct napi_struct *napi,
				    struct virtqueue *vq, int processed)
{
	int opaque;

	opaque = virtqueue_enable_cb_prepare(vq);
	if (napi_complete_done(napi, processed)) {
		if (unlikely(virtqueue_poll(vq, opaque)))
			virtqueue_napi_schedule(napi, vq);
	} else {
		virtqueue_disable_cb(vq);
	}
}

static void skb_xmit_done(struct virtqueue *vq)
{
	struct virtnet_info *vi = vq->vdev->priv;
	struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;

	/* Suppress further interrupts. */
	virtqueue_disable_cb(vq);

	if (napi->weight)
		virtqueue_napi_schedule(napi, vq);
	else
		/* We were probably waiting for more output buffers. */
		netif_wake_subqueue(vi->dev, vq2txq(vq));
}

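/* For mergeable RX buffers the per-buffer virtqueue context is not a pointer
 * but a packed scalar: the low MRG_CTX_HEADER_SHIFT bits hold the buffer
 * truesize and the upper bits hold the headroom, so no separate allocation
 * is needed per posted buffer.
 */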
#define MRG_CTX_HEADER_SHIFT 22
static void *mergeable_len_to_ctx(unsigned int truesize,
				  unsigned int headroom)
{
	return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize);
}

static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx)
{
	return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT;
}

static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
{
	return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1);
}

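/* Build an skb for a received page: when there is enough tail room the page
 * is wrapped directly with build_skb(); otherwise up to GOOD_COPY_LEN bytes
 * are copied into a freshly allocated skb and the remainder is attached as
 * page fragments.
 */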
/* Called from bottom half context */
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
				   struct receive_queue *rq,
				   struct page *page, unsigned int offset,
				   unsigned int len, unsigned int truesize,
				   bool hdr_valid, unsigned int metasize,
				   unsigned int headroom)
{
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	unsigned int copy, hdr_len, hdr_padded_len;
	int tailroom, shinfo_size;
	char *p, *hdr_p;

	p = page_address(page) + offset;
	hdr_p = p;

	hdr_len = vi->hdr_len;
	if (vi->mergeable_rx_bufs)
		hdr_padded_len = sizeof(*hdr);
	else
		hdr_padded_len = sizeof(struct padded_vnet_hdr);

	/* If headroom is not 0, there is an offset between the beginning of the
	 * data and the allocated space, otherwise the data and the allocated
	 * space are aligned.
	 */
	if (headroom) {
		/* The actual allocated space size is PAGE_SIZE. */
		truesize = PAGE_SIZE;
		tailroom = truesize - len - offset;
	} else {
		tailroom = truesize - len;
	}

	len -= hdr_len;
	offset += hdr_padded_len;
	p += hdr_padded_len;

	shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	if (!NET_IP_ALIGN && len > GOOD_COPY_LEN && tailroom >= shinfo_size) {
		skb = build_skb(p, truesize);
		if (unlikely(!skb))
			return NULL;

		skb_put(skb, len);
		goto ok;
	}

	/* copy small packet so we can reuse these pages for small data */
	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
	if (unlikely(!skb))
		return NULL;

	/* Copy all frame if it fits skb->head, otherwise
	 * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
	 */
	if (len <= skb_tailroom(skb))
		copy = len;
	else
		copy = ETH_HLEN + metasize;
	skb_put_data(skb, p, copy);

	len -= copy;
	offset += copy;

	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
			put_page(page);
		goto ok;
	}

	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
		dev_kfree_skb(skb);
		return NULL;
	}
	BUG_ON(offset >= PAGE_SIZE);
	while (len) {
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
		page = (struct page *)page->private;
		offset = 0;
	}

	if (page)
		give_pages(rq, page);

ok:
	/* hdr_valid means no XDP, so we can copy the vnet header */
	if (hdr_valid) {
		hdr = skb_vnet_hdr(skb);
		memcpy(hdr, hdr_p, hdr_len);
	}

	if (metasize) {
		__skb_pull(skb, metasize);
		skb_metadata_set(skb, metasize);
	}

	return skb;
}

static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
				   struct send_queue *sq,
				   struct xdp_frame *xdpf)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	int err;

	if (unlikely(xdpf->headroom < vi->hdr_len))
		return -EOVERFLOW;

	/* Make room for virtqueue hdr (also change xdpf->headroom?) */
	xdpf->data -= vi->hdr_len;
	/* Zero header and leave csum up to XDP layers */
	hdr = xdpf->data;
	memset(hdr, 0, vi->hdr_len);
	xdpf->len   += vi->hdr_len;

	sg_init_one(sq->sg, xdpf->data, xdpf->len);

	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp_to_ptr(xdpf),
				   GFP_ATOMIC);
	if (unlikely(err))
		return -ENOSPC; /* Caller handle free/refcnt */

	return 0;
}

/* when vi->curr_queue_pairs > nr_cpu_ids, the txq/sq is only used for xdp tx on
 * the current cpu, so it does not need to be locked.
 *
 * Here we use a macro instead of inline functions because we have to deal with
 * three issues at the same time: 1. the choice of sq, 2. deciding whether to
 * lock/unlock the txq, and 3. keeping sparse happy. It is difficult for two
 * inline functions to solve all three problems at the same time.
 */
#define virtnet_xdp_get_sq(vi) ({                                       \
	struct netdev_queue *txq;                                       \
	typeof(vi) v = (vi);                                            \
	unsigned int qp;                                                \
									\
	if (v->curr_queue_pairs > nr_cpu_ids) {                         \
		qp = v->curr_queue_pairs - v->xdp_queue_pairs;          \
		qp += smp_processor_id();                               \
		txq = netdev_get_tx_queue(v->dev, qp);                  \
		__netif_tx_acquire(txq);                                \
	} else {                                                        \
		qp = smp_processor_id() % v->curr_queue_pairs;          \
		txq = netdev_get_tx_queue(v->dev, qp);                  \
		__netif_tx_lock(txq, raw_smp_processor_id());           \
	}                                                               \
	v->sq + qp;                                                     \
})

#define virtnet_xdp_put_sq(vi, q) {                                     \
	struct netdev_queue *txq;                                       \
	typeof(vi) v = (vi);                                            \
									\
	txq = netdev_get_tx_queue(v->dev, (q) - v->sq);                 \
	if (v->curr_queue_pairs > nr_cpu_ids)                           \
		__netif_tx_release(txq);                                \
	else                                                            \
		__netif_tx_unlock(txq);                                 \
}

static int virtnet_xdp_xmit(struct net_device *dev,
			    int n, struct xdp_frame **frames, u32 flags)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct receive_queue *rq = vi->rq;
	struct bpf_prog *xdp_prog;
	struct send_queue *sq;
	unsigned int len;
	int packets = 0;
	int bytes = 0;
	int nxmit = 0;
	int kicks = 0;
	void *ptr;
	int ret;
	int i;

	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
	 * indicates XDP resources have been successfully allocated.
	 */
	xdp_prog = rcu_access_pointer(rq->xdp_prog);
	if (!xdp_prog)
		return -ENXIO;

	sq = virtnet_xdp_get_sq(vi);

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
		ret = -EINVAL;
		goto out;
	}

	/* Free up any pending old buffers before queueing new ones. */
	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		if (likely(is_xdp_frame(ptr))) {
			struct xdp_frame *frame = ptr_to_xdp(ptr);

			bytes += frame->len;
			xdp_return_frame(frame);
		} else {
			struct sk_buff *skb = ptr;

			bytes += skb->len;
			napi_consume_skb(skb, false);
		}
		packets++;
	}

	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];

		if (__virtnet_xdp_xmit_one(vi, sq, xdpf))
			break;
		nxmit++;
	}
	ret = nxmit;

	if (flags & XDP_XMIT_FLUSH) {
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
			kicks = 1;
	}
out:
	u64_stats_update_begin(&sq->stats.syncp);
	sq->stats.bytes += bytes;
	sq->stats.packets += packets;
	sq->stats.xdp_tx += n;
	sq->stats.xdp_tx_drops += n - nxmit;
	sq->stats.kicks += kicks;
	u64_stats_update_end(&sq->stats.syncp);

	virtnet_xdp_put_sq(vi, sq);
	return ret;
}

static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
{
	return vi->xdp_enabled ? VIRTIO_XDP_HEADROOM : 0;
}

/* We copy the packet for XDP in the following cases:
 *
 * 1) Packet is scattered across multiple rx buffers.
 * 2) Headroom space is insufficient.
 *
 * This is inefficient but it's a temporary condition that
 * we hit right after XDP is enabled and until queue is refilled
 * with large buffers with sufficient headroom - so it should affect
 * at most queue size packets.
 * Afterwards, the conditions to enable
 * XDP should preclude the underlying device from sending packets
 * across multiple buffers (num_buf > 1), and we make sure buffers
 * have enough headroom.
 */
static struct page *xdp_linearize_page(struct receive_queue *rq,
				       u16 *num_buf,
				       struct page *p,
				       int offset,
				       int page_off,
				       unsigned int *len)
{
	struct page *page = alloc_page(GFP_ATOMIC);

	if (!page)
		return NULL;

	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
	page_off += *len;

	while (--*num_buf) {
		int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		unsigned int buflen;
		void *buf;
		int off;

		buf = virtqueue_get_buf(rq->vq, &buflen);
		if (unlikely(!buf))
			goto err_buf;

		p = virt_to_head_page(buf);
		off = buf - page_address(p);

		/* guard against a misconfigured or uncooperative backend that
		 * is sending packets larger than the MTU.
		 */
		if ((page_off + buflen + tailroom) > PAGE_SIZE) {
			put_page(p);
			goto err_buf;
		}

		memcpy(page_address(page) + page_off,
		       page_address(p) + off, buflen);
		page_off += buflen;
		put_page(p);
	}

	/* Headroom does not contribute to packet length */
	*len = page_off - VIRTIO_XDP_HEADROOM;
	return page;
err_buf:
	__free_pages(page, 0);
	return NULL;
}

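/* Receive path for small (single page-fragment) buffers.  The buffer's XDP
 * headroom is recovered from the virtqueue context; if an XDP program is
 * attached but the buffer was posted without enough headroom, the packet is
 * first copied into a freshly allocated page via xdp_linearize_page().
 */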
static struct sk_buff *receive_small(struct net_device *dev,
				     struct virtnet_info *vi,
				     struct receive_queue *rq,
				     void *buf, void *ctx,
				     unsigned int len,
				     unsigned int *xdp_xmit,
				     struct virtnet_rq_stats *stats)
{
	struct sk_buff *skb;
	struct bpf_prog *xdp_prog;
	unsigned int xdp_headroom = (unsigned long)ctx;
	unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
	unsigned int headroom = vi->hdr_len + header_offset;
	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	struct page *page = virt_to_head_page(buf);
	unsigned int delta = 0;
	struct page *xdp_page;
	int err;
	unsigned int metasize = 0;

	len -= vi->hdr_len;
	stats->bytes += len;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
		struct xdp_frame *xdpf;
		struct xdp_buff xdp;
		void *orig_data;
		u32 act;

		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;

		if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
			int offset = buf - page_address(page) + header_offset;
			unsigned int tlen = len + vi->hdr_len;
			u16 num_buf = 1;

			xdp_headroom = virtnet_get_headroom(vi);
			header_offset = VIRTNET_RX_PAD + xdp_headroom;
			headroom = vi->hdr_len + header_offset;
			buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
				 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
			xdp_page = xdp_linearize_page(rq, &num_buf, page,
						      offset, header_offset,
						      &tlen);
			if (!xdp_page)
				goto err_xdp;

			buf = page_address(xdp_page);
			put_page(page);
			page = xdp_page;
		}

		xdp_init_buff(&xdp, buflen, &rq->xdp_rxq);
		xdp_prepare_buff(&xdp, buf + VIRTNET_RX_PAD + vi->hdr_len,
				 xdp_headroom, len, true);
		orig_data = xdp.data;
		act = bpf_prog_run_xdp(xdp_prog, &xdp);
		stats->xdp_packets++;

		switch (act) {
		case XDP_PASS:
			/* Recalculate length in case bpf program changed it */
			delta = orig_data - xdp.data;
			len = xdp.data_end - xdp.data;
			metasize = xdp.data - xdp.data_meta;
			break;
		case XDP_TX:
			stats->xdp_tx++;
			xdpf = xdp_convert_buff_to_frame(&xdp);
			if (unlikely(!xdpf))
				goto err_xdp;
			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
			if (unlikely(!err)) {
				xdp_return_frame_rx_napi(xdpf);
			} else if (unlikely(err < 0)) {
				trace_xdp_exception(vi->dev, xdp_prog, act);
				goto err_xdp;
			}
			*xdp_xmit |= VIRTIO_XDP_TX;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			stats->xdp_redirects++;
			err = xdp_do_redirect(dev, &xdp, xdp_prog);
			if (err)
				goto err_xdp;
			*xdp_xmit |= VIRTIO_XDP_REDIR;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
			fallthrough;
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
			goto err_xdp;
		case XDP_DROP:
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	skb = build_skb(buf, buflen);
	if (!skb) {
		put_page(page);
		goto err;
	}
	skb_reserve(skb, headroom - delta);
	skb_put(skb, len);
	if (!xdp_prog) {
		buf += header_offset;
		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
	} /* keep zeroed vnet hdr since XDP is loaded */

	if (metasize)
		skb_metadata_set(skb, metasize);

err:
	return skb;

err_xdp:
	rcu_read_unlock();
	stats->xdp_drops++;
	stats->drops++;
	put_page(page);
xdp_xmit:
	return NULL;
}

static struct sk_buff *receive_big(struct net_device *dev,
				   struct virtnet_info *vi,
				   struct receive_queue *rq,
				   void *buf,
				   unsigned int len,
				   struct virtnet_rq_stats *stats)
{
	struct page *page = buf;
	struct sk_buff *skb =
		page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0);

	stats->bytes += len - vi->hdr_len;
	if (unlikely(!skb))
		goto err;

	return skb;

err:
	stats->drops++;
	give_pages(rq, page);
	return NULL;
}

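/* Receive path for mergeable buffers: the device may spread one packet over
 * num_buffers descriptors, which are stitched back together here as skb
 * fragments.  If an XDP program is attached, multi-buffer packets are first
 * linearized into a single page.
 */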
static struct sk_buff *receive_mergeable(struct net_device *dev,
					 struct virtnet_info *vi,
					 struct receive_queue *rq,
					 void *buf,
					 void *ctx,
					 unsigned int len,
					 unsigned int *xdp_xmit,
					 struct virtnet_rq_stats *stats)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
	struct sk_buff *head_skb, *curr_skb;
	struct bpf_prog *xdp_prog;
	unsigned int truesize = mergeable_ctx_to_truesize(ctx);
	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
	unsigned int metasize = 0;
	unsigned int frame_sz;
	int err;

	head_skb = NULL;
	stats->bytes += len - vi->hdr_len;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct xdp_frame *xdpf;
		struct page *xdp_page;
		struct xdp_buff xdp;
		void *data;
		u32 act;

		/* Transient failure which in theory could occur if
		 * in-flight packets from before XDP was enabled reach
		 * the receive path after XDP is loaded.
		 */
		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;

		/* Buffers with headroom use PAGE_SIZE as alloc size,
		 * see add_recvbuf_mergeable() + get_mergeable_buf_len()
		 */
		frame_sz = headroom ? PAGE_SIZE : truesize;

		/* This happens when rx buffer size is underestimated
		 * or headroom is not enough because the buffer was
		 * refilled before XDP was set. This should only
		 * happen for the first several packets, so we don't
		 * care much about its performance.
		 */
		if (unlikely(num_buf > 1 ||
			     headroom < virtnet_get_headroom(vi))) {
			/* linearize data for XDP */
			xdp_page = xdp_linearize_page(rq, &num_buf,
						      page, offset,
						      VIRTIO_XDP_HEADROOM,
						      &len);
			frame_sz = PAGE_SIZE;

			if (!xdp_page)
				goto err_xdp;
			offset = VIRTIO_XDP_HEADROOM;
		} else {
			xdp_page = page;
		}

		/* Allow consuming headroom but reserve enough space to push
		 * the descriptor on if we get an XDP_TX return code.
		 */
		data = page_address(xdp_page) + offset;
		xdp_init_buff(&xdp, frame_sz - vi->hdr_len, &rq->xdp_rxq);
		xdp_prepare_buff(&xdp, data - VIRTIO_XDP_HEADROOM + vi->hdr_len,
				 VIRTIO_XDP_HEADROOM, len - vi->hdr_len, true);

		act = bpf_prog_run_xdp(xdp_prog, &xdp);
		stats->xdp_packets++;

		switch (act) {
		case XDP_PASS:
			metasize = xdp.data - xdp.data_meta;

			/* recalculate offset to account for any header
			 * adjustments and minus the metasize to copy the
			 * metadata in page_to_skb(). Note other cases do not
			 * build an skb and avoid using offset
			 */
			offset = xdp.data - page_address(xdp_page) -
				 vi->hdr_len - metasize;

			/* recalculate len if xdp.data, xdp.data_end or
			 * xdp.data_meta were adjusted
			 */
			len = xdp.data_end - xdp.data + vi->hdr_len + metasize;
			/* We can only create skb based on xdp_page. */
			if (unlikely(xdp_page != page)) {
				rcu_read_unlock();
				put_page(page);
				head_skb = page_to_skb(vi, rq, xdp_page, offset,
						       len, PAGE_SIZE, false,
						       metasize, headroom);
				return head_skb;
			}
			break;
		case XDP_TX:
			stats->xdp_tx++;
			xdpf = xdp_convert_buff_to_frame(&xdp);
			if (unlikely(!xdpf))
				goto err_xdp;
			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
			if (unlikely(!err)) {
				xdp_return_frame_rx_napi(xdpf);
			} else if (unlikely(err < 0)) {
				trace_xdp_exception(vi->dev, xdp_prog, act);
				if (unlikely(xdp_page != page))
					put_page(xdp_page);
				goto err_xdp;
			}
			*xdp_xmit |= VIRTIO_XDP_TX;
			if (unlikely(xdp_page != page))
				put_page(page);
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			stats->xdp_redirects++;
			err = xdp_do_redirect(dev, &xdp, xdp_prog);
			if (err) {
				if (unlikely(xdp_page != page))
					put_page(xdp_page);
				goto err_xdp;
			}
			*xdp_xmit |= VIRTIO_XDP_REDIR;
			if (unlikely(xdp_page != page))
				put_page(page);
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
			fallthrough;
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
			fallthrough;
		case XDP_DROP:
			if (unlikely(xdp_page != page))
				__free_pages(xdp_page, 0);
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	if (unlikely(len > truesize)) {
		pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
			 dev->name, len, (unsigned long)ctx);
		dev->stats.rx_length_errors++;
		goto err_skb;
	}

	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog,
			       metasize, headroom);
	curr_skb = head_skb;

	if (unlikely(!curr_skb))
		goto err_skb;
	while (--num_buf) {
		int num_skb_frags;

		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
		if (unlikely(!buf)) {
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
				 dev->name, num_buf,
				 virtio16_to_cpu(vi->vdev,
						 hdr->num_buffers));
			dev->stats.rx_length_errors++;
			goto err_buf;
		}

		stats->bytes += len;
		page = virt_to_head_page(buf);

		truesize = mergeable_ctx_to_truesize(ctx);
		if (unlikely(len > truesize)) {
			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
				 dev->name, len, (unsigned long)ctx);
			dev->stats.rx_length_errors++;
			goto err_skb;
		}

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);

			if (unlikely(!nskb))
				goto err_skb;
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
			head_skb->truesize += truesize;
		}
		offset = buf - page_address(page);
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
					     len, truesize);
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
					offset, len, truesize);
		}
	}

	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
	return head_skb;

err_xdp:
	rcu_read_unlock();
	stats->xdp_drops++;
err_skb:
	put_page(page);
	while (num_buf-- > 1) {
		buf = virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!buf)) {
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
		stats->bytes += len;
		page = virt_to_head_page(buf);
		put_page(page);
	}
err_buf:
	stats->drops++;
	dev_kfree_skb(head_skb);
xdp_xmit:
	return NULL;
}

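/* Dispatch one completed RX buffer to the matching receive path (mergeable,
 * big or small) and hand the resulting skb to the stack via GRO.
 */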
static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
			void *buf, unsigned int len, void **ctx,
			unsigned int *xdp_xmit,
			struct virtnet_rq_stats *stats)
{
	struct net_device *dev = vi->dev;
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;

	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
		if (vi->mergeable_rx_bufs) {
			put_page(virt_to_head_page(buf));
		} else if (vi->big_packets) {
			give_pages(rq, buf);
		} else {
			put_page(virt_to_head_page(buf));
		}
		return;
	}

	if (vi->mergeable_rx_bufs)
		skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit,
					stats);
	else if (vi->big_packets)
		skb = receive_big(dev, vi, rq, buf, len, stats);
	else
		skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats);

	if (unlikely(!skb))
		return;

	hdr = skb_vnet_hdr(skb);

	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
		skb->ip_summed = CHECKSUM_UNNECESSARY;

	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
				  virtio_is_little_endian(vi->vdev))) {
		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
				     dev->name, hdr->hdr.gso_type,
				     hdr->hdr.gso_size);
		goto frame_err;
	}

	skb_record_rx_queue(skb, vq2rxq(rq->vq));
	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

	napi_gro_receive(&rq->napi, skb);
	return;

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
}

/* Unlike mergeable buffers, all buffers are allocated to the
 * same size, except for the headroom. For this reason we do
 * not need to use mergeable_len_to_ctx here - it is enough
 * to store the headroom as the context ignoring the truesize.
 */
static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
			     gfp_t gfp)
{
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
	void *ctx = (void *)(unsigned long)xdp_headroom;
	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
	int err;

	len = SKB_DATA_ALIGN(len) +
	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
		    vi->hdr_len + GOOD_PACKET_LEN);
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
	if (err < 0)
		put_page(virt_to_head_page(buf));
	return err;
}

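/* Post a "big packets" receive buffer: a chain of MAX_SKB_FRAGS + 1 pages
 * linked through page->private, with the first page also carrying the
 * virtio-net header in its own sg entry.
 */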
static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
			   gfp_t gfp)
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);

	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
		first = get_a_page(rq, gfp);
		if (!first) {
			if (list)
				give_pages(rq, list);
			return -ENOMEM;
		}
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);

		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}

	first = get_a_page(rq, gfp);
	if (!first) {
		give_pages(rq, list);
		return -ENOMEM;
	}
	p = page_address(first);

	/* rq->sg[0], rq->sg[1] share the same page */
	/* a separated rq->sg[0] for header - required in case !any_header_sg */
	sg_set_buf(&rq->sg[0], p, vi->hdr_len);

	/* rq->sg[1] for data packet, from offset */
	offset = sizeof(struct padded_vnet_hdr);
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);

	/* chain first in list head */
	first->private = (unsigned long)list;
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
	if (err < 0)
		give_pages(rq, first);

	return err;
}

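/* Pick the mergeable receive buffer size from the EWMA of recent packet
 * lengths, clamped to [rq->min_buf_len, PAGE_SIZE - hdr_len] and rounded up
 * to a cacheline; when extra room (XDP headroom/tailroom) is reserved, a
 * full page is used instead.
 */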
static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
					  struct ewma_pkt_len *avg_pkt_len,
					  unsigned int room)
{
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int len;

	if (room)
		return PAGE_SIZE - room;

	len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
				rq->min_buf_len, PAGE_SIZE - hdr_len);

	return ALIGN(len, L1_CACHE_BYTES);
}

static int add_recvbuf_mergeable(struct virtnet_info *vi,
				 struct receive_queue *rq, gfp_t gfp)
{
	struct page_frag *alloc_frag = &rq->alloc_frag;
	unsigned int headroom = virtnet_get_headroom(vi);
	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
	char *buf;
	void *ctx;
	int err;
	unsigned int len, hole;

	/* Extra tailroom is needed to satisfy XDP's assumption. This
	 * means rx frags coalescing won't work, but consider we've
	 * disabled GSO for XDP, it won't be a big issue.
	 */
	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	buf += headroom; /* advance address leaving hole at front of pkt */
	get_page(alloc_frag->page);
	alloc_frag->offset += len + room;
	hole = alloc_frag->size - alloc_frag->offset;
	if (hole < len + room) {
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
		 * the current buffer.
		 */
		len += hole;
		alloc_frag->offset += hole;
	}

	sg_init_one(rq->sg, buf, len);
	ctx = mergeable_len_to_ctx(len, headroom);
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
	if (err < 0)
		put_page(virt_to_head_page(buf));

	return err;
}

/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
{
	int err;
	bool oom;

	do {
		if (vi->mergeable_rx_bufs)
			err = add_recvbuf_mergeable(vi, rq, gfp);
		else if (vi->big_packets)
			err = add_recvbuf_big(vi, rq, gfp);
		else
			err = add_recvbuf_small(vi, rq, gfp);

		oom = err == -ENOMEM;
		if (err)
			break;
	} while (rq->vq->num_free);
	if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) {
		unsigned long flags;

		flags = u64_stats_update_begin_irqsave(&rq->stats.syncp);
		rq->stats.kicks++;
		u64_stats_update_end_irqrestore(&rq->stats.syncp, flags);
	}

	return !oom;
}

static void skb_recv_done(struct virtqueue *rvq)
{
	struct virtnet_info *vi = rvq->vdev->priv;
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];

	virtqueue_napi_schedule(&rq->napi, rvq);
}

static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
{
	napi_enable(napi);

	/* If all buffers were filled by other side before we napi_enabled, we
	 * won't get another interrupt, so process any outstanding packets now.
	 * Call local_bh_enable after to trigger softIRQ processing.
	 */
	local_bh_disable();
	virtqueue_napi_schedule(napi, vq);
	local_bh_enable();
}

static void virtnet_napi_tx_enable(struct virtnet_info *vi,
				   struct virtqueue *vq,
				   struct napi_struct *napi)
{
	if (!napi->weight)
		return;

	/* Tx napi touches cachelines on the cpu handling tx interrupts. Only
	 * enable the feature if this is likely affine with the transmit path.
	 */
	if (!vi->affinity_hint_set) {
		napi->weight = 0;
		return;
	}

	return virtnet_napi_enable(vq, napi);
}

static void virtnet_napi_tx_disable(struct napi_struct *napi)
{
	if (napi->weight)
		napi_disable(napi);
}

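/* Delayed-work fallback used when an RX ring could not be fully refilled
 * (typically under memory pressure): NAPI is paused on each queue while it
 * is topped up, and the work is rescheduled if the ring is still empty.
 */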
static void refill_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
	bool still_empty;
	int i;

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct receive_queue *rq = &vi->rq[i];

		napi_disable(&rq->napi);
		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
		virtnet_napi_enable(rq->vq, &rq->napi);

		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
}

static int virtnet_receive(struct receive_queue *rq, int budget,
			   unsigned int *xdp_xmit)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	struct virtnet_rq_stats stats = {};
	unsigned int len;
	void *buf;
	int i;

	if (!vi->big_packets || vi->mergeable_rx_bufs) {
		void *ctx;

		while (stats.packets < budget &&
		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
			receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
			stats.packets++;
		}
	} else {
		while (stats.packets < budget &&
		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
			receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
			stats.packets++;
		}
	}

	if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
			schedule_delayed_work(&vi->refill, 0);
	}

	u64_stats_update_begin(&rq->stats.syncp);
	for (i = 0; i < VIRTNET_RQ_STATS_LEN; i++) {
		size_t offset = virtnet_rq_stats_desc[i].offset;
		u64 *item;

		item = (u64 *)((u8 *)&rq->stats + offset);
		*item += *(u64 *)((u8 *)&stats + offset);
	}
	u64_stats_update_end(&rq->stats.syncp);

	return stats.packets;
}

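/* Reclaim completed TX buffers, telling skbs and xdp_frames apart via the
 * tag bit, and fold the freed byte/packet counts into the queue stats.
 */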
static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
{
	unsigned int len;
	unsigned int packets = 0;
	unsigned int bytes = 0;
	void *ptr;

	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		if (likely(!is_xdp_frame(ptr))) {
			struct sk_buff *skb = ptr;

			pr_debug("Sent skb %p\n", skb);

			bytes += skb->len;
			napi_consume_skb(skb, in_napi);
		} else {
			struct xdp_frame *frame = ptr_to_xdp(ptr);

			bytes += frame->len;
			xdp_return_frame(frame);
		}
		packets++;
	}

	/* Avoid overhead when no packets have been processed
	 * happens when called speculatively from start_xmit.
	 */
	if (!packets)
		return;

	u64_stats_update_begin(&sq->stats.syncp);
	sq->stats.bytes += bytes;
	sq->stats.packets += packets;
	u64_stats_update_end(&sq->stats.syncp);
}

static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
{
	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
		return false;
	else if (q < vi->curr_queue_pairs)
		return true;
	else
		return false;
}

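/* Opportunistically reclaim completed TX buffers of the paired send queue
 * from the RX NAPI handler and wake the TX queue if enough ring space was
 * freed.  Skipped when TX NAPI is off or the queue is owned by XDP.
 */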
static void virtnet_poll_cleantx(struct receive_queue *rq)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	unsigned int index = vq2rxq(rq->vq);
	struct send_queue *sq = &vi->sq[index];
	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);

	if (!sq->napi.weight || is_xdp_raw_buffer_queue(vi, index))
		return;

	if (__netif_tx_trylock(txq)) {
		free_old_xmit_skbs(sq, true);
		__netif_tx_unlock(txq);
	}

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);
}

static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
	struct virtnet_info *vi = rq->vq->vdev->priv;
	struct send_queue *sq;
	unsigned int received;
	unsigned int xdp_xmit = 0;

	virtnet_poll_cleantx(rq);

	received = virtnet_receive(rq, budget, &xdp_xmit);

	/* Out of packets? */
	if (received < budget)
		virtqueue_napi_complete(napi, rq->vq, received);

	if (xdp_xmit & VIRTIO_XDP_REDIR)
		xdp_do_flush();

	if (xdp_xmit & VIRTIO_XDP_TX) {
		sq = virtnet_xdp_get_sq(vi);
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
			u64_stats_update_begin(&sq->stats.syncp);
			sq->stats.kicks++;
			u64_stats_update_end(&sq->stats.syncp);
		}
		virtnet_xdp_put_sq(vi, sq);
	}

	return received;
}

static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i, err;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, vi->rq[i].napi.napi_id);
		if (err < 0)
			return err;

		err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
						 MEM_TYPE_PAGE_SHARED, NULL);
		if (err < 0) {
			xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
			return err;
		}

		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
	}

	return 0;
}

static int virtnet_poll_tx(struct napi_struct *napi, int budget)
{
	struct send_queue *sq = container_of(napi, struct send_queue, napi);
	struct virtnet_info *vi = sq->vq->vdev->priv;
	unsigned int index = vq2txq(sq->vq);
	struct netdev_queue *txq;

	if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
		/* We don't need to enable cb for XDP */
		napi_complete_done(napi, 0);
		return 0;
	}

	txq = netdev_get_tx_queue(vi->dev, index);
	__netif_tx_lock(txq, raw_smp_processor_id());
	free_old_xmit_skbs(sq, true);
	__netif_tx_unlock(txq);

	virtqueue_napi_complete(napi, sq->vq, 0);

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);

	return 0;
}

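/* Queue one skb on the send virtqueue.  When the device accepts any s/g
 * layout (any_header_sg) and the skb has enough headroom, the virtio-net
 * header is pushed into the skb's own headroom so header and data go out
 * as a single contiguous buffer; otherwise the header is passed in a
 * separate sg entry taken from skb->cb.
 */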
static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
	struct virtnet_info *vi = sq->vq->vdev->priv;
	int num_sg;
	unsigned hdr_len = vi->hdr_len;
	bool can_push;

	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);

	can_push = vi->any_header_sg &&
		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
	else
		hdr = skb_vnet_hdr(skb);

	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
				    virtio_is_little_endian(vi->vdev), false,
				    0))
		BUG();

	if (vi->mergeable_rx_bufs)
		hdr->num_buffers = 0;

	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
	if (can_push) {
		__skb_push(skb, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
		if (unlikely(num_sg < 0))
			return num_sg;
		/* Pull header back to avoid skew in tx bytes calculations. */
		__skb_pull(skb, hdr_len);
	} else {
		sg_set_buf(sq->sg, hdr, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
		if (unlikely(num_sg < 0))
			return num_sg;
		num_sg++;
	}
	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
}

static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
	int err;
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
	bool kick = !netdev_xmit_more();
	bool use_napi = sq->napi.weight;

	/* Free up any pending old buffers before queueing new ones. */
	free_old_xmit_skbs(sq, false);

	if (use_napi && kick)
		virtqueue_enable_cb_delayed(sq->vq);

	/* timestamp packet in software */
	skb_tx_timestamp(skb);

	/* Try to transmit */
	err = xmit_skb(sq, skb);

	/* This should not happen! */
	if (unlikely(err)) {
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
				 "Unexpected TXQ (%d) queue failure: %d\n",
				 qnum, err);
		dev->stats.tx_dropped++;
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}

	/* Don't wait up for transmitted skbs to be freed. */
	if (!use_napi) {
		skb_orphan(skb);
		nf_reset_ct(skb);
	}

	/* If running out of space, stop queue to avoid getting packets that we
	 * are then unable to transmit.
	 * An alternative would be to force queuing layer to requeue the skb by
	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
	 * returned in a normal path of operation: it means that driver is not
	 * maintaining the TX queue stop/start state properly, and causes
	 * the stack to do a non-trivial amount of useless work.
	 * Since most packets only take 1 or 2 ring slots, stopping the queue
	 * early means 16 slots are typically wasted.
	 */
	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
		netif_stop_subqueue(dev, qnum);
		if (!use_napi &&
		    unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
			/* More just got used, free them then recheck. */
			free_old_xmit_skbs(sq, false);
			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
				netif_start_subqueue(dev, qnum);
				virtqueue_disable_cb(sq->vq);
			}
		}
	}

	if (kick || netif_xmit_stopped(txq)) {
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
			u64_stats_update_begin(&sq->stats.syncp);
			sq->stats.kicks++;
			u64_stats_update_end(&sq->stats.syncp);
		}
	}

	return NETDEV_TX_OK;
}

/*
 * Send command via the control virtqueue and check status.  Commands
 * supported by the hypervisor, as indicated by feature bits, should
 * never fail unless improperly formatted.
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
				 struct scatterlist *out)
{
	struct scatterlist *sgs[4], hdr, stat;
	unsigned out_num = 0, tmp;

	/* Caller should know better */
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));

	vi->ctrl->status = ~0;
	vi->ctrl->hdr.class = class;
	vi->ctrl->hdr.cmd = cmd;
	/* Add header */
	sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr));
	sgs[out_num++] = &hdr;

	if (out)
		sgs[out_num++] = out;

	/* Add return status. */
	sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status));
	sgs[out_num] = &stat;

	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);

	if (unlikely(!virtqueue_kick(vi->cvq)))
		return vi->ctrl->status == VIRTIO_NET_OK;

	/* Spin for a response, the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
		cpu_relax();

	return vi->ctrl->status == VIRTIO_NET_OK;
}

static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
	int ret;
	struct sockaddr *addr;
	struct scatterlist sg;

	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
		return -EOPNOTSUPP;

	addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
	if (!addr)
		return -ENOMEM;

	ret = eth_prepare_mac_addr_change(dev, addr);
	if (ret)
		goto out;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
		sg_init_one(&sg, addr->sa_data, dev->addr_len);
		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
			dev_warn(&vdev->dev,
				 "Failed to set mac address by vq command.\n");
			ret = -EINVAL;
			goto out;
		}
	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
		unsigned int i;

		/* Naturally, this has an atomicity problem. */
		for (i = 0; i < dev->addr_len; i++)
			virtio_cwrite8(vdev,
				       offsetof(struct virtio_net_config, mac) +
				       i, addr->sa_data[i]);
	}

	eth_commit_mac_addr_change(dev, p);
	ret = 0;

out:
	kfree(addr);
	return ret;
}

static void virtnet_stats(struct net_device *dev,
			  struct rtnl_link_stats64 *tot)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int start;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		u64 tpackets, tbytes, rpackets, rbytes, rdrops;
		struct receive_queue *rq = &vi->rq[i];
		struct send_queue *sq = &vi->sq[i];

		do {
			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
			tpackets = sq->stats.packets;
			tbytes   = sq->stats.bytes;
		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));

		do {
			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
			rpackets = rq->stats.packets;
			rbytes   = rq->stats.bytes;
			rdrops   = rq->stats.drops;
		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes   += rbytes;
		tot->tx_bytes   += tbytes;
		tot->rx_dropped += rdrops;
	}

	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
}

static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}

static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
J
Jason Wang 已提交
1853 1854 1855 1856 1857 1858 1859
{
	struct scatterlist sg;
	struct net_device *dev = vi->dev;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

1860 1861
	vi->ctrl->mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
	sg_init_one(&sg, &vi->ctrl->mq, sizeof(vi->ctrl->mq));
J
Jason Wang 已提交
1862 1863

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
1864
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
J
Jason Wang 已提交
1865 1866 1867
		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
1868
	} else {
J
Jason Wang 已提交
1869
		vi->curr_queue_pairs = queue_pairs;
1870 1871 1872
		/* virtnet_open() will refill when device is going to up. */
		if (dev->flags & IFF_UP)
			schedule_delayed_work(&vi->refill, 0);
1873
	}
J
Jason Wang 已提交
1874 1875 1876 1877

	return 0;
}

1878 1879 1880 1881 1882 1883 1884 1885 1886 1887
static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	int err;

	rtnl_lock();
	err = _virtnet_set_queues(vi, queue_pairs);
	rtnl_unlock();
	return err;
}

R
Rusty Russell 已提交
1888 1889 1890
static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1891
	int i;
R
Rusty Russell 已提交
1892

1893 1894
	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);
J
Jason Wang 已提交
1895

W
Willem de Bruijn 已提交
1896
	for (i = 0; i < vi->max_queue_pairs; i++) {
1897
		xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
J
Jason Wang 已提交
1898
		napi_disable(&vi->rq[i].napi);
1899
		virtnet_napi_tx_disable(&vi->sq[i].napi);
W
Willem de Bruijn 已提交
1900
	}
R
Rusty Russell 已提交
1901 1902 1903 1904

	return 0;
}

1905 1906 1907
static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
1908 1909
	struct scatterlist sg[2];
	struct virtio_net_ctrl_mac *mac_data;
J
Jiri Pirko 已提交
1910
	struct netdev_hw_addr *ha;
1911
	int uc_count;
1912
	int mc_count;
1913 1914
	void *buf;
	int i;
1915

S
stephen hemminger 已提交
1916
	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
1917 1918 1919
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

1920 1921
	vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
	vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
1922

1923
	sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc));
1924 1925

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
1926
				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
1927
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
1928
			 vi->ctrl->promisc ? "en" : "dis");
1929

1930
	sg_init_one(sg, &vi->ctrl->allmulti, sizeof(vi->ctrl->allmulti));
1931 1932

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
1933
				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
1934
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
1935
			 vi->ctrl->allmulti ? "en" : "dis");
1936

1937
	uc_count = netdev_uc_count(dev);
1938
	mc_count = netdev_mc_count(dev);
1939
	/* MAC filter - use one buffer for both lists */
1940 1941 1942
	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
	mac_data = buf;
1943
	if (!buf)
1944 1945
		return;

1946 1947
	sg_init_table(sg, 2);

1948
	/* Store the unicast list and count in the front of the buffer */
M
Michael S. Tsirkin 已提交
1949
	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
J
Jiri Pirko 已提交
1950
	i = 0;
1951
	netdev_for_each_uc_addr(ha, dev)
J
Jiri Pirko 已提交
1952
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
1953 1954

	sg_set_buf(&sg[0], mac_data,
1955
		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));
1956 1957

	/* multicast list and count fill the end */
1958
	mac_data = (void *)&mac_data->macs[uc_count][0];
1959

M
Michael S. Tsirkin 已提交
1960
	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
1961
	i = 0;
1962 1963
	netdev_for_each_mc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
1964 1965

	sg_set_buf(&sg[1], mac_data,
1966
		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));
1967 1968

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1969
				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
1970
		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");
1971 1972

	kfree(buf);
1973 1974
}

1975 1976
static int virtnet_vlan_rx_add_vid(struct net_device *dev,
				   __be16 proto, u16 vid)
1977 1978 1979 1980
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

1981
	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
1982
	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
1983 1984

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1985
				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
1986
		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
1987
	return 0;
1988 1989
}

1990 1991
static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
				    __be16 proto, u16 vid)
1992 1993 1994 1995
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

1996
	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
1997
	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
1998 1999

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
2000
				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
2001
		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
2002
	return 0;
2003 2004
}

2005
static void virtnet_clean_affinity(struct virtnet_info *vi)
J
Jason Wang 已提交
2006 2007 2008
{
	int i;

2009 2010
	if (vi->affinity_hint_set) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
2011 2012
			virtqueue_set_affinity(vi->rq[i].vq, NULL);
			virtqueue_set_affinity(vi->sq[i].vq, NULL);
2013 2014
		}

2015 2016 2017
		vi->affinity_hint_set = false;
	}
}
2018

2019 2020
static void virtnet_set_affinity(struct virtnet_info *vi)
{
2021 2022 2023 2024 2025 2026 2027 2028
	cpumask_var_t mask;
	int stragglers;
	int group_size;
	int i, j, cpu;
	int num_cpu;
	int stride;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
2029
		virtnet_clean_affinity(vi);
2030
		return;
J
Jason Wang 已提交
2031 2032
	}

2033 2034 2035 2036 2037 2038
	num_cpu = num_online_cpus();
	stride = max_t(int, num_cpu / vi->curr_queue_pairs, 1);
	stragglers = num_cpu >= vi->curr_queue_pairs ?
			num_cpu % vi->curr_queue_pairs :
			0;
	cpu = cpumask_next(-1, cpu_online_mask);
2039

2040 2041 2042 2043 2044 2045 2046 2047 2048 2049
	for (i = 0; i < vi->curr_queue_pairs; i++) {
		group_size = stride + (i < stragglers ? 1 : 0);

		for (j = 0; j < group_size; j++) {
			cpumask_set_cpu(cpu, mask);
			cpu = cpumask_next_wrap(cpu, cpu_online_mask,
						nr_cpu_ids, false);
		}
		virtqueue_set_affinity(vi->rq[i].vq, mask);
		virtqueue_set_affinity(vi->sq[i].vq, mask);
2050
		__netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS);
2051
		cpumask_clear(mask);
J
Jason Wang 已提交
2052 2053
	}

2054
	vi->affinity_hint_set = true;
2055
	free_cpumask_var(mask);
J
Jason Wang 已提交
2056 2057
}

2058
static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
2059
{
2060 2061 2062 2063 2064
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);
	virtnet_set_affinity(vi);
	return 0;
}
2065

2066 2067 2068 2069 2070 2071 2072
static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node_dead);
	virtnet_set_affinity(vi);
	return 0;
}
2073

2074 2075 2076 2077 2078
static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);

2079
	virtnet_clean_affinity(vi);
2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104
	return 0;
}

static enum cpuhp_state virtionet_online;

static int virtnet_cpu_notif_add(struct virtnet_info *vi)
{
	int ret;

	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
	if (ret)
		return ret;
	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					       &vi->node_dead);
	if (!ret)
		return ret;
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	return ret;
}

static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
{
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					    &vi->node_dead);
J
Jason Wang 已提交
2105 2106
}

R
Rick Jones 已提交
2107 2108 2109 2110 2111
static void virtnet_get_ringparam(struct net_device *dev,
				struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

J
Jason Wang 已提交
2112 2113
	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
R
Rick Jones 已提交
2114 2115 2116 2117
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}

2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130

static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));

}

2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144
/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

2145
	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
2146 2147
		return -EINVAL;

J
John Fastabend 已提交
2148 2149 2150 2151 2152 2153 2154
	/* For now we don't support modifying channels while XDP is loaded
	 * also when XDP is loaded all RX queues have XDP programs so we only
	 * need to check a single RX queue.
	 */
	if (vi->rq[0].xdp_prog)
		return -EINVAL;

2155
	get_online_cpus();
2156
	err = _virtnet_set_queues(vi, queue_pairs);
2157 2158 2159
	if (err) {
		put_online_cpus();
		goto err;
2160
	}
2161
	virtnet_set_affinity(vi);
2162
	put_online_cpus();
2163

2164 2165 2166
	netif_set_real_num_tx_queues(dev, queue_pairs);
	netif_set_real_num_rx_queues(dev, queue_pairs);
 err:
2167 2168 2169
	return err;
}

T
Toshiaki Makita 已提交
2170 2171 2172 2173
static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int i, j;
2174
	u8 *p = data;
T
Toshiaki Makita 已提交
2175 2176 2177 2178

	switch (stringset) {
	case ETH_SS_STATS:
		for (i = 0; i < vi->curr_queue_pairs; i++) {
2179 2180 2181
			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++)
				ethtool_sprintf(&p, "rx_queue_%u_%s", i,
						virtnet_rq_stats_desc[j].desc);
T
Toshiaki Makita 已提交
2182 2183 2184
		}

		for (i = 0; i < vi->curr_queue_pairs; i++) {
2185 2186 2187
			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++)
				ethtool_sprintf(&p, "tx_queue_%u_%s", i,
						virtnet_sq_stats_desc[j].desc);
T
Toshiaki Makita 已提交
2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216
		}
		break;
	}
}

static int virtnet_get_sset_count(struct net_device *dev, int sset)
{
	struct virtnet_info *vi = netdev_priv(dev);

	switch (sset) {
	case ETH_SS_STATS:
		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
					       VIRTNET_SQ_STATS_LEN);
	default:
		return -EOPNOTSUPP;
	}
}

static void virtnet_get_ethtool_stats(struct net_device *dev,
				      struct ethtool_stats *stats, u64 *data)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int idx = 0, start, i, j;
	const u8 *stats_base;
	size_t offset;

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct receive_queue *rq = &vi->rq[i];

2217
		stats_base = (u8 *)&rq->stats;
T
Toshiaki Makita 已提交
2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242
		do {
			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
				offset = virtnet_rq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
		idx += VIRTNET_RQ_STATS_LEN;
	}

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct send_queue *sq = &vi->sq[i];

		stats_base = (u8 *)&sq->stats;
		do {
			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) {
				offset = virtnet_sq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
		idx += VIRTNET_SQ_STATS_LEN;
	}
}

2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255
static void virtnet_get_channels(struct net_device *dev,
				 struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);

	channels->combined_count = vi->curr_queue_pairs;
	channels->max_combined = vi->max_queue_pairs;
	channels->max_other = 0;
	channels->rx_count = 0;
	channels->tx_count = 0;
	channels->other_count = 0;
}

2256 2257
static int virtnet_set_link_ksettings(struct net_device *dev,
				      const struct ethtool_link_ksettings *cmd)
2258 2259 2260
{
	struct virtnet_info *vi = netdev_priv(dev);

2261 2262
	return ethtool_virtdev_set_link_ksettings(dev, cmd,
						  &vi->speed, &vi->duplex);
2263 2264
}

2265 2266
static int virtnet_get_link_ksettings(struct net_device *dev,
				      struct ethtool_link_ksettings *cmd)
2267 2268 2269
{
	struct virtnet_info *vi = netdev_priv(dev);

2270 2271 2272
	cmd->base.speed = vi->speed;
	cmd->base.duplex = vi->duplex;
	cmd->base.port = PORT_OTHER;
2273 2274 2275 2276

	return 0;
}

2277 2278 2279 2280 2281 2282
static int virtnet_set_coalesce(struct net_device *dev,
				struct ethtool_coalesce *ec)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i, napi_weight;

2283 2284
	if (ec->tx_max_coalesced_frames > 1 ||
	    ec->rx_max_coalesced_frames != 1)
2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314
		return -EINVAL;

	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
	if (napi_weight ^ vi->sq[0].napi.weight) {
		if (dev->flags & IFF_UP)
			return -EBUSY;
		for (i = 0; i < vi->max_queue_pairs; i++)
			vi->sq[i].napi.weight = napi_weight;
	}

	return 0;
}

static int virtnet_get_coalesce(struct net_device *dev,
				struct ethtool_coalesce *ec)
{
	struct ethtool_coalesce ec_default = {
		.cmd = ETHTOOL_GCOALESCE,
		.rx_max_coalesced_frames = 1,
	};
	struct virtnet_info *vi = netdev_priv(dev);

	memcpy(ec, &ec_default, sizeof(ec_default));

	if (vi->sq[0].napi.weight)
		ec->tx_max_coalesced_frames = 1;

	return 0;
}

2315 2316 2317 2318 2319 2320 2321 2322
static void virtnet_init_settings(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	vi->speed = SPEED_UNKNOWN;
	vi->duplex = DUPLEX_UNKNOWN;
}

2323 2324 2325 2326 2327 2328 2329 2330
static void virtnet_update_settings(struct virtnet_info *vi)
{
	u32 speed;
	u8 duplex;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
		return;

2331 2332
	virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);

2333 2334
	if (ethtool_validate_speed(speed))
		vi->speed = speed;
2335 2336 2337

	virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex);

2338 2339 2340 2341
	if (ethtool_validate_duplex(duplex))
		vi->duplex = duplex;
}

2342
static const struct ethtool_ops virtnet_ethtool_ops = {
2343
	.supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES,
2344
	.get_drvinfo = virtnet_get_drvinfo,
2345
	.get_link = ethtool_op_get_link,
R
Rick Jones 已提交
2346
	.get_ringparam = virtnet_get_ringparam,
T
Toshiaki Makita 已提交
2347 2348 2349
	.get_strings = virtnet_get_strings,
	.get_sset_count = virtnet_get_sset_count,
	.get_ethtool_stats = virtnet_get_ethtool_stats,
2350 2351
	.set_channels = virtnet_set_channels,
	.get_channels = virtnet_get_channels,
2352
	.get_ts_info = ethtool_op_get_ts_info,
2353 2354
	.get_link_ksettings = virtnet_get_link_ksettings,
	.set_link_ksettings = virtnet_set_link_ksettings,
2355 2356
	.set_coalesce = virtnet_set_coalesce,
	.get_coalesce = virtnet_get_coalesce,
2357 2358
};

2359 2360 2361 2362 2363 2364 2365 2366
static void virtnet_freeze_down(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int i;

	/* Make sure no work handler is accessing the device */
	flush_work(&vi->config_work);

2367
	netif_tx_lock_bh(vi->dev);
2368
	netif_device_detach(vi->dev);
2369
	netif_tx_unlock_bh(vi->dev);
2370 2371 2372
	cancel_delayed_work_sync(&vi->refill);

	if (netif_running(vi->dev)) {
W
Willem de Bruijn 已提交
2373
		for (i = 0; i < vi->max_queue_pairs; i++) {
2374
			napi_disable(&vi->rq[i].napi);
2375
			virtnet_napi_tx_disable(&vi->sq[i].napi);
W
Willem de Bruijn 已提交
2376
		}
2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397
	}
}

static int init_vqs(struct virtnet_info *vi);

static int virtnet_restore_up(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err, i;

	err = init_vqs(vi);
	if (err)
		return err;

	virtio_device_ready(vdev);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->curr_queue_pairs; i++)
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

W
Willem de Bruijn 已提交
2398
		for (i = 0; i < vi->max_queue_pairs; i++) {
2399
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
W
Willem de Bruijn 已提交
2400 2401 2402
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
2403 2404
	}

2405
	netif_tx_lock_bh(vi->dev);
2406
	netif_device_attach(vi->dev);
2407
	netif_tx_unlock_bh(vi->dev);
2408 2409 2410
	return err;
}

2411 2412 2413
static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
{
	struct scatterlist sg;
2414
	vi->ctrl->offloads = cpu_to_virtio64(vi->vdev, offloads);
2415

2416
	sg_init_one(&sg, &vi->ctrl->offloads, sizeof(vi->ctrl->offloads));
2417 2418 2419

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
				  VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
2420
		dev_warn(&vi->dev->dev, "Fail to set guest offload.\n");
2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446
		return -EINVAL;
	}

	return 0;
}

static int virtnet_clear_guest_offloads(struct virtnet_info *vi)
{
	u64 offloads = 0;

	if (!vi->guest_offloads)
		return 0;

	return virtnet_set_guest_offloads(vi, offloads);
}

static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
{
	u64 offloads = vi->guest_offloads;

	if (!vi->guest_offloads)
		return 0;

	return virtnet_set_guest_offloads(vi, offloads);
}

2447 2448
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			   struct netlink_ext_ack *extack)
J
John Fastabend 已提交
2449 2450 2451 2452
{
	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
	struct virtnet_info *vi = netdev_priv(dev);
	struct bpf_prog *old_prog;
2453
	u16 xdp_qp = 0, curr_qp;
2454
	int i, err;
J
John Fastabend 已提交
2455

2456 2457 2458 2459
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
	    && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	        virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	        virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
2460 2461 2462
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))) {
		NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO/CSUM, disable LRO/CSUM first");
J
John Fastabend 已提交
2463 2464 2465 2466
		return -EOPNOTSUPP;
	}

	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
2467
		NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
J
John Fastabend 已提交
2468 2469 2470 2471
		return -EINVAL;
	}

	if (dev->mtu > max_sz) {
2472
		NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
J
John Fastabend 已提交
2473 2474 2475 2476
		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
		return -EINVAL;
	}

2477 2478 2479 2480 2481 2482
	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
	if (prog)
		xdp_qp = nr_cpu_ids;

	/* XDP requires extra queues for XDP_TX */
	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
2483
		netdev_warn(dev, "XDP request %i queues but max is %i. XDP_TX and XDP_REDIRECT will operate in a slower locked tx mode.\n",
2484
			    curr_qp + xdp_qp, vi->max_queue_pairs);
2485
		xdp_qp = 0;
2486 2487
	}

2488 2489 2490 2491
	old_prog = rtnl_dereference(vi->rq[0].xdp_prog);
	if (!prog && !old_prog)
		return 0;

2492 2493
	if (prog)
		bpf_prog_add(prog, vi->max_queue_pairs - 1);
2494

2495
	/* Make sure NAPI is not using any XDP TX queues for RX. */
2496 2497
	if (netif_running(dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
2498
			napi_disable(&vi->rq[i].napi);
2499 2500 2501
			virtnet_napi_tx_disable(&vi->sq[i].napi);
		}
	}
J
John Fastabend 已提交
2502

2503 2504 2505 2506 2507 2508 2509 2510
	if (!prog) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
			if (i == 0)
				virtnet_restore_guest_offloads(vi);
		}
		synchronize_net();
	}
J
John Fastabend 已提交
2511

2512 2513 2514
	err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
	if (err)
		goto err;
2515
	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
2516
	vi->xdp_queue_pairs = xdp_qp;
2517

2518
	if (prog) {
2519
		vi->xdp_enabled = true;
2520 2521 2522
		for (i = 0; i < vi->max_queue_pairs; i++) {
			rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
			if (i == 0 && !old_prog)
2523 2524
				virtnet_clear_guest_offloads(vi);
		}
2525 2526
	} else {
		vi->xdp_enabled = false;
2527 2528 2529
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
J
John Fastabend 已提交
2530 2531
		if (old_prog)
			bpf_prog_put(old_prog);
2532
		if (netif_running(dev)) {
2533
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
2534 2535 2536
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
J
John Fastabend 已提交
2537 2538 2539
	}

	return 0;
2540

2541
err:
2542 2543 2544 2545 2546 2547
	if (!prog) {
		virtnet_clear_guest_offloads(vi);
		for (i = 0; i < vi->max_queue_pairs; i++)
			rcu_assign_pointer(vi->rq[i].xdp_prog, old_prog);
	}

2548
	if (netif_running(dev)) {
2549
		for (i = 0; i < vi->max_queue_pairs; i++) {
2550
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
2551 2552 2553
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
2554
	}
2555 2556 2557
	if (prog)
		bpf_prog_sub(prog, vi->max_queue_pairs - 1);
	return err;
J
John Fastabend 已提交
2558 2559
}

2560
static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
J
John Fastabend 已提交
2561 2562 2563
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
2564
		return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
J
John Fastabend 已提交
2565 2566 2567 2568 2569
	default:
		return -EINVAL;
	}
}

2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585
static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
				      size_t len)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int ret;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
		return -EOPNOTSUPP;

	ret = snprintf(buf, len, "sby");
	if (ret >= len)
		return -EOPNOTSUPP;

	return 0;
}

2586 2587 2588 2589
static int virtnet_set_features(struct net_device *dev,
				netdev_features_t features)
{
	struct virtnet_info *vi = netdev_priv(dev);
2590
	u64 offloads;
2591 2592
	int err;

2593
	if ((dev->features ^ features) & NETIF_F_LRO) {
2594
		if (vi->xdp_enabled)
2595 2596
			return -EBUSY;

2597
		if (features & NETIF_F_LRO)
2598
			offloads = vi->guest_offloads_capable;
2599
		else
2600 2601
			offloads = vi->guest_offloads_capable &
				   ~GUEST_OFFLOAD_LRO_MASK;
2602

2603 2604 2605 2606
		err = virtnet_set_guest_offloads(vi, offloads);
		if (err)
			return err;
		vi->guest_offloads = offloads;
2607 2608 2609 2610 2611
	}

	return 0;
}

2612 2613 2614 2615 2616
static const struct net_device_ops virtnet_netdev = {
	.ndo_open            = virtnet_open,
	.ndo_stop   	     = virtnet_close,
	.ndo_start_xmit      = start_xmit,
	.ndo_validate_addr   = eth_validate_addr,
2617
	.ndo_set_mac_address = virtnet_set_mac_address,
2618
	.ndo_set_rx_mode     = virtnet_set_rx_mode,
2619
	.ndo_get_stats64     = virtnet_stats,
2620 2621
	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
2622
	.ndo_bpf		= virtnet_xdp,
J
Jason Wang 已提交
2623
	.ndo_xdp_xmit		= virtnet_xdp_xmit,
2624
	.ndo_features_check	= passthru_features_check,
2625
	.ndo_get_phys_port_name	= virtnet_get_phys_port_name,
2626
	.ndo_set_features	= virtnet_set_features,
2627 2628
};

2629
static void virtnet_config_changed_work(struct work_struct *work)
2630
{
2631 2632
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, config_work);
2633 2634
	u16 v;

2635 2636
	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
				 struct virtio_net_config, status, &v) < 0)
M
Michael S. Tsirkin 已提交
2637
		return;
2638 2639

	if (v & VIRTIO_NET_S_ANNOUNCE) {
2640
		netdev_notify_peers(vi->dev);
2641 2642
		virtnet_ack_link_announce(vi);
	}
2643 2644 2645 2646 2647

	/* Ignore unknown (future) status bits */
	v &= VIRTIO_NET_S_LINK_UP;

	if (vi->status == v)
M
Michael S. Tsirkin 已提交
2648
		return;
2649 2650 2651 2652

	vi->status = v;

	if (vi->status & VIRTIO_NET_S_LINK_UP) {
2653
		virtnet_update_settings(vi);
2654
		netif_carrier_on(vi->dev);
J
Jason Wang 已提交
2655
		netif_tx_wake_all_queues(vi->dev);
2656 2657
	} else {
		netif_carrier_off(vi->dev);
J
Jason Wang 已提交
2658
		netif_tx_stop_all_queues(vi->dev);
2659 2660 2661 2662 2663 2664 2665
	}
}

static void virtnet_config_changed(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

2666
	schedule_work(&vi->config_work);
2667 2668
}

J
Jason Wang 已提交
2669 2670
static void virtnet_free_queues(struct virtnet_info *vi)
{
2671 2672
	int i;

2673
	for (i = 0; i < vi->max_queue_pairs; i++) {
2674 2675
		__netif_napi_del(&vi->rq[i].napi);
		__netif_napi_del(&vi->sq[i].napi);
2676
	}
2677

2678
	/* We called __netif_napi_del(),
2679 2680 2681 2682
	 * we need to respect an RCU grace period before freeing vi->rq
	 */
	synchronize_net();

J
Jason Wang 已提交
2683 2684
	kfree(vi->rq);
	kfree(vi->sq);
2685
	kfree(vi->ctrl);
J
Jason Wang 已提交
2686 2687
}

2688
static void _free_receive_bufs(struct virtnet_info *vi)
J
Jason Wang 已提交
2689
{
J
John Fastabend 已提交
2690
	struct bpf_prog *old_prog;
J
Jason Wang 已提交
2691 2692 2693 2694 2695
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		while (vi->rq[i].pages)
			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
J
John Fastabend 已提交
2696 2697 2698 2699 2700

		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
		if (old_prog)
			bpf_prog_put(old_prog);
J
Jason Wang 已提交
2701
	}
2702 2703 2704 2705 2706 2707
}

static void free_receive_bufs(struct virtnet_info *vi)
{
	rtnl_lock();
	_free_receive_bufs(vi);
J
John Fastabend 已提交
2708
	rtnl_unlock();
J
Jason Wang 已提交
2709 2710
}

2711 2712 2713 2714 2715 2716 2717 2718
static void free_receive_page_frags(struct virtnet_info *vi)
{
	int i;
	for (i = 0; i < vi->max_queue_pairs; i++)
		if (vi->rq[i].alloc_frag.page)
			put_page(vi->rq[i].alloc_frag.page);
}

J
Jason Wang 已提交
2719 2720 2721 2722 2723 2724 2725
static void free_unused_bufs(struct virtnet_info *vi)
{
	void *buf;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->sq[i].vq;
J
John Fastabend 已提交
2726
		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
2727
			if (!is_xdp_frame(buf))
J
John Fastabend 已提交
2728 2729
				dev_kfree_skb(buf);
			else
2730
				xdp_return_frame(ptr_to_xdp(buf));
J
John Fastabend 已提交
2731
		}
J
Jason Wang 已提交
2732 2733 2734 2735 2736 2737
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->rq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
2738
			if (vi->mergeable_rx_bufs) {
2739
				put_page(virt_to_head_page(buf));
2740
			} else if (vi->big_packets) {
2741
				give_pages(&vi->rq[i], buf);
2742
			} else {
2743
				put_page(virt_to_head_page(buf));
2744
			}
J
Jason Wang 已提交
2745 2746 2747 2748
		}
	}
}

2749 2750 2751 2752
static void virtnet_del_vqs(struct virtnet_info *vi)
{
	struct virtio_device *vdev = vi->vdev;

2753
	virtnet_clean_affinity(vi);
J
Jason Wang 已提交
2754

2755
	vdev->config->del_vqs(vdev);
J
Jason Wang 已提交
2756 2757

	virtnet_free_queues(vi);
2758 2759
}

2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771
/* How large should a single buffer be so a queue full of these can fit at
 * least one full packet?
 * Logic below assumes the mergeable buffer header is used.
 */
static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
{
	const unsigned int hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int rq_size = virtqueue_get_vring_size(vq);
	unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu;
	unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len;
	unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size);

2772 2773
	return max(max(min_buf_len, hdr_len) - hdr_len,
		   (unsigned int)GOOD_PACKET_LEN);
2774 2775
}

J
Jason Wang 已提交
2776
static int virtnet_find_vqs(struct virtnet_info *vi)
2777
{
J
Jason Wang 已提交
2778 2779 2780 2781 2782
	vq_callback_t **callbacks;
	struct virtqueue **vqs;
	int ret = -ENOMEM;
	int i, total_vqs;
	const char **names;
2783
	bool *ctx;
J
Jason Wang 已提交
2784 2785 2786 2787 2788 2789 2790 2791 2792

	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
	 * possible control vq.
	 */
	total_vqs = vi->max_queue_pairs * 2 +
		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);

	/* Allocate space for find_vqs parameters */
K
Kees Cook 已提交
2793
	vqs = kcalloc(total_vqs, sizeof(*vqs), GFP_KERNEL);
J
Jason Wang 已提交
2794 2795
	if (!vqs)
		goto err_vq;
2796
	callbacks = kmalloc_array(total_vqs, sizeof(*callbacks), GFP_KERNEL);
J
Jason Wang 已提交
2797 2798
	if (!callbacks)
		goto err_callback;
2799
	names = kmalloc_array(total_vqs, sizeof(*names), GFP_KERNEL);
J
Jason Wang 已提交
2800 2801
	if (!names)
		goto err_names;
2802
	if (!vi->big_packets || vi->mergeable_rx_bufs) {
K
Kees Cook 已提交
2803
		ctx = kcalloc(total_vqs, sizeof(*ctx), GFP_KERNEL);
2804 2805 2806 2807 2808
		if (!ctx)
			goto err_ctx;
	} else {
		ctx = NULL;
	}
J
Jason Wang 已提交
2809 2810 2811 2812 2813 2814

	/* Parameters for control virtqueue, if any */
	if (vi->has_cvq) {
		callbacks[total_vqs - 1] = NULL;
		names[total_vqs - 1] = "control";
	}
2815

J
Jason Wang 已提交
2816 2817 2818 2819 2820 2821 2822 2823
	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
2824 2825
		if (ctx)
			ctx[rxq2vq(i)] = true;
J
Jason Wang 已提交
2826
	}
2827

J
Jason Wang 已提交
2828
	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
2829
					 names, ctx, NULL);
J
Jason Wang 已提交
2830 2831
	if (ret)
		goto err_find;
2832

J
Jason Wang 已提交
2833 2834
	if (vi->has_cvq) {
		vi->cvq = vqs[total_vqs - 1];
2835
		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
2836
			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
2837
	}
J
Jason Wang 已提交
2838 2839 2840

	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].vq = vqs[rxq2vq(i)];
2841
		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
J
Jason Wang 已提交
2842 2843 2844
		vi->sq[i].vq = vqs[txq2vq(i)];
	}

2845
	/* run here: ret == 0. */
J
Jason Wang 已提交
2846 2847 2848


err_find:
2849 2850
	kfree(ctx);
err_ctx:
J
Jason Wang 已提交
2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;
}

static int virtnet_alloc_queues(struct virtnet_info *vi)
{
	int i;

2864 2865 2866
	vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL);
	if (!vi->ctrl)
		goto err_ctrl;
K
Kees Cook 已提交
2867
	vi->sq = kcalloc(vi->max_queue_pairs, sizeof(*vi->sq), GFP_KERNEL);
J
Jason Wang 已提交
2868 2869
	if (!vi->sq)
		goto err_sq;
K
Kees Cook 已提交
2870
	vi->rq = kcalloc(vi->max_queue_pairs, sizeof(*vi->rq), GFP_KERNEL);
2871
	if (!vi->rq)
J
Jason Wang 已提交
2872 2873 2874 2875 2876 2877 2878
		goto err_rq;

	INIT_DELAYED_WORK(&vi->refill, refill_work);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].pages = NULL;
		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
			       napi_weight);
2879 2880
		netif_tx_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx,
				  napi_tx ? napi_weight : 0);
J
Jason Wang 已提交
2881 2882

		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
J
Johannes Berg 已提交
2883
		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
J
Jason Wang 已提交
2884
		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
T
Toshiaki Makita 已提交
2885 2886 2887

		u64_stats_init(&vi->rq[i].stats.syncp);
		u64_stats_init(&vi->sq[i].stats.syncp);
J
Jason Wang 已提交
2888 2889 2890 2891 2892 2893 2894
	}

	return 0;

err_rq:
	kfree(vi->sq);
err_sq:
2895 2896
	kfree(vi->ctrl);
err_ctrl:
J
Jason Wang 已提交
2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912
	return -ENOMEM;
}

static int init_vqs(struct virtnet_info *vi)
{
	int ret;

	/* Allocate send & receive queues */
	ret = virtnet_alloc_queues(vi);
	if (ret)
		goto err;

	ret = virtnet_find_vqs(vi);
	if (ret)
		goto err_free;

2913
	get_online_cpus();
2914
	virtnet_set_affinity(vi);
2915 2916
	put_online_cpus();

J
Jason Wang 已提交
2917 2918 2919 2920 2921 2922
	return 0;

err_free:
	virtnet_free_queues(vi);
err:
	return ret;
2923 2924
}

2925 2926
#ifdef CONFIG_SYSFS
static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
2927
		char *buf)
2928 2929 2930
{
	struct virtnet_info *vi = netdev_priv(queue->dev);
	unsigned int queue_index = get_netdev_rx_queue_index(queue);
2931 2932
	unsigned int headroom = virtnet_get_headroom(vi);
	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
J
Johannes Berg 已提交
2933
	struct ewma_pkt_len *avg;
2934 2935 2936

	BUG_ON(queue_index >= vi->max_queue_pairs);
	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
2937
	return sprintf(buf, "%u\n",
2938 2939
		       get_mergeable_buf_len(&vi->rq[queue_index], avg,
				       SKB_DATA_ALIGN(headroom + tailroom)));
2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955
}

static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
	__ATTR_RO(mergeable_rx_buffer_size);

static struct attribute *virtio_net_mrg_rx_attrs[] = {
	&mergeable_rx_buffer_size_attribute.attr,
	NULL
};

static const struct attribute_group virtio_net_mrg_rx_group = {
	.name = "virtio_net",
	.attrs = virtio_net_mrg_rx_attrs
};
#endif

2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989
static bool virtnet_fail_on_feature(struct virtio_device *vdev,
				    unsigned int fbit,
				    const char *fname, const char *dname)
{
	if (!virtio_has_feature(vdev, fbit))
		return false;

	dev_err(&vdev->dev, "device advertises feature %s but not %s",
		fname, dname);

	return true;
}

#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)

static bool virtnet_validate_features(struct virtio_device *vdev)
{
	if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
	    (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
			     "VIRTIO_NET_F_CTRL_VQ"))) {
		return false;
	}

	return true;
}

2990 2991 2992
#define MIN_MTU ETH_MIN_MTU
#define MAX_MTU ETH_MAX_MTU

2993
static int virtnet_validate(struct virtio_device *vdev)
R
Rusty Russell 已提交
2994
{
2995 2996 2997 2998 2999 3000
	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

3001 3002 3003
	if (!virtnet_validate_features(vdev))
		return -EINVAL;

3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		int mtu = virtio_cread16(vdev,
					 offsetof(struct virtio_net_config,
						  mtu));
		if (mtu < MIN_MTU)
			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
	}

	return 0;
}

static int virtnet_probe(struct virtio_device *vdev)
{
T
Toshiaki Makita 已提交
3017
	int i, err = -ENOMEM;
3018 3019 3020 3021 3022
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;
	int mtu;

J
Jason Wang 已提交
3023
	/* Find if host supports multiqueue virtio_net device */
3024 3025 3026
	err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
				   struct virtio_net_config,
				   max_virtqueue_pairs, &max_queue_pairs);
J
Jason Wang 已提交
3027 3028 3029 3030 3031 3032

	/* We need at least 2 queue's */
	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;
R
Rusty Russell 已提交
3033 3034

	/* Allocate ourselves a network device with room for our info */
J
Jason Wang 已提交
3035
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
R
Rusty Russell 已提交
3036 3037 3038 3039
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
3040 3041
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE |
			   IFF_TX_SKB_NO_LINEAR;
3042
	dev->netdev_ops = &virtnet_netdev;
R
Rusty Russell 已提交
3043
	dev->features = NETIF_F_HIGHDMA;
3044

3045
	dev->ethtool_ops = &virtnet_ethtool_ops;
R
Rusty Russell 已提交
3046 3047 3048
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
3049
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
R
Rusty Russell 已提交
3050
		/* This opens up the world of extra features. */
J
Jason Wang 已提交
3051
		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
3052
		if (csum)
J
Jason Wang 已提交
3053
			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;
3054 3055

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
3056
			dev->hw_features |= NETIF_F_TSO
R
Rusty Russell 已提交
3057 3058
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
3059
		/* Individual feature bits: what can host handle? */
3060 3061 3062 3063 3064 3065 3066
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;

3067 3068
		dev->features |= NETIF_F_GSO_ROBUST;

3069
		if (gso)
3070
			dev->features |= dev->hw_features & NETIF_F_ALL_TSO;
3071
		/* (!csum && gso) case will be fixed by register_netdev() */
R
Rusty Russell 已提交
3072
	}
3073 3074
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
		dev->features |= NETIF_F_RXCSUM;
3075 3076 3077
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6))
		dev->features |= NETIF_F_LRO;
3078
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS))
3079
		dev->hw_features |= NETIF_F_LRO;
R
Rusty Russell 已提交
3080

3081 3082
	dev->vlan_features = dev->features;

3083 3084 3085 3086
	/* MTU range: 68 - 65535 */
	dev->min_mtu = MIN_MTU;
	dev->max_mtu = MAX_MTU;

R
Rusty Russell 已提交
3087
	/* Configuration may specify what MAC to use.  Otherwise random. */
3088 3089 3090 3091 3092
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
		virtio_cread_bytes(vdev,
				   offsetof(struct virtio_net_config, mac),
				   dev->dev_addr, dev->addr_len);
	else
3093
		eth_hw_addr_random(dev);
R
Rusty Russell 已提交
3094 3095 3096 3097 3098

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
3099
	vdev->priv = vi;
3100

3101
	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
R
Rusty Russell 已提交
3102

3103
	/* If we can receive ANY GSO packets, we must allocate large ones. */
3104 3105
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
3106 3107
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
3108 3109
		vi->big_packets = true;

3110 3111 3112
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

3113 3114
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
3115 3116 3117 3118
		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		vi->hdr_len = sizeof(struct virtio_net_hdr);

3119 3120
	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
3121 3122
		vi->any_header_sg = true;

J
Jason Wang 已提交
3123 3124 3125
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

3126 3127 3128 3129
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		mtu = virtio_cread16(vdev,
				     offsetof(struct virtio_net_config,
					      mtu));
3130
		if (mtu < dev->min_mtu) {
3131 3132 3133
			/* Should never trigger: MTU was previously validated
			 * in virtnet_validate.
			 */
3134 3135 3136
			dev_err(&vdev->dev,
				"device MTU appears to have changed it is now %d < %d",
				mtu, dev->min_mtu);
3137
			err = -EINVAL;
T
Toshiaki Makita 已提交
3138
			goto free;
3139
		}
3140

3141 3142 3143
		dev->mtu = mtu;
		dev->max_mtu = mtu;

3144 3145 3146
		/* TODO: size buffers correctly in this case. */
		if (dev->mtu > ETH_DATA_LEN)
			vi->big_packets = true;
3147 3148
	}

3149 3150
	if (vi->any_header_sg)
		dev->needed_headroom = vi->hdr_len;
3151

3152 3153 3154 3155 3156
	/* Enable multiqueue by default */
	if (num_online_cpus() >= max_queue_pairs)
		vi->curr_queue_pairs = max_queue_pairs;
	else
		vi->curr_queue_pairs = num_online_cpus();
J
Jason Wang 已提交
3157 3158 3159
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
3160
	err = init_vqs(vi);
3161
	if (err)
T
Toshiaki Makita 已提交
3162
		goto free;
R
Rusty Russell 已提交
3163

3164 3165 3166 3167
#ifdef CONFIG_SYSFS
	if (vi->mergeable_rx_bufs)
		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
#endif
3168 3169
	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);
J
Jason Wang 已提交
3170

3171 3172
	virtnet_init_settings(dev);

3173 3174
	if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
		vi->failover = net_failover_create(vi->dev);
3175 3176
		if (IS_ERR(vi->failover)) {
			err = PTR_ERR(vi->failover);
3177
			goto free_vqs;
3178
		}
3179 3180
	}

R
Rusty Russell 已提交
3181 3182 3183
	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
3184
		goto free_failover;
R
Rusty Russell 已提交
3185
	}
3186

M
Michael S. Tsirkin 已提交
3187 3188
	virtio_device_ready(vdev);

3189
	err = virtnet_cpu_notif_add(vi);
3190 3191
	if (err) {
		pr_debug("virtio_net: registering cpu notifier failed\n");
3192
		goto free_unregister_netdev;
3193 3194
	}

3195
	virtnet_set_queues(vi, vi->curr_queue_pairs);
3196

J
Jason Wang 已提交
3197 3198
	/* Assume link up if device can't report link status,
	   otherwise get link status from config. */
3199
	netif_carrier_off(dev);
J
Jason Wang 已提交
3200
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
3201
		schedule_work(&vi->config_work);
J
Jason Wang 已提交
3202 3203
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
3204
		virtnet_update_settings(vi);
J
Jason Wang 已提交
3205 3206
		netif_carrier_on(dev);
	}
3207

3208 3209 3210
	for (i = 0; i < ARRAY_SIZE(guest_offloads); i++)
		if (virtio_has_feature(vi->vdev, guest_offloads[i]))
			set_bit(guest_offloads[i], &vi->guest_offloads);
3211
	vi->guest_offloads_capable = vi->guest_offloads;
3212

J
Jason Wang 已提交
3213 3214 3215
	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
		 dev->name, max_queue_pairs);

R
Rusty Russell 已提交
3216 3217
	return 0;

3218
free_unregister_netdev:
3219 3220
	vi->vdev->config->reset(vdev);

3221
	unregister_netdev(dev);
3222 3223
free_failover:
	net_failover_destroy(vi->failover);
3224
free_vqs:
J
Jason Wang 已提交
3225
	cancel_delayed_work_sync(&vi->refill);
3226
	free_receive_page_frags(vi);
3227
	virtnet_del_vqs(vi);
R
Rusty Russell 已提交
3228 3229 3230 3231 3232
free:
	free_netdev(dev);
	return err;
}

3233
static void remove_vq_common(struct virtnet_info *vi)
R
Rusty Russell 已提交
3234
{
3235
	vi->vdev->config->reset(vi->vdev);
S
Shirley Ma 已提交
3236 3237

	/* Free unused buffers in both send and recv, if any. */
3238
	free_unused_bufs(vi);
3239

J
Jason Wang 已提交
3240
	free_receive_bufs(vi);
3241

3242 3243
	free_receive_page_frags(vi);

J
Jason Wang 已提交
3244
	virtnet_del_vqs(vi);
3245 3246
}

3247
static void virtnet_remove(struct virtio_device *vdev)
3248 3249 3250
{
	struct virtnet_info *vi = vdev->priv;

3251
	virtnet_cpu_notif_remove(vi);
3252

3253 3254
	/* Make sure no work handler is accessing the device. */
	flush_work(&vi->config_work);
3255

3256 3257
	unregister_netdev(vi->dev);

3258 3259
	net_failover_destroy(vi->failover);

3260
	remove_vq_common(vi);
3261

3262
	free_netdev(vi->dev);
R
Rusty Russell 已提交
3263 3264
}

3265
static __maybe_unused int virtnet_freeze(struct virtio_device *vdev)
3266 3267 3268
{
	struct virtnet_info *vi = vdev->priv;

3269
	virtnet_cpu_notif_remove(vi);
3270
	virtnet_freeze_down(vdev);
3271 3272 3273 3274 3275
	remove_vq_common(vi);

	return 0;
}

3276
static __maybe_unused int virtnet_restore(struct virtio_device *vdev)
3277 3278
{
	struct virtnet_info *vi = vdev->priv;
3279
	int err;
3280

3281
	err = virtnet_restore_up(vdev);
3282 3283
	if (err)
		return err;
J
Jason Wang 已提交
3284 3285
	virtnet_set_queues(vi, vi->curr_queue_pairs);

3286
	err = virtnet_cpu_notif_add(vi);
3287 3288 3289
	if (err)
		return err;

3290 3291 3292
	return 0;
}

R
Rusty Russell 已提交
3293 3294 3295 3296 3297
static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

3298 3299 3300 3301 3302 3303 3304 3305 3306 3307
#define VIRTNET_FEATURES \
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
	VIRTIO_NET_F_MAC, \
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
	VIRTIO_NET_F_CTRL_MAC_ADDR, \
3308
	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
3309
	VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY
3310

3311
static unsigned int features[] = {
3312 3313 3314 3315 3316 3317
	VIRTNET_FEATURES,
};

static unsigned int features_legacy[] = {
	VIRTNET_FEATURES,
	VIRTIO_NET_F_GSO,
3318
	VIRTIO_F_ANY_LAYOUT,
3319 3320
};

3321
static struct virtio_driver virtio_net_driver = {
3322 3323
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
3324 3325
	.feature_table_legacy = features_legacy,
	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
R
Rusty Russell 已提交
3326 3327 3328
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
3329
	.validate =	virtnet_validate,
R
Rusty Russell 已提交
3330
	.probe =	virtnet_probe,
3331
	.remove =	virtnet_remove,
3332
	.config_changed = virtnet_config_changed,
3333
#ifdef CONFIG_PM_SLEEP
3334 3335 3336
	.freeze =	virtnet_freeze,
	.restore =	virtnet_restore,
#endif
R
Rusty Russell 已提交
3337 3338
};

3339 3340 3341 3342
static __init int virtio_net_driver_init(void)
{
	int ret;

T
Thomas Gleixner 已提交
3343
	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
3344 3345 3346 3347 3348
				      virtnet_cpu_online,
				      virtnet_cpu_down_prep);
	if (ret < 0)
		goto out;
	virtionet_online = ret;
T
Thomas Gleixner 已提交
3349
	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368
				      NULL, virtnet_cpu_dead);
	if (ret)
		goto err_dead;

        ret = register_virtio_driver(&virtio_net_driver);
	if (ret)
		goto err_virtio;
	return 0;
err_virtio:
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
err_dead:
	cpuhp_remove_multi_state(virtionet_online);
out:
	return ret;
}
module_init(virtio_net_driver_init);

static __exit void virtio_net_driver_exit(void)
{
A
Andrew Jones 已提交
3369
	unregister_virtio_driver(&virtio_net_driver);
3370 3371 3372 3373
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
	cpuhp_remove_multi_state(virtionet_online);
}
module_exit(virtio_net_driver_exit);
R
Rusty Russell 已提交
3374 3375 3376 3377

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");