virtio_net.c 86.7 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0-or-later
2
/* A network driver using virtio.
R
Rusty Russell 已提交
3 4 5 6 7 8
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
9
#include <linux/ethtool.h>
R
Rusty Russell 已提交
10 11 12
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
J
John Fastabend 已提交
13
#include <linux/bpf.h>
14
#include <linux/bpf_trace.h>
R
Rusty Russell 已提交
15
#include <linux/scatterlist.h>
16
#include <linux/if_vlan.h>
17
#include <linux/slab.h>
18
#include <linux/cpu.h>
19
#include <linux/average.h>
J
Jason Wang 已提交
20
#include <linux/filter.h>
21
#include <linux/kernel.h>
22
#include <net/route.h>
23
#include <net/xdp.h>
24
#include <net/net_failover.h>
R
Rusty Russell 已提交
25

26
static int napi_weight = NAPI_POLL_WEIGHT;
27 28
module_param(napi_weight, int, 0444);

29
static bool csum = true, gso = true, napi_tx = true;
R
Rusty Russell 已提交
30 31
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);
W
Willem de Bruijn 已提交
32
module_param(napi_tx, bool, 0644);
R
Rusty Russell 已提交
33

R
Rusty Russell 已提交
34
/* FIXME: MTU in config. */
35
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
36
#define GOOD_COPY_LEN	128
R
Rusty Russell 已提交
37

38 39
#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

40 41 42
/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
#define VIRTIO_XDP_HEADROOM 256

43 44 45 46
/* Separating two types of XDP xmit */
#define VIRTIO_XDP_TX		BIT(0)
#define VIRTIO_XDP_REDIR	BIT(1)

47 48
#define VIRTIO_XDP_FLAG	BIT(0)

J
Johannes Berg 已提交
49 50 51 52
/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
53
 */
54
DECLARE_EWMA(pkt_len, 0, 64)
55

56
#define VIRTNET_DRIVER_VERSION "1.0.0"
57

58 59 60 61
static const unsigned long guest_offloads[] = {
	VIRTIO_NET_F_GUEST_TSO4,
	VIRTIO_NET_F_GUEST_TSO6,
	VIRTIO_NET_F_GUEST_ECN,
62 63
	VIRTIO_NET_F_GUEST_UFO,
	VIRTIO_NET_F_GUEST_CSUM
64
};
65

66 67 68 69 70
#define GUEST_OFFLOAD_LRO_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
				(1ULL << VIRTIO_NET_F_GUEST_ECN)  | \
				(1ULL << VIRTIO_NET_F_GUEST_UFO))

T
Toshiaki Makita 已提交
71 72 73
struct virtnet_stat_desc {
	char desc[ETH_GSTRING_LEN];
	size_t offset;
74 75
};

T
Toshiaki Makita 已提交
76 77 78 79
struct virtnet_sq_stats {
	struct u64_stats_sync syncp;
	u64 packets;
	u64 bytes;
80 81
	u64 xdp_tx;
	u64 xdp_tx_drops;
T
Toshiaki Makita 已提交
82
	u64 kicks;
T
Toshiaki Makita 已提交
83 84
};

85 86
struct virtnet_rq_stats {
	struct u64_stats_sync syncp;
T
Toshiaki Makita 已提交
87 88
	u64 packets;
	u64 bytes;
89
	u64 drops;
90 91 92 93
	u64 xdp_packets;
	u64 xdp_tx;
	u64 xdp_redirects;
	u64 xdp_drops;
T
Toshiaki Makita 已提交
94
	u64 kicks;
T
Toshiaki Makita 已提交
95 96 97
};

#define VIRTNET_SQ_STAT(m)	offsetof(struct virtnet_sq_stats, m)
98
#define VIRTNET_RQ_STAT(m)	offsetof(struct virtnet_rq_stats, m)
T
Toshiaki Makita 已提交
99 100

static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
101 102 103 104
	{ "packets",		VIRTNET_SQ_STAT(packets) },
	{ "bytes",		VIRTNET_SQ_STAT(bytes) },
	{ "xdp_tx",		VIRTNET_SQ_STAT(xdp_tx) },
	{ "xdp_tx_drops",	VIRTNET_SQ_STAT(xdp_tx_drops) },
T
Toshiaki Makita 已提交
105
	{ "kicks",		VIRTNET_SQ_STAT(kicks) },
T
Toshiaki Makita 已提交
106 107 108
};

static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
109 110 111 112 113 114 115
	{ "packets",		VIRTNET_RQ_STAT(packets) },
	{ "bytes",		VIRTNET_RQ_STAT(bytes) },
	{ "drops",		VIRTNET_RQ_STAT(drops) },
	{ "xdp_packets",	VIRTNET_RQ_STAT(xdp_packets) },
	{ "xdp_tx",		VIRTNET_RQ_STAT(xdp_tx) },
	{ "xdp_redirects",	VIRTNET_RQ_STAT(xdp_redirects) },
	{ "xdp_drops",		VIRTNET_RQ_STAT(xdp_drops) },
T
Toshiaki Makita 已提交
116
	{ "kicks",		VIRTNET_RQ_STAT(kicks) },
T
Toshiaki Makita 已提交
117 118 119 120 121
};

#define VIRTNET_SQ_STATS_LEN	ARRAY_SIZE(virtnet_sq_stats_desc)
#define VIRTNET_RQ_STATS_LEN	ARRAY_SIZE(virtnet_rq_stats_desc)

122 123 124 125 126 127 128
/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send _queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];
J
Jason Wang 已提交
129 130 131

	/* Name of the send queue: output.$index */
	char name[40];
W
Willem de Bruijn 已提交
132

T
Toshiaki Makita 已提交
133 134
	struct virtnet_sq_stats stats;

W
Willem de Bruijn 已提交
135
	struct napi_struct napi;
136 137 138 139 140 141 142
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

R
Rusty Russell 已提交
143 144
	struct napi_struct napi;

J
John Fastabend 已提交
145 146
	struct bpf_prog __rcu *xdp_prog;

T
Toshiaki Makita 已提交
147 148
	struct virtnet_rq_stats stats;

149 150 151
	/* Chain pages by the private ptr. */
	struct page *pages;

152
	/* Average packet length for mergeable receive buffers. */
J
Johannes Berg 已提交
153
	struct ewma_pkt_len mrg_avg_pkt_len;
154

155 156 157
	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

158 159
	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];
J
Jason Wang 已提交
160

161 162 163
	/* Min single buffer size for mergeable buffers case. */
	unsigned int min_buf_len;

J
Jason Wang 已提交
164 165
	/* Name of this receive queue: input.$index */
	char name[40];
166 167

	struct xdp_rxq_info xdp_rxq;
168 169
};

170 171 172 173 174 175 176
/* Control VQ buffers: protected by the rtnl lock */
struct control_buf {
	struct virtio_net_ctrl_hdr hdr;
	virtio_net_ctrl_ack status;
	struct virtio_net_ctrl_mq mq;
	u8 promisc;
	u8 allmulti;
177
	__virtio16 vid;
178
	__virtio64 offloads;
179 180
};

181 182 183 184
struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
J
Jason Wang 已提交
185 186
	struct send_queue *sq;
	struct receive_queue *rq;
187 188
	unsigned int status;

J
Jason Wang 已提交
189 190 191 192 193 194
	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

195 196 197
	/* # of XDP queue pairs currently used by the driver */
	u16 xdp_queue_pairs;

198 199 200
	/* xdp_queue_pairs may be 0, when xdp is already loaded. So add this. */
	bool xdp_enabled;

201 202 203
	/* I like... big packets and I cannot lie! */
	bool big_packets;

204 205 206
	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

J
Jason Wang 已提交
207 208 209
	/* Has control virtqueue */
	bool has_cvq;

210 211 212
	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

213 214 215
	/* Packet virtio header size */
	u8 hdr_len;

216 217 218
	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

219 220 221
	/* Work struct for config space updates */
	struct work_struct config_work;

J
Jason Wang 已提交
222 223
	/* Does the affinity hint is set for virtqueues? */
	bool affinity_hint_set;
224

225 226 227
	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;
228

229
	struct control_buf *ctrl;
230 231 232 233

	/* Ethtool settings */
	u8 duplex;
	u32 speed;
234 235

	unsigned long guest_offloads;
236
	unsigned long guest_offloads_capable;
237 238 239

	/* failover when STANDBY feature enabled */
	struct failover *failover;
R
Rusty Russell 已提交
240 241
};

242
struct padded_vnet_hdr {
243
	struct virtio_net_hdr_mrg_rxbuf hdr;
244
	/*
245 246 247
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header.
248
	 */
249
	char padding[4];
250 251
};

252 253 254 255 256 257 258 259 260 261 262 263 264 265 266
static bool is_xdp_frame(void *ptr)
{
	return (unsigned long)ptr & VIRTIO_XDP_FLAG;
}

static void *xdp_to_ptr(struct xdp_frame *ptr)
{
	return (void *)((unsigned long)ptr | VIRTIO_XDP_FLAG);
}

static struct xdp_frame *ptr_to_xdp(void *ptr)
{
	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
}

J
Jason Wang 已提交
267 268 269 270 271
/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
272
	return (vq->index - 1) / 2;
J
Jason Wang 已提交
273 274 275 276 277 278 279 280 281
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
282
	return vq->index / 2;
J
Jason Wang 已提交
283 284 285 286 287 288 289
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}

290
static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
R
Rusty Russell 已提交
291
{
292
	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
R
Rusty Russell 已提交
293 294
}

295 296 297 298
/*
 * private is used to chain pages for big packets, put the whole
 * most recent used list in the beginning for reuse
 */
299
static void give_pages(struct receive_queue *rq, struct page *page)
300
{
301
	struct page *end;
302

303
	/* Find end of list, sew whole thing into vi->rq.pages. */
304
	for (end = page; end->private; end = (struct page *)end->private);
305 306
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
307 308
}

309
static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
310
{
311
	struct page *p = rq->pages;
312

313
	if (p) {
314
		rq->pages = (struct page *)p->private;
315 316 317
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
318 319 320 321
		p = alloc_page(gfp_mask);
	return p;
}

322 323 324 325 326 327 328 329 330 331 332 333 334 335 336
static void virtqueue_napi_schedule(struct napi_struct *napi,
				    struct virtqueue *vq)
{
	if (napi_schedule_prep(napi)) {
		virtqueue_disable_cb(vq);
		__napi_schedule(napi);
	}
}

static void virtqueue_napi_complete(struct napi_struct *napi,
				    struct virtqueue *vq, int processed)
{
	int opaque;

	opaque = virtqueue_enable_cb_prepare(vq);
337 338 339 340 341 342
	if (napi_complete_done(napi, processed)) {
		if (unlikely(virtqueue_poll(vq, opaque)))
			virtqueue_napi_schedule(napi, vq);
	} else {
		virtqueue_disable_cb(vq);
	}
343 344
}

345
static void skb_xmit_done(struct virtqueue *vq)
R
Rusty Russell 已提交
346
{
347
	struct virtnet_info *vi = vq->vdev->priv;
W
Willem de Bruijn 已提交
348
	struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;
R
Rusty Russell 已提交
349

350
	/* Suppress further interrupts. */
351
	virtqueue_disable_cb(vq);
352

W
Willem de Bruijn 已提交
353 354 355 356 357
	if (napi->weight)
		virtqueue_napi_schedule(napi, vq);
	else
		/* We were probably waiting for more output buffers. */
		netif_wake_subqueue(vi->dev, vq2txq(vq));
R
Rusty Russell 已提交
358 359
}

360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376
#define MRG_CTX_HEADER_SHIFT 22
static void *mergeable_len_to_ctx(unsigned int truesize,
				  unsigned int headroom)
{
	return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize);
}

static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx)
{
	return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT;
}

static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
{
	return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1);
}

377
/* Called from bottom half context */
M
Michael S. Tsirkin 已提交
378 379
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
				   struct receive_queue *rq,
380
				   struct page *page, unsigned int offset,
381
				   unsigned int len, unsigned int truesize,
382 383
				   bool hdr_valid, unsigned int metasize,
				   unsigned int headroom)
384 385
{
	struct sk_buff *skb;
386
	struct virtio_net_hdr_mrg_rxbuf *hdr;
387
	unsigned int copy, hdr_len, hdr_padded_len;
388
	struct page *page_to_free = NULL;
389 390
	int tailroom, shinfo_size;
	char *p, *hdr_p;
391

392
	p = page_address(page) + offset;
393
	hdr_p = p;
394

395 396
	hdr_len = vi->hdr_len;
	if (vi->mergeable_rx_bufs)
397
		hdr_padded_len = sizeof(*hdr);
398
	else
399
		hdr_padded_len = sizeof(struct padded_vnet_hdr);
400

401 402 403 404 405 406 407 408 409 410 411
	/* If headroom is not 0, there is an offset between the beginning of the
	 * data and the allocated space, otherwise the data and the allocated
	 * space are aligned.
	 */
	if (headroom) {
		/* The actual allocated space size is PAGE_SIZE. */
		truesize = PAGE_SIZE;
		tailroom = truesize - len - offset;
	} else {
		tailroom = truesize - len;
	}
412

413
	len -= hdr_len;
414 415
	offset += hdr_padded_len;
	p += hdr_padded_len;
416

417 418
	shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

419
	if (!NET_IP_ALIGN && len > GOOD_COPY_LEN && tailroom >= shinfo_size) {
420 421 422 423 424 425 426 427 428 429 430 431 432
		skb = build_skb(p, truesize);
		if (unlikely(!skb))
			return NULL;

		skb_put(skb, len);
		goto ok;
	}

	/* copy small packet so we can reuse these pages for small data */
	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
	if (unlikely(!skb))
		return NULL;

433 434 435 436 437 438 439
	/* Copy all frame if it fits skb->head, otherwise
	 * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
	 */
	if (len <= skb_tailroom(skb))
		copy = len;
	else
		copy = ETH_HLEN + metasize;
440
	skb_put_data(skb, p, copy);
441

442 443
	len -= copy;
	offset += copy;
444

445 446 447 448
	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
449
			page_to_free = page;
450
		goto ok;
451 452
	}

453 454 455 456 457 458 459
	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
460
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
461 462 463
		dev_kfree_skb(skb);
		return NULL;
	}
464
	BUG_ON(offset >= PAGE_SIZE);
465
	while (len) {
466 467 468 469
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
470 471 472
		page = (struct page *)page->private;
		offset = 0;
	}
473

474
	if (page)
475
		give_pages(rq, page);
476

477 478 479 480 481 482
ok:
	/* hdr_valid means no XDP, so we can copy the vnet header */
	if (hdr_valid) {
		hdr = skb_vnet_hdr(skb);
		memcpy(hdr, hdr_p, hdr_len);
	}
483 484
	if (page_to_free)
		put_page(page_to_free);
485 486 487 488 489 490

	if (metasize) {
		__skb_pull(skb, metasize);
		skb_metadata_set(skb, metasize);
	}

491 492
	return skb;
}
493

494 495 496
static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
				   struct send_queue *sq,
				   struct xdp_frame *xdpf)
J
John Fastabend 已提交
497 498 499 500
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	int err;

501 502 503 504 505
	if (unlikely(xdpf->headroom < vi->hdr_len))
		return -EOVERFLOW;

	/* Make room for virtqueue hdr (also change xdpf->headroom?) */
	xdpf->data -= vi->hdr_len;
506
	/* Zero header and leave csum up to XDP layers */
507
	hdr = xdpf->data;
508
	memset(hdr, 0, vi->hdr_len);
509
	xdpf->len   += vi->hdr_len;
510

511
	sg_init_one(sq->sg, xdpf->data, xdpf->len);
512

513 514
	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp_to_ptr(xdpf),
				   GFP_ATOMIC);
515
	if (unlikely(err))
516
		return -ENOSPC; /* Caller handle free/refcnt */
J
John Fastabend 已提交
517

518
	return 0;
J
John Fastabend 已提交
519 520
}

521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555
/* when vi->curr_queue_pairs > nr_cpu_ids, the txq/sq is only used for xdp tx on
 * the current cpu, so it does not need to be locked.
 *
 * Here we use marco instead of inline functions because we have to deal with
 * three issues at the same time: 1. the choice of sq. 2. judge and execute the
 * lock/unlock of txq 3. make sparse happy. It is difficult for two inline
 * functions to perfectly solve these three problems at the same time.
 */
#define virtnet_xdp_get_sq(vi) ({                                       \
	struct netdev_queue *txq;                                       \
	typeof(vi) v = (vi);                                            \
	unsigned int qp;                                                \
									\
	if (v->curr_queue_pairs > nr_cpu_ids) {                         \
		qp = v->curr_queue_pairs - v->xdp_queue_pairs;          \
		qp += smp_processor_id();                               \
		txq = netdev_get_tx_queue(v->dev, qp);                  \
		__netif_tx_acquire(txq);                                \
	} else {                                                        \
		qp = smp_processor_id() % v->curr_queue_pairs;          \
		txq = netdev_get_tx_queue(v->dev, qp);                  \
		__netif_tx_lock(txq, raw_smp_processor_id());           \
	}                                                               \
	v->sq + qp;                                                     \
})

#define virtnet_xdp_put_sq(vi, q) {                                     \
	struct netdev_queue *txq;                                       \
	typeof(vi) v = (vi);                                            \
									\
	txq = netdev_get_tx_queue(v->dev, (q) - v->sq);                 \
	if (v->curr_queue_pairs > nr_cpu_ids)                           \
		__netif_tx_release(txq);                                \
	else                                                            \
		__netif_tx_unlock(txq);                                 \
556 557
}

558
static int virtnet_xdp_xmit(struct net_device *dev,
559
			    int n, struct xdp_frame **frames, u32 flags)
J
Jason Wang 已提交
560 561
{
	struct virtnet_info *vi = netdev_priv(dev);
562 563
	struct receive_queue *rq = vi->rq;
	struct bpf_prog *xdp_prog;
564 565
	struct send_queue *sq;
	unsigned int len;
566 567
	int packets = 0;
	int bytes = 0;
568
	int nxmit = 0;
T
Toshiaki Makita 已提交
569
	int kicks = 0;
570
	void *ptr;
571
	int ret;
572 573
	int i;

574 575 576
	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
	 * indicate XDP resources have been successfully allocated.
	 */
577
	xdp_prog = rcu_access_pointer(rq->xdp_prog);
578 579 580
	if (!xdp_prog)
		return -ENXIO;

581
	sq = virtnet_xdp_get_sq(vi);
582 583 584

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
		ret = -EINVAL;
585 586
		goto out;
	}
587

588
	/* Free up any pending old buffers before queueing new ones. */
589
	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
590 591 592 593 594 595 596 597 598 599 600 601
		if (likely(is_xdp_frame(ptr))) {
			struct xdp_frame *frame = ptr_to_xdp(ptr);

			bytes += frame->len;
			xdp_return_frame(frame);
		} else {
			struct sk_buff *skb = ptr;

			bytes += skb->len;
			napi_consume_skb(skb, false);
		}
		packets++;
602
	}
603 604 605 606

	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];

607 608 609
		if (__virtnet_xdp_xmit_one(vi, sq, xdpf))
			break;
		nxmit++;
610
	}
611
	ret = nxmit;
612

T
Toshiaki Makita 已提交
613 614 615 616
	if (flags & XDP_XMIT_FLUSH) {
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
			kicks = 1;
	}
617 618
out:
	u64_stats_update_begin(&sq->stats.syncp);
619 620
	sq->stats.bytes += bytes;
	sq->stats.packets += packets;
621
	sq->stats.xdp_tx += n;
622
	sq->stats.xdp_tx_drops += n - nxmit;
T
Toshiaki Makita 已提交
623
	sq->stats.kicks += kicks;
624
	u64_stats_update_end(&sq->stats.syncp);
625

626
	virtnet_xdp_put_sq(vi, sq);
627
	return ret;
J
Jason Wang 已提交
628 629
}

630 631
static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
{
632
	return vi->xdp_enabled ? VIRTIO_XDP_HEADROOM : 0;
633 634
}

635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664
/* We copy the packet for XDP in the following cases:
 *
 * 1) Packet is scattered across multiple rx buffers.
 * 2) Headroom space is insufficient.
 *
 * This is inefficient but it's a temporary condition that
 * we hit right after XDP is enabled and until queue is refilled
 * with large buffers with sufficient headroom - so it should affect
 * at most queue size packets.
 * Afterwards, the conditions to enable
 * XDP should preclude the underlying device from sending packets
 * across multiple buffers (num_buf > 1), and we make sure buffers
 * have enough headroom.
 */
static struct page *xdp_linearize_page(struct receive_queue *rq,
				       u16 *num_buf,
				       struct page *p,
				       int offset,
				       int page_off,
				       unsigned int *len)
{
	struct page *page = alloc_page(GFP_ATOMIC);

	if (!page)
		return NULL;

	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
	page_off += *len;

	while (--*num_buf) {
665
		int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
666 667 668 669 670 671 672 673 674 675 676 677 678 679
		unsigned int buflen;
		void *buf;
		int off;

		buf = virtqueue_get_buf(rq->vq, &buflen);
		if (unlikely(!buf))
			goto err_buf;

		p = virt_to_head_page(buf);
		off = buf - page_address(p);

		/* guard against a misconfigured or uncooperative backend that
		 * is sending packet larger than the MTU.
		 */
680
		if ((page_off + buflen + tailroom) > PAGE_SIZE) {
681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698
			put_page(p);
			goto err_buf;
		}

		memcpy(page_address(page) + page_off,
		       page_address(p) + off, buflen);
		page_off += buflen;
		put_page(p);
	}

	/* Headroom does not contribute to packet length */
	*len = page_off - VIRTIO_XDP_HEADROOM;
	return page;
err_buf:
	__free_pages(page, 0);
	return NULL;
}

699 700 701
static struct sk_buff *receive_small(struct net_device *dev,
				     struct virtnet_info *vi,
				     struct receive_queue *rq,
702
				     void *buf, void *ctx,
J
Jason Wang 已提交
703
				     unsigned int len,
704
				     unsigned int *xdp_xmit,
705
				     struct virtnet_rq_stats *stats)
706
{
707
	struct sk_buff *skb;
708
	struct bpf_prog *xdp_prog;
709
	unsigned int xdp_headroom = (unsigned long)ctx;
710 711 712 713
	unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
	unsigned int headroom = vi->hdr_len + header_offset;
	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
714
	struct page *page = virt_to_head_page(buf);
715
	unsigned int delta = 0;
716
	struct page *xdp_page;
717
	int err;
718
	unsigned int metasize = 0;
719

720
	len -= vi->hdr_len;
721
	stats->bytes += len;
722

723 724 725
	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
726
		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
727
		struct xdp_frame *xdpf;
728
		struct xdp_buff xdp;
729
		void *orig_data;
730 731
		u32 act;

732
		if (unlikely(hdr->hdr.gso_type))
733
			goto err_xdp;
734

735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755
		if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
			int offset = buf - page_address(page) + header_offset;
			unsigned int tlen = len + vi->hdr_len;
			u16 num_buf = 1;

			xdp_headroom = virtnet_get_headroom(vi);
			header_offset = VIRTNET_RX_PAD + xdp_headroom;
			headroom = vi->hdr_len + header_offset;
			buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
				 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
			xdp_page = xdp_linearize_page(rq, &num_buf, page,
						      offset, header_offset,
						      &tlen);
			if (!xdp_page)
				goto err_xdp;

			buf = page_address(xdp_page);
			put_page(page);
			page = xdp_page;
		}

756
		xdp_init_buff(&xdp, buflen, &rq->xdp_rxq);
757 758
		xdp_prepare_buff(&xdp, buf + VIRTNET_RX_PAD + vi->hdr_len,
				 xdp_headroom, len, true);
759
		orig_data = xdp.data;
760
		act = bpf_prog_run_xdp(xdp_prog, &xdp);
761
		stats->xdp_packets++;
762

763 764
		switch (act) {
		case XDP_PASS:
765
			/* Recalculate length in case bpf program changed it */
766
			delta = orig_data - xdp.data;
767
			len = xdp.data_end - xdp.data;
768
			metasize = xdp.data - xdp.data_meta;
769 770
			break;
		case XDP_TX:
771
			stats->xdp_tx++;
772
			xdpf = xdp_convert_buff_to_frame(&xdp);
773 774
			if (unlikely(!xdpf))
				goto err_xdp;
775
			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
776 777 778
			if (unlikely(!err)) {
				xdp_return_frame_rx_napi(xdpf);
			} else if (unlikely(err < 0)) {
779
				trace_xdp_exception(vi->dev, xdp_prog, act);
780 781
				goto err_xdp;
			}
782
			*xdp_xmit |= VIRTIO_XDP_TX;
J
Jason Wang 已提交
783 784 785
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
786
			stats->xdp_redirects++;
J
Jason Wang 已提交
787
			err = xdp_do_redirect(dev, &xdp, xdp_prog);
788 789
			if (err)
				goto err_xdp;
790
			*xdp_xmit |= VIRTIO_XDP_REDIR;
791 792 793
			rcu_read_unlock();
			goto xdp_xmit;
		default:
794
			bpf_warn_invalid_xdp_action(act);
795
			fallthrough;
796 797
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
798
			goto err_xdp;
799
		case XDP_DROP:
800 801 802 803 804
			goto err_xdp;
		}
	}
	rcu_read_unlock();

805 806
	skb = build_skb(buf, buflen);
	if (!skb) {
807
		put_page(page);
808 809 810
		goto err;
	}
	skb_reserve(skb, headroom - delta);
811
	skb_put(skb, len);
812
	if (!xdp_prog) {
813 814
		buf += header_offset;
		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
815
	} /* keep zeroed vnet hdr since XDP is loaded */
816

817 818 819
	if (metasize)
		skb_metadata_set(skb, metasize);

820
err:
821
	return skb;
822 823 824

err_xdp:
	rcu_read_unlock();
825 826
	stats->xdp_drops++;
	stats->drops++;
827
	put_page(page);
828 829
xdp_xmit:
	return NULL;
830 831 832
}

static struct sk_buff *receive_big(struct net_device *dev,
M
Michael S. Tsirkin 已提交
833
				   struct virtnet_info *vi,
834 835
				   struct receive_queue *rq,
				   void *buf,
836
				   unsigned int len,
837
				   struct virtnet_rq_stats *stats)
838 839
{
	struct page *page = buf;
840
	struct sk_buff *skb =
841
		page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0);
J
John Fastabend 已提交
842

843
	stats->bytes += len - vi->hdr_len;
844 845 846 847 848 849
	if (unlikely(!skb))
		goto err;

	return skb;

err:
850
	stats->drops++;
851 852 853 854
	give_pages(rq, page);
	return NULL;
}

855
static struct sk_buff *receive_mergeable(struct net_device *dev,
M
Michael S. Tsirkin 已提交
856
					 struct virtnet_info *vi,
857
					 struct receive_queue *rq,
858 859
					 void *buf,
					 void *ctx,
J
Jason Wang 已提交
860
					 unsigned int len,
861
					 unsigned int *xdp_xmit,
862
					 struct virtnet_rq_stats *stats)
863
{
864 865
	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
866 867
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
J
John Fastabend 已提交
868 869
	struct sk_buff *head_skb, *curr_skb;
	struct bpf_prog *xdp_prog;
870
	unsigned int truesize = mergeable_ctx_to_truesize(ctx);
871
	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
872
	unsigned int metasize = 0;
873 874
	unsigned int frame_sz;
	int err;
J
John Fastabend 已提交
875

J
John Fastabend 已提交
876
	head_skb = NULL;
877
	stats->bytes += len - vi->hdr_len;
J
John Fastabend 已提交
878

J
John Fastabend 已提交
879 880 881
	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
882
		struct xdp_frame *xdpf;
883
		struct page *xdp_page;
884 885
		struct xdp_buff xdp;
		void *data;
J
John Fastabend 已提交
886 887
		u32 act;

888 889 890 891 892 893 894
		/* Transient failure which in theory could occur if
		 * in-flight packets from before XDP was enabled reach
		 * the receive path after XDP is loaded.
		 */
		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;

895 896 897 898 899
		/* Buffers with headroom use PAGE_SIZE as alloc size,
		 * see add_recvbuf_mergeable() + get_mergeable_buf_len()
		 */
		frame_sz = headroom ? PAGE_SIZE : truesize;

900 901 902 903 904 905
		/* This happens when rx buffer size is underestimated
		 * or headroom is not enough because of the buffer
		 * was refilled before XDP is set. This should only
		 * happen for the first several packets, so we don't
		 * care much about its performance.
		 */
906 907
		if (unlikely(num_buf > 1 ||
			     headroom < virtnet_get_headroom(vi))) {
908
			/* linearize data for XDP */
909
			xdp_page = xdp_linearize_page(rq, &num_buf,
910 911 912
						      page, offset,
						      VIRTIO_XDP_HEADROOM,
						      &len);
913 914
			frame_sz = PAGE_SIZE;

915 916
			if (!xdp_page)
				goto err_xdp;
917
			offset = VIRTIO_XDP_HEADROOM;
918 919
		} else {
			xdp_page = page;
J
John Fastabend 已提交
920 921
		}

922 923 924
		/* Allow consuming headroom but reserve enough space to push
		 * the descriptor on if we get an XDP_TX return code.
		 */
925
		data = page_address(xdp_page) + offset;
926
		xdp_init_buff(&xdp, frame_sz - vi->hdr_len, &rq->xdp_rxq);
927 928
		xdp_prepare_buff(&xdp, data - VIRTIO_XDP_HEADROOM + vi->hdr_len,
				 VIRTIO_XDP_HEADROOM, len - vi->hdr_len, true);
929

930
		act = bpf_prog_run_xdp(xdp_prog, &xdp);
931
		stats->xdp_packets++;
932

J
John Fastabend 已提交
933 934
		switch (act) {
		case XDP_PASS:
935 936
			metasize = xdp.data - xdp.data_meta;

937
			/* recalculate offset to account for any header
938 939 940
			 * adjustments and minus the metasize to copy the
			 * metadata in page_to_skb(). Note other cases do not
			 * build an skb and avoid using offset
941
			 */
942 943
			offset = xdp.data - page_address(xdp_page) -
				 vi->hdr_len - metasize;
944

945 946
			/* recalculate len if xdp.data, xdp.data_end or
			 * xdp.data_meta were adjusted
947
			 */
948
			len = xdp.data_end - xdp.data + vi->hdr_len + metasize;
949 950 951 952
			/* We can only create skb based on xdp_page. */
			if (unlikely(xdp_page != page)) {
				rcu_read_unlock();
				put_page(page);
953 954
				head_skb = page_to_skb(vi, rq, xdp_page, offset,
						       len, PAGE_SIZE, false,
955
						       metasize, headroom);
956 957
				return head_skb;
			}
J
John Fastabend 已提交
958 959
			break;
		case XDP_TX:
960
			stats->xdp_tx++;
961
			xdpf = xdp_convert_buff_to_frame(&xdp);
962 963
			if (unlikely(!xdpf))
				goto err_xdp;
964
			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
965 966 967
			if (unlikely(!err)) {
				xdp_return_frame_rx_napi(xdpf);
			} else if (unlikely(err < 0)) {
968
				trace_xdp_exception(vi->dev, xdp_prog, act);
969 970 971 972
				if (unlikely(xdp_page != page))
					put_page(xdp_page);
				goto err_xdp;
			}
973
			*xdp_xmit |= VIRTIO_XDP_TX;
974
			if (unlikely(xdp_page != page))
975
				put_page(page);
J
John Fastabend 已提交
976 977
			rcu_read_unlock();
			goto xdp_xmit;
978
		case XDP_REDIRECT:
979
			stats->xdp_redirects++;
980 981 982 983 984 985
			err = xdp_do_redirect(dev, &xdp, xdp_prog);
			if (err) {
				if (unlikely(xdp_page != page))
					put_page(xdp_page);
				goto err_xdp;
			}
986
			*xdp_xmit |= VIRTIO_XDP_REDIR;
987
			if (unlikely(xdp_page != page))
988
				put_page(page);
989 990
			rcu_read_unlock();
			goto xdp_xmit;
J
John Fastabend 已提交
991
		default:
992
			bpf_warn_invalid_xdp_action(act);
993
			fallthrough;
994 995
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
996
			fallthrough;
997
		case XDP_DROP:
998 999
			if (unlikely(xdp_page != page))
				__free_pages(xdp_page, 0);
J
John Fastabend 已提交
1000
			goto err_xdp;
J
John Fastabend 已提交
1001
		}
J
John Fastabend 已提交
1002 1003
	}
	rcu_read_unlock();
1004

1005
	if (unlikely(len > truesize)) {
1006
		pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
1007 1008 1009 1010
			 dev->name, len, (unsigned long)ctx);
		dev->stats.rx_length_errors++;
		goto err_skb;
	}
1011

1012
	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog,
1013
			       metasize, headroom);
J
John Fastabend 已提交
1014
	curr_skb = head_skb;
1015

1016 1017
	if (unlikely(!curr_skb))
		goto err_skb;
1018
	while (--num_buf) {
1019 1020
		int num_skb_frags;

1021
		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
1022
		if (unlikely(!buf)) {
1023
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
M
Michael S. Tsirkin 已提交
1024
				 dev->name, num_buf,
1025 1026
				 virtio16_to_cpu(vi->vdev,
						 hdr->num_buffers));
1027 1028
			dev->stats.rx_length_errors++;
			goto err_buf;
1029
		}
1030

1031
		stats->bytes += len;
1032
		page = virt_to_head_page(buf);
1033 1034 1035

		truesize = mergeable_ctx_to_truesize(ctx);
		if (unlikely(len > truesize)) {
1036
			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
1037 1038 1039 1040
				 dev->name, len, (unsigned long)ctx);
			dev->stats.rx_length_errors++;
			goto err_skb;
		}
1041 1042

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
1043 1044
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
1045 1046 1047

			if (unlikely(!nskb))
				goto err_skb;
1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
1059
			head_skb->truesize += truesize;
1060
		}
1061
		offset = buf - page_address(page);
1062 1063 1064
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
1065
					     len, truesize);
1066 1067
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
1068
					offset, len, truesize);
1069
		}
1070 1071
	}

J
Johannes Berg 已提交
1072
	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
1073 1074
	return head_skb;

J
John Fastabend 已提交
1075 1076
err_xdp:
	rcu_read_unlock();
1077
	stats->xdp_drops++;
1078 1079
err_skb:
	put_page(page);
1080
	while (num_buf-- > 1) {
1081 1082
		buf = virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!buf)) {
1083 1084 1085 1086 1087
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
1088
		stats->bytes += len;
1089
		page = virt_to_head_page(buf);
1090
		put_page(page);
1091
	}
1092
err_buf:
1093
	stats->drops++;
1094
	dev_kfree_skb(head_skb);
J
John Fastabend 已提交
1095
xdp_xmit:
1096
	return NULL;
1097 1098
}

1099 1100
static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
			void *buf, unsigned int len, void **ctx,
1101
			unsigned int *xdp_xmit,
1102
			struct virtnet_rq_stats *stats)
1103
{
1104
	struct net_device *dev = vi->dev;
1105
	struct sk_buff *skb;
1106
	struct virtio_net_hdr_mrg_rxbuf *hdr;
1107

1108
	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
1109 1110
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
1111
		if (vi->mergeable_rx_bufs) {
1112
			put_page(virt_to_head_page(buf));
1113
		} else if (vi->big_packets) {
1114
			give_pages(rq, buf);
1115
		} else {
1116
			put_page(virt_to_head_page(buf));
1117
		}
1118
		return;
1119
	}
1120

1121
	if (vi->mergeable_rx_bufs)
1122
		skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit,
1123
					stats);
1124
	else if (vi->big_packets)
1125
		skb = receive_big(dev, vi, rq, buf, len, stats);
1126
	else
1127
		skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats);
1128 1129

	if (unlikely(!skb))
1130
		return;
1131

1132
	hdr = skb_vnet_hdr(skb);
1133

1134
	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
1135
		skb->ip_summed = CHECKSUM_UNNECESSARY;
R
Rusty Russell 已提交
1136

1137 1138 1139 1140 1141 1142
	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
				  virtio_is_little_endian(vi->vdev))) {
		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
				     dev->name, hdr->hdr.gso_type,
				     hdr->hdr.gso_size);
		goto frame_err;
R
Rusty Russell 已提交
1143 1144
	}

1145
	skb_record_rx_queue(skb, vq2rxq(rq->vq));
1146 1147 1148 1149
	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

E
Eric Dumazet 已提交
1150
	napi_gro_receive(&rq->napi, skb);
1151
	return;
R
Rusty Russell 已提交
1152 1153 1154 1155 1156 1157

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
}

1158 1159 1160 1161 1162
/* Unlike mergeable buffers, all buffers are allocated to the
 * same size, except for the headroom. For this reason we do
 * not need to use  mergeable_len_to_ctx here - it is enough
 * to store the headroom as the context ignoring the truesize.
 */
M
Michael S. Tsirkin 已提交
1163 1164
static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
			     gfp_t gfp)
R
Rusty Russell 已提交
1165
{
1166 1167
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
1168
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
1169
	void *ctx = (void *)(unsigned long)xdp_headroom;
1170
	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
1171
	int err;
1172

1173 1174 1175
	len = SKB_DATA_ALIGN(len) +
	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
1176
		return -ENOMEM;
R
Rusty Russell 已提交
1177

1178 1179 1180 1181 1182
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
		    vi->hdr_len + GOOD_PACKET_LEN);
1183
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
1184
	if (err < 0)
1185
		put_page(virt_to_head_page(buf));
1186 1187
	return err;
}
1188

1189 1190
static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
			   gfp_t gfp)
1191 1192 1193 1194 1195
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

1196 1197
	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);

1198
	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
1199
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
1200
		first = get_a_page(rq, gfp);
1201 1202
		if (!first) {
			if (list)
1203
				give_pages(rq, list);
1204
			return -ENOMEM;
1205
		}
1206
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
1207

1208 1209 1210 1211
		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}
R
Rusty Russell 已提交
1212

1213
	first = get_a_page(rq, gfp);
1214
	if (!first) {
1215
		give_pages(rq, list);
1216 1217 1218 1219
		return -ENOMEM;
	}
	p = page_address(first);

1220
	/* rq->sg[0], rq->sg[1] share the same page */
1221 1222
	/* a separated rq->sg[0] for header - required in case !any_header_sg */
	sg_set_buf(&rq->sg[0], p, vi->hdr_len);
1223

1224
	/* rq->sg[1] for data packet, from offset */
1225
	offset = sizeof(struct padded_vnet_hdr);
1226
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
1227 1228 1229

	/* chain first in list head */
	first->private = (unsigned long)list;
1230 1231
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
1232
	if (err < 0)
1233
		give_pages(rq, first);
1234 1235

	return err;
R
Rusty Russell 已提交
1236 1237
}

1238
static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
1239 1240
					  struct ewma_pkt_len *avg_pkt_len,
					  unsigned int room)
1241
{
1242
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1243 1244
	unsigned int len;

1245 1246 1247 1248
	if (room)
		return PAGE_SIZE - room;

	len = hdr_len +	clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
1249
				rq->min_buf_len, PAGE_SIZE - hdr_len);
1250

1251
	return ALIGN(len, L1_CACHE_BYTES);
1252 1253
}

1254 1255
static int add_recvbuf_mergeable(struct virtnet_info *vi,
				 struct receive_queue *rq, gfp_t gfp)
1256
{
1257
	struct page_frag *alloc_frag = &rq->alloc_frag;
1258
	unsigned int headroom = virtnet_get_headroom(vi);
1259 1260
	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
1261
	char *buf;
1262
	void *ctx;
1263
	int err;
1264
	unsigned int len, hole;
1265

1266 1267 1268 1269 1270 1271
	/* Extra tailroom is needed to satisfy XDP's assumption. This
	 * means rx frags coalescing won't work, but consider we've
	 * disabled GSO for XDP, it won't be a big issue.
	 */
	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
1272
		return -ENOMEM;
1273

1274
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
1275
	buf += headroom; /* advance address leaving hole at front of pkt */
1276
	get_page(alloc_frag->page);
1277
	alloc_frag->offset += len + room;
1278
	hole = alloc_frag->size - alloc_frag->offset;
1279
	if (hole < len + room) {
1280 1281
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
1282
		 * the current buffer.
1283
		 */
1284 1285 1286
		len += hole;
		alloc_frag->offset += hole;
	}
1287

1288
	sg_init_one(rq->sg, buf, len);
1289
	ctx = mergeable_len_to_ctx(len, headroom);
1290
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
1291
	if (err < 0)
1292
		put_page(virt_to_head_page(buf));
1293

1294 1295
	return err;
}
1296

1297 1298 1299 1300 1301 1302 1303
/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
M
Michael S. Tsirkin 已提交
1304 1305
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
1306 1307
{
	int err;
1308
	bool oom;
1309

1310 1311
	do {
		if (vi->mergeable_rx_bufs)
1312
			err = add_recvbuf_mergeable(vi, rq, gfp);
1313
		else if (vi->big_packets)
1314
			err = add_recvbuf_big(vi, rq, gfp);
1315
		else
M
Michael S. Tsirkin 已提交
1316
			err = add_recvbuf_small(vi, rq, gfp);
1317

1318
		oom = err == -ENOMEM;
1319
		if (err)
1320
			break;
1321
	} while (rq->vq->num_free);
T
Toshiaki Makita 已提交
1322
	if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) {
1323 1324 1325
		unsigned long flags;

		flags = u64_stats_update_begin_irqsave(&rq->stats.syncp);
1326
		rq->stats.kicks++;
1327
		u64_stats_update_end_irqrestore(&rq->stats.syncp, flags);
T
Toshiaki Makita 已提交
1328 1329
	}

1330
	return !oom;
1331 1332
}

1333
static void skb_recv_done(struct virtqueue *rvq)
R
Rusty Russell 已提交
1334 1335
{
	struct virtnet_info *vi = rvq->vdev->priv;
J
Jason Wang 已提交
1336
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
1337

1338
	virtqueue_napi_schedule(&rq->napi, rvq);
R
Rusty Russell 已提交
1339 1340
}

1341
static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
1342
{
1343
	napi_enable(napi);
1344 1345

	/* If all buffers were filled by other side before we napi_enabled, we
1346 1347 1348 1349 1350 1351
	 * won't get another interrupt, so process any outstanding packets now.
	 * Call local_bh_enable after to trigger softIRQ processing.
	 */
	local_bh_disable();
	virtqueue_napi_schedule(napi, vq);
	local_bh_enable();
1352 1353
}

W
Willem de Bruijn 已提交
1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371
static void virtnet_napi_tx_enable(struct virtnet_info *vi,
				   struct virtqueue *vq,
				   struct napi_struct *napi)
{
	if (!napi->weight)
		return;

	/* Tx napi touches cachelines on the cpu handling tx interrupts. Only
	 * enable the feature if this is likely affine with the transmit path.
	 */
	if (!vi->affinity_hint_set) {
		napi->weight = 0;
		return;
	}

	return virtnet_napi_enable(vq, napi);
}

1372 1373 1374 1375 1376 1377
static void virtnet_napi_tx_disable(struct napi_struct *napi)
{
	if (napi->weight)
		napi_disable(napi);
}

1378 1379
static void refill_work(struct work_struct *work)
{
1380 1381
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
1382
	bool still_empty;
J
Jason Wang 已提交
1383 1384
	int i;

1385
	for (i = 0; i < vi->curr_queue_pairs; i++) {
J
Jason Wang 已提交
1386
		struct receive_queue *rq = &vi->rq[i];
1387

J
Jason Wang 已提交
1388
		napi_disable(&rq->napi);
M
Michael S. Tsirkin 已提交
1389
		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
1390
		virtnet_napi_enable(rq->vq, &rq->napi);
1391

J
Jason Wang 已提交
1392 1393 1394 1395 1396 1397
		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
1398 1399
}

1400 1401
static int virtnet_receive(struct receive_queue *rq, int budget,
			   unsigned int *xdp_xmit)
R
Rusty Russell 已提交
1402
{
1403
	struct virtnet_info *vi = rq->vq->vdev->priv;
1404
	struct virtnet_rq_stats stats = {};
1405
	unsigned int len;
1406
	void *buf;
1407
	int i;
R
Rusty Russell 已提交
1408

1409
	if (!vi->big_packets || vi->mergeable_rx_bufs) {
1410 1411
		void *ctx;

1412
		while (stats.packets < budget &&
1413
		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
1414
			receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
1415
			stats.packets++;
1416 1417
		}
	} else {
1418
		while (stats.packets < budget &&
1419
		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
1420
			receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
1421
			stats.packets++;
1422
		}
R
Rusty Russell 已提交
1423 1424
	}

1425
	if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
M
Michael S. Tsirkin 已提交
1426
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
1427
			schedule_delayed_work(&vi->refill, 0);
1428
	}
R
Rusty Russell 已提交
1429

T
Toshiaki Makita 已提交
1430
	u64_stats_update_begin(&rq->stats.syncp);
1431 1432 1433 1434
	for (i = 0; i < VIRTNET_RQ_STATS_LEN; i++) {
		size_t offset = virtnet_rq_stats_desc[i].offset;
		u64 *item;

1435 1436
		item = (u64 *)((u8 *)&rq->stats + offset);
		*item += *(u64 *)((u8 *)&stats + offset);
1437
	}
T
Toshiaki Makita 已提交
1438
	u64_stats_update_end(&rq->stats.syncp);
J
Jason Wang 已提交
1439

1440
	return stats.packets;
1441 1442
}

1443
static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
1444 1445 1446 1447
{
	unsigned int len;
	unsigned int packets = 0;
	unsigned int bytes = 0;
1448
	void *ptr;
1449

1450 1451 1452
	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		if (likely(!is_xdp_frame(ptr))) {
			struct sk_buff *skb = ptr;
1453

1454
			pr_debug("Sent skb %p\n", skb);
1455

1456 1457 1458 1459
			bytes += skb->len;
			napi_consume_skb(skb, in_napi);
		} else {
			struct xdp_frame *frame = ptr_to_xdp(ptr);
1460

1461 1462 1463 1464
			bytes += frame->len;
			xdp_return_frame(frame);
		}
		packets++;
1465 1466 1467 1468 1469 1470 1471 1472
	}

	/* Avoid overhead when no packets have been processed
	 * happens when called speculatively from start_xmit.
	 */
	if (!packets)
		return;

T
Toshiaki Makita 已提交
1473 1474 1475 1476
	u64_stats_update_begin(&sq->stats.syncp);
	sq->stats.bytes += bytes;
	sq->stats.packets += packets;
	u64_stats_update_end(&sq->stats.syncp);
1477 1478
}

1479 1480 1481 1482 1483 1484 1485 1486 1487 1488
static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
{
	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
		return false;
	else if (q < vi->curr_queue_pairs)
		return true;
	else
		return false;
}

1489 1490 1491 1492 1493 1494 1495
static void virtnet_poll_cleantx(struct receive_queue *rq)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	unsigned int index = vq2rxq(rq->vq);
	struct send_queue *sq = &vi->sq[index];
	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);

1496
	if (!sq->napi.weight || is_xdp_raw_buffer_queue(vi, index))
1497 1498 1499
		return;

	if (__netif_tx_trylock(txq)) {
1500
		free_old_xmit_skbs(sq, true);
1501 1502 1503 1504 1505 1506 1507
		__netif_tx_unlock(txq);
	}

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);
}

1508 1509 1510 1511
static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
1512 1513
	struct virtnet_info *vi = rq->vq->vdev->priv;
	struct send_queue *sq;
1514
	unsigned int received;
1515
	unsigned int xdp_xmit = 0;
1516

1517 1518
	virtnet_poll_cleantx(rq);

J
Jason Wang 已提交
1519
	received = virtnet_receive(rq, budget, &xdp_xmit);
1520

1521
	/* Out of packets? */
1522 1523
	if (received < budget)
		virtqueue_napi_complete(napi, rq->vq, received);
R
Rusty Russell 已提交
1524

1525
	if (xdp_xmit & VIRTIO_XDP_REDIR)
1526
		xdp_do_flush();
1527 1528

	if (xdp_xmit & VIRTIO_XDP_TX) {
1529
		sq = virtnet_xdp_get_sq(vi);
T
Toshiaki Makita 已提交
1530 1531 1532 1533 1534
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
			u64_stats_update_begin(&sq->stats.syncp);
			sq->stats.kicks++;
			u64_stats_update_end(&sq->stats.syncp);
		}
1535
		virtnet_xdp_put_sq(vi, sq);
1536
	}
J
Jason Wang 已提交
1537

R
Rusty Russell 已提交
1538 1539 1540
	return received;
}

J
Jason Wang 已提交
1541 1542 1543
static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
1544
	int i, err;
J
Jason Wang 已提交
1545

1546 1547 1548
	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
M
Michael S. Tsirkin 已提交
1549
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1550
				schedule_delayed_work(&vi->refill, 0);
1551

1552
		err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, vi->rq[i].napi.napi_id);
1553 1554 1555
		if (err < 0)
			return err;

1556 1557 1558 1559 1560 1561 1562
		err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
						 MEM_TYPE_PAGE_SHARED, NULL);
		if (err < 0) {
			xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
			return err;
		}

1563
		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
W
Willem de Bruijn 已提交
1564
		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
J
Jason Wang 已提交
1565 1566 1567 1568 1569
	}

	return 0;
}

W
Willem de Bruijn 已提交
1570 1571 1572 1573
static int virtnet_poll_tx(struct napi_struct *napi, int budget)
{
	struct send_queue *sq = container_of(napi, struct send_queue, napi);
	struct virtnet_info *vi = sq->vq->vdev->priv;
1574 1575
	unsigned int index = vq2txq(sq->vq);
	struct netdev_queue *txq;
W
Willem de Bruijn 已提交
1576

1577 1578 1579 1580 1581 1582 1583
	if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
		/* We don't need to enable cb for XDP */
		napi_complete_done(napi, 0);
		return 0;
	}

	txq = netdev_get_tx_queue(vi->dev, index);
W
Willem de Bruijn 已提交
1584
	__netif_tx_lock(txq, raw_smp_processor_id());
1585
	free_old_xmit_skbs(sq, true);
W
Willem de Bruijn 已提交
1586 1587 1588 1589 1590 1591 1592 1593 1594 1595
	__netif_tx_unlock(txq);

	virtqueue_napi_complete(napi, sq->vq, 0);

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);

	return 0;
}

1596
static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
R
Rusty Russell 已提交
1597
{
1598
	struct virtio_net_hdr_mrg_rxbuf *hdr;
R
Rusty Russell 已提交
1599
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
1600
	struct virtnet_info *vi = sq->vq->vdev->priv;
1601
	int num_sg;
1602
	unsigned hdr_len = vi->hdr_len;
1603
	bool can_push;
R
Rusty Russell 已提交
1604

J
Johannes Berg 已提交
1605
	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
1606 1607 1608 1609 1610 1611 1612

	can_push = vi->any_header_sg &&
		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
1613
		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
1614 1615
	else
		hdr = skb_vnet_hdr(skb);
R
Rusty Russell 已提交
1616

1617
	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
1618 1619
				    virtio_is_little_endian(vi->vdev), false,
				    0))
1620
		BUG();
R
Rusty Russell 已提交
1621

1622
	if (vi->mergeable_rx_bufs)
1623
		hdr->num_buffers = 0;
1624

1625
	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
1626 1627 1628
	if (can_push) {
		__skb_push(skb, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
1629 1630
		if (unlikely(num_sg < 0))
			return num_sg;
1631 1632 1633 1634
		/* Pull header back to avoid skew in tx bytes calculations. */
		__skb_pull(skb, hdr_len);
	} else {
		sg_set_buf(sq->sg, hdr, hdr_len);
1635 1636 1637 1638
		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
		if (unlikely(num_sg < 0))
			return num_sg;
		num_sg++;
1639
	}
1640
	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
1641 1642
}

1643
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
1644 1645
{
	struct virtnet_info *vi = netdev_priv(dev);
J
Jason Wang 已提交
1646 1647
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
1648
	int err;
1649
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
1650
	bool kick = !netdev_xmit_more();
W
Willem de Bruijn 已提交
1651
	bool use_napi = sq->napi.weight;
1652 1653

	/* Free up any pending old buffers before queueing new ones. */
1654
	free_old_xmit_skbs(sq, false);
1655

1656 1657 1658
	if (use_napi && kick)
		virtqueue_enable_cb_delayed(sq->vq);

1659 1660 1661
	/* timestamp packet in software */
	skb_tx_timestamp(skb);

1662
	/* Try to transmit */
1663
	err = xmit_skb(sq, skb);
1664

1665
	/* This should not happen! */
1666
	if (unlikely(err)) {
1667 1668 1669
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
1670 1671
				 "Unexpected TXQ (%d) queue failure: %d\n",
				 qnum, err);
1672
		dev->stats.tx_dropped++;
1673
		dev_kfree_skb_any(skb);
1674
		return NETDEV_TX_OK;
R
Rusty Russell 已提交
1675
	}
1676

1677
	/* Don't wait up for transmitted skbs to be freed. */
W
Willem de Bruijn 已提交
1678 1679
	if (!use_napi) {
		skb_orphan(skb);
1680
		nf_reset_ct(skb);
W
Willem de Bruijn 已提交
1681
	}
1682

1683 1684 1685 1686 1687 1688 1689 1690 1691
	/* If running out of space, stop queue to avoid getting packets that we
	 * are then unable to transmit.
	 * An alternative would be to force queuing layer to requeue the skb by
	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
	 * returned in a normal path of operation: it means that driver is not
	 * maintaining the TX queue stop/start state properly, and causes
	 * the stack to do a non-trivial amount of useless work.
	 * Since most packets only take 1 or 2 ring slots, stopping the queue
	 * early means 16 slots are typically wasted.
1692
	 */
1693
	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1694
		netif_stop_subqueue(dev, qnum);
W
Willem de Bruijn 已提交
1695 1696
		if (!use_napi &&
		    unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
1697
			/* More just got used, free them then recheck. */
1698
			free_old_xmit_skbs(sq, false);
1699
			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
J
Jason Wang 已提交
1700
				netif_start_subqueue(dev, qnum);
1701
				virtqueue_disable_cb(sq->vq);
1702 1703
			}
		}
1704
	}
1705

T
Toshiaki Makita 已提交
1706 1707 1708 1709 1710 1711 1712
	if (kick || netif_xmit_stopped(txq)) {
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
			u64_stats_update_begin(&sq->stats.syncp);
			sq->stats.kicks++;
			u64_stats_update_end(&sq->stats.syncp);
		}
	}
R
Rusty Russell 已提交
1713

1714
	return NETDEV_TX_OK;
1715 1716
}

/*
 * Send command via the control virtqueue and check status.  Commands
 * supported by the hypervisor, as indicated by feature bits, should
 * never fail unless improperly formatted.
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
				 struct scatterlist *out)
{
	struct scatterlist *sgs[4], hdr, stat;
	unsigned out_num = 0, tmp;

	/* Caller should know better */
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));

	vi->ctrl->status = ~0;
	vi->ctrl->hdr.class = class;
	vi->ctrl->hdr.cmd = cmd;
	/* Add header */
	sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr));
	sgs[out_num++] = &hdr;

	if (out)
		sgs[out_num++] = out;

	/* Add return status. */
	sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status));
	sgs[out_num] = &stat;

	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);

	if (unlikely(!virtqueue_kick(vi->cvq)))
		return vi->ctrl->status == VIRTIO_NET_OK;

	/* Spin for a response, the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
		cpu_relax();

	return vi->ctrl->status == VIRTIO_NET_OK;
}

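/* Change the device MAC address: prefer the CTRL_MAC_ADDR control-queue
 * command, and fall back to writing the config space byte by byte on
 * legacy (pre-VERSION_1) devices that advertise VIRTIO_NET_F_MAC.
 */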
static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
	int ret;
	struct sockaddr *addr;
	struct scatterlist sg;

	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
		return -EOPNOTSUPP;

	addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
	if (!addr)
		return -ENOMEM;

	ret = eth_prepare_mac_addr_change(dev, addr);
	if (ret)
		goto out;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
		sg_init_one(&sg, addr->sa_data, dev->addr_len);
		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
			dev_warn(&vdev->dev,
				 "Failed to set mac address by vq command.\n");
			ret = -EINVAL;
			goto out;
		}
	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
		unsigned int i;

		/* Naturally, this has an atomicity problem. */
		for (i = 0; i < dev->addr_len; i++)
			virtio_cwrite8(vdev,
				       offsetof(struct virtio_net_config, mac) +
				       i, addr->sa_data[i]);
	}

	eth_commit_mac_addr_change(dev, p);
	ret = 0;

out:
	kfree(addr);
	return ret;
}

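/* Fold the per-queue counters into rtnl_link_stats64, using the u64_stats
 * retry loop so 64-bit counters are read consistently on 32-bit hosts.
 */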
static void virtnet_stats(struct net_device *dev,
			  struct rtnl_link_stats64 *tot)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int start;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		u64 tpackets, tbytes, rpackets, rbytes, rdrops;
		struct receive_queue *rq = &vi->rq[i];
		struct send_queue *sq = &vi->sq[i];

		do {
			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
			tpackets = sq->stats.packets;
			tbytes   = sq->stats.bytes;
		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));

		do {
			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
			rpackets = rq->stats.packets;
			rbytes   = rq->stats.bytes;
			rdrops   = rq->stats.drops;
		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes   += rbytes;
		tot->tx_bytes   += tbytes;
		tot->rx_dropped += rdrops;
	}

	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
}

static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}

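/* Ask the device to use @queue_pairs RX/TX queue pairs via the MQ control
 * command; on success update curr_queue_pairs and, if the interface is up,
 * schedule the refill worker so the new RX queues get buffers.
 */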
static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	struct scatterlist sg;
	struct net_device *dev = vi->dev;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

	vi->ctrl->mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
	sg_init_one(&sg, &vi->ctrl->mq, sizeof(vi->ctrl->mq));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
		dev_warn(&dev->dev, "Failed to set the number of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
	} else {
		vi->curr_queue_pairs = queue_pairs;
		/* virtnet_open() will refill when the device is brought up. */
		if (dev->flags & IFF_UP)
			schedule_delayed_work(&vi->refill, 0);
	}

	return 0;
}

static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	int err;

	rtnl_lock();
	err = _virtnet_set_queues(vi, queue_pairs);
	rtnl_unlock();
	return err;
}

static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);

	for (i = 0; i < vi->max_queue_pairs; i++) {
		xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
		napi_disable(&vi->rq[i].napi);
		virtnet_napi_tx_disable(&vi->sq[i].napi);
	}

	return 0;
}

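/* Push the current promiscuous/allmulti flags and the unicast/multicast
 * MAC filter lists to the device through the RX-mode and MAC-table
 * control commands.
 */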
static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg[2];
	struct virtio_net_ctrl_mac *mac_data;
	struct netdev_hw_addr *ha;
	int uc_count;
	int mc_count;
	void *buf;
	int i;

	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

	vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
	vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);

	sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
			 vi->ctrl->promisc ? "en" : "dis");

	sg_init_one(sg, &vi->ctrl->allmulti, sizeof(vi->ctrl->allmulti));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
			 vi->ctrl->allmulti ? "en" : "dis");

	uc_count = netdev_uc_count(dev);
	mc_count = netdev_mc_count(dev);
	/* MAC filter - use one buffer for both lists */
	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
	mac_data = buf;
	if (!buf)
		return;

	sg_init_table(sg, 2);

	/* Store the unicast list and count in the front of the buffer */
	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
	i = 0;
	netdev_for_each_uc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[0], mac_data,
		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));

	/* multicast list and count fill the end */
	mac_data = (void *)&mac_data->macs[uc_count][0];

	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
	i = 0;
	netdev_for_each_mc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[1], mac_data,
		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");

	kfree(buf);
}

static int virtnet_vlan_rx_add_vid(struct net_device *dev,
				   __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
	return 0;
}

static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
				    __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
	return 0;
}

static void virtnet_clean_affinity(struct virtnet_info *vi)
{
	int i;

	if (vi->affinity_hint_set) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtqueue_set_affinity(vi->rq[i].vq, NULL);
			virtqueue_set_affinity(vi->sq[i].vq, NULL);
		}

		vi->affinity_hint_set = false;
	}
}

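/* Spread virtqueue interrupts and XPS maps across the online CPUs,
 * giving each queue pair a contiguous group of CPUs.
 */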
static void virtnet_set_affinity(struct virtnet_info *vi)
{
	cpumask_var_t mask;
	int stragglers;
	int group_size;
	int i, j, cpu;
	int num_cpu;
	int stride;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
		virtnet_clean_affinity(vi);
		return;
	}

	num_cpu = num_online_cpus();
	stride = max_t(int, num_cpu / vi->curr_queue_pairs, 1);
	stragglers = num_cpu >= vi->curr_queue_pairs ?
			num_cpu % vi->curr_queue_pairs :
			0;
	cpu = cpumask_next(-1, cpu_online_mask);

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		group_size = stride + (i < stragglers ? 1 : 0);

		for (j = 0; j < group_size; j++) {
			cpumask_set_cpu(cpu, mask);
			cpu = cpumask_next_wrap(cpu, cpu_online_mask,
						nr_cpu_ids, false);
		}
		virtqueue_set_affinity(vi->rq[i].vq, mask);
		virtqueue_set_affinity(vi->sq[i].vq, mask);
		__netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS);
		cpumask_clear(mask);
	}

	vi->affinity_hint_set = true;
	free_cpumask_var(mask);
}

static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node_dead);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);

	virtnet_clean_affinity(vi);
	return 0;
}

static enum cpuhp_state virtionet_online;

static int virtnet_cpu_notif_add(struct virtnet_info *vi)
{
	int ret;

	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
	if (ret)
		return ret;
	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					       &vi->node_dead);
	if (!ret)
		return ret;
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	return ret;
}

static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
{
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					    &vi->node_dead);
}

static void virtnet_get_ringparam(struct net_device *dev,
				struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}

static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));

}

/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
		return -EINVAL;

	/* For now we don't support modifying channels while XDP is loaded.
	 * Also, when XDP is loaded all RX queues have XDP programs, so we
	 * only need to check a single RX queue.
	 */
	if (vi->rq[0].xdp_prog)
		return -EINVAL;

	get_online_cpus();
	err = _virtnet_set_queues(vi, queue_pairs);
	if (err) {
		put_online_cpus();
		goto err;
	}
	virtnet_set_affinity(vi);
	put_online_cpus();

	netif_set_real_num_tx_queues(dev, queue_pairs);
	netif_set_real_num_rx_queues(dev, queue_pairs);
 err:
	return err;
}

static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int i, j;
	u8 *p = data;

	switch (stringset) {
	case ETH_SS_STATS:
		for (i = 0; i < vi->curr_queue_pairs; i++) {
			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++)
				ethtool_sprintf(&p, "rx_queue_%u_%s", i,
						virtnet_rq_stats_desc[j].desc);
		}

		for (i = 0; i < vi->curr_queue_pairs; i++) {
			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++)
				ethtool_sprintf(&p, "tx_queue_%u_%s", i,
						virtnet_sq_stats_desc[j].desc);
		}
		break;
	}
}

static int virtnet_get_sset_count(struct net_device *dev, int sset)
{
	struct virtnet_info *vi = netdev_priv(dev);

	switch (sset) {
	case ETH_SS_STATS:
		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
					       VIRTNET_SQ_STATS_LEN);
	default:
		return -EOPNOTSUPP;
	}
}

static void virtnet_get_ethtool_stats(struct net_device *dev,
				      struct ethtool_stats *stats, u64 *data)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int idx = 0, start, i, j;
	const u8 *stats_base;
	size_t offset;

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct receive_queue *rq = &vi->rq[i];

		stats_base = (u8 *)&rq->stats;
		do {
			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
				offset = virtnet_rq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
		idx += VIRTNET_RQ_STATS_LEN;
	}

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct send_queue *sq = &vi->sq[i];

		stats_base = (u8 *)&sq->stats;
		do {
			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) {
				offset = virtnet_sq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
		idx += VIRTNET_SQ_STATS_LEN;
	}
}

static void virtnet_get_channels(struct net_device *dev,
				 struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);

	channels->combined_count = vi->curr_queue_pairs;
	channels->max_combined = vi->max_queue_pairs;
	channels->max_other = 0;
	channels->rx_count = 0;
	channels->tx_count = 0;
	channels->other_count = 0;
}

static int virtnet_set_link_ksettings(struct net_device *dev,
				      const struct ethtool_link_ksettings *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);

	return ethtool_virtdev_set_link_ksettings(dev, cmd,
						  &vi->speed, &vi->duplex);
}

static int virtnet_get_link_ksettings(struct net_device *dev,
				      struct ethtool_link_ksettings *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);

	cmd->base.speed = vi->speed;
	cmd->base.duplex = vi->duplex;
	cmd->base.port = PORT_OTHER;

	return 0;
}

static int virtnet_set_coalesce(struct net_device *dev,
				struct ethtool_coalesce *ec)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i, napi_weight;

	if (ec->tx_max_coalesced_frames > 1 ||
	    ec->rx_max_coalesced_frames != 1)
		return -EINVAL;

	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
	if (napi_weight ^ vi->sq[0].napi.weight) {
		if (dev->flags & IFF_UP)
			return -EBUSY;
		for (i = 0; i < vi->max_queue_pairs; i++)
			vi->sq[i].napi.weight = napi_weight;
	}

	return 0;
}

static int virtnet_get_coalesce(struct net_device *dev,
				struct ethtool_coalesce *ec)
{
	struct ethtool_coalesce ec_default = {
		.cmd = ETHTOOL_GCOALESCE,
		.rx_max_coalesced_frames = 1,
	};
	struct virtnet_info *vi = netdev_priv(dev);

	memcpy(ec, &ec_default, sizeof(ec_default));

	if (vi->sq[0].napi.weight)
		ec->tx_max_coalesced_frames = 1;

	return 0;
}

static void virtnet_init_settings(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	vi->speed = SPEED_UNKNOWN;
	vi->duplex = DUPLEX_UNKNOWN;
}

static void virtnet_update_settings(struct virtnet_info *vi)
{
	u32 speed;
	u8 duplex;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
		return;

	virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);

	if (ethtool_validate_speed(speed))
		vi->speed = speed;

	virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex);

	if (ethtool_validate_duplex(duplex))
		vi->duplex = duplex;
}

static const struct ethtool_ops virtnet_ethtool_ops = {
	.supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES,
	.get_drvinfo = virtnet_get_drvinfo,
	.get_link = ethtool_op_get_link,
	.get_ringparam = virtnet_get_ringparam,
	.get_strings = virtnet_get_strings,
	.get_sset_count = virtnet_get_sset_count,
	.get_ethtool_stats = virtnet_get_ethtool_stats,
	.set_channels = virtnet_set_channels,
	.get_channels = virtnet_get_channels,
	.get_ts_info = ethtool_op_get_ts_info,
	.get_link_ksettings = virtnet_get_link_ksettings,
	.set_link_ksettings = virtnet_set_link_ksettings,
	.set_coalesce = virtnet_set_coalesce,
	.get_coalesce = virtnet_get_coalesce,
};

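/* Suspend-side teardown: detach the netdev, stop the refill worker and
 * quiesce NAPI so the virtqueues can be removed; virtnet_restore_up()
 * below rebuilds the virtqueues and re-attaches the device.
 */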
static void virtnet_freeze_down(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int i;

	/* Make sure no work handler is accessing the device */
	flush_work(&vi->config_work);

	netif_tx_lock_bh(vi->dev);
	netif_device_detach(vi->dev);
	netif_tx_unlock_bh(vi->dev);
	cancel_delayed_work_sync(&vi->refill);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			napi_disable(&vi->rq[i].napi);
			virtnet_napi_tx_disable(&vi->sq[i].napi);
		}
	}
}

static int init_vqs(struct virtnet_info *vi);

static int virtnet_restore_up(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err, i;

	err = init_vqs(vi);
	if (err)
		return err;

	virtio_device_ready(vdev);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->curr_queue_pairs; i++)
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
	}

	netif_tx_lock_bh(vi->dev);
	netif_device_attach(vi->dev);
	netif_tx_unlock_bh(vi->dev);
	return err;
}

static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
{
	struct scatterlist sg;

	vi->ctrl->offloads = cpu_to_virtio64(vi->vdev, offloads);

	sg_init_one(&sg, &vi->ctrl->offloads, sizeof(vi->ctrl->offloads));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
				  VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
		dev_warn(&vi->dev->dev, "Failed to set guest offloads.\n");
		return -EINVAL;
	}

	return 0;
}

static int virtnet_clear_guest_offloads(struct virtnet_info *vi)
{
	u64 offloads = 0;

	if (!vi->guest_offloads)
		return 0;

	return virtnet_set_guest_offloads(vi, offloads);
}

static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
{
	u64 offloads = vi->guest_offloads;

	if (!vi->guest_offloads)
		return 0;

	return virtnet_set_guest_offloads(vi, offloads);
}

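/* Attach or detach an XDP program: validate device offloads and MTU,
 * try to reserve extra TX queues for XDP_TX, then swap the per-queue
 * program pointers with NAPI disabled.
 */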
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			   struct netlink_ext_ack *extack)
{
	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
	struct virtnet_info *vi = netdev_priv(dev);
	struct bpf_prog *old_prog;
	u16 xdp_qp = 0, curr_qp;
	int i, err;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
	    && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	        virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	        virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))) {
		NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO/CSUM, disable LRO/CSUM first");
		return -EOPNOTSUPP;
	}

	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
		NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
		return -EINVAL;
	}

	if (dev->mtu > max_sz) {
		NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
		return -EINVAL;
	}

	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
	if (prog)
		xdp_qp = nr_cpu_ids;

	/* XDP requires extra queues for XDP_TX */
	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
		netdev_warn(dev, "XDP request %i queues but max is %i. XDP_TX and XDP_REDIRECT will operate in a slower locked tx mode.\n",
			    curr_qp + xdp_qp, vi->max_queue_pairs);
		xdp_qp = 0;
	}

	old_prog = rtnl_dereference(vi->rq[0].xdp_prog);
	if (!prog && !old_prog)
		return 0;

	if (prog)
		bpf_prog_add(prog, vi->max_queue_pairs - 1);

	/* Make sure NAPI is not using any XDP TX queues for RX. */
	if (netif_running(dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			napi_disable(&vi->rq[i].napi);
			virtnet_napi_tx_disable(&vi->sq[i].napi);
		}
	}

	if (!prog) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
			if (i == 0)
				virtnet_restore_guest_offloads(vi);
		}
		synchronize_net();
	}

	err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
	if (err)
		goto err;
	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
	vi->xdp_queue_pairs = xdp_qp;

	if (prog) {
		vi->xdp_enabled = true;
		for (i = 0; i < vi->max_queue_pairs; i++) {
			rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
			if (i == 0 && !old_prog)
				virtnet_clear_guest_offloads(vi);
		}
	} else {
		vi->xdp_enabled = false;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (old_prog)
			bpf_prog_put(old_prog);
		if (netif_running(dev)) {
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
	}

	return 0;

err:
	if (!prog) {
		virtnet_clear_guest_offloads(vi);
		for (i = 0; i < vi->max_queue_pairs; i++)
			rcu_assign_pointer(vi->rq[i].xdp_prog, old_prog);
	}

	if (netif_running(dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
	}
	if (prog)
		bpf_prog_sub(prog, vi->max_queue_pairs - 1);
	return err;
}

static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
	default:
		return -EINVAL;
	}
}

static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
				      size_t len)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int ret;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
		return -EOPNOTSUPP;

	ret = snprintf(buf, len, "sby");
	if (ret >= len)
		return -EOPNOTSUPP;

	return 0;
}

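/* Toggle the guest LRO offloads when NETIF_F_LRO changes; the change is
 * refused with -EBUSY while an XDP program is attached.
 */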
static int virtnet_set_features(struct net_device *dev,
				netdev_features_t features)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u64 offloads;
	int err;

	if ((dev->features ^ features) & NETIF_F_LRO) {
		if (vi->xdp_enabled)
			return -EBUSY;

		if (features & NETIF_F_LRO)
			offloads = vi->guest_offloads_capable;
		else
			offloads = vi->guest_offloads_capable &
				   ~GUEST_OFFLOAD_LRO_MASK;

		err = virtnet_set_guest_offloads(vi, offloads);
		if (err)
			return err;
		vi->guest_offloads = offloads;
	}

	return 0;
}

static const struct net_device_ops virtnet_netdev = {
	.ndo_open            = virtnet_open,
	.ndo_stop   	     = virtnet_close,
	.ndo_start_xmit      = start_xmit,
	.ndo_validate_addr   = eth_validate_addr,
	.ndo_set_mac_address = virtnet_set_mac_address,
	.ndo_set_rx_mode     = virtnet_set_rx_mode,
	.ndo_get_stats64     = virtnet_stats,
	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
	.ndo_bpf		= virtnet_xdp,
	.ndo_xdp_xmit		= virtnet_xdp_xmit,
	.ndo_features_check	= passthru_features_check,
	.ndo_get_phys_port_name	= virtnet_get_phys_port_name,
	.ndo_set_features	= virtnet_set_features,
};

static void virtnet_config_changed_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, config_work);
	u16 v;

	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
				 struct virtio_net_config, status, &v) < 0)
		return;

	if (v & VIRTIO_NET_S_ANNOUNCE) {
		netdev_notify_peers(vi->dev);
		virtnet_ack_link_announce(vi);
	}

	/* Ignore unknown (future) status bits */
	v &= VIRTIO_NET_S_LINK_UP;

	if (vi->status == v)
		return;

	vi->status = v;

	if (vi->status & VIRTIO_NET_S_LINK_UP) {
		virtnet_update_settings(vi);
		netif_carrier_on(vi->dev);
		netif_tx_wake_all_queues(vi->dev);
	} else {
		netif_carrier_off(vi->dev);
		netif_tx_stop_all_queues(vi->dev);
	}
}

static void virtnet_config_changed(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	schedule_work(&vi->config_work);
}

static void virtnet_free_queues(struct virtnet_info *vi)
{
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		__netif_napi_del(&vi->rq[i].napi);
		__netif_napi_del(&vi->sq[i].napi);
	}

	/* We called __netif_napi_del(),
	 * we need to respect an RCU grace period before freeing vi->rq
	 */
	synchronize_net();

	kfree(vi->rq);
	kfree(vi->sq);
	kfree(vi->ctrl);
}

static void _free_receive_bufs(struct virtnet_info *vi)
{
	struct bpf_prog *old_prog;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		while (vi->rq[i].pages)
			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);

		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
		if (old_prog)
			bpf_prog_put(old_prog);
	}
}

static void free_receive_bufs(struct virtnet_info *vi)
{
	rtnl_lock();
	_free_receive_bufs(vi);
	rtnl_unlock();
}

static void free_receive_page_frags(struct virtnet_info *vi)
{
	int i;
	for (i = 0; i < vi->max_queue_pairs; i++)
		if (vi->rq[i].alloc_frag.page)
			put_page(vi->rq[i].alloc_frag.page);
}

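/* Reclaim buffers still queued in the virtqueues after a device reset:
 * TX entries are skbs or XDP frames, RX entries depend on the receive
 * mode (mergeable, big or small buffers).
 */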
static void free_unused_bufs(struct virtnet_info *vi)
{
	void *buf;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->sq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (!is_xdp_frame(buf))
				dev_kfree_skb(buf);
			else
				xdp_return_frame(ptr_to_xdp(buf));
		}
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->rq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (vi->mergeable_rx_bufs) {
				put_page(virt_to_head_page(buf));
			} else if (vi->big_packets) {
				give_pages(&vi->rq[i], buf);
			} else {
				put_page(virt_to_head_page(buf));
			}
		}
	}
}

static void virtnet_del_vqs(struct virtnet_info *vi)
{
	struct virtio_device *vdev = vi->vdev;

	virtnet_clean_affinity(vi);

	vdev->config->del_vqs(vdev);

	virtnet_free_queues(vi);
}

/* How large should a single buffer be so a queue full of these can fit at
 * least one full packet?
 * Logic below assumes the mergeable buffer header is used.
 */
static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
{
	const unsigned int hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int rq_size = virtqueue_get_vring_size(vq);
	unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu;
	unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len;
	unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size);

	return max(max(min_buf_len, hdr_len) - hdr_len,
		   (unsigned int)GOOD_PACKET_LEN);
}

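/* Create the RX/TX virtqueues (plus the control vq when negotiated),
 * wire up their callbacks and names, and record the per-queue minimum
 * buffer length used by the mergeable receive path.
 */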
static int virtnet_find_vqs(struct virtnet_info *vi)
{
	vq_callback_t **callbacks;
	struct virtqueue **vqs;
	int ret = -ENOMEM;
	int i, total_vqs;
	const char **names;
	bool *ctx;

	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
	 * possible control vq.
	 */
	total_vqs = vi->max_queue_pairs * 2 +
		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);

	/* Allocate space for find_vqs parameters */
	vqs = kcalloc(total_vqs, sizeof(*vqs), GFP_KERNEL);
	if (!vqs)
		goto err_vq;
	callbacks = kmalloc_array(total_vqs, sizeof(*callbacks), GFP_KERNEL);
	if (!callbacks)
		goto err_callback;
	names = kmalloc_array(total_vqs, sizeof(*names), GFP_KERNEL);
	if (!names)
		goto err_names;
	if (!vi->big_packets || vi->mergeable_rx_bufs) {
		ctx = kcalloc(total_vqs, sizeof(*ctx), GFP_KERNEL);
		if (!ctx)
			goto err_ctx;
	} else {
		ctx = NULL;
	}

	/* Parameters for control virtqueue, if any */
	if (vi->has_cvq) {
		callbacks[total_vqs - 1] = NULL;
		names[total_vqs - 1] = "control";
	}

	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
		if (ctx)
			ctx[rxq2vq(i)] = true;
	}

	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
					 names, ctx, NULL);
	if (ret)
		goto err_find;

	if (vi->has_cvq) {
		vi->cvq = vqs[total_vqs - 1];
		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].vq = vqs[rxq2vq(i)];
		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
		vi->sq[i].vq = vqs[txq2vq(i)];
	}

	/* run here: ret == 0. */


err_find:
	kfree(ctx);
err_ctx:
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;
}

static int virtnet_alloc_queues(struct virtnet_info *vi)
{
	int i;

	vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL);
	if (!vi->ctrl)
		goto err_ctrl;
	vi->sq = kcalloc(vi->max_queue_pairs, sizeof(*vi->sq), GFP_KERNEL);
	if (!vi->sq)
		goto err_sq;
	vi->rq = kcalloc(vi->max_queue_pairs, sizeof(*vi->rq), GFP_KERNEL);
	if (!vi->rq)
		goto err_rq;

	INIT_DELAYED_WORK(&vi->refill, refill_work);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].pages = NULL;
		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
			       napi_weight);
		netif_tx_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx,
				  napi_tx ? napi_weight : 0);

		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));

		u64_stats_init(&vi->rq[i].stats.syncp);
		u64_stats_init(&vi->sq[i].stats.syncp);
	}

	return 0;

err_rq:
	kfree(vi->sq);
err_sq:
	kfree(vi->ctrl);
err_ctrl:
	return -ENOMEM;
}

static int init_vqs(struct virtnet_info *vi)
{
	int ret;

	/* Allocate send & receive queues */
	ret = virtnet_alloc_queues(vi);
	if (ret)
		goto err;

	ret = virtnet_find_vqs(vi);
	if (ret)
		goto err_free;

2916
	get_online_cpus();
2917
	virtnet_set_affinity(vi);
2918 2919
	put_online_cpus();

J

err_free:
	virtnet_free_queues(vi);
err:
	return ret;
2926 2927
}

2928 2929
#ifdef CONFIG_SYSFS
static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
2930
		char *buf)
2931 2932 2933
{
	struct virtnet_info *vi = netdev_priv(queue->dev);
	unsigned int queue_index = get_netdev_rx_queue_index(queue);
2934 2935
	unsigned int headroom = virtnet_get_headroom(vi);
	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
J
Johannes Berg 已提交
2936
	struct ewma_pkt_len *avg;
2937 2938 2939

	BUG_ON(queue_index >= vi->max_queue_pairs);
	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
2940
	return sprintf(buf, "%u\n",
2941 2942
		       get_mergeable_buf_len(&vi->rq[queue_index], avg,
				       SKB_DATA_ALIGN(headroom + tailroom)));
2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958
}

static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
	__ATTR_RO(mergeable_rx_buffer_size);

static struct attribute *virtio_net_mrg_rx_attrs[] = {
	&mergeable_rx_buffer_size_attribute.attr,
	NULL
};

static const struct attribute_group virtio_net_mrg_rx_group = {
	.name = "virtio_net",
	.attrs = virtio_net_mrg_rx_attrs
};
#endif

static bool virtnet_fail_on_feature(struct virtio_device *vdev,
				    unsigned int fbit,
				    const char *fname, const char *dname)
{
	if (!virtio_has_feature(vdev, fbit))
		return false;

	dev_err(&vdev->dev, "device advertises feature %s but not %s",
		fname, dname);

	return true;
}

#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)

static bool virtnet_validate_features(struct virtio_device *vdev)
{
	if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
	    (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
			     "VIRTIO_NET_F_CTRL_VQ"))) {
		return false;
	}

	return true;
}

#define MIN_MTU ETH_MIN_MTU
#define MAX_MTU ETH_MAX_MTU

static int virtnet_validate(struct virtio_device *vdev)
{
	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

	if (!virtnet_validate_features(vdev))
		return -EINVAL;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		int mtu = virtio_cread16(vdev,
					 offsetof(struct virtio_net_config,
						  mtu));
		if (mtu < MIN_MTU)
			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
	}

	return 0;
}

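/* Probe: translate negotiated features into netdev feature flags, allocate
 * queues and virtqueues, register the netdev, and bring the carrier up
 * either immediately or from the config-change worker once link status
 * is known.
 */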
static int virtnet_probe(struct virtio_device *vdev)
{
	int i, err = -ENOMEM;
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;
	int mtu;

	/* Find if host supports multiqueue virtio_net device */
	err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
				   struct virtio_net_config,
				   max_virtqueue_pairs, &max_queue_pairs);

	/* We need at least 2 queues */
	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;

	/* Allocate ourselves a network device with room for our info */
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE |
			   IFF_TX_SKB_NO_LINEAR;
	dev->netdev_ops = &virtnet_netdev;
	dev->features = NETIF_F_HIGHDMA;

	dev->ethtool_ops = &virtnet_ethtool_ops;
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
		/* This opens up the world of extra features. */
		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
		if (csum)
			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
			dev->hw_features |= NETIF_F_TSO
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
		/* Individual feature bits: what can host handle? */
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;

		dev->features |= NETIF_F_GSO_ROBUST;

		if (gso)
			dev->features |= dev->hw_features & NETIF_F_ALL_TSO;
		/* (!csum && gso) case will be fixed by register_netdev() */
	}
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
		dev->features |= NETIF_F_RXCSUM;
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6))
		dev->features |= NETIF_F_LRO;
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS))
		dev->hw_features |= NETIF_F_LRO;

	dev->vlan_features = dev->features;

	/* MTU range: 68 - 65535 */
	dev->min_mtu = MIN_MTU;
	dev->max_mtu = MAX_MTU;

	/* Configuration may specify what MAC to use.  Otherwise random. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
		virtio_cread_bytes(vdev,
				   offsetof(struct virtio_net_config, mac),
				   dev->dev_addr, dev->addr_len);
	else
		eth_hw_addr_random(dev);

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
	vdev->priv = vi;

	INIT_WORK(&vi->config_work, virtnet_config_changed_work);

	/* If we can receive ANY GSO packets, we must allocate large ones. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
		vi->big_packets = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		vi->hdr_len = sizeof(struct virtio_net_hdr);

	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->any_header_sg = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		mtu = virtio_cread16(vdev,
				     offsetof(struct virtio_net_config,
					      mtu));
		if (mtu < dev->min_mtu) {
			/* Should never trigger: MTU was previously validated
			 * in virtnet_validate.
			 */
			dev_err(&vdev->dev,
				"device MTU appears to have changed, it is now %d < %d",
				mtu, dev->min_mtu);
			err = -EINVAL;
			goto free;
		}

		dev->mtu = mtu;
		dev->max_mtu = mtu;

		/* TODO: size buffers correctly in this case. */
		if (dev->mtu > ETH_DATA_LEN)
			vi->big_packets = true;
	}

	if (vi->any_header_sg)
		dev->needed_headroom = vi->hdr_len;

	/* Enable multiqueue by default */
	if (num_online_cpus() >= max_queue_pairs)
		vi->curr_queue_pairs = max_queue_pairs;
	else
		vi->curr_queue_pairs = num_online_cpus();
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
	err = init_vqs(vi);
	if (err)
		goto free;

#ifdef CONFIG_SYSFS
	if (vi->mergeable_rx_bufs)
		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
#endif
	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);

	virtnet_init_settings(dev);

	if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
		vi->failover = net_failover_create(vi->dev);
		if (IS_ERR(vi->failover)) {
			err = PTR_ERR(vi->failover);
			goto free_vqs;
		}
	}

	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
		goto free_failover;
	}

	virtio_device_ready(vdev);

	err = virtnet_cpu_notif_add(vi);
	if (err) {
		pr_debug("virtio_net: registering cpu notifier failed\n");
		goto free_unregister_netdev;
	}

	virtnet_set_queues(vi, vi->curr_queue_pairs);

	/* Assume link up if device can't report link status,
	 * otherwise get link status from config.
	 */
	netif_carrier_off(dev);
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		schedule_work(&vi->config_work);
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
		virtnet_update_settings(vi);
		netif_carrier_on(dev);
	}

	for (i = 0; i < ARRAY_SIZE(guest_offloads); i++)
		if (virtio_has_feature(vi->vdev, guest_offloads[i]))
			set_bit(guest_offloads[i], &vi->guest_offloads);
	vi->guest_offloads_capable = vi->guest_offloads;

	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
		 dev->name, max_queue_pairs);

	return 0;

free_unregister_netdev:
	vi->vdev->config->reset(vdev);

	unregister_netdev(dev);
free_failover:
	net_failover_destroy(vi->failover);
free_vqs:
	cancel_delayed_work_sync(&vi->refill);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
free:
	free_netdev(dev);
	return err;
}

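/* Reset the device and release every buffer and virtqueue; shared by the
 * remove and freeze paths.
 */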
static void remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);

	/* Free unused buffers in both send and recv, if any. */
	free_unused_bufs(vi);

	free_receive_bufs(vi);

	free_receive_page_frags(vi);

	virtnet_del_vqs(vi);
}

static void virtnet_remove(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device. */
	flush_work(&vi->config_work);

	unregister_netdev(vi->dev);

	net_failover_destroy(vi->failover);

	remove_vq_common(vi);

	free_netdev(vi->dev);
}

static __maybe_unused int virtnet_freeze(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);
	virtnet_freeze_down(vdev);
	remove_vq_common(vi);

	return 0;
}

static __maybe_unused int virtnet_restore(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err;

	err = virtnet_restore_up(vdev);
	if (err)
		return err;
	virtnet_set_queues(vi, vi->curr_queue_pairs);

	err = virtnet_cpu_notif_add(vi);
	if (err)
		return err;

	return 0;
}

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

#define VIRTNET_FEATURES \
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
	VIRTIO_NET_F_MAC, \
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
	VIRTIO_NET_F_CTRL_MAC_ADDR, \
	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
	VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY

static unsigned int features[] = {
	VIRTNET_FEATURES,
};

static unsigned int features_legacy[] = {
	VIRTNET_FEATURES,
	VIRTIO_NET_F_GSO,
	VIRTIO_F_ANY_LAYOUT,
};

static struct virtio_driver virtio_net_driver = {
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
	.feature_table_legacy = features_legacy,
	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
	.validate =	virtnet_validate,
	.probe =	virtnet_probe,
	.remove =	virtnet_remove,
	.config_changed = virtnet_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze =	virtnet_freeze,
	.restore =	virtnet_restore,
#endif
};

static __init int virtio_net_driver_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
				      virtnet_cpu_online,
				      virtnet_cpu_down_prep);
	if (ret < 0)
		goto out;
	virtionet_online = ret;
	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
				      NULL, virtnet_cpu_dead);
	if (ret)
		goto err_dead;

	ret = register_virtio_driver(&virtio_net_driver);
	if (ret)
		goto err_virtio;
	return 0;
err_virtio:
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
err_dead:
	cpuhp_remove_multi_state(virtionet_online);
out:
	return ret;
}
module_init(virtio_net_driver_init);

static __exit void virtio_net_driver_exit(void)
{
	unregister_virtio_driver(&virtio_net_driver);
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
	cpuhp_remove_multi_state(virtionet_online);
}
module_exit(virtio_net_driver_exit);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");