// SPDX-License-Identifier: GPL-2.0-or-later
/* A network driver using virtio.
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/scatterlist.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/average.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <net/route.h>
#include <net/xdp.h>
#include <net/net_failover.h>

static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);

static bool csum = true, gso = true, napi_tx = true;
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);
module_param(napi_tx, bool, 0644);

/* FIXME: MTU in config. */
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
#define GOOD_COPY_LEN	128

#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
#define VIRTIO_XDP_HEADROOM 256

/* Separating two types of XDP xmit */
#define VIRTIO_XDP_TX		BIT(0)
#define VIRTIO_XDP_REDIR	BIT(1)

#define VIRTIO_XDP_FLAG	BIT(0)

/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
 */
DECLARE_EWMA(pkt_len, 0, 64)

#define VIRTNET_DRIVER_VERSION "1.0.0"

static const unsigned long guest_offloads[] = {
	VIRTIO_NET_F_GUEST_TSO4,
	VIRTIO_NET_F_GUEST_TSO6,
	VIRTIO_NET_F_GUEST_ECN,
	VIRTIO_NET_F_GUEST_UFO,
	VIRTIO_NET_F_GUEST_CSUM
};

#define GUEST_OFFLOAD_GRO_HW_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
				(1ULL << VIRTIO_NET_F_GUEST_ECN)  | \
				(1ULL << VIRTIO_NET_F_GUEST_UFO))

struct virtnet_stat_desc {
	char desc[ETH_GSTRING_LEN];
	size_t offset;
};

struct virtnet_sq_stats {
	struct u64_stats_sync syncp;
	u64 packets;
	u64 bytes;
	u64 xdp_tx;
	u64 xdp_tx_drops;
	u64 kicks;
	u64 tx_timeouts;
};

struct virtnet_rq_stats {
	struct u64_stats_sync syncp;
	u64 packets;
	u64 bytes;
	u64 drops;
	u64 xdp_packets;
	u64 xdp_tx;
	u64 xdp_redirects;
	u64 xdp_drops;
	u64 kicks;
};

#define VIRTNET_SQ_STAT(m)	offsetof(struct virtnet_sq_stats, m)
#define VIRTNET_RQ_STAT(m)	offsetof(struct virtnet_rq_stats, m)

static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
	{ "packets",		VIRTNET_SQ_STAT(packets) },
	{ "bytes",		VIRTNET_SQ_STAT(bytes) },
	{ "xdp_tx",		VIRTNET_SQ_STAT(xdp_tx) },
	{ "xdp_tx_drops",	VIRTNET_SQ_STAT(xdp_tx_drops) },
	{ "kicks",		VIRTNET_SQ_STAT(kicks) },
	{ "tx_timeouts",	VIRTNET_SQ_STAT(tx_timeouts) },
};

static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
	{ "packets",		VIRTNET_RQ_STAT(packets) },
	{ "bytes",		VIRTNET_RQ_STAT(bytes) },
	{ "drops",		VIRTNET_RQ_STAT(drops) },
	{ "xdp_packets",	VIRTNET_RQ_STAT(xdp_packets) },
	{ "xdp_tx",		VIRTNET_RQ_STAT(xdp_tx) },
	{ "xdp_redirects",	VIRTNET_RQ_STAT(xdp_redirects) },
	{ "xdp_drops",		VIRTNET_RQ_STAT(xdp_drops) },
	{ "kicks",		VIRTNET_RQ_STAT(kicks) },
};

#define VIRTNET_SQ_STATS_LEN	ARRAY_SIZE(virtnet_sq_stats_desc)
#define VIRTNET_RQ_STATS_LEN	ARRAY_SIZE(virtnet_rq_stats_desc)

/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send_queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of the send queue: output.$index */
	char name[40];

	struct virtnet_sq_stats stats;

	struct napi_struct napi;
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

	struct napi_struct napi;

	struct bpf_prog __rcu *xdp_prog;

	struct virtnet_rq_stats stats;

	/* Chain pages by the private ptr. */
	struct page *pages;

	/* Average packet length for mergeable receive buffers. */
	struct ewma_pkt_len mrg_avg_pkt_len;

	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Min single buffer size for mergeable buffers case. */
	unsigned int min_buf_len;

	/* Name of this receive queue: input.$index */
	char name[40];

	struct xdp_rxq_info xdp_rxq;
};

/* Control VQ buffers: protected by the rtnl lock */
struct control_buf {
	struct virtio_net_ctrl_hdr hdr;
	virtio_net_ctrl_ack status;
	struct virtio_net_ctrl_mq mq;
	u8 promisc;
	u8 allmulti;
	__virtio16 vid;
	__virtio64 offloads;
};

struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
	struct send_queue *sq;
	struct receive_queue *rq;
	unsigned int status;

	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

	/* # of XDP queue pairs currently used by the driver */
	u16 xdp_queue_pairs;

	/* xdp_queue_pairs may be 0, when xdp is already loaded. So add this. */
	bool xdp_enabled;

	/* I like... big packets and I cannot lie! */
	bool big_packets;

	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

	/* Has control virtqueue */
	bool has_cvq;

	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

	/* Packet virtio header size */
	u8 hdr_len;

	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

	/* Work struct for config space updates */
	struct work_struct config_work;

	/* Is the affinity hint set for the virtqueues? */
	bool affinity_hint_set;

	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;

	struct control_buf *ctrl;

	/* Ethtool settings */
	u8 duplex;
	u32 speed;

	unsigned long guest_offloads;
	unsigned long guest_offloads_capable;

	/* failover when STANDBY feature enabled */
	struct failover *failover;
};

struct padded_vnet_hdr {
	struct virtio_net_hdr_mrg_rxbuf hdr;
	/*
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header.
	 */
	char padding[4];
};

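/* Buffers queued on a send virtqueue are either sk_buffs or xdp_frames.
 * The two are told apart by tagging bit 0 of the token pointer with
 * VIRTIO_XDP_FLAG (both object types are at least word aligned, so bit 0
 * of a real pointer is always clear); the helpers below set, test and
 * strip that tag.
 */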
static bool is_xdp_frame(void *ptr)
{
	return (unsigned long)ptr & VIRTIO_XDP_FLAG;
}

static void *xdp_to_ptr(struct xdp_frame *ptr)
{
	return (void *)((unsigned long)ptr | VIRTIO_XDP_FLAG);
}

static struct xdp_frame *ptr_to_xdp(void *ptr)
{
	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
}

/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
	return (vq->index - 1) / 2;
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
	return vq->index / 2;
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}

static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
{
	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
}

/*
 * private is used to chain pages for big packets; put the most
 * recently used list at the head so it can be reused first.
 */
static void give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end;

	/* Find end of list, sew whole thing into vi->rq.pages. */
	for (end = page; end->private; end = (struct page *)end->private);
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}

static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
{
	struct page *p = rq->pages;

	if (p) {
		rq->pages = (struct page *)p->private;
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
		p = alloc_page(gfp_mask);
	return p;
}

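/* NAPI <-> virtqueue interrupt plumbing: scheduling NAPI disables virtqueue
 * callbacks, and completion re-arms them via virtqueue_enable_cb_prepare()/
 * virtqueue_poll() so that a buffer added between "enable" and "complete"
 * cannot be missed; if one slipped in, NAPI is simply rescheduled.
 */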
static void virtqueue_napi_schedule(struct napi_struct *napi,
				    struct virtqueue *vq)
{
	if (napi_schedule_prep(napi)) {
		virtqueue_disable_cb(vq);
		__napi_schedule(napi);
	}
}

static void virtqueue_napi_complete(struct napi_struct *napi,
				    struct virtqueue *vq, int processed)
{
	int opaque;

	opaque = virtqueue_enable_cb_prepare(vq);
	if (napi_complete_done(napi, processed)) {
		if (unlikely(virtqueue_poll(vq, opaque)))
			virtqueue_napi_schedule(napi, vq);
	} else {
		virtqueue_disable_cb(vq);
	}
}

static void skb_xmit_done(struct virtqueue *vq)
{
	struct virtnet_info *vi = vq->vdev->priv;
	struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;

	/* Suppress further interrupts. */
	virtqueue_disable_cb(vq);

	if (napi->weight)
		virtqueue_napi_schedule(napi, vq);
	else
		/* We were probably waiting for more output buffers. */
		netif_wake_subqueue(vi->dev, vq2txq(vq));
}

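/* For mergeable receive buffers the per-buffer "ctx" handed to the virtqueue
 * is not a pointer but a packed scalar: the low MRG_CTX_HEADER_SHIFT bits
 * hold the buffer truesize and the remaining high bits hold the headroom,
 * as encoded/decoded by the helpers below.
 */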
#define MRG_CTX_HEADER_SHIFT 22
static void *mergeable_len_to_ctx(unsigned int truesize,
				  unsigned int headroom)
{
	return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize);
}

static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx)
{
	return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT;
}

static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
{
	return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1);
}

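/* Build an sk_buff from a received page-based buffer: wrap the buffer with
 * build_skb() when it is large enough and has room for the shared info,
 * otherwise copy the head into a freshly allocated skb and attach the rest
 * as page fragments, chaining any extra pages via page->private.
 */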
/* Called from bottom half context */
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
				   struct receive_queue *rq,
				   struct page *page, unsigned int offset,
				   unsigned int len, unsigned int truesize,
				   bool hdr_valid, unsigned int metasize,
				   unsigned int headroom)
{
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	unsigned int copy, hdr_len, hdr_padded_len;
	struct page *page_to_free = NULL;
	int tailroom, shinfo_size;
	char *p, *hdr_p, *buf;

	p = page_address(page) + offset;
	hdr_p = p;

	hdr_len = vi->hdr_len;
	if (vi->mergeable_rx_bufs)
		hdr_padded_len = sizeof(*hdr);
	else
		hdr_padded_len = sizeof(struct padded_vnet_hdr);

	/* If headroom is not 0, there is an offset between the beginning of the
	 * data and the allocated space, otherwise the data and the allocated
	 * space are aligned.
	 *
	 * Buffers with headroom use PAGE_SIZE as alloc size, see
	 * add_recvbuf_mergeable() + get_mergeable_buf_len()
	 */
	truesize = headroom ? PAGE_SIZE : truesize;
	tailroom = truesize - len - headroom;
	buf = p - headroom;

	len -= hdr_len;
	offset += hdr_padded_len;
	p += hdr_padded_len;

	shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	/* Build the skb directly around the buffer when it is large enough
	 * and leaves room for the shared info, avoiding a copy.
	 */
	if (!NET_IP_ALIGN && len > GOOD_COPY_LEN && tailroom >= shinfo_size) {
		skb = build_skb(buf, truesize);
		if (unlikely(!skb))
			return NULL;

		skb_reserve(skb, p - buf);
		skb_put(skb, len);

		page = (struct page *)page->private;
		if (page)
			give_pages(rq, page);
		goto ok;
	}

	/* copy small packet so we can reuse these pages for small data */
	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
	if (unlikely(!skb))
		return NULL;

	/* Copy the whole frame if it fits skb->head, otherwise
	 * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
	 */
	if (len <= skb_tailroom(skb))
		copy = len;
	else
		copy = ETH_HLEN + metasize;
	skb_put_data(skb, p, copy);

	len -= copy;
	offset += copy;

	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
			page_to_free = page;
		goto ok;
	}

	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
		dev_kfree_skb(skb);
		return NULL;
	}
	BUG_ON(offset >= PAGE_SIZE);
	while (len) {
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
		page = (struct page *)page->private;
		offset = 0;
	}

	if (page)
		give_pages(rq, page);

ok:
	/* hdr_valid means no XDP, so we can copy the vnet header */
	if (hdr_valid) {
		hdr = skb_vnet_hdr(skb);
		memcpy(hdr, hdr_p, hdr_len);
	}
	if (page_to_free)
		put_page(page_to_free);

	if (metasize) {
		__skb_pull(skb, metasize);
		skb_metadata_set(skb, metasize);
	}

	return skb;
}

static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
				   struct send_queue *sq,
				   struct xdp_frame *xdpf)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	int err;

	if (unlikely(xdpf->headroom < vi->hdr_len))
		return -EOVERFLOW;

	/* Make room for virtqueue hdr (also change xdpf->headroom?) */
	xdpf->data -= vi->hdr_len;
	/* Zero header and leave csum up to XDP layers */
	hdr = xdpf->data;
	memset(hdr, 0, vi->hdr_len);
	xdpf->len   += vi->hdr_len;

	sg_init_one(sq->sg, xdpf->data, xdpf->len);

	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp_to_ptr(xdpf),
				   GFP_ATOMIC);
	if (unlikely(err))
		return -ENOSPC; /* Caller handle free/refcnt */

	return 0;
}

/* When vi->curr_queue_pairs > nr_cpu_ids, the txq/sq is only used for xdp tx
 * on the current cpu, so it does not need to be locked.
 *
 * Here we use a macro instead of inline functions because we have to deal
 * with three issues at the same time: 1. the choice of sq, 2. judging and
 * executing the lock/unlock of txq, 3. keeping sparse happy. It is difficult
 * for two inline functions to solve all three at once.
 */
#define virtnet_xdp_get_sq(vi) ({                                       \
	int cpu = smp_processor_id();                                   \
	struct netdev_queue *txq;                                       \
	typeof(vi) v = (vi);                                            \
	unsigned int qp;                                                \
									\
	if (v->curr_queue_pairs > nr_cpu_ids) {                         \
		qp = v->curr_queue_pairs - v->xdp_queue_pairs;          \
		qp += cpu;                                              \
		txq = netdev_get_tx_queue(v->dev, qp);                  \
		__netif_tx_acquire(txq);                                \
	} else {                                                        \
		qp = cpu % v->curr_queue_pairs;                         \
		txq = netdev_get_tx_queue(v->dev, qp);                  \
		__netif_tx_lock(txq, cpu);                              \
	}                                                               \
	v->sq + qp;                                                     \
})

#define virtnet_xdp_put_sq(vi, q) {                                     \
	struct netdev_queue *txq;                                       \
	typeof(vi) v = (vi);                                            \
									\
	txq = netdev_get_tx_queue(v->dev, (q) - v->sq);                 \
	if (v->curr_queue_pairs > nr_cpu_ids)                           \
		__netif_tx_release(txq);                                \
	else                                                            \
		__netif_tx_unlock(txq);                                 \
}

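/* Transmit XDP frames on behalf of the stack: first reclaim any completed
 * buffers (skbs or xdp_frames) from the chosen send virtqueue, then queue up
 * to n frames, and kick the device only when XDP_XMIT_FLUSH is set. Returns
 * the number of frames actually queued, or a negative errno.
 */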
static int virtnet_xdp_xmit(struct net_device *dev,
			    int n, struct xdp_frame **frames, u32 flags)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct receive_queue *rq = vi->rq;
	struct bpf_prog *xdp_prog;
	struct send_queue *sq;
	unsigned int len;
	int packets = 0;
	int bytes = 0;
	int nxmit = 0;
	int kicks = 0;
	void *ptr;
	int ret;
	int i;

	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
	 * indicates XDP resources have been successfully allocated.
	 */
	xdp_prog = rcu_access_pointer(rq->xdp_prog);
	if (!xdp_prog)
		return -ENXIO;

	sq = virtnet_xdp_get_sq(vi);

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
		ret = -EINVAL;
		goto out;
	}

	/* Free up any pending old buffers before queueing new ones. */
	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		if (likely(is_xdp_frame(ptr))) {
			struct xdp_frame *frame = ptr_to_xdp(ptr);

			bytes += frame->len;
			xdp_return_frame(frame);
		} else {
			struct sk_buff *skb = ptr;

			bytes += skb->len;
			napi_consume_skb(skb, false);
		}
		packets++;
	}

	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];

		if (__virtnet_xdp_xmit_one(vi, sq, xdpf))
			break;
		nxmit++;
	}
	ret = nxmit;

	if (flags & XDP_XMIT_FLUSH) {
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
			kicks = 1;
	}
out:
	u64_stats_update_begin(&sq->stats.syncp);
	sq->stats.bytes += bytes;
	sq->stats.packets += packets;
	sq->stats.xdp_tx += n;
	sq->stats.xdp_tx_drops += n - nxmit;
	sq->stats.kicks += kicks;
	u64_stats_update_end(&sq->stats.syncp);

	virtnet_xdp_put_sq(vi, sq);
	return ret;
}

static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
{
	return vi->xdp_enabled ? VIRTIO_XDP_HEADROOM : 0;
}

/* We copy the packet for XDP in the following cases:
 *
 * 1) Packet is scattered across multiple rx buffers.
 * 2) Headroom space is insufficient.
 *
 * This is inefficient but it's a temporary condition that
 * we hit right after XDP is enabled and until queue is refilled
 * with large buffers with sufficient headroom - so it should affect
 * at most queue size packets.
 * Afterwards, the conditions to enable
 * XDP should preclude the underlying device from sending packets
 * across multiple buffers (num_buf > 1), and we make sure buffers
 * have enough headroom.
 */
static struct page *xdp_linearize_page(struct receive_queue *rq,
				       u16 *num_buf,
				       struct page *p,
				       int offset,
				       int page_off,
				       unsigned int *len)
{
	struct page *page = alloc_page(GFP_ATOMIC);

	if (!page)
		return NULL;

	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
	page_off += *len;

	while (--*num_buf) {
		int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		unsigned int buflen;
		void *buf;
		int off;

		buf = virtqueue_get_buf(rq->vq, &buflen);
		if (unlikely(!buf))
			goto err_buf;

		p = virt_to_head_page(buf);
		off = buf - page_address(p);

		/* guard against a misconfigured or uncooperative backend that
		 * is sending packets larger than the MTU.
		 */
		if ((page_off + buflen + tailroom) > PAGE_SIZE) {
			put_page(p);
			goto err_buf;
		}

		memcpy(page_address(page) + page_off,
		       page_address(p) + off, buflen);
		page_off += buflen;
		put_page(p);
	}

	/* Headroom does not contribute to packet length */
	*len = page_off - VIRTIO_XDP_HEADROOM;
	return page;
err_buf:
	__free_pages(page, 0);
	return NULL;
}

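/* Receive path for small (non-mergeable, non-big) buffers: each buffer holds
 * the virtio-net header plus at most GOOD_PACKET_LEN of data. If an XDP
 * program is attached, run it first (linearizing into a fresh page when the
 * pre-XDP buffer lacks the required headroom); otherwise, or on XDP_PASS,
 * wrap the buffer in an skb with build_skb().
 */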
static struct sk_buff *receive_small(struct net_device *dev,
				     struct virtnet_info *vi,
				     struct receive_queue *rq,
				     void *buf, void *ctx,
				     unsigned int len,
				     unsigned int *xdp_xmit,
				     struct virtnet_rq_stats *stats)
{
	struct sk_buff *skb;
	struct bpf_prog *xdp_prog;
	unsigned int xdp_headroom = (unsigned long)ctx;
	unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
	unsigned int headroom = vi->hdr_len + header_offset;
	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	struct page *page = virt_to_head_page(buf);
	unsigned int delta = 0;
	struct page *xdp_page;
	int err;
	unsigned int metasize = 0;

	len -= vi->hdr_len;
	stats->bytes += len;

	if (unlikely(len > GOOD_PACKET_LEN)) {
		pr_debug("%s: rx error: len %u exceeds max size %d\n",
			 dev->name, len, GOOD_PACKET_LEN);
		dev->stats.rx_length_errors++;
		goto err_len;
	}
	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
		struct xdp_frame *xdpf;
		struct xdp_buff xdp;
		void *orig_data;
		u32 act;

		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;

		if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
			int offset = buf - page_address(page) + header_offset;
			unsigned int tlen = len + vi->hdr_len;
			u16 num_buf = 1;

			xdp_headroom = virtnet_get_headroom(vi);
			header_offset = VIRTNET_RX_PAD + xdp_headroom;
			headroom = vi->hdr_len + header_offset;
			buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
				 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
			xdp_page = xdp_linearize_page(rq, &num_buf, page,
						      offset, header_offset,
						      &tlen);
			if (!xdp_page)
				goto err_xdp;

			buf = page_address(xdp_page);
			put_page(page);
			page = xdp_page;
		}

		xdp_init_buff(&xdp, buflen, &rq->xdp_rxq);
		xdp_prepare_buff(&xdp, buf + VIRTNET_RX_PAD + vi->hdr_len,
				 xdp_headroom, len, true);
		orig_data = xdp.data;
		act = bpf_prog_run_xdp(xdp_prog, &xdp);
		stats->xdp_packets++;

		switch (act) {
		case XDP_PASS:
			/* Recalculate length in case bpf program changed it */
			delta = orig_data - xdp.data;
			len = xdp.data_end - xdp.data;
			metasize = xdp.data - xdp.data_meta;
			break;
		case XDP_TX:
			stats->xdp_tx++;
			xdpf = xdp_convert_buff_to_frame(&xdp);
			if (unlikely(!xdpf))
				goto err_xdp;
			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
			if (unlikely(!err)) {
				xdp_return_frame_rx_napi(xdpf);
			} else if (unlikely(err < 0)) {
				trace_xdp_exception(vi->dev, xdp_prog, act);
				goto err_xdp;
			}
			*xdp_xmit |= VIRTIO_XDP_TX;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			stats->xdp_redirects++;
			err = xdp_do_redirect(dev, &xdp, xdp_prog);
			if (err)
				goto err_xdp;
			*xdp_xmit |= VIRTIO_XDP_REDIR;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
			fallthrough;
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
			goto err_xdp;
		case XDP_DROP:
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	skb = build_skb(buf, buflen);
	if (!skb) {
		put_page(page);
		goto err;
	}
	skb_reserve(skb, headroom - delta);
	skb_put(skb, len);
	if (!xdp_prog) {
		buf += header_offset;
		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
	} /* keep zeroed vnet hdr since XDP is loaded */

	if (metasize)
		skb_metadata_set(skb, metasize);

err:
	return skb;

err_xdp:
	rcu_read_unlock();
	stats->xdp_drops++;
err_len:
	stats->drops++;
	put_page(page);
xdp_xmit:
	return NULL;
}

static struct sk_buff *receive_big(struct net_device *dev,
				   struct virtnet_info *vi,
				   struct receive_queue *rq,
				   void *buf,
				   unsigned int len,
				   struct virtnet_rq_stats *stats)
{
	struct page *page = buf;
	struct sk_buff *skb =
		page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0);

	stats->bytes += len - vi->hdr_len;
	if (unlikely(!skb))
		goto err;

	return skb;

err:
	stats->drops++;
	give_pages(rq, page);
	return NULL;
}

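/* Receive path for mergeable rx buffers: the device may spread one packet
 * across num_buffers descriptors, with the first buffer carrying the
 * virtio-net header. An attached XDP program sees a linearized copy when the
 * packet spans buffers or lacks headroom. On the normal path the remaining
 * buffers are appended to the head skb as page fragments (or chained skbs
 * once MAX_SKB_FRAGS is reached).
 */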
static struct sk_buff *receive_mergeable(struct net_device *dev,
					 struct virtnet_info *vi,
					 struct receive_queue *rq,
					 void *buf,
					 void *ctx,
					 unsigned int len,
					 unsigned int *xdp_xmit,
					 struct virtnet_rq_stats *stats)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
	struct sk_buff *head_skb, *curr_skb;
	struct bpf_prog *xdp_prog;
	unsigned int truesize = mergeable_ctx_to_truesize(ctx);
	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
	unsigned int metasize = 0;
	unsigned int frame_sz;
	int err;

	head_skb = NULL;
	stats->bytes += len - vi->hdr_len;

	if (unlikely(len > truesize)) {
		pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
			 dev->name, len, (unsigned long)ctx);
		dev->stats.rx_length_errors++;
		goto err_skb;
	}
	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct xdp_frame *xdpf;
		struct page *xdp_page;
		struct xdp_buff xdp;
		void *data;
		u32 act;

		/* Transient failure which in theory could occur if
		 * in-flight packets from before XDP was enabled reach
		 * the receive path after XDP is loaded.
		 */
		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;

		/* Buffers with headroom use PAGE_SIZE as alloc size,
		 * see add_recvbuf_mergeable() + get_mergeable_buf_len()
		 */
		frame_sz = headroom ? PAGE_SIZE : truesize;

		/* This happens when the rx buffer size was underestimated
		 * or the headroom is not enough because the buffer was
		 * refilled before XDP was set. This should only happen for
		 * the first several packets, so we don't care much about
		 * its performance.
		 */
		if (unlikely(num_buf > 1 ||
			     headroom < virtnet_get_headroom(vi))) {
			/* linearize data for XDP */
			xdp_page = xdp_linearize_page(rq, &num_buf,
						      page, offset,
						      VIRTIO_XDP_HEADROOM,
						      &len);
			frame_sz = PAGE_SIZE;

			if (!xdp_page)
				goto err_xdp;
			offset = VIRTIO_XDP_HEADROOM;
		} else {
			xdp_page = page;
		}

		/* Allow consuming headroom but reserve enough space to push
		 * the descriptor on if we get an XDP_TX return code.
		 */
		data = page_address(xdp_page) + offset;
		xdp_init_buff(&xdp, frame_sz - vi->hdr_len, &rq->xdp_rxq);
		xdp_prepare_buff(&xdp, data - VIRTIO_XDP_HEADROOM + vi->hdr_len,
				 VIRTIO_XDP_HEADROOM, len - vi->hdr_len, true);

		act = bpf_prog_run_xdp(xdp_prog, &xdp);
		stats->xdp_packets++;

		switch (act) {
		case XDP_PASS:
			metasize = xdp.data - xdp.data_meta;

			/* recalculate offset to account for any header
			 * adjustments and minus the metasize to copy the
			 * metadata in page_to_skb(). Note other cases do not
			 * build an skb and avoid using offset
			 */
			offset = xdp.data - page_address(xdp_page) -
				 vi->hdr_len - metasize;

			/* recalculate len if xdp.data, xdp.data_end or
			 * xdp.data_meta were adjusted
			 */
			len = xdp.data_end - xdp.data + vi->hdr_len + metasize;
			/* We can only create skb based on xdp_page. */
			if (unlikely(xdp_page != page)) {
				rcu_read_unlock();
				put_page(page);
				head_skb = page_to_skb(vi, rq, xdp_page, offset,
						       len, PAGE_SIZE, false,
						       metasize,
						       VIRTIO_XDP_HEADROOM);
				return head_skb;
			}
			break;
		case XDP_TX:
			stats->xdp_tx++;
			xdpf = xdp_convert_buff_to_frame(&xdp);
			if (unlikely(!xdpf))
				goto err_xdp;
			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
			if (unlikely(!err)) {
				xdp_return_frame_rx_napi(xdpf);
			} else if (unlikely(err < 0)) {
				trace_xdp_exception(vi->dev, xdp_prog, act);
				if (unlikely(xdp_page != page))
					put_page(xdp_page);
				goto err_xdp;
			}
			*xdp_xmit |= VIRTIO_XDP_TX;
			if (unlikely(xdp_page != page))
				put_page(page);
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			stats->xdp_redirects++;
			err = xdp_do_redirect(dev, &xdp, xdp_prog);
			if (err) {
				if (unlikely(xdp_page != page))
					put_page(xdp_page);
				goto err_xdp;
			}
			*xdp_xmit |= VIRTIO_XDP_REDIR;
			if (unlikely(xdp_page != page))
				put_page(page);
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
			fallthrough;
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
			fallthrough;
		case XDP_DROP:
			if (unlikely(xdp_page != page))
				__free_pages(xdp_page, 0);
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog,
			       metasize, headroom);
	curr_skb = head_skb;

	if (unlikely(!curr_skb))
		goto err_skb;
	while (--num_buf) {
		int num_skb_frags;

		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
		if (unlikely(!buf)) {
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
				 dev->name, num_buf,
				 virtio16_to_cpu(vi->vdev,
						 hdr->num_buffers));
			dev->stats.rx_length_errors++;
			goto err_buf;
		}

		stats->bytes += len;
		page = virt_to_head_page(buf);

		truesize = mergeable_ctx_to_truesize(ctx);
		if (unlikely(len > truesize)) {
			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
				 dev->name, len, (unsigned long)ctx);
			dev->stats.rx_length_errors++;
			goto err_skb;
		}

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);

			if (unlikely(!nskb))
				goto err_skb;
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
			head_skb->truesize += truesize;
		}
		offset = buf - page_address(page);
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
					     len, truesize);
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
					offset, len, truesize);
		}
	}

	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
	return head_skb;

err_xdp:
	rcu_read_unlock();
	stats->xdp_drops++;
err_skb:
	put_page(page);
	while (num_buf-- > 1) {
		buf = virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!buf)) {
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
		stats->bytes += len;
		page = virt_to_head_page(buf);
		put_page(page);
	}
err_buf:
	stats->drops++;
	dev_kfree_skb(head_skb);
xdp_xmit:
	return NULL;
}

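/* Common tail of the receive path: pick the handler matching the buffer
 * layout, then validate the virtio-net header, set up checksum/GSO state
 * and hand the skb to GRO.
 */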
static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
			void *buf, unsigned int len, void **ctx,
			unsigned int *xdp_xmit,
			struct virtnet_rq_stats *stats)
{
	struct net_device *dev = vi->dev;
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;

	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
		if (vi->mergeable_rx_bufs) {
			put_page(virt_to_head_page(buf));
		} else if (vi->big_packets) {
			give_pages(rq, buf);
		} else {
			put_page(virt_to_head_page(buf));
		}
		return;
	}

	if (vi->mergeable_rx_bufs)
		skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit,
					stats);
	else if (vi->big_packets)
		skb = receive_big(dev, vi, rq, buf, len, stats);
	else
		skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats);

	if (unlikely(!skb))
		return;

	hdr = skb_vnet_hdr(skb);

	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
		skb->ip_summed = CHECKSUM_UNNECESSARY;

	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
				  virtio_is_little_endian(vi->vdev))) {
		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
				     dev->name, hdr->hdr.gso_type,
				     hdr->hdr.gso_size);
		goto frame_err;
	}

	skb_record_rx_queue(skb, vq2rxq(rq->vq));
	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

	napi_gro_receive(&rq->napi, skb);
	return;

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
}

/* Unlike mergeable buffers, all buffers are allocated to the
 * same size, except for the headroom. For this reason we do
 * not need to use mergeable_len_to_ctx here - it is enough
 * to store the headroom as the context, ignoring the truesize.
 */
static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
			     gfp_t gfp)
{
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
	void *ctx = (void *)(unsigned long)xdp_headroom;
	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
	int err;

	len = SKB_DATA_ALIGN(len) +
	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
		    vi->hdr_len + GOOD_PACKET_LEN);
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
	if (err < 0)
		put_page(virt_to_head_page(buf));
	return err;
}

static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
			   gfp_t gfp)
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);

	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
		first = get_a_page(rq, gfp);
		if (!first) {
			if (list)
				give_pages(rq, list);
			return -ENOMEM;
		}
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);

		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}

	first = get_a_page(rq, gfp);
	if (!first) {
		give_pages(rq, list);
		return -ENOMEM;
	}
	p = page_address(first);

	/* rq->sg[0], rq->sg[1] share the same page */
	/* a separated rq->sg[0] for header - required in case !any_header_sg */
	sg_set_buf(&rq->sg[0], p, vi->hdr_len);

	/* rq->sg[1] for data packet, from offset */
	offset = sizeof(struct padded_vnet_hdr);
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);

	/* chain first in list head */
	first->private = (unsigned long)list;
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
	if (err < 0)
		give_pages(rq, first);

	return err;
}

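/* Pick the refill buffer size for a mergeable receive queue: when XDP
 * headroom/tailroom is reserved ("room" is non-zero), simply use what is
 * left of the page; otherwise follow the EWMA of recent packet sizes,
 * clamped between the ring's minimum buffer size and PAGE_SIZE minus the
 * header.
 */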
static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
					  struct ewma_pkt_len *avg_pkt_len,
					  unsigned int room)
{
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int len;

	if (room)
		return PAGE_SIZE - room;

	len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
				rq->min_buf_len, PAGE_SIZE - hdr_len);

	return ALIGN(len, L1_CACHE_BYTES);
}

static int add_recvbuf_mergeable(struct virtnet_info *vi,
				 struct receive_queue *rq, gfp_t gfp)
{
	struct page_frag *alloc_frag = &rq->alloc_frag;
	unsigned int headroom = virtnet_get_headroom(vi);
	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
	char *buf;
	void *ctx;
	int err;
	unsigned int len, hole;

	/* Extra tailroom is needed to satisfy XDP's assumption. This
	 * means rx frags coalescing won't work, but since we have
	 * disabled GSO for XDP, it won't be a big issue.
	 */
	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	buf += headroom; /* advance address leaving hole at front of pkt */
	get_page(alloc_frag->page);
	alloc_frag->offset += len + room;
	hole = alloc_frag->size - alloc_frag->offset;
	if (hole < len + room) {
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
		 * the current buffer.
		 */
		len += hole;
		alloc_frag->offset += hole;
	}

	sg_init_one(rq->sg, buf, len);
	ctx = mergeable_len_to_ctx(len, headroom);
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
	if (err < 0)
		put_page(virt_to_head_page(buf));

	return err;
}

/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
{
	int err;
	bool oom;

	do {
		if (vi->mergeable_rx_bufs)
			err = add_recvbuf_mergeable(vi, rq, gfp);
		else if (vi->big_packets)
			err = add_recvbuf_big(vi, rq, gfp);
		else
			err = add_recvbuf_small(vi, rq, gfp);

		oom = err == -ENOMEM;
		if (err)
			break;
	} while (rq->vq->num_free);
	if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) {
		unsigned long flags;

		flags = u64_stats_update_begin_irqsave(&rq->stats.syncp);
		rq->stats.kicks++;
		u64_stats_update_end_irqrestore(&rq->stats.syncp, flags);
	}

	return !oom;
}

static void skb_recv_done(struct virtqueue *rvq)
{
	struct virtnet_info *vi = rvq->vdev->priv;
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];

	virtqueue_napi_schedule(&rq->napi, rvq);
}

static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
{
	napi_enable(napi);

	/* If all buffers were filled by the other side before we enabled
	 * NAPI, we won't get another interrupt, so process any outstanding
	 * packets now. Call local_bh_enable after to trigger softIRQ
	 * processing.
	 */
	local_bh_disable();
	virtqueue_napi_schedule(napi, vq);
	local_bh_enable();
}

static void virtnet_napi_tx_enable(struct virtnet_info *vi,
				   struct virtqueue *vq,
				   struct napi_struct *napi)
{
	if (!napi->weight)
		return;

	/* Tx napi touches cachelines on the cpu handling tx interrupts. Only
	 * enable the feature if this is likely affine with the transmit path.
	 */
	if (!vi->affinity_hint_set) {
		napi->weight = 0;
		return;
	}

	return virtnet_napi_enable(vq, napi);
}

static void virtnet_napi_tx_disable(struct napi_struct *napi)
{
	if (napi->weight)
		napi_disable(napi);
}

static void refill_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
	bool still_empty;
	int i;

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct receive_queue *rq = &vi->rq[i];

		napi_disable(&rq->napi);
		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
		virtnet_napi_enable(rq->vq, &rq->napi);

		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
}

static int virtnet_receive(struct receive_queue *rq, int budget,
			   unsigned int *xdp_xmit)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	struct virtnet_rq_stats stats = {};
	unsigned int len;
	void *buf;
	int i;

	if (!vi->big_packets || vi->mergeable_rx_bufs) {
		void *ctx;

		while (stats.packets < budget &&
		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
			receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
			stats.packets++;
		}
	} else {
		while (stats.packets < budget &&
		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
			receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
			stats.packets++;
		}
	}

	if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
			schedule_delayed_work(&vi->refill, 0);
	}

	u64_stats_update_begin(&rq->stats.syncp);
	for (i = 0; i < VIRTNET_RQ_STATS_LEN; i++) {
		size_t offset = virtnet_rq_stats_desc[i].offset;
		u64 *item;

		item = (u64 *)((u8 *)&rq->stats + offset);
		*item += *(u64 *)((u8 *)&stats + offset);
	}
	u64_stats_update_end(&rq->stats.syncp);

	return stats.packets;
}

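/* Reclaim completed transmit buffers from the send virtqueue. Tokens are
 * either sk_buffs or tagged xdp_frame pointers (see is_xdp_frame()), so free
 * each accordingly and fold the totals into the queue's stats.
 */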
static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
{
	unsigned int len;
	unsigned int packets = 0;
	unsigned int bytes = 0;
	void *ptr;

	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		if (likely(!is_xdp_frame(ptr))) {
			struct sk_buff *skb = ptr;

			pr_debug("Sent skb %p\n", skb);

			bytes += skb->len;
			napi_consume_skb(skb, in_napi);
		} else {
			struct xdp_frame *frame = ptr_to_xdp(ptr);

			bytes += frame->len;
			xdp_return_frame(frame);
		}
		packets++;
	}

	/* Avoid overhead when no packets have been processed; this
	 * happens when called speculatively from start_xmit.
	 */
	if (!packets)
		return;

	u64_stats_update_begin(&sq->stats.syncp);
	sq->stats.bytes += bytes;
	sq->stats.packets += packets;
	u64_stats_update_end(&sq->stats.syncp);
}

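/* Queues in the range [curr_queue_pairs - xdp_queue_pairs, curr_queue_pairs)
 * are dedicated to XDP transmit; the regular skb tx-completion paths below
 * must leave them alone.
 */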
static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
{
	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
		return false;
	else if (q < vi->curr_queue_pairs)
		return true;
	else
		return false;
}

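/* Reclaim tx completions for the same-index send queue from the rx NAPI
 * handler. The do/while with virtqueue_enable_cb_delayed() re-checks after
 * re-enabling callbacks so completions that race with it are not missed.
 */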
static void virtnet_poll_cleantx(struct receive_queue *rq)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	unsigned int index = vq2rxq(rq->vq);
	struct send_queue *sq = &vi->sq[index];
	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);

	if (!sq->napi.weight || is_xdp_raw_buffer_queue(vi, index))
		return;

	if (__netif_tx_trylock(txq)) {
		do {
			virtqueue_disable_cb(sq->vq);
			free_old_xmit_skbs(sq, true);
		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));

		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
			netif_tx_wake_queue(txq);

		__netif_tx_unlock(txq);
	}
}

static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
	struct virtnet_info *vi = rq->vq->vdev->priv;
	struct send_queue *sq;
	unsigned int received;
	unsigned int xdp_xmit = 0;

	virtnet_poll_cleantx(rq);

	received = virtnet_receive(rq, budget, &xdp_xmit);

	/* Out of packets? */
	if (received < budget)
		virtqueue_napi_complete(napi, rq->vq, received);

	if (xdp_xmit & VIRTIO_XDP_REDIR)
		xdp_do_flush();

	if (xdp_xmit & VIRTIO_XDP_TX) {
		sq = virtnet_xdp_get_sq(vi);
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
			u64_stats_update_begin(&sq->stats.syncp);
			sq->stats.kicks++;
			u64_stats_update_end(&sq->stats.syncp);
		}
		virtnet_xdp_put_sq(vi, sq);
	}

	return received;
}

static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i, err;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, vi->rq[i].napi.napi_id);
		if (err < 0)
			return err;

		err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
						 MEM_TYPE_PAGE_SHARED, NULL);
		if (err < 0) {
			xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
			return err;
		}

		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
	}

	return 0;
}

static int virtnet_poll_tx(struct napi_struct *napi, int budget)
{
	struct send_queue *sq = container_of(napi, struct send_queue, napi);
	struct virtnet_info *vi = sq->vq->vdev->priv;
	unsigned int index = vq2txq(sq->vq);
	struct netdev_queue *txq;
	int opaque;
	bool done;

	if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
		/* We don't need to enable cb for XDP */
		napi_complete_done(napi, 0);
		return 0;
	}

	txq = netdev_get_tx_queue(vi->dev, index);
	__netif_tx_lock(txq, raw_smp_processor_id());
	virtqueue_disable_cb(sq->vq);
	free_old_xmit_skbs(sq, true);

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);

	opaque = virtqueue_enable_cb_prepare(sq->vq);

	done = napi_complete_done(napi, 0);

	if (!done)
		virtqueue_disable_cb(sq->vq);

	__netif_tx_unlock(txq);

	if (done) {
		if (unlikely(virtqueue_poll(sq->vq, opaque))) {
			if (napi_schedule_prep(napi)) {
				__netif_tx_lock(txq, raw_smp_processor_id());
				virtqueue_disable_cb(sq->vq);
				__netif_tx_unlock(txq);
				__napi_schedule(napi);
			}
		}
	}

	return 0;
}

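/* Fill the send queue scatterlist for one skb. When the device accepts any
 * header/data layout (any_header_sg) and the skb has aligned, uncloned
 * headroom, the virtio-net header is pushed into the skb's own headroom so
 * header and data share one sg entry; otherwise the header held in skb->cb
 * is used as a separate entry.
 */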
static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
	struct virtnet_info *vi = sq->vq->vdev->priv;
	int num_sg;
	unsigned hdr_len = vi->hdr_len;
	bool can_push;

	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);

	can_push = vi->any_header_sg &&
		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
	else
		hdr = skb_vnet_hdr(skb);

	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
				    virtio_is_little_endian(vi->vdev), false,
				    0))
		return -EPROTO;

	if (vi->mergeable_rx_bufs)
		hdr->num_buffers = 0;

	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
	if (can_push) {
		__skb_push(skb, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
		if (unlikely(num_sg < 0))
			return num_sg;
		/* Pull header back to avoid skew in tx bytes calculations. */
		__skb_pull(skb, hdr_len);
	} else {
		sg_set_buf(sq->sg, hdr, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
		if (unlikely(num_sg < 0))
			return num_sg;
		num_sg++;
	}
	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
}

static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
	int err;
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
	bool kick = !netdev_xmit_more();
	bool use_napi = sq->napi.weight;

	/* Free up any pending old buffers before queueing new ones. */
	do {
		if (use_napi)
			virtqueue_disable_cb(sq->vq);

		free_old_xmit_skbs(sq, false);

	} while (use_napi && kick &&
	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));

	/* timestamp packet in software */
	skb_tx_timestamp(skb);

	/* Try to transmit */
	err = xmit_skb(sq, skb);

	/* This should not happen! */
	if (unlikely(err)) {
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
				 "Unexpected TXQ (%d) queue failure: %d\n",
				 qnum, err);
		dev->stats.tx_dropped++;
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}

	/* Don't wait up for transmitted skbs to be freed. */
	if (!use_napi) {
		skb_orphan(skb);
		nf_reset_ct(skb);
	}

	/* If running out of space, stop queue to avoid getting packets that we
	 * are then unable to transmit.
	 * An alternative would be to force queuing layer to requeue the skb by
	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
	 * returned in a normal path of operation: it means that driver is not
	 * maintaining the TX queue stop/start state properly, and causes
	 * the stack to do a non-trivial amount of useless work.
	 * Since most packets only take 1 or 2 ring slots, stopping the queue
	 * early means 16 slots are typically wasted.
	 */
	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
		netif_stop_subqueue(dev, qnum);
		if (!use_napi &&
		    unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
			/* More just got used, free them then recheck. */
			free_old_xmit_skbs(sq, false);
			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
				netif_start_subqueue(dev, qnum);
				virtqueue_disable_cb(sq->vq);
			}
		}
	}

	if (kick || netif_xmit_stopped(txq)) {
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
			u64_stats_update_begin(&sq->stats.syncp);
			sq->stats.kicks++;
			u64_stats_update_end(&sq->stats.syncp);
		}
	}

	return NETDEV_TX_OK;
}

/*
 * Send command via the control virtqueue and check status.  Commands
 * supported by the hypervisor, as indicated by feature bits, should
 * never fail unless improperly formatted.
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
				 struct scatterlist *out)
{
	struct scatterlist *sgs[4], hdr, stat;
	unsigned out_num = 0, tmp;
	int ret;

	/* Caller should know better */
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));

	vi->ctrl->status = ~0;
	vi->ctrl->hdr.class = class;
	vi->ctrl->hdr.cmd = cmd;
	/* Add header */
	sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr));
	sgs[out_num++] = &hdr;

	if (out)
		sgs[out_num++] = out;

	/* Add return status. */
	sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status));
	sgs[out_num] = &stat;

	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
	ret = virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
	if (ret < 0) {
		dev_warn(&vi->vdev->dev,
			 "Failed to add sgs for command vq: %d.\n", ret);
		return false;
	}

	if (unlikely(!virtqueue_kick(vi->cvq)))
		return vi->ctrl->status == VIRTIO_NET_OK;

	/* Spin for a response, the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
		cpu_relax();

	return vi->ctrl->status == VIRTIO_NET_OK;
}
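/* ndo_set_mac_address: prefer the VIRTIO_NET_CTRL_MAC_ADDR_SET control
 * command; legacy devices without VIRTIO_F_VERSION_1 fall back to
 * writing the MAC into config space one byte at a time.
 */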

static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
	int ret;
	struct sockaddr *addr;
	struct scatterlist sg;

	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
		return -EOPNOTSUPP;

	addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
	if (!addr)
		return -ENOMEM;

	ret = eth_prepare_mac_addr_change(dev, addr);
	if (ret)
		goto out;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
		sg_init_one(&sg, addr->sa_data, dev->addr_len);
		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
			dev_warn(&vdev->dev,
				 "Failed to set mac address by vq command.\n");
			ret = -EINVAL;
			goto out;
		}
	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
		unsigned int i;

		/* Naturally, this has an atomicity problem. */
		for (i = 0; i < dev->addr_len; i++)
			virtio_cwrite8(vdev,
				       offsetof(struct virtio_net_config, mac) +
				       i, addr->sa_data[i]);
	}

	eth_commit_mac_addr_change(dev, p);
	ret = 0;

out:
	kfree(addr);
	return ret;
}
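/* ndo_get_stats64: aggregate the per-queue counters under their
 * u64_stats seqcounts so 64-bit values read consistently on 32-bit
 * machines.
 */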

static void virtnet_stats(struct net_device *dev,
			  struct rtnl_link_stats64 *tot)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int start;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		u64 tpackets, tbytes, terrors, rpackets, rbytes, rdrops;
		struct receive_queue *rq = &vi->rq[i];
		struct send_queue *sq = &vi->sq[i];

		do {
			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
			tpackets = sq->stats.packets;
			tbytes   = sq->stats.bytes;
			terrors  = sq->stats.tx_timeouts;
		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));

		do {
			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
			rpackets = rq->stats.packets;
			rbytes   = rq->stats.bytes;
			rdrops   = rq->stats.drops;
		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes   += rbytes;
		tot->tx_bytes   += tbytes;
		tot->rx_dropped += rdrops;
		tot->tx_errors  += terrors;
	}

	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
}

static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}
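/* Tell the device how many queue pairs to use (VIRTIO_NET_CTRL_MQ).
 * Callers hold RTNL; virtnet_set_queues() below is the locking wrapper.
 */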

static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	struct scatterlist sg;
	struct net_device *dev = vi->dev;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

	vi->ctrl->mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
	sg_init_one(&sg, &vi->ctrl->mq, sizeof(vi->ctrl->mq));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
		dev_warn(&dev->dev, "Failed to set the number of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
	} else {
		vi->curr_queue_pairs = queue_pairs;
		/* virtnet_open() will refill when the device is brought up. */
		if (dev->flags & IFF_UP)
			schedule_delayed_work(&vi->refill, 0);
	}

	return 0;
}

static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	int err;

	rtnl_lock();
	err = _virtnet_set_queues(vi, queue_pairs);
	rtnl_unlock();
	return err;
}

static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);

	for (i = 0; i < vi->max_queue_pairs; i++) {
		xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
		napi_disable(&vi->rq[i].napi);
		virtnet_napi_tx_disable(&vi->sq[i].napi);
	}

	return 0;
}
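/* ndo_set_rx_mode: push the promiscuous/allmulti flags and the unicast
 * plus multicast MAC filter table to the device over the control
 * virtqueue.
 */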

static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg[2];
	struct virtio_net_ctrl_mac *mac_data;
	struct netdev_hw_addr *ha;
	int uc_count;
	int mc_count;
	void *buf;
	int i;

	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

	vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
	vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);

	sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
			 vi->ctrl->promisc ? "en" : "dis");

	sg_init_one(sg, &vi->ctrl->allmulti, sizeof(vi->ctrl->allmulti));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
			 vi->ctrl->allmulti ? "en" : "dis");

	uc_count = netdev_uc_count(dev);
	mc_count = netdev_mc_count(dev);
	/* MAC filter - use one buffer for both lists */
	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
	mac_data = buf;
	if (!buf)
		return;

	sg_init_table(sg, 2);

	/* Store the unicast list and count in the front of the buffer */
	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
	i = 0;
	netdev_for_each_uc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[0], mac_data,
		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));

	/* multicast list and count fill the end */
	mac_data = (void *)&mac_data->macs[uc_count][0];

	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
	i = 0;
	netdev_for_each_mc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[1], mac_data,
		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");

	kfree(buf);
}

static int virtnet_vlan_rx_add_vid(struct net_device *dev,
				   __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
	return 0;
}

static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
				    __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
	return 0;
}

static void virtnet_clean_affinity(struct virtnet_info *vi)
{
	int i;

	if (vi->affinity_hint_set) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtqueue_set_affinity(vi->rq[i].vq, NULL);
			virtqueue_set_affinity(vi->sq[i].vq, NULL);
		}

		vi->affinity_hint_set = false;
	}
}
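/* Spread the online CPUs over the queue pairs in contiguous groups of
 * roughly num_online_cpus() / curr_queue_pairs CPUs, giving the first
 * "stragglers" queues one extra CPU, and mirror the mapping into XPS.
 * Illustrative example: 6 online CPUs and 4 queue pairs yield groups of
 * 2, 2, 1 and 1 CPUs.
 */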

static void virtnet_set_affinity(struct virtnet_info *vi)
{
	cpumask_var_t mask;
	int stragglers;
	int group_size;
	int i, j, cpu;
	int num_cpu;
	int stride;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
		virtnet_clean_affinity(vi);
		return;
	}

	num_cpu = num_online_cpus();
	stride = max_t(int, num_cpu / vi->curr_queue_pairs, 1);
	stragglers = num_cpu >= vi->curr_queue_pairs ?
			num_cpu % vi->curr_queue_pairs :
			0;
	cpu = cpumask_next(-1, cpu_online_mask);

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		group_size = stride + (i < stragglers ? 1 : 0);

		for (j = 0; j < group_size; j++) {
			cpumask_set_cpu(cpu, mask);
			cpu = cpumask_next_wrap(cpu, cpu_online_mask,
						nr_cpu_ids, false);
		}
		virtqueue_set_affinity(vi->rq[i].vq, mask);
		virtqueue_set_affinity(vi->sq[i].vq, mask);
		__netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS);
		cpumask_clear(mask);
	}

	vi->affinity_hint_set = true;
	free_cpumask_var(mask);
}

static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node_dead);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);

	virtnet_clean_affinity(vi);
	return 0;
}

static enum cpuhp_state virtionet_online;

static int virtnet_cpu_notif_add(struct virtnet_info *vi)
{
	int ret;

	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
	if (ret)
		return ret;
	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					       &vi->node_dead);
	if (!ret)
		return ret;
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	return ret;
}

static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
{
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					    &vi->node_dead);
}

static void virtnet_get_ringparam(struct net_device *dev,
				struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}


static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));

}

/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
		return -EINVAL;

	/* For now we don't support modifying channels while XDP is loaded
	 * also when XDP is loaded all RX queues have XDP programs so we only
	 * need to check a single RX queue.
	 */
	if (vi->rq[0].xdp_prog)
		return -EINVAL;

	cpus_read_lock();
	err = _virtnet_set_queues(vi, queue_pairs);
	if (err) {
		cpus_read_unlock();
		goto err;
	}
	virtnet_set_affinity(vi);
	cpus_read_unlock();

	netif_set_real_num_tx_queues(dev, queue_pairs);
	netif_set_real_num_rx_queues(dev, queue_pairs);
 err:
	return err;
}

static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int i, j;
	u8 *p = data;

	switch (stringset) {
	case ETH_SS_STATS:
		for (i = 0; i < vi->curr_queue_pairs; i++) {
			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++)
				ethtool_sprintf(&p, "rx_queue_%u_%s", i,
						virtnet_rq_stats_desc[j].desc);
		}

		for (i = 0; i < vi->curr_queue_pairs; i++) {
			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++)
				ethtool_sprintf(&p, "tx_queue_%u_%s", i,
						virtnet_sq_stats_desc[j].desc);
		}
		break;
	}
}

static int virtnet_get_sset_count(struct net_device *dev, int sset)
{
	struct virtnet_info *vi = netdev_priv(dev);

	switch (sset) {
	case ETH_SS_STATS:
		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
					       VIRTNET_SQ_STATS_LEN);
	default:
		return -EOPNOTSUPP;
	}
}

static void virtnet_get_ethtool_stats(struct net_device *dev,
				      struct ethtool_stats *stats, u64 *data)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int idx = 0, start, i, j;
	const u8 *stats_base;
	size_t offset;

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct receive_queue *rq = &vi->rq[i];

		stats_base = (u8 *)&rq->stats;
		do {
			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
				offset = virtnet_rq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
		idx += VIRTNET_RQ_STATS_LEN;
	}

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct send_queue *sq = &vi->sq[i];

		stats_base = (u8 *)&sq->stats;
		do {
			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) {
				offset = virtnet_sq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
		idx += VIRTNET_SQ_STATS_LEN;
	}
}

static void virtnet_get_channels(struct net_device *dev,
				 struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);

	channels->combined_count = vi->curr_queue_pairs;
	channels->max_combined = vi->max_queue_pairs;
	channels->max_other = 0;
	channels->rx_count = 0;
	channels->tx_count = 0;
	channels->other_count = 0;
}

static int virtnet_set_link_ksettings(struct net_device *dev,
				      const struct ethtool_link_ksettings *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);

	return ethtool_virtdev_set_link_ksettings(dev, cmd,
						  &vi->speed, &vi->duplex);
}

static int virtnet_get_link_ksettings(struct net_device *dev,
				      struct ethtool_link_ksettings *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);

	cmd->base.speed = vi->speed;
	cmd->base.duplex = vi->duplex;
	cmd->base.port = PORT_OTHER;

	return 0;
}
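/* Coalescing is a software knob only: tx-frames 0/1 toggles NAPI-based
 * TX completion (sq->napi.weight); nothing is programmed into the
 * device.
 */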

static int virtnet_set_coalesce(struct net_device *dev,
				struct ethtool_coalesce *ec,
				struct kernel_ethtool_coalesce *kernel_coal,
				struct netlink_ext_ack *extack)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i, napi_weight;

	if (ec->tx_max_coalesced_frames > 1 ||
	    ec->rx_max_coalesced_frames != 1)
		return -EINVAL;

	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
	if (napi_weight ^ vi->sq[0].napi.weight) {
		if (dev->flags & IFF_UP)
			return -EBUSY;
		for (i = 0; i < vi->max_queue_pairs; i++)
			vi->sq[i].napi.weight = napi_weight;
	}

	return 0;
}

static int virtnet_get_coalesce(struct net_device *dev,
				struct ethtool_coalesce *ec,
				struct kernel_ethtool_coalesce *kernel_coal,
				struct netlink_ext_ack *extack)
{
	struct ethtool_coalesce ec_default = {
		.cmd = ETHTOOL_GCOALESCE,
		.rx_max_coalesced_frames = 1,
	};
	struct virtnet_info *vi = netdev_priv(dev);

	memcpy(ec, &ec_default, sizeof(ec_default));

	if (vi->sq[0].napi.weight)
		ec->tx_max_coalesced_frames = 1;

	return 0;
}

static void virtnet_init_settings(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	vi->speed = SPEED_UNKNOWN;
	vi->duplex = DUPLEX_UNKNOWN;
}

static void virtnet_update_settings(struct virtnet_info *vi)
{
	u32 speed;
	u8 duplex;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
		return;

	virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);

	if (ethtool_validate_speed(speed))
		vi->speed = speed;

	virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex);

	if (ethtool_validate_duplex(duplex))
		vi->duplex = duplex;
}

static const struct ethtool_ops virtnet_ethtool_ops = {
	.supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES,
	.get_drvinfo = virtnet_get_drvinfo,
	.get_link = ethtool_op_get_link,
	.get_ringparam = virtnet_get_ringparam,
	.get_strings = virtnet_get_strings,
	.get_sset_count = virtnet_get_sset_count,
	.get_ethtool_stats = virtnet_get_ethtool_stats,
	.set_channels = virtnet_set_channels,
	.get_channels = virtnet_get_channels,
	.get_ts_info = ethtool_op_get_ts_info,
	.get_link_ksettings = virtnet_get_link_ksettings,
	.set_link_ksettings = virtnet_set_link_ksettings,
	.set_coalesce = virtnet_set_coalesce,
	.get_coalesce = virtnet_get_coalesce,
};
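/* Suspend/resume helpers: virtnet_freeze_down() detaches the netdev and
 * quiesces NAPI; virtnet_restore_up() rebuilds the virtqueues, refills
 * the RX rings and re-enables NAPI.
 */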

static void virtnet_freeze_down(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int i;

	/* Make sure no work handler is accessing the device */
	flush_work(&vi->config_work);

	netif_tx_lock_bh(vi->dev);
	netif_device_detach(vi->dev);
	netif_tx_unlock_bh(vi->dev);
	cancel_delayed_work_sync(&vi->refill);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			napi_disable(&vi->rq[i].napi);
			virtnet_napi_tx_disable(&vi->sq[i].napi);
		}
	}
}

static int init_vqs(struct virtnet_info *vi);

static int virtnet_restore_up(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err, i;

	err = init_vqs(vi);
	if (err)
		return err;

	virtio_device_ready(vdev);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->curr_queue_pairs; i++)
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
	}

	netif_tx_lock_bh(vi->dev);
	netif_device_attach(vi->dev);
	netif_tx_unlock_bh(vi->dev);
	return err;
}

static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
{
	struct scatterlist sg;
	vi->ctrl->offloads = cpu_to_virtio64(vi->vdev, offloads);

	sg_init_one(&sg, &vi->ctrl->offloads, sizeof(vi->ctrl->offloads));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
				  VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
		dev_warn(&vi->dev->dev, "Failed to set guest offloads.\n");
		return -EINVAL;
	}

	return 0;
}

static int virtnet_clear_guest_offloads(struct virtnet_info *vi)
{
	u64 offloads = 0;

	if (!vi->guest_offloads)
		return 0;

	return virtnet_set_guest_offloads(vi, offloads);
}

static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
{
	u64 offloads = vi->guest_offloads;

	if (!vi->guest_offloads)
		return 0;

	return virtnet_set_guest_offloads(vi, offloads);
}
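/* Attach or detach an XDP program. The program is refused while GRO_HW
 * style guest offloads cannot be disabled, requires header and data in
 * a single buffer, and wants one extra TX queue per CPU for XDP_TX;
 * if those queues are unavailable the driver falls back to a shared,
 * locked XDP TX queue.
 */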

static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			   struct netlink_ext_ack *extack)
{
	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
	struct virtnet_info *vi = netdev_priv(dev);
	struct bpf_prog *old_prog;
	u16 xdp_qp = 0, curr_qp;
	int i, err;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
	    && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))) {
		NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing GRO_HW/CSUM, disable GRO_HW/CSUM first");
		return -EOPNOTSUPP;
	}

	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
		NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
		return -EINVAL;
	}

	if (dev->mtu > max_sz) {
		NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
		return -EINVAL;
	}

	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
	if (prog)
		xdp_qp = nr_cpu_ids;

	/* XDP requires extra queues for XDP_TX */
	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
		netdev_warn_once(dev, "XDP request %i queues but max is %i. XDP_TX and XDP_REDIRECT will operate in a slower locked tx mode.\n",
				 curr_qp + xdp_qp, vi->max_queue_pairs);
		xdp_qp = 0;
	}

	old_prog = rtnl_dereference(vi->rq[0].xdp_prog);
	if (!prog && !old_prog)
		return 0;

	if (prog)
		bpf_prog_add(prog, vi->max_queue_pairs - 1);

	/* Make sure NAPI is not using any XDP TX queues for RX. */
	if (netif_running(dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			napi_disable(&vi->rq[i].napi);
			virtnet_napi_tx_disable(&vi->sq[i].napi);
		}
	}

	if (!prog) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
			if (i == 0)
				virtnet_restore_guest_offloads(vi);
		}
		synchronize_net();
	}

	err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
	if (err)
		goto err;
	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
	vi->xdp_queue_pairs = xdp_qp;

	if (prog) {
		vi->xdp_enabled = true;
		for (i = 0; i < vi->max_queue_pairs; i++) {
			rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
			if (i == 0 && !old_prog)
				virtnet_clear_guest_offloads(vi);
		}
	} else {
		vi->xdp_enabled = false;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (old_prog)
			bpf_prog_put(old_prog);
		if (netif_running(dev)) {
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
	}

	return 0;

err:
	if (!prog) {
		virtnet_clear_guest_offloads(vi);
		for (i = 0; i < vi->max_queue_pairs; i++)
			rcu_assign_pointer(vi->rq[i].xdp_prog, old_prog);
	}

	if (netif_running(dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
	}
	if (prog)
		bpf_prog_sub(prog, vi->max_queue_pairs - 1);
	return err;
}

static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
	default:
		return -EINVAL;
	}
}

static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
				      size_t len)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int ret;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
		return -EOPNOTSUPP;

	ret = snprintf(buf, len, "sby");
	if (ret >= len)
		return -EOPNOTSUPP;

	return 0;
}

static int virtnet_set_features(struct net_device *dev,
				netdev_features_t features)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u64 offloads;
	int err;

	if ((dev->features ^ features) & NETIF_F_GRO_HW) {
		if (vi->xdp_enabled)
			return -EBUSY;

		if (features & NETIF_F_GRO_HW)
			offloads = vi->guest_offloads_capable;
		else
			offloads = vi->guest_offloads_capable &
				   ~GUEST_OFFLOAD_GRO_HW_MASK;

		err = virtnet_set_guest_offloads(vi, offloads);
		if (err)
			return err;
		vi->guest_offloads = offloads;
	}

	return 0;
}

static void virtnet_tx_timeout(struct net_device *dev, unsigned int txqueue)
{
	struct virtnet_info *priv = netdev_priv(dev);
	struct send_queue *sq = &priv->sq[txqueue];
	struct netdev_queue *txq = netdev_get_tx_queue(dev, txqueue);

	u64_stats_update_begin(&sq->stats.syncp);
	sq->stats.tx_timeouts++;
	u64_stats_update_end(&sq->stats.syncp);

	netdev_err(dev, "TX timeout on queue: %u, sq: %s, vq: 0x%x, name: %s, %u usecs ago\n",
		   txqueue, sq->name, sq->vq->index, sq->vq->name,
		   jiffies_to_usecs(jiffies - txq->trans_start));
}

static const struct net_device_ops virtnet_netdev = {
	.ndo_open            = virtnet_open,
	.ndo_stop   	     = virtnet_close,
	.ndo_start_xmit      = start_xmit,
	.ndo_validate_addr   = eth_validate_addr,
	.ndo_set_mac_address = virtnet_set_mac_address,
	.ndo_set_rx_mode     = virtnet_set_rx_mode,
	.ndo_get_stats64     = virtnet_stats,
	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
	.ndo_bpf		= virtnet_xdp,
	.ndo_xdp_xmit		= virtnet_xdp_xmit,
	.ndo_features_check	= passthru_features_check,
	.ndo_get_phys_port_name	= virtnet_get_phys_port_name,
	.ndo_set_features	= virtnet_set_features,
	.ndo_tx_timeout		= virtnet_tx_timeout,
};

static void virtnet_config_changed_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, config_work);
	u16 v;

	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
				 struct virtio_net_config, status, &v) < 0)
		return;

	if (v & VIRTIO_NET_S_ANNOUNCE) {
		netdev_notify_peers(vi->dev);
		virtnet_ack_link_announce(vi);
	}

	/* Ignore unknown (future) status bits */
	v &= VIRTIO_NET_S_LINK_UP;

	if (vi->status == v)
		return;

	vi->status = v;

	if (vi->status & VIRTIO_NET_S_LINK_UP) {
		virtnet_update_settings(vi);
		netif_carrier_on(vi->dev);
		netif_tx_wake_all_queues(vi->dev);
	} else {
		netif_carrier_off(vi->dev);
		netif_tx_stop_all_queues(vi->dev);
	}
}

static void virtnet_config_changed(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	schedule_work(&vi->config_work);
}

static void virtnet_free_queues(struct virtnet_info *vi)
{
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		__netif_napi_del(&vi->rq[i].napi);
		__netif_napi_del(&vi->sq[i].napi);
	}

	/* We called __netif_napi_del(),
	 * we need to respect an RCU grace period before freeing vi->rq
	 */
	synchronize_net();

	kfree(vi->rq);
	kfree(vi->sq);
	kfree(vi->ctrl);
}

static void _free_receive_bufs(struct virtnet_info *vi)
{
	struct bpf_prog *old_prog;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		while (vi->rq[i].pages)
			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);

		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
		if (old_prog)
			bpf_prog_put(old_prog);
	}
}

static void free_receive_bufs(struct virtnet_info *vi)
{
	rtnl_lock();
	_free_receive_bufs(vi);
	rtnl_unlock();
}

static void free_receive_page_frags(struct virtnet_info *vi)
{
	int i;
	for (i = 0; i < vi->max_queue_pairs; i++)
		if (vi->rq[i].alloc_frag.page)
			put_page(vi->rq[i].alloc_frag.page);
}
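/* Drop any buffers still sitting in the virtqueues. TX entries are
 * either skbs or XDP frames (distinguished by the VIRTIO_XDP_FLAG
 * pointer tag); RX entries are pages or page fragments depending on
 * the receive buffer mode.
 */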

static void free_unused_bufs(struct virtnet_info *vi)
{
	void *buf;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->sq[i].vq;
		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (!is_xdp_frame(buf))
				dev_kfree_skb(buf);
			else
				xdp_return_frame(ptr_to_xdp(buf));
		}
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->rq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (vi->mergeable_rx_bufs) {
				put_page(virt_to_head_page(buf));
			} else if (vi->big_packets) {
				give_pages(&vi->rq[i], buf);
			} else {
				put_page(virt_to_head_page(buf));
			}
		}
	}
}

static void virtnet_del_vqs(struct virtnet_info *vi)
{
	struct virtio_device *vdev = vi->vdev;

	virtnet_clean_affinity(vi);

	vdev->config->del_vqs(vdev);

	virtnet_free_queues(vi);
}

/* How large should a single buffer be so a queue full of these can fit at
 * least one full packet?
 * Logic below assumes the mergeable buffer header is used.
 */
static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
{
	const unsigned int hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int rq_size = virtqueue_get_vring_size(vq);
	unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu;
	unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len;
	unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size);
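	/* Illustrative example: a 256-entry ring, the 12-byte mergeable
	 * header and a 65535-byte max MTU give buf_len = 65565 and
	 * min_buf_len = 257, so the clamp below returns GOOD_PACKET_LEN.
	 */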

	return max(max(min_buf_len, hdr_len) - hdr_len,
		   (unsigned int)GOOD_PACKET_LEN);
}

static int virtnet_find_vqs(struct virtnet_info *vi)
{
	vq_callback_t **callbacks;
	struct virtqueue **vqs;
	int ret = -ENOMEM;
	int i, total_vqs;
	const char **names;
	bool *ctx;

	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
	 * possible control vq.
	 */
	total_vqs = vi->max_queue_pairs * 2 +
		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);

	/* Allocate space for find_vqs parameters */
	vqs = kcalloc(total_vqs, sizeof(*vqs), GFP_KERNEL);
	if (!vqs)
		goto err_vq;
	callbacks = kmalloc_array(total_vqs, sizeof(*callbacks), GFP_KERNEL);
	if (!callbacks)
		goto err_callback;
	names = kmalloc_array(total_vqs, sizeof(*names), GFP_KERNEL);
	if (!names)
		goto err_names;
	if (!vi->big_packets || vi->mergeable_rx_bufs) {
		ctx = kcalloc(total_vqs, sizeof(*ctx), GFP_KERNEL);
		if (!ctx)
			goto err_ctx;
	} else {
		ctx = NULL;
	}

	/* Parameters for control virtqueue, if any */
	if (vi->has_cvq) {
		callbacks[total_vqs - 1] = NULL;
		names[total_vqs - 1] = "control";
	}

	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
		if (ctx)
			ctx[rxq2vq(i)] = true;
	}

	ret = virtio_find_vqs_ctx(vi->vdev, total_vqs, vqs, callbacks,
				  names, ctx, NULL);
	if (ret)
		goto err_find;

	if (vi->has_cvq) {
		vi->cvq = vqs[total_vqs - 1];
		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].vq = vqs[rxq2vq(i)];
		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
		vi->sq[i].vq = vqs[txq2vq(i)];
	}

	/* On success (ret == 0) we fall through: the temporary find_vqs
	 * parameter arrays below are freed in both the success and error
	 * paths.
	 */


err_find:
	kfree(ctx);
err_ctx:
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;
}

static int virtnet_alloc_queues(struct virtnet_info *vi)
{
	int i;

	if (vi->has_cvq) {
		vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL);
		if (!vi->ctrl)
			goto err_ctrl;
	} else {
		vi->ctrl = NULL;
	}
	vi->sq = kcalloc(vi->max_queue_pairs, sizeof(*vi->sq), GFP_KERNEL);
	if (!vi->sq)
		goto err_sq;
	vi->rq = kcalloc(vi->max_queue_pairs, sizeof(*vi->rq), GFP_KERNEL);
	if (!vi->rq)
		goto err_rq;

	INIT_DELAYED_WORK(&vi->refill, refill_work);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].pages = NULL;
		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
			       napi_weight);
		netif_tx_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx,
				  napi_tx ? napi_weight : 0);

		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));

		u64_stats_init(&vi->rq[i].stats.syncp);
		u64_stats_init(&vi->sq[i].stats.syncp);
	}

	return 0;

err_rq:
	kfree(vi->sq);
err_sq:
	kfree(vi->ctrl);
err_ctrl:
	return -ENOMEM;
}

static int init_vqs(struct virtnet_info *vi)
{
	int ret;

	/* Allocate send & receive queues */
	ret = virtnet_alloc_queues(vi);
	if (ret)
		goto err;

	ret = virtnet_find_vqs(vi);
	if (ret)
		goto err_free;

	cpus_read_lock();
	virtnet_set_affinity(vi);
	cpus_read_unlock();

	return 0;

err_free:
	virtnet_free_queues(vi);
err:
	return ret;
}

#ifdef CONFIG_SYSFS
static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
		char *buf)
{
	struct virtnet_info *vi = netdev_priv(queue->dev);
	unsigned int queue_index = get_netdev_rx_queue_index(queue);
	unsigned int headroom = virtnet_get_headroom(vi);
	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
	struct ewma_pkt_len *avg;

	BUG_ON(queue_index >= vi->max_queue_pairs);
	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
	return sprintf(buf, "%u\n",
		       get_mergeable_buf_len(&vi->rq[queue_index], avg,
				       SKB_DATA_ALIGN(headroom + tailroom)));
}

static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
	__ATTR_RO(mergeable_rx_buffer_size);

static struct attribute *virtio_net_mrg_rx_attrs[] = {
	&mergeable_rx_buffer_size_attribute.attr,
	NULL
};

static const struct attribute_group virtio_net_mrg_rx_group = {
	.name = "virtio_net",
	.attrs = virtio_net_mrg_rx_attrs
};
#endif

static bool virtnet_fail_on_feature(struct virtio_device *vdev,
				    unsigned int fbit,
				    const char *fname, const char *dname)
{
	if (!virtio_has_feature(vdev, fbit))
		return false;

	dev_err(&vdev->dev, "device advertises feature %s but not %s",
		fname, dname);

	return true;
}

#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)

static bool virtnet_validate_features(struct virtio_device *vdev)
{
	if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
	    (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
			     "VIRTIO_NET_F_CTRL_VQ"))) {
		return false;
	}

	return true;
}

#define MIN_MTU ETH_MIN_MTU
#define MAX_MTU ETH_MAX_MTU

static int virtnet_validate(struct virtio_device *vdev)
{
	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

	if (!virtnet_validate_features(vdev))
		return -EINVAL;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		int mtu = virtio_cread16(vdev,
					 offsetof(struct virtio_net_config,
						  mtu));
		if (mtu < MIN_MTU)
			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
	}

	return 0;
}
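/* Device probe: pick netdev features from the negotiated virtio feature
 * bits, read the MAC and MTU from config space, allocate the virtqueues
 * and register the net device (optionally behind a net_failover master
 * when VIRTIO_NET_F_STANDBY is negotiated).
 */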

static int virtnet_probe(struct virtio_device *vdev)
{
	int i, err = -ENOMEM;
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;
	int mtu;

	/* Find if host supports multiqueue virtio_net device */
	err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
				   struct virtio_net_config,
				   max_virtqueue_pairs, &max_queue_pairs);

	/* We need at least 2 queues */
	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;

	/* Allocate ourselves a network device with room for our info */
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE |
			   IFF_TX_SKB_NO_LINEAR;
	dev->netdev_ops = &virtnet_netdev;
	dev->features = NETIF_F_HIGHDMA;

	dev->ethtool_ops = &virtnet_ethtool_ops;
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
		/* This opens up the world of extra features. */
		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
		if (csum)
			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
			dev->hw_features |= NETIF_F_TSO
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
		/* Individual feature bits: what can host handle? */
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;

		dev->features |= NETIF_F_GSO_ROBUST;

		if (gso)
			dev->features |= dev->hw_features & NETIF_F_ALL_TSO;
		/* (!csum && gso) case will be fixed by register_netdev() */
	}
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
		dev->features |= NETIF_F_RXCSUM;
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6))
		dev->features |= NETIF_F_GRO_HW;
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS))
		dev->hw_features |= NETIF_F_GRO_HW;

	dev->vlan_features = dev->features;

	/* MTU range: 68 - 65535 */
	dev->min_mtu = MIN_MTU;
	dev->max_mtu = MAX_MTU;

	/* Configuration may specify what MAC to use.  Otherwise random. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
		virtio_cread_bytes(vdev,
				   offsetof(struct virtio_net_config, mac),
				   dev->dev_addr, dev->addr_len);
	else
		eth_hw_addr_random(dev);

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
	vdev->priv = vi;

	INIT_WORK(&vi->config_work, virtnet_config_changed_work);

	/* If we can receive ANY GSO packets, we must allocate large ones. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
		vi->big_packets = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		vi->hdr_len = sizeof(struct virtio_net_hdr);

	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->any_header_sg = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		mtu = virtio_cread16(vdev,
				     offsetof(struct virtio_net_config,
					      mtu));
		if (mtu < dev->min_mtu) {
			/* Should never trigger: MTU was previously validated
			 * in virtnet_validate.
			 */
			dev_err(&vdev->dev,
				"device MTU appears to have changed it is now %d < %d",
				mtu, dev->min_mtu);
			err = -EINVAL;
			goto free;
		}

		dev->mtu = mtu;
		dev->max_mtu = mtu;

		/* TODO: size buffers correctly in this case. */
		if (dev->mtu > ETH_DATA_LEN)
			vi->big_packets = true;
	}

	if (vi->any_header_sg)
		dev->needed_headroom = vi->hdr_len;

	/* Enable multiqueue by default */
	if (num_online_cpus() >= max_queue_pairs)
		vi->curr_queue_pairs = max_queue_pairs;
	else
		vi->curr_queue_pairs = num_online_cpus();
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
	err = init_vqs(vi);
	if (err)
		goto free;

#ifdef CONFIG_SYSFS
	if (vi->mergeable_rx_bufs)
		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
#endif
	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);

	virtnet_init_settings(dev);

	if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
		vi->failover = net_failover_create(vi->dev);
		if (IS_ERR(vi->failover)) {
			err = PTR_ERR(vi->failover);
			goto free_vqs;
		}
	}

	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
		goto free_failover;
	}

	virtio_device_ready(vdev);

	err = virtnet_cpu_notif_add(vi);
	if (err) {
		pr_debug("virtio_net: registering cpu notifier failed\n");
		goto free_unregister_netdev;
	}

	virtnet_set_queues(vi, vi->curr_queue_pairs);

	/* Assume link up if device can't report link status,
	   otherwise get link status from config. */
	netif_carrier_off(dev);
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		schedule_work(&vi->config_work);
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
		virtnet_update_settings(vi);
		netif_carrier_on(dev);
	}

	for (i = 0; i < ARRAY_SIZE(guest_offloads); i++)
		if (virtio_has_feature(vi->vdev, guest_offloads[i]))
			set_bit(guest_offloads[i], &vi->guest_offloads);
	vi->guest_offloads_capable = vi->guest_offloads;

	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
		 dev->name, max_queue_pairs);

	return 0;

free_unregister_netdev:
	vi->vdev->config->reset(vdev);

	unregister_netdev(dev);
free_failover:
	net_failover_destroy(vi->failover);
free_vqs:
	cancel_delayed_work_sync(&vi->refill);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
free:
	free_netdev(dev);
	return err;
}

static void remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);

	/* Free unused buffers in both send and recv, if any. */
	free_unused_bufs(vi);

	free_receive_bufs(vi);

	free_receive_page_frags(vi);

	virtnet_del_vqs(vi);
}

static void virtnet_remove(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device. */
	flush_work(&vi->config_work);

	unregister_netdev(vi->dev);

	net_failover_destroy(vi->failover);

	remove_vq_common(vi);

	free_netdev(vi->dev);
}

static __maybe_unused int virtnet_freeze(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);
	virtnet_freeze_down(vdev);
	remove_vq_common(vi);

	return 0;
}

static __maybe_unused int virtnet_restore(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err;

	err = virtnet_restore_up(vdev);
	if (err)
		return err;
	virtnet_set_queues(vi, vi->curr_queue_pairs);

	err = virtnet_cpu_notif_add(vi);
	if (err) {
		virtnet_freeze_down(vdev);
		remove_vq_common(vi);
		return err;
	}

	return 0;
}

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

#define VIRTNET_FEATURES \
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
	VIRTIO_NET_F_MAC, \
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
	VIRTIO_NET_F_CTRL_MAC_ADDR, \
	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
	VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY

static unsigned int features[] = {
	VIRTNET_FEATURES,
};

static unsigned int features_legacy[] = {
	VIRTNET_FEATURES,
	VIRTIO_NET_F_GSO,
	VIRTIO_F_ANY_LAYOUT,
};

static struct virtio_driver virtio_net_driver = {
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
	.feature_table_legacy = features_legacy,
	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
	.validate =	virtnet_validate,
	.probe =	virtnet_probe,
	.remove =	virtnet_remove,
	.config_changed = virtnet_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze =	virtnet_freeze,
	.restore =	virtnet_restore,
#endif
};

static __init int virtio_net_driver_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
				      virtnet_cpu_online,
				      virtnet_cpu_down_prep);
	if (ret < 0)
		goto out;
	virtionet_online = ret;
	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
				      NULL, virtnet_cpu_dead);
	if (ret)
		goto err_dead;

	ret = register_virtio_driver(&virtio_net_driver);
	if (ret)
		goto err_virtio;
	return 0;
err_virtio:
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
err_dead:
	cpuhp_remove_multi_state(virtionet_online);
out:
	return ret;
}
module_init(virtio_net_driver_init);

static __exit void virtio_net_driver_exit(void)
{
	unregister_virtio_driver(&virtio_net_driver);
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
	cpuhp_remove_multi_state(virtionet_online);
}
module_exit(virtio_net_driver_exit);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");