// SPDX-License-Identifier: GPL-2.0-or-later
/* A network driver using virtio.
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/scatterlist.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/average.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <net/route.h>
#include <net/xdp.h>
#include <net/net_failover.h>

static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);

static bool csum = true, gso = true, napi_tx = true;
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);
module_param(napi_tx, bool, 0644);

/* FIXME: MTU in config. */
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
#define GOOD_COPY_LEN	128

#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
#define VIRTIO_XDP_HEADROOM 256

/* Separating two types of XDP xmit */
#define VIRTIO_XDP_TX		BIT(0)
#define VIRTIO_XDP_REDIR	BIT(1)

#define VIRTIO_XDP_FLAG	BIT(0)

/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
53
 */
54
DECLARE_EWMA(pkt_len, 0, 64)
55

56
#define VIRTNET_DRIVER_VERSION "1.0.0"
57

58 59 60 61
static const unsigned long guest_offloads[] = {
	VIRTIO_NET_F_GUEST_TSO4,
	VIRTIO_NET_F_GUEST_TSO6,
	VIRTIO_NET_F_GUEST_ECN,
62 63
	VIRTIO_NET_F_GUEST_UFO,
	VIRTIO_NET_F_GUEST_CSUM
64
};
65

66
#define GUEST_OFFLOAD_GRO_HW_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
67 68 69 70
				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
				(1ULL << VIRTIO_NET_F_GUEST_ECN)  | \
				(1ULL << VIRTIO_NET_F_GUEST_UFO))

struct virtnet_stat_desc {
	char desc[ETH_GSTRING_LEN];
	size_t offset;
74 75
};

struct virtnet_sq_stats {
	struct u64_stats_sync syncp;
	u64 packets;
	u64 bytes;
80 81
	u64 xdp_tx;
	u64 xdp_tx_drops;
	u64 kicks;
};

85 86
struct virtnet_rq_stats {
	struct u64_stats_sync syncp;
	u64 packets;
	u64 bytes;
89
	u64 drops;
90 91 92 93
	u64 xdp_packets;
	u64 xdp_tx;
	u64 xdp_redirects;
	u64 xdp_drops;
	u64 kicks;
};

#define VIRTNET_SQ_STAT(m)	offsetof(struct virtnet_sq_stats, m)
98
#define VIRTNET_RQ_STAT(m)	offsetof(struct virtnet_rq_stats, m)

static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
101 102 103 104
	{ "packets",		VIRTNET_SQ_STAT(packets) },
	{ "bytes",		VIRTNET_SQ_STAT(bytes) },
	{ "xdp_tx",		VIRTNET_SQ_STAT(xdp_tx) },
	{ "xdp_tx_drops",	VIRTNET_SQ_STAT(xdp_tx_drops) },
	{ "kicks",		VIRTNET_SQ_STAT(kicks) },
};

static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
109 110 111 112 113 114 115
	{ "packets",		VIRTNET_RQ_STAT(packets) },
	{ "bytes",		VIRTNET_RQ_STAT(bytes) },
	{ "drops",		VIRTNET_RQ_STAT(drops) },
	{ "xdp_packets",	VIRTNET_RQ_STAT(xdp_packets) },
	{ "xdp_tx",		VIRTNET_RQ_STAT(xdp_tx) },
	{ "xdp_redirects",	VIRTNET_RQ_STAT(xdp_redirects) },
	{ "xdp_drops",		VIRTNET_RQ_STAT(xdp_drops) },
	{ "kicks",		VIRTNET_RQ_STAT(kicks) },
};

#define VIRTNET_SQ_STATS_LEN	ARRAY_SIZE(virtnet_sq_stats_desc)
#define VIRTNET_RQ_STATS_LEN	ARRAY_SIZE(virtnet_rq_stats_desc)

122 123 124 125 126 127 128
/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send_queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of the send queue: output.$index */
	char name[40];

	struct virtnet_sq_stats stats;

	struct napi_struct napi;
136 137 138 139 140 141 142
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

	struct napi_struct napi;

	struct bpf_prog __rcu *xdp_prog;

	struct virtnet_rq_stats stats;

149 150 151
	/* Chain pages by the private ptr. */
	struct page *pages;

152
	/* Average packet length for mergeable receive buffers. */
	struct ewma_pkt_len mrg_avg_pkt_len;
154

155 156 157
	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

158 159
	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

161 162 163
	/* Min single buffer size for mergeable buffers case. */
	unsigned int min_buf_len;

	/* Name of this receive queue: input.$index */
	char name[40];
166 167

	struct xdp_rxq_info xdp_rxq;
168 169
};

170 171 172 173 174 175 176
/* Control VQ buffers: protected by the rtnl lock */
struct control_buf {
	struct virtio_net_ctrl_hdr hdr;
	virtio_net_ctrl_ack status;
	struct virtio_net_ctrl_mq mq;
	u8 promisc;
	u8 allmulti;
177
	__virtio16 vid;
178
	__virtio64 offloads;
179 180
};

181 182 183 184
struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
	struct send_queue *sq;
	struct receive_queue *rq;
187 188
	unsigned int status;

	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

195 196 197
	/* # of XDP queue pairs currently used by the driver */
	u16 xdp_queue_pairs;

198 199 200
	/* xdp_queue_pairs may be 0, when xdp is already loaded. So add this. */
	bool xdp_enabled;

201 202 203
	/* I like... big packets and I cannot lie! */
	bool big_packets;

204 205 206
	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

	/* Has control virtqueue */
	bool has_cvq;

210 211 212
	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

213 214 215
	/* Packet virtio header size */
	u8 hdr_len;

216 217 218
	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

219 220 221
	/* Work struct for config space updates */
	struct work_struct config_work;

	/* Is the affinity hint set for virtqueues? */
	bool affinity_hint_set;
224

225 226 227
	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;
228

229
	struct control_buf *ctrl;
230 231 232 233

	/* Ethtool settings */
	u8 duplex;
	u32 speed;
234 235

	unsigned long guest_offloads;
236
	unsigned long guest_offloads_capable;
237 238 239

	/* failover when STANDBY feature enabled */
	struct failover *failover;
};

242
struct padded_vnet_hdr {
243
	struct virtio_net_hdr_mrg_rxbuf hdr;
244
	/*
245 246 247
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header.
248
	 */
249
	char padding[4];
250 251
};

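/* TX completion tokens can be either an sk_buff or an xdp_frame. The low
 * bit of the token (VIRTIO_XDP_FLAG) marks xdp_frames so the two can be
 * told apart when buffers are reclaimed.
 */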
static bool is_xdp_frame(void *ptr)
{
	return (unsigned long)ptr & VIRTIO_XDP_FLAG;
}

static void *xdp_to_ptr(struct xdp_frame *ptr)
{
	return (void *)((unsigned long)ptr | VIRTIO_XDP_FLAG);
}

static struct xdp_frame *ptr_to_xdp(void *ptr)
{
	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
}

/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
272
	return (vq->index - 1) / 2;
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
282
	return vq->index / 2;
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}

290
static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
{
292
	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
}

295 296 297 298
/*
 * private is used to chain pages for big packets; put the whole
 * most recently used list at the beginning for reuse
 */
299
static void give_pages(struct receive_queue *rq, struct page *page)
300
{
301
	struct page *end;
302

303
	/* Find end of list, sew whole thing into vi->rq.pages. */
304
	for (end = page; end->private; end = (struct page *)end->private);
305 306
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
307 308
}

309
static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
310
{
311
	struct page *p = rq->pages;
312

313
	if (p) {
314
		rq->pages = (struct page *)p->private;
315 316 317
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
318 319 320 321
		p = alloc_page(gfp_mask);
	return p;
}

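/* NAPI helpers: keep virtqueue callbacks disabled while a poll is pending
 * and re-arm them on completion, rescheduling if new buffers raced in.
 */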
static void virtqueue_napi_schedule(struct napi_struct *napi,
				    struct virtqueue *vq)
{
	if (napi_schedule_prep(napi)) {
		virtqueue_disable_cb(vq);
		__napi_schedule(napi);
	}
}

static void virtqueue_napi_complete(struct napi_struct *napi,
				    struct virtqueue *vq, int processed)
{
	int opaque;

	opaque = virtqueue_enable_cb_prepare(vq);
337 338 339 340 341 342
	if (napi_complete_done(napi, processed)) {
		if (unlikely(virtqueue_poll(vq, opaque)))
			virtqueue_napi_schedule(napi, vq);
	} else {
		virtqueue_disable_cb(vq);
	}
343 344
}

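/* TX virtqueue callback: suppress further interrupts and either kick the
 * TX NAPI instance or wake the subqueue directly.
 */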
static void skb_xmit_done(struct virtqueue *vq)
{
347
	struct virtnet_info *vi = vq->vdev->priv;
	struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;

350
	/* Suppress further interrupts. */
351
	virtqueue_disable_cb(vq);
352

	if (napi->weight)
		virtqueue_napi_schedule(napi, vq);
	else
		/* We were probably waiting for more output buffers. */
		netif_wake_subqueue(vi->dev, vq2txq(vq));
}

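/* For mergeable buffers the per-buffer context is packed into a pointer:
 * the low MRG_CTX_HEADER_SHIFT bits hold the truesize, the upper bits the
 * headroom.
 */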
#define MRG_CTX_HEADER_SHIFT 22
static void *mergeable_len_to_ctx(unsigned int truesize,
				  unsigned int headroom)
{
	return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize);
}

static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx)
{
	return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT;
}

static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
{
	return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1);
}

377
/* Called from bottom half context */
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
				   struct receive_queue *rq,
380
				   struct page *page, unsigned int offset,
381
				   unsigned int len, unsigned int truesize,
382
				   bool hdr_valid, unsigned int metasize,
383
				   unsigned int headroom)
384 385
{
	struct sk_buff *skb;
386
	struct virtio_net_hdr_mrg_rxbuf *hdr;
387
	unsigned int copy, hdr_len, hdr_padded_len;
388
	struct page *page_to_free = NULL;
389
	int tailroom, shinfo_size;
390
	char *p, *hdr_p, *buf;
391

392
	p = page_address(page) + offset;
393
	hdr_p = p;
394

395 396
	hdr_len = vi->hdr_len;
	if (vi->mergeable_rx_bufs)
397
		hdr_padded_len = sizeof(*hdr);
398
	else
399
		hdr_padded_len = sizeof(struct padded_vnet_hdr);
400

401
	/* If headroom is not 0, there is an offset between the beginning of the
402 403
	 * data and the allocated space, otherwise the data and the allocated
	 * space are aligned.
404 405 406
	 *
	 * Buffers with headroom use PAGE_SIZE as alloc size, see
	 * add_recvbuf_mergeable() + get_mergeable_buf_len()
407
	 */
408 409 410
	truesize = headroom ? PAGE_SIZE : truesize;
	tailroom = truesize - len - headroom;
	buf = p - headroom;
411

412
	len -= hdr_len;
413 414
	offset += hdr_padded_len;
	p += hdr_padded_len;
415

416 417
	shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

418
	/* copy small packet so we can reuse these pages */
419
	if (!NET_IP_ALIGN && len > GOOD_COPY_LEN && tailroom >= shinfo_size) {
420
		skb = build_skb(buf, truesize);
421 422 423
		if (unlikely(!skb))
			return NULL;

424
		skb_reserve(skb, p - buf);
425 426 427 428 429 430 431 432 433
		skb_put(skb, len);
		goto ok;
	}

	/* copy small packet so we can reuse these pages for small data */
	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
	if (unlikely(!skb))
		return NULL;

434 435 436 437 438 439 440
	/* Copy the whole frame if it fits skb->head, otherwise
	 * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
	 */
	if (len <= skb_tailroom(skb))
		copy = len;
	else
		copy = ETH_HLEN + metasize;
441
	skb_put_data(skb, p, copy);
442

443 444
	len -= copy;
	offset += copy;
445

446 447 448 449
	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
450
			page_to_free = page;
451
		goto ok;
452 453
	}

454 455 456 457 458 459 460
	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
461
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
462 463 464
		dev_kfree_skb(skb);
		return NULL;
	}
465
	BUG_ON(offset >= PAGE_SIZE);
466
	while (len) {
467 468 469 470
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
471 472 473
		page = (struct page *)page->private;
		offset = 0;
	}
474

475
	if (page)
476
		give_pages(rq, page);
477

478 479 480 481 482 483
ok:
	/* hdr_valid means no XDP, so we can copy the vnet header */
	if (hdr_valid) {
		hdr = skb_vnet_hdr(skb);
		memcpy(hdr, hdr_p, hdr_len);
	}
484 485
	if (page_to_free)
		put_page(page_to_free);
486 487 488 489 490 491

	if (metasize) {
		__skb_pull(skb, metasize);
		skb_metadata_set(skb, metasize);
	}

492 493
	return skb;
}
494

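/* Queue a single xdp_frame on a send queue, prepending a zeroed virtio-net
 * header in front of the frame data. Returns 0 or a negative errno; the
 * caller keeps ownership of the frame on failure.
 */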
static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
				   struct send_queue *sq,
				   struct xdp_frame *xdpf)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	int err;

502 503 504 505 506
	if (unlikely(xdpf->headroom < vi->hdr_len))
		return -EOVERFLOW;

	/* Make room for virtqueue hdr (also change xdpf->headroom?) */
	xdpf->data -= vi->hdr_len;
507
	/* Zero header and leave csum up to XDP layers */
508
	hdr = xdpf->data;
509
	memset(hdr, 0, vi->hdr_len);
510
	xdpf->len   += vi->hdr_len;
511

512
	sg_init_one(sq->sg, xdpf->data, xdpf->len);
513

514 515
	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp_to_ptr(xdpf),
				   GFP_ATOMIC);
516
	if (unlikely(err))
517
		return -ENOSPC; /* Caller handle free/refcnt */

519
	return 0;
}

522 523 524 525 526 527 528 529 530
/* When vi->curr_queue_pairs > nr_cpu_ids, the txq/sq is only used for XDP tx on
 * the current cpu, so it does not need to be locked.
 *
 * Here we use a macro instead of inline functions because we have to deal with
 * three issues at the same time: 1. the choice of sq, 2. conditionally taking and
 * releasing the txq lock, and 3. keeping sparse happy. It is difficult for inline
 * functions to solve all three cleanly at the same time.
 */
#define virtnet_xdp_get_sq(vi) ({                                       \
531
	int cpu = smp_processor_id();                                   \
532 533 534 535 536 537
	struct netdev_queue *txq;                                       \
	typeof(vi) v = (vi);                                            \
	unsigned int qp;                                                \
									\
	if (v->curr_queue_pairs > nr_cpu_ids) {                         \
		qp = v->curr_queue_pairs - v->xdp_queue_pairs;          \
538
		qp += cpu;                                              \
539 540 541
		txq = netdev_get_tx_queue(v->dev, qp);                  \
		__netif_tx_acquire(txq);                                \
	} else {                                                        \
542
		qp = cpu % v->curr_queue_pairs;                         \
543
		txq = netdev_get_tx_queue(v->dev, qp);                  \
544
		__netif_tx_lock(txq, cpu);                              \
545 546 547 548 549 550 551 552 553 554 555 556 557
	}                                                               \
	v->sq + qp;                                                     \
})

#define virtnet_xdp_put_sq(vi, q) {                                     \
	struct netdev_queue *txq;                                       \
	typeof(vi) v = (vi);                                            \
									\
	txq = netdev_get_tx_queue(v->dev, (q) - v->sq);                 \
	if (v->curr_queue_pairs > nr_cpu_ids)                           \
		__netif_tx_release(txq);                                \
	else                                                            \
		__netif_tx_unlock(txq);                                 \
558 559
}

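/* ndo_xdp_xmit handler: reclaim completed TX buffers, then queue a batch of
 * xdp_frames. Only permitted while an XDP program is attached to the device.
 */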
static int virtnet_xdp_xmit(struct net_device *dev,
561
			    int n, struct xdp_frame **frames, u32 flags)
{
	struct virtnet_info *vi = netdev_priv(dev);
564 565
	struct receive_queue *rq = vi->rq;
	struct bpf_prog *xdp_prog;
566 567
	struct send_queue *sq;
	unsigned int len;
568 569
	int packets = 0;
	int bytes = 0;
570
	int nxmit = 0;
	int kicks = 0;
572
	void *ptr;
573
	int ret;
574 575
	int i;

576 577 578
	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
	 * indicate XDP resources have been successfully allocated.
	 */
579
	xdp_prog = rcu_access_pointer(rq->xdp_prog);
580 581 582
	if (!xdp_prog)
		return -ENXIO;

583
	sq = virtnet_xdp_get_sq(vi);
584 585 586

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
		ret = -EINVAL;
587 588
		goto out;
	}
589

590
	/* Free up any pending old buffers before queueing new ones. */
591
	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
592 593 594 595 596 597 598 599 600 601 602 603
		if (likely(is_xdp_frame(ptr))) {
			struct xdp_frame *frame = ptr_to_xdp(ptr);

			bytes += frame->len;
			xdp_return_frame(frame);
		} else {
			struct sk_buff *skb = ptr;

			bytes += skb->len;
			napi_consume_skb(skb, false);
		}
		packets++;
604
	}
605 606 607 608

	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];

609 610 611
		if (__virtnet_xdp_xmit_one(vi, sq, xdpf))
			break;
		nxmit++;
612
	}
613
	ret = nxmit;
614

	if (flags & XDP_XMIT_FLUSH) {
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
			kicks = 1;
	}
619 620
out:
	u64_stats_update_begin(&sq->stats.syncp);
621 622
	sq->stats.bytes += bytes;
	sq->stats.packets += packets;
623
	sq->stats.xdp_tx += n;
624
	sq->stats.xdp_tx_drops += n - nxmit;
	sq->stats.kicks += kicks;
626
	u64_stats_update_end(&sq->stats.syncp);
627

628
	virtnet_xdp_put_sq(vi, sq);
629
	return ret;
}

632 633
static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
{
634
	return vi->xdp_enabled ? VIRTIO_XDP_HEADROOM : 0;
635 636
}

637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666
/* We copy the packet for XDP in the following cases:
 *
 * 1) Packet is scattered across multiple rx buffers.
 * 2) Headroom space is insufficient.
 *
 * This is inefficient but it's a temporary condition that
 * we hit right after XDP is enabled and until the queue is refilled
 * with large buffers with sufficient headroom - so it should affect
 * at most queue-size packets.
 * Afterwards, the conditions to enable
 * XDP should preclude the underlying device from sending packets
 * across multiple buffers (num_buf > 1), and we make sure buffers
 * have enough headroom.
 */
static struct page *xdp_linearize_page(struct receive_queue *rq,
				       u16 *num_buf,
				       struct page *p,
				       int offset,
				       int page_off,
				       unsigned int *len)
{
	struct page *page = alloc_page(GFP_ATOMIC);

	if (!page)
		return NULL;

	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
	page_off += *len;

	while (--*num_buf) {
667
		int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
668 669 670 671 672 673 674 675 676 677 678 679 680 681
		unsigned int buflen;
		void *buf;
		int off;

		buf = virtqueue_get_buf(rq->vq, &buflen);
		if (unlikely(!buf))
			goto err_buf;

		p = virt_to_head_page(buf);
		off = buf - page_address(p);

		/* guard against a misconfigured or uncooperative backend that
		 * is sending packet larger than the MTU.
		 */
682
		if ((page_off + buflen + tailroom) > PAGE_SIZE) {
683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700
			put_page(p);
			goto err_buf;
		}

		memcpy(page_address(page) + page_off,
		       page_address(p) + off, buflen);
		page_off += buflen;
		put_page(p);
	}

	/* Headroom does not contribute to packet length */
	*len = page_off - VIRTIO_XDP_HEADROOM;
	return page;
err_buf:
	__free_pages(page, 0);
	return NULL;
}

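/* Receive path for small (single-buffer) packets: optionally run the
 * attached XDP program, then build an skb around the buffer.
 */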
static struct sk_buff *receive_small(struct net_device *dev,
				     struct virtnet_info *vi,
				     struct receive_queue *rq,
704
				     void *buf, void *ctx,
				     unsigned int len,
706
				     unsigned int *xdp_xmit,
707
				     struct virtnet_rq_stats *stats)
708
{
709
	struct sk_buff *skb;
710
	struct bpf_prog *xdp_prog;
711
	unsigned int xdp_headroom = (unsigned long)ctx;
712 713 714 715
	unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
	unsigned int headroom = vi->hdr_len + header_offset;
	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
716
	struct page *page = virt_to_head_page(buf);
717
	unsigned int delta = 0;
718
	struct page *xdp_page;
719
	int err;
720
	unsigned int metasize = 0;
721

722
	len -= vi->hdr_len;
723
	stats->bytes += len;
724

725 726 727 728 729 730
	if (unlikely(len > GOOD_PACKET_LEN)) {
		pr_debug("%s: rx error: len %u exceeds max size %d\n",
			 dev->name, len, GOOD_PACKET_LEN);
		dev->stats.rx_length_errors++;
		goto err_len;
	}
731 732 733
	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
734
		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
735
		struct xdp_frame *xdpf;
736
		struct xdp_buff xdp;
737
		void *orig_data;
738 739
		u32 act;

740
		if (unlikely(hdr->hdr.gso_type))
741
			goto err_xdp;
742

743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763
		if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
			int offset = buf - page_address(page) + header_offset;
			unsigned int tlen = len + vi->hdr_len;
			u16 num_buf = 1;

			xdp_headroom = virtnet_get_headroom(vi);
			header_offset = VIRTNET_RX_PAD + xdp_headroom;
			headroom = vi->hdr_len + header_offset;
			buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
				 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
			xdp_page = xdp_linearize_page(rq, &num_buf, page,
						      offset, header_offset,
						      &tlen);
			if (!xdp_page)
				goto err_xdp;

			buf = page_address(xdp_page);
			put_page(page);
			page = xdp_page;
		}

764
		xdp_init_buff(&xdp, buflen, &rq->xdp_rxq);
765 766
		xdp_prepare_buff(&xdp, buf + VIRTNET_RX_PAD + vi->hdr_len,
				 xdp_headroom, len, true);
767
		orig_data = xdp.data;
768
		act = bpf_prog_run_xdp(xdp_prog, &xdp);
769
		stats->xdp_packets++;
770

771 772
		switch (act) {
		case XDP_PASS:
773
			/* Recalculate length in case bpf program changed it */
774
			delta = orig_data - xdp.data;
775
			len = xdp.data_end - xdp.data;
776
			metasize = xdp.data - xdp.data_meta;
777 778
			break;
		case XDP_TX:
779
			stats->xdp_tx++;
780
			xdpf = xdp_convert_buff_to_frame(&xdp);
781 782
			if (unlikely(!xdpf))
				goto err_xdp;
783
			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
784 785 786
			if (unlikely(!err)) {
				xdp_return_frame_rx_napi(xdpf);
			} else if (unlikely(err < 0)) {
787
				trace_xdp_exception(vi->dev, xdp_prog, act);
788 789
				goto err_xdp;
			}
790
			*xdp_xmit |= VIRTIO_XDP_TX;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
794
			stats->xdp_redirects++;
			err = xdp_do_redirect(dev, &xdp, xdp_prog);
796 797
			if (err)
				goto err_xdp;
798
			*xdp_xmit |= VIRTIO_XDP_REDIR;
799 800 801
			rcu_read_unlock();
			goto xdp_xmit;
		default:
802
			bpf_warn_invalid_xdp_action(act);
803
			fallthrough;
804 805
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
806
			goto err_xdp;
807
		case XDP_DROP:
808 809 810 811 812
			goto err_xdp;
		}
	}
	rcu_read_unlock();

813 814
	skb = build_skb(buf, buflen);
	if (!skb) {
815
		put_page(page);
816 817 818
		goto err;
	}
	skb_reserve(skb, headroom - delta);
819
	skb_put(skb, len);
820
	if (!xdp_prog) {
821 822
		buf += header_offset;
		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
823
	} /* keep zeroed vnet hdr since XDP is loaded */
824

825 826 827
	if (metasize)
		skb_metadata_set(skb, metasize);

828
err:
829
	return skb;
830 831 832

err_xdp:
	rcu_read_unlock();
833
	stats->xdp_drops++;
834
err_len:
835
	stats->drops++;
836
	put_page(page);
837 838
xdp_xmit:
	return NULL;
839 840 841
}

static struct sk_buff *receive_big(struct net_device *dev,
				   struct virtnet_info *vi,
843 844
				   struct receive_queue *rq,
				   void *buf,
845
				   unsigned int len,
846
				   struct virtnet_rq_stats *stats)
847 848
{
	struct page *page = buf;
849
	struct sk_buff *skb =
850
		page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0);

852
	stats->bytes += len - vi->hdr_len;
853 854 855 856 857 858
	if (unlikely(!skb))
		goto err;

	return skb;

err:
859
	stats->drops++;
860 861 862 863
	give_pages(rq, page);
	return NULL;
}

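/* Receive path for mergeable rx buffers: a packet may span several buffers
 * (hdr->num_buffers), which are stitched into a frag list. XDP is run only
 * on linear packets, linearizing into a fresh page first when needed.
 */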
static struct sk_buff *receive_mergeable(struct net_device *dev,
					 struct virtnet_info *vi,
866
					 struct receive_queue *rq,
867 868
					 void *buf,
					 void *ctx,
					 unsigned int len,
870
					 unsigned int *xdp_xmit,
871
					 struct virtnet_rq_stats *stats)
872
{
873 874
	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
875 876
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
	struct sk_buff *head_skb, *curr_skb;
	struct bpf_prog *xdp_prog;
879
	unsigned int truesize = mergeable_ctx_to_truesize(ctx);
880
	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
881
	unsigned int metasize = 0;
882 883
	unsigned int frame_sz;
	int err;

	head_skb = NULL;
886
	stats->bytes += len - vi->hdr_len;

888 889 890 891 892 893
	if (unlikely(len > truesize)) {
		pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
			 dev->name, len, (unsigned long)ctx);
		dev->stats.rx_length_errors++;
		goto err_skb;
	}
	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
897
		struct xdp_frame *xdpf;
898
		struct page *xdp_page;
899 900
		struct xdp_buff xdp;
		void *data;
		u32 act;

903 904 905 906 907 908 909
		/* Transient failure which in theory could occur if
		 * in-flight packets from before XDP was enabled reach
		 * the receive path after XDP is loaded.
		 */
		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;

910 911 912 913 914
		/* Buffers with headroom use PAGE_SIZE as alloc size,
		 * see add_recvbuf_mergeable() + get_mergeable_buf_len()
		 */
		frame_sz = headroom ? PAGE_SIZE : truesize;

915 916 917 918 919 920
		/* This happens when rx buffer size is underestimated
		 * or headroom is not enough because the buffer
		 * was refilled before XDP was set. This should only
		 * happen for the first several packets, so we don't
		 * care much about its performance.
		 */
921 922
		if (unlikely(num_buf > 1 ||
			     headroom < virtnet_get_headroom(vi))) {
923
			/* linearize data for XDP */
924
			xdp_page = xdp_linearize_page(rq, &num_buf,
925 926 927
						      page, offset,
						      VIRTIO_XDP_HEADROOM,
						      &len);
928 929
			frame_sz = PAGE_SIZE;

930 931
			if (!xdp_page)
				goto err_xdp;
932
			offset = VIRTIO_XDP_HEADROOM;
933 934
		} else {
			xdp_page = page;
		}

937 938 939
		/* Allow consuming headroom but reserve enough space to push
		 * the descriptor on if we get an XDP_TX return code.
		 */
940
		data = page_address(xdp_page) + offset;
941
		xdp_init_buff(&xdp, frame_sz - vi->hdr_len, &rq->xdp_rxq);
942 943
		xdp_prepare_buff(&xdp, data - VIRTIO_XDP_HEADROOM + vi->hdr_len,
				 VIRTIO_XDP_HEADROOM, len - vi->hdr_len, true);
944

945
		act = bpf_prog_run_xdp(xdp_prog, &xdp);
946
		stats->xdp_packets++;
947

		switch (act) {
		case XDP_PASS:
950 951
			metasize = xdp.data - xdp.data_meta;

952
			/* recalculate offset to account for any header
953 954 955
			 * adjustments and minus the metasize to copy the
			 * metadata in page_to_skb(). Note other cases do not
			 * build an skb and avoid using offset
956
			 */
957 958
			offset = xdp.data - page_address(xdp_page) -
				 vi->hdr_len - metasize;
959

960 961
			/* recalculate len if xdp.data, xdp.data_end or
			 * xdp.data_meta were adjusted
962
			 */
963
			len = xdp.data_end - xdp.data + vi->hdr_len + metasize;
964 965 966 967
			/* We can only create skb based on xdp_page. */
			if (unlikely(xdp_page != page)) {
				rcu_read_unlock();
				put_page(page);
968 969
				head_skb = page_to_skb(vi, rq, xdp_page, offset,
						       len, PAGE_SIZE, false,
970 971
						       metasize,
						       VIRTIO_XDP_HEADROOM);
972 973
				return head_skb;
			}
			break;
		case XDP_TX:
976
			stats->xdp_tx++;
977
			xdpf = xdp_convert_buff_to_frame(&xdp);
978 979
			if (unlikely(!xdpf))
				goto err_xdp;
980
			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
981 982 983
			if (unlikely(!err)) {
				xdp_return_frame_rx_napi(xdpf);
			} else if (unlikely(err < 0)) {
984
				trace_xdp_exception(vi->dev, xdp_prog, act);
985 986 987 988
				if (unlikely(xdp_page != page))
					put_page(xdp_page);
				goto err_xdp;
			}
989
			*xdp_xmit |= VIRTIO_XDP_TX;
990
			if (unlikely(xdp_page != page))
991
				put_page(page);
			rcu_read_unlock();
			goto xdp_xmit;
994
		case XDP_REDIRECT:
995
			stats->xdp_redirects++;
996 997 998 999 1000 1001
			err = xdp_do_redirect(dev, &xdp, xdp_prog);
			if (err) {
				if (unlikely(xdp_page != page))
					put_page(xdp_page);
				goto err_xdp;
			}
1002
			*xdp_xmit |= VIRTIO_XDP_REDIR;
1003
			if (unlikely(xdp_page != page))
1004
				put_page(page);
1005 1006
			rcu_read_unlock();
			goto xdp_xmit;
		default:
1008
			bpf_warn_invalid_xdp_action(act);
1009
			fallthrough;
1010 1011
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
1012
			fallthrough;
1013
		case XDP_DROP:
1014 1015
			if (unlikely(xdp_page != page))
				__free_pages(xdp_page, 0);
			goto err_xdp;
		}
	}
	rcu_read_unlock();
1020

1021
	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog,
1022
			       metasize, headroom);
	curr_skb = head_skb;
1024

1025 1026
	if (unlikely(!curr_skb))
		goto err_skb;
1027
	while (--num_buf) {
1028 1029
		int num_skb_frags;

1030
		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
1031
		if (unlikely(!buf)) {
1032
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
				 dev->name, num_buf,
1034 1035
				 virtio16_to_cpu(vi->vdev,
						 hdr->num_buffers));
1036 1037
			dev->stats.rx_length_errors++;
			goto err_buf;
1038
		}
1039

1040
		stats->bytes += len;
1041
		page = virt_to_head_page(buf);
1042 1043 1044

		truesize = mergeable_ctx_to_truesize(ctx);
		if (unlikely(len > truesize)) {
1045
			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
1046 1047 1048 1049
				 dev->name, len, (unsigned long)ctx);
			dev->stats.rx_length_errors++;
			goto err_skb;
		}
1050 1051

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
1052 1053
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
1054 1055 1056

			if (unlikely(!nskb))
				goto err_skb;
1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
1068
			head_skb->truesize += truesize;
1069
		}
1070
		offset = buf - page_address(page);
1071 1072 1073
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
1074
					     len, truesize);
1075 1076
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
1077
					offset, len, truesize);
1078
		}
1079 1080
	}

	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
1082 1083
	return head_skb;

err_xdp:
	rcu_read_unlock();
1086
	stats->xdp_drops++;
1087 1088
err_skb:
	put_page(page);
1089
	while (num_buf-- > 1) {
1090 1091
		buf = virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!buf)) {
1092 1093 1094 1095 1096
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
1097
		stats->bytes += len;
1098
		page = virt_to_head_page(buf);
1099
		put_page(page);
1100
	}
1101
err_buf:
1102
	stats->drops++;
1103
	dev_kfree_skb(head_skb);
xdp_xmit:
1105
	return NULL;
1106 1107
}

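/* Dispatch a completed rx buffer to the matching receive routine and hand
 * the resulting skb to the stack via GRO.
 */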
static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
			void *buf, unsigned int len, void **ctx,
1110
			unsigned int *xdp_xmit,
1111
			struct virtnet_rq_stats *stats)
1112
{
1113
	struct net_device *dev = vi->dev;
1114
	struct sk_buff *skb;
1115
	struct virtio_net_hdr_mrg_rxbuf *hdr;
1116

1117
	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
1118 1119
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
1120
		if (vi->mergeable_rx_bufs) {
1121
			put_page(virt_to_head_page(buf));
1122
		} else if (vi->big_packets) {
1123
			give_pages(rq, buf);
1124
		} else {
1125
			put_page(virt_to_head_page(buf));
1126
		}
1127
		return;
1128
	}
1129

1130
	if (vi->mergeable_rx_bufs)
1131
		skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit,
1132
					stats);
1133
	else if (vi->big_packets)
1134
		skb = receive_big(dev, vi, rq, buf, len, stats);
1135
	else
1136
		skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats);
1137 1138

	if (unlikely(!skb))
1139
		return;
1140

1141
	hdr = skb_vnet_hdr(skb);
1142

1143
	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
1144
		skb->ip_summed = CHECKSUM_UNNECESSARY;

1146 1147 1148 1149 1150 1151
	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
				  virtio_is_little_endian(vi->vdev))) {
		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
				     dev->name, hdr->hdr.gso_type,
				     hdr->hdr.gso_size);
		goto frame_err;
	}

1154
	skb_record_rx_queue(skb, vq2rxq(rq->vq));
1155 1156 1157 1158
	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

	napi_gro_receive(&rq->napi, skb);
1160
	return;

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
}

1167 1168 1169 1170 1171
/* Unlike mergeable buffers, all buffers are allocated with the
 * same size, except for the headroom. For this reason we do
 * not need to use mergeable_len_to_ctx here - it is enough
 * to store the headroom as the context, ignoring the truesize.
 */
static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
			     gfp_t gfp)
{
1175 1176
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
1177
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
1178
	void *ctx = (void *)(unsigned long)xdp_headroom;
1179
	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
1180
	int err;
1181

1182 1183 1184
	len = SKB_DATA_ALIGN(len) +
	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
1185
		return -ENOMEM;

1187 1188 1189 1190 1191
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
		    vi->hdr_len + GOOD_PACKET_LEN);
1192
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
1193
	if (err < 0)
1194
		put_page(virt_to_head_page(buf));
1195 1196
	return err;
}
1197

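/* Post a chain of pages as one receive buffer for big-packets mode: the
 * first page carries the virtio header and the start of the data, the rest
 * hold the remaining fragments.
 */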
static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
			   gfp_t gfp)
1200 1201 1202 1203 1204
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

1205 1206
	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);

1207
	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
1208
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
1209
		first = get_a_page(rq, gfp);
1210 1211
		if (!first) {
			if (list)
1212
				give_pages(rq, list);
1213
			return -ENOMEM;
1214
		}
1215
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
1216

1217 1218 1219 1220
		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}

1222
	first = get_a_page(rq, gfp);
1223
	if (!first) {
1224
		give_pages(rq, list);
1225 1226 1227 1228
		return -ENOMEM;
	}
	p = page_address(first);

1229
	/* rq->sg[0], rq->sg[1] share the same page */
1230 1231
	/* a separated rq->sg[0] for header - required in case !any_header_sg */
	sg_set_buf(&rq->sg[0], p, vi->hdr_len);
1232

1233
	/* rq->sg[1] for data packet, from offset */
1234
	offset = sizeof(struct padded_vnet_hdr);
1235
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
1236 1237 1238

	/* chain first in list head */
	first->private = (unsigned long)list;
1239 1240
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
1241
	if (err < 0)
1242
		give_pages(rq, first);
1243 1244

	return err;
}

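/* Choose an rx buffer length for mergeable buffers: PAGE_SIZE minus the
 * reserved room when headroom/tailroom is in use, otherwise a length derived
 * from the EWMA of recent packet sizes, clamped between rq->min_buf_len and
 * PAGE_SIZE - hdr_len and cache-line aligned.
 */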
static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
1248 1249
					  struct ewma_pkt_len *avg_pkt_len,
					  unsigned int room)
1250
{
1251
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1252 1253
	unsigned int len;

1254 1255 1256 1257
	if (room)
		return PAGE_SIZE - room;

	len = hdr_len +	clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
1258
				rq->min_buf_len, PAGE_SIZE - hdr_len);
1259

1260
	return ALIGN(len, L1_CACHE_BYTES);
1261 1262
}

1263 1264
static int add_recvbuf_mergeable(struct virtnet_info *vi,
				 struct receive_queue *rq, gfp_t gfp)
1265
{
1266
	struct page_frag *alloc_frag = &rq->alloc_frag;
1267
	unsigned int headroom = virtnet_get_headroom(vi);
1268 1269
	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
1270
	char *buf;
1271
	void *ctx;
1272
	int err;
1273
	unsigned int len, hole;
1274

1275 1276 1277 1278 1279 1280
	/* Extra tailroom is needed to satisfy XDP's assumption. This
	 * means rx frags coalescing won't work, but consider we've
	 * disabled GSO for XDP, it won't be a big issue.
	 */
	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
1281
		return -ENOMEM;
1282

1283
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
1284
	buf += headroom; /* advance address leaving hole at front of pkt */
1285
	get_page(alloc_frag->page);
1286
	alloc_frag->offset += len + room;
1287
	hole = alloc_frag->size - alloc_frag->offset;
1288
	if (hole < len + room) {
1289 1290
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
1291
		 * the current buffer.
1292
		 */
1293 1294 1295
		len += hole;
		alloc_frag->offset += hole;
	}
1296

1297
	sg_init_one(rq->sg, buf, len);
1298
	ctx = mergeable_len_to_ctx(len, headroom);
1299
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
1300
	if (err < 0)
1301
		put_page(virt_to_head_page(buf));
1302

1303 1304
	return err;
}
1305

1306 1307 1308 1309 1310 1311 1312
/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
1315 1316
{
	int err;
1317
	bool oom;
1318

1319 1320
	do {
		if (vi->mergeable_rx_bufs)
1321
			err = add_recvbuf_mergeable(vi, rq, gfp);
1322
		else if (vi->big_packets)
1323
			err = add_recvbuf_big(vi, rq, gfp);
1324
		else
			err = add_recvbuf_small(vi, rq, gfp);
1326

1327
		oom = err == -ENOMEM;
1328
		if (err)
1329
			break;
1330
	} while (rq->vq->num_free);
	if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) {
1332 1333 1334
		unsigned long flags;

		flags = u64_stats_update_begin_irqsave(&rq->stats.syncp);
1335
		rq->stats.kicks++;
1336
		u64_stats_update_end_irqrestore(&rq->stats.syncp, flags);
	}

1339
	return !oom;
1340 1341
}

1342
static void skb_recv_done(struct virtqueue *rvq)
{
	struct virtnet_info *vi = rvq->vdev->priv;
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
1346

1347
	virtqueue_napi_schedule(&rq->napi, rvq);
}

1350
static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
1351
{
1352
	napi_enable(napi);
1353 1354

	/* If all buffers were filled by the other side before we enabled napi, we
1355 1356 1357 1358 1359 1360
	 * won't get another interrupt, so process any outstanding packets now.
	 * Call local_bh_enable after to trigger softIRQ processing.
	 */
	local_bh_disable();
	virtqueue_napi_schedule(napi, vq);
	local_bh_enable();
1361 1362
}

static void virtnet_napi_tx_enable(struct virtnet_info *vi,
				   struct virtqueue *vq,
				   struct napi_struct *napi)
{
	if (!napi->weight)
		return;

	/* Tx napi touches cachelines on the cpu handling tx interrupts. Only
	 * enable the feature if this is likely affine with the transmit path.
	 */
	if (!vi->affinity_hint_set) {
		napi->weight = 0;
		return;
	}

	return virtnet_napi_enable(vq, napi);
}

1381 1382 1383 1384 1385 1386
static void virtnet_napi_tx_disable(struct napi_struct *napi)
{
	if (napi->weight)
		napi_disable(napi);
}

1387 1388
static void refill_work(struct work_struct *work)
{
1389 1390
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
1391
	bool still_empty;
	int i;

1394
	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct receive_queue *rq = &vi->rq[i];
1396

		napi_disable(&rq->napi);
		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
1399
		virtnet_napi_enable(rq->vq, &rq->napi);
1400

		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
1407 1408
}

1409 1410
static int virtnet_receive(struct receive_queue *rq, int budget,
			   unsigned int *xdp_xmit)
{
1412
	struct virtnet_info *vi = rq->vq->vdev->priv;
1413
	struct virtnet_rq_stats stats = {};
1414
	unsigned int len;
1415
	void *buf;
1416
	int i;

1418
	if (!vi->big_packets || vi->mergeable_rx_bufs) {
1419 1420
		void *ctx;

1421
		while (stats.packets < budget &&
1422
		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
1423
			receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
1424
			stats.packets++;
1425 1426
		}
	} else {
1427
		while (stats.packets < budget &&
1428
		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
1429
			receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
1430
			stats.packets++;
1431
		}
	}

1434
	if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
1436
			schedule_delayed_work(&vi->refill, 0);
1437
	}

	u64_stats_update_begin(&rq->stats.syncp);
1440 1441 1442 1443
	for (i = 0; i < VIRTNET_RQ_STATS_LEN; i++) {
		size_t offset = virtnet_rq_stats_desc[i].offset;
		u64 *item;

1444 1445
		item = (u64 *)((u8 *)&rq->stats + offset);
		*item += *(u64 *)((u8 *)&stats + offset);
1446
	}
	u64_stats_update_end(&rq->stats.syncp);

1449
	return stats.packets;
1450 1451
}

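/* Reclaim completed TX buffers (skbs or xdp_frames) and fold the byte and
 * packet counts into the send queue stats.
 */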
static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
1453 1454 1455 1456
{
	unsigned int len;
	unsigned int packets = 0;
	unsigned int bytes = 0;
1457
	void *ptr;
1458

1459 1460 1461
	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		if (likely(!is_xdp_frame(ptr))) {
			struct sk_buff *skb = ptr;
1462

1463
			pr_debug("Sent skb %p\n", skb);
1464

1465 1466 1467 1468
			bytes += skb->len;
			napi_consume_skb(skb, in_napi);
		} else {
			struct xdp_frame *frame = ptr_to_xdp(ptr);
1469

1470 1471 1472 1473
			bytes += frame->len;
			xdp_return_frame(frame);
		}
		packets++;
1474 1475 1476 1477 1478 1479 1480 1481
	}

	/* Avoid overhead when no packets have been processed
	 * happens when called speculatively from start_xmit.
	 */
	if (!packets)
		return;

	u64_stats_update_begin(&sq->stats.syncp);
	sq->stats.bytes += bytes;
	sq->stats.packets += packets;
	u64_stats_update_end(&sq->stats.syncp);
1486 1487
}

1488 1489 1490 1491 1492 1493 1494 1495 1496 1497
static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
{
	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
		return false;
	else if (q < vi->curr_queue_pairs)
		return true;
	else
		return false;
}

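/* Opportunistically clean the paired TX queue from the RX NAPI poll so that
 * TX completions do not depend solely on TX interrupts.
 */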
static void virtnet_poll_cleantx(struct receive_queue *rq)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	unsigned int index = vq2rxq(rq->vq);
	struct send_queue *sq = &vi->sq[index];
	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);

1505
	if (!sq->napi.weight || is_xdp_raw_buffer_queue(vi, index))
1506 1507 1508
		return;

	if (__netif_tx_trylock(txq)) {
1509 1510 1511 1512
		do {
			virtqueue_disable_cb(sq->vq);
			free_old_xmit_skbs(sq, true);
		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
1513 1514 1515 1516

		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
			netif_tx_wake_queue(txq);

1517 1518 1519 1520
		__netif_tx_unlock(txq);
	}
}

1521 1522 1523 1524
static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
1525 1526
	struct virtnet_info *vi = rq->vq->vdev->priv;
	struct send_queue *sq;
1527
	unsigned int received;
1528
	unsigned int xdp_xmit = 0;
1529

1530 1531
	virtnet_poll_cleantx(rq);

	received = virtnet_receive(rq, budget, &xdp_xmit);
1533

1534
	/* Out of packets? */
1535 1536
	if (received < budget)
		virtqueue_napi_complete(napi, rq->vq, received);

1538
	if (xdp_xmit & VIRTIO_XDP_REDIR)
1539
		xdp_do_flush();
1540 1541

	if (xdp_xmit & VIRTIO_XDP_TX) {
1542
		sq = virtnet_xdp_get_sq(vi);
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
			u64_stats_update_begin(&sq->stats.syncp);
			sq->stats.kicks++;
			u64_stats_update_end(&sq->stats.syncp);
		}
1548
		virtnet_xdp_put_sq(vi, sq);
1549
	}

	return received;
}

static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
1557
	int i, err;

1559 1560 1561
	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1563
				schedule_delayed_work(&vi->refill, 0);
1564

1565
		err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, vi->rq[i].napi.napi_id);
1566 1567 1568
		if (err < 0)
			return err;

1569 1570 1571 1572 1573 1574 1575
		err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
						 MEM_TYPE_PAGE_SHARED, NULL);
		if (err < 0) {
			xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
			return err;
		}

1576
		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
	}

	return 0;
}

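/* TX NAPI poll: reclaim completed buffers under the txq lock, wake the
 * queue when enough descriptors are free again, and re-arm TX interrupts,
 * rescheduling if new completions race in.
 */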
static int virtnet_poll_tx(struct napi_struct *napi, int budget)
{
	struct send_queue *sq = container_of(napi, struct send_queue, napi);
	struct virtnet_info *vi = sq->vq->vdev->priv;
1587 1588
	unsigned int index = vq2txq(sq->vq);
	struct netdev_queue *txq;
1589 1590
	int opaque;
	bool done;

1592 1593 1594 1595 1596 1597 1598
	if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
		/* We don't need to enable cb for XDP */
		napi_complete_done(napi, 0);
		return 0;
	}

	txq = netdev_get_tx_queue(vi->dev, index);
	__netif_tx_lock(txq, raw_smp_processor_id());
1600
	virtqueue_disable_cb(sq->vq);
1601
	free_old_xmit_skbs(sq, true);
1602

1603 1604 1605
	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);

1606 1607 1608 1609 1610 1611 1612
	opaque = virtqueue_enable_cb_prepare(sq->vq);

	done = napi_complete_done(napi, 0);

	if (!done)
		virtqueue_disable_cb(sq->vq);

	__netif_tx_unlock(txq);

1615 1616 1617 1618 1619 1620 1621 1622 1623 1624
	if (done) {
		if (unlikely(virtqueue_poll(sq->vq, opaque))) {
			if (napi_schedule_prep(napi)) {
				__netif_tx_lock(txq, raw_smp_processor_id());
				virtqueue_disable_cb(sq->vq);
				__netif_tx_unlock(txq);
				__napi_schedule(napi);
			}
		}
	}

	return 0;
}

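/* Build the scatterlist (virtio-net header plus linear and paged data) for
 * one skb and add it to the send virtqueue.
 */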
static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
{
1631
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
1633
	struct virtnet_info *vi = sq->vq->vdev->priv;
1634
	int num_sg;
1635
	unsigned hdr_len = vi->hdr_len;
1636
	bool can_push;

	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
1639 1640 1641 1642 1643 1644 1645

	can_push = vi->any_header_sg &&
		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
1646
		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
1647 1648
	else
		hdr = skb_vnet_hdr(skb);

1650
	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
1651 1652
				    virtio_is_little_endian(vi->vdev), false,
				    0))
1653
		return -EPROTO;

1655
	if (vi->mergeable_rx_bufs)
1656
		hdr->num_buffers = 0;
1657

1658
	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
1659 1660 1661
	if (can_push) {
		__skb_push(skb, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
1662 1663
		if (unlikely(num_sg < 0))
			return num_sg;
1664 1665 1666 1667
		/* Pull header back to avoid skew in tx bytes calculations. */
		__skb_pull(skb, hdr_len);
	} else {
		sg_set_buf(sq->sg, hdr, hdr_len);
1668 1669 1670 1671
		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
		if (unlikely(num_sg < 0))
			return num_sg;
		num_sg++;
1672
	}
1673
	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
1674 1675
}

1676
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
1677 1678
{
	struct virtnet_info *vi = netdev_priv(dev);
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
1681
	int err;
1682
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
1683
	bool kick = !netdev_xmit_more();
	bool use_napi = sq->napi.weight;
1685 1686

	/* Free up any pending old buffers before queueing new ones. */
1687 1688 1689 1690 1691
	do {
		if (use_napi)
			virtqueue_disable_cb(sq->vq);

		free_old_xmit_skbs(sq, false);
1692

1693 1694
	} while (use_napi && kick &&
	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
1695

1696 1697 1698
	/* timestamp packet in software */
	skb_tx_timestamp(skb);

1699
	/* Try to transmit */
1700
	err = xmit_skb(sq, skb);
1701

1702
	/* This should not happen! */
1703
	if (unlikely(err)) {
1704 1705 1706
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
1707 1708
				 "Unexpected TXQ (%d) queue failure: %d\n",
				 qnum, err);
1709
		dev->stats.tx_dropped++;
1710
		dev_kfree_skb_any(skb);
1711
		return NETDEV_TX_OK;
	}
1713

1714
	/* Don't wait up for transmitted skbs to be freed. */
	if (!use_napi) {
		skb_orphan(skb);
1717
		nf_reset_ct(skb);
	}
1719

1720 1721 1722 1723 1724 1725 1726 1727 1728
	/* If running out of space, stop queue to avoid getting packets that we
	 * are then unable to transmit.
	 * An alternative would be to force queuing layer to requeue the skb by
	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
	 * returned in a normal path of operation: it means that driver is not
	 * maintaining the TX queue stop/start state properly, and causes
	 * the stack to do a non-trivial amount of useless work.
	 * Since most packets only take 1 or 2 ring slots, stopping the queue
	 * early means 16 slots are typically wasted.
1729
	 */
1730
	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
		netif_stop_subqueue(dev, qnum);
		if (!use_napi &&
		    unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
1734
			/* More just got used, free them then recheck. */
1735
			free_old_xmit_skbs(sq, false);
1736
			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
				netif_start_subqueue(dev, qnum);
1738
				virtqueue_disable_cb(sq->vq);
1739 1740
			}
		}
1741
	}
1742

	if (kick || netif_xmit_stopped(txq)) {
		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
			u64_stats_update_begin(&sq->stats.syncp);
			sq->stats.kicks++;
			u64_stats_update_end(&sq->stats.syncp);
		}
	}
R
Rusty Russell 已提交
1750

1751
	return NETDEV_TX_OK;
1752 1753
}

/*
 * Send command via the control virtqueue and check status.  Commands
 * supported by the hypervisor, as indicated by feature bits, should
 * never fail unless improperly formatted.
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
				 struct scatterlist *out)
{
	struct scatterlist *sgs[4], hdr, stat;
	unsigned out_num = 0, tmp;
	int ret;

	/* Caller should know better */
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));

	vi->ctrl->status = ~0;
	vi->ctrl->hdr.class = class;
	vi->ctrl->hdr.cmd = cmd;
	/* Add header */
	sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr));
	sgs[out_num++] = &hdr;

	if (out)
		sgs[out_num++] = out;

	/* Add return status. */
	sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status));
	sgs[out_num] = &stat;

	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
	ret = virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
	if (ret < 0) {
		dev_warn(&vi->vdev->dev,
			 "Failed to add sgs for command vq: %d\n.", ret);
		return false;
	}

	if (unlikely(!virtqueue_kick(vi->cvq)))
		return vi->ctrl->status == VIRTIO_NET_OK;

	/* Spin for a response, the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
		cpu_relax();

	return vi->ctrl->status == VIRTIO_NET_OK;
}

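/* Update the MAC address, preferring the CTRL_MAC_ADDR control-queue command
 * and falling back to direct config space writes on legacy (pre-VERSION_1)
 * devices.
 */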
static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
	int ret;
	struct sockaddr *addr;
	struct scatterlist sg;

	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
		return -EOPNOTSUPP;

	addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
	if (!addr)
		return -ENOMEM;

	ret = eth_prepare_mac_addr_change(dev, addr);
	if (ret)
		goto out;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
		sg_init_one(&sg, addr->sa_data, dev->addr_len);
		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
			dev_warn(&vdev->dev,
				 "Failed to set mac address by vq command.\n");
			ret = -EINVAL;
			goto out;
		}
	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
		unsigned int i;

		/* Naturally, this has an atomicity problem. */
		for (i = 0; i < dev->addr_len; i++)
			virtio_cwrite8(vdev,
				       offsetof(struct virtio_net_config, mac) +
				       i, addr->sa_data[i]);
	}

	eth_commit_mac_addr_change(dev, p);
	ret = 0;

out:
	kfree(addr);
	return ret;
}

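/* Fold the per-queue 64-bit counters into a single rtnl_link_stats64
 * snapshot for the stack.
 */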
static void virtnet_stats(struct net_device *dev,
			  struct rtnl_link_stats64 *tot)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int start;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		u64 tpackets, tbytes, rpackets, rbytes, rdrops;
		struct receive_queue *rq = &vi->rq[i];
		struct send_queue *sq = &vi->sq[i];

		do {
			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
			tpackets = sq->stats.packets;
			tbytes   = sq->stats.bytes;
		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));

		do {
			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
			rpackets = rq->stats.packets;
			rbytes   = rq->stats.bytes;
			rdrops   = rq->stats.drops;
		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes   += rbytes;
		tot->tx_bytes   += tbytes;
		tot->rx_dropped += rdrops;
	}

	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
}

static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}

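/* Tell the device how many RX/TX queue pairs to enable via the
 * VIRTIO_NET_CTRL_MQ control command.
 */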
static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	struct scatterlist sg;
	struct net_device *dev = vi->dev;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

	vi->ctrl->mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
	sg_init_one(&sg, &vi->ctrl->mq, sizeof(vi->ctrl->mq));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
	} else {
		vi->curr_queue_pairs = queue_pairs;
		/* virtnet_open() will refill when device is going to up. */
		if (dev->flags & IFF_UP)
			schedule_delayed_work(&vi->refill, 0);
	}

	return 0;
}

static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	int err;

	rtnl_lock();
	err = _virtnet_set_queues(vi, queue_pairs);
	rtnl_unlock();
	return err;
}

static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);

	for (i = 0; i < vi->max_queue_pairs; i++) {
		xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
		napi_disable(&vi->rq[i].napi);
		virtnet_napi_tx_disable(&vi->sq[i].napi);
	}

	return 0;
}

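/* Push the promiscuous/allmulti flags and the unicast/multicast MAC filter
 * table to the device over the control virtqueue.
 */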
static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg[2];
	struct virtio_net_ctrl_mac *mac_data;
	struct netdev_hw_addr *ha;
	int uc_count;
	int mc_count;
	void *buf;
	int i;

	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

	vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
	vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);

	sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
			 vi->ctrl->promisc ? "en" : "dis");

	sg_init_one(sg, &vi->ctrl->allmulti, sizeof(vi->ctrl->allmulti));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
			 vi->ctrl->allmulti ? "en" : "dis");

	uc_count = netdev_uc_count(dev);
	mc_count = netdev_mc_count(dev);
	/* MAC filter - use one buffer for both lists */
	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
	mac_data = buf;
	if (!buf)
		return;

	sg_init_table(sg, 2);

	/* Store the unicast list and count in the front of the buffer */
	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
	i = 0;
	netdev_for_each_uc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[0], mac_data,
		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));

	/* multicast list and count fill the end */
	mac_data = (void *)&mac_data->macs[uc_count][0];

	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
	i = 0;
	netdev_for_each_mc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[1], mac_data,
		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");

	kfree(buf);
}

static int virtnet_vlan_rx_add_vid(struct net_device *dev,
				   __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
	return 0;
}

static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
				    __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
	return 0;
}

static void virtnet_clean_affinity(struct virtnet_info *vi)
{
	int i;

	if (vi->affinity_hint_set) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtqueue_set_affinity(vi->rq[i].vq, NULL);
			virtqueue_set_affinity(vi->sq[i].vq, NULL);
		}

		vi->affinity_hint_set = false;
	}
}

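/* Spread the virtqueue interrupt affinity hints across the online CPUs and
 * program matching XPS maps for the transmit queues.
 */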
static void virtnet_set_affinity(struct virtnet_info *vi)
{
	cpumask_var_t mask;
	int stragglers;
	int group_size;
	int i, j, cpu;
	int num_cpu;
	int stride;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
		virtnet_clean_affinity(vi);
		return;
	}

	num_cpu = num_online_cpus();
	stride = max_t(int, num_cpu / vi->curr_queue_pairs, 1);
	stragglers = num_cpu >= vi->curr_queue_pairs ?
			num_cpu % vi->curr_queue_pairs :
			0;
	cpu = cpumask_next(-1, cpu_online_mask);

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		group_size = stride + (i < stragglers ? 1 : 0);

		for (j = 0; j < group_size; j++) {
			cpumask_set_cpu(cpu, mask);
			cpu = cpumask_next_wrap(cpu, cpu_online_mask,
						nr_cpu_ids, false);
		}
		virtqueue_set_affinity(vi->rq[i].vq, mask);
		virtqueue_set_affinity(vi->sq[i].vq, mask);
		__netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS);
		cpumask_clear(mask);
	}

	vi->affinity_hint_set = true;
	free_cpumask_var(mask);
}

static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
2105
{
2106 2107 2108 2109 2110
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);
	virtnet_set_affinity(vi);
	return 0;
}
2111

2112 2113 2114 2115 2116 2117 2118
static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node_dead);
	virtnet_set_affinity(vi);
	return 0;
}
2119

2120 2121 2122 2123 2124
static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);

2125
	virtnet_clean_affinity(vi);
2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150
	return 0;
}

static enum cpuhp_state virtionet_online;

static int virtnet_cpu_notif_add(struct virtnet_info *vi)
{
	int ret;

	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
	if (ret)
		return ret;
	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					       &vi->node_dead);
	if (!ret)
		return ret;
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	return ret;
}

static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
{
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					    &vi->node_dead);
J
Jason Wang 已提交
2151 2152
}

R
static void virtnet_get_ringparam(struct net_device *dev,
				struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}

static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));
}

/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
		return -EINVAL;

	/* For now we don't support modifying channels while XDP is loaded
	 * also when XDP is loaded all RX queues have XDP programs so we only
	 * need to check a single RX queue.
	 */
	if (vi->rq[0].xdp_prog)
		return -EINVAL;

	cpus_read_lock();
	err = _virtnet_set_queues(vi, queue_pairs);
	if (err) {
		cpus_read_unlock();
		goto err;
	}
	virtnet_set_affinity(vi);
	cpus_read_unlock();

	netif_set_real_num_tx_queues(dev, queue_pairs);
	netif_set_real_num_rx_queues(dev, queue_pairs);
 err:
	return err;
}

static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int i, j;
	u8 *p = data;

	switch (stringset) {
	case ETH_SS_STATS:
		for (i = 0; i < vi->curr_queue_pairs; i++) {
			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++)
				ethtool_sprintf(&p, "rx_queue_%u_%s", i,
						virtnet_rq_stats_desc[j].desc);
		}

		for (i = 0; i < vi->curr_queue_pairs; i++) {
			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++)
				ethtool_sprintf(&p, "tx_queue_%u_%s", i,
						virtnet_sq_stats_desc[j].desc);
		}
		break;
	}
}

static int virtnet_get_sset_count(struct net_device *dev, int sset)
{
	struct virtnet_info *vi = netdev_priv(dev);

	switch (sset) {
	case ETH_SS_STATS:
		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
					       VIRTNET_SQ_STATS_LEN);
	default:
		return -EOPNOTSUPP;
	}
}

static void virtnet_get_ethtool_stats(struct net_device *dev,
				      struct ethtool_stats *stats, u64 *data)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int idx = 0, start, i, j;
	const u8 *stats_base;
	size_t offset;

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct receive_queue *rq = &vi->rq[i];

		stats_base = (u8 *)&rq->stats;
		do {
			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
				offset = virtnet_rq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
		idx += VIRTNET_RQ_STATS_LEN;
	}

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct send_queue *sq = &vi->sq[i];

		stats_base = (u8 *)&sq->stats;
		do {
			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) {
				offset = virtnet_sq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
		idx += VIRTNET_SQ_STATS_LEN;
	}
}

static void virtnet_get_channels(struct net_device *dev,
				 struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);

	channels->combined_count = vi->curr_queue_pairs;
	channels->max_combined = vi->max_queue_pairs;
	channels->max_other = 0;
	channels->rx_count = 0;
	channels->tx_count = 0;
	channels->other_count = 0;
}

static int virtnet_set_link_ksettings(struct net_device *dev,
				      const struct ethtool_link_ksettings *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);

	return ethtool_virtdev_set_link_ksettings(dev, cmd,
						  &vi->speed, &vi->duplex);
}

static int virtnet_get_link_ksettings(struct net_device *dev,
				      struct ethtool_link_ksettings *cmd)
2313 2314 2315
{
	struct virtnet_info *vi = netdev_priv(dev);

2316 2317 2318
	cmd->base.speed = vi->speed;
	cmd->base.duplex = vi->duplex;
	cmd->base.port = PORT_OTHER;
2319 2320 2321 2322

	return 0;
}

2323
static int virtnet_set_coalesce(struct net_device *dev,
2324 2325 2326
				struct ethtool_coalesce *ec,
				struct kernel_ethtool_coalesce *kernel_coal,
				struct netlink_ext_ack *extack)
2327 2328 2329 2330
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i, napi_weight;

2331 2332
	if (ec->tx_max_coalesced_frames > 1 ||
	    ec->rx_max_coalesced_frames != 1)
2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346
		return -EINVAL;

	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
	if (napi_weight ^ vi->sq[0].napi.weight) {
		if (dev->flags & IFF_UP)
			return -EBUSY;
		for (i = 0; i < vi->max_queue_pairs; i++)
			vi->sq[i].napi.weight = napi_weight;
	}

	return 0;
}

static int virtnet_get_coalesce(struct net_device *dev,
2347 2348 2349
				struct ethtool_coalesce *ec,
				struct kernel_ethtool_coalesce *kernel_coal,
				struct netlink_ext_ack *extack)
2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364
{
	struct ethtool_coalesce ec_default = {
		.cmd = ETHTOOL_GCOALESCE,
		.rx_max_coalesced_frames = 1,
	};
	struct virtnet_info *vi = netdev_priv(dev);

	memcpy(ec, &ec_default, sizeof(ec_default));

	if (vi->sq[0].napi.weight)
		ec->tx_max_coalesced_frames = 1;

	return 0;
}

static void virtnet_init_settings(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	vi->speed = SPEED_UNKNOWN;
	vi->duplex = DUPLEX_UNKNOWN;
}

static void virtnet_update_settings(struct virtnet_info *vi)
{
	u32 speed;
	u8 duplex;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
		return;

	virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);

	if (ethtool_validate_speed(speed))
		vi->speed = speed;

	virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex);

	if (ethtool_validate_duplex(duplex))
		vi->duplex = duplex;
}

static const struct ethtool_ops virtnet_ethtool_ops = {
	.supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES,
	.get_drvinfo = virtnet_get_drvinfo,
	.get_link = ethtool_op_get_link,
	.get_ringparam = virtnet_get_ringparam,
	.get_strings = virtnet_get_strings,
	.get_sset_count = virtnet_get_sset_count,
	.get_ethtool_stats = virtnet_get_ethtool_stats,
	.set_channels = virtnet_set_channels,
	.get_channels = virtnet_get_channels,
	.get_ts_info = ethtool_op_get_ts_info,
	.get_link_ksettings = virtnet_get_link_ksettings,
	.set_link_ksettings = virtnet_set_link_ksettings,
	.set_coalesce = virtnet_set_coalesce,
	.get_coalesce = virtnet_get_coalesce,
};

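/* Quiesce the device for freeze or reset: flush the config work, detach the
 * netdev, cancel refill work and disable every RX/TX NAPI instance.
 */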
static void virtnet_freeze_down(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int i;

	/* Make sure no work handler is accessing the device */
	flush_work(&vi->config_work);

	netif_tx_lock_bh(vi->dev);
	netif_device_detach(vi->dev);
	netif_tx_unlock_bh(vi->dev);
	cancel_delayed_work_sync(&vi->refill);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			napi_disable(&vi->rq[i].napi);
			virtnet_napi_tx_disable(&vi->sq[i].napi);
		}
	}
}

static int init_vqs(struct virtnet_info *vi);

static int virtnet_restore_up(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err, i;

	err = init_vqs(vi);
	if (err)
		return err;

	virtio_device_ready(vdev);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->curr_queue_pairs; i++)
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
	}

	netif_tx_lock_bh(vi->dev);
	netif_device_attach(vi->dev);
	netif_tx_unlock_bh(vi->dev);
	return err;
}

static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
{
	struct scatterlist sg;
2464
	vi->ctrl->offloads = cpu_to_virtio64(vi->vdev, offloads);
2465

2466
	sg_init_one(&sg, &vi->ctrl->offloads, sizeof(vi->ctrl->offloads));
2467 2468 2469

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
				  VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
2470
		dev_warn(&vi->dev->dev, "Fail to set guest offload.\n");
2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496
		return -EINVAL;
	}

	return 0;
}

static int virtnet_clear_guest_offloads(struct virtnet_info *vi)
{
	u64 offloads = 0;

	if (!vi->guest_offloads)
		return 0;

	return virtnet_set_guest_offloads(vi, offloads);
}

static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
{
	u64 offloads = vi->guest_offloads;

	if (!vi->guest_offloads)
		return 0;

	return virtnet_set_guest_offloads(vi, offloads);
}

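/* Attach or detach an XDP program: validate device offloads and MTU, try to
 * reserve dedicated TX queues for XDP_TX, and swap the program on every
 * receive queue.
 */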
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			   struct netlink_ext_ack *extack)
{
	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
	struct virtnet_info *vi = netdev_priv(dev);
	struct bpf_prog *old_prog;
	u16 xdp_qp = 0, curr_qp;
	int i, err;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
	    && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	        virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	        virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))) {
		NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing GRO_HW/CSUM, disable GRO_HW/CSUM first");
		return -EOPNOTSUPP;
	}

	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
		NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
		return -EINVAL;
	}

	if (dev->mtu > max_sz) {
		NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
		return -EINVAL;
	}

	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
	if (prog)
		xdp_qp = nr_cpu_ids;

	/* XDP requires extra queues for XDP_TX */
	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
		netdev_warn_once(dev, "XDP request %i queues but max is %i. XDP_TX and XDP_REDIRECT will operate in a slower locked tx mode.\n",
				 curr_qp + xdp_qp, vi->max_queue_pairs);
		xdp_qp = 0;
	}

	old_prog = rtnl_dereference(vi->rq[0].xdp_prog);
	if (!prog && !old_prog)
		return 0;

	if (prog)
		bpf_prog_add(prog, vi->max_queue_pairs - 1);

	/* Make sure NAPI is not using any XDP TX queues for RX. */
	if (netif_running(dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			napi_disable(&vi->rq[i].napi);
			virtnet_napi_tx_disable(&vi->sq[i].napi);
		}
	}

	if (!prog) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
			if (i == 0)
				virtnet_restore_guest_offloads(vi);
		}
		synchronize_net();
	}

	err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
	if (err)
		goto err;
	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
	vi->xdp_queue_pairs = xdp_qp;

	if (prog) {
		vi->xdp_enabled = true;
		for (i = 0; i < vi->max_queue_pairs; i++) {
			rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
			if (i == 0 && !old_prog)
				virtnet_clear_guest_offloads(vi);
		}
	} else {
		vi->xdp_enabled = false;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (old_prog)
			bpf_prog_put(old_prog);
		if (netif_running(dev)) {
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
	}

	return 0;

err:
	if (!prog) {
		virtnet_clear_guest_offloads(vi);
		for (i = 0; i < vi->max_queue_pairs; i++)
			rcu_assign_pointer(vi->rq[i].xdp_prog, old_prog);
	}

	if (netif_running(dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
	}
	if (prog)
		bpf_prog_sub(prog, vi->max_queue_pairs - 1);
	return err;
}

static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
	default:
		return -EINVAL;
	}
}

static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
				      size_t len)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int ret;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
		return -EOPNOTSUPP;

	ret = snprintf(buf, len, "sby");
	if (ret >= len)
		return -EOPNOTSUPP;

	return 0;
}

static int virtnet_set_features(struct net_device *dev,
				netdev_features_t features)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u64 offloads;
	int err;

	if ((dev->features ^ features) & NETIF_F_GRO_HW) {
		if (vi->xdp_enabled)
			return -EBUSY;

		if (features & NETIF_F_GRO_HW)
			offloads = vi->guest_offloads_capable;
		else
			offloads = vi->guest_offloads_capable &
				   ~GUEST_OFFLOAD_GRO_HW_MASK;

		err = virtnet_set_guest_offloads(vi, offloads);
		if (err)
			return err;
		vi->guest_offloads = offloads;
	}

	return 0;
}

static const struct net_device_ops virtnet_netdev = {
	.ndo_open            = virtnet_open,
	.ndo_stop   	     = virtnet_close,
	.ndo_start_xmit      = start_xmit,
	.ndo_validate_addr   = eth_validate_addr,
	.ndo_set_mac_address = virtnet_set_mac_address,
	.ndo_set_rx_mode     = virtnet_set_rx_mode,
	.ndo_get_stats64     = virtnet_stats,
	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
	.ndo_bpf		= virtnet_xdp,
	.ndo_xdp_xmit		= virtnet_xdp_xmit,
	.ndo_features_check	= passthru_features_check,
	.ndo_get_phys_port_name	= virtnet_get_phys_port_name,
	.ndo_set_features	= virtnet_set_features,
};

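/* Handle a configuration-change interrupt: re-read the status word,
 * acknowledge link announcements and propagate carrier state to the stack.
 */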
static void virtnet_config_changed_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, config_work);
	u16 v;

	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
				 struct virtio_net_config, status, &v) < 0)
		return;

	if (v & VIRTIO_NET_S_ANNOUNCE) {
		netdev_notify_peers(vi->dev);
		virtnet_ack_link_announce(vi);
	}

	/* Ignore unknown (future) status bits */
	v &= VIRTIO_NET_S_LINK_UP;

	if (vi->status == v)
		return;

	vi->status = v;

	if (vi->status & VIRTIO_NET_S_LINK_UP) {
		virtnet_update_settings(vi);
		netif_carrier_on(vi->dev);
		netif_tx_wake_all_queues(vi->dev);
	} else {
		netif_carrier_off(vi->dev);
		netif_tx_stop_all_queues(vi->dev);
	}
}

static void virtnet_config_changed(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	schedule_work(&vi->config_work);
}

static void virtnet_free_queues(struct virtnet_info *vi)
{
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		__netif_napi_del(&vi->rq[i].napi);
		__netif_napi_del(&vi->sq[i].napi);
	}

	/* We called __netif_napi_del(),
	 * we need to respect an RCU grace period before freeing vi->rq
	 */
	synchronize_net();

	kfree(vi->rq);
	kfree(vi->sq);
	kfree(vi->ctrl);
}

static void _free_receive_bufs(struct virtnet_info *vi)
{
	struct bpf_prog *old_prog;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		while (vi->rq[i].pages)
			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);

		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
		if (old_prog)
			bpf_prog_put(old_prog);
	}
}

static void free_receive_bufs(struct virtnet_info *vi)
{
	rtnl_lock();
	_free_receive_bufs(vi);
	rtnl_unlock();
}

static void free_receive_page_frags(struct virtnet_info *vi)
{
	int i;
	for (i = 0; i < vi->max_queue_pairs; i++)
		if (vi->rq[i].alloc_frag.page)
			put_page(vi->rq[i].alloc_frag.page);
}

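/* Return every buffer still owned by the device to its allocator before the
 * virtqueues are deleted.
 */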
static void free_unused_bufs(struct virtnet_info *vi)
{
	void *buf;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->sq[i].vq;
		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (!is_xdp_frame(buf))
				dev_kfree_skb(buf);
			else
				xdp_return_frame(ptr_to_xdp(buf));
		}
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->rq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (vi->mergeable_rx_bufs) {
				put_page(virt_to_head_page(buf));
			} else if (vi->big_packets) {
				give_pages(&vi->rq[i], buf);
			} else {
				put_page(virt_to_head_page(buf));
			}
		}
	}
}

static void virtnet_del_vqs(struct virtnet_info *vi)
{
	struct virtio_device *vdev = vi->vdev;

	virtnet_clean_affinity(vi);

	vdev->config->del_vqs(vdev);

	virtnet_free_queues(vi);
}

/* How large should a single buffer be so a queue full of these can fit at
 * least one full packet?
 * Logic below assumes the mergeable buffer header is used.
 */
static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
{
	const unsigned int hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int rq_size = virtqueue_get_vring_size(vq);
	unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu;
	unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len;
	unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size);

	return max(max(min_buf_len, hdr_len) - hdr_len,
		   (unsigned int)GOOD_PACKET_LEN);
}

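/* Build the find_vqs() parameter arrays and create the RX/TX virtqueues plus
 * the optional control virtqueue.
 */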
static int virtnet_find_vqs(struct virtnet_info *vi)
{
	vq_callback_t **callbacks;
	struct virtqueue **vqs;
	int ret = -ENOMEM;
	int i, total_vqs;
	const char **names;
	bool *ctx;

	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
	 * possible control vq.
	 */
	total_vqs = vi->max_queue_pairs * 2 +
		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);

	/* Allocate space for find_vqs parameters */
	vqs = kcalloc(total_vqs, sizeof(*vqs), GFP_KERNEL);
	if (!vqs)
		goto err_vq;
	callbacks = kmalloc_array(total_vqs, sizeof(*callbacks), GFP_KERNEL);
	if (!callbacks)
		goto err_callback;
	names = kmalloc_array(total_vqs, sizeof(*names), GFP_KERNEL);
	if (!names)
		goto err_names;
	if (!vi->big_packets || vi->mergeable_rx_bufs) {
		ctx = kcalloc(total_vqs, sizeof(*ctx), GFP_KERNEL);
		if (!ctx)
			goto err_ctx;
	} else {
		ctx = NULL;
	}

	/* Parameters for control virtqueue, if any */
	if (vi->has_cvq) {
		callbacks[total_vqs - 1] = NULL;
		names[total_vqs - 1] = "control";
	}

	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
		if (ctx)
			ctx[rxq2vq(i)] = true;
	}

	ret = virtio_find_vqs_ctx(vi->vdev, total_vqs, vqs, callbacks,
				  names, ctx, NULL);
	if (ret)
		goto err_find;

	if (vi->has_cvq) {
		vi->cvq = vqs[total_vqs - 1];
		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].vq = vqs[rxq2vq(i)];
		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
		vi->sq[i].vq = vqs[txq2vq(i)];
	}

	/* run here: ret == 0. */


err_find:
	kfree(ctx);
err_ctx:
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;
}

static int virtnet_alloc_queues(struct virtnet_info *vi)
{
	int i;

	if (vi->has_cvq) {
		vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL);
		if (!vi->ctrl)
			goto err_ctrl;
	} else {
		vi->ctrl = NULL;
	}
	vi->sq = kcalloc(vi->max_queue_pairs, sizeof(*vi->sq), GFP_KERNEL);
	if (!vi->sq)
		goto err_sq;
	vi->rq = kcalloc(vi->max_queue_pairs, sizeof(*vi->rq), GFP_KERNEL);
	if (!vi->rq)
		goto err_rq;

	INIT_DELAYED_WORK(&vi->refill, refill_work);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].pages = NULL;
		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
			       napi_weight);
		netif_tx_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx,
				  napi_tx ? napi_weight : 0);

		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));

		u64_stats_init(&vi->rq[i].stats.syncp);
		u64_stats_init(&vi->sq[i].stats.syncp);
	}

	return 0;

err_rq:
	kfree(vi->sq);
err_sq:
	kfree(vi->ctrl);
err_ctrl:
	return -ENOMEM;
}

static int init_vqs(struct virtnet_info *vi)
{
	int ret;

	/* Allocate send & receive queues */
	ret = virtnet_alloc_queues(vi);
	if (ret)
		goto err;

	ret = virtnet_find_vqs(vi);
	if (ret)
		goto err_free;

	cpus_read_lock();
	virtnet_set_affinity(vi);
	cpus_read_unlock();

	return 0;

err_free:
	virtnet_free_queues(vi);
err:
	return ret;
}

#ifdef CONFIG_SYSFS
static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
		char *buf)
{
	struct virtnet_info *vi = netdev_priv(queue->dev);
	unsigned int queue_index = get_netdev_rx_queue_index(queue);
	unsigned int headroom = virtnet_get_headroom(vi);
	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
	struct ewma_pkt_len *avg;

	BUG_ON(queue_index >= vi->max_queue_pairs);
	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
	return sprintf(buf, "%u\n",
		       get_mergeable_buf_len(&vi->rq[queue_index], avg,
				       SKB_DATA_ALIGN(headroom + tailroom)));
}

static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
	__ATTR_RO(mergeable_rx_buffer_size);

static struct attribute *virtio_net_mrg_rx_attrs[] = {
	&mergeable_rx_buffer_size_attribute.attr,
	NULL
};

static const struct attribute_group virtio_net_mrg_rx_group = {
	.name = "virtio_net",
	.attrs = virtio_net_mrg_rx_attrs
};
#endif

static bool virtnet_fail_on_feature(struct virtio_device *vdev,
				    unsigned int fbit,
				    const char *fname, const char *dname)
{
	if (!virtio_has_feature(vdev, fbit))
		return false;

	dev_err(&vdev->dev, "device advertises feature %s but not %s",
		fname, dname);

	return true;
}

#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)

static bool virtnet_validate_features(struct virtio_device *vdev)
{
	if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
	    (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
			     "VIRTIO_NET_F_CTRL_VQ"))) {
		return false;
	}

	return true;
}

#define MIN_MTU ETH_MIN_MTU
#define MAX_MTU ETH_MAX_MTU

static int virtnet_validate(struct virtio_device *vdev)
{
	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

	if (!virtnet_validate_features(vdev))
		return -EINVAL;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		int mtu = virtio_cread16(vdev,
					 offsetof(struct virtio_net_config,
						  mtu));
		if (mtu < MIN_MTU)
			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
	}

	return 0;
}

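/* Device probe: translate negotiated features into netdev offloads, allocate
 * the queues and control state, then register the network device.
 */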
static int virtnet_probe(struct virtio_device *vdev)
{
	int i, err = -ENOMEM;
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;
	int mtu;

	/* Find if host supports multiqueue virtio_net device */
	err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
				   struct virtio_net_config,
				   max_virtqueue_pairs, &max_queue_pairs);

	/* We need at least 2 queue's */
	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;

	/* Allocate ourselves a network device with room for our info */
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE |
			   IFF_TX_SKB_NO_LINEAR;
	dev->netdev_ops = &virtnet_netdev;
	dev->features = NETIF_F_HIGHDMA;

	dev->ethtool_ops = &virtnet_ethtool_ops;
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
		/* This opens up the world of extra features. */
		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
		if (csum)
			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
			dev->hw_features |= NETIF_F_TSO
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
		/* Individual feature bits: what can host handle? */
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;

		dev->features |= NETIF_F_GSO_ROBUST;

		if (gso)
			dev->features |= dev->hw_features & NETIF_F_ALL_TSO;
		/* (!csum && gso) case will be fixed by register_netdev() */
	}
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
		dev->features |= NETIF_F_RXCSUM;
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6))
		dev->features |= NETIF_F_GRO_HW;
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS))
		dev->hw_features |= NETIF_F_GRO_HW;

	dev->vlan_features = dev->features;

	/* MTU range: 68 - 65535 */
	dev->min_mtu = MIN_MTU;
	dev->max_mtu = MAX_MTU;

	/* Configuration may specify what MAC to use.  Otherwise random. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
		virtio_cread_bytes(vdev,
				   offsetof(struct virtio_net_config, mac),
				   dev->dev_addr, dev->addr_len);
	else
		eth_hw_addr_random(dev);

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
	vdev->priv = vi;

	INIT_WORK(&vi->config_work, virtnet_config_changed_work);

	/* If we can receive ANY GSO packets, we must allocate large ones. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
		vi->big_packets = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		vi->hdr_len = sizeof(struct virtio_net_hdr);

	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->any_header_sg = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		mtu = virtio_cread16(vdev,
				     offsetof(struct virtio_net_config,
					      mtu));
		if (mtu < dev->min_mtu) {
			/* Should never trigger: MTU was previously validated
			 * in virtnet_validate.
			 */
			dev_err(&vdev->dev,
				"device MTU appears to have changed it is now %d < %d",
				mtu, dev->min_mtu);
			err = -EINVAL;
			goto free;
		}

		dev->mtu = mtu;
		dev->max_mtu = mtu;

		/* TODO: size buffers correctly in this case. */
		if (dev->mtu > ETH_DATA_LEN)
			vi->big_packets = true;
	}

	if (vi->any_header_sg)
		dev->needed_headroom = vi->hdr_len;

	/* Enable multiqueue by default */
	if (num_online_cpus() >= max_queue_pairs)
		vi->curr_queue_pairs = max_queue_pairs;
	else
		vi->curr_queue_pairs = num_online_cpus();
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
	err = init_vqs(vi);
	if (err)
		goto free;

#ifdef CONFIG_SYSFS
	if (vi->mergeable_rx_bufs)
		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
#endif
	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);

	virtnet_init_settings(dev);

	if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
		vi->failover = net_failover_create(vi->dev);
		if (IS_ERR(vi->failover)) {
			err = PTR_ERR(vi->failover);
			goto free_vqs;
		}
	}

	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
		goto free_failover;
	}

	virtio_device_ready(vdev);

	err = virtnet_cpu_notif_add(vi);
	if (err) {
		pr_debug("virtio_net: registering cpu notifier failed\n");
		goto free_unregister_netdev;
	}

	virtnet_set_queues(vi, vi->curr_queue_pairs);

	/* Assume link up if device can't report link status,
	   otherwise get link status from config. */
	netif_carrier_off(dev);
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		schedule_work(&vi->config_work);
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
		virtnet_update_settings(vi);
		netif_carrier_on(dev);
	}

	for (i = 0; i < ARRAY_SIZE(guest_offloads); i++)
		if (virtio_has_feature(vi->vdev, guest_offloads[i]))
			set_bit(guest_offloads[i], &vi->guest_offloads);
	vi->guest_offloads_capable = vi->guest_offloads;

	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
		 dev->name, max_queue_pairs);

	return 0;

free_unregister_netdev:
	vi->vdev->config->reset(vdev);

	unregister_netdev(dev);
free_failover:
	net_failover_destroy(vi->failover);
free_vqs:
	cancel_delayed_work_sync(&vi->refill);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
free:
	free_netdev(dev);
	return err;
}

static void remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);

	/* Free unused buffers in both send and recv, if any. */
	free_unused_bufs(vi);

	free_receive_bufs(vi);

	free_receive_page_frags(vi);

	virtnet_del_vqs(vi);
}

static void virtnet_remove(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device. */
	flush_work(&vi->config_work);

	unregister_netdev(vi->dev);

	net_failover_destroy(vi->failover);

	remove_vq_common(vi);

	free_netdev(vi->dev);
}

static __maybe_unused int virtnet_freeze(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);
	virtnet_freeze_down(vdev);
	remove_vq_common(vi);

	return 0;
}

static __maybe_unused int virtnet_restore(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err;

	err = virtnet_restore_up(vdev);
	if (err)
		return err;
	virtnet_set_queues(vi, vi->curr_queue_pairs);

	err = virtnet_cpu_notif_add(vi);
	if (err) {
		virtnet_freeze_down(vdev);
		remove_vq_common(vi);
		return err;
	}

	return 0;
}

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

#define VIRTNET_FEATURES \
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
	VIRTIO_NET_F_MAC, \
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
	VIRTIO_NET_F_CTRL_MAC_ADDR, \
	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
	VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY

static unsigned int features[] = {
	VIRTNET_FEATURES,
};

static unsigned int features_legacy[] = {
	VIRTNET_FEATURES,
	VIRTIO_NET_F_GSO,
	VIRTIO_F_ANY_LAYOUT,
};

static struct virtio_driver virtio_net_driver = {
3379 3380
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
3381 3382
	.feature_table_legacy = features_legacy,
	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
R
Rusty Russell 已提交
3383 3384 3385
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
3386
	.validate =	virtnet_validate,
R
Rusty Russell 已提交
3387
	.probe =	virtnet_probe,
3388
	.remove =	virtnet_remove,
3389
	.config_changed = virtnet_config_changed,
3390
#ifdef CONFIG_PM_SLEEP
3391 3392 3393
	.freeze =	virtnet_freeze,
	.restore =	virtnet_restore,
#endif
R
Rusty Russell 已提交
3394 3395
};

3396 3397 3398 3399
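/* Register the CPU hotplug callbacks used for queue affinity before the
 * virtio driver itself, and unwind in the opposite order on failure.
 */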
static __init int virtio_net_driver_init(void)
{
	int ret;

T
Thomas Gleixner 已提交
3400
	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
3401 3402 3403 3404 3405
				      virtnet_cpu_online,
				      virtnet_cpu_down_prep);
	if (ret < 0)
		goto out;
	virtionet_online = ret;
T
Thomas Gleixner 已提交
3406
	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425
				      NULL, virtnet_cpu_dead);
	if (ret)
		goto err_dead;

        ret = register_virtio_driver(&virtio_net_driver);
	if (ret)
		goto err_virtio;
	return 0;
err_virtio:
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
err_dead:
	cpuhp_remove_multi_state(virtionet_online);
out:
	return ret;
}
module_init(virtio_net_driver_init);

static __exit void virtio_net_driver_exit(void)
{
A
Andrew Jones 已提交
3426
	unregister_virtio_driver(&virtio_net_driver);
3427 3428 3429 3430
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
	cpuhp_remove_multi_state(virtionet_online);
}
module_exit(virtio_net_driver_exit);
R
Rusty Russell 已提交
3431 3432 3433 3434

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");