/* A network driver using virtio.
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
#include <linux/scatterlist.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>

static int napi_weight = 128;
module_param(napi_weight, int, 0444);

static bool csum = true, gso = true;
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);

/* FIXME: MTU in config. */
#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
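/* Roughly how many bytes of a big/mergeable receive are copied into the skb
 * head; the remainder stays in the pages and is attached as fragments
 * (see page_to_skb()).
 */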
#define GOOD_COPY_LEN	128

#define VIRTNET_SEND_COMMAND_SG_MAX    2
#define VIRTNET_DRIVER_VERSION "1.0.0"

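/* Per-cpu packet/byte counters, read and written under the u64_stats sync
 * points below (see virtnet_stats()).
 */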
struct virtnet_stats {
	struct u64_stats_sync tx_syncp;
	struct u64_stats_sync rx_syncp;
	u64 tx_bytes;
	u64 tx_packets;

	u64 rx_bytes;
	u64 rx_packets;
};

/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send_queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of the send queue: output.$index */
	char name[40];
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

	struct napi_struct napi;

	/* Number of input buffers, and max we've ever had. */
	unsigned int num, max;

	/* Chain pages by the private ptr. */
	struct page *pages;

	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of this receive queue: input.$index */
	char name[40];
};

struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
	struct send_queue *sq;
	struct receive_queue *rq;
	unsigned int status;

	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

	/* I like... big packets and I cannot lie! */
	bool big_packets;

	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

	/* Has control virtqueue */
	bool has_cvq;

	/* enable config space updates */
	bool config_enable;

	/* Active statistics */
	struct virtnet_stats __percpu *stats;

	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

	/* Work struct for config space updates */
	struct work_struct config_work;

	/* Lock for config space updates */
	struct mutex config_lock;

	/* Is affinity hint set for the virtqueues? */
	bool affinity_hint_set;
};

struct skb_vnet_hdr {
	union {
		struct virtio_net_hdr hdr;
		struct virtio_net_hdr_mrg_rxbuf mhdr;
	};
	unsigned int num_sg;
};

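/* struct virtio_net_hdr is 10 bytes; together with the 6 bytes of padding
 * below the header region is 16 bytes, so the data buffer that follows it
 * starts 16-byte aligned.
 */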
struct padded_vnet_hdr {
	struct virtio_net_hdr hdr;
	/*
	 * virtio_net_hdr should be in a separate sg buffer because of a
	 * QEMU bug, and data sg buffer shares same page with this header sg.
	 * This padding makes next sg 16 byte aligned after virtio_net_hdr.
	 */
	char padding[6];
};

/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
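/* For example, with 2 queue pairs: vq 0/1 serve rx0/tx0, vq 2/3 serve
 * rx1/tx1, and vq 4 is the control virtqueue.
 */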
static int vq2txq(struct virtqueue *vq)
{
	return (virtqueue_get_queue_index(vq) - 1) / 2;
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
	return virtqueue_get_queue_index(vq) / 2;
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}

static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
{
	return (struct skb_vnet_hdr *)skb->cb;
}

/*
 * private is used to chain pages for big packets, put the whole
 * most recently used list at the beginning for reuse
 */
static void give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end;

	/* Find end of list, sew whole thing into vi->rq.pages. */
	for (end = page; end->private; end = (struct page *)end->private);
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}

static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
{
	struct page *p = rq->pages;

	if (p) {
		rq->pages = (struct page *)p->private;
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
		p = alloc_page(gfp_mask);
	return p;
}

static void skb_xmit_done(struct virtqueue *vq)
{
	struct virtnet_info *vi = vq->vdev->priv;

	/* Suppress further interrupts. */
	virtqueue_disable_cb(vq);

	/* We were probably waiting for more output buffers. */
	netif_wake_subqueue(vi->dev, vq2txq(vq));
}

static void set_skb_frag(struct sk_buff *skb, struct page *page,
			 unsigned int offset, unsigned int *len)
{
	int size = min((unsigned)PAGE_SIZE - offset, *len);
	int i = skb_shinfo(skb)->nr_frags;

	__skb_fill_page_desc(skb, i, page, offset, size);

	skb->data_len += size;
	skb->len += size;
	skb->truesize += PAGE_SIZE;
	skb_shinfo(skb)->nr_frags++;
	*len -= size;
}

/* Called from bottom half context */
static struct sk_buff *page_to_skb(struct receive_queue *rq,
				   struct page *page, unsigned int len)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	struct sk_buff *skb;
	struct skb_vnet_hdr *hdr;
	unsigned int copy, hdr_len, offset;
	char *p;

	p = page_address(page);

	/* copy small packet so we can reuse these pages for small data */
	skb = netdev_alloc_skb_ip_align(vi->dev, GOOD_COPY_LEN);
	if (unlikely(!skb))
		return NULL;

	hdr = skb_vnet_hdr(skb);

	if (vi->mergeable_rx_bufs) {
		hdr_len = sizeof hdr->mhdr;
		offset = hdr_len;
	} else {
		hdr_len = sizeof hdr->hdr;
		offset = sizeof(struct padded_vnet_hdr);
	}

	memcpy(hdr, p, hdr_len);

	len -= hdr_len;
	p += offset;

	copy = len;
	if (copy > skb_tailroom(skb))
		copy = skb_tailroom(skb);
	memcpy(skb_put(skb, copy), p, copy);

	len -= copy;
	offset += copy;

	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
		dev_kfree_skb(skb);
		return NULL;
	}

	while (len) {
		set_skb_frag(skb, page, offset, &len);
		page = (struct page *)page->private;
		offset = 0;
	}

	if (page)
		give_pages(rq, page);

	return skb;
}

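/* A packet received into mergeable buffers spans hdr->mhdr.num_buffers
 * page-sized buffers; pull the remaining ones off the virtqueue and attach
 * them to the skb as fragments.
 */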
static int receive_mergeable(struct receive_queue *rq, struct sk_buff *skb)
{
	struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
	struct page *page;
	int num_buf, i, len;

	num_buf = hdr->mhdr.num_buffers;
	while (--num_buf) {
		i = skb_shinfo(skb)->nr_frags;
		if (i >= MAX_SKB_FRAGS) {
			pr_debug("%s: packet too long\n", skb->dev->name);
			skb->dev->stats.rx_length_errors++;
			return -EINVAL;
		}
		page = virtqueue_get_buf(rq->vq, &len);
		if (!page) {
			pr_debug("%s: rx error: %d buffers missing\n",
				 skb->dev->name, hdr->mhdr.num_buffers);
			skb->dev->stats.rx_length_errors++;
			return -EINVAL;
		}

		if (len > PAGE_SIZE)
			len = PAGE_SIZE;

		set_skb_frag(skb, page, 0, &len);

		--rq->num;
	}
	return 0;
}

static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	struct net_device *dev = vi->dev;
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
	struct sk_buff *skb;
	struct page *page;
	struct skb_vnet_hdr *hdr;

	if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
		if (vi->mergeable_rx_bufs || vi->big_packets)
			give_pages(rq, buf);
		else
			dev_kfree_skb(buf);
		return;
	}

	if (!vi->mergeable_rx_bufs && !vi->big_packets) {
		skb = buf;
		len -= sizeof(struct virtio_net_hdr);
		skb_trim(skb, len);
	} else {
		page = buf;
		skb = page_to_skb(rq, page, len);
		if (unlikely(!skb)) {
			dev->stats.rx_dropped++;
			give_pages(rq, page);
			return;
		}
		if (vi->mergeable_rx_bufs)
			if (receive_mergeable(rq, skb)) {
				dev_kfree_skb(skb);
				return;
			}
	}

	hdr = skb_vnet_hdr(skb);

	u64_stats_update_begin(&stats->rx_syncp);
	stats->rx_bytes += skb->len;
	stats->rx_packets++;
	u64_stats_update_end(&stats->rx_syncp);

	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
		pr_debug("Needs csum!\n");
		if (!skb_partial_csum_set(skb,
					  hdr->hdr.csum_start,
					  hdr->hdr.csum_offset))
			goto frame_err;
	} else if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}

	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

	if (hdr->hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		pr_debug("GSO!\n");
		switch (hdr->hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
		case VIRTIO_NET_HDR_GSO_TCPV4:
			skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
			break;
		case VIRTIO_NET_HDR_GSO_UDP:
			skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
			break;
		case VIRTIO_NET_HDR_GSO_TCPV6:
			skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
			break;
		default:
			net_warn_ratelimited("%s: bad gso type %u.\n",
					     dev->name, hdr->hdr.gso_type);
			goto frame_err;
		}

		if (hdr->hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
			skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;

		skb_shinfo(skb)->gso_size = hdr->hdr.gso_size;
		if (skb_shinfo(skb)->gso_size == 0) {
			net_warn_ratelimited("%s: zero gso size.\n", dev->name);
			goto frame_err;
		}

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;
	}

	netif_receive_skb(skb);
	return;

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
}

static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	struct sk_buff *skb;
	struct skb_vnet_hdr *hdr;
	int err;

	skb = __netdev_alloc_skb_ip_align(vi->dev, MAX_PACKET_LEN, gfp);
	if (unlikely(!skb))
		return -ENOMEM;

	skb_put(skb, MAX_PACKET_LEN);

	hdr = skb_vnet_hdr(skb);
	sg_set_buf(rq->sg, &hdr->hdr, sizeof hdr->hdr);

	skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);

	err = virtqueue_add_buf(rq->vq, rq->sg, 0, 2, skb, gfp);
	if (err < 0)
		dev_kfree_skb(skb);

	return err;
}

static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
		first = get_a_page(rq, gfp);
		if (!first) {
			if (list)
				give_pages(rq, list);
			return -ENOMEM;
		}
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);

		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}

	first = get_a_page(rq, gfp);
	if (!first) {
		give_pages(rq, list);
		return -ENOMEM;
	}
	p = page_address(first);

	/* rq->sg[0], rq->sg[1] share the same page */
	/* a separate rq->sg[0] for virtio_net_hdr only due to QEMU bug */
	sg_set_buf(&rq->sg[0], p, sizeof(struct virtio_net_hdr));

	/* rq->sg[1] for data packet, from offset */
	offset = sizeof(struct padded_vnet_hdr);
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);

	/* chain first in list head */
	first->private = (unsigned long)list;
	err = virtqueue_add_buf(rq->vq, rq->sg, 0, MAX_SKB_FRAGS + 2,
				first, gfp);
	if (err < 0)
		give_pages(rq, first);

	return err;
}

static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
{
	struct page *page;
	int err;

	page = get_a_page(rq, gfp);
	if (!page)
		return -ENOMEM;

	sg_init_one(rq->sg, page_address(page), PAGE_SIZE);

	err = virtqueue_add_buf(rq->vq, rq->sg, 0, 1, page, gfp);
	if (err < 0)
		give_pages(rq, page);

	return err;
}

/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
static bool try_fill_recv(struct receive_queue *rq, gfp_t gfp)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	int err;
	bool oom;

	do {
		if (vi->mergeable_rx_bufs)
			err = add_recvbuf_mergeable(rq, gfp);
		else if (vi->big_packets)
			err = add_recvbuf_big(rq, gfp);
		else
			err = add_recvbuf_small(rq, gfp);

		oom = err == -ENOMEM;
		if (err < 0)
			break;
		++rq->num;
	} while (err > 0);
	if (unlikely(rq->num > rq->max))
		rq->max = rq->num;
	virtqueue_kick(rq->vq);
	return !oom;
}

static void skb_recv_done(struct virtqueue *rvq)
{
	struct virtnet_info *vi = rvq->vdev->priv;
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];

	/* Schedule NAPI, Suppress further interrupts if successful. */
	if (napi_schedule_prep(&rq->napi)) {
		virtqueue_disable_cb(rvq);
		__napi_schedule(&rq->napi);
	}
}

static void virtnet_napi_enable(struct receive_queue *rq)
{
	napi_enable(&rq->napi);

	/* If all buffers were filled by other side before we napi_enabled, we
	 * won't get another interrupt, so process any outstanding packets
	 * now.  virtnet_poll wants to re-enable the queue, so we disable here.
	 * We synchronize against interrupts via NAPI_STATE_SCHED */
	if (napi_schedule_prep(&rq->napi)) {
		virtqueue_disable_cb(rq->vq);
		local_bh_disable();
		__napi_schedule(&rq->napi);
		local_bh_enable();
	}
}

static void refill_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
	bool still_empty;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct receive_queue *rq = &vi->rq[i];

		napi_disable(&rq->napi);
		still_empty = !try_fill_recv(rq, GFP_KERNEL);
		virtnet_napi_enable(rq);

		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
}

static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
	struct virtnet_info *vi = rq->vq->vdev->priv;
	void *buf;
	unsigned int len, received = 0;

again:
	while (received < budget &&
	       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
		receive_buf(rq, buf, len);
		--rq->num;
		received++;
	}

	if (rq->num < rq->max / 2) {
		if (!try_fill_recv(rq, GFP_ATOMIC))
			schedule_delayed_work(&vi->refill, 0);
	}

	/* Out of packets? */
	if (received < budget) {
		napi_complete(napi);
		if (unlikely(!virtqueue_enable_cb(rq->vq)) &&
		    napi_schedule_prep(napi)) {
			virtqueue_disable_cb(rq->vq);
			__napi_schedule(napi);
			goto again;
		}
	}

	return received;
}

static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		/* Make sure we have some buffers: if oom use wq. */
		if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
			schedule_delayed_work(&vi->refill, 0);
		virtnet_napi_enable(&vi->rq[i]);
	}

	return 0;
}

static unsigned int free_old_xmit_skbs(struct send_queue *sq)
{
	struct sk_buff *skb;
	unsigned int len, tot_sgs = 0;
	struct virtnet_info *vi = sq->vq->vdev->priv;
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);

	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		pr_debug("Sent skb %p\n", skb);

		u64_stats_update_begin(&stats->tx_syncp);
		stats->tx_bytes += skb->len;
		stats->tx_packets++;
		u64_stats_update_end(&stats->tx_syncp);

		tot_sgs += skb_vnet_hdr(skb)->num_sg;
		dev_kfree_skb_any(skb);
	}
	return tot_sgs;
}

static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
{
	struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
	struct virtnet_info *vi = sq->vq->vdev->priv;

	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		hdr->hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
		hdr->hdr.csum_start = skb_checksum_start_offset(skb);
		hdr->hdr.csum_offset = skb->csum_offset;
	} else {
		hdr->hdr.flags = 0;
		hdr->hdr.csum_offset = hdr->hdr.csum_start = 0;
	}

	if (skb_is_gso(skb)) {
		hdr->hdr.hdr_len = skb_headlen(skb);
		hdr->hdr.gso_size = skb_shinfo(skb)->gso_size;
		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
			hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
		else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
			hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
		else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
			hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
		else
			BUG();
		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN)
			hdr->hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
	} else {
		hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
		hdr->hdr.gso_size = hdr->hdr.hdr_len = 0;
	}

	hdr->mhdr.num_buffers = 0;

	/* Encode metadata header at front. */
	if (vi->mergeable_rx_bufs)
		sg_set_buf(sq->sg, &hdr->mhdr, sizeof hdr->mhdr);
	else
		sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr);

	hdr->num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
	return virtqueue_add_buf(sq->vq, sq->sg, hdr->num_sg,
				 0, skb, GFP_ATOMIC);
}

static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
	int capacity;

	/* Free up any pending old buffers before queueing new ones. */
	free_old_xmit_skbs(sq);

	/* Try to transmit */
	capacity = xmit_skb(sq, skb);

	/* This can happen with OOM and indirect buffers. */
	if (unlikely(capacity < 0)) {
		if (likely(capacity == -ENOMEM)) {
			if (net_ratelimit())
				dev_warn(&dev->dev,
					 "TXQ (%d) failure: out of memory\n",
					 qnum);
		} else {
			dev->stats.tx_fifo_errors++;
			if (net_ratelimit())
				dev_warn(&dev->dev,
					 "Unexpected TXQ (%d) failure: %d\n",
					 qnum, capacity);
		}
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return NETDEV_TX_OK;
	}
	virtqueue_kick(sq->vq);

	/* Don't wait up for transmitted skbs to be freed. */
	skb_orphan(skb);
	nf_reset(skb);

	/* Apparently nice girls don't return TX_BUSY; stop the queue
	 * before it gets out of hand.  Naturally, this wastes entries. */
	if (capacity < 2+MAX_SKB_FRAGS) {
		netif_stop_subqueue(dev, qnum);
		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
			/* More just got used, free them then recheck. */
			capacity += free_old_xmit_skbs(sq);
			if (capacity >= 2+MAX_SKB_FRAGS) {
				netif_start_subqueue(dev, qnum);
				virtqueue_disable_cb(sq->vq);
			}
		}
	}

	return NETDEV_TX_OK;
}

static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
	int ret;

	ret = eth_mac_addr(dev, p);
	if (ret)
		return ret;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
		vdev->config->set(vdev, offsetof(struct virtio_net_config, mac),
		                  dev->dev_addr, dev->addr_len);

	return 0;
}

static struct rtnl_link_stats64 *virtnet_stats(struct net_device *dev,
					       struct rtnl_link_stats64 *tot)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int cpu;
	unsigned int start;

	for_each_possible_cpu(cpu) {
		struct virtnet_stats *stats = per_cpu_ptr(vi->stats, cpu);
		u64 tpackets, tbytes, rpackets, rbytes;

		do {
			start = u64_stats_fetch_begin_bh(&stats->tx_syncp);
			tpackets = stats->tx_packets;
			tbytes   = stats->tx_bytes;
		} while (u64_stats_fetch_retry_bh(&stats->tx_syncp, start));

		do {
			start = u64_stats_fetch_begin_bh(&stats->rx_syncp);
			rpackets = stats->rx_packets;
			rbytes   = stats->rx_bytes;
		} while (u64_stats_fetch_retry_bh(&stats->rx_syncp, start));

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes   += rbytes;
		tot->tx_bytes   += tbytes;
	}

	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->rx_dropped = dev->stats.rx_dropped;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;

	return tot;
}

#ifdef CONFIG_NET_POLL_CONTROLLER
static void virtnet_netpoll(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->curr_queue_pairs; i++)
		napi_schedule(&vi->rq[i].napi);
}
#endif

/*
 * Send command via the control virtqueue and check status.  Commands
 * supported by the hypervisor, as indicated by feature bits, should
 * never fail unless improperly formatted.
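 * The request is a single sg chain: the command header, the caller's data
 * entries, then a status byte that the device fills in.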
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
				 struct scatterlist *data, int out, int in)
{
	struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2];
	struct virtio_net_ctrl_hdr ctrl;
	virtio_net_ctrl_ack status = ~0;
	unsigned int tmp;
	int i;

	/* Caller should know better */
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ||
		(out + in > VIRTNET_SEND_COMMAND_SG_MAX));

	out++; /* Add header */
	in++; /* Add return status */

	ctrl.class = class;
	ctrl.cmd = cmd;

	sg_init_table(sg, out + in);

	sg_set_buf(&sg[0], &ctrl, sizeof(ctrl));
	for_each_sg(data, s, out + in - 2, i)
		sg_set_buf(&sg[i + 1], sg_virt(s), s->length);
	sg_set_buf(&sg[out + in - 1], &status, sizeof(status));

	BUG_ON(virtqueue_add_buf(vi->cvq, sg, out, in, vi, GFP_ATOMIC) < 0);

	virtqueue_kick(vi->cvq);

	/*
	 * Spin for a response, the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
	while (!virtqueue_get_buf(vi->cvq, &tmp))
		cpu_relax();

	return status == VIRTIO_NET_OK;
}

static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL,
				  0, 0))
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}

static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	struct scatterlist sg;
	struct virtio_net_ctrl_mq s;
	struct net_device *dev = vi->dev;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

	s.virtqueue_pairs = queue_pairs;
	sg_init_one(&sg, &s, sizeof(s));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, 1, 0)) {
		dev_warn(&dev->dev, "Failed to set the number of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
	} else
		vi->curr_queue_pairs = queue_pairs;

	return 0;
}

static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);

	for (i = 0; i < vi->max_queue_pairs; i++)
		napi_disable(&vi->rq[i].napi);

	return 0;
}

static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg[2];
	u8 promisc, allmulti;
	struct virtio_net_ctrl_mac *mac_data;
	struct netdev_hw_addr *ha;
	int uc_count;
	int mc_count;
	void *buf;
	int i;

	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

	promisc = ((dev->flags & IFF_PROMISC) != 0);
	allmulti = ((dev->flags & IFF_ALLMULTI) != 0);

	sg_init_one(sg, &promisc, sizeof(promisc));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_PROMISC,
				  sg, 1, 0))
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
			 promisc ? "en" : "dis");

	sg_init_one(sg, &allmulti, sizeof(allmulti));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_ALLMULTI,
				  sg, 1, 0))
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
			 allmulti ? "en" : "dis");

	uc_count = netdev_uc_count(dev);
	mc_count = netdev_mc_count(dev);
	/* MAC filter - use one buffer for both lists */
	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
	mac_data = buf;
	if (!buf) {
		dev_warn(&dev->dev, "No memory for MAC address buffer\n");
		return;
	}

	sg_init_table(sg, 2);

	/* Store the unicast list and count in the front of the buffer */
	mac_data->entries = uc_count;
	i = 0;
	netdev_for_each_uc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[0], mac_data,
		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));

	/* multicast list and count fill the end */
	mac_data = (void *)&mac_data->macs[uc_count][0];

	mac_data->entries = mc_count;
	i = 0;
	netdev_for_each_mc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[1], mac_data,
		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
				  VIRTIO_NET_CTRL_MAC_TABLE_SET,
				  sg, 2, 0))
		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");

	kfree(buf);
}

static int virtnet_vlan_rx_add_vid(struct net_device *dev, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	sg_init_one(&sg, &vid, sizeof(vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_ADD, &sg, 1, 0))
		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
	return 0;
}

static int virtnet_vlan_rx_kill_vid(struct net_device *dev, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	sg_init_one(&sg, &vid, sizeof(vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_DEL, &sg, 1, 0))
		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
	return 0;
}

static void virtnet_set_affinity(struct virtnet_info *vi, bool set)
{
	int i;

	/* In multiqueue mode, when the number of cpus is equal to the number of
	 * queue pairs, we let each queue pair be private to one cpu by
	 * setting the affinity hint to eliminate the contention.
	 */
	if ((vi->curr_queue_pairs == 1 ||
	     vi->max_queue_pairs != num_online_cpus()) && set) {
		if (vi->affinity_hint_set)
			set = false;
		else
			return;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		int cpu = set ? i : -1;
		virtqueue_set_affinity(vi->rq[i].vq, cpu);
		virtqueue_set_affinity(vi->sq[i].vq, cpu);
	}

	if (set)
		vi->affinity_hint_set = true;
	else
		vi->affinity_hint_set = false;
}

static void virtnet_get_ringparam(struct net_device *dev,
				struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}


static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));

}

/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

	if (queue_pairs > vi->max_queue_pairs)
		return -EINVAL;

	err = virtnet_set_queues(vi, queue_pairs);
	if (!err) {
		netif_set_real_num_tx_queues(dev, queue_pairs);
		netif_set_real_num_rx_queues(dev, queue_pairs);

		virtnet_set_affinity(vi, true);
	}

	return err;
}

static void virtnet_get_channels(struct net_device *dev,
				 struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);

	channels->combined_count = vi->curr_queue_pairs;
	channels->max_combined = vi->max_queue_pairs;
	channels->max_other = 0;
	channels->rx_count = 0;
	channels->tx_count = 0;
	channels->other_count = 0;
}

static const struct ethtool_ops virtnet_ethtool_ops = {
	.get_drvinfo = virtnet_get_drvinfo,
	.get_link = ethtool_op_get_link,
	.get_ringparam = virtnet_get_ringparam,
	.set_channels = virtnet_set_channels,
	.get_channels = virtnet_get_channels,
};

#define MIN_MTU 68
#define MAX_MTU 65535

static int virtnet_change_mtu(struct net_device *dev, int new_mtu)
{
	if (new_mtu < MIN_MTU || new_mtu > MAX_MTU)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}

/* To avoid contending a lock held by a vcpu who would exit to host, select the
 * txq based on the processor id.
 * TODO: handle cpu hotplug.
 */
static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb)
{
	int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
		  smp_processor_id();

	while (unlikely(txq >= dev->real_num_tx_queues))
		txq -= dev->real_num_tx_queues;

	return txq;
}

static const struct net_device_ops virtnet_netdev = {
	.ndo_open            = virtnet_open,
	.ndo_stop   	     = virtnet_close,
	.ndo_start_xmit      = start_xmit,
	.ndo_validate_addr   = eth_validate_addr,
	.ndo_set_mac_address = virtnet_set_mac_address,
	.ndo_set_rx_mode     = virtnet_set_rx_mode,
	.ndo_change_mtu	     = virtnet_change_mtu,
	.ndo_get_stats64     = virtnet_stats,
	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
	.ndo_select_queue     = virtnet_select_queue,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = virtnet_netpoll,
#endif
};

static void virtnet_config_changed_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, config_work);
	u16 v;

	mutex_lock(&vi->config_lock);
	if (!vi->config_enable)
		goto done;

	if (virtio_config_val(vi->vdev, VIRTIO_NET_F_STATUS,
			      offsetof(struct virtio_net_config, status),
			      &v) < 0)
		goto done;

	if (v & VIRTIO_NET_S_ANNOUNCE) {
		netdev_notify_peers(vi->dev);
		virtnet_ack_link_announce(vi);
	}

	/* Ignore unknown (future) status bits */
	v &= VIRTIO_NET_S_LINK_UP;

	if (vi->status == v)
		goto done;

	vi->status = v;

	if (vi->status & VIRTIO_NET_S_LINK_UP) {
		netif_carrier_on(vi->dev);
		netif_tx_wake_all_queues(vi->dev);
	} else {
		netif_carrier_off(vi->dev);
		netif_tx_stop_all_queues(vi->dev);
	}
done:
	mutex_unlock(&vi->config_lock);
}

static void virtnet_config_changed(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	schedule_work(&vi->config_work);
}

static void virtnet_free_queues(struct virtnet_info *vi)
{
	kfree(vi->rq);
	kfree(vi->sq);
}

static void free_receive_bufs(struct virtnet_info *vi)
{
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		while (vi->rq[i].pages)
			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
	}
}

static void free_unused_bufs(struct virtnet_info *vi)
{
	void *buf;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->sq[i].vq;
		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
			dev_kfree_skb(buf);
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->rq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (vi->mergeable_rx_bufs || vi->big_packets)
				give_pages(&vi->rq[i], buf);
			else
				dev_kfree_skb(buf);
			--vi->rq[i].num;
		}
		BUG_ON(vi->rq[i].num != 0);
	}
}

static void virtnet_del_vqs(struct virtnet_info *vi)
{
	struct virtio_device *vdev = vi->vdev;

	virtnet_set_affinity(vi, false);

	vdev->config->del_vqs(vdev);

	virtnet_free_queues(vi);
}

static int virtnet_find_vqs(struct virtnet_info *vi)
{
	vq_callback_t **callbacks;
	struct virtqueue **vqs;
	int ret = -ENOMEM;
	int i, total_vqs;
	const char **names;

	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
	 * possible control vq.
	 */
	total_vqs = vi->max_queue_pairs * 2 +
		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);

	/* Allocate space for find_vqs parameters */
	vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
	if (!vqs)
		goto err_vq;
	callbacks = kmalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
	if (!callbacks)
		goto err_callback;
	names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);
	if (!names)
		goto err_names;

	/* Parameters for control virtqueue, if any */
	if (vi->has_cvq) {
		callbacks[total_vqs - 1] = NULL;
		names[total_vqs - 1] = "control";
	}

	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
	}

	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
					 names);
	if (ret)
		goto err_find;

	if (vi->has_cvq) {
		vi->cvq = vqs[total_vqs - 1];
		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
			vi->dev->features |= NETIF_F_HW_VLAN_FILTER;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].vq = vqs[rxq2vq(i)];
		vi->sq[i].vq = vqs[txq2vq(i)];
	}

	kfree(names);
	kfree(callbacks);
	kfree(vqs);

	return 0;

err_find:
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;
}

static int virtnet_alloc_queues(struct virtnet_info *vi)
{
	int i;

	vi->sq = kzalloc(sizeof(*vi->sq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->sq)
		goto err_sq;
	vi->rq = kzalloc(sizeof(*vi->rq) * vi->max_queue_pairs, GFP_KERNEL);
	if (!vi->rq)
		goto err_rq;

	INIT_DELAYED_WORK(&vi->refill, refill_work);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].pages = NULL;
		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
			       napi_weight);

		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
	}

	return 0;

err_rq:
	kfree(vi->sq);
err_sq:
	return -ENOMEM;
}

static int init_vqs(struct virtnet_info *vi)
{
	int ret;

	/* Allocate send & receive queues */
	ret = virtnet_alloc_queues(vi);
	if (ret)
		goto err;

	ret = virtnet_find_vqs(vi);
	if (ret)
		goto err_free;

	virtnet_set_affinity(vi, true);
	return 0;

err_free:
	virtnet_free_queues(vi);
err:
	return ret;
}

static int virtnet_probe(struct virtio_device *vdev)
{
	int i, err;
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;

	/* Find if host supports multiqueue virtio_net device */
	err = virtio_config_val(vdev, VIRTIO_NET_F_MQ,
				offsetof(struct virtio_net_config,
				max_virtqueue_pairs), &max_queue_pairs);

	/* We need at least 2 queues */
	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;

	/* Allocate ourselves a network device with room for our info */
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
	dev->netdev_ops = &virtnet_netdev;
	dev->features = NETIF_F_HIGHDMA;

	SET_ETHTOOL_OPS(dev, &virtnet_ethtool_ops);
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
		/* This opens up the world of extra features. */
		dev->hw_features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST;
		if (csum)
			dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST;

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
			dev->hw_features |= NETIF_F_TSO | NETIF_F_UFO
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
		/* Individual feature bits: what can host handle? */
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UFO))
			dev->hw_features |= NETIF_F_UFO;

		if (gso)
			dev->features |= dev->hw_features & (NETIF_F_ALL_TSO|NETIF_F_UFO);
		/* (!csum && gso) case will be fixed by register_netdev() */
	}

	/* Configuration may specify what MAC to use.  Otherwise random. */
	if (virtio_config_val_len(vdev, VIRTIO_NET_F_MAC,
				  offsetof(struct virtio_net_config, mac),
				  dev->dev_addr, dev->addr_len) < 0)
		eth_hw_addr_random(dev);

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
	vdev->priv = vi;
	vi->stats = alloc_percpu(struct virtnet_stats);
	err = -ENOMEM;
	if (vi->stats == NULL)
		goto free;

	mutex_init(&vi->config_lock);
	vi->config_enable = true;
	INIT_WORK(&vi->config_work, virtnet_config_changed_work);

	/* If we can receive ANY GSO packets, we must allocate large ones. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN))
		vi->big_packets = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

	/* Use single tx/rx queue pair as default */
	vi->curr_queue_pairs = 1;
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
	err = init_vqs(vi);
	if (err)
		goto free_stats;

	netif_set_real_num_tx_queues(dev, 1);
	netif_set_real_num_rx_queues(dev, 1);

	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
		goto free_vqs;
	}

	/* Last of all, set up some receive buffers. */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		try_fill_recv(&vi->rq[i], GFP_KERNEL);

		/* If we didn't even get one input buffer, we're useless. */
		if (vi->rq[i].num == 0) {
			free_unused_bufs(vi);
			err = -ENOMEM;
			goto free_recv_bufs;
		}
	}

	/* Assume link up if device can't report link status,
	   otherwise get link status from config. */
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		netif_carrier_off(dev);
		schedule_work(&vi->config_work);
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
		netif_carrier_on(dev);
	}

	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
		 dev->name, max_queue_pairs);

	return 0;

free_recv_bufs:
	free_receive_bufs(vi);
	unregister_netdev(dev);
free_vqs:
	cancel_delayed_work_sync(&vi->refill);
	virtnet_del_vqs(vi);
free_stats:
	free_percpu(vi->stats);
free:
	free_netdev(dev);
	return err;
}

static void remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);

	/* Free unused buffers in both send and recv, if any. */
	free_unused_bufs(vi);

	free_receive_bufs(vi);

	virtnet_del_vqs(vi);
}

static void virtnet_remove(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	/* Prevent config work handler from accessing the device. */
	mutex_lock(&vi->config_lock);
	vi->config_enable = false;
	mutex_unlock(&vi->config_lock);

	unregister_netdev(vi->dev);

	remove_vq_common(vi);

	flush_work(&vi->config_work);

	free_percpu(vi->stats);
	free_netdev(vi->dev);
}

#ifdef CONFIG_PM
static int virtnet_freeze(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int i;

	/* Prevent config work handler from accessing the device */
	mutex_lock(&vi->config_lock);
	vi->config_enable = false;
	mutex_unlock(&vi->config_lock);

	netif_device_detach(vi->dev);
	cancel_delayed_work_sync(&vi->refill);

	if (netif_running(vi->dev))
		for (i = 0; i < vi->max_queue_pairs; i++) {
			napi_disable(&vi->rq[i].napi);
			netif_napi_del(&vi->rq[i].napi);
		}

	remove_vq_common(vi);

	flush_work(&vi->config_work);

	return 0;
}

static int virtnet_restore(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err, i;

	err = init_vqs(vi);
	if (err)
		return err;

	if (netif_running(vi->dev))
		for (i = 0; i < vi->max_queue_pairs; i++)
			virtnet_napi_enable(&vi->rq[i]);

	netif_device_attach(vi->dev);

	for (i = 0; i < vi->max_queue_pairs; i++)
		if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
			schedule_delayed_work(&vi->refill, 0);

	mutex_lock(&vi->config_lock);
	vi->config_enable = true;
	mutex_unlock(&vi->config_lock);

	virtnet_set_queues(vi, vi->curr_queue_pairs);

	return 0;
}
#endif

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

static unsigned int features[] = {
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM,
	VIRTIO_NET_F_GSO, VIRTIO_NET_F_MAC,
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6,
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6,
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO,
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ,
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ,
};

static struct virtio_driver virtio_net_driver = {
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
	.probe =	virtnet_probe,
	.remove =	virtnet_remove,
	.config_changed = virtnet_config_changed,
#ifdef CONFIG_PM
	.freeze =	virtnet_freeze,
	.restore =	virtnet_restore,
#endif
};

static int __init init(void)
{
	return register_virtio_driver(&virtio_net_driver);
}

static void __exit fini(void)
{
	unregister_virtio_driver(&virtio_net_driver);
}
module_init(init);
module_exit(fini);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");