/* drivers/virtio/virtio_ring.c */
/* Virtio ring implementation.
 *
 *  Copyright 2007 Rusty Russell IBM Corporation
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include <linux/virtio.h>
#include <linux/virtio_ring.h>
#include <linux/virtio_config.h>
#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/hrtimer.h>
#include <linux/kmemleak.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <xen/xen.h>

#ifdef DEBUG
/* For development, we want to crash whenever the ring is screwed. */
#define BAD_RING(_vq, fmt, args...)				\
	do {							\
		dev_err(&(_vq)->vq.vdev->dev,			\
			"%s:"fmt, (_vq)->vq.name, ##args);	\
		BUG();						\
	} while (0)
/* Caller is supposed to guarantee no reentry. */
#define START_USE(_vq)						\
	do {							\
		if ((_vq)->in_use)				\
			panic("%s:in_use = %i\n",		\
			      (_vq)->vq.name, (_vq)->in_use);	\
		(_vq)->in_use = __LINE__;			\
	} while (0)
#define END_USE(_vq) \
	do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; } while (0)
#else
/* Production: mark the ring broken instead of crashing the machine. */
#define BAD_RING(_vq, fmt, args...)				\
	do {							\
		dev_err(&(_vq)->vq.vdev->dev,			\
			"%s:"fmt, (_vq)->vq.name, ##args);	\
		(_vq)->broken = true;				\
	} while (0)
#define START_USE(vq)
#define END_USE(vq)
#endif

/* Driver-private bookkeeping kept per ring descriptor (outside the ring). */
struct vring_desc_state {
	void *data;			/* Data for callback. */
	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
};

64
struct vring_virtqueue {
65 66 67 68 69
	struct virtqueue vq;

	/* Actual memory layout for this queue */
	struct vring vring;

70 71 72
	/* Can we use weak barriers? */
	bool weak_barriers;

73 74 75
	/* Other side has made a mess, don't try any more. */
	bool broken;

76 77 78
	/* Host supports indirect buffers */
	bool indirect;

79 80 81
	/* Host publishes avail event idx */
	bool event;

82 83 84 85 86 87
	/* Head of free buffer list. */
	unsigned int free_head;
	/* Number we've added since last sync. */
	unsigned int num_added;

	/* Last used index we've seen. */
A
Anthony Liguori 已提交
88
	u16 last_used_idx;
89

90 91 92 93 94 95
	/* Last written value to avail->flags */
	u16 avail_flags_shadow;

	/* Last written value to avail->idx in guest byte order */
	u16 avail_idx_shadow;

96
	/* How to notify other side. FIXME: commonalize hcalls! */
97
	bool (*notify)(struct virtqueue *vq);
98

99 100 101 102 103
	/* DMA, allocation, and size information */
	bool we_own_ring;
	size_t queue_size_in_bytes;
	dma_addr_t queue_dma_addr;

104 105 106
#ifdef DEBUG
	/* They're supposed to lock for us. */
	unsigned int in_use;
107 108 109 110

	/* Figure out if their kicks are too delayed. */
	bool last_add_time_valid;
	ktime_t last_add_time;
111 112
#endif

A
Andy Lutomirski 已提交
113 114
	/* Per-descriptor state. */
	struct vring_desc_state desc_state[];
115 116 117 118
};

/* Convert a generic virtqueue pointer to our private vring_virtqueue. */
#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)

119
/*
120 121 122 123
 * Modern virtio devices have feature bits to specify whether they need a
 * quirk and bypass the IOMMU. If not there, just use the DMA API.
 *
 * If there, the interaction between virtio and DMA API is messy.
124 125 126 127 128 129 130 131 132 133 134 135 136 137 138
 *
 * On most systems with virtio, physical addresses match bus addresses,
 * and it doesn't particularly matter whether we use the DMA API.
 *
 * On some systems, including Xen and any system with a physical device
 * that speaks virtio behind a physical IOMMU, we must use the DMA API
 * for virtio DMA to work at all.
 *
 * On other systems, including SPARC and PPC64, virtio-pci devices are
 * enumerated as though they are behind an IOMMU, but the virtio host
 * ignores the IOMMU, so we must either pretend that the IOMMU isn't
 * there or somehow map everything as the identity.
 *
 * For the time being, we preserve historic behavior and bypass the DMA
 * API.
139 140 141 142
 *
 * TODO: install a per-device DMA ops structure that does the right thing
 * taking into account all the above quirks, and use the DMA API
 * unconditionally on data path.
143 144 145 146
 */

static bool vring_use_dma_api(struct virtio_device *vdev)
{
147 148 149 150
	if (!virtio_has_iommu_quirk(vdev))
		return true;

	/* Otherwise, we are left to guess. */
A
Andy Lutomirski 已提交
151 152 153 154 155 156 157 158 159 160 161
	/*
	 * In theory, it's possible to have a buggy QEMU-supposed
	 * emulated Q35 IOMMU and Xen enabled at the same time.  On
	 * such a configuration, virtio has never worked and will
	 * not work without an even larger kludge.  Instead, enable
	 * the DMA API if we're a Xen guest, which at least allows
	 * all of the sensible Xen configurations to work correctly.
	 */
	if (xen_domain())
		return true;

162 163 164
	return false;
}

A
Andy Lutomirski 已提交
165 166 167 168 169
/*
 * The DMA ops on various arches are rather gnarly right now, and
 * making all of the arch DMA ops work on the vring device itself
 * is a mess.  For now, we use the parent device for DMA ops.
 */
170
static inline struct device *vring_dma_dev(const struct vring_virtqueue *vq)
A
Andy Lutomirski 已提交
171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237
{
	return vq->vq.vdev->dev.parent;
}

/* Map one sg entry for device access, returning its bus address. */
static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq,
				   struct scatterlist *sg,
				   enum dma_data_direction direction)
{
	/* Bypassing the IOMMU: the physical address IS the bus address. */
	if (!vring_use_dma_api(vq->vq.vdev))
		return (dma_addr_t)sg_phys(sg);

	/*
	 * We can't use dma_map_sg, because we don't use scatterlists in
	 * the way it expects (we don't guarantee that the scatterlist
	 * will exist for the lifetime of the mapping).
	 */
	return dma_map_page(vring_dma_dev(vq), sg_page(sg),
			    sg->offset, sg->length, direction);
}

/* Map a kernel-virtual buffer for device access. */
static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
				   void *cpu_addr, size_t size,
				   enum dma_data_direction direction)
{
	/* Without the DMA API the bus address is just the physical address. */
	if (!vring_use_dma_api(vq->vq.vdev))
		return (dma_addr_t)virt_to_phys(cpu_addr);

	return dma_map_single(vring_dma_dev(vq), cpu_addr, size, direction);
}

static void vring_unmap_one(const struct vring_virtqueue *vq,
			    struct vring_desc *desc)
{
	u16 flags;

	if (!vring_use_dma_api(vq->vq.vdev))
		return;

	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);

	if (flags & VRING_DESC_F_INDIRECT) {
		dma_unmap_single(vring_dma_dev(vq),
				 virtio64_to_cpu(vq->vq.vdev, desc->addr),
				 virtio32_to_cpu(vq->vq.vdev, desc->len),
				 (flags & VRING_DESC_F_WRITE) ?
				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
	} else {
		dma_unmap_page(vring_dma_dev(vq),
			       virtio64_to_cpu(vq->vq.vdev, desc->addr),
			       virtio32_to_cpu(vq->vq.vdev, desc->len),
			       (flags & VRING_DESC_F_WRITE) ?
			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
	}
}

/* Check a mapping for failure; mappings can't fail when DMA API is bypassed. */
static int vring_mapping_error(const struct vring_virtqueue *vq,
			       dma_addr_t addr)
{
	return vring_use_dma_api(vq->vq.vdev) ?
		dma_mapping_error(vring_dma_dev(vq), addr) : 0;
}

238 239
static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
					 unsigned int total_sg, gfp_t gfp)
240 241
{
	struct vring_desc *desc;
242
	unsigned int i;
243

244 245 246 247 248
	/*
	 * We require lowmem mappings for the descriptors because
	 * otherwise virt_to_phys will give us bogus addresses in the
	 * virtqueue.
	 */
249
	gfp &= ~__GFP_HIGHMEM;
250

251
	desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
252
	if (!desc)
253
		return NULL;
254

255
	for (i = 0; i < total_sg; i++)
256
		desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
257
	return desc;
258 259
}

260 261
static inline int virtqueue_add(struct virtqueue *_vq,
				struct scatterlist *sgs[],
262
				unsigned int total_sg,
263 264 265 266
				unsigned int out_sgs,
				unsigned int in_sgs,
				void *data,
				gfp_t gfp)
267 268
{
	struct vring_virtqueue *vq = to_vvq(_vq);
269
	struct scatterlist *sg;
270
	struct vring_desc *desc;
A
Andy Lutomirski 已提交
271
	unsigned int i, n, avail, descs_used, uninitialized_var(prev), err_idx;
M
Michael S. Tsirkin 已提交
272
	int head;
273
	bool indirect;
274

275 276
	START_USE(vq);

277
	BUG_ON(data == NULL);
278

279 280 281 282 283
	if (unlikely(vq->broken)) {
		END_USE(vq);
		return -EIO;
	}

284 285 286 287 288 289 290 291 292 293 294 295 296
#ifdef DEBUG
	{
		ktime_t now = ktime_get();

		/* No kick or get, with .1 second between?  Warn. */
		if (vq->last_add_time_valid)
			WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
					    > 100);
		vq->last_add_time = now;
		vq->last_add_time_valid = true;
	}
#endif

297 298 299 300 301
	BUG_ON(total_sg > vq->vring.num);
	BUG_ON(total_sg == 0);

	head = vq->free_head;

302 303
	/* If the host supports indirect descriptor tables, and we have multiple
	 * buffers, then go indirect. FIXME: tune this threshold */
304
	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
305
		desc = alloc_indirect(_vq, total_sg, gfp);
306 307 308 309 310
	else
		desc = NULL;

	if (desc) {
		/* Use a single buffer which doesn't continue */
A
Andy Lutomirski 已提交
311
		indirect = true;
312 313 314 315
		/* Set up rest to use this indirect table. */
		i = 0;
		descs_used = 1;
	} else {
A
Andy Lutomirski 已提交
316
		indirect = false;
317 318 319
		desc = vq->vring.desc;
		i = head;
		descs_used = total_sg;
320 321
	}

322
	if (vq->vq.num_free < descs_used) {
323
		pr_debug("Can't add buf len %i - avail = %i\n",
324
			 descs_used, vq->vq.num_free);
325 326 327
		/* FIXME: for historical reasons, we force a notify here if
		 * there are outgoing parts to the buffer.  Presumably the
		 * host should service the ring ASAP. */
328
		if (out_sgs)
329
			vq->notify(&vq->vq);
330 331
		if (indirect)
			kfree(desc);
332 333 334 335
		END_USE(vq);
		return -ENOSPC;
	}

336
	for (n = 0; n < out_sgs; n++) {
337
		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
A
Andy Lutomirski 已提交
338 339 340 341
			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
			if (vring_mapping_error(vq, addr))
				goto unmap_release;

342
			desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT);
A
Andy Lutomirski 已提交
343
			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
344
			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
345
			prev = i;
346
			i = virtio16_to_cpu(_vq->vdev, desc[i].next);
347
		}
348
	}
349
	for (; n < (out_sgs + in_sgs); n++) {
350
		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
A
Andy Lutomirski 已提交
351 352 353 354
			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
			if (vring_mapping_error(vq, addr))
				goto unmap_release;

355
			desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE);
A
Andy Lutomirski 已提交
356
			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
357
			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
358
			prev = i;
359
			i = virtio16_to_cpu(_vq->vdev, desc[i].next);
360
		}
361 362
	}
	/* Last one doesn't continue. */
363
	desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
364

A
Andy Lutomirski 已提交
365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381
	if (indirect) {
		/* Now that the indirect table is filled in, map it. */
		dma_addr_t addr = vring_map_single(
			vq, desc, total_sg * sizeof(struct vring_desc),
			DMA_TO_DEVICE);
		if (vring_mapping_error(vq, addr))
			goto unmap_release;

		vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT);
		vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);

		vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc));
	}

	/* We're using some buffers from the free list. */
	vq->vq.num_free -= descs_used;

382
	/* Update free pointer */
383
	if (indirect)
384
		vq->free_head = virtio16_to_cpu(_vq->vdev, vq->vring.desc[head].next);
385 386
	else
		vq->free_head = i;
387

A
Andy Lutomirski 已提交
388 389 390 391
	/* Store token and indirect buffer state. */
	vq->desc_state[head].data = data;
	if (indirect)
		vq->desc_state[head].indir_desc = desc;
392 393

	/* Put entry in available array (but don't update avail->idx until they
R
Rusty Russell 已提交
394
	 * do sync). */
395
	avail = vq->avail_idx_shadow & (vq->vring.num - 1);
396
	vq->vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);
397

398 399
	/* Descriptors and available array need to be set before we expose the
	 * new available array entries. */
400
	virtio_wmb(vq->weak_barriers);
401 402
	vq->avail_idx_shadow++;
	vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
403 404
	vq->num_added++;

405 406 407
	pr_debug("Added buffer head %i to %p\n", head, vq);
	END_USE(vq);

408 409 410 411 412
	/* This is very unlikely, but theoretically possible.  Kick
	 * just in case. */
	if (unlikely(vq->num_added == (1 << 16) - 1))
		virtqueue_kick(_vq);

413
	return 0;
A
Andy Lutomirski 已提交
414 415 416 417 418 419 420 421 422

unmap_release:
	err_idx = i;
	i = head;

	for (n = 0; n < total_sg; n++) {
		if (i == err_idx)
			break;
		vring_unmap_one(vq, &desc[i]);
G
Gonglei 已提交
423
		i = virtio16_to_cpu(_vq->vdev, vq->vring.desc[i].next);
A
Andy Lutomirski 已提交
424 425 426 427 428 429 430
	}

	vq->vq.num_free += total_sg;

	if (indirect)
		kfree(desc);

431
	END_USE(vq);
A
Andy Lutomirski 已提交
432
	return -EIO;
433
}
434 435 436 437 438 439 440 441 442 443 444 445 446

/**
 * virtqueue_add_sgs - expose buffers to other end
 * @vq: the struct virtqueue we're talking about.
 * @sgs: array of terminated scatterlists.
 * @out_num: the number of scatterlists readable by other side
 * @in_num: the number of scatterlists which are writable (after readable ones)
 * @data: the token identifying the buffer.
 * @gfp: how to do memory allocations (if necessary).
 *
 * Caller must ensure we don't call this with other virtqueue operations
 * at the same time (except where noted).
 *
447
 * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
448 449 450 451 452 453 454 455
 */
int virtqueue_add_sgs(struct virtqueue *_vq,
		      struct scatterlist *sgs[],
		      unsigned int out_sgs,
		      unsigned int in_sgs,
		      void *data,
		      gfp_t gfp)
{
456
	unsigned int i, total_sg = 0;
457 458

	/* Count them first. */
459
	for (i = 0; i < out_sgs + in_sgs; i++) {
460 461
		struct scatterlist *sg;
		for (sg = sgs[i]; sg; sg = sg_next(sg))
462
			total_sg++;
463
	}
464
	return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, data, gfp);
465 466 467
}
EXPORT_SYMBOL_GPL(virtqueue_add_sgs);

468 469 470
/**
 * virtqueue_add_outbuf - expose output buffers to other end
 * @vq: the struct virtqueue we're talking about.
471 472
 * @sg: scatterlist (must be well-formed and terminated!)
 * @num: the number of entries in @sg readable by other side
473 474 475 476 477 478
 * @data: the token identifying the buffer.
 * @gfp: how to do memory allocations (if necessary).
 *
 * Caller must ensure we don't call this with other virtqueue operations
 * at the same time (except where noted).
 *
479
 * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
480 481
 */
int virtqueue_add_outbuf(struct virtqueue *vq,
482
			 struct scatterlist *sg, unsigned int num,
483 484 485
			 void *data,
			 gfp_t gfp)
{
486
	return virtqueue_add(vq, &sg, num, 1, 0, data, gfp);
487 488 489 490 491 492
}
EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);

/**
 * virtqueue_add_inbuf - expose input buffers to other end
 * @vq: the struct virtqueue we're talking about.
 * @sg: scatterlist (must be well-formed and terminated!)
 * @num: the number of entries in @sg writable by other side
 * @data: the token identifying the buffer.
 * @gfp: how to do memory allocations (if necessary).
 *
 * Caller must ensure we don't call this with other virtqueue operations
 * at the same time (except where noted).
 *
 * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
 */
int virtqueue_add_inbuf(struct virtqueue *vq,
			struct scatterlist *sg, unsigned int num,
			void *data,
			gfp_t gfp)
{
	/* Zero out scatterlists, one in. */
	return virtqueue_add(vq, &sg, num, 0, 1, data, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);

512
/**
513
 * virtqueue_kick_prepare - first half of split virtqueue_kick call.
514 515
 * @vq: the struct virtqueue
 *
516 517 518
 * Instead of virtqueue_kick(), you can do:
 *	if (virtqueue_kick_prepare(vq))
 *		virtqueue_notify(vq);
519
 *
520 521
 * This is sometimes useful because the virtqueue_kick_prepare() needs
 * to be serialized, but the actual virtqueue_notify() call does not.
522
 */
523
bool virtqueue_kick_prepare(struct virtqueue *_vq)
524 525
{
	struct vring_virtqueue *vq = to_vvq(_vq);
526
	u16 new, old;
527 528
	bool needs_kick;

529
	START_USE(vq);
530 531
	/* We need to expose available array entries before checking avail
	 * event. */
532
	virtio_mb(vq->weak_barriers);
533

534 535
	old = vq->avail_idx_shadow - vq->num_added;
	new = vq->avail_idx_shadow;
536 537
	vq->num_added = 0;

538 539 540 541 542 543 544 545
#ifdef DEBUG
	if (vq->last_add_time_valid) {
		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
					      vq->last_add_time)) > 100);
	}
	vq->last_add_time_valid = false;
#endif

546
	if (vq->event) {
547
		needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, vring_avail_event(&vq->vring)),
548 549
					      new, old);
	} else {
550
		needs_kick = !(vq->vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY));
551
	}
552
	END_USE(vq);
553 554 555 556 557 558 559 560 561
	return needs_kick;
}
EXPORT_SYMBOL_GPL(virtqueue_kick_prepare);

/**
 * virtqueue_notify - second half of split virtqueue_kick call.
 * @vq: the struct virtqueue
 *
 * This does not need to be serialized.
562 563
 *
 * Returns false if host notify failed or queue is broken, otherwise true.
564
 */
565
bool virtqueue_notify(struct virtqueue *_vq)
566 567 568
{
	struct vring_virtqueue *vq = to_vvq(_vq);

569 570 571
	if (unlikely(vq->broken))
		return false;

572
	/* Prod other side to tell it about changes. */
573
	if (!vq->notify(_vq)) {
574 575 576 577
		vq->broken = true;
		return false;
	}
	return true;
578 579 580 581 582 583 584
}
EXPORT_SYMBOL_GPL(virtqueue_notify);

/**
 * virtqueue_kick - update after add_buf
 * @vq: the struct virtqueue
 *
585
 * After one or more virtqueue_add_* calls, invoke this to kick
586 587 588 589
 * the other side.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
590 591
 *
 * Returns false if kick failed, otherwise true.
592
 */
593
bool virtqueue_kick(struct virtqueue *vq)
594 595
{
	if (virtqueue_kick_prepare(vq))
596 597
		return virtqueue_notify(vq);
	return true;
598
}
599
EXPORT_SYMBOL_GPL(virtqueue_kick);
600 601 602

static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
{
A
Andy Lutomirski 已提交
603
	unsigned int i, j;
G
Gonglei 已提交
604
	__virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
605 606

	/* Clear data ptr. */
A
Andy Lutomirski 已提交
607
	vq->desc_state[head].data = NULL;
608

A
Andy Lutomirski 已提交
609
	/* Put back on free list: unmap first-level descriptors and find end */
610
	i = head;
611

A
Andy Lutomirski 已提交
612 613
	while (vq->vring.desc[i].flags & nextflag) {
		vring_unmap_one(vq, &vq->vring.desc[i]);
614
		i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
615
		vq->vq.num_free++;
616 617
	}

A
Andy Lutomirski 已提交
618
	vring_unmap_one(vq, &vq->vring.desc[i]);
619
	vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
620
	vq->free_head = head;
A
Andy Lutomirski 已提交
621

622
	/* Plus final descriptor */
623
	vq->vq.num_free++;
A
Andy Lutomirski 已提交
624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639

	/* Free the indirect table, if any, now that it's unmapped. */
	if (vq->desc_state[head].indir_desc) {
		struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
		u32 len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len);

		BUG_ON(!(vq->vring.desc[head].flags &
			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
		BUG_ON(len == 0 || len % sizeof(struct vring_desc));

		for (j = 0; j < len / sizeof(struct vring_desc); j++)
			vring_unmap_one(vq, &indir_desc[j]);

		kfree(vq->desc_state[head].indir_desc);
		vq->desc_state[head].indir_desc = NULL;
	}
640 641 642 643
}

static inline bool more_used(const struct vring_virtqueue *vq)
{
644
	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
645 646
}

647 648 649 650 651 652 653 654 655 656 657 658 659 660
/**
 * virtqueue_get_buf - get the next used buffer
 * @vq: the struct virtqueue we're talking about.
 * @len: the length written into the buffer
 *
 * If the driver wrote data into the buffer, @len will be set to the
 * amount written.  This means you don't need to clear the buffer
 * beforehand to ensure there's no data leakage in the case of short
 * writes.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 *
 * Returns NULL if there are no used buffers, or the "data" token
661
 * handed to virtqueue_add_*().
662
 */
663
void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
664 665 666 667
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	void *ret;
	unsigned int i;
R
Rusty Russell 已提交
668
	u16 last_used;
669 670 671

	START_USE(vq);

672 673 674 675 676
	if (unlikely(vq->broken)) {
		END_USE(vq);
		return NULL;
	}

677 678 679 680 681 682
	if (!more_used(vq)) {
		pr_debug("No more buffers in queue\n");
		END_USE(vq);
		return NULL;
	}

683
	/* Only get used array entries after they have been exposed by host. */
684
	virtio_rmb(vq->weak_barriers);
685

R
Rusty Russell 已提交
686
	last_used = (vq->last_used_idx & (vq->vring.num - 1));
687 688
	i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
	*len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);
689 690 691 692 693

	if (unlikely(i >= vq->vring.num)) {
		BAD_RING(vq, "id %u out of range\n", i);
		return NULL;
	}
A
Andy Lutomirski 已提交
694
	if (unlikely(!vq->desc_state[i].data)) {
695 696 697 698 699
		BAD_RING(vq, "id %u is not a head!\n", i);
		return NULL;
	}

	/* detach_buf clears data, so grab it now. */
A
Andy Lutomirski 已提交
700
	ret = vq->desc_state[i].data;
701 702
	detach_buf(vq, i);
	vq->last_used_idx++;
703 704 705
	/* If we expect an interrupt for the next entry, tell host
	 * by writing event index and flush out the write before
	 * the read in the next get_buf call. */
706 707 708 709
	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
		virtio_store_mb(vq->weak_barriers,
				&vring_used_event(&vq->vring),
				cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
710

711 712 713 714
#ifdef DEBUG
	vq->last_add_time_valid = false;
#endif

715 716 717
	END_USE(vq);
	return ret;
}
718
EXPORT_SYMBOL_GPL(virtqueue_get_buf);
719

720 721 722 723 724 725 726 727 728
/**
 * virtqueue_disable_cb - disable callbacks
 * @vq: the struct virtqueue we're talking about.
 *
 * Note that this is not necessarily synchronous, hence unreliable and only
 * useful as an optimization.
 *
 * Unlike other operations, this need not be serialized.
 */
729
void virtqueue_disable_cb(struct virtqueue *_vq)
730 731 732
{
	struct vring_virtqueue *vq = to_vvq(_vq);

733 734
	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
735 736
		if (!vq->event)
			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
737 738
	}

739
}
740
EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
741

742
/**
743
 * virtqueue_enable_cb_prepare - restart callbacks after disable_cb
744 745
 * @vq: the struct virtqueue we're talking about.
 *
746 747 748 749
 * This re-enables callbacks; it returns current queue state
 * in an opaque unsigned value. This value should be later tested by
 * virtqueue_poll, to detect a possible race between the driver checking for
 * more work, and enabling callbacks.
750 751 752 753
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 */
754
unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
755 756
{
	struct vring_virtqueue *vq = to_vvq(_vq);
757
	u16 last_used_idx;
758 759 760 761 762

	START_USE(vq);

	/* We optimistically turn back on interrupts, then check if there was
	 * more to do. */
763 764 765
	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
	 * either clear the flags bit or point the event index at the next
	 * entry. Always do both to keep code simple. */
766 767
	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
768 769
		if (!vq->event)
			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
770
	}
771
	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789
	END_USE(vq);
	return last_used_idx;
}
EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare);

/**
 * virtqueue_poll - query pending used buffers
 * @vq: the struct virtqueue we're talking about.
 * @last_used_idx: virtqueue state (from call to virtqueue_enable_cb_prepare).
 *
 * Returns "true" if there are pending used buffers in the queue.
 *
 * This does not need to be serialized.
 */
bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx)
{
	struct vring_virtqueue *vq = to_vvq(_vq);

790
	virtio_mb(vq->weak_barriers);
791
	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
792 793
}
EXPORT_SYMBOL_GPL(virtqueue_poll);
794

795 796 797 798 799 800 801 802 803 804 805 806 807 808 809
/**
 * virtqueue_enable_cb - restart callbacks after disable_cb.
 * @vq: the struct virtqueue we're talking about.
 *
 * This re-enables callbacks; it returns "false" if there are pending
 * buffers in the queue, to detect a possible race between the driver
 * checking for more work, and enabling callbacks.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 */
bool virtqueue_enable_cb(struct virtqueue *_vq)
{
	unsigned last_used_idx = virtqueue_enable_cb_prepare(_vq);
	return !virtqueue_poll(_vq, last_used_idx);
810
}
811
EXPORT_SYMBOL_GPL(virtqueue_enable_cb);
812

813 814 815 816 817 818 819 820 821 822 823 824 825
/**
 * virtqueue_enable_cb_delayed - restart callbacks after disable_cb.
 * @vq: the struct virtqueue we're talking about.
 *
 * This re-enables callbacks but hints to the other side to delay
 * interrupts until most of the available buffers have been processed;
 * it returns "false" if there are many pending buffers in the queue,
 * to detect a possible race between the driver checking for more work,
 * and enabling callbacks.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 */
826 827 828 829 830 831 832 833 834 835 836
bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	u16 bufs;

	START_USE(vq);

	/* We optimistically turn back on interrupts, then check if there was
	 * more to do. */
	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
	 * either clear the flags bit or point the event index at the next
837
	 * entry. Always update the event index to keep code simple. */
838 839
	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
840 841
		if (!vq->event)
			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
842
	}
843
	/* TODO: tune this threshold */
844
	bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
845 846 847 848 849

	virtio_store_mb(vq->weak_barriers,
			&vring_used_event(&vq->vring),
			cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));

850
	if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
851 852 853 854 855 856 857 858 859
		END_USE(vq);
		return false;
	}

	END_USE(vq);
	return true;
}
EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);

860 861 862 863
/**
 * virtqueue_detach_unused_buf - detach first unused buffer
 * @vq: the struct virtqueue we're talking about.
 *
864
 * Returns NULL or the "data" token handed to virtqueue_add_*().
865 866 867
 * This is not valid on an active queue; it is useful only for device
 * shutdown.
 */
868
void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
869 870 871 872 873 874 875 876
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	unsigned int i;
	void *buf;

	START_USE(vq);

	for (i = 0; i < vq->vring.num; i++) {
A
Andy Lutomirski 已提交
877
		if (!vq->desc_state[i].data)
878 879
			continue;
		/* detach_buf clears data, so grab it now. */
A
Andy Lutomirski 已提交
880
		buf = vq->desc_state[i].data;
881
		detach_buf(vq, i);
882 883
		vq->avail_idx_shadow--;
		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
884 885 886 887
		END_USE(vq);
		return buf;
	}
	/* That should have freed everything. */
888
	BUG_ON(vq->vq.num_free != vq->vring.num);
889 890 891 892

	END_USE(vq);
	return NULL;
}
893
EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf);
894

895 896 897 898 899 900 901 902 903 904 905 906 907
irqreturn_t vring_interrupt(int irq, void *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);

	if (!more_used(vq)) {
		pr_debug("virtqueue interrupt with no work for %p\n", vq);
		return IRQ_NONE;
	}

	if (unlikely(vq->broken))
		return IRQ_HANDLED;

	pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback);
908 909
	if (vq->vq.callback)
		vq->vq.callback(&vq->vq);
910 911 912

	return IRQ_HANDLED;
}
913
EXPORT_SYMBOL_GPL(vring_interrupt);
914

915 916 917 918 919 920 921
/*
 * Build a struct virtqueue on top of a caller-provided vring layout.
 *
 * The caller keeps ownership of the ring memory (we_own_ring stays
 * false), so vring_del_virtqueue() will not free it.  Returns NULL if
 * the bookkeeping structure cannot be allocated.
 */
struct virtqueue *__vring_new_virtqueue(unsigned int index,
					struct vring vring,
					struct virtio_device *vdev,
					bool weak_barriers,
					bool (*notify)(struct virtqueue *),
					void (*callback)(struct virtqueue *),
					const char *name)
{
	struct vring_virtqueue *vq;
	unsigned int i;

	/* Per-descriptor state array is allocated inline after the struct. */
	vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state),
		     GFP_KERNEL);
	if (!vq)
		return NULL;

	vq->vring = vring;
	vq->vq.callback = callback;
	vq->vq.vdev = vdev;
	vq->vq.name = name;
	vq->vq.num_free = vring.num;
	vq->vq.index = index;
	vq->we_own_ring = false;
	vq->queue_dma_addr = 0;
	vq->queue_size_in_bytes = 0;
	vq->notify = notify;
	vq->weak_barriers = weak_barriers;
	vq->broken = false;
	vq->last_used_idx = 0;
	vq->avail_flags_shadow = 0;
	vq->avail_idx_shadow = 0;
	vq->num_added = 0;
	list_add_tail(&vq->vq.list, &vdev->vqs);
#ifdef DEBUG
	vq->in_use = false;
	vq->last_add_time_valid = false;
#endif

	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC);
	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);

	/* No callback?  Tell other side not to bother us. */
	if (!callback) {
		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
		/* With event-idx the flag is advisory; only write it out
		 * when the legacy interrupt-suppression flag is in use. */
		if (!vq->event)
			vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
	}

	/* Put everything in free lists: chain all descriptors together. */
	vq->free_head = 0;
	for (i = 0; i < vring.num - 1; i++)
		vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
	memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));

	return &vq->vq;
}
EXPORT_SYMBOL_GPL(__vring_new_virtqueue);

/*
 * Allocate (zeroable) memory for a virtqueue's ring.
 *
 * When the transport requires the DMA API (vring_use_dma_api()), use
 * dma_alloc_coherent() so *dma_handle is a real bus address.  Otherwise
 * hand back ordinary pages and report their physical address, which the
 * device accesses directly.
 */
static void *vring_alloc_queue(struct virtio_device *vdev, size_t size,
			      dma_addr_t *dma_handle, gfp_t flag)
{
	if (vring_use_dma_api(vdev)) {
		return dma_alloc_coherent(vdev->dev.parent, size,
					  dma_handle, flag);
	} else {
		void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag);
		if (queue) {
			phys_addr_t phys_addr = virt_to_phys(queue);
			*dma_handle = (dma_addr_t)phys_addr;

			/*
			 * Sanity check: make sure we didn't truncate
			 * the address.  The only arches I can find that
			 * have 64-bit phys_addr_t but 32-bit dma_addr_t
			 * are certain non-highmem MIPS and x86
			 * configurations, but these configurations
			 * should never allocate physical pages above 32
			 * bits, so this is fine.  Just in case, throw a
			 * warning and abort if we end up with an
			 * unrepresentable address.
			 */
			if (WARN_ON_ONCE(*dma_handle != phys_addr)) {
				free_pages_exact(queue, PAGE_ALIGN(size));
				return NULL;
			}
		}
		return queue;
	}
}

/* Release ring memory obtained from vring_alloc_queue(). */
static void vring_free_queue(struct virtio_device *vdev, size_t size,
			     void *queue, dma_addr_t dma_handle)
{
	/* Must mirror the allocation path: DMA API vs. raw pages. */
	if (vring_use_dma_api(vdev))
		dma_free_coherent(vdev->dev.parent, size, queue, dma_handle);
	else
		free_pages_exact(queue, PAGE_ALIGN(size));
}

/*
 * Allocate ring memory and build a virtqueue on top of it.
 *
 * @num must be a power of 2.  If @may_reduce_num is true, the ring is
 * progressively halved until an allocation succeeds; if it is false we
 * fail rather than hand back a smaller ring than requested.
 *
 * On success the returned queue owns its ring memory (freed by
 * vring_del_virtqueue()).  Returns NULL on any failure.
 */
struct virtqueue *vring_create_virtqueue(
	unsigned int index,
	unsigned int num,
	unsigned int vring_align,
	struct virtio_device *vdev,
	bool weak_barriers,
	bool may_reduce_num,
	bool (*notify)(struct virtqueue *),
	void (*callback)(struct virtqueue *),
	const char *name)
{
	struct virtqueue *vq;
	void *queue = NULL;
	dma_addr_t dma_addr;
	size_t queue_size_in_bytes;
	struct vring vring;

	/* We assume num is a power of 2. */
	if (num & (num - 1)) {
		dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num);
		return NULL;
	}

	/* TODO: allocate each queue chunk individually */
	for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
					  &dma_addr,
					  GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
		if (queue)
			break;
		/*
		 * Fix: honour may_reduce_num.  Previously the loop kept
		 * halving num even when the caller forbade shrinking,
		 * silently returning a smaller ring than requested.
		 */
		if (!may_reduce_num)
			return NULL;
	}

	if (!num)
		return NULL;

	if (!queue) {
		/* Try to get a single page. You are my only hope! */
		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
					  &dma_addr, GFP_KERNEL|__GFP_ZERO);
	}
	if (!queue)
		return NULL;

	queue_size_in_bytes = vring_size(num, vring_align);
	vring_init(&vring, num, queue, vring_align);

	vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers,
				   notify, callback, name);
	if (!vq) {
		vring_free_queue(vdev, queue_size_in_bytes, queue,
				 dma_addr);
		return NULL;
	}

	/* Record ownership so vring_del_virtqueue() frees the ring. */
	to_vvq(vq)->queue_dma_addr = dma_addr;
	to_vvq(vq)->queue_size_in_bytes = queue_size_in_bytes;
	to_vvq(vq)->we_own_ring = true;

	return vq;
}
EXPORT_SYMBOL_GPL(vring_create_virtqueue);

/*
 * Legacy constructor: wrap caller-allocated, physically-contiguous ring
 * memory (@pages) in a virtqueue.  The caller keeps ownership of the
 * memory.
 */
struct virtqueue *vring_new_virtqueue(unsigned int index,
				      unsigned int num,
				      unsigned int vring_align,
				      struct virtio_device *vdev,
				      bool weak_barriers,
				      void *pages,
				      bool (*notify)(struct virtqueue *vq),
				      void (*callback)(struct virtqueue *vq),
				      const char *name)
{
	struct vring ring;

	vring_init(&ring, num, pages, vring_align);
	return __vring_new_virtqueue(index, ring, vdev, weak_barriers,
				     notify, callback, name);
}
EXPORT_SYMBOL_GPL(vring_new_virtqueue);
1093

1094
void vring_del_virtqueue(struct virtqueue *_vq)
1095
{
1096 1097 1098 1099 1100 1101 1102 1103
	struct vring_virtqueue *vq = to_vvq(_vq);

	if (vq->we_own_ring) {
		vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
				 vq->vring.desc, vq->queue_dma_addr);
	}
	list_del(&_vq->list);
	kfree(vq);
1104
}
1105
EXPORT_SYMBOL_GPL(vring_del_virtqueue);
1106

1107 1108 1109 1110 1111 1112 1113
/* Manipulates transport-specific feature bits. */
void vring_transport_features(struct virtio_device *vdev)
{
	unsigned int i;

	for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
		switch (i) {
1114 1115
		case VIRTIO_RING_F_INDIRECT_DESC:
			break;
1116 1117
		case VIRTIO_RING_F_EVENT_IDX:
			break;
1118 1119
		case VIRTIO_F_VERSION_1:
			break;
1120 1121
		case VIRTIO_F_IOMMU_PLATFORM:
			break;
1122 1123
		default:
			/* We don't understand this bit. */
1124
			__virtio_clear_bit(vdev, i);
1125 1126 1127 1128 1129
		}
	}
}
EXPORT_SYMBOL_GPL(vring_transport_features);

1130 1131 1132 1133 1134 1135 1136
/**
 * virtqueue_get_vring_size - return the size of the virtqueue's vring
 * @vq: the struct virtqueue containing the vring of interest.
 *
 * Returns the size of the vring.  This is mainly used for boasting to
 * userspace.  Unlike other operations, this need not be serialized.
 */
R
Rick Jones 已提交
1137 1138 1139 1140 1141 1142 1143 1144 1145
unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
{

	struct vring_virtqueue *vq = to_vvq(_vq);

	return vq->vring.num;
}
EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);

1146 1147 1148 1149 1150 1151 1152 1153
bool virtqueue_is_broken(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);

	return vq->broken;
}
EXPORT_SYMBOL_GPL(virtqueue_is_broken);

1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168
/*
 * This should prevent the device from being used, allowing drivers to
 * recover.  You may need to grab appropriate locks to flush.
 */
void virtio_break_device(struct virtio_device *dev)
{
	struct virtqueue *_vq;

	list_for_each_entry(_vq, &dev->vqs, list) {
		struct vring_virtqueue *vq = to_vvq(_vq);
		vq->broken = true;
	}
}
EXPORT_SYMBOL_GPL(virtio_break_device);

1169
dma_addr_t virtqueue_get_desc_addr(struct virtqueue *_vq)
1170 1171 1172
{
	struct vring_virtqueue *vq = to_vvq(_vq);

1173 1174 1175
	BUG_ON(!vq->we_own_ring);

	return vq->queue_dma_addr;
1176
}
1177
EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr);
1178

1179
dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
1180 1181 1182
{
	struct vring_virtqueue *vq = to_vvq(_vq);

1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203
	BUG_ON(!vq->we_own_ring);

	return vq->queue_dma_addr +
		((char *)vq->vring.avail - (char *)vq->vring.desc);
}
EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr);

/*
 * Bus address of the used ring: base address plus the used ring's byte
 * offset within our single contiguous allocation.
 */
dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);

	BUG_ON(!vq->we_own_ring);

	return vq->queue_dma_addr +
		((char *)vq->vring.used - (char *)vq->vring.desc);
}
EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);

/* Expose the raw vring layout backing this virtqueue (read-only). */
const struct vring *virtqueue_get_vring(struct virtqueue *vq)
{
	return &to_vvq(vq)->vring;
}
EXPORT_SYMBOL_GPL(virtqueue_get_vring);
1206

1207
MODULE_LICENSE("GPL");