/******************************************************************************
 * gntdev.c
 *
 * Device for accessing (in user-space) pages that have been granted by other
 * domains.
 *
 * Copyright (c) 2006-2007, D G Murray.
 *           (c) 2009 Gerd Hoffmann <kraxel@redhat.com>
 *           (c) 2018 Oleksandr Andrushchenko, EPAM Systems Inc.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#undef DEBUG

#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt

#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/refcount.h>

#include <xen/xen.h>
#include <xen/grant_table.h>
#include <xen/balloon.h>
#include <xen/gntdev.h>
#include <xen/events.h>
#include <xen/page.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include "gntdev-common.h"
#ifdef CONFIG_XEN_GNTDEV_DMABUF
#include "gntdev-dmabuf.h"
#endif

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, "
	      "Gerd Hoffmann <kraxel@redhat.com>");
MODULE_DESCRIPTION("User-space granted page access driver");

static unsigned int limit = 64*1024;
module_param(limit, uint, 0644);
MODULE_PARM_DESC(limit,
	"Maximum number of grants that may be mapped by one mapping request");

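/*
 * use_ptemod is set for PV domains (no auto-translated physmap), where user
 * grant mappings must be entered by rewriting the page table entries
 * (GNTMAP_contains_pte) rather than by mapping the backing pages directly.
 */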
static int use_ptemod;

static int unmap_grant_pages(struct gntdev_grant_map *map,
			     int offset, int pages);

static struct miscdevice gntdev_miscdev;

/* ------------------------------------------------------------------ */

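/* Return true if @count is invalid: zero or above the module's limit. */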
bool gntdev_test_page_count(unsigned int count)
{
	return !count || count > limit;
}

static void gntdev_print_maps(struct gntdev_priv *priv,
			      char *text, int text_index)
{
#ifdef DEBUG
	struct gntdev_grant_map *map;

	pr_debug("%s: maps list (priv %p)\n", __func__, priv);
	list_for_each_entry(map, &priv->maps, next)
		pr_debug("  index %2d, count %2d %s\n",
		       map->index, map->count,
		       map->index == text_index && text ? text : "");
#endif
}

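/* Free the pages backing a grant map together with its bookkeeping arrays. */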
static void gntdev_free_map(struct gntdev_grant_map *map)
{
	if (map == NULL)
		return;

#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
	if (map->dma_vaddr) {
		struct gnttab_dma_alloc_args args;

		args.dev = map->dma_dev;
		args.coherent = !!(map->dma_flags & GNTDEV_DMA_FLAG_COHERENT);
		args.nr_pages = map->count;
		args.pages = map->pages;
		args.frames = map->frames;
		args.vaddr = map->dma_vaddr;
		args.dev_bus_addr = map->dma_bus_addr;

		gnttab_dma_free_pages(&args);
	} else
#endif
	if (map->pages)
		gnttab_free_pages(map->count, map->pages);

#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
	kvfree(map->frames);
#endif
	kvfree(map->pages);
	kvfree(map->grants);
	kvfree(map->map_ops);
	kvfree(map->unmap_ops);
	kvfree(map->kmap_ops);
	kvfree(map->kunmap_ops);
	kfree(map);
}

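/*
 * Allocate a grant map and the arrays backing @count grants.  If dma_flags
 * ask for it (and CONFIG_XEN_GRANT_DMA_ALLOC is enabled), the pages are
 * allocated as DMA memory for use by the dma-buf export path.
 */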
struct gntdev_grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count,
					  int dma_flags)
{
	struct gntdev_grant_map *add;
	int i;

	add = kzalloc(sizeof(*add), GFP_KERNEL);
	if (NULL == add)
		return NULL;

	add->grants    = kvcalloc(count, sizeof(add->grants[0]), GFP_KERNEL);
	add->map_ops   = kvcalloc(count, sizeof(add->map_ops[0]), GFP_KERNEL);
	add->unmap_ops = kvcalloc(count, sizeof(add->unmap_ops[0]), GFP_KERNEL);
	add->kmap_ops  = kvcalloc(count, sizeof(add->kmap_ops[0]), GFP_KERNEL);
	add->kunmap_ops = kvcalloc(count,
				   sizeof(add->kunmap_ops[0]), GFP_KERNEL);
	add->pages     = kvcalloc(count, sizeof(add->pages[0]), GFP_KERNEL);
	if (NULL == add->grants    ||
	    NULL == add->map_ops   ||
	    NULL == add->unmap_ops ||
	    NULL == add->kmap_ops  ||
	    NULL == add->kunmap_ops ||
	    NULL == add->pages)
		goto err;

#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
	add->dma_flags = dma_flags;

	/*
	 * Check if this mapping is requested to be backed
	 * by a DMA buffer.
	 */
	if (dma_flags & (GNTDEV_DMA_FLAG_WC | GNTDEV_DMA_FLAG_COHERENT)) {
		struct gnttab_dma_alloc_args args;

		add->frames = kvcalloc(count, sizeof(add->frames[0]),
				       GFP_KERNEL);
		if (!add->frames)
			goto err;

		/* Remember the device, so we can free DMA memory. */
		add->dma_dev = priv->dma_dev;

		args.dev = priv->dma_dev;
		args.coherent = !!(dma_flags & GNTDEV_DMA_FLAG_COHERENT);
		args.nr_pages = count;
		args.pages = add->pages;
		args.frames = add->frames;

		if (gnttab_dma_alloc_pages(&args))
			goto err;

		add->dma_vaddr = args.vaddr;
		add->dma_bus_addr = args.dev_bus_addr;
	} else
#endif
	if (gnttab_alloc_pages(count, add->pages))
		goto err;

	for (i = 0; i < count; i++) {
		add->map_ops[i].handle = -1;
		add->unmap_ops[i].handle = -1;
		add->kmap_ops[i].handle = -1;
		add->kunmap_ops[i].handle = -1;
	}

	add->index = 0;
	add->count = count;
	refcount_set(&add->users, 1);

	return add;

err:
	gntdev_free_map(add);
	return NULL;
}

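/*
 * Insert @add into priv->maps, keeping the list sorted by index and giving
 * @add an index range that does not overlap any existing map.
 */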
void gntdev_add_map(struct gntdev_priv *priv, struct gntdev_grant_map *add)
{
	struct gntdev_grant_map *map;

	list_for_each_entry(map, &priv->maps, next) {
		if (add->index + add->count < map->index) {
			list_add_tail(&add->next, &map->next);
			goto done;
		}
		add->index = map->index + map->count;
	}
	list_add_tail(&add->next, &priv->maps);

done:
	gntdev_print_maps(priv, "[new]", add->index);
}

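/* Find the map with the given index (and count, if count is non-zero). */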
static struct gntdev_grant_map *gntdev_find_map_index(struct gntdev_priv *priv,
						      int index, int count)
{
	struct gntdev_grant_map *map;

	list_for_each_entry(map, &priv->maps, next) {
		if (map->index != index)
			continue;
		if (count && map->count != count)
			continue;
		return map;
	}
	return NULL;
}

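/*
 * Drop a reference to a grant map.  On the last reference, send the
 * requested unmap notification (if any), tear down any remaining mappings
 * and free the map.
 */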
void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map)
{
	if (!map)
		return;

	if (!refcount_dec_and_test(&map->users))
		return;

	if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
		notify_remote_via_evtchn(map->notify.event);
		evtchn_put(map->notify.event);
	}

	if (map->pages && !use_ptemod)
		unmap_grant_pages(map, 0, map->count);
	gntdev_free_map(map);
}

/* ------------------------------------------------------------------ */

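/*
 * apply_to_page_range() callback: record the machine address of each user
 * PTE in the map/unmap ops so that the hypervisor can write the grant
 * mapping directly into the page table (GNTMAP_contains_pte).
 */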
static int find_grant_ptes(pte_t *pte, unsigned long addr, void *data)
{
	struct gntdev_grant_map *map = data;
	unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
	int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte;
	u64 pte_maddr;

	BUG_ON(pgnr >= map->count);
	pte_maddr = arbitrary_virt_to_machine(pte).maddr;

	/*
	 * Set the PTE as special to force get_user_pages_fast() to fall
	 * back to the slow path.  If this is not supported as part of
	 * the grant map, it will be done afterwards.
	 */
	if (xen_feature(XENFEAT_gnttab_map_avail_bits))
		flags |= (1 << _GNTMAP_guest_avail0);

	gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, flags,
			  map->grants[pgnr].ref,
			  map->grants[pgnr].domid);
	gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, flags,
			    -1 /* handle */);
	return 0;
}

#ifdef CONFIG_X86
static int set_grant_ptes_as_special(pte_t *pte, unsigned long addr, void *data)
{
	set_pte_at(current->mm, addr, pte, pte_mkspecial(*pte));
	return 0;
}
#endif

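/*
 * Fill in the remaining map operations for the batch, hand it to the
 * hypervisor and record the returned handles for the later unmap.
 */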
int gntdev_map_grant_pages(struct gntdev_grant_map *map)
{
	int i, err = 0;

	if (!use_ptemod) {
		/* Note: it could already be mapped */
		if (map->map_ops[0].handle != -1)
			return 0;
		for (i = 0; i < map->count; i++) {
			unsigned long addr = (unsigned long)
				pfn_to_kaddr(page_to_pfn(map->pages[i]));
			gnttab_set_map_op(&map->map_ops[i], addr, map->flags,
				map->grants[i].ref,
				map->grants[i].domid);
			gnttab_set_unmap_op(&map->unmap_ops[i], addr,
				map->flags, -1 /* handle */);
		}
	} else {
		/*
		 * Setup the map_ops corresponding to the pte entries pointing
		 * to the kernel linear addresses of the struct pages.
		 * These ptes are completely different from the user ptes
		 * handled by find_grant_ptes.
		 */
		for (i = 0; i < map->count; i++) {
			unsigned long address = (unsigned long)
				pfn_to_kaddr(page_to_pfn(map->pages[i]));
			BUG_ON(PageHighMem(map->pages[i]));

			gnttab_set_map_op(&map->kmap_ops[i], address,
				map->flags | GNTMAP_host_map,
				map->grants[i].ref,
				map->grants[i].domid);
			gnttab_set_unmap_op(&map->kunmap_ops[i], address,
				map->flags | GNTMAP_host_map, -1);
		}
	}

	pr_debug("map %d+%d\n", map->index, map->count);
	err = gnttab_map_refs(map->map_ops, use_ptemod ? map->kmap_ops : NULL,
			map->pages, map->count);
	if (err)
		return err;

	for (i = 0; i < map->count; i++) {
		if (map->map_ops[i].status) {
			err = -EINVAL;
			continue;
		}

		map->unmap_ops[i].handle = map->map_ops[i].handle;
		if (use_ptemod)
			map->kunmap_ops[i].handle = map->kmap_ops[i].handle;
#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
		else if (map->dma_vaddr) {
			unsigned long bfn;

			bfn = pfn_to_bfn(page_to_pfn(map->pages[i]));
			map->unmap_ops[i].dev_bus_addr = __pfn_to_phys(bfn);
		}
#endif
	}
	return err;
}

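/*
 * Unmap a contiguous range of grants, clearing the notification byte first
 * if it falls inside the range being unmapped.
 */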
static int __unmap_grant_pages(struct gntdev_grant_map *map, int offset,
			       int pages)
{
	int i, err = 0;
	struct gntab_unmap_queue_data unmap_data;

	if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) {
		int pgno = (map->notify.addr >> PAGE_SHIFT);
		if (pgno >= offset && pgno < offset + pages) {
			/* No need for kmap, pages are in lowmem */
			uint8_t *tmp = pfn_to_kaddr(page_to_pfn(map->pages[pgno]));
			tmp[map->notify.addr & (PAGE_SIZE-1)] = 0;
			map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE;
		}
	}

	unmap_data.unmap_ops = map->unmap_ops + offset;
	unmap_data.kunmap_ops = use_ptemod ? map->kunmap_ops + offset : NULL;
	unmap_data.pages = map->pages + offset;
	unmap_data.count = pages;

	err = gnttab_unmap_refs_sync(&unmap_data);
	if (err)
		return err;

	for (i = 0; i < pages; i++) {
		if (map->unmap_ops[offset+i].status)
			err = -EINVAL;
		pr_debug("unmap handle=%d st=%d\n",
			map->unmap_ops[offset+i].handle,
			map->unmap_ops[offset+i].status);
		map->unmap_ops[offset+i].handle = -1;
	}
	return err;
}

static int unmap_grant_pages(struct gntdev_grant_map *map, int offset,
			     int pages)
{
	int range, err = 0;

	pr_debug("unmap %d+%d [%d+%d]\n", map->index, map->count, offset, pages);

	/* It is possible the requested range will have a "hole" where we
	 * already unmapped some of the grants. Only unmap valid ranges.
	 */
	while (pages && !err) {
		while (pages && map->unmap_ops[offset].handle == -1) {
			offset++;
			pages--;
		}
		range = 0;
		while (range < pages) {
			if (map->unmap_ops[offset+range].handle == -1)
				break;
			range++;
		}
		err = __unmap_grant_pages(map, offset, range);
		offset += range;
		pages -= range;
	}

	return err;
}

/* ------------------------------------------------------------------ */

static void gntdev_vma_open(struct vm_area_struct *vma)
{
	struct gntdev_grant_map *map = vma->vm_private_data;

	pr_debug("gntdev_vma_open %p\n", vma);
	refcount_inc(&map->users);
}

static void gntdev_vma_close(struct vm_area_struct *vma)
{
	struct gntdev_grant_map *map = vma->vm_private_data;
	struct file *file = vma->vm_file;
	struct gntdev_priv *priv = file->private_data;

	pr_debug("gntdev_vma_close %p\n", vma);
	if (use_ptemod) {
		WARN_ON(map->vma != vma);
		mmu_interval_notifier_remove(&map->notifier);
		map->vma = NULL;
	}
	vma->vm_private_data = NULL;
	gntdev_put_map(priv, map);
}

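/*
 * Return the granted page backing @addr; the PTEs of a gntdev VMA are
 * marked special, so the core mm relies on this hook to look the page up.
 */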
static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma,
						 unsigned long addr)
{
	struct gntdev_grant_map *map = vma->vm_private_data;

	return map->pages[(addr - map->pages_vm_start) >> PAGE_SHIFT];
}

static const struct vm_operations_struct gntdev_vmops = {
	.open = gntdev_vma_open,
	.close = gntdev_vma_close,
	.find_special_page = gntdev_vma_find_special_page,
};

/* ------------------------------------------------------------------ */

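/*
 * mmu interval notifier callback: unmap the grants backing the part of the
 * VMA that intersects the invalidated range.
 */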
static bool gntdev_invalidate(struct mmu_interval_notifier *mn,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq)
{
	struct gntdev_grant_map *map =
		container_of(mn, struct gntdev_grant_map, notifier);
	unsigned long mstart, mend;
	int err;

	if (!mmu_notifier_range_blockable(range))
		return false;

	/*
	 * If the VMA is split or otherwise changed, the notifier is not
	 * updated, but we don't want to process VAs outside the modified
	 * VMA. FIXME: It would be much more understandable to just prevent
	 * modifying the VMA in the first place.
	 */
	if (map->vma->vm_start >= range->end ||
	    map->vma->vm_end <= range->start)
		return true;

	mstart = max(range->start, map->vma->vm_start);
	mend = min(range->end, map->vma->vm_end);
	pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
			map->index, map->count,
			map->vma->vm_start, map->vma->vm_end,
			range->start, range->end, mstart, mend);
	err = unmap_grant_pages(map,
				(mstart - map->vma->vm_start) >> PAGE_SHIFT,
				(mend - mstart) >> PAGE_SHIFT);
	WARN_ON(err);

	return true;
}

static const struct mmu_interval_notifier_ops gntdev_mmu_ops = {
	.invalidate = gntdev_invalidate,
};

/* ------------------------------------------------------------------ */

static int gntdev_open(struct inode *inode, struct file *flip)
{
	struct gntdev_priv *priv;

	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	INIT_LIST_HEAD(&priv->maps);
	mutex_init(&priv->lock);

#ifdef CONFIG_XEN_GNTDEV_DMABUF
	priv->dmabuf_priv = gntdev_dmabuf_init(flip);
	if (IS_ERR(priv->dmabuf_priv)) {
		int ret = PTR_ERR(priv->dmabuf_priv);

		kfree(priv);
		return ret;
	}
#endif

	flip->private_data = priv;
#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
	priv->dma_dev = gntdev_miscdev.this_device;
	dma_coerce_mask_and_coherent(priv->dma_dev, DMA_BIT_MASK(64));
#endif
	pr_debug("priv %p\n", priv);

	return 0;
}

static int gntdev_release(struct inode *inode, struct file *flip)
{
	struct gntdev_priv *priv = flip->private_data;
	struct gntdev_grant_map *map;

	pr_debug("priv %p\n", priv);

	mutex_lock(&priv->lock);
	while (!list_empty(&priv->maps)) {
		map = list_entry(priv->maps.next,
				 struct gntdev_grant_map, next);
		list_del(&map->next);
		gntdev_put_map(NULL /* already removed */, map);
	}
	mutex_unlock(&priv->lock);

#ifdef CONFIG_XEN_GNTDEV_DMABUF
	gntdev_dmabuf_fini(priv->dmabuf_priv);
#endif

	kfree(priv);
	return 0;
}

static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
				       struct ioctl_gntdev_map_grant_ref __user *u)
{
	struct ioctl_gntdev_map_grant_ref op;
	struct gntdev_grant_map *map;
	int err;

	if (copy_from_user(&op, u, sizeof(op)) != 0)
		return -EFAULT;
	pr_debug("priv %p, add %d\n", priv, op.count);
	if (unlikely(gntdev_test_page_count(op.count)))
		return -EINVAL;

	err = -ENOMEM;
	map = gntdev_alloc_map(priv, op.count, 0 /* This is not a dma-buf. */);
	if (!map)
		return err;

	if (copy_from_user(map->grants, &u->refs,
			   sizeof(map->grants[0]) * op.count) != 0) {
		gntdev_put_map(NULL, map);
		return -EFAULT;
	}

	mutex_lock(&priv->lock);
	gntdev_add_map(priv, map);
	op.index = map->index << PAGE_SHIFT;
	mutex_unlock(&priv->lock);

	if (copy_to_user(u, &op, sizeof(op)) != 0)
		return -EFAULT;

	return 0;
}

static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
					 struct ioctl_gntdev_unmap_grant_ref __user *u)
{
	struct ioctl_gntdev_unmap_grant_ref op;
	struct gntdev_grant_map *map;
	int err = -ENOENT;

	if (copy_from_user(&op, u, sizeof(op)) != 0)
		return -EFAULT;
	pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count);

	mutex_lock(&priv->lock);
	map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
	if (map) {
		list_del(&map->next);
		err = 0;
	}
	mutex_unlock(&priv->lock);
	if (map)
		gntdev_put_map(priv, map);
	return err;
}

static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
					      struct ioctl_gntdev_get_offset_for_vaddr __user *u)
{
	struct ioctl_gntdev_get_offset_for_vaddr op;
	struct vm_area_struct *vma;
	struct gntdev_grant_map *map;
	int rv = -EINVAL;

	if (copy_from_user(&op, u, sizeof(op)) != 0)
		return -EFAULT;
	pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr);

	down_read(&current->mm->mmap_sem);
	vma = find_vma(current->mm, op.vaddr);
	if (!vma || vma->vm_ops != &gntdev_vmops)
		goto out_unlock;

	map = vma->vm_private_data;
	if (!map)
		goto out_unlock;

	op.offset = map->index << PAGE_SHIFT;
	op.count = map->count;
	rv = 0;

 out_unlock:
	up_read(&current->mm->mmap_sem);

	if (rv == 0 && copy_to_user(u, &op, sizeof(op)) != 0)
		return -EFAULT;
	return rv;
}

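/*
 * IOCTL_GNTDEV_SET_UNMAP_NOTIFY: arrange for a byte to be cleared and/or an
 * event channel to be signalled when the mapping containing op.index is
 * unmapped.
 */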
static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u)
{
	struct ioctl_gntdev_unmap_notify op;
	struct gntdev_grant_map *map;
	int rc;
	int out_flags;
	unsigned int out_event;

	if (copy_from_user(&op, u, sizeof(op)))
		return -EFAULT;

	if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT))
		return -EINVAL;

	/* We need to grab a reference to the event channel we are going to use
	 * to send the notify before releasing the reference we may already have
	 * (if someone has called this ioctl twice). This is required so that
	 * it is possible to change the clear_byte part of the notification
	 * without disturbing the event channel part, which may now be the last
	 * reference to that event channel.
	 */
	if (op.action & UNMAP_NOTIFY_SEND_EVENT) {
		if (evtchn_get(op.event_channel_port))
			return -EINVAL;
	}

	out_flags = op.action;
	out_event = op.event_channel_port;

	mutex_lock(&priv->lock);

	list_for_each_entry(map, &priv->maps, next) {
		uint64_t begin = map->index << PAGE_SHIFT;
		uint64_t end = (map->index + map->count) << PAGE_SHIFT;
		if (op.index >= begin && op.index < end)
			goto found;
	}
	rc = -ENOENT;
	goto unlock_out;

 found:
	if ((op.action & UNMAP_NOTIFY_CLEAR_BYTE) &&
			(map->flags & GNTMAP_readonly)) {
		rc = -EINVAL;
		goto unlock_out;
	}

	out_flags = map->notify.flags;
	out_event = map->notify.event;

	map->notify.flags = op.action;
	map->notify.addr = op.index - (map->index << PAGE_SHIFT);
	map->notify.event = op.event_channel_port;

	rc = 0;

 unlock_out:
	mutex_unlock(&priv->lock);

	/* Drop the reference to the event channel we did not save in the map */
	if (out_flags & UNMAP_NOTIFY_SEND_EVENT)
		evtchn_put(out_event);

	return rc;
}

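/*
 * Grant copies are batched: up to GNTDEV_COPY_BATCH gnttab_copy operations
 * (plus the user pages pinned for their local buffers) are queued in a
 * struct gntdev_copy_batch and then flushed to the hypervisor in one go.
 */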
#define GNTDEV_COPY_BATCH 16

struct gntdev_copy_batch {
	struct gnttab_copy ops[GNTDEV_COPY_BATCH];
	struct page *pages[GNTDEV_COPY_BATCH];
	s16 __user *status[GNTDEV_COPY_BATCH];
	unsigned int nr_ops;
	unsigned int nr_pages;
};

static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt,
			   bool writeable, unsigned long *gfn)
{
	unsigned long addr = (unsigned long)virt;
	struct page *page;
	unsigned long xen_pfn;
	int ret;

	ret = get_user_pages_fast(addr, 1, writeable ? FOLL_WRITE : 0, &page);
	if (ret < 0)
		return ret;

	batch->pages[batch->nr_pages++] = page;

	xen_pfn = page_to_xen_pfn(page) + XEN_PFN_DOWN(addr & ~PAGE_MASK);
	*gfn = pfn_to_gfn(xen_pfn);

	return 0;
}

static void gntdev_put_pages(struct gntdev_copy_batch *batch)
{
	unsigned int i;

	for (i = 0; i < batch->nr_pages; i++)
		put_page(batch->pages[i]);
	batch->nr_pages = 0;
}

static int gntdev_copy(struct gntdev_copy_batch *batch)
{
	unsigned int i;

	gnttab_batch_copy(batch->ops, batch->nr_ops);
	gntdev_put_pages(batch);

	/*
	 * For each completed op, update the status if the op failed
	 * and all previous ops for the segment were successful.
	 */
	for (i = 0; i < batch->nr_ops; i++) {
		s16 status = batch->ops[i].status;
		s16 old_status;

		if (status == GNTST_okay)
			continue;

		if (__get_user(old_status, batch->status[i]))
			return -EFAULT;

		if (old_status != GNTST_okay)
			continue;

		if (__put_user(status, batch->status[i]))
			return -EFAULT;
	}

	batch->nr_ops = 0;
	return 0;
}

static int gntdev_grant_copy_seg(struct gntdev_copy_batch *batch,
				 struct gntdev_grant_copy_segment *seg,
				 s16 __user *status)
{
	uint16_t copied = 0;

	/*
	 * Disallow local -> local copies since there is only space in
	 * batch->pages for one page per-op and this would be a very
	 * expensive memcpy().
	 */
	if (!(seg->flags & (GNTCOPY_source_gref | GNTCOPY_dest_gref)))
		return -EINVAL;

	/* Can't cross page if source/dest is a grant ref. */
	if (seg->flags & GNTCOPY_source_gref) {
		if (seg->source.foreign.offset + seg->len > XEN_PAGE_SIZE)
			return -EINVAL;
	}
	if (seg->flags & GNTCOPY_dest_gref) {
		if (seg->dest.foreign.offset + seg->len > XEN_PAGE_SIZE)
			return -EINVAL;
	}

	if (put_user(GNTST_okay, status))
		return -EFAULT;

	while (copied < seg->len) {
		struct gnttab_copy *op;
		void __user *virt;
		size_t len, off;
		unsigned long gfn;
		int ret;

		if (batch->nr_ops >= GNTDEV_COPY_BATCH) {
			ret = gntdev_copy(batch);
			if (ret < 0)
				return ret;
		}

		len = seg->len - copied;

		op = &batch->ops[batch->nr_ops];
		op->flags = 0;

		if (seg->flags & GNTCOPY_source_gref) {
			op->source.u.ref = seg->source.foreign.ref;
			op->source.domid = seg->source.foreign.domid;
			op->source.offset = seg->source.foreign.offset + copied;
			op->flags |= GNTCOPY_source_gref;
		} else {
			virt = seg->source.virt + copied;
			off = (unsigned long)virt & ~XEN_PAGE_MASK;
			len = min(len, (size_t)XEN_PAGE_SIZE - off);

			ret = gntdev_get_page(batch, virt, false, &gfn);
			if (ret < 0)
				return ret;

			op->source.u.gmfn = gfn;
			op->source.domid = DOMID_SELF;
			op->source.offset = off;
		}

		if (seg->flags & GNTCOPY_dest_gref) {
			op->dest.u.ref = seg->dest.foreign.ref;
			op->dest.domid = seg->dest.foreign.domid;
			op->dest.offset = seg->dest.foreign.offset + copied;
			op->flags |= GNTCOPY_dest_gref;
		} else {
			virt = seg->dest.virt + copied;
			off = (unsigned long)virt & ~XEN_PAGE_MASK;
			len = min(len, (size_t)XEN_PAGE_SIZE - off);

			ret = gntdev_get_page(batch, virt, true, &gfn);
			if (ret < 0)
				return ret;

			op->dest.u.gmfn = gfn;
			op->dest.domid = DOMID_SELF;
			op->dest.offset = off;
		}

		op->len = len;
		copied += len;

		batch->status[batch->nr_ops] = status;
		batch->nr_ops++;
	}

	return 0;
}

static long gntdev_ioctl_grant_copy(struct gntdev_priv *priv, void __user *u)
{
	struct ioctl_gntdev_grant_copy copy;
	struct gntdev_copy_batch batch;
	unsigned int i;
	int ret = 0;

	if (copy_from_user(&copy, u, sizeof(copy)))
		return -EFAULT;

	batch.nr_ops = 0;
	batch.nr_pages = 0;

	for (i = 0; i < copy.count; i++) {
		struct gntdev_grant_copy_segment seg;

		if (copy_from_user(&seg, &copy.segments[i], sizeof(seg))) {
			ret = -EFAULT;
			goto out;
		}

		ret = gntdev_grant_copy_seg(&batch, &seg, &copy.segments[i].status);
		if (ret < 0)
			goto out;

		cond_resched();
	}
	if (batch.nr_ops)
		ret = gntdev_copy(&batch);
	return ret;

  out:
	gntdev_put_pages(&batch);
	return ret;
}

static long gntdev_ioctl(struct file *flip,
			 unsigned int cmd, unsigned long arg)
{
	struct gntdev_priv *priv = flip->private_data;
	void __user *ptr = (void __user *)arg;

	switch (cmd) {
	case IOCTL_GNTDEV_MAP_GRANT_REF:
		return gntdev_ioctl_map_grant_ref(priv, ptr);

	case IOCTL_GNTDEV_UNMAP_GRANT_REF:
		return gntdev_ioctl_unmap_grant_ref(priv, ptr);

	case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
		return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);

	case IOCTL_GNTDEV_SET_UNMAP_NOTIFY:
		return gntdev_ioctl_notify(priv, ptr);

	case IOCTL_GNTDEV_GRANT_COPY:
		return gntdev_ioctl_grant_copy(priv, ptr);

#ifdef CONFIG_XEN_GNTDEV_DMABUF
	case IOCTL_GNTDEV_DMABUF_EXP_FROM_REFS:
		return gntdev_ioctl_dmabuf_exp_from_refs(priv, use_ptemod, ptr);

	case IOCTL_GNTDEV_DMABUF_EXP_WAIT_RELEASED:
		return gntdev_ioctl_dmabuf_exp_wait_released(priv, ptr);

	case IOCTL_GNTDEV_DMABUF_IMP_TO_REFS:
		return gntdev_ioctl_dmabuf_imp_to_refs(priv, ptr);

	case IOCTL_GNTDEV_DMABUF_IMP_RELEASE:
		return gntdev_ioctl_dmabuf_imp_release(priv, ptr);
#endif

	default:
		pr_debug("priv %p, unknown cmd %x\n", priv, cmd);
		return -ENOIOCTLCMD;
	}

	return 0;
}

static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
{
	struct gntdev_priv *priv = flip->private_data;
	int index = vma->vm_pgoff;
	int count = vma_pages(vma);
	struct gntdev_grant_map *map;
	int err = -EINVAL;

	if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
		return -EINVAL;

	pr_debug("map %d+%d at %lx (pgoff %lx)\n",
			index, count, vma->vm_start, vma->vm_pgoff);

	mutex_lock(&priv->lock);
	map = gntdev_find_map_index(priv, index, count);
	if (!map)
		goto unlock_out;
	if (use_ptemod && map->vma)
		goto unlock_out;
	refcount_inc(&map->users);

	vma->vm_ops = &gntdev_vmops;

	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_MIXEDMAP;

	if (use_ptemod)
		vma->vm_flags |= VM_DONTCOPY;

	vma->vm_private_data = map;
	if (map->flags) {
		if ((vma->vm_flags & VM_WRITE) &&
				(map->flags & GNTMAP_readonly))
			goto out_unlock_put;
	} else {
		map->flags = GNTMAP_host_map;
		if (!(vma->vm_flags & VM_WRITE))
			map->flags |= GNTMAP_readonly;
	}

	if (use_ptemod) {
		map->vma = vma;
		err = mmu_interval_notifier_insert_locked(
			&map->notifier, vma->vm_mm, vma->vm_start,
			vma->vm_end - vma->vm_start, &gntdev_mmu_ops);
		if (err)
			goto out_unlock_put;
	}
	mutex_unlock(&priv->lock);

	if (use_ptemod) {
		/*
		 * gntdev takes the address of the PTE in find_grant_ptes() and
		 * passes it to the hypervisor in gntdev_map_grant_pages(). The
		 * purpose of the notifier is to prevent the hypervisor pointer
		 * to the PTE from going stale.
		 *
		 * Since this vma's mappings can't be touched without the
		 * mmap_sem, and we are holding it now, there is no need for
		 * the notifier_range locking pattern.
		 */
		mmu_interval_read_begin(&map->notifier);

		map->pages_vm_start = vma->vm_start;
		err = apply_to_page_range(vma->vm_mm, vma->vm_start,
					  vma->vm_end - vma->vm_start,
					  find_grant_ptes, map);
		if (err) {
			pr_warn("find_grant_ptes() failure.\n");
			goto out_put_map;
		}
	}

	err = gntdev_map_grant_pages(map);
	if (err)
		goto out_put_map;

	if (!use_ptemod) {
		err = vm_map_pages_zero(vma, map->pages, map->count);
		if (err)
			goto out_put_map;
	} else {
#ifdef CONFIG_X86
		/*
		 * If the PTEs were not made special by the grant map
		 * hypercall, do so here.
		 *
		 * This is racy since the mapping is already visible
		 * to userspace but userspace should be well-behaved
		 * enough to not touch it until the mmap() call
		 * returns.
		 */
		if (!xen_feature(XENFEAT_gnttab_map_avail_bits)) {
			apply_to_page_range(vma->vm_mm, vma->vm_start,
					    vma->vm_end - vma->vm_start,
					    set_grant_ptes_as_special, NULL);
		}
#endif
	}

	return 0;

unlock_out:
	mutex_unlock(&priv->lock);
	return err;

out_unlock_put:
	mutex_unlock(&priv->lock);
out_put_map:
	if (use_ptemod) {
		unmap_grant_pages(map, 0, map->count);
		if (map->vma) {
			mmu_interval_notifier_remove(&map->notifier);
			map->vma = NULL;
		}
	}
	gntdev_put_map(priv, map);
	return err;
}

static const struct file_operations gntdev_fops = {
	.owner = THIS_MODULE,
	.open = gntdev_open,
	.release = gntdev_release,
	.mmap = gntdev_mmap,
	.unlocked_ioctl = gntdev_ioctl
};

static struct miscdevice gntdev_miscdev = {
	.minor        = MISC_DYNAMIC_MINOR,
	.name         = "xen/gntdev",
	.fops         = &gntdev_fops,
};

/* ------------------------------------------------------------------ */

static int __init gntdev_init(void)
{
	int err;

	if (!xen_domain())
		return -ENODEV;

	use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap);

	err = misc_register(&gntdev_miscdev);
	if (err != 0) {
		pr_err("Could not register gntdev device\n");
		return err;
	}
	return 0;
}

static void __exit gntdev_exit(void)
{
	misc_deregister(&gntdev_miscdev);
}

module_init(gntdev_init);
module_exit(gntdev_exit);

/* ------------------------------------------------------------------ */