/******************************************************************************
 * gntdev.c
 *
 * Device for accessing (in user-space) pages that have been granted by other
 * domains.
 *
 * Copyright (c) 2006-2007, D G Murray.
 *           (c) 2009 Gerd Hoffmann <kraxel@redhat.com>
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#undef DEBUG

#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/highmem.h>

#include <xen/xen.h>
#include <xen/grant_table.h>
#include <xen/balloon.h>
#include <xen/gntdev.h>
#include <xen/events.h>
#include <xen/page.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, "
	      "Gerd Hoffmann <kraxel@redhat.com>");
MODULE_DESCRIPTION("User-space granted page access driver");
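
/*
 * Rough sketch of how userspace typically drives this device (illustrative
 * only; see include/uapi/xen/gntdev.h for the authoritative ioctl layout):
 *
 *	int fd = open("/dev/xen/gntdev", O_RDWR);
 *	struct ioctl_gntdev_map_grant_ref op = {
 *		.count = 1,
 *		.refs[0] = { .domid = remote_domid, .ref = gref },
 *	};
 *	ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &op);
 *	void *addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, op.index);
 */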

static int limit = 1024*1024;
module_param(limit, int, 0644);
MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by "
		"the gntdev device");

static atomic_t pages_mapped = ATOMIC_INIT(0);

static int use_ptemod;
#define populate_freeable_maps use_ptemod

struct gntdev_priv {
	/* maps with visible offsets in the file descriptor */
	struct list_head maps;
	/* maps that are not visible; will be freed on munmap.
	 * Only populated if populate_freeable_maps == 1 */
	struct list_head freeable_maps;
	/* lock protects maps and freeable_maps */
	struct mutex lock;
	struct mm_struct *mm;
	struct mmu_notifier mn;
};

struct unmap_notify {
	int flags;
	/* Address relative to the start of the grant_map */
	int addr;
	int event;
};

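/*
 * One contiguous range of grant references mapped through this device.
 * Tracks the userspace-supplied grant refs, the map/unmap operations handed
 * to Xen (plus the kernel-side kmap/kunmap ops used when use_ptemod), the
 * backing pages, and the page offset ("index") at which the range appears
 * in the device file for mmap().
 */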
struct grant_map {
	struct list_head next;
	struct vm_area_struct *vma;
	int index;
	int count;
	int flags;
	atomic_t users;
	struct unmap_notify notify;
	struct ioctl_gntdev_grant_ref *grants;
	struct gnttab_map_grant_ref   *map_ops;
	struct gnttab_unmap_grant_ref *unmap_ops;
	struct gnttab_map_grant_ref   *kmap_ops;
	struct gnttab_unmap_grant_ref *kunmap_ops;
	struct page **pages;
	unsigned long pages_vm_start;
};

static int unmap_grant_pages(struct grant_map *map, int offset, int pages);

/* ------------------------------------------------------------------ */

static void gntdev_print_maps(struct gntdev_priv *priv,
			      char *text, int text_index)
{
#ifdef DEBUG
	struct grant_map *map;

	pr_debug("%s: maps list (priv %p)\n", __func__, priv);
	list_for_each_entry(map, &priv->maps, next)
		pr_debug("  index %2d, count %2d %s\n",
		       map->index, map->count,
		       map->index == text_index && text ? text : "");
#endif
}

static void gntdev_free_map(struct grant_map *map)
{
	if (map == NULL)
		return;

	if (map->pages)
		gnttab_free_pages(map->count, map->pages);
	kfree(map->pages);
	kfree(map->grants);
	kfree(map->map_ops);
	kfree(map->unmap_ops);
	kfree(map->kmap_ops);
	kfree(map->kunmap_ops);
	kfree(map);
}

static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count)
{
	struct grant_map *add;
	int i;

	add = kzalloc(sizeof(struct grant_map), GFP_KERNEL);
	if (NULL == add)
		return NULL;

	add->grants    = kcalloc(count, sizeof(add->grants[0]), GFP_KERNEL);
	add->map_ops   = kcalloc(count, sizeof(add->map_ops[0]), GFP_KERNEL);
	add->unmap_ops = kcalloc(count, sizeof(add->unmap_ops[0]), GFP_KERNEL);
	add->kmap_ops  = kcalloc(count, sizeof(add->kmap_ops[0]), GFP_KERNEL);
	add->kunmap_ops = kcalloc(count, sizeof(add->kunmap_ops[0]), GFP_KERNEL);
	add->pages     = kcalloc(count, sizeof(add->pages[0]), GFP_KERNEL);
	if (NULL == add->grants    ||
	    NULL == add->map_ops   ||
	    NULL == add->unmap_ops ||
	    NULL == add->kmap_ops  ||
	    NULL == add->kunmap_ops ||
	    NULL == add->pages)
		goto err;

	if (gnttab_alloc_pages(count, add->pages))
		goto err;

	for (i = 0; i < count; i++) {
		add->map_ops[i].handle = -1;
		add->unmap_ops[i].handle = -1;
		add->kmap_ops[i].handle = -1;
		add->kunmap_ops[i].handle = -1;
	}

	add->index = 0;
	add->count = count;
	atomic_set(&add->users, 1);

	return add;

err:
	gntdev_free_map(add);
	return NULL;
}

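/*
 * Insert @add into priv->maps, which is kept sorted by index.  The new map
 * takes the first index range large enough to hold it, or is appended after
 * the last existing map.  Called with priv->lock held.
 */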
static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add)
{
	struct grant_map *map;

	list_for_each_entry(map, &priv->maps, next) {
		if (add->index + add->count < map->index) {
			list_add_tail(&add->next, &map->next);
			goto done;
		}
		add->index = map->index + map->count;
	}
	list_add_tail(&add->next, &priv->maps);

done:
	gntdev_print_maps(priv, "[new]", add->index);
}

static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv,
		int index, int count)
{
	struct grant_map *map;

	list_for_each_entry(map, &priv->maps, next) {
		if (map->index != index)
			continue;
		if (count && map->count != count)
			continue;
		return map;
	}
	return NULL;
}

static void gntdev_put_map(struct gntdev_priv *priv, struct grant_map *map)
{
	if (!map)
		return;

	if (!atomic_dec_and_test(&map->users))
		return;

	atomic_sub(map->count, &pages_mapped);

	if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
		notify_remote_via_evtchn(map->notify.event);
		evtchn_put(map->notify.event);
	}

	if (populate_freeable_maps && priv) {
		mutex_lock(&priv->lock);
		list_del(&map->next);
		mutex_unlock(&priv->lock);
	}

	if (map->pages && !use_ptemod)
		unmap_grant_pages(map, 0, map->count);
	gntdev_free_map(map);
}

/* ------------------------------------------------------------------ */

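/*
 * apply_to_page_range() callback for the use_ptemod case: for every PTE
 * covering the VMA, record the machine address of the PTE itself in the
 * map/unmap ops, so the GNTTABOP_map_grant_ref hypercall (with
 * GNTMAP_contains_pte) writes the grant mapping directly into the page table.
 */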
static int find_grant_ptes(pte_t *pte, pgtable_t token,
		unsigned long addr, void *data)
{
	struct grant_map *map = data;
	unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
	int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte;
	u64 pte_maddr;

	BUG_ON(pgnr >= map->count);
	pte_maddr = arbitrary_virt_to_machine(pte).maddr;

	/*
	 * Set the PTE as special to force get_user_pages_fast() to fall
	 * back to the slow path.  If this is not supported as part of
	 * the grant map, it will be done afterwards.
	 */
	if (xen_feature(XENFEAT_gnttab_map_avail_bits))
		flags |= (1 << _GNTMAP_guest_avail0);

	gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, flags,
			  map->grants[pgnr].ref,
			  map->grants[pgnr].domid);
	gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, flags,
			    -1 /* handle */);
	return 0;
}

#ifdef CONFIG_X86
static int set_grant_ptes_as_special(pte_t *pte, pgtable_t token,
				     unsigned long addr, void *data)
{
	set_pte_at(current->mm, addr, pte, pte_mkspecial(*pte));
	return 0;
}
#endif

static int map_grant_pages(struct grant_map *map)
{
	int i, err = 0;

	if (!use_ptemod) {
		/* Note: it could already be mapped */
		if (map->map_ops[0].handle != -1)
			return 0;
		for (i = 0; i < map->count; i++) {
			unsigned long addr = (unsigned long)
				pfn_to_kaddr(page_to_pfn(map->pages[i]));
			gnttab_set_map_op(&map->map_ops[i], addr, map->flags,
				map->grants[i].ref,
				map->grants[i].domid);
			gnttab_set_unmap_op(&map->unmap_ops[i], addr,
				map->flags, -1 /* handle */);
		}
	} else {
		/*
		 * Setup the map_ops corresponding to the pte entries pointing
		 * to the kernel linear addresses of the struct pages.
		 * These ptes are completely different from the user ptes dealt
		 * with by find_grant_ptes.
		 */
		for (i = 0; i < map->count; i++) {
			unsigned long address = (unsigned long)
				pfn_to_kaddr(page_to_pfn(map->pages[i]));
			BUG_ON(PageHighMem(map->pages[i]));

			gnttab_set_map_op(&map->kmap_ops[i], address,
				map->flags | GNTMAP_host_map,
				map->grants[i].ref,
				map->grants[i].domid);
			gnttab_set_unmap_op(&map->kunmap_ops[i], address,
				map->flags | GNTMAP_host_map, -1);
		}
	}

	pr_debug("map %d+%d\n", map->index, map->count);
	err = gnttab_map_refs(map->map_ops, use_ptemod ? map->kmap_ops : NULL,
			map->pages, map->count);
	if (err)
		return err;

	for (i = 0; i < map->count; i++) {
		if (map->map_ops[i].status) {
			err = -EINVAL;
			continue;
		}

		map->unmap_ops[i].handle = map->map_ops[i].handle;
		if (use_ptemod)
			map->kunmap_ops[i].handle = map->kmap_ops[i].handle;
326 327 328 329
	}
	return err;
}

static int __unmap_grant_pages(struct grant_map *map, int offset, int pages)
{
	int i, err = 0;
	struct gntab_unmap_queue_data unmap_data;

	if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) {
		int pgno = (map->notify.addr >> PAGE_SHIFT);
		if (pgno >= offset && pgno < offset + pages) {
			/* No need for kmap, pages are in lowmem */
			uint8_t *tmp = pfn_to_kaddr(page_to_pfn(map->pages[pgno]));
			tmp[map->notify.addr & (PAGE_SIZE-1)] = 0;
			map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE;
		}
	}

	unmap_data.unmap_ops = map->unmap_ops + offset;
	unmap_data.kunmap_ops = use_ptemod ? map->kunmap_ops + offset : NULL;
	unmap_data.pages = map->pages + offset;
	unmap_data.count = pages;

	err = gnttab_unmap_refs_sync(&unmap_data);
	if (err)
		return err;

	for (i = 0; i < pages; i++) {
		if (map->unmap_ops[offset+i].status)
			err = -EINVAL;
		pr_debug("unmap handle=%d st=%d\n",
			map->unmap_ops[offset+i].handle,
			map->unmap_ops[offset+i].status);
		map->unmap_ops[offset+i].handle = -1;
	}
	return err;
}

static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
{
	int range, err = 0;

	pr_debug("unmap %d+%d [%d+%d]\n", map->index, map->count, offset, pages);

	/* It is possible the requested range will have a "hole" where we
	 * already unmapped some of the grants. Only unmap valid ranges.
	 */
	while (pages && !err) {
		while (pages && map->unmap_ops[offset].handle == -1) {
			offset++;
			pages--;
		}
		range = 0;
		while (range < pages) {
			if (map->unmap_ops[offset+range].handle == -1) {
				range--;
				break;
			}
			range++;
		}
		err = __unmap_grant_pages(map, offset, range);
		offset += range;
		pages -= range;
	}

	return err;
}

/* ------------------------------------------------------------------ */

static void gntdev_vma_open(struct vm_area_struct *vma)
{
	struct grant_map *map = vma->vm_private_data;

	pr_debug("gntdev_vma_open %p\n", vma);
	atomic_inc(&map->users);
}

static void gntdev_vma_close(struct vm_area_struct *vma)
{
	struct grant_map *map = vma->vm_private_data;
	struct file *file = vma->vm_file;
	struct gntdev_priv *priv = file->private_data;

	pr_debug("gntdev_vma_close %p\n", vma);
	if (use_ptemod) {
		/* It is possible that an mmu notifier could be running
		 * concurrently, so take priv->lock to ensure that the vma won't
		 * vanish during the unmap_grant_pages call, since we will
		 * spin here until that completes. Such a concurrent call will
		 * not do any unmapping, since that has been done prior to
		 * closing the vma, but it may still iterate the unmap_ops list.
		 */
		mutex_lock(&priv->lock);
		map->vma = NULL;
		mutex_unlock(&priv->lock);
	}
	vma->vm_private_data = NULL;
	gntdev_put_map(priv, map);
}

static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma,
						 unsigned long addr)
{
	struct grant_map *map = vma->vm_private_data;

	return map->pages[(addr - map->pages_vm_start) >> PAGE_SHIFT];
}

static const struct vm_operations_struct gntdev_vmops = {
	.open = gntdev_vma_open,
	.close = gntdev_vma_close,
	.find_special_page = gntdev_vma_find_special_page,
};

/* ------------------------------------------------------------------ */

static void unmap_if_in_range(struct grant_map *map,
			      unsigned long start, unsigned long end)
{
	unsigned long mstart, mend;
	int err;

	if (!map->vma)
		return;
	if (map->vma->vm_start >= end)
		return;
	if (map->vma->vm_end <= start)
		return;
	mstart = max(start, map->vma->vm_start);
	mend   = min(end,   map->vma->vm_end);
	pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
			map->index, map->count,
			map->vma->vm_start, map->vma->vm_end,
			start, end, mstart, mend);
	err = unmap_grant_pages(map,
				(mstart - map->vma->vm_start) >> PAGE_SHIFT,
				(mend - mstart) >> PAGE_SHIFT);
	WARN_ON(err);
}

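/*
 * MMU notifier callbacks: when ranges of the tracked mm are invalidated or
 * the mm is torn down, any grant mappings overlapping those ranges must be
 * unmapped first so Xen drops its references before the pages are reused.
 */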
static void mn_invl_range_start(struct mmu_notifier *mn,
				struct mm_struct *mm,
				unsigned long start, unsigned long end)
{
	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
	struct grant_map *map;

	mutex_lock(&priv->lock);
	list_for_each_entry(map, &priv->maps, next) {
		unmap_if_in_range(map, start, end);
	}
	list_for_each_entry(map, &priv->freeable_maps, next) {
		unmap_if_in_range(map, start, end);
	}
	mutex_unlock(&priv->lock);
}

static void mn_invl_page(struct mmu_notifier *mn,
			 struct mm_struct *mm,
			 unsigned long address)
{
	mn_invl_range_start(mn, mm, address, address + PAGE_SIZE);
}

static void mn_release(struct mmu_notifier *mn,
		       struct mm_struct *mm)
{
	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
	struct grant_map *map;
	int err;

	mutex_lock(&priv->lock);
	list_for_each_entry(map, &priv->maps, next) {
		if (!map->vma)
			continue;
		pr_debug("map %d+%d (%lx %lx)\n",
				map->index, map->count,
				map->vma->vm_start, map->vma->vm_end);
		err = unmap_grant_pages(map, /* offset */ 0, map->count);
		WARN_ON(err);
	}
	list_for_each_entry(map, &priv->freeable_maps, next) {
		if (!map->vma)
			continue;
		pr_debug("map %d+%d (%lx %lx)\n",
				map->index, map->count,
				map->vma->vm_start, map->vma->vm_end);
		err = unmap_grant_pages(map, /* offset */ 0, map->count);
		WARN_ON(err);
	}
	mutex_unlock(&priv->lock);
}

static const struct mmu_notifier_ops gntdev_mmu_ops = {
	.release                = mn_release,
	.invalidate_page        = mn_invl_page,
	.invalidate_range_start = mn_invl_range_start,
};

/* ------------------------------------------------------------------ */

static int gntdev_open(struct inode *inode, struct file *flip)
{
	struct gntdev_priv *priv;
	int ret = 0;

	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	INIT_LIST_HEAD(&priv->maps);
	INIT_LIST_HEAD(&priv->freeable_maps);
	mutex_init(&priv->lock);

	if (use_ptemod) {
		priv->mm = get_task_mm(current);
		if (!priv->mm) {
			kfree(priv);
			return -ENOMEM;
		}
		priv->mn.ops = &gntdev_mmu_ops;
		ret = mmu_notifier_register(&priv->mn, priv->mm);
		mmput(priv->mm);
	}

	if (ret) {
		kfree(priv);
		return ret;
	}

	flip->private_data = priv;
	pr_debug("priv %p\n", priv);

	return 0;
}

static int gntdev_release(struct inode *inode, struct file *flip)
{
	struct gntdev_priv *priv = flip->private_data;
	struct grant_map *map;

	pr_debug("priv %p\n", priv);

	mutex_lock(&priv->lock);
	while (!list_empty(&priv->maps)) {
		map = list_entry(priv->maps.next, struct grant_map, next);
		list_del(&map->next);
		gntdev_put_map(NULL /* already removed */, map);
	}
	WARN_ON(!list_empty(&priv->freeable_maps));
	mutex_unlock(&priv->lock);

	if (use_ptemod)
		mmu_notifier_unregister(&priv->mn, priv->mm);
	kfree(priv);
	return 0;
}

static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
				       struct ioctl_gntdev_map_grant_ref __user *u)
{
	struct ioctl_gntdev_map_grant_ref op;
	struct grant_map *map;
	int err;

	if (copy_from_user(&op, u, sizeof(op)) != 0)
		return -EFAULT;
	pr_debug("priv %p, add %d\n", priv, op.count);
	if (unlikely(op.count <= 0))
		return -EINVAL;

	err = -ENOMEM;
	map = gntdev_alloc_map(priv, op.count);
	if (!map)
		return err;

	if (unlikely(atomic_add_return(op.count, &pages_mapped) > limit)) {
		pr_debug("can't map: over limit\n");
		gntdev_put_map(NULL, map);
		return err;
	}

	if (copy_from_user(map->grants, &u->refs,
			   sizeof(map->grants[0]) * op.count) != 0) {
		gntdev_put_map(NULL, map);
		return -EFAULT;
	}

	mutex_lock(&priv->lock);
	gntdev_add_map(priv, map);
	op.index = map->index << PAGE_SHIFT;
	mutex_unlock(&priv->lock);

	if (copy_to_user(u, &op, sizeof(op)) != 0)
		return -EFAULT;

	return 0;
}

static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
					 struct ioctl_gntdev_unmap_grant_ref __user *u)
{
	struct ioctl_gntdev_unmap_grant_ref op;
	struct grant_map *map;
	int err = -ENOENT;

	if (copy_from_user(&op, u, sizeof(op)) != 0)
		return -EFAULT;
	pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count);

	mutex_lock(&priv->lock);
	map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
	if (map) {
		list_del(&map->next);
		if (populate_freeable_maps)
			list_add_tail(&map->next, &priv->freeable_maps);
		err = 0;
	}
	mutex_unlock(&priv->lock);
	if (map)
		gntdev_put_map(priv, map);
	return err;
}

static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
					      struct ioctl_gntdev_get_offset_for_vaddr __user *u)
{
	struct ioctl_gntdev_get_offset_for_vaddr op;
	struct vm_area_struct *vma;
	struct grant_map *map;
	int rv = -EINVAL;

	if (copy_from_user(&op, u, sizeof(op)) != 0)
		return -EFAULT;
	pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr);

	down_read(&current->mm->mmap_sem);
	vma = find_vma(current->mm, op.vaddr);
	if (!vma || vma->vm_ops != &gntdev_vmops)
		goto out_unlock;

	map = vma->vm_private_data;
	if (!map)
		goto out_unlock;

	op.offset = map->index << PAGE_SHIFT;
	op.count = map->count;
	rv = 0;

 out_unlock:
	up_read(&current->mm->mmap_sem);

	if (rv == 0 && copy_to_user(u, &op, sizeof(op)) != 0)
		return -EFAULT;
	return rv;
}

static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u)
{
	struct ioctl_gntdev_unmap_notify op;
	struct grant_map *map;
	int rc;
	int out_flags;
	unsigned int out_event;

	if (copy_from_user(&op, u, sizeof(op)))
		return -EFAULT;

	if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT))
		return -EINVAL;

	/* We need to grab a reference to the event channel we are going to use
	 * to send the notify before releasing the reference we may already have
	 * (if someone has called this ioctl twice). This is required so that
	 * it is possible to change the clear_byte part of the notification
	 * without disturbing the event channel part, which may now be the last
	 * reference to that event channel.
	 */
	if (op.action & UNMAP_NOTIFY_SEND_EVENT) {
		if (evtchn_get(op.event_channel_port))
			return -EINVAL;
	}

	out_flags = op.action;
	out_event = op.event_channel_port;

	mutex_lock(&priv->lock);

	list_for_each_entry(map, &priv->maps, next) {
		uint64_t begin = map->index << PAGE_SHIFT;
		uint64_t end = (map->index + map->count) << PAGE_SHIFT;
		if (op.index >= begin && op.index < end)
			goto found;
	}
	rc = -ENOENT;
	goto unlock_out;

 found:
	if ((op.action & UNMAP_NOTIFY_CLEAR_BYTE) &&
			(map->flags & GNTMAP_readonly)) {
		rc = -EINVAL;
		goto unlock_out;
	}

	out_flags = map->notify.flags;
	out_event = map->notify.event;

	map->notify.flags = op.action;
	map->notify.addr = op.index - (map->index << PAGE_SHIFT);
	map->notify.event = op.event_channel_port;

	rc = 0;

 unlock_out:
	mutex_unlock(&priv->lock);

	/* Drop the reference to the event channel we did not save in the map */
	if (out_flags & UNMAP_NOTIFY_SEND_EVENT)
		evtchn_put(out_event);

	return rc;
}

#define GNTDEV_COPY_BATCH 16

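/*
 * Grant-copy ioctl support: userspace segments are translated into
 * gnttab_copy operations and issued in batches of up to GNTDEV_COPY_BATCH.
 * Local (non-gref) buffers are pinned with get_user_pages_fast() for the
 * lifetime of the batch and released in gntdev_put_pages().
 */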
struct gntdev_copy_batch {
	struct gnttab_copy ops[GNTDEV_COPY_BATCH];
	struct page *pages[GNTDEV_COPY_BATCH];
	s16 __user *status[GNTDEV_COPY_BATCH];
	unsigned int nr_ops;
	unsigned int nr_pages;
};

static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt,
			   bool writeable, unsigned long *gfn)
{
	unsigned long addr = (unsigned long)virt;
	struct page *page;
	unsigned long xen_pfn;
	int ret;

	ret = get_user_pages_fast(addr, 1, writeable, &page);
	if (ret < 0)
		return ret;

	batch->pages[batch->nr_pages++] = page;

	xen_pfn = page_to_xen_pfn(page) + XEN_PFN_DOWN(addr & ~PAGE_MASK);
	*gfn = pfn_to_gfn(xen_pfn);

	return 0;
}

static void gntdev_put_pages(struct gntdev_copy_batch *batch)
{
	unsigned int i;

	for (i = 0; i < batch->nr_pages; i++)
		put_page(batch->pages[i]);
	batch->nr_pages = 0;
}

static int gntdev_copy(struct gntdev_copy_batch *batch)
{
	unsigned int i;

	gnttab_batch_copy(batch->ops, batch->nr_ops);
	gntdev_put_pages(batch);

	/*
	 * For each completed op, update the status if the op failed
	 * and all previous ops for the segment were successful.
	 */
	for (i = 0; i < batch->nr_ops; i++) {
		s16 status = batch->ops[i].status;
		s16 old_status;

		if (status == GNTST_okay)
			continue;

		if (__get_user(old_status, batch->status[i]))
			return -EFAULT;

		if (old_status != GNTST_okay)
			continue;

		if (__put_user(status, batch->status[i]))
			return -EFAULT;
	}

	batch->nr_ops = 0;
	return 0;
}

static int gntdev_grant_copy_seg(struct gntdev_copy_batch *batch,
				 struct gntdev_grant_copy_segment *seg,
				 s16 __user *status)
{
	uint16_t copied = 0;

	/*
	 * Disallow local -> local copies since there is only space in
	 * batch->pages for one page per-op and this would be a very
	 * expensive memcpy().
	 */
	if (!(seg->flags & (GNTCOPY_source_gref | GNTCOPY_dest_gref)))
		return -EINVAL;

	/* Can't cross page if source/dest is a grant ref. */
	if (seg->flags & GNTCOPY_source_gref) {
		if (seg->source.foreign.offset + seg->len > XEN_PAGE_SIZE)
			return -EINVAL;
	}
	if (seg->flags & GNTCOPY_dest_gref) {
		if (seg->dest.foreign.offset + seg->len > XEN_PAGE_SIZE)
			return -EINVAL;
	}

	if (put_user(GNTST_okay, status))
		return -EFAULT;

	while (copied < seg->len) {
		struct gnttab_copy *op;
		void __user *virt;
		size_t len, off;
		unsigned long gfn;
		int ret;

		if (batch->nr_ops >= GNTDEV_COPY_BATCH) {
			ret = gntdev_copy(batch);
			if (ret < 0)
				return ret;
		}

		len = seg->len - copied;

		op = &batch->ops[batch->nr_ops];
		op->flags = 0;

		if (seg->flags & GNTCOPY_source_gref) {
			op->source.u.ref = seg->source.foreign.ref;
			op->source.domid = seg->source.foreign.domid;
			op->source.offset = seg->source.foreign.offset + copied;
			op->flags |= GNTCOPY_source_gref;
		} else {
			virt = seg->source.virt + copied;
			off = (unsigned long)virt & ~XEN_PAGE_MASK;
			len = min(len, (size_t)XEN_PAGE_SIZE - off);

			ret = gntdev_get_page(batch, virt, false, &gfn);
			if (ret < 0)
				return ret;

			op->source.u.gmfn = gfn;
			op->source.domid = DOMID_SELF;
			op->source.offset = off;
		}

		if (seg->flags & GNTCOPY_dest_gref) {
			op->dest.u.ref = seg->dest.foreign.ref;
			op->dest.domid = seg->dest.foreign.domid;
			op->dest.offset = seg->dest.foreign.offset + copied;
			op->flags |= GNTCOPY_dest_gref;
		} else {
			virt = seg->dest.virt + copied;
			off = (unsigned long)virt & ~XEN_PAGE_MASK;
			len = min(len, (size_t)XEN_PAGE_SIZE - off);

			ret = gntdev_get_page(batch, virt, true, &gfn);
			if (ret < 0)
				return ret;

			op->dest.u.gmfn = gfn;
			op->dest.domid = DOMID_SELF;
			op->dest.offset = off;
		}

		op->len = len;
		copied += len;

		batch->status[batch->nr_ops] = status;
		batch->nr_ops++;
	}

	return 0;
}

static long gntdev_ioctl_grant_copy(struct gntdev_priv *priv, void __user *u)
{
	struct ioctl_gntdev_grant_copy copy;
	struct gntdev_copy_batch batch;
	unsigned int i;
	int ret = 0;

	if (copy_from_user(&copy, u, sizeof(copy)))
		return -EFAULT;

	batch.nr_ops = 0;
	batch.nr_pages = 0;

	for (i = 0; i < copy.count; i++) {
		struct gntdev_grant_copy_segment seg;

		if (copy_from_user(&seg, &copy.segments[i], sizeof(seg))) {
			ret = -EFAULT;
			goto out;
		}

		ret = gntdev_grant_copy_seg(&batch, &seg, &copy.segments[i].status);
		if (ret < 0)
			goto out;

		cond_resched();
	}
	if (batch.nr_ops)
		ret = gntdev_copy(&batch);
	return ret;

  out:
	gntdev_put_pages(&batch);
	return ret;
}

static long gntdev_ioctl(struct file *flip,
			 unsigned int cmd, unsigned long arg)
{
	struct gntdev_priv *priv = flip->private_data;
	void __user *ptr = (void __user *)arg;

	switch (cmd) {
	case IOCTL_GNTDEV_MAP_GRANT_REF:
		return gntdev_ioctl_map_grant_ref(priv, ptr);

	case IOCTL_GNTDEV_UNMAP_GRANT_REF:
		return gntdev_ioctl_unmap_grant_ref(priv, ptr);

	case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
		return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);

	case IOCTL_GNTDEV_SET_UNMAP_NOTIFY:
		return gntdev_ioctl_notify(priv, ptr);

	case IOCTL_GNTDEV_GRANT_COPY:
		return gntdev_ioctl_grant_copy(priv, ptr);

	default:
		pr_debug("priv %p, unknown cmd %x\n", priv, cmd);
		return -ENOIOCTLCMD;
	}

	return 0;
}

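/*
 * mmap() against an offset previously returned by IOCTL_GNTDEV_MAP_GRANT_REF.
 * Without use_ptemod the already grant-mapped pages are simply inserted into
 * the VMA with vm_insert_page(); with use_ptemod the user PTEs themselves are
 * handed to Xen via find_grant_ptes()/map_grant_pages() so the grants are
 * mapped directly at the user addresses.
 */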
static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
{
	struct gntdev_priv *priv = flip->private_data;
	int index = vma->vm_pgoff;
	int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
	struct grant_map *map;
	int i, err = -EINVAL;

	if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
		return -EINVAL;

	pr_debug("map %d+%d at %lx (pgoff %lx)\n",
			index, count, vma->vm_start, vma->vm_pgoff);

	mutex_lock(&priv->lock);
	map = gntdev_find_map_index(priv, index, count);
	if (!map)
		goto unlock_out;
	if (use_ptemod && map->vma)
		goto unlock_out;
	if (use_ptemod && priv->mm != vma->vm_mm) {
		pr_warn("Huh? Other mm?\n");
		goto unlock_out;
	}

	atomic_inc(&map->users);

	vma->vm_ops = &gntdev_vmops;

	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_IO;

	if (use_ptemod)
		vma->vm_flags |= VM_DONTCOPY;

	vma->vm_private_data = map;

	if (use_ptemod)
		map->vma = vma;

	if (map->flags) {
		if ((vma->vm_flags & VM_WRITE) &&
				(map->flags & GNTMAP_readonly))
			goto out_unlock_put;
	} else {
		map->flags = GNTMAP_host_map;
		if (!(vma->vm_flags & VM_WRITE))
			map->flags |= GNTMAP_readonly;
	}

	mutex_unlock(&priv->lock);

	if (use_ptemod) {
		err = apply_to_page_range(vma->vm_mm, vma->vm_start,
					  vma->vm_end - vma->vm_start,
					  find_grant_ptes, map);
		if (err) {
			pr_warn("find_grant_ptes() failure.\n");
			goto out_put_map;
		}
	}

	err = map_grant_pages(map);
	if (err)
		goto out_put_map;

	if (!use_ptemod) {
		for (i = 0; i < count; i++) {
			err = vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE,
				map->pages[i]);
			if (err)
				goto out_put_map;
		}
	} else {
#ifdef CONFIG_X86
		/*
		 * If the PTEs were not made special by the grant map
		 * hypercall, do so here.
		 *
		 * This is racy since the mapping is already visible
		 * to userspace but userspace should be well-behaved
		 * enough to not touch it until the mmap() call
		 * returns.
		 */
		if (!xen_feature(XENFEAT_gnttab_map_avail_bits)) {
			apply_to_page_range(vma->vm_mm, vma->vm_start,
					    vma->vm_end - vma->vm_start,
					    set_grant_ptes_as_special, NULL);
		}
#endif
		map->pages_vm_start = vma->vm_start;
	}

	return 0;

unlock_out:
	mutex_unlock(&priv->lock);
	return err;

out_unlock_put:
	mutex_unlock(&priv->lock);
out_put_map:
	if (use_ptemod)
		map->vma = NULL;
	gntdev_put_map(priv, map);
	return err;
}

static const struct file_operations gntdev_fops = {
	.owner = THIS_MODULE,
	.open = gntdev_open,
	.release = gntdev_release,
	.mmap = gntdev_mmap,
	.unlocked_ioctl = gntdev_ioctl
};

static struct miscdevice gntdev_miscdev = {
	.minor        = MISC_DYNAMIC_MINOR,
	.name         = "xen/gntdev",
	.fops         = &gntdev_fops,
};

/* ------------------------------------------------------------------ */

static int __init gntdev_init(void)
{
	int err;

	if (!xen_domain())
		return -ENODEV;

	use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap);

	err = misc_register(&gntdev_miscdev);
	if (err != 0) {
		pr_err("Could not register gntdev device\n");
		return err;
	}
	return 0;
}

static void __exit gntdev_exit(void)
{
	misc_deregister(&gntdev_miscdev);
}

module_init(gntdev_init);
module_exit(gntdev_exit);

/* ------------------------------------------------------------------ */