kvm_main.c 33.8 KB
Newer Older
A
Avi Kivity 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

18
#include "iodev.h"
A
Avi Kivity 已提交
19

20
#include <linux/kvm_host.h>
A
Avi Kivity 已提交
21 22 23 24 25 26 27 28 29 30 31 32
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
33
#include <linux/sysdev.h>
A
Avi Kivity 已提交
34
#include <linux/cpu.h>
A
Alexey Dobriyan 已提交
35
#include <linux/sched.h>
36 37
#include <linux/cpumask.h>
#include <linux/smp.h>
38
#include <linux/anon_inodes.h>
39
#include <linux/profile.h>
40
#include <linux/kvm_para.h>
41
#include <linux/pagemap.h>
42
#include <linux/mman.h>
43
#include <linux/swap.h>
A
Avi Kivity 已提交
44

A
Avi Kivity 已提交
45 46 47
#include <asm/processor.h>
#include <asm/io.h>
#include <asm/uaccess.h>
48
#include <asm/pgtable.h>
A
Avi Kivity 已提交
49

50 51 52 53
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
#include "coalesced_mmio.h"
#endif

A
Avi Kivity 已提交
54 55 56
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

57 58
DEFINE_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);
59

60 61
static cpumask_t cpus_hardware_enabled;

62 63
struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
A
Avi Kivity 已提交
64

65 66
static __read_mostly struct preempt_ops kvm_preempt_ops;

67
struct dentry *kvm_debugfs_dir;
A
Avi Kivity 已提交
68

A
Avi Kivity 已提交
69 70 71
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);

72 73
bool kvm_rebooting;

74 75 76 77 78
/*
 * Return non-zero when @n lies in [0, KVM_MAX_VCPUS).  The in-range case
 * is annotated as the expected one.
 */
static inline int valid_vcpu(int n)
{
	return likely(!(n < 0 || n >= KVM_MAX_VCPUS));
}

A
Avi Kivity 已提交
79 80 81
/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
82
void vcpu_load(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
83
{
84 85
	int cpu;

A
Avi Kivity 已提交
86
	mutex_lock(&vcpu->mutex);
87 88
	cpu = get_cpu();
	preempt_notifier_register(&vcpu->preempt_notifier);
89
	kvm_arch_vcpu_load(vcpu, cpu);
90
	put_cpu();
A
Avi Kivity 已提交
91 92
}

93
void vcpu_put(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
94
{
95
	preempt_disable();
96
	kvm_arch_vcpu_put(vcpu);
97 98
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	preempt_enable();
A
Avi Kivity 已提交
99 100 101
	mutex_unlock(&vcpu->mutex);
}

102 103 104 105 106 107
/*
 * Intentionally empty callback.  It is the function handed to
 * smp_call_function_mask() by kvm_flush_remote_tlbs() and
 * kvm_reload_remote_mmus(); @_completed is not used.
 */
static void ack_flush(void *_completed)
{
	/* nothing to do */
}

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
108
	int i, cpu;
109 110 111 112
	cpumask_t cpus;
	struct kvm_vcpu *vcpu;

	cpus_clear(cpus);
R
Rusty Russell 已提交
113 114 115 116
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		vcpu = kvm->vcpus[i];
		if (!vcpu)
			continue;
117
		if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
118 119 120
			continue;
		cpu = vcpu->cpu;
		if (cpu != -1 && cpu != raw_smp_processor_id())
121
			cpu_set(cpu, cpus);
122
	}
123 124 125
	if (cpus_empty(cpus))
		return;
	++kvm->stat.remote_tlb_flush;
126
	smp_call_function_mask(cpus, ack_flush, NULL, 1);
127 128
}

129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151
/*
 * Set KVM_REQ_MMU_RELOAD on every existing vcpu of @kvm and call
 * ack_flush() on the CPUs those vcpus are recorded on.
 *
 * A vcpu whose request bit was already set is skipped, as is one whose
 * ->cpu is -1 or equals the calling CPU.
 */
void kvm_reload_remote_mmus(struct kvm *kvm)
{
	cpumask_t targets;
	int idx;

	cpus_clear(targets);
	for (idx = 0; idx < KVM_MAX_VCPUS; idx++) {
		struct kvm_vcpu *v = kvm->vcpus[idx];
		int on_cpu;

		if (v == NULL)
			continue;
		if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &v->requests))
			continue;

		on_cpu = v->cpu;
		if (on_cpu == -1 || on_cpu == raw_smp_processor_id())
			continue;
		cpu_set(on_cpu, targets);
	}

	if (!cpus_empty(targets))
		smp_call_function_mask(targets, ack_flush, NULL, 1);
}


R
Rusty Russell 已提交
152 153 154 155 156 157 158 159 160
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	struct page *page;
	int r;

	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
E
Eddie Dong 已提交
161
	init_waitqueue_head(&vcpu->wq);
R
Rusty Russell 已提交
162 163 164 165 166 167 168 169

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail;
	}
	vcpu->run = page_address(page);

170
	r = kvm_arch_vcpu_init(vcpu);
R
Rusty Russell 已提交
171
	if (r < 0)
172
		goto fail_free_run;
R
Rusty Russell 已提交
173 174 175 176 177
	return 0;

fail_free_run:
	free_page((unsigned long)vcpu->run);
fail:
178
	return r;
R
Rusty Russell 已提交
179 180 181 182 183
}
EXPORT_SYMBOL_GPL(kvm_vcpu_init);

void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
184
	kvm_arch_vcpu_uninit(vcpu);
R
Rusty Russell 已提交
185 186 187 188
	free_page((unsigned long)vcpu->run);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);

189
static struct kvm *kvm_create_vm(void)
A
Avi Kivity 已提交
190
{
191
	struct kvm *kvm = kvm_arch_create_vm();
192 193 194
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	struct page *page;
#endif
A
Avi Kivity 已提交
195

196 197
	if (IS_ERR(kvm))
		goto out;
A
Avi Kivity 已提交
198

199 200 201 202 203 204 205 206 207 208
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		kfree(kvm);
		return ERR_PTR(-ENOMEM);
	}
	kvm->coalesced_mmio_ring =
			(struct kvm_coalesced_mmio_ring *)page_address(page);
#endif

209 210
	kvm->mm = current->mm;
	atomic_inc(&kvm->mm->mm_count);
211
	spin_lock_init(&kvm->mmu_lock);
212
	kvm_io_bus_init(&kvm->pio_bus);
S
Shaohua Li 已提交
213
	mutex_init(&kvm->lock);
214
	kvm_io_bus_init(&kvm->mmio_bus);
215
	init_rwsem(&kvm->slots_lock);
I
Izik Eidus 已提交
216
	atomic_set(&kvm->users_count, 1);
217 218 219
	spin_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	spin_unlock(&kvm_lock);
220 221 222
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	kvm_coalesced_mmio_init(kvm);
#endif
223
out:
224 225 226
	return kvm;
}

A
Avi Kivity 已提交
227 228 229 230 231 232
/*
 * Free any memory in @free but not in @dont.
 */
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
				  struct kvm_memory_slot *dont)
{
233 234
	if (!dont || free->rmap != dont->rmap)
		vfree(free->rmap);
A
Avi Kivity 已提交
235 236 237 238

	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
		vfree(free->dirty_bitmap);

M
Marcelo Tosatti 已提交
239 240 241
	if (!dont || free->lpage_info != dont->lpage_info)
		vfree(free->lpage_info);

A
Avi Kivity 已提交
242
	free->npages = 0;
A
Al Viro 已提交
243
	free->dirty_bitmap = NULL;
244
	free->rmap = NULL;
M
Marcelo Tosatti 已提交
245
	free->lpage_info = NULL;
A
Avi Kivity 已提交
246 247
}

248
void kvm_free_physmem(struct kvm *kvm)
A
Avi Kivity 已提交
249 250 251 252
{
	int i;

	for (i = 0; i < kvm->nmemslots; ++i)
A
Al Viro 已提交
253
		kvm_free_physmem_slot(&kvm->memslots[i], NULL);
A
Avi Kivity 已提交
254 255
}

256 257
/*
 * Tear down @kvm: unlink it from vm_list, destroy both I/O buses, free the
 * coalesced MMIO ring page (if configured and present), call the arch
 * destroy hook and finally drop the mm reference.
 *
 * kvm->mm is read before anything else and only that saved copy is used
 * after kvm_arch_destroy_vm().
 * NOTE(review): presumably @kvm is no longer valid once the arch hook
 * returns -- confirm against the arch implementation.
 */
static void kvm_destroy_vm(struct kvm *kvm)
{
	struct mm_struct *owner_mm = kvm->mm;

	spin_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	spin_unlock(&kvm_lock);

	kvm_io_bus_destroy(&kvm->pio_bus);
	kvm_io_bus_destroy(&kvm->mmio_bus);
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	if (kvm->coalesced_mmio_ring)
		free_page((unsigned long)kvm->coalesced_mmio_ring);
#endif
	kvm_arch_destroy_vm(kvm);
	mmdrop(owner_mm);
}

I
Izik Eidus 已提交
273 274 275 276 277 278 279 280 281 282 283 284 285 286
/* Take one more reference on @kvm by incrementing kvm->users_count. */
void kvm_get_kvm(struct kvm *kvm)
{
	atomic_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

/*
 * Drop one reference on @kvm; the VM is destroyed when users_count
 * reaches zero.
 */
void kvm_put_kvm(struct kvm *kvm)
{
	if (!atomic_dec_and_test(&kvm->users_count))
		return;

	kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);


287 288 289 290
/*
 * ->release handler of the VM file: drop the reference on the struct kvm
 * stored in filp->private_data.  Always returns 0; @inode is unused.
 */
static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	kvm_put_kvm(filp->private_data);
	return 0;
}

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
300
 *
301
 * Must be called holding mmap_sem for write.
A
Avi Kivity 已提交
302
 */
303 304 305
int __kvm_set_memory_region(struct kvm *kvm,
			    struct kvm_userspace_memory_region *mem,
			    int user_alloc)
A
Avi Kivity 已提交
306 307 308 309 310 311 312 313 314 315 316 317 318 319
{
	int r;
	gfn_t base_gfn;
	unsigned long npages;
	unsigned long i;
	struct kvm_memory_slot *memslot;
	struct kvm_memory_slot old, new;

	r = -EINVAL;
	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
320
	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
A
Avi Kivity 已提交
321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340
		goto out;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		goto out;

	memslot = &kvm->memslots[mem->slot];
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;

	if (!npages)
		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

	new = old = *memslot;

	new.base_gfn = base_gfn;
	new.npages = npages;
	new.flags = mem->flags;

	/* Disallow changing a memory slot's size. */
	r = -EINVAL;
	if (npages && old.npages && npages != old.npages)
341
		goto out_free;
A
Avi Kivity 已提交
342 343 344 345 346 347 348 349 350 351

	/* Check for overlaps */
	r = -EEXIST;
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *s = &kvm->memslots[i];

		if (s == memslot)
			continue;
		if (!((base_gfn + npages <= s->base_gfn) ||
		      (base_gfn >= s->base_gfn + s->npages)))
352
			goto out_free;
A
Avi Kivity 已提交
353 354 355 356
	}

	/* Free page dirty bitmap if unneeded */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
A
Al Viro 已提交
357
		new.dirty_bitmap = NULL;
A
Avi Kivity 已提交
358 359 360 361

	r = -ENOMEM;

	/* Allocate if a slot is being created */
362
	if (npages && !new.rmap) {
M
Mike Day 已提交
363
		new.rmap = vmalloc(npages * sizeof(struct page *));
364 365

		if (!new.rmap)
366
			goto out_free;
367 368

		memset(new.rmap, 0, npages * sizeof(*new.rmap));
369

370
		new.user_alloc = user_alloc;
371
		new.userspace_addr = mem->userspace_addr;
A
Avi Kivity 已提交
372
	}
M
Marcelo Tosatti 已提交
373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391
	if (npages && !new.lpage_info) {
		int largepages = npages / KVM_PAGES_PER_HPAGE;
		if (npages % KVM_PAGES_PER_HPAGE)
			largepages++;
		if (base_gfn % KVM_PAGES_PER_HPAGE)
			largepages++;

		new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info));

		if (!new.lpage_info)
			goto out_free;

		memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info));

		if (base_gfn % KVM_PAGES_PER_HPAGE)
			new.lpage_info[0].write_count = 1;
		if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE)
			new.lpage_info[largepages-1].write_count = 1;
	}
A
Avi Kivity 已提交
392 393 394 395 396 397 398

	/* Allocate page dirty bitmap if needed */
	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
		unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;

		new.dirty_bitmap = vmalloc(dirty_bytes);
		if (!new.dirty_bitmap)
399
			goto out_free;
A
Avi Kivity 已提交
400 401 402 403 404 405
		memset(new.dirty_bitmap, 0, dirty_bytes);
	}

	if (mem->slot >= kvm->nmemslots)
		kvm->nmemslots = mem->slot + 1;

406 407
	*memslot = new;

408 409 410 411
	r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
	if (r) {
		*memslot = old;
		goto out_free;
412 413
	}

A
Avi Kivity 已提交
414 415 416
	kvm_free_physmem_slot(&old, &new);
	return 0;

417
out_free:
A
Avi Kivity 已提交
418 419 420
	kvm_free_physmem_slot(&new, &old);
out:
	return r;
421 422

}
423 424 425 426 427 428 429 430
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
			  struct kvm_userspace_memory_region *mem,
			  int user_alloc)
{
	int r;

431
	down_write(&kvm->slots_lock);
432
	r = __kvm_set_memory_region(kvm, mem, user_alloc);
433
	up_write(&kvm->slots_lock);
434 435
	return r;
}
436 437
EXPORT_SYMBOL_GPL(kvm_set_memory_region);

438 439 440 441
int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
				   struct
				   kvm_userspace_memory_region *mem,
				   int user_alloc)
442
{
443 444
	if (mem->slot >= KVM_MEMORY_SLOTS)
		return -EINVAL;
445
	return kvm_set_memory_region(kvm, mem, user_alloc);
A
Avi Kivity 已提交
446 447
}

448 449
int kvm_get_dirty_log(struct kvm *kvm,
			struct kvm_dirty_log *log, int *is_dirty)
A
Avi Kivity 已提交
450 451 452 453 454 455 456 457 458 459 460 461 462 463 464
{
	struct kvm_memory_slot *memslot;
	int r, i;
	int n;
	unsigned long any = 0;

	r = -EINVAL;
	if (log->slot >= KVM_MEMORY_SLOTS)
		goto out;

	memslot = &kvm->memslots[log->slot];
	r = -ENOENT;
	if (!memslot->dirty_bitmap)
		goto out;

465
	n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
A
Avi Kivity 已提交
466

467
	for (i = 0; !any && i < n/sizeof(long); ++i)
A
Avi Kivity 已提交
468 469 470 471 472 473
		any = memslot->dirty_bitmap[i];

	r = -EFAULT;
	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
		goto out;

474 475
	if (any)
		*is_dirty = 1;
A
Avi Kivity 已提交
476 477 478 479 480 481

	r = 0;
out:
	return r;
}

482 483 484 485 486 487
/* Return non-zero when @page is the bad_page sentinel. */
int is_error_page(struct page *page)
{
	return bad_page == page;
}
EXPORT_SYMBOL_GPL(is_error_page);

488 489 490 491 492 493
/* Return non-zero when @pfn equals the bad_pfn sentinel. */
int is_error_pfn(pfn_t pfn)
{
	return bad_pfn == pfn;
}
EXPORT_SYMBOL_GPL(is_error_pfn);

I
Izik Eidus 已提交
494 495 496 497 498 499 500 501 502 503 504
/*
 * Sentinel host virtual address used to report a failed gfn lookup.
 * PAGE_OFFSET is the value chosen for it.
 */
static inline unsigned long bad_hva(void)
{
	return PAGE_OFFSET;
}

/* Return non-zero when @addr is the bad_hva() sentinel. */
int kvm_is_error_hva(unsigned long addr)
{
	return bad_hva() == addr;
}
EXPORT_SYMBOL_GPL(kvm_is_error_hva);

505
/*
 * Find the memory slot among the first kvm->nmemslots slots whose
 * [base_gfn, base_gfn + npages) range contains @gfn.  No alias
 * translation is applied here.  Returns NULL when no slot matches.
 */
static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	int idx;

	for (idx = 0; idx < kvm->nmemslots; idx++) {
		struct kvm_memory_slot *slot = &kvm->memslots[idx];

		if (gfn < slot->base_gfn)
			continue;
		if (gfn >= slot->base_gfn + slot->npages)
			continue;
		return slot;
	}

	return NULL;
}
518 519 520 521 522 523

/*
 * Like __gfn_to_memslot(), but @gfn is first passed through
 * unalias_gfn().  Returns NULL when no slot covers the result.
 */
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_memslot(kvm, unalias_gfn(kvm, gfn));
}
A
Avi Kivity 已提交
524

525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540
/*
 * Return 1 when @gfn (after unalias_gfn()) falls inside one of the first
 * KVM_MEMORY_SLOTS slots, 0 otherwise.  Slots at index KVM_MEMORY_SLOTS
 * and above are not examined.
 */
int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	int idx;

	gfn = unalias_gfn(kvm, gfn);

	for (idx = 0; idx < KVM_MEMORY_SLOTS; idx++) {
		struct kvm_memory_slot *slot = &kvm->memslots[idx];

		if (gfn < slot->base_gfn)
			continue;
		if (gfn >= slot->base_gfn + slot->npages)
			continue;
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

M
Marcelo Tosatti 已提交
541
/*
 * Translate guest frame number @gfn into a host virtual address: the
 * slot's userspace_addr plus the page offset of @gfn within the slot.
 * Returns bad_hva() when no memory slot covers the (unaliased) gfn.
 */
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;
	gfn_t rel;

	gfn = unalias_gfn(kvm, gfn);
	slot = __gfn_to_memslot(kvm, gfn);
	if (slot == NULL)
		return bad_hva();

	rel = gfn - slot->base_gfn;
	return slot->userspace_addr + rel * PAGE_SIZE;
}
551
EXPORT_SYMBOL_GPL(gfn_to_hva);
I
Izik Eidus 已提交
552

553 554 555
/*
 * Requires current->mm->mmap_sem to be held
 */
556
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
A
Avi Kivity 已提交
557
{
558
	struct page *page[1];
I
Izik Eidus 已提交
559
	unsigned long addr;
560
	int npages;
561
	pfn_t pfn;
A
Avi Kivity 已提交
562

563 564
	might_sleep();

I
Izik Eidus 已提交
565 566
	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr)) {
567
		get_page(bad_page);
568
		return page_to_pfn(bad_page);
569
	}
570

I
Izik Eidus 已提交
571 572 573
	npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
				NULL);

574 575 576 577 578 579 580 581 582 583 584 585 586 587
	if (unlikely(npages != 1)) {
		struct vm_area_struct *vma;

		vma = find_vma(current->mm, addr);
		if (vma == NULL || addr < vma->vm_start ||
		    !(vma->vm_flags & VM_PFNMAP)) {
			get_page(bad_page);
			return page_to_pfn(bad_page);
		}

		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
		BUG_ON(pfn_valid(pfn));
	} else
		pfn = page_to_pfn(page[0]);
588

589
	return pfn;
590 591 592 593 594 595
}

EXPORT_SYMBOL_GPL(gfn_to_pfn);

/*
 * Translate @gfn into a struct page via gfn_to_pfn().
 *
 * If the resulting pfn is not valid, a warning is emitted and bad_page is
 * returned with an extra reference taken on it.
 */
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
	pfn_t pfn = gfn_to_pfn(kvm, gfn);

	if (!pfn_valid(pfn)) {
		WARN_ON(!pfn_valid(pfn));

		get_page(bad_page);
		return bad_page;
	}

	return pfn_to_page(pfn);
}
607

A
Avi Kivity 已提交
608 609
EXPORT_SYMBOL_GPL(gfn_to_page);

610 611
void kvm_release_page_clean(struct page *page)
{
612
	kvm_release_pfn_clean(page_to_pfn(page));
613 614 615
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);

616 617
/*
 * Drop a page reference for @pfn.  Does nothing when pfn_valid() rejects
 * the pfn.
 */
void kvm_release_pfn_clean(pfn_t pfn)
{
	if (!pfn_valid(pfn))
		return;

	put_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);

623
void kvm_release_page_dirty(struct page *page)
624
{
625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643
	kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);

/* Mark @pfn dirty first, then release it as kvm_release_pfn_clean() does. */
void kvm_release_pfn_dirty(pfn_t pfn)
{
	kvm_set_pfn_dirty(pfn);

	kvm_release_pfn_clean(pfn);
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);

void kvm_set_page_dirty(struct page *page)
{
	kvm_set_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_set_page_dirty);

/*
 * Set the dirty flag on the page behind @pfn.  Invalid pfns and reserved
 * pages are left untouched.
 */
void kvm_set_pfn_dirty(pfn_t pfn)
{
	struct page *page;

	if (!pfn_valid(pfn))
		return;

	page = pfn_to_page(pfn);
	if (PageReserved(page))
		return;

	SetPageDirty(page);
}
650 651 652 653
EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);

/* Call mark_page_accessed() on the page behind @pfn, if the pfn is valid. */
void kvm_set_pfn_accessed(pfn_t pfn)
{
	if (!pfn_valid(pfn))
		return;

	mark_page_accessed(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);

/* Take a page reference for @pfn; a no-op when the pfn is not valid. */
void kvm_get_pfn(pfn_t pfn)
{
	if (!pfn_valid(pfn))
		return;

	get_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_get_pfn);
665

666 667 668 669 670 671 672 673 674 675 676
/*
 * Number of bytes that can be processed in the current page: the smaller
 * of @len and the room left between @offset and the end of the page.
 * Returns 0 once @len is 0, which ends the callers' loops.
 */
static int next_segment(unsigned long len, int offset)
{
	return (len > PAGE_SIZE - offset) ? PAGE_SIZE - offset : len;
}

/*
 * Read @len bytes from guest frame @gfn, starting @offset bytes into it,
 * into @data.
 *
 * Returns 0 on success, -EFAULT when the gfn has no host mapping or
 * copy_from_user() reports a failure.
 */
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
			int len)
{
	unsigned long hva;
	int left;

	hva = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(hva))
		return -EFAULT;

	left = copy_from_user(data, (void __user *)hva + offset, len);
	return left ? -EFAULT : 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page);

/*
 * Read @len bytes of guest memory starting at guest physical address @gpa
 * into @data, one page-bounded segment at a time.
 *
 * Returns 0 on success or the first negative value returned by
 * kvm_read_guest_page(); earlier segments have already been copied then.
 */
int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);
	int seg;
	int ret;

	for (;;) {
		seg = next_segment(len, offset);
		if (seg == 0)
			break;

		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;

		/* Only the first segment can start mid-page. */
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest);

710 711 712 713 714 715 716 717 718 719 720
/*
 * Read @len bytes from guest physical address @gpa into @data with page
 * faults disabled around the copy.
 *
 * The whole request is served by a single copy from the host address of
 * the first page; it is not split at page boundaries the way
 * kvm_read_guest() does.
 *
 * Returns 0 on success, -EFAULT when the gfn has no host mapping or the
 * copy reports a failure.
 */
int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
			  unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);
	unsigned long hva;
	int left;

	hva = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(hva))
		return -EFAULT;

	pagefault_disable();
	left = __copy_from_user_inatomic(data, (void __user *)hva + offset, len);
	pagefault_enable();

	return left ? -EFAULT : 0;
}
EXPORT_SYMBOL(kvm_read_guest_atomic);

730 731 732
/*
 * Write @len bytes from @data into guest frame @gfn, starting @offset
 * bytes into it, and mark the frame dirty afterwards.
 *
 * Returns 0 on success, -EFAULT when the gfn has no host mapping or
 * copy_to_user() reports a failure (the frame is not marked dirty then).
 */
int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
			 int offset, int len)
{
	unsigned long hva;
	int left;

	hva = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(hva))
		return -EFAULT;

	left = copy_to_user((void __user *)hva + offset, data, len);
	if (left)
		return -EFAULT;

	mark_page_dirty(kvm, gfn);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);

/*
 * Write @len bytes from @data to guest memory starting at guest physical
 * address @gpa, one page-bounded segment at a time.
 *
 * Returns 0 on success or the first negative value returned by
 * kvm_write_guest_page(); earlier segments have already been written then.
 */
int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
		    unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);
	int seg;
	int ret;

	for (;;) {
		seg = next_segment(len, offset);
		if (seg == 0)
			break;

		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;

		/* Only the first segment can start mid-page. */
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}

	return 0;
}

/*
 * Zero @len bytes of guest frame @gfn starting at @offset by writing from
 * empty_zero_page.  Return value is that of kvm_write_guest_page().
 */
int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
{
	const void *zeroes = empty_zero_page;

	return kvm_write_guest_page(kvm, gfn, zeroes, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);

/*
 * Zero @len bytes of guest memory starting at guest physical address
 * @gpa, one page-bounded segment at a time.
 *
 * Returns 0 on success or the first negative value returned by
 * kvm_clear_guest_page().
 */
int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);
	int seg;
	int ret;

	for (;;) {
		seg = next_segment(len, offset);
		if (seg == 0)
			break;

		ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
		if (ret < 0)
			return ret;

		/* Only the first segment can start mid-page. */
		offset = 0;
		len -= seg;
		++gfn;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);

A
Avi Kivity 已提交
792 793
/*
 * Record guest frame @gfn as dirty in its memory slot's dirty bitmap.
 * Nothing happens when no slot covers the (unaliased) gfn or the slot has
 * no dirty bitmap.
 */
void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot;
	unsigned long rel_gfn;

	gfn = unalias_gfn(kvm, gfn);
	memslot = __gfn_to_memslot(kvm, gfn);
	if (!memslot || !memslot->dirty_bitmap)
		return;

	rel_gfn = gfn - memslot->base_gfn;

	/* avoid RMW: only issue set_bit() when the bit is still clear */
	if (test_bit(rel_gfn, memslot->dirty_bitmap))
		return;
	set_bit(rel_gfn, memslot->dirty_bitmap);
}

E
Eddie Dong 已提交
807 808 809
/*
 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
 */
810
void kvm_vcpu_block(struct kvm_vcpu *vcpu)
811
{
812 813 814 815 816 817 818 819 820 821 822 823 824 825
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);

		if (kvm_cpu_has_interrupt(vcpu))
			break;
		if (kvm_cpu_has_pending_timer(vcpu))
			break;
		if (kvm_arch_vcpu_runnable(vcpu))
			break;
		if (signal_pending(current))
			break;

E
Eddie Dong 已提交
826 827 828 829
		vcpu_put(vcpu);
		schedule();
		vcpu_load(vcpu);
	}
830

831
	finish_wait(&vcpu->wq, &wait);
E
Eddie Dong 已提交
832 833
}

A
Avi Kivity 已提交
834 835
/*
 * Give up the CPU via cond_resched() when need_resched() says so.
 * @vcpu is not used.
 */
void kvm_resched(struct kvm_vcpu *vcpu)
{
	if (need_resched())
		cond_resched();
}
EXPORT_SYMBOL_GPL(kvm_resched);

842
static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
843 844 845 846
{
	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
	struct page *page;

847
	if (vmf->pgoff == 0)
848
		page = virt_to_page(vcpu->run);
A
Avi Kivity 已提交
849
#ifdef CONFIG_X86
850
	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
851
		page = virt_to_page(vcpu->arch.pio_data);
852 853 854 855
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
A
Avi Kivity 已提交
856
#endif
857
	else
858
		return VM_FAULT_SIGBUS;
859
	get_page(page);
860 861
	vmf->page = page;
	return 0;
862 863 864
}

static struct vm_operations_struct kvm_vcpu_vm_ops = {
865
	.fault = kvm_vcpu_fault,
866 867 868 869 870 871 872 873
};

/*
 * ->mmap handler of the vcpu file: install kvm_vcpu_vm_ops on the vma so
 * pages are supplied by kvm_vcpu_fault().  Always returns 0; @file is
 * unused.
 */
static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vcpu_vm_ops;

	return 0;
}

A
Avi Kivity 已提交
874 875 876 877
static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
	struct kvm_vcpu *vcpu = filp->private_data;

A
Al Viro 已提交
878
	kvm_put_kvm(vcpu->kvm);
A
Avi Kivity 已提交
879 880 881
	return 0;
}

882
static const struct file_operations kvm_vcpu_fops = {
A
Avi Kivity 已提交
883 884 885
	.release        = kvm_vcpu_release,
	.unlocked_ioctl = kvm_vcpu_ioctl,
	.compat_ioctl   = kvm_vcpu_ioctl,
886
	.mmap           = kvm_vcpu_mmap,
A
Avi Kivity 已提交
887 888 889 890 891 892 893
};

/*
 * Allocates an inode for the vcpu.
 */
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
A
Al Viro 已提交
894 895
	int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu);
	if (fd < 0)
A
Al Viro 已提交
896
		kvm_put_kvm(vcpu->kvm);
A
Avi Kivity 已提交
897 898 899
	return fd;
}

900 901 902 903 904 905 906 907 908
/*
 * Creates some virtual cpus.  Good luck creating more than one.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
{
	int r;
	struct kvm_vcpu *vcpu;

	if (!valid_vcpu(n))
R
Rusty Russell 已提交
909
		return -EINVAL;
910

911
	vcpu = kvm_arch_vcpu_create(kvm, n);
R
Rusty Russell 已提交
912 913
	if (IS_ERR(vcpu))
		return PTR_ERR(vcpu);
914

915 916
	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);

917 918 919 920
	r = kvm_arch_vcpu_setup(vcpu);
	if (r)
		goto vcpu_destroy;

S
Shaohua Li 已提交
921
	mutex_lock(&kvm->lock);
R
Rusty Russell 已提交
922 923
	if (kvm->vcpus[n]) {
		r = -EEXIST;
S
Shaohua Li 已提交
924
		mutex_unlock(&kvm->lock);
925
		goto vcpu_destroy;
R
Rusty Russell 已提交
926 927
	}
	kvm->vcpus[n] = vcpu;
S
Shaohua Li 已提交
928
	mutex_unlock(&kvm->lock);
929

R
Rusty Russell 已提交
930
	/* Now it's all set up, let userspace reach it */
A
Al Viro 已提交
931
	kvm_get_kvm(kvm);
A
Avi Kivity 已提交
932 933
	r = create_vcpu_fd(vcpu);
	if (r < 0)
R
Rusty Russell 已提交
934 935
		goto unlink;
	return r;
936

R
Rusty Russell 已提交
937
unlink:
S
Shaohua Li 已提交
938
	mutex_lock(&kvm->lock);
R
Rusty Russell 已提交
939
	kvm->vcpus[n] = NULL;
S
Shaohua Li 已提交
940
	mutex_unlock(&kvm->lock);
941
vcpu_destroy:
942
	kvm_arch_vcpu_destroy(vcpu);
943 944 945
	return r;
}

A
Avi Kivity 已提交
946 947 948 949 950 951 952 953 954 955 956
/*
 * Install or remove the vcpu's signal mask.
 *
 * With a NULL @sigset the mask is deactivated.  Otherwise SIGKILL and
 * SIGSTOP are removed from *@sigset (the caller's copy is modified), the
 * result is stored in vcpu->sigset and the mask is activated.
 * Always returns 0.
 */
static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
{
	if (sigset == NULL) {
		vcpu->sigset_active = 0;
		return 0;
	}

	sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
	vcpu->sigset_active = 1;
	vcpu->sigset = *sigset;
	return 0;
}

A
Avi Kivity 已提交
957 958
static long kvm_vcpu_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
A
Avi Kivity 已提交
959
{
A
Avi Kivity 已提交
960
	struct kvm_vcpu *vcpu = filp->private_data;
A
Al Viro 已提交
961
	void __user *argp = (void __user *)arg;
962
	int r;
A
Avi Kivity 已提交
963

964 965
	if (vcpu->kvm->mm != current->mm)
		return -EIO;
A
Avi Kivity 已提交
966
	switch (ioctl) {
967
	case KVM_RUN:
968 969 970
		r = -EINVAL;
		if (arg)
			goto out;
971
		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
A
Avi Kivity 已提交
972 973
		break;
	case KVM_GET_REGS: {
974
		struct kvm_regs *kvm_regs;
A
Avi Kivity 已提交
975

976 977 978
		r = -ENOMEM;
		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
		if (!kvm_regs)
A
Avi Kivity 已提交
979
			goto out;
980 981 982
		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
		if (r)
			goto out_free1;
A
Avi Kivity 已提交
983
		r = -EFAULT;
984 985
		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
			goto out_free1;
A
Avi Kivity 已提交
986
		r = 0;
987 988
out_free1:
		kfree(kvm_regs);
A
Avi Kivity 已提交
989 990 991
		break;
	}
	case KVM_SET_REGS: {
992
		struct kvm_regs *kvm_regs;
A
Avi Kivity 已提交
993

994 995 996
		r = -ENOMEM;
		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
		if (!kvm_regs)
A
Avi Kivity 已提交
997
			goto out;
998 999 1000 1001
		r = -EFAULT;
		if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
			goto out_free2;
		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
A
Avi Kivity 已提交
1002
		if (r)
1003
			goto out_free2;
A
Avi Kivity 已提交
1004
		r = 0;
1005 1006
out_free2:
		kfree(kvm_regs);
A
Avi Kivity 已提交
1007 1008 1009 1010 1011
		break;
	}
	case KVM_GET_SREGS: {
		struct kvm_sregs kvm_sregs;

A
Avi Kivity 已提交
1012
		memset(&kvm_sregs, 0, sizeof kvm_sregs);
1013
		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
A
Avi Kivity 已提交
1014 1015 1016
		if (r)
			goto out;
		r = -EFAULT;
A
Al Viro 已提交
1017
		if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
A
Avi Kivity 已提交
1018 1019 1020 1021 1022 1023 1024 1025
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SREGS: {
		struct kvm_sregs kvm_sregs;

		r = -EFAULT;
A
Al Viro 已提交
1026
		if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
A
Avi Kivity 已提交
1027
			goto out;
1028
		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
A
Avi Kivity 已提交
1029 1030 1031 1032 1033
		if (r)
			goto out;
		r = 0;
		break;
	}
1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057
	case KVM_GET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &mp_state, sizeof mp_state))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = -EFAULT;
		if (copy_from_user(&mp_state, argp, sizeof mp_state))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = 0;
		break;
	}
A
Avi Kivity 已提交
1058 1059 1060 1061
	case KVM_TRANSLATE: {
		struct kvm_translation tr;

		r = -EFAULT;
A
Al Viro 已提交
1062
		if (copy_from_user(&tr, argp, sizeof tr))
A
Avi Kivity 已提交
1063
			goto out;
1064
		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
A
Avi Kivity 已提交
1065 1066 1067
		if (r)
			goto out;
		r = -EFAULT;
A
Al Viro 已提交
1068
		if (copy_to_user(argp, &tr, sizeof tr))
A
Avi Kivity 已提交
1069 1070 1071 1072 1073 1074 1075 1076
			goto out;
		r = 0;
		break;
	}
	case KVM_DEBUG_GUEST: {
		struct kvm_debug_guest dbg;

		r = -EFAULT;
A
Al Viro 已提交
1077
		if (copy_from_user(&dbg, argp, sizeof dbg))
A
Avi Kivity 已提交
1078
			goto out;
1079
		r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
A
Avi Kivity 已提交
1080 1081 1082 1083 1084
		if (r)
			goto out;
		r = 0;
		break;
	}
A
Avi Kivity 已提交
1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		sigset_t sigset, *p;

		p = NULL;
		if (argp) {
			r = -EFAULT;
			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof kvm_sigmask))
				goto out;
			r = -EINVAL;
			if (kvm_sigmask.len != sizeof sigset)
				goto out;
			r = -EFAULT;
			if (copy_from_user(&sigset, sigmask_arg->sigset,
					   sizeof sigset))
				goto out;
			p = &sigset;
		}
		r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
		break;
	}
A
Avi Kivity 已提交
1108 1109 1110 1111
	case KVM_GET_FPU: {
		struct kvm_fpu fpu;

		memset(&fpu, 0, sizeof fpu);
1112
		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
A
Avi Kivity 已提交
1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &fpu, sizeof fpu))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_FPU: {
		struct kvm_fpu fpu;

		r = -EFAULT;
		if (copy_from_user(&fpu, argp, sizeof fpu))
			goto out;
1127
		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
A
Avi Kivity 已提交
1128 1129 1130 1131 1132
		if (r)
			goto out;
		r = 0;
		break;
	}
A
Avi Kivity 已提交
1133
	default:
1134
		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
A
Avi Kivity 已提交
1135 1136 1137 1138 1139 1140 1141 1142 1143 1144
	}
out:
	return r;
}

static long kvm_vm_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;
1145
	int r;
A
Avi Kivity 已提交
1146

1147 1148
	if (kvm->mm != current->mm)
		return -EIO;
A
Avi Kivity 已提交
1149 1150 1151 1152 1153 1154
	switch (ioctl) {
	case KVM_CREATE_VCPU:
		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
		if (r < 0)
			goto out;
		break;
1155 1156 1157 1158 1159 1160 1161 1162 1163
	case KVM_SET_USER_MEMORY_REGION: {
		struct kvm_userspace_memory_region kvm_userspace_mem;

		r = -EFAULT;
		if (copy_from_user(&kvm_userspace_mem, argp,
						sizeof kvm_userspace_mem))
			goto out;

		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
A
Avi Kivity 已提交
1164 1165 1166 1167 1168 1169 1170 1171
		if (r)
			goto out;
		break;
	}
	case KVM_GET_DIRTY_LOG: {
		struct kvm_dirty_log log;

		r = -EFAULT;
A
Al Viro 已提交
1172
		if (copy_from_user(&log, argp, sizeof log))
A
Avi Kivity 已提交
1173
			goto out;
1174
		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
A
Avi Kivity 已提交
1175 1176 1177 1178
		if (r)
			goto out;
		break;
	}
1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	case KVM_REGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = -ENXIO;
		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_UNREGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = -ENXIO;
		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
		if (r)
			goto out;
		r = 0;
		break;
	}
#endif
1205
	default:
1206
		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
1207 1208 1209 1210 1211
	}
out:
	return r;
}

1212
static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1213 1214 1215 1216
{
	struct kvm *kvm = vma->vm_file->private_data;
	struct page *page;

1217 1218
	if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
		return VM_FAULT_SIGBUS;
1219
	page = gfn_to_page(kvm, vmf->pgoff);
1220
	if (is_error_page(page)) {
1221
		kvm_release_page_clean(page);
1222
		return VM_FAULT_SIGBUS;
1223
	}
1224 1225
	vmf->page = page;
	return 0;
1226 1227 1228
}

static struct vm_operations_struct kvm_vm_vm_ops = {
1229
	.fault = kvm_vm_fault,
1230 1231 1232 1233 1234 1235 1236 1237
};

/*
 * mmap handler for VM file descriptors.  No pages are mapped up front;
 * the VMA is simply pointed at kvm_vm_vm_ops.  Always returns 0.
 */
static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vm_vm_ops;

	return 0;
}

1238
static const struct file_operations kvm_vm_fops = {
1239 1240 1241 1242 1243 1244 1245 1246
	.release        = kvm_vm_release,
	.unlocked_ioctl = kvm_vm_ioctl,
	.compat_ioctl   = kvm_vm_ioctl,
	.mmap           = kvm_vm_mmap,
};

static int kvm_dev_ioctl_create_vm(void)
{
A
Al Viro 已提交
1247
	int fd;
1248 1249 1250
	struct kvm *kvm;

	kvm = kvm_create_vm();
1251 1252
	if (IS_ERR(kvm))
		return PTR_ERR(kvm);
A
Al Viro 已提交
1253 1254
	fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm);
	if (fd < 0)
A
Al Viro 已提交
1255
		kvm_put_kvm(kvm);
1256 1257 1258 1259 1260 1261 1262

	return fd;
}

static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
1263
	long r = -EINVAL;
1264 1265 1266

	switch (ioctl) {
	case KVM_GET_API_VERSION:
1267 1268 1269
		r = -EINVAL;
		if (arg)
			goto out;
1270 1271 1272
		r = KVM_API_VERSION;
		break;
	case KVM_CREATE_VM:
1273 1274 1275
		r = -EINVAL;
		if (arg)
			goto out;
1276 1277
		r = kvm_dev_ioctl_create_vm();
		break;
1278
	case KVM_CHECK_EXTENSION:
1279
		r = kvm_dev_ioctl_check_extension(arg);
1280
		break;
1281 1282 1283 1284
	case KVM_GET_VCPU_MMAP_SIZE:
		r = -EINVAL;
		if (arg)
			goto out;
1285 1286 1287
		r = PAGE_SIZE;     /* struct kvm_run */
#ifdef CONFIG_X86
		r += PAGE_SIZE;    /* pio data page */
1288 1289 1290
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
		r += PAGE_SIZE;    /* coalesced mmio ring page */
1291
#endif
1292
		break;
1293 1294 1295 1296 1297
	case KVM_TRACE_ENABLE:
	case KVM_TRACE_PAUSE:
	case KVM_TRACE_DISABLE:
		r = kvm_trace_ioctl(ioctl, arg);
		break;
A
Avi Kivity 已提交
1298
	default:
1299
		return kvm_arch_dev_ioctl(filp, ioctl, arg);
A
Avi Kivity 已提交
1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310
	}
out:
	return r;
}

/*
 * File operations of the kvm misc device.  Not const: kvm_init() fills in
 * .owner at run time.  Native and compat ioctls share one handler.
 */
static struct file_operations kvm_chardev_ops = {
	.compat_ioctl   = kvm_dev_ioctl,
	.unlocked_ioctl = kvm_dev_ioctl,
};

static struct miscdevice kvm_dev = {
A
Avi Kivity 已提交
1311
	KVM_MINOR,
A
Avi Kivity 已提交
1312 1313 1314 1315
	"kvm",
	&kvm_chardev_ops,
};

1316 1317 1318 1319 1320 1321 1322
static void hardware_enable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (cpu_isset(cpu, cpus_hardware_enabled))
		return;
	cpu_set(cpu, cpus_hardware_enabled);
1323
	kvm_arch_hardware_enable(NULL);
1324 1325 1326 1327 1328 1329 1330 1331 1332
}

static void hardware_disable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (!cpu_isset(cpu, cpus_hardware_enabled))
		return;
	cpu_clear(cpu, cpus_hardware_enabled);
1333
	kvm_arch_hardware_disable(NULL);
1334 1335
}

A
Avi Kivity 已提交
1336 1337 1338 1339 1340
static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
			   void *v)
{
	int cpu = (long)v;

1341
	val &= ~CPU_TASKS_FROZEN;
A
Avi Kivity 已提交
1342
	switch (val) {
1343
	case CPU_DYING:
1344 1345 1346 1347
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		hardware_disable(NULL);
		break;
A
Avi Kivity 已提交
1348
	case CPU_UP_CANCELED:
1349 1350
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
1351
		smp_call_function_single(cpu, hardware_disable, NULL, 1);
A
Avi Kivity 已提交
1352
		break;
1353 1354 1355
	case CPU_ONLINE:
		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
		       cpu);
1356
		smp_call_function_single(cpu, hardware_enable, NULL, 1);
A
Avi Kivity 已提交
1357 1358 1359 1360 1361
		break;
	}
	return NOTIFY_OK;
}

1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373

/*
 * Fault handler used around reboot.  When kvm_rebooting is set the CPU
 * simply spins; otherwise the fault is unexpected and BUG() is raised.
 */
asmlinkage void kvm_handle_fault_on_reboot(void)
{
	if (kvm_rebooting) {
		/* The reset is in progress: wait here until it takes effect. */
		for (;;)
			;
	}

	/* Not rebooting, so this fault is genuine and we want the trace. */
	BUG();
}
EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);

1374
static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
M
Mike Day 已提交
1375
		      void *v)
1376 1377 1378 1379 1380 1381 1382
{
	if (val == SYS_RESTART) {
		/*
		 * Some (well, at least mine) BIOSes hang on reboot if
		 * in vmx root mode.
		 */
		printk(KERN_INFO "kvm: exiting hardware virtualization\n");
1383
		kvm_rebooting = true;
1384
		on_each_cpu(hardware_disable, NULL, 1);
1385 1386 1387 1388 1389 1390 1391 1392 1393
	}
	return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
	.notifier_call = kvm_reboot,
	.priority = 0,
};

1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409
/* Reset an I/O bus to the all-zero state, i.e. with no devices registered. */
void kvm_io_bus_init(struct kvm_io_bus *bus)
{
	memset(bus, 0, sizeof *bus);
}

/* Run kvm_iodevice_destructor() on every device registered on the bus. */
void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
	int idx = 0;

	while (idx < bus->dev_count) {
		kvm_iodevice_destructor(bus->devs[idx]);
		idx++;
	}
}

1410 1411
/*
 * Find the device on @bus that claims the access described by @addr,
 * @len and @is_write.
 *
 * Devices are asked in registration order through their in_range()
 * callback.  Returns the first one that accepts, or NULL if none does.
 */
struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus,
					  gpa_t addr, int len, int is_write)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		if (pos->in_range(pos, addr, len, is_write))
			return pos;
	}

	return NULL;
}

/*
 * Append @dev to @bus.  The bus holds at most NR_IOBUS_DEVS devices;
 * registering onto a full bus triggers BUG_ON().
 */
void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
{
	BUG_ON(bus->dev_count > (NR_IOBUS_DEVS - 1));

	bus->devs[bus->dev_count] = dev;
	bus->dev_count++;
}

A
Avi Kivity 已提交
1432 1433 1434 1435 1436
/* CPU hotplug notifier block; registered in kvm_init(). */
static struct notifier_block kvm_cpu_notifier = {
	/* must be > scheduler priority */
	.priority      = 20,
	.notifier_call = kvm_cpu_hotplug,
};

1437
/*
 * debugfs getter for a per-VM statistic.
 *
 * @_offset is the byte offset of a u32 counter inside struct kvm, passed
 * as the file's private data.  The counters of all VMs on vm_list are
 * summed into *val while kvm_lock is held.  Always returns 0.
 */
static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		*val += *(u32 *)((void *)kvm + offset);
	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");

1452
static int vcpu_stat_get(void *_offset, u64 *val)
A
Avi Kivity 已提交
1453 1454 1455 1456 1457 1458
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

1459
	*val = 0;
A
Avi Kivity 已提交
1460 1461 1462
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
R
Rusty Russell 已提交
1463 1464
			vcpu = kvm->vcpus[i];
			if (vcpu)
1465
				*val += *(u32 *)((void *)vcpu + offset);
A
Avi Kivity 已提交
1466 1467
		}
	spin_unlock(&kvm_lock);
1468
	return 0;
A
Avi Kivity 已提交
1469 1470
}

1471 1472 1473 1474 1475 1476
DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

/* debugfs file operations, indexed by the kind of a statistics entry. */
static struct file_operations *stat_fops[] = {
	[KVM_STAT_VM]   = &vm_stat_fops,
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
};
A
Avi Kivity 已提交
1477

1478
static void kvm_init_debug(void)
A
Avi Kivity 已提交
1479 1480 1481
{
	struct kvm_stats_debugfs_item *p;

1482
	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
A
Avi Kivity 已提交
1483
	for (p = debugfs_entries; p->name; ++p)
1484
		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
A
Avi Kivity 已提交
1485
						(void *)(long)p->offset,
1486
						stat_fops[p->kind]);
A
Avi Kivity 已提交
1487 1488 1489 1490 1491 1492 1493 1494
}

static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
1495
	debugfs_remove(kvm_debugfs_dir);
A
Avi Kivity 已提交
1496 1497
}

1498 1499
/*
 * sysdev suspend hook: disable virtualization on the CPU this runs on.
 * Always returns 0.
 */
static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
	hardware_disable(NULL);
	return 0;
}

static int kvm_resume(struct sys_device *dev)
{
A
Avi Kivity 已提交
1506
	hardware_enable(NULL);
1507 1508 1509 1510
	return 0;
}

static struct sysdev_class kvm_sysdev_class = {
1511
	.name = "kvm",
1512 1513 1514 1515 1516 1517 1518 1519 1520
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

/* The single sysdev instance belonging to kvm_sysdev_class. */
static struct sys_device kvm_sysdev = {
	.cls = &kvm_sysdev_class,
	.id  = 0,
};

1521
struct page *bad_page;
1522
pfn_t bad_pfn;
A
Avi Kivity 已提交
1523

1524 1525 1526 1527 1528 1529 1530 1531 1532 1533
/* Map an embedded preempt notifier back to the vcpu that contains it. */
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	struct kvm_vcpu *owner;

	owner = container_of(pn, struct kvm_vcpu, preempt_notifier);
	return owner;
}

/*
 * Preempt-notifier "sched in" callback: load the vcpu that owns @pn
 * onto @cpu.
 */
static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_load(vcpu, cpu);
}

/*
 * Preempt-notifier "sched out" callback: put the vcpu that owns @pn.
 * @next is not used.
 */
static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_put(vcpu);
}

1545
int kvm_init(void *opaque, unsigned int vcpu_size,
1546
		  struct module *module)
A
Avi Kivity 已提交
1547 1548
{
	int r;
Y
Yang, Sheng 已提交
1549
	int cpu;
A
Avi Kivity 已提交
1550

1551 1552
	kvm_init_debug();

1553 1554
	r = kvm_arch_init(opaque);
	if (r)
1555
		goto out_fail;
1556 1557 1558 1559 1560 1561 1562 1563

	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (bad_page == NULL) {
		r = -ENOMEM;
		goto out;
	}

1564 1565
	bad_pfn = page_to_pfn(bad_page);

1566
	r = kvm_arch_hardware_setup();
A
Avi Kivity 已提交
1567
	if (r < 0)
1568
		goto out_free_0;
A
Avi Kivity 已提交
1569

Y
Yang, Sheng 已提交
1570 1571
	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu,
1572
				kvm_arch_check_processor_compat,
1573
				&r, 1);
Y
Yang, Sheng 已提交
1574
		if (r < 0)
1575
			goto out_free_1;
Y
Yang, Sheng 已提交
1576 1577
	}

1578
	on_each_cpu(hardware_enable, NULL, 1);
A
Avi Kivity 已提交
1579 1580
	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
1581
		goto out_free_2;
A
Avi Kivity 已提交
1582 1583
	register_reboot_notifier(&kvm_reboot_notifier);

1584 1585
	r = sysdev_class_register(&kvm_sysdev_class);
	if (r)
1586
		goto out_free_3;
1587 1588 1589

	r = sysdev_register(&kvm_sysdev);
	if (r)
1590
		goto out_free_4;
1591

1592 1593
	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
J
Joe Perches 已提交
1594 1595
					   __alignof__(struct kvm_vcpu),
					   0, NULL);
1596 1597
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
1598
		goto out_free_5;
1599 1600
	}

A
Avi Kivity 已提交
1601 1602 1603 1604
	kvm_chardev_ops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
M
Mike Day 已提交
1605
		printk(KERN_ERR "kvm: misc device register failed\n");
A
Avi Kivity 已提交
1606 1607 1608
		goto out_free;
	}

1609 1610 1611
	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

1612
	return 0;
A
Avi Kivity 已提交
1613 1614

out_free:
1615
	kmem_cache_destroy(kvm_vcpu_cache);
1616
out_free_5:
1617
	sysdev_unregister(&kvm_sysdev);
1618
out_free_4:
1619
	sysdev_class_unregister(&kvm_sysdev_class);
1620
out_free_3:
A
Avi Kivity 已提交
1621
	unregister_reboot_notifier(&kvm_reboot_notifier);
A
Avi Kivity 已提交
1622
	unregister_cpu_notifier(&kvm_cpu_notifier);
1623
out_free_2:
1624
	on_each_cpu(hardware_disable, NULL, 1);
1625
out_free_1:
1626
	kvm_arch_hardware_unsetup();
1627 1628
out_free_0:
	__free_page(bad_page);
1629
out:
1630
	kvm_arch_exit();
1631
	kvm_exit_debug();
1632
out_fail:
A
Avi Kivity 已提交
1633 1634
	return r;
}
1635
EXPORT_SYMBOL_GPL(kvm_init);
A
Avi Kivity 已提交
1636

1637
void kvm_exit(void)
A
Avi Kivity 已提交
1638
{
1639
	kvm_trace_cleanup();
A
Avi Kivity 已提交
1640
	misc_deregister(&kvm_dev);
1641
	kmem_cache_destroy(kvm_vcpu_cache);
1642 1643
	sysdev_unregister(&kvm_sysdev);
	sysdev_class_unregister(&kvm_sysdev_class);
A
Avi Kivity 已提交
1644
	unregister_reboot_notifier(&kvm_reboot_notifier);
1645
	unregister_cpu_notifier(&kvm_cpu_notifier);
1646
	on_each_cpu(hardware_disable, NULL, 1);
1647
	kvm_arch_hardware_unsetup();
1648
	kvm_arch_exit();
A
Avi Kivity 已提交
1649
	kvm_exit_debug();
1650
	__free_page(bad_page);
A
Avi Kivity 已提交
1651
}
1652
EXPORT_SYMBOL_GPL(kvm_exit);