kvm_main.c 33.9 KB
Newer Older
A
Avi Kivity 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

18
#include "iodev.h"
A
Avi Kivity 已提交
19

20
#include <linux/kvm_host.h>
A
Avi Kivity 已提交
21 22 23 24 25 26 27 28 29 30 31 32
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
33
#include <linux/sysdev.h>
A
Avi Kivity 已提交
34
#include <linux/cpu.h>
A
Alexey Dobriyan 已提交
35
#include <linux/sched.h>
36 37
#include <linux/cpumask.h>
#include <linux/smp.h>
38
#include <linux/anon_inodes.h>
39
#include <linux/profile.h>
40
#include <linux/kvm_para.h>
41
#include <linux/pagemap.h>
42
#include <linux/mman.h>
43
#include <linux/swap.h>
A
Avi Kivity 已提交
44

A
Avi Kivity 已提交
45 46 47
#include <asm/processor.h>
#include <asm/io.h>
#include <asm/uaccess.h>
48
#include <asm/pgtable.h>
A
Avi Kivity 已提交
49

50 51 52 53
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
#include "coalesced_mmio.h"
#endif

A
Avi Kivity 已提交
54 55 56
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

57 58
DEFINE_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);
59

60 61
static cpumask_t cpus_hardware_enabled;

62 63
struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
A
Avi Kivity 已提交
64

65 66
static __read_mostly struct preempt_ops kvm_preempt_ops;

67
struct dentry *kvm_debugfs_dir;
A
Avi Kivity 已提交
68

A
Avi Kivity 已提交
69 70 71
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);

72 73
bool kvm_rebooting;

74 75 76 77 78
/* Returns nonzero when @n lies in the range [0, KVM_MAX_VCPUS). */
static inline int valid_vcpu(int n)
{
	return likely(0 <= n && n < KVM_MAX_VCPUS);
}

A
Avi Kivity 已提交
79 80 81
/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
82
void vcpu_load(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
83
{
84 85
	int cpu;

A
Avi Kivity 已提交
86
	mutex_lock(&vcpu->mutex);
87 88
	cpu = get_cpu();
	preempt_notifier_register(&vcpu->preempt_notifier);
89
	kvm_arch_vcpu_load(vcpu, cpu);
90
	put_cpu();
A
Avi Kivity 已提交
91 92
}

93
void vcpu_put(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
94
{
95
	preempt_disable();
96
	kvm_arch_vcpu_put(vcpu);
97 98
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	preempt_enable();
A
Avi Kivity 已提交
99 100 101
	mutex_unlock(&vcpu->mutex);
}

102 103 104 105 106 107
/*
 * Intentionally empty: passed as the callback to smp_call_function_mask()
 * by the remote TLB flush / MMU reload helpers in this file.
 */
static void ack_flush(void *_completed)
{
}

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
108
	int i, cpu, me;
109 110 111
	cpumask_t cpus;
	struct kvm_vcpu *vcpu;

112
	me = get_cpu();
113
	cpus_clear(cpus);
R
Rusty Russell 已提交
114 115 116 117
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		vcpu = kvm->vcpus[i];
		if (!vcpu)
			continue;
118
		if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
119 120
			continue;
		cpu = vcpu->cpu;
121
		if (cpu != -1 && cpu != me)
122
			cpu_set(cpu, cpus);
123
	}
124
	if (cpus_empty(cpus))
125
		goto out;
126
	++kvm->stat.remote_tlb_flush;
127
	smp_call_function_mask(cpus, ack_flush, NULL, 1);
128 129
out:
	put_cpu();
130 131
}

132 133
void kvm_reload_remote_mmus(struct kvm *kvm)
{
134
	int i, cpu, me;
135 136 137
	cpumask_t cpus;
	struct kvm_vcpu *vcpu;

138
	me = get_cpu();
139 140 141 142 143 144 145 146
	cpus_clear(cpus);
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		vcpu = kvm->vcpus[i];
		if (!vcpu)
			continue;
		if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
			continue;
		cpu = vcpu->cpu;
147
		if (cpu != -1 && cpu != me)
148 149 150
			cpu_set(cpu, cpus);
	}
	if (cpus_empty(cpus))
151
		goto out;
152
	smp_call_function_mask(cpus, ack_flush, NULL, 1);
153 154
out:
	put_cpu();
155 156 157
}


R
Rusty Russell 已提交
158 159 160 161 162 163 164 165 166
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	struct page *page;
	int r;

	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
E
Eddie Dong 已提交
167
	init_waitqueue_head(&vcpu->wq);
R
Rusty Russell 已提交
168 169 170 171 172 173 174 175

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail;
	}
	vcpu->run = page_address(page);

176
	r = kvm_arch_vcpu_init(vcpu);
R
Rusty Russell 已提交
177
	if (r < 0)
178
		goto fail_free_run;
R
Rusty Russell 已提交
179 180 181 182 183
	return 0;

fail_free_run:
	free_page((unsigned long)vcpu->run);
fail:
184
	return r;
R
Rusty Russell 已提交
185 186 187 188 189
}
EXPORT_SYMBOL_GPL(kvm_vcpu_init);

void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
190
	kvm_arch_vcpu_uninit(vcpu);
R
Rusty Russell 已提交
191 192 193 194
	free_page((unsigned long)vcpu->run);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);

195
static struct kvm *kvm_create_vm(void)
A
Avi Kivity 已提交
196
{
197
	struct kvm *kvm = kvm_arch_create_vm();
198 199 200
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	struct page *page;
#endif
A
Avi Kivity 已提交
201

202 203
	if (IS_ERR(kvm))
		goto out;
A
Avi Kivity 已提交
204

205 206 207 208 209 210 211 212 213 214
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		kfree(kvm);
		return ERR_PTR(-ENOMEM);
	}
	kvm->coalesced_mmio_ring =
			(struct kvm_coalesced_mmio_ring *)page_address(page);
#endif

215 216
	kvm->mm = current->mm;
	atomic_inc(&kvm->mm->mm_count);
217
	spin_lock_init(&kvm->mmu_lock);
218
	kvm_io_bus_init(&kvm->pio_bus);
S
Shaohua Li 已提交
219
	mutex_init(&kvm->lock);
220
	kvm_io_bus_init(&kvm->mmio_bus);
221
	init_rwsem(&kvm->slots_lock);
I
Izik Eidus 已提交
222
	atomic_set(&kvm->users_count, 1);
223 224 225
	spin_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	spin_unlock(&kvm_lock);
226 227 228
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	kvm_coalesced_mmio_init(kvm);
#endif
229
out:
230 231 232
	return kvm;
}

A
Avi Kivity 已提交
233 234 235 236 237 238
/*
 * Free any memory in @free but not in @dont.
 */
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
				  struct kvm_memory_slot *dont)
{
239 240
	if (!dont || free->rmap != dont->rmap)
		vfree(free->rmap);
A
Avi Kivity 已提交
241 242 243 244

	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
		vfree(free->dirty_bitmap);

M
Marcelo Tosatti 已提交
245 246 247
	if (!dont || free->lpage_info != dont->lpage_info)
		vfree(free->lpage_info);

A
Avi Kivity 已提交
248
	free->npages = 0;
A
Al Viro 已提交
249
	free->dirty_bitmap = NULL;
250
	free->rmap = NULL;
M
Marcelo Tosatti 已提交
251
	free->lpage_info = NULL;
A
Avi Kivity 已提交
252 253
}

254
void kvm_free_physmem(struct kvm *kvm)
A
Avi Kivity 已提交
255 256 257 258
{
	int i;

	for (i = 0; i < kvm->nmemslots; ++i)
A
Al Viro 已提交
259
		kvm_free_physmem_slot(&kvm->memslots[i], NULL);
A
Avi Kivity 已提交
260 261
}

262 263
/*
 * Tear down a VM: unlink it from vm_list, destroy both I/O buses, free the
 * coalesced MMIO ring page (when configured and present), hand the VM to the
 * architecture code and finally drop the mm reference taken at creation.
 */
static void kvm_destroy_vm(struct kvm *kvm)
{
	/*
	 * Read the mm up front.  NOTE(review): presumably because
	 * kvm_arch_destroy_vm() releases @kvm -- confirm against the arch code.
	 */
	struct mm_struct *owner_mm = kvm->mm;

	spin_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	spin_unlock(&kvm_lock);

	kvm_io_bus_destroy(&kvm->pio_bus);
	kvm_io_bus_destroy(&kvm->mmio_bus);

#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	if (kvm->coalesced_mmio_ring != NULL)
		free_page((unsigned long)kvm->coalesced_mmio_ring);
#endif

	kvm_arch_destroy_vm(kvm);
	mmdrop(owner_mm);
}

I
Izik Eidus 已提交
279 280 281 282 283 284 285 286 287 288 289 290 291 292
/* Take an additional reference on @kvm (increments users_count). */
void kvm_get_kvm(struct kvm *kvm)
{
	atomic_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

/*
 * Drop one reference on @kvm; the VM is destroyed when users_count reaches
 * zero.
 */
void kvm_put_kvm(struct kvm *kvm)
{
	if (!atomic_dec_and_test(&kvm->users_count))
		return;

	kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);


293 294 295 296
/*
 * ->release handler of the VM file: drops the reference on the VM stored in
 * the file's private_data.  Always returns 0.
 */
static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	kvm_put_kvm(filp->private_data);

	return 0;
}

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
306
 *
307
 * Must be called holding mmap_sem for write.
A
Avi Kivity 已提交
308
 */
309 310 311
int __kvm_set_memory_region(struct kvm *kvm,
			    struct kvm_userspace_memory_region *mem,
			    int user_alloc)
A
Avi Kivity 已提交
312 313 314 315 316 317 318 319 320 321 322 323 324 325
{
	int r;
	gfn_t base_gfn;
	unsigned long npages;
	unsigned long i;
	struct kvm_memory_slot *memslot;
	struct kvm_memory_slot old, new;

	r = -EINVAL;
	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
326
	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
A
Avi Kivity 已提交
327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346
		goto out;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		goto out;

	memslot = &kvm->memslots[mem->slot];
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;

	if (!npages)
		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

	new = old = *memslot;

	new.base_gfn = base_gfn;
	new.npages = npages;
	new.flags = mem->flags;

	/* Disallow changing a memory slot's size. */
	r = -EINVAL;
	if (npages && old.npages && npages != old.npages)
347
		goto out_free;
A
Avi Kivity 已提交
348 349 350 351 352 353 354 355 356 357

	/* Check for overlaps */
	r = -EEXIST;
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *s = &kvm->memslots[i];

		if (s == memslot)
			continue;
		if (!((base_gfn + npages <= s->base_gfn) ||
		      (base_gfn >= s->base_gfn + s->npages)))
358
			goto out_free;
A
Avi Kivity 已提交
359 360 361 362
	}

	/* Free page dirty bitmap if unneeded */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
A
Al Viro 已提交
363
		new.dirty_bitmap = NULL;
A
Avi Kivity 已提交
364 365 366 367

	r = -ENOMEM;

	/* Allocate if a slot is being created */
368
#ifndef CONFIG_S390
369
	if (npages && !new.rmap) {
M
Mike Day 已提交
370
		new.rmap = vmalloc(npages * sizeof(struct page *));
371 372

		if (!new.rmap)
373
			goto out_free;
374 375

		memset(new.rmap, 0, npages * sizeof(*new.rmap));
376

377
		new.user_alloc = user_alloc;
378
		new.userspace_addr = mem->userspace_addr;
A
Avi Kivity 已提交
379
	}
M
Marcelo Tosatti 已提交
380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398
	if (npages && !new.lpage_info) {
		int largepages = npages / KVM_PAGES_PER_HPAGE;
		if (npages % KVM_PAGES_PER_HPAGE)
			largepages++;
		if (base_gfn % KVM_PAGES_PER_HPAGE)
			largepages++;

		new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info));

		if (!new.lpage_info)
			goto out_free;

		memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info));

		if (base_gfn % KVM_PAGES_PER_HPAGE)
			new.lpage_info[0].write_count = 1;
		if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE)
			new.lpage_info[largepages-1].write_count = 1;
	}
A
Avi Kivity 已提交
399 400 401 402 403 404 405

	/* Allocate page dirty bitmap if needed */
	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
		unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;

		new.dirty_bitmap = vmalloc(dirty_bytes);
		if (!new.dirty_bitmap)
406
			goto out_free;
A
Avi Kivity 已提交
407 408
		memset(new.dirty_bitmap, 0, dirty_bytes);
	}
409
#endif /* not defined CONFIG_S390 */
A
Avi Kivity 已提交
410 411 412 413

	if (mem->slot >= kvm->nmemslots)
		kvm->nmemslots = mem->slot + 1;

414 415 416
	if (!npages)
		kvm_arch_flush_shadow(kvm);

417 418
	*memslot = new;

419 420 421 422
	r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
	if (r) {
		*memslot = old;
		goto out_free;
423 424
	}

A
Avi Kivity 已提交
425 426 427
	kvm_free_physmem_slot(&old, &new);
	return 0;

428
out_free:
A
Avi Kivity 已提交
429 430 431
	kvm_free_physmem_slot(&new, &old);
out:
	return r;
432 433

}
434 435 436 437 438 439 440 441
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
			  struct kvm_userspace_memory_region *mem,
			  int user_alloc)
{
	int r;

442
	down_write(&kvm->slots_lock);
443
	r = __kvm_set_memory_region(kvm, mem, user_alloc);
444
	up_write(&kvm->slots_lock);
445 446
	return r;
}
447 448
EXPORT_SYMBOL_GPL(kvm_set_memory_region);

449 450 451 452
int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
				   struct
				   kvm_userspace_memory_region *mem,
				   int user_alloc)
453
{
454 455
	if (mem->slot >= KVM_MEMORY_SLOTS)
		return -EINVAL;
456
	return kvm_set_memory_region(kvm, mem, user_alloc);
A
Avi Kivity 已提交
457 458
}

459 460
int kvm_get_dirty_log(struct kvm *kvm,
			struct kvm_dirty_log *log, int *is_dirty)
A
Avi Kivity 已提交
461 462 463 464 465 466 467 468 469 470 471 472 473 474 475
{
	struct kvm_memory_slot *memslot;
	int r, i;
	int n;
	unsigned long any = 0;

	r = -EINVAL;
	if (log->slot >= KVM_MEMORY_SLOTS)
		goto out;

	memslot = &kvm->memslots[log->slot];
	r = -ENOENT;
	if (!memslot->dirty_bitmap)
		goto out;

476
	n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
A
Avi Kivity 已提交
477

478
	for (i = 0; !any && i < n/sizeof(long); ++i)
A
Avi Kivity 已提交
479 480 481 482 483 484
		any = memslot->dirty_bitmap[i];

	r = -EFAULT;
	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
		goto out;

485 486
	if (any)
		*is_dirty = 1;
A
Avi Kivity 已提交
487 488 489 490 491 492

	r = 0;
out:
	return r;
}

493 494 495 496 497 498
/* Returns nonzero when @page is the bad_page marker. */
int is_error_page(struct page *page)
{
	return bad_page == page;
}
EXPORT_SYMBOL_GPL(is_error_page);

499 500 501 502 503 504
/* Returns nonzero when @pfn is the bad_pfn marker. */
int is_error_pfn(pfn_t pfn)
{
	return bad_pfn == pfn;
}
EXPORT_SYMBOL_GPL(is_error_pfn);

I
Izik Eidus 已提交
505 506 507 508 509 510 511 512 513 514 515
/* Marker value for "no host virtual address"; this is PAGE_OFFSET. */
static inline unsigned long bad_hva(void)
{
	return PAGE_OFFSET;
}

/* Returns nonzero when @addr is the marker value produced by bad_hva(). */
int kvm_is_error_hva(unsigned long addr)
{
	return bad_hva() == addr;
}
EXPORT_SYMBOL_GPL(kvm_is_error_hva);

516
/*
 * Find the memory slot containing @gfn among the first kvm->nmemslots slots.
 * @gfn is used as given (no alias translation).  Returns NULL when no slot
 * contains it.
 */
static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;
	int idx;

	for (idx = 0; idx < kvm->nmemslots; ++idx) {
		slot = &kvm->memslots[idx];

		if (gfn < slot->base_gfn)
			continue;
		if (gfn < slot->base_gfn + slot->npages)
			return slot;
	}

	return NULL;
}
529 530 531 532 533 534

/*
 * Like __gfn_to_memslot(), but @gfn is first passed through unalias_gfn().
 */
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_memslot(kvm, unalias_gfn(kvm, gfn));
}
A
Avi Kivity 已提交
535

536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551
/*
 * Returns 1 when @gfn (after unalias_gfn()) falls inside one of the first
 * KVM_MEMORY_SLOTS memory slots, 0 otherwise.
 */
int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;
	int idx;

	gfn = unalias_gfn(kvm, gfn);

	for (idx = 0; idx < KVM_MEMORY_SLOTS; ++idx) {
		slot = &kvm->memslots[idx];

		if (gfn < slot->base_gfn)
			continue;
		if (gfn < slot->base_gfn + slot->npages)
			return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

M
Marcelo Tosatti 已提交
552
/*
 * Translate a guest frame number into the host virtual address recorded for
 * its memory slot.  Returns bad_hva() when no slot contains @gfn; callers
 * test for that with kvm_is_error_hva().
 */
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;
	gfn_t real_gfn = unalias_gfn(kvm, gfn);

	slot = __gfn_to_memslot(kvm, real_gfn);
	if (slot)
		return (slot->userspace_addr +
			(real_gfn - slot->base_gfn) * PAGE_SIZE);

	return bad_hva();
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
I
Izik Eidus 已提交
563

564 565 566
/*
 * Requires current->mm->mmap_sem to be held
 */
567
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
A
Avi Kivity 已提交
568
{
569
	struct page *page[1];
I
Izik Eidus 已提交
570
	unsigned long addr;
571
	int npages;
572
	pfn_t pfn;
A
Avi Kivity 已提交
573

574 575
	might_sleep();

I
Izik Eidus 已提交
576 577
	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr)) {
578
		get_page(bad_page);
579
		return page_to_pfn(bad_page);
580
	}
581

I
Izik Eidus 已提交
582 583 584
	npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
				NULL);

585 586 587 588 589 590 591 592 593 594 595 596 597 598
	if (unlikely(npages != 1)) {
		struct vm_area_struct *vma;

		vma = find_vma(current->mm, addr);
		if (vma == NULL || addr < vma->vm_start ||
		    !(vma->vm_flags & VM_PFNMAP)) {
			get_page(bad_page);
			return page_to_pfn(bad_page);
		}

		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
		BUG_ON(pfn_valid(pfn));
	} else
		pfn = page_to_pfn(page[0]);
599

600
	return pfn;
601 602 603 604 605 606
}

EXPORT_SYMBOL_GPL(gfn_to_pfn);

/*
 * Translate a guest frame number into a struct page via gfn_to_pfn().
 *
 * When the resulting pfn is not valid a warning is emitted, a reference is
 * taken on bad_page and bad_page is returned.
 */
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
	pfn_t pfn = gfn_to_pfn(kvm, gfn);

	if (!pfn_valid(pfn)) {
		WARN_ON(!pfn_valid(pfn));

		get_page(bad_page);
		return bad_page;
	}

	return pfn_to_page(pfn);
}

EXPORT_SYMBOL_GPL(gfn_to_page);

621 622
void kvm_release_page_clean(struct page *page)
{
623
	kvm_release_pfn_clean(page_to_pfn(page));
624 625 626
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);

627 628
/* Drop a page reference for @pfn; does nothing when @pfn is not valid. */
void kvm_release_pfn_clean(pfn_t pfn)
{
	if (!pfn_valid(pfn))
		return;

	put_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);

634
void kvm_release_page_dirty(struct page *page)
635
{
636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654
	kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);

/* Mark @pfn dirty, then release it as kvm_release_pfn_clean() does. */
void kvm_release_pfn_dirty(pfn_t pfn)
{
	kvm_set_pfn_dirty(pfn);

	kvm_release_pfn_clean(pfn);
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);

void kvm_set_page_dirty(struct page *page)
{
	kvm_set_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_set_page_dirty);

/*
 * Set the dirty flag on the page behind @pfn.  Invalid pfns and reserved
 * pages are left alone.
 */
void kvm_set_pfn_dirty(pfn_t pfn)
{
	struct page *page;

	if (!pfn_valid(pfn))
		return;

	page = pfn_to_page(pfn);
	if (PageReserved(page))
		return;

	SetPageDirty(page);
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);

/* Call mark_page_accessed() on the page behind @pfn, if @pfn is valid. */
void kvm_set_pfn_accessed(pfn_t pfn)
{
	if (!pfn_valid(pfn))
		return;

	mark_page_accessed(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);

/* Take a page reference for @pfn; does nothing when @pfn is not valid. */
void kvm_get_pfn(pfn_t pfn)
{
	if (!pfn_valid(pfn))
		return;

	get_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_get_pfn);
676

677 678 679 680 681 682 683 684 685 686 687
/*
 * Number of bytes to handle in the next step of a guest copy: @len, capped
 * at what is left in the current page after @offset.
 */
static int next_segment(unsigned long len, int offset)
{
	return (len > PAGE_SIZE - offset) ? PAGE_SIZE - offset : len;
}

/*
 * Copy @len bytes starting at @offset inside guest frame @gfn into @data.
 *
 * Returns 0 on success, -EFAULT when the frame has no host address or the
 * copy from user space fails.
 */
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
			int len)
{
	unsigned long hva = gfn_to_hva(kvm, gfn);

	if (kvm_is_error_hva(hva))
		return -EFAULT;

	if (copy_from_user(data, (void __user *)hva + offset, len))
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page);

/*
 * Copy @len bytes from guest physical address @gpa into @data, one page-sized
 * piece at a time via kvm_read_guest_page().
 *
 * Returns 0 on success or the first negative value returned for a piece.
 */
int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);
	int chunk;
	int err;

	for (;;) {
		chunk = next_segment(len, offset);
		if (chunk == 0)
			break;

		err = kvm_read_guest_page(kvm, gfn, data, offset, chunk);
		if (err < 0)
			return err;

		/* Only the first piece can start in the middle of a page. */
		offset = 0;
		len -= chunk;
		data += chunk;
		++gfn;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest);

721 722 723 724 725 726 727 728 729 730 731
/*
 * Copy @len bytes from guest physical address @gpa into @data with page
 * faults disabled, using __copy_from_user_inatomic().
 *
 * Only the frame containing @gpa is looked up; the copy is issued as a single
 * call.  Returns 0 on success, -EFAULT when the frame has no host address or
 * the copy does not complete.
 */
int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
			  unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);
	unsigned long hva;
	int left;

	hva = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(hva))
		return -EFAULT;

	pagefault_disable();
	left = __copy_from_user_inatomic(data, (void __user *)hva + offset,
					 len);
	pagefault_enable();

	return left ? -EFAULT : 0;
}
EXPORT_SYMBOL(kvm_read_guest_atomic);

741 742 743
/*
 * Copy @len bytes from @data to @offset inside guest frame @gfn and mark the
 * frame dirty.
 *
 * Returns 0 on success, -EFAULT when the frame has no host address or the
 * copy to user space fails (the frame is not marked dirty in that case).
 */
int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
			 int offset, int len)
{
	unsigned long hva = gfn_to_hva(kvm, gfn);

	if (kvm_is_error_hva(hva))
		return -EFAULT;

	if (copy_to_user((void __user *)hva + offset, data, len))
		return -EFAULT;

	mark_page_dirty(kvm, gfn);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);

/*
 * Copy @len bytes from @data to guest physical address @gpa, one page-sized
 * piece at a time via kvm_write_guest_page().
 *
 * Returns 0 on success or the first negative value returned for a piece.
 */
int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
		    unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);
	int chunk;
	int err;

	for (;;) {
		chunk = next_segment(len, offset);
		if (chunk == 0)
			break;

		err = kvm_write_guest_page(kvm, gfn, data, offset, chunk);
		if (err < 0)
			return err;

		/* Only the first piece can start in the middle of a page. */
		offset = 0;
		len -= chunk;
		data += chunk;
		++gfn;
	}

	return 0;
}

/*
 * Zero @len bytes at @offset inside guest frame @gfn by writing from
 * empty_zero_page.  Returns what kvm_write_guest_page() returns.
 */
int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
{
	const void *zeroes = empty_zero_page;

	return kvm_write_guest_page(kvm, gfn, zeroes, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);

/*
 * Zero @len bytes of guest memory starting at guest physical address @gpa,
 * one page-sized piece at a time via kvm_clear_guest_page().
 *
 * Returns 0 on success or the first negative value returned for a piece.
 */
int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);
	int chunk;
	int err;

	for (;;) {
		chunk = next_segment(len, offset);
		if (chunk == 0)
			break;

		err = kvm_clear_guest_page(kvm, gfn, offset, chunk);
		if (err < 0)
			return err;

		/* Only the first piece can start in the middle of a page. */
		offset = 0;
		len -= chunk;
		++gfn;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);

A
Avi Kivity 已提交
803 804
/*
 * Record guest frame @gfn as dirty in the dirty bitmap of its memory slot.
 * Nothing happens when the frame belongs to no slot or the slot has no dirty
 * bitmap.
 */
void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;
	unsigned long bit;

	gfn = unalias_gfn(kvm, gfn);
	slot = __gfn_to_memslot(kvm, gfn);
	if (!slot || !slot->dirty_bitmap)
		return;

	bit = gfn - slot->base_gfn;

	/* avoid RMW: only issue set_bit() when the bit is still clear */
	if (!test_bit(bit, slot->dirty_bitmap))
		set_bit(bit, slot->dirty_bitmap);
}

E
Eddie Dong 已提交
818 819 820
/*
 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
 */
821
void kvm_vcpu_block(struct kvm_vcpu *vcpu)
822
{
823 824 825 826 827 828 829 830 831 832 833 834 835 836
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);

		if (kvm_cpu_has_interrupt(vcpu))
			break;
		if (kvm_cpu_has_pending_timer(vcpu))
			break;
		if (kvm_arch_vcpu_runnable(vcpu))
			break;
		if (signal_pending(current))
			break;

E
Eddie Dong 已提交
837 838 839 840
		vcpu_put(vcpu);
		schedule();
		vcpu_load(vcpu);
	}
841

842
	finish_wait(&vcpu->wq, &wait);
E
Eddie Dong 已提交
843 844
}

A
Avi Kivity 已提交
845 846
/*
 * Give up the CPU via cond_resched() when a reschedule is pending.  @vcpu is
 * not used by this implementation.
 */
void kvm_resched(struct kvm_vcpu *vcpu)
{
	if (need_resched())
		cond_resched();
}
EXPORT_SYMBOL_GPL(kvm_resched);

853
static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
854 855 856 857
{
	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
	struct page *page;

858
	if (vmf->pgoff == 0)
859
		page = virt_to_page(vcpu->run);
A
Avi Kivity 已提交
860
#ifdef CONFIG_X86
861
	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
862
		page = virt_to_page(vcpu->arch.pio_data);
863 864 865 866
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
A
Avi Kivity 已提交
867
#endif
868
	else
869
		return VM_FAULT_SIGBUS;
870
	get_page(page);
871 872
	vmf->page = page;
	return 0;
873 874 875
}

static struct vm_operations_struct kvm_vcpu_vm_ops = {
876
	.fault = kvm_vcpu_fault,
877 878 879 880 881 882 883 884
};

/*
 * ->mmap handler of the vcpu file: installs kvm_vcpu_vm_ops on the vma, so
 * pages are supplied by kvm_vcpu_fault().  Always returns 0.
 */
static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vcpu_vm_ops;

	return 0;
}

A
Avi Kivity 已提交
885 886 887 888
static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
	struct kvm_vcpu *vcpu = filp->private_data;

A
Al Viro 已提交
889
	kvm_put_kvm(vcpu->kvm);
A
Avi Kivity 已提交
890 891 892
	return 0;
}

893
static const struct file_operations kvm_vcpu_fops = {
A
Avi Kivity 已提交
894 895 896
	.release        = kvm_vcpu_release,
	.unlocked_ioctl = kvm_vcpu_ioctl,
	.compat_ioctl   = kvm_vcpu_ioctl,
897
	.mmap           = kvm_vcpu_mmap,
A
Avi Kivity 已提交
898 899 900 901 902 903 904
};

/*
 * Allocates an inode for the vcpu.
 */
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
905
	int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0);
A
Al Viro 已提交
906
	if (fd < 0)
A
Al Viro 已提交
907
		kvm_put_kvm(vcpu->kvm);
A
Avi Kivity 已提交
908 909 910
	return fd;
}

911 912 913 914 915 916 917 918 919
/*
 * Creates some virtual cpus.  Good luck creating more than one.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
{
	int r;
	struct kvm_vcpu *vcpu;

	if (!valid_vcpu(n))
R
Rusty Russell 已提交
920
		return -EINVAL;
921

922
	vcpu = kvm_arch_vcpu_create(kvm, n);
R
Rusty Russell 已提交
923 924
	if (IS_ERR(vcpu))
		return PTR_ERR(vcpu);
925

926 927
	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);

928 929 930 931
	r = kvm_arch_vcpu_setup(vcpu);
	if (r)
		goto vcpu_destroy;

S
Shaohua Li 已提交
932
	mutex_lock(&kvm->lock);
R
Rusty Russell 已提交
933 934
	if (kvm->vcpus[n]) {
		r = -EEXIST;
S
Shaohua Li 已提交
935
		mutex_unlock(&kvm->lock);
936
		goto vcpu_destroy;
R
Rusty Russell 已提交
937 938
	}
	kvm->vcpus[n] = vcpu;
S
Shaohua Li 已提交
939
	mutex_unlock(&kvm->lock);
940

R
Rusty Russell 已提交
941
	/* Now it's all set up, let userspace reach it */
A
Al Viro 已提交
942
	kvm_get_kvm(kvm);
A
Avi Kivity 已提交
943 944
	r = create_vcpu_fd(vcpu);
	if (r < 0)
R
Rusty Russell 已提交
945 946
		goto unlink;
	return r;
947

R
Rusty Russell 已提交
948
unlink:
S
Shaohua Li 已提交
949
	mutex_lock(&kvm->lock);
R
Rusty Russell 已提交
950
	kvm->vcpus[n] = NULL;
S
Shaohua Li 已提交
951
	mutex_unlock(&kvm->lock);
952
vcpu_destroy:
953
	kvm_arch_vcpu_destroy(vcpu);
954 955 956
	return r;
}

A
Avi Kivity 已提交
957 958 959 960 961 962 963 964 965 966 967
/*
 * Install or remove the vcpu's signal mask.
 *
 * With a non-NULL @sigset, SIGKILL and SIGSTOP are removed from it (the
 * caller's copy is modified), the result is stored in the vcpu and the mask
 * is marked active.  With a NULL @sigset the mask is marked inactive.
 * Always returns 0.
 */
static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
{
	if (!sigset) {
		vcpu->sigset_active = 0;
		return 0;
	}

	sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
	vcpu->sigset_active = 1;
	vcpu->sigset = *sigset;
	return 0;
}

A
Avi Kivity 已提交
968 969
static long kvm_vcpu_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
A
Avi Kivity 已提交
970
{
A
Avi Kivity 已提交
971
	struct kvm_vcpu *vcpu = filp->private_data;
A
Al Viro 已提交
972
	void __user *argp = (void __user *)arg;
973
	int r;
A
Avi Kivity 已提交
974

975 976
	if (vcpu->kvm->mm != current->mm)
		return -EIO;
A
Avi Kivity 已提交
977
	switch (ioctl) {
978
	case KVM_RUN:
979 980 981
		r = -EINVAL;
		if (arg)
			goto out;
982
		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
A
Avi Kivity 已提交
983 984
		break;
	case KVM_GET_REGS: {
985
		struct kvm_regs *kvm_regs;
A
Avi Kivity 已提交
986

987 988 989
		r = -ENOMEM;
		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
		if (!kvm_regs)
A
Avi Kivity 已提交
990
			goto out;
991 992 993
		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
		if (r)
			goto out_free1;
A
Avi Kivity 已提交
994
		r = -EFAULT;
995 996
		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
			goto out_free1;
A
Avi Kivity 已提交
997
		r = 0;
998 999
out_free1:
		kfree(kvm_regs);
A
Avi Kivity 已提交
1000 1001 1002
		break;
	}
	case KVM_SET_REGS: {
1003
		struct kvm_regs *kvm_regs;
A
Avi Kivity 已提交
1004

1005 1006 1007
		r = -ENOMEM;
		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
		if (!kvm_regs)
A
Avi Kivity 已提交
1008
			goto out;
1009 1010 1011 1012
		r = -EFAULT;
		if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
			goto out_free2;
		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
A
Avi Kivity 已提交
1013
		if (r)
1014
			goto out_free2;
A
Avi Kivity 已提交
1015
		r = 0;
1016 1017
out_free2:
		kfree(kvm_regs);
A
Avi Kivity 已提交
1018 1019 1020 1021 1022
		break;
	}
	case KVM_GET_SREGS: {
		struct kvm_sregs kvm_sregs;

A
Avi Kivity 已提交
1023
		memset(&kvm_sregs, 0, sizeof kvm_sregs);
1024
		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
A
Avi Kivity 已提交
1025 1026 1027
		if (r)
			goto out;
		r = -EFAULT;
A
Al Viro 已提交
1028
		if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
A
Avi Kivity 已提交
1029 1030 1031 1032 1033 1034 1035 1036
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SREGS: {
		struct kvm_sregs kvm_sregs;

		r = -EFAULT;
A
Al Viro 已提交
1037
		if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
A
Avi Kivity 已提交
1038
			goto out;
1039
		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
A
Avi Kivity 已提交
1040 1041 1042 1043 1044
		if (r)
			goto out;
		r = 0;
		break;
	}
1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068
	case KVM_GET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &mp_state, sizeof mp_state))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = -EFAULT;
		if (copy_from_user(&mp_state, argp, sizeof mp_state))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = 0;
		break;
	}
A
Avi Kivity 已提交
1069 1070 1071 1072
	case KVM_TRANSLATE: {
		struct kvm_translation tr;

		r = -EFAULT;
A
Al Viro 已提交
1073
		if (copy_from_user(&tr, argp, sizeof tr))
A
Avi Kivity 已提交
1074
			goto out;
1075
		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
A
Avi Kivity 已提交
1076 1077 1078
		if (r)
			goto out;
		r = -EFAULT;
A
Al Viro 已提交
1079
		if (copy_to_user(argp, &tr, sizeof tr))
A
Avi Kivity 已提交
1080 1081 1082 1083 1084 1085 1086 1087
			goto out;
		r = 0;
		break;
	}
	case KVM_DEBUG_GUEST: {
		struct kvm_debug_guest dbg;

		r = -EFAULT;
A
Al Viro 已提交
1088
		if (copy_from_user(&dbg, argp, sizeof dbg))
A
Avi Kivity 已提交
1089
			goto out;
1090
		r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
A
Avi Kivity 已提交
1091 1092 1093 1094 1095
		if (r)
			goto out;
		r = 0;
		break;
	}
A
Avi Kivity 已提交
1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		sigset_t sigset, *p;

		p = NULL;
		if (argp) {
			r = -EFAULT;
			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof kvm_sigmask))
				goto out;
			r = -EINVAL;
			if (kvm_sigmask.len != sizeof sigset)
				goto out;
			r = -EFAULT;
			if (copy_from_user(&sigset, sigmask_arg->sigset,
					   sizeof sigset))
				goto out;
			p = &sigset;
		}
		r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
		break;
	}
A
Avi Kivity 已提交
1119 1120 1121 1122
	case KVM_GET_FPU: {
		struct kvm_fpu fpu;

		memset(&fpu, 0, sizeof fpu);
1123
		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
A
Avi Kivity 已提交
1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &fpu, sizeof fpu))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_FPU: {
		struct kvm_fpu fpu;

		r = -EFAULT;
		if (copy_from_user(&fpu, argp, sizeof fpu))
			goto out;
1138
		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
A
Avi Kivity 已提交
1139 1140 1141 1142 1143
		if (r)
			goto out;
		r = 0;
		break;
	}
A
Avi Kivity 已提交
1144
	default:
1145
		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
A
Avi Kivity 已提交
1146 1147 1148 1149 1150 1151 1152 1153 1154 1155
	}
out:
	return r;
}

static long kvm_vm_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;
1156
	int r;
A
Avi Kivity 已提交
1157

1158 1159
	if (kvm->mm != current->mm)
		return -EIO;
A
Avi Kivity 已提交
1160 1161 1162 1163 1164 1165
	switch (ioctl) {
	case KVM_CREATE_VCPU:
		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
		if (r < 0)
			goto out;
		break;
1166 1167 1168 1169 1170 1171 1172 1173 1174
	case KVM_SET_USER_MEMORY_REGION: {
		struct kvm_userspace_memory_region kvm_userspace_mem;

		r = -EFAULT;
		if (copy_from_user(&kvm_userspace_mem, argp,
						sizeof kvm_userspace_mem))
			goto out;

		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
A
Avi Kivity 已提交
1175 1176 1177 1178 1179 1180 1181 1182
		if (r)
			goto out;
		break;
	}
	case KVM_GET_DIRTY_LOG: {
		struct kvm_dirty_log log;

		r = -EFAULT;
A
Al Viro 已提交
1183
		if (copy_from_user(&log, argp, sizeof log))
A
Avi Kivity 已提交
1184
			goto out;
1185
		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
A
Avi Kivity 已提交
1186 1187 1188 1189
		if (r)
			goto out;
		break;
	}
1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	case KVM_REGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = -ENXIO;
		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_UNREGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = -ENXIO;
		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
		if (r)
			goto out;
		r = 0;
		break;
	}
#endif
1216
	default:
1217
		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
1218 1219 1220 1221 1222
	}
out:
	return r;
}

1223
static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1224 1225 1226 1227
{
	struct kvm *kvm = vma->vm_file->private_data;
	struct page *page;

1228 1229
	if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
		return VM_FAULT_SIGBUS;
1230
	page = gfn_to_page(kvm, vmf->pgoff);
1231
	if (is_error_page(page)) {
1232
		kvm_release_page_clean(page);
1233
		return VM_FAULT_SIGBUS;
1234
	}
1235 1236
	vmf->page = page;
	return 0;
1237 1238 1239
}

static struct vm_operations_struct kvm_vm_vm_ops = {
1240
	.fault = kvm_vm_fault,
1241 1242 1243 1244 1245 1246 1247 1248
};

/*
 * mmap handler for the VM file: installs kvm_vm_vm_ops on the new VMA so
 * that pages are supplied by kvm_vm_fault().  Always returns 0.
 */
static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vm_vm_ops;

	return 0;
}

1249
static const struct file_operations kvm_vm_fops = {
1250 1251 1252 1253 1254 1255 1256 1257
	.release        = kvm_vm_release,
	.unlocked_ioctl = kvm_vm_ioctl,
	.compat_ioctl   = kvm_vm_ioctl,
	.mmap           = kvm_vm_mmap,
};

static int kvm_dev_ioctl_create_vm(void)
{
A
Al Viro 已提交
1258
	int fd;
1259 1260 1261
	struct kvm *kvm;

	kvm = kvm_create_vm();
1262 1263
	if (IS_ERR(kvm))
		return PTR_ERR(kvm);
1264
	fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0);
A
Al Viro 已提交
1265
	if (fd < 0)
A
Al Viro 已提交
1266
		kvm_put_kvm(kvm);
1267 1268 1269 1270 1271 1272 1273

	return fd;
}

static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
1274
	long r = -EINVAL;
1275 1276 1277

	switch (ioctl) {
	case KVM_GET_API_VERSION:
1278 1279 1280
		r = -EINVAL;
		if (arg)
			goto out;
1281 1282 1283
		r = KVM_API_VERSION;
		break;
	case KVM_CREATE_VM:
1284 1285 1286
		r = -EINVAL;
		if (arg)
			goto out;
1287 1288
		r = kvm_dev_ioctl_create_vm();
		break;
1289
	case KVM_CHECK_EXTENSION:
1290
		r = kvm_dev_ioctl_check_extension(arg);
1291
		break;
1292 1293 1294 1295
	case KVM_GET_VCPU_MMAP_SIZE:
		r = -EINVAL;
		if (arg)
			goto out;
1296 1297 1298
		r = PAGE_SIZE;     /* struct kvm_run */
#ifdef CONFIG_X86
		r += PAGE_SIZE;    /* pio data page */
1299 1300 1301
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
		r += PAGE_SIZE;    /* coalesced mmio ring page */
1302
#endif
1303
		break;
1304 1305 1306 1307 1308
	case KVM_TRACE_ENABLE:
	case KVM_TRACE_PAUSE:
	case KVM_TRACE_DISABLE:
		r = kvm_trace_ioctl(ioctl, arg);
		break;
A
Avi Kivity 已提交
1309
	default:
1310
		return kvm_arch_dev_ioctl(filp, ioctl, arg);
A
Avi Kivity 已提交
1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321
	}
out:
	return r;
}

/*
 * File operations of the kvm character device.  Not const: kvm_init()
 * fills in .owner at run time.
 */
static struct file_operations kvm_chardev_ops = {
	.compat_ioctl	= kvm_dev_ioctl,
	.unlocked_ioctl	= kvm_dev_ioctl,
};

static struct miscdevice kvm_dev = {
A
Avi Kivity 已提交
1322
	KVM_MINOR,
A
Avi Kivity 已提交
1323 1324 1325 1326
	"kvm",
	&kvm_chardev_ops,
};

1327 1328 1329 1330 1331 1332 1333
static void hardware_enable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (cpu_isset(cpu, cpus_hardware_enabled))
		return;
	cpu_set(cpu, cpus_hardware_enabled);
1334
	kvm_arch_hardware_enable(NULL);
1335 1336 1337 1338 1339 1340 1341 1342 1343
}

static void hardware_disable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (!cpu_isset(cpu, cpus_hardware_enabled))
		return;
	cpu_clear(cpu, cpus_hardware_enabled);
1344
	kvm_arch_hardware_disable(NULL);
1345 1346
}

A
Avi Kivity 已提交
1347 1348 1349 1350 1351
static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
			   void *v)
{
	int cpu = (long)v;

1352
	val &= ~CPU_TASKS_FROZEN;
A
Avi Kivity 已提交
1353
	switch (val) {
1354
	case CPU_DYING:
1355 1356 1357 1358
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		hardware_disable(NULL);
		break;
A
Avi Kivity 已提交
1359
	case CPU_UP_CANCELED:
1360 1361
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
1362
		smp_call_function_single(cpu, hardware_disable, NULL, 1);
A
Avi Kivity 已提交
1363
		break;
1364 1365 1366
	case CPU_ONLINE:
		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
		       cpu);
1367
		smp_call_function_single(cpu, hardware_enable, NULL, 1);
A
Avi Kivity 已提交
1368 1369 1370 1371 1372
		break;
	}
	return NOTIFY_OK;
}

1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384

/*
 * Handler for a fault that is tolerated only while kvm_rebooting is set:
 * in that case spin forever and let the reset proceed, otherwise BUG() so
 * that a trace is produced.
 */
asmlinkage void kvm_handle_fault_on_reboot(void)
{
	if (kvm_rebooting) {
		/* spin while reset goes on */
		for (;;)
			;
	}
	/* Fault while not rebooting.  We want the trace. */
	BUG();
}
EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);

1385
static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
M
Mike Day 已提交
1386
		      void *v)
1387 1388 1389 1390 1391 1392 1393
{
	if (val == SYS_RESTART) {
		/*
		 * Some (well, at least mine) BIOSes hang on reboot if
		 * in vmx root mode.
		 */
		printk(KERN_INFO "kvm: exiting hardware virtualization\n");
1394
		kvm_rebooting = true;
1395
		on_each_cpu(hardware_disable, NULL, 1);
1396 1397 1398 1399 1400 1401 1402 1403 1404
	}
	return NOTIFY_OK;
}

/* Reboot notifier block registered by kvm_init(). */
static struct notifier_block kvm_reboot_notifier = {
	.priority	= 0,
	.notifier_call	= kvm_reboot,
};

1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420
/* Reset @bus to an all-zero state, i.e. with no devices registered. */
void kvm_io_bus_init(struct kvm_io_bus *bus)
{
	memset(bus, 0, sizeof *bus);
}

/*
 * Run kvm_iodevice_destructor() on every device registered on @bus, in
 * registration order.  The bus structure itself is left untouched.
 */
void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
	int idx = 0;

	while (idx < bus->dev_count) {
		kvm_iodevice_destructor(bus->devs[idx]);
		idx++;
	}
}

1421 1422
/*
 * Look up the device on @bus that claims the access described by @addr,
 * @len and @is_write.  Devices are asked via their in_range() callback in
 * registration order; the first one that accepts is returned, or NULL when
 * none does.
 */
struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus,
					  gpa_t addr, int len, int is_write)
{
	struct kvm_io_device *dev;
	int idx;

	for (idx = 0; idx < bus->dev_count; idx++) {
		dev = bus->devs[idx];
		if (dev->in_range(dev, addr, len, is_write))
			return dev;
	}

	return NULL;
}

/*
 * Append @dev to @bus.  The bus has room for NR_IOBUS_DEVS devices;
 * registering into a full bus is a BUG.
 */
void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
{
	BUG_ON(bus->dev_count >= NR_IOBUS_DEVS);

	bus->devs[bus->dev_count] = dev;
	bus->dev_count++;
}

A
Avi Kivity 已提交
1443 1444 1445 1446 1447
/* CPU hotplug notifier block registered by kvm_init(). */
static struct notifier_block kvm_cpu_notifier = {
	.priority	= 20, /* must be > scheduler priority */
	.notifier_call	= kvm_cpu_hotplug,
};

1448
/*
 * debugfs getter for a per-VM statistic.
 *
 * @_offset is the byte offset of a u32 counter inside struct kvm, smuggled
 * through the file's private data pointer.  The counter is summed over all
 * VMs on vm_list under kvm_lock and stored in *@val.  Always returns 0.
 */
static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned off = (long)_offset;
	struct kvm *vm;
	u64 sum = 0;

	spin_lock(&kvm_lock);
	list_for_each_entry(vm, &vm_list, vm_list) {
		u32 *counter = (void *)vm + off;

		sum += *counter;
	}
	spin_unlock(&kvm_lock);

	*val = sum;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");

1463
static int vcpu_stat_get(void *_offset, u64 *val)
A
Avi Kivity 已提交
1464 1465 1466 1467 1468 1469
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

1470
	*val = 0;
A
Avi Kivity 已提交
1471 1472 1473
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
R
Rusty Russell 已提交
1474 1475
			vcpu = kvm->vcpus[i];
			if (vcpu)
1476
				*val += *(u32 *)((void *)vcpu + offset);
A
Avi Kivity 已提交
1477 1478
		}
	spin_unlock(&kvm_lock);
1479
	return 0;
A
Avi Kivity 已提交
1480 1481
}

1482 1483 1484 1485 1486 1487
DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

/* debugfs file operations, indexed by the kind of a statistics entry. */
static struct file_operations *stat_fops[] = {
	[KVM_STAT_VM]   = &vm_stat_fops,
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
};
A
Avi Kivity 已提交
1488

1489
static void kvm_init_debug(void)
A
Avi Kivity 已提交
1490 1491 1492
{
	struct kvm_stats_debugfs_item *p;

1493
	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
A
Avi Kivity 已提交
1494
	for (p = debugfs_entries; p->name; ++p)
1495
		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
A
Avi Kivity 已提交
1496
						(void *)(long)p->offset,
1497
						stat_fops[p->kind]);
A
Avi Kivity 已提交
1498 1499 1500 1501 1502 1503 1504 1505
}

static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
1506
	debugfs_remove(kvm_debugfs_dir);
A
Avi Kivity 已提交
1507 1508
}

1509 1510
/*
 * sysdev suspend hook: disable virtualization on the CPU executing the
 * callback.  Always returns 0.
 */
static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
	hardware_disable(NULL);

	return 0;
}

static int kvm_resume(struct sys_device *dev)
{
A
Avi Kivity 已提交
1517
	hardware_enable(NULL);
1518 1519 1520 1521
	return 0;
}

static struct sysdev_class kvm_sysdev_class = {
1522
	.name = "kvm",
1523 1524 1525 1526 1527 1528 1529 1530 1531
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

/* The single device instance of kvm_sysdev_class. */
static struct sys_device kvm_sysdev = {
	.cls	= &kvm_sysdev_class,
	.id	= 0,
};

1532
struct page *bad_page;
1533
pfn_t bad_pfn;
A
Avi Kivity 已提交
1534

1535 1536 1537 1538 1539 1540 1541 1542 1543 1544
/* Map a preempt notifier back to the kvm_vcpu that embeds it. */
static inline struct kvm_vcpu *
preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

1545
	kvm_arch_vcpu_load(vcpu, cpu);
1546 1547 1548 1549 1550 1551 1552
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

1553
	kvm_arch_vcpu_put(vcpu);
1554 1555
}

1556
int kvm_init(void *opaque, unsigned int vcpu_size,
1557
		  struct module *module)
A
Avi Kivity 已提交
1558 1559
{
	int r;
Y
Yang, Sheng 已提交
1560
	int cpu;
A
Avi Kivity 已提交
1561

1562 1563
	kvm_init_debug();

1564 1565
	r = kvm_arch_init(opaque);
	if (r)
1566
		goto out_fail;
1567 1568 1569 1570 1571 1572 1573 1574

	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (bad_page == NULL) {
		r = -ENOMEM;
		goto out;
	}

1575 1576
	bad_pfn = page_to_pfn(bad_page);

1577
	r = kvm_arch_hardware_setup();
A
Avi Kivity 已提交
1578
	if (r < 0)
1579
		goto out_free_0;
A
Avi Kivity 已提交
1580

Y
Yang, Sheng 已提交
1581 1582
	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu,
1583
				kvm_arch_check_processor_compat,
1584
				&r, 1);
Y
Yang, Sheng 已提交
1585
		if (r < 0)
1586
			goto out_free_1;
Y
Yang, Sheng 已提交
1587 1588
	}

1589
	on_each_cpu(hardware_enable, NULL, 1);
A
Avi Kivity 已提交
1590 1591
	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
1592
		goto out_free_2;
A
Avi Kivity 已提交
1593 1594
	register_reboot_notifier(&kvm_reboot_notifier);

1595 1596
	r = sysdev_class_register(&kvm_sysdev_class);
	if (r)
1597
		goto out_free_3;
1598 1599 1600

	r = sysdev_register(&kvm_sysdev);
	if (r)
1601
		goto out_free_4;
1602

1603 1604
	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
J
Joe Perches 已提交
1605 1606
					   __alignof__(struct kvm_vcpu),
					   0, NULL);
1607 1608
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
1609
		goto out_free_5;
1610 1611
	}

A
Avi Kivity 已提交
1612 1613 1614 1615
	kvm_chardev_ops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
M
Mike Day 已提交
1616
		printk(KERN_ERR "kvm: misc device register failed\n");
A
Avi Kivity 已提交
1617 1618 1619
		goto out_free;
	}

1620 1621 1622
	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

1623
	return 0;
A
Avi Kivity 已提交
1624 1625

out_free:
1626
	kmem_cache_destroy(kvm_vcpu_cache);
1627
out_free_5:
1628
	sysdev_unregister(&kvm_sysdev);
1629
out_free_4:
1630
	sysdev_class_unregister(&kvm_sysdev_class);
1631
out_free_3:
A
Avi Kivity 已提交
1632
	unregister_reboot_notifier(&kvm_reboot_notifier);
A
Avi Kivity 已提交
1633
	unregister_cpu_notifier(&kvm_cpu_notifier);
1634
out_free_2:
1635
	on_each_cpu(hardware_disable, NULL, 1);
1636
out_free_1:
1637
	kvm_arch_hardware_unsetup();
1638 1639
out_free_0:
	__free_page(bad_page);
1640
out:
1641
	kvm_arch_exit();
1642
	kvm_exit_debug();
1643
out_fail:
A
Avi Kivity 已提交
1644 1645
	return r;
}
1646
EXPORT_SYMBOL_GPL(kvm_init);
A
Avi Kivity 已提交
1647

1648
void kvm_exit(void)
A
Avi Kivity 已提交
1649
{
1650
	kvm_trace_cleanup();
A
Avi Kivity 已提交
1651
	misc_deregister(&kvm_dev);
1652
	kmem_cache_destroy(kvm_vcpu_cache);
1653 1654
	sysdev_unregister(&kvm_sysdev);
	sysdev_class_unregister(&kvm_sysdev_class);
A
Avi Kivity 已提交
1655
	unregister_reboot_notifier(&kvm_reboot_notifier);
1656
	unregister_cpu_notifier(&kvm_cpu_notifier);
1657
	on_each_cpu(hardware_disable, NULL, 1);
1658
	kvm_arch_hardware_unsetup();
1659
	kvm_arch_exit();
A
Avi Kivity 已提交
1660
	kvm_exit_debug();
1661
	__free_page(bad_page);
A
Avi Kivity 已提交
1662
}
1663
EXPORT_SYMBOL_GPL(kvm_exit);