/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "vmx.h"
#include "mmu.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/swap.h>

#include <asm/page.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>

/*
 * When setting this variable to true it enables Two-Dimensional-Paging
 * where the hardware walks 2 page tables:
 * 1. the guest-virtual to guest-physical
 * 2. while doing 1. it walks guest-physical to host-physical
 * If the hardware supports that we don't need to do shadow paging.
 */
static bool tdp_enabled = false;
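/* Flipped by kvm_enable_tdp() when the vendor module reports hardware
 * support for two-dimensional paging (e.g. NPT/EPT). */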

#undef MMU_DEBUG

#undef AUDIT

#ifdef AUDIT
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
#else
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
#endif

#ifdef MMU_DEBUG

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)

#else

#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)

#endif

#if defined(MMU_DEBUG) || defined(AUDIT)
static int dbg = 1;
#endif

#ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0)
#else
#define ASSERT(x)							\
	if (!(x)) {							\
		printk(KERN_WARNING "assertion failed %s:%d: %s\n",	\
		       __FILE__, __LINE__, #x);				\
	}
#endif

#define PT64_PT_BITS 9
#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
#define PT32_PT_BITS 10
#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)

#define PT_WRITABLE_SHIFT 1

#define PT_PRESENT_MASK (1ULL << 0)
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
#define PT_USER_MASK (1ULL << 2)
#define PT_PWT_MASK (1ULL << 3)
#define PT_PCD_MASK (1ULL << 4)
#define PT_ACCESSED_MASK (1ULL << 5)
#define PT_DIRTY_MASK (1ULL << 6)
#define PT_PAGE_SIZE_MASK (1ULL << 7)
#define PT_PAT_MASK (1ULL << 7)
#define PT_GLOBAL_MASK (1ULL << 8)
#define PT64_NX_SHIFT 63
#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)

#define PT_PAT_SHIFT 7
#define PT_DIR_PAT_SHIFT 12
#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)

#define PT32_DIR_PSE36_SIZE 4
#define PT32_DIR_PSE36_SHIFT 13
#define PT32_DIR_PSE36_MASK \
	(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)


#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define VALID_PAGE(x) ((x) != INVALID_PAGE)

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)

#define PT64_LEVEL_MASK(level) \
		(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)

#define PT32_LEVEL_MASK(level) \
		(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#define PT64_DIR_BASE_ADDR_MASK \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))

#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
			| PT64_NX_MASK)

#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
#define PFERR_USER_MASK (1U << 2)
#define PFERR_FETCH_MASK (1U << 4)

#define PT64_ROOT_LEVEL 4
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3

#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1

#define RMAP_EXT 4

#define ACC_EXEC_MASK    1
#define ACC_WRITE_MASK   PT_WRITABLE_MASK
#define ACC_USER_MASK    PT_USER_MASK
#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

struct kvm_rmap_desc {
	u64 *shadow_ptes[RMAP_EXT];
	struct kvm_rmap_desc *more;
};

static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache;

static u64 __read_mostly shadow_trap_nonpresent_pte;
static u64 __read_mostly shadow_notrap_nonpresent_pte;

void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
{
	shadow_trap_nonpresent_pte = trap_pte;
	shadow_notrap_nonpresent_pte = notrap_pte;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);

static int is_write_protection(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.cr0 & X86_CR0_WP;
}

static int is_cpuid_PSE36(void)
{
	return 1;
}

static int is_nx(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.shadow_efer & EFER_NX;
}

static int is_present_pte(unsigned long pte)
{
	return pte & PT_PRESENT_MASK;
}

static int is_shadow_present_pte(u64 pte)
{
	return pte != shadow_trap_nonpresent_pte
		&& pte != shadow_notrap_nonpresent_pte;
}

static int is_writeble_pte(unsigned long pte)
{
	return pte & PT_WRITABLE_MASK;
}

static int is_dirty_pte(unsigned long pte)
{
	return pte & PT_DIRTY_MASK;
}

static int is_rmap_pte(u64 pte)
{
	return is_shadow_present_pte(pte);
}

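/*
 * With PSE-36, bits 13-16 of a 4MB page directory entry carry physical
 * address bits 32-35; recover the gfn offset encoded there.
 */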
static gfn_t pse36_gfn_delta(u32 gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}

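/*
 * Write a shadow pte as a single 64-bit store so that a concurrent
 * hardware page-table walk never observes a torn entry, even on
 * 32-bit hosts.
 */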
static void set_shadow_pte(u64 *sptep, u64 spte)
{
#ifdef CONFIG_X86_64
	set_64bit((unsigned long *)sptep, spte);
#else
	set_64bit((unsigned long long *)sptep, spte);
#endif
}
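
/*
 * MMU paths that run under locks and cannot sleep take their objects
 * from small per-vcpu caches; the topup helpers below refill those
 * caches ahead of time, while calling the slab allocator is still safe.
 */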

static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				  struct kmem_cache *base_cache, int min)
{
	void *obj;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
		if (!obj)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = obj;
	}
	return 0;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
		kfree(mc->objects[--mc->nobjs]);
}

static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
				       int min)
{
	struct page *page;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		page = alloc_page(GFP_KERNEL);
		if (!page)
			return -ENOMEM;
		set_page_private(page, 0);
		cache->objects[cache->nobjs++] = page_address(page);
	}
	return 0;
}

static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
		free_page((unsigned long)mc->objects[--mc->nobjs]);
}

static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
{
	int r;

	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
				   pte_chain_cache, 4);
	if (r)
		goto out;
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
				   rmap_desc_cache, 1);
	if (r)
		goto out;
	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
	if (r)
		goto out;
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
				   mmu_page_header_cache, 4);
out:
	return r;
}

static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}

static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
				    size_t size)
{
	void *p;

	BUG_ON(!mc->nobjs);
	p = mc->objects[--mc->nobjs];
	memset(p, 0, size);
	return p;
}

static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
{
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
				      sizeof(struct kvm_pte_chain));
}

static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
{
	kfree(pc);
}

static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
{
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
				      sizeof(struct kvm_rmap_desc));
}

static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
{
	kfree(rd);
}

/*
 * Take gfn and return the reverse mapping to it.
 * Note: gfn must be unaliased before this function gets called
 */

static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;

	slot = gfn_to_memslot(kvm, gfn);
	return &slot->rmap[gfn - slot->base_gfn];
}

/*
 * Reverse mapping data structures:
 *
 * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
 * that points to page_address(page).
 *
 * If rmapp bit zero is one, (rmapp & ~1) points to a struct kvm_rmap_desc
 * containing more mappings.
 */
static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	struct kvm_rmap_desc *desc;
	unsigned long *rmapp;
	int i;

	if (!is_rmap_pte(*spte))
		return;
	gfn = unalias_gfn(vcpu->kvm, gfn);
	sp = page_header(__pa(spte));
	sp->gfns[spte - sp->spt] = gfn;
	rmapp = gfn_to_rmap(vcpu->kvm, gfn);
	if (!*rmapp) {
		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
		*rmapp = (unsigned long)spte;
	} else if (!(*rmapp & 1)) {
		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
		desc = mmu_alloc_rmap_desc(vcpu);
		desc->shadow_ptes[0] = (u64 *)*rmapp;
		desc->shadow_ptes[1] = spte;
		*rmapp = (unsigned long)desc | 1;
	} else {
		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
		while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
			desc = desc->more;
		if (desc->shadow_ptes[RMAP_EXT-1]) {
			desc->more = mmu_alloc_rmap_desc(vcpu);
			desc = desc->more;
		}
		for (i = 0; desc->shadow_ptes[i]; ++i)
			;
		desc->shadow_ptes[i] = spte;
	}
}
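
/*
 * Drop slot i from an rmap descriptor: the last used slot is moved into
 * position i so the array stays densely packed, and the descriptor is
 * unlinked and freed once it becomes empty.
 */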

static void rmap_desc_remove_entry(unsigned long *rmapp,
				   struct kvm_rmap_desc *desc,
				   int i,
				   struct kvm_rmap_desc *prev_desc)
{
	int j;

	for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
		;
	desc->shadow_ptes[i] = desc->shadow_ptes[j];
	desc->shadow_ptes[j] = NULL;
	if (j != 0)
		return;
	if (!prev_desc && !desc->more)
		*rmapp = (unsigned long)desc->shadow_ptes[0];
	else
		if (prev_desc)
			prev_desc->more = desc->more;
		else
			*rmapp = (unsigned long)desc->more | 1;
	mmu_free_rmap_desc(desc);
}

static void rmap_remove(struct kvm *kvm, u64 *spte)
{
	struct kvm_rmap_desc *desc;
	struct kvm_rmap_desc *prev_desc;
	struct kvm_mmu_page *sp;
	struct page *page;
	unsigned long *rmapp;
	int i;

	if (!is_rmap_pte(*spte))
		return;
	sp = page_header(__pa(spte));
	page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
	mark_page_accessed(page);
	if (is_writeble_pte(*spte))
		kvm_release_page_dirty(page);
	else
		kvm_release_page_clean(page);
	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
	if (!*rmapp) {
		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
		BUG();
	} else if (!(*rmapp & 1)) {
		rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
		if ((u64 *)*rmapp != spte) {
			printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
			       spte, *spte);
			BUG();
		}
		*rmapp = 0;
	} else {
		rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
		prev_desc = NULL;
		while (desc) {
			for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
				if (desc->shadow_ptes[i] == spte) {
					rmap_desc_remove_entry(rmapp,
							       desc, i,
							       prev_desc);
					return;
				}
			prev_desc = desc;
			desc = desc->more;
		}
		BUG();
	}
}

483
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
484 485
{
	struct kvm_rmap_desc *desc;
486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512
	struct kvm_rmap_desc *prev_desc;
	u64 *prev_spte;
	int i;

	if (!*rmapp)
		return NULL;
	else if (!(*rmapp & 1)) {
		if (!spte)
			return (u64 *)*rmapp;
		return NULL;
	}
	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
	prev_desc = NULL;
	prev_spte = NULL;
	while (desc) {
		for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
			if (prev_spte == spte)
				return desc->shadow_ptes[i];
			prev_spte = desc->shadow_ptes[i];
		}
		desc = desc->more;
	}
	return NULL;
}

static void rmap_write_protect(struct kvm *kvm, u64 gfn)
{
513
	unsigned long *rmapp;
514
	u64 *spte;
515
	int write_protected = 0;
516

517 518
	gfn = unalias_gfn(kvm, gfn);
	rmapp = gfn_to_rmap(kvm, gfn);
519

520 521
	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
522 523 524
		BUG_ON(!spte);
		BUG_ON(!(*spte & PT_PRESENT_MASK));
		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
525
		if (is_writeble_pte(*spte)) {
526
			set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
527 528
			write_protected = 1;
		}
529
		spte = rmap_next(kvm, rmapp, spte);
530
	}
531 532
	if (write_protected)
		kvm_flush_remote_tlbs(kvm);
533 534
}

535
#ifdef MMU_DEBUG
536
static int is_empty_shadow_page(u64 *spt)
A
Avi Kivity 已提交
537
{
538 539 540
	u64 *pos;
	u64 *end;

541
	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
542
		if (*pos != shadow_trap_nonpresent_pte) {
543 544
			printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
			       pos, *pos);
A
Avi Kivity 已提交
545
			return 0;
546
		}
A
Avi Kivity 已提交
547 548
	return 1;
}
549
#endif
A
Avi Kivity 已提交
550

551
static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
552
{
553 554 555 556 557
	ASSERT(is_empty_shadow_page(sp->spt));
	list_del(&sp->link);
	__free_page(virt_to_page(sp->spt));
	__free_page(virt_to_page(sp->gfns));
	kfree(sp);
558
	++kvm->arch.n_free_mmu_pages;
559 560
}

561 562
static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
563
	return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
564 565
}

566 567
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
					       u64 *parent_pte)
A
Avi Kivity 已提交
568
{
569
	struct kvm_mmu_page *sp;
A
Avi Kivity 已提交
570

571 572 573
	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
	sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
574
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
575
	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
576 577 578 579
	ASSERT(is_empty_shadow_page(sp->spt));
	sp->slot_bitmap = 0;
	sp->multimapped = 0;
	sp->parent_pte = parent_pte;
580
	--vcpu->kvm->arch.n_free_mmu_pages;
581
	return sp;
A
Avi Kivity 已提交
582 583
}

584
static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
585
				    struct kvm_mmu_page *sp, u64 *parent_pte)
586 587 588 589 590 591 592
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	int i;

	if (!parent_pte)
		return;
593 594
	if (!sp->multimapped) {
		u64 *old = sp->parent_pte;
595 596

		if (!old) {
597
			sp->parent_pte = parent_pte;
598 599
			return;
		}
600
		sp->multimapped = 1;
601
		pte_chain = mmu_alloc_pte_chain(vcpu);
602 603
		INIT_HLIST_HEAD(&sp->parent_ptes);
		hlist_add_head(&pte_chain->link, &sp->parent_ptes);
604 605
		pte_chain->parent_ptes[0] = old;
	}
606
	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
607 608 609 610 611 612 613 614
		if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
			continue;
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
			if (!pte_chain->parent_ptes[i]) {
				pte_chain->parent_ptes[i] = parent_pte;
				return;
			}
	}
615
	pte_chain = mmu_alloc_pte_chain(vcpu);
616
	BUG_ON(!pte_chain);
617
	hlist_add_head(&pte_chain->link, &sp->parent_ptes);
618 619 620
	pte_chain->parent_ptes[0] = parent_pte;
}

621
static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
622 623 624 625 626 627
				       u64 *parent_pte)
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	int i;

628 629 630
	if (!sp->multimapped) {
		BUG_ON(sp->parent_pte != parent_pte);
		sp->parent_pte = NULL;
631 632
		return;
	}
633
	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
634 635 636 637 638
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
			if (!pte_chain->parent_ptes[i])
				break;
			if (pte_chain->parent_ptes[i] != parent_pte)
				continue;
639 640
			while (i + 1 < NR_PTE_CHAIN_ENTRIES
				&& pte_chain->parent_ptes[i + 1]) {
641 642 643 644 645
				pte_chain->parent_ptes[i]
					= pte_chain->parent_ptes[i + 1];
				++i;
			}
			pte_chain->parent_ptes[i] = NULL;
646 647
			if (i == 0) {
				hlist_del(&pte_chain->link);
648
				mmu_free_pte_chain(pte_chain);
649 650 651
				if (hlist_empty(&sp->parent_ptes)) {
					sp->multimapped = 0;
					sp->parent_pte = NULL;
652 653
				}
			}
654 655 656 657 658
			return;
		}
	BUG();
}

659
static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
660 661 662
{
	unsigned index;
	struct hlist_head *bucket;
663
	struct kvm_mmu_page *sp;
664 665 666
	struct hlist_node *node;

	pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
667
	index = kvm_page_table_hashfn(gfn);
668
	bucket = &kvm->arch.mmu_page_hash[index];
669 670
	hlist_for_each_entry(sp, node, bucket, hash_link)
		if (sp->gfn == gfn && !sp->role.metaphysical) {
671
			pgprintk("%s: found role %x\n",
672 673
				 __FUNCTION__, sp->role.word);
			return sp;
674 675 676 677 678 679 680 681 682
		}
	return NULL;
}

static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
					     gfn_t gfn,
					     gva_t gaddr,
					     unsigned level,
					     int metaphysical,
683
					     unsigned access,
684
					     u64 *parent_pte)
685 686 687 688 689
{
	union kvm_mmu_page_role role;
	unsigned index;
	unsigned quadrant;
	struct hlist_head *bucket;
690
	struct kvm_mmu_page *sp;
691 692 693
	struct hlist_node *node;

	role.word = 0;
694
	role.glevels = vcpu->arch.mmu.root_level;
695 696
	role.level = level;
	role.metaphysical = metaphysical;
697
	role.access = access;
698
	if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
699 700 701 702 703 704
		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
		role.quadrant = quadrant;
	}
	pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
		 gfn, role.word);
705
	index = kvm_page_table_hashfn(gfn);
706
	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
707 708 709
	hlist_for_each_entry(sp, node, bucket, hash_link)
		if (sp->gfn == gfn && sp->role.word == role.word) {
			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
710
			pgprintk("%s: found\n", __FUNCTION__);
711
			return sp;
712
		}
A
Avi Kivity 已提交
713
	++vcpu->kvm->stat.mmu_cache_miss;
714 715 716
	sp = kvm_mmu_alloc_page(vcpu, parent_pte);
	if (!sp)
		return sp;
717
	pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
718 719 720
	sp->gfn = gfn;
	sp->role = role;
	hlist_add_head(&sp->hash_link, bucket);
721
	vcpu->arch.mmu.prefetch_page(vcpu, sp);
722
	if (!metaphysical)
723
		rmap_write_protect(vcpu->kvm, gfn);
724
	return sp;
725 726
}

727
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
728
					 struct kvm_mmu_page *sp)
729
{
730 731 732 733
	unsigned i;
	u64 *pt;
	u64 ent;

734
	pt = sp->spt;
735

736
	if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
737
		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
738
			if (is_shadow_present_pte(pt[i]))
739
				rmap_remove(kvm, &pt[i]);
740
			pt[i] = shadow_trap_nonpresent_pte;
741
		}
742
		kvm_flush_remote_tlbs(kvm);
743 744 745 746 747 748
		return;
	}

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
		ent = pt[i];

749 750
		pt[i] = shadow_trap_nonpresent_pte;
		if (!is_shadow_present_pte(ent))
751 752
			continue;
		ent &= PT64_BASE_ADDR_MASK;
753
		mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
754
	}
755
	kvm_flush_remote_tlbs(kvm);
756 757
}

758
static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
759
{
760
	mmu_page_remove_parent_pte(sp, parent_pte);
761 762
}

763 764 765 766 767 768
static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
{
	int i;

	for (i = 0; i < KVM_MAX_VCPUS; ++i)
		if (kvm->vcpus[i])
769
			kvm->vcpus[i]->arch.last_pte_updated = NULL;
770 771
}

772
static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
773 774 775
{
	u64 *parent_pte;

A
Avi Kivity 已提交
776
	++kvm->stat.mmu_shadow_zapped;
777 778 779
	while (sp->multimapped || sp->parent_pte) {
		if (!sp->multimapped)
			parent_pte = sp->parent_pte;
780 781 782
		else {
			struct kvm_pte_chain *chain;

783
			chain = container_of(sp->parent_ptes.first,
784 785 786
					     struct kvm_pte_chain, link);
			parent_pte = chain->parent_ptes[0];
		}
787
		BUG_ON(!parent_pte);
788
		kvm_mmu_put_page(sp, parent_pte);
789
		set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
790
	}
791 792 793 794
	kvm_mmu_page_unlink_children(kvm, sp);
	if (!sp->root_count) {
		hlist_del(&sp->hash_link);
		kvm_mmu_free_page(kvm, sp);
A
Avi Kivity 已提交
795
	} else
796
		list_move(&sp->link, &kvm->arch.active_mmu_pages);
797
	kvm_mmu_reset_last_pte_updated(kvm);
798 799
}

/*
 * Changing the number of mmu pages allocated to the vm
 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock
 */
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
{
	/*
	 * If we set the number of mmu pages to be smaller than the
	 * number of active pages, we must free some mmu pages before we
	 * change the value
	 */

	if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
813
	    kvm_nr_mmu_pages) {
814 815
		int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
				       - kvm->arch.n_free_mmu_pages;
816 817 818 819

		while (n_used_mmu_pages > kvm_nr_mmu_pages) {
			struct kvm_mmu_page *page;

820
			page = container_of(kvm->arch.active_mmu_pages.prev,
821 822 823 824
					    struct kvm_mmu_page, link);
			kvm_mmu_zap_page(kvm, page);
			n_used_mmu_pages--;
		}
825
		kvm->arch.n_free_mmu_pages = 0;
826 827
	}
	else
828 829
		kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
					 - kvm->arch.n_alloc_mmu_pages;
830

831
	kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
832 833
}

834
static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
835 836 837
{
	unsigned index;
	struct hlist_head *bucket;
838
	struct kvm_mmu_page *sp;
839 840 841 842 843
	struct hlist_node *node, *n;
	int r;

	pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
	r = 0;
844
	index = kvm_page_table_hashfn(gfn);
845
	bucket = &kvm->arch.mmu_page_hash[index];
846 847
	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
		if (sp->gfn == gfn && !sp->role.metaphysical) {
848
			pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
849 850
				 sp->role.word);
			kvm_mmu_zap_page(kvm, sp);
851 852 853
			r = 1;
		}
	return r;
854 855
}

856
static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
857
{
858
	struct kvm_mmu_page *sp;
859

860 861 862
	while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
		pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
		kvm_mmu_zap_page(kvm, sp);
863 864 865
	}
}

866
static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
A
Avi Kivity 已提交
867
{
868
	int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
869
	struct kvm_mmu_page *sp = page_header(__pa(pte));
A
Avi Kivity 已提交
870

871
	__set_bit(slot, &sp->slot_bitmap);
A
Avi Kivity 已提交
872 873
}

874 875
struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
{
876 877
	struct page *page;

878
	gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
879 880 881

	if (gpa == UNMAPPED_GVA)
		return NULL;
882 883 884 885 886 887

	down_read(&current->mm->mmap_sem);
	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
	up_read(&current->mm->mmap_sem);

	return page;
888 889
}

890 891 892
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
			 unsigned pt_access, unsigned pte_access,
			 int user_fault, int write_fault, int dirty,
893
			 int *ptwrite, gfn_t gfn, struct page *page)
894 895
{
	u64 spte;
896
	int was_rmapped = 0;
897
	int was_writeble = is_writeble_pte(*shadow_pte);
898
	hfn_t host_pfn = (*shadow_pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
899

900
	pgprintk("%s: spte %llx access %x write_fault %d"
901
		 " user_fault %d gfn %lx\n",
902
		 __FUNCTION__, *shadow_pte, pt_access,
903 904
		 write_fault, user_fault, gfn);

905 906 907 908 909 910 911 912 913 914
	if (is_rmap_pte(*shadow_pte)) {
		if (host_pfn != page_to_pfn(page)) {
			pgprintk("hfn old %lx new %lx\n",
				 host_pfn, page_to_pfn(page));
			rmap_remove(vcpu->kvm, shadow_pte);
		}
		else
			was_rmapped = 1;
	}

915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967
	/*
	 * We don't set the accessed bit, since we sometimes want to see
	 * whether the guest actually used the pte (in order to detect
	 * demand paging).
	 */
	spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
	if (!dirty)
		pte_access &= ~ACC_WRITE_MASK;
	if (!(pte_access & ACC_EXEC_MASK))
		spte |= PT64_NX_MASK;

	spte |= PT_PRESENT_MASK;
	if (pte_access & ACC_USER_MASK)
		spte |= PT_USER_MASK;

	spte |= page_to_phys(page);

	if ((pte_access & ACC_WRITE_MASK)
	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
		struct kvm_mmu_page *shadow;

		spte |= PT_WRITABLE_MASK;
		if (user_fault) {
			mmu_unshadow(vcpu->kvm, gfn);
			goto unshadowed;
		}

		shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
		if (shadow) {
			pgprintk("%s: found shadow page for %lx, marking ro\n",
				 __FUNCTION__, gfn);
			pte_access &= ~ACC_WRITE_MASK;
			if (is_writeble_pte(spte)) {
				spte &= ~PT_WRITABLE_MASK;
				kvm_x86_ops->tlb_flush(vcpu);
			}
			if (write_fault)
				*ptwrite = 1;
		}
	}

unshadowed:

	if (pte_access & ACC_WRITE_MASK)
		mark_page_dirty(vcpu->kvm, gfn);

	pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
	set_shadow_pte(shadow_pte, spte);
	page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
	if (!was_rmapped) {
		rmap_add(vcpu, shadow_pte, gfn);
		if (!is_rmap_pte(*shadow_pte))
			kvm_release_page_clean(page);
968 969 970 971 972
	} else {
		if (was_writeble)
			kvm_release_page_dirty(page);
		else
			kvm_release_page_clean(page);
973 974
	}
	if (!ptwrite || !*ptwrite)
975
		vcpu->arch.last_pte_updated = shadow_pte;
976 977
}

A
Avi Kivity 已提交
978 979 980 981
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}

982 983
static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
			   gfn_t gfn, struct page *page)
A
Avi Kivity 已提交
984 985
{
	int level = PT32E_ROOT_LEVEL;
986
	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
987
	int pt_write = 0;
A
Avi Kivity 已提交
988 989 990 991 992 993 994 995 996

	for (; ; level--) {
		u32 index = PT64_INDEX(v, level);
		u64 *table;

		ASSERT(VALID_PAGE(table_addr));
		table = __va(table_addr);

		if (level == 1) {
997
			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
998
				     0, write, 1, &pt_write, gfn, page);
999
			return pt_write;
A
Avi Kivity 已提交
1000 1001
		}

1002
		if (table[index] == shadow_trap_nonpresent_pte) {
1003
			struct kvm_mmu_page *new_table;
1004
			gfn_t pseudo_gfn;
A
Avi Kivity 已提交
1005

1006 1007 1008 1009
			pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
				>> PAGE_SHIFT;
			new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
						     v, level - 1,
1010
						     1, ACC_ALL, &table[index]);
1011
			if (!new_table) {
A
Avi Kivity 已提交
1012
				pgprintk("nonpaging_map: ENOMEM\n");
1013
				kvm_release_page_clean(page);
A
Avi Kivity 已提交
1014 1015 1016
				return -ENOMEM;
			}

1017
			table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
1018
				| PT_WRITABLE_MASK | PT_USER_MASK;
A
Avi Kivity 已提交
1019 1020 1021 1022 1023
		}
		table_addr = table[index] & PT64_BASE_ADDR_MASK;
	}
}

1024 1025 1026 1027
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
	int r;

1028 1029
	struct page *page;

1030 1031
	down_read(&vcpu->kvm->slots_lock);

1032 1033
	down_read(&current->mm->mmap_sem);
	page = gfn_to_page(vcpu->kvm, gfn);
1034
	up_read(&current->mm->mmap_sem);
1035

1036 1037 1038 1039 1040 1041 1042
	/* mmio */
	if (is_error_page(page)) {
		kvm_release_page_clean(page);
		up_read(&vcpu->kvm->slots_lock);
		return 1;
	}

1043
	spin_lock(&vcpu->kvm->mmu_lock);
1044
	kvm_mmu_free_some_pages(vcpu);
1045 1046 1047
	r = __nonpaging_map(vcpu, v, write, gfn, page);
	spin_unlock(&vcpu->kvm->mmu_lock);

1048
	up_read(&vcpu->kvm->slots_lock);
1049

1050 1051 1052 1053
	return r;
}


1054 1055 1056 1057 1058 1059 1060 1061 1062
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp)
{
	int i;

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
		sp->spt[i] = shadow_trap_nonpresent_pte;
}

1063 1064 1065
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
	int i;
1066
	struct kvm_mmu_page *sp;
1067

1068
	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
A
Avi Kivity 已提交
1069
		return;
1070
	spin_lock(&vcpu->kvm->mmu_lock);
1071
#ifdef CONFIG_X86_64
1072 1073
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;
1074

1075 1076
		sp = page_header(root);
		--sp->root_count;
1077
		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1078
		spin_unlock(&vcpu->kvm->mmu_lock);
1079 1080 1081 1082
		return;
	}
#endif
	for (i = 0; i < 4; ++i) {
1083
		hpa_t root = vcpu->arch.mmu.pae_root[i];
1084

A
Avi Kivity 已提交
1085 1086
		if (root) {
			root &= PT64_BASE_ADDR_MASK;
1087 1088
			sp = page_header(root);
			--sp->root_count;
A
Avi Kivity 已提交
1089
		}
1090
		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1091
	}
1092
	spin_unlock(&vcpu->kvm->mmu_lock);
1093
	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1094 1095 1096 1097 1098
}

static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
	int i;
1099
	gfn_t root_gfn;
1100
	struct kvm_mmu_page *sp;
1101

1102
	root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1103 1104

#ifdef CONFIG_X86_64
1105 1106
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;
1107 1108

		ASSERT(!VALID_PAGE(root));
1109
		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1110
				      PT64_ROOT_LEVEL, 0, ACC_ALL, NULL);
1111 1112
		root = __pa(sp->spt);
		++sp->root_count;
1113
		vcpu->arch.mmu.root_hpa = root;
1114 1115 1116 1117
		return;
	}
#endif
	for (i = 0; i < 4; ++i) {
1118
		hpa_t root = vcpu->arch.mmu.pae_root[i];
1119 1120

		ASSERT(!VALID_PAGE(root));
1121 1122 1123
		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
			if (!is_present_pte(vcpu->arch.pdptrs[i])) {
				vcpu->arch.mmu.pae_root[i] = 0;
A
Avi Kivity 已提交
1124 1125
				continue;
			}
1126 1127
			root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
		} else if (vcpu->arch.mmu.root_level == 0)
1128
			root_gfn = 0;
1129 1130
		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
				      PT32_ROOT_LEVEL, !is_paging(vcpu),
1131
				      ACC_ALL, NULL);
1132 1133
		root = __pa(sp->spt);
		++sp->root_count;
1134
		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
1135
	}
1136
	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1137 1138
}

A
Avi Kivity 已提交
1139 1140 1141 1142 1143 1144
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
	return vaddr;
}

static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
A
Avi Kivity 已提交
1145
				u32 error_code)
A
Avi Kivity 已提交
1146
{
1147
	gfn_t gfn;
1148
	int r;
A
Avi Kivity 已提交
1149

1150
	pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
1151 1152 1153
	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;
1154

A
Avi Kivity 已提交
1155
	ASSERT(vcpu);
1156
	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
A
Avi Kivity 已提交
1157

1158
	gfn = gva >> PAGE_SHIFT;
A
Avi Kivity 已提交
1159

1160 1161
	return nonpaging_map(vcpu, gva & PAGE_MASK,
			     error_code & PFERR_WRITE_MASK, gfn);
A
Avi Kivity 已提交
1162 1163 1164 1165
}

static void nonpaging_free(struct kvm_vcpu *vcpu)
{
1166
	mmu_free_roots(vcpu);
A
Avi Kivity 已提交
1167 1168 1169 1170
}

static int nonpaging_init_context(struct kvm_vcpu *vcpu)
{
1171
	struct kvm_mmu *context = &vcpu->arch.mmu;
A
Avi Kivity 已提交
1172 1173 1174 1175 1176

	context->new_cr3 = nonpaging_new_cr3;
	context->page_fault = nonpaging_page_fault;
	context->gva_to_gpa = nonpaging_gva_to_gpa;
	context->free = nonpaging_free;
1177
	context->prefetch_page = nonpaging_prefetch_page;
1178
	context->root_level = 0;
A
Avi Kivity 已提交
1179
	context->shadow_root_level = PT32E_ROOT_LEVEL;
A
Avi Kivity 已提交
1180
	context->root_hpa = INVALID_PAGE;
A
Avi Kivity 已提交
1181 1182 1183
	return 0;
}

1184
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
1185
{
A
Avi Kivity 已提交
1186
	++vcpu->stat.tlb_flush;
1187
	kvm_x86_ops->tlb_flush(vcpu);
A
Avi Kivity 已提交
1188 1189 1190 1191
}

static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
1192
	pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3);
1193
	mmu_free_roots(vcpu);
A
Avi Kivity 已提交
1194 1195 1196 1197 1198 1199
}

static void inject_page_fault(struct kvm_vcpu *vcpu,
			      u64 addr,
			      u32 err_code)
{
1200
	kvm_inject_page_fault(vcpu, addr, err_code);
A
Avi Kivity 已提交
1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215
}

static void paging_free(struct kvm_vcpu *vcpu)
{
	nonpaging_free(vcpu);
}

#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"
#undef PTTYPE

1216
static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
A
Avi Kivity 已提交
1217
{
1218
	struct kvm_mmu *context = &vcpu->arch.mmu;
A
Avi Kivity 已提交
1219 1220 1221 1222 1223

	ASSERT(is_pae(vcpu));
	context->new_cr3 = paging_new_cr3;
	context->page_fault = paging64_page_fault;
	context->gva_to_gpa = paging64_gva_to_gpa;
1224
	context->prefetch_page = paging64_prefetch_page;
A
Avi Kivity 已提交
1225
	context->free = paging_free;
1226 1227
	context->root_level = level;
	context->shadow_root_level = level;
A
Avi Kivity 已提交
1228
	context->root_hpa = INVALID_PAGE;
A
Avi Kivity 已提交
1229 1230 1231
	return 0;
}

1232 1233 1234 1235 1236
static int paging64_init_context(struct kvm_vcpu *vcpu)
{
	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
}

A
Avi Kivity 已提交
1237 1238
static int paging32_init_context(struct kvm_vcpu *vcpu)
{
1239
	struct kvm_mmu *context = &vcpu->arch.mmu;
A
Avi Kivity 已提交
1240 1241 1242 1243 1244

	context->new_cr3 = paging_new_cr3;
	context->page_fault = paging32_page_fault;
	context->gva_to_gpa = paging32_gva_to_gpa;
	context->free = paging_free;
1245
	context->prefetch_page = paging32_prefetch_page;
A
Avi Kivity 已提交
1246 1247
	context->root_level = PT32_ROOT_LEVEL;
	context->shadow_root_level = PT32E_ROOT_LEVEL;
A
Avi Kivity 已提交
1248
	context->root_hpa = INVALID_PAGE;
A
Avi Kivity 已提交
1249 1250 1251 1252 1253
	return 0;
}

static int paging32E_init_context(struct kvm_vcpu *vcpu)
{
1254
	return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
A
Avi Kivity 已提交
1255 1256 1257 1258 1259
}

static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
1260
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
A
Avi Kivity 已提交
1261 1262 1263

	if (!is_paging(vcpu))
		return nonpaging_init_context(vcpu);
A
Avi Kivity 已提交
1264
	else if (is_long_mode(vcpu))
A
Avi Kivity 已提交
1265 1266 1267 1268 1269 1270 1271 1272 1273 1274
		return paging64_init_context(vcpu);
	else if (is_pae(vcpu))
		return paging32E_init_context(vcpu);
	else
		return paging32_init_context(vcpu);
}

static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
1275 1276 1277
	if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
		vcpu->arch.mmu.free(vcpu);
		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
A
Avi Kivity 已提交
1278 1279 1280 1281
	}
}

int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
1282 1283 1284 1285
{
	destroy_kvm_mmu(vcpu);
	return init_kvm_mmu(vcpu);
}
1286
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
A
Avi Kivity 已提交
1287 1288

int kvm_mmu_load(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
1289
{
1290 1291
	int r;

1292
	r = mmu_topup_memory_caches(vcpu);
A
Avi Kivity 已提交
1293 1294
	if (r)
		goto out;
1295
	spin_lock(&vcpu->kvm->mmu_lock);
1296
	kvm_mmu_free_some_pages(vcpu);
A
Avi Kivity 已提交
1297
	mmu_alloc_roots(vcpu);
1298
	spin_unlock(&vcpu->kvm->mmu_lock);
1299
	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
A
Avi Kivity 已提交
1300
	kvm_mmu_flush_tlb(vcpu);
1301 1302
out:
	return r;
A
Avi Kivity 已提交
1303
}
A
Avi Kivity 已提交
1304 1305 1306 1307 1308 1309
EXPORT_SYMBOL_GPL(kvm_mmu_load);

void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}
A
Avi Kivity 已提交
1310

1311
static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1312
				  struct kvm_mmu_page *sp,
1313 1314 1315 1316 1317 1318
				  u64 *spte)
{
	u64 pte;
	struct kvm_mmu_page *child;

	pte = *spte;
1319
	if (is_shadow_present_pte(pte)) {
1320
		if (sp->role.level == PT_PAGE_TABLE_LEVEL)
1321
			rmap_remove(vcpu->kvm, spte);
1322 1323
		else {
			child = page_header(pte & PT64_BASE_ADDR_MASK);
1324
			mmu_page_remove_parent_pte(child, spte);
1325 1326
		}
	}
1327
	set_shadow_pte(spte, shadow_trap_nonpresent_pte);
1328 1329
}

1330
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1331
				  struct kvm_mmu_page *sp,
1332
				  u64 *spte,
1333
				  const void *new)
1334
{
1335
	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
A
Avi Kivity 已提交
1336
		++vcpu->kvm->stat.mmu_pde_zapped;
1337
		return;
A
Avi Kivity 已提交
1338
	}
1339

A
Avi Kivity 已提交
1340
	++vcpu->kvm->stat.mmu_pte_updated;
1341
	if (sp->role.glevels == PT32_ROOT_LEVEL)
1342
		paging32_update_pte(vcpu, sp, spte, new);
1343
	else
1344
		paging64_update_pte(vcpu, sp, spte, new);
1345 1346
}

1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367
static bool need_remote_flush(u64 old, u64 new)
{
	if (!is_shadow_present_pte(old))
		return false;
	if (!is_shadow_present_pte(new))
		return true;
	if ((old ^ new) & PT64_BASE_ADDR_MASK)
		return true;
	old ^= PT64_NX_MASK;
	new ^= PT64_NX_MASK;
	return (old & ~new & PT64_PERM_MASK) != 0;
}

static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
{
	if (need_remote_flush(old, new))
		kvm_flush_remote_tlbs(vcpu->kvm);
	else
		kvm_mmu_flush_tlb(vcpu);
}

1368 1369
static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
{
1370
	u64 *spte = vcpu->arch.last_pte_updated;
1371 1372 1373 1374

	return !!(spte && (*spte & PT_ACCESSED_MASK));
}

1375 1376 1377 1378 1379 1380
static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
					  const u8 *new, int bytes)
{
	gfn_t gfn;
	int r;
	u64 gpte = 0;
1381
	struct page *page;
1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408

	if (bytes != 4 && bytes != 8)
		return;

	/*
	 * Assume that the pte write is on a page table of the same type
	 * as the current vcpu paging mode.  This is nearly always true
	 * (might be false while changing modes).  Note it is verified later
	 * by update_pte().
	 */
	if (is_pae(vcpu)) {
		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
		if ((bytes == 4) && (gpa % 4 == 0)) {
			r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
			if (r)
				return;
			memcpy((void *)&gpte + (gpa % 8), new, 4);
		} else if ((bytes == 8) && (gpa % 8 == 0)) {
			memcpy((void *)&gpte, new, 8);
		}
	} else {
		if ((bytes == 4) && (gpa % 4 == 0))
			memcpy((void *)&gpte, new, 4);
	}
	if (!is_present_pte(gpte))
		return;
	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
1409

1410
	down_read(&vcpu->kvm->slots_lock);
1411
	page = gfn_to_page(vcpu->kvm, gfn);
1412
	up_read(&vcpu->kvm->slots_lock);
1413

1414 1415 1416 1417
	if (is_error_page(page)) {
		kvm_release_page_clean(page);
		return;
	}
1418
	vcpu->arch.update_pte.gfn = gfn;
1419
	vcpu->arch.update_pte.page = page;
1420 1421
}

1422
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1423
		       const u8 *new, int bytes)
1424
{
1425
	gfn_t gfn = gpa >> PAGE_SHIFT;
1426
	struct kvm_mmu_page *sp;
1427
	struct hlist_node *node, *n;
1428 1429
	struct hlist_head *bucket;
	unsigned index;
1430
	u64 entry, gentry;
1431 1432
	u64 *spte;
	unsigned offset = offset_in_page(gpa);
1433
	unsigned pte_size;
1434
	unsigned page_offset;
1435
	unsigned misaligned;
1436
	unsigned quadrant;
1437
	int level;
1438
	int flooded = 0;
1439
	int npte;
1440
	int r;
1441

1442
	pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1443
	mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
1444
	spin_lock(&vcpu->kvm->mmu_lock);
1445
	kvm_mmu_free_some_pages(vcpu);
A
Avi Kivity 已提交
1446
	++vcpu->kvm->stat.mmu_pte_write;
1447
	kvm_mmu_audit(vcpu, "pre pte write");
1448
	if (gfn == vcpu->arch.last_pt_write_gfn
1449
	    && !last_updated_pte_accessed(vcpu)) {
1450 1451
		++vcpu->arch.last_pt_write_count;
		if (vcpu->arch.last_pt_write_count >= 3)
1452 1453
			flooded = 1;
	} else {
1454 1455 1456
		vcpu->arch.last_pt_write_gfn = gfn;
		vcpu->arch.last_pt_write_count = 1;
		vcpu->arch.last_pte_updated = NULL;
1457
	}
1458
	index = kvm_page_table_hashfn(gfn);
1459
	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1460 1461
	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
		if (sp->gfn != gfn || sp->role.metaphysical)
1462
			continue;
1463
		pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1464
		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1465
		misaligned |= bytes < 4;
1466
		if (misaligned || flooded) {
1467 1468 1469 1470
			/*
			 * Misaligned accesses are too much trouble to fix
			 * up; also, they usually indicate a page is not used
			 * as a page table.
1471 1472 1473 1474 1475
			 *
			 * If we're seeing too many writes to a page,
			 * it may no longer be a page table, or we may be
			 * forking, in which case it is better to unmap the
			 * page.
1476 1477
			 */
			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1478 1479
				 gpa, bytes, sp->role.word);
			kvm_mmu_zap_page(vcpu->kvm, sp);
A
Avi Kivity 已提交
1480
			++vcpu->kvm->stat.mmu_flooded;
1481 1482
			continue;
		}
1483
		page_offset = offset;
1484
		level = sp->role.level;
1485
		npte = 1;
1486
		if (sp->role.glevels == PT32_ROOT_LEVEL) {
1487 1488 1489 1490 1491 1492 1493
			page_offset <<= 1;	/* 32->64 */
			/*
			 * A 32-bit pde maps 4MB while the shadow pdes map
			 * only 2MB.  So we need to double the offset again
			 * and zap two pdes instead of one.
			 */
			if (level == PT32_ROOT_LEVEL) {
1494
				page_offset &= ~7; /* kill rounding error */
1495 1496 1497
				page_offset <<= 1;
				npte = 2;
			}
1498
			quadrant = page_offset >> PAGE_SHIFT;
1499
			page_offset &= ~PAGE_MASK;
1500
			if (quadrant != sp->role.quadrant)
1501
				continue;
1502
		}
1503
		spte = &sp->spt[page_offset / sizeof(*spte)];
1504 1505 1506 1507 1508 1509 1510 1511 1512
		if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
			gentry = 0;
			r = kvm_read_guest_atomic(vcpu->kvm,
						  gpa & ~(u64)(pte_size - 1),
						  &gentry, pte_size);
			new = (const void *)&gentry;
			if (r < 0)
				new = NULL;
		}
1513
		while (npte--) {
1514
			entry = *spte;
1515
			mmu_pte_write_zap_pte(vcpu, sp, spte);
1516 1517
			if (new)
				mmu_pte_write_new_pte(vcpu, sp, spte, new);
1518
			mmu_pte_write_flush_tlb(vcpu, entry, *spte);
1519
			++spte;
1520 1521
		}
	}
1522
	kvm_mmu_audit(vcpu, "post pte write");
1523
	spin_unlock(&vcpu->kvm->mmu_lock);
1524 1525 1526 1527
	if (vcpu->arch.update_pte.page) {
		kvm_release_page_clean(vcpu->arch.update_pte.page);
		vcpu->arch.update_pte.page = NULL;
	}
1528 1529
}

1530 1531
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
1532 1533
	gpa_t gpa;
	int r;
1534

1535
	down_read(&vcpu->kvm->slots_lock);
1536
	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
1537
	up_read(&vcpu->kvm->slots_lock);
1538

1539
	spin_lock(&vcpu->kvm->mmu_lock);
1540
	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1541
	spin_unlock(&vcpu->kvm->mmu_lock);
1542
	return r;
1543 1544
}

1545
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
1546
{
1547
	while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
1548
		struct kvm_mmu_page *sp;
A
Avi Kivity 已提交
1549

1550
		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
1551 1552
				  struct kvm_mmu_page, link);
		kvm_mmu_zap_page(vcpu->kvm, sp);
A
Avi Kivity 已提交
1553
		++vcpu->kvm->stat.mmu_recycled;
A
Avi Kivity 已提交
1554 1555 1556
	}
}

1557 1558 1559 1560 1561
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
{
	int r;
	enum emulation_result er;

1562
	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
1563 1564 1565 1566 1567 1568 1569 1570
	if (r < 0)
		goto out;

	if (!r) {
		r = 1;
		goto out;
	}

1571 1572 1573 1574
	r = mmu_topup_memory_caches(vcpu);
	if (r)
		goto out;

1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593
	er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);

	switch (er) {
	case EMULATE_DONE:
		return 1;
	case EMULATE_DO_MMIO:
		++vcpu->stat.mmio_exits;
		return 0;
	case EMULATE_FAIL:
		kvm_report_emulation_failure(vcpu, "pagetable");
		return 1;
	default:
		BUG();
	}
out:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);

1594 1595 1596 1597 1598 1599
void kvm_enable_tdp(void)
{
	tdp_enabled = true;
}
EXPORT_SYMBOL_GPL(kvm_enable_tdp);

A
Avi Kivity 已提交
1600 1601
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
1602
	struct kvm_mmu_page *sp;
A
Avi Kivity 已提交
1603

1604 1605
	while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
		sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
1606 1607
				  struct kvm_mmu_page, link);
		kvm_mmu_zap_page(vcpu->kvm, sp);
1608
	}
1609
	free_page((unsigned long)vcpu->arch.mmu.pae_root);
A
Avi Kivity 已提交
1610 1611 1612 1613
}

static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
1614
	struct page *page;
A
Avi Kivity 已提交
1615 1616 1617 1618
	int i;

	ASSERT(vcpu);

1619 1620 1621
	if (vcpu->kvm->arch.n_requested_mmu_pages)
		vcpu->kvm->arch.n_free_mmu_pages =
					vcpu->kvm->arch.n_requested_mmu_pages;
1622
	else
1623 1624
		vcpu->kvm->arch.n_free_mmu_pages =
					vcpu->kvm->arch.n_alloc_mmu_pages;
1625 1626 1627 1628 1629 1630 1631 1632
	/*
	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
	 * Therefore we need to allocate shadow page tables in the first
	 * 4GB of memory, which happens to fit the DMA32 zone.
	 */
	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
	if (!page)
		goto error_1;
1633
	vcpu->arch.mmu.pae_root = page_address(page);
1634
	for (i = 0; i < 4; ++i)
1635
		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1636

A
Avi Kivity 已提交
1637 1638 1639 1640 1641 1642 1643
	return 0;

error_1:
	free_mmu_pages(vcpu);
	return -ENOMEM;
}

1644
int kvm_mmu_create(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
1645 1646
{
	ASSERT(vcpu);
1647
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
A
Avi Kivity 已提交
1648

1649 1650
	return alloc_mmu_pages(vcpu);
}
A
Avi Kivity 已提交
1651

1652 1653 1654
int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
1655
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1656

1657
	return init_kvm_mmu(vcpu);
A
Avi Kivity 已提交
1658 1659 1660 1661 1662 1663 1664 1665
}

void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);

	destroy_kvm_mmu(vcpu);
	free_mmu_pages(vcpu);
1666
	mmu_free_memory_caches(vcpu);
A
Avi Kivity 已提交
1667 1668
}

1669
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
A
Avi Kivity 已提交
1670
{
1671
	struct kvm_mmu_page *sp;
A
Avi Kivity 已提交
1672

1673
	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
A
Avi Kivity 已提交
1674 1675 1676
		int i;
		u64 *pt;

1677
		if (!test_bit(slot, &sp->slot_bitmap))
A
Avi Kivity 已提交
1678 1679
			continue;

1680
		pt = sp->spt;
A
Avi Kivity 已提交
1681 1682
		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
			/* avoid RMW */
1683
			if (pt[i] & PT_WRITABLE_MASK)
A
Avi Kivity 已提交
1684 1685 1686
				pt[i] &= ~PT_WRITABLE_MASK;
	}
}
1687

1688
void kvm_mmu_zap_all(struct kvm *kvm)
D
Dor Laor 已提交
1689
{
1690
	struct kvm_mmu_page *sp, *node;
D
Dor Laor 已提交
1691

1692
	spin_lock(&kvm->mmu_lock);
1693
	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
1694
		kvm_mmu_zap_page(kvm, sp);
1695
	spin_unlock(&kvm->mmu_lock);
D
Dor Laor 已提交
1696

1697
	kvm_flush_remote_tlbs(kvm);
D
Dor Laor 已提交
1698 1699
}

1700 1701 1702 1703 1704 1705
void kvm_mmu_module_exit(void)
{
	if (pte_chain_cache)
		kmem_cache_destroy(pte_chain_cache);
	if (rmap_desc_cache)
		kmem_cache_destroy(rmap_desc_cache);
1706 1707
	if (mmu_page_header_cache)
		kmem_cache_destroy(mmu_page_header_cache);
1708 1709 1710 1711 1712 1713
}

int kvm_mmu_module_init(void)
{
	pte_chain_cache = kmem_cache_create("kvm_pte_chain",
					    sizeof(struct kvm_pte_chain),
1714
					    0, 0, NULL);
1715 1716 1717 1718
	if (!pte_chain_cache)
		goto nomem;
	rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
					    sizeof(struct kvm_rmap_desc),
1719
					    0, 0, NULL);
1720 1721 1722
	if (!rmap_desc_cache)
		goto nomem;

1723 1724
	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
						  sizeof(struct kvm_mmu_page),
1725
						  0, 0, NULL);
1726 1727 1728
	if (!mmu_page_header_cache)
		goto nomem;

1729 1730 1731 1732 1733 1734 1735
	return 0;

nomem:
	kvm_mmu_module_exit();
	return -ENOMEM;
}

1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754
/*
 * Calculate mmu pages needed for kvm.
 */
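/*
 * For example, if KVM_PERMILLE_MMU_PAGES is 20, a guest with 1,000,000
 * pages of memslot memory is given 20,000 shadow pages, but never fewer
 * than KVM_MIN_ALLOC_MMU_PAGES.
 */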
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
{
	int i;
	unsigned int nr_mmu_pages;
	unsigned int  nr_pages = 0;

	for (i = 0; i < kvm->nmemslots; i++)
		nr_pages += kvm->memslots[i].npages;

	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
	nr_mmu_pages = max(nr_mmu_pages,
			(unsigned int) KVM_MIN_ALLOC_MMU_PAGES);

	return nr_mmu_pages;
}

1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776
#ifdef AUDIT

static const char *audit_msg;

static gva_t canonicalize(gva_t gva)
{
#ifdef CONFIG_X86_64
	gva = (long long)(gva << 16) >> 16;
#endif
	return gva;
}

static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
				gva_t va, int level)
{
	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
	int i;
	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
		u64 ent = pt[i];

1777
		if (ent == shadow_trap_nonpresent_pte)
1778 1779 1780
			continue;

		va = canonicalize(va);
1781 1782 1783 1784 1785
		if (level > 1) {
			if (ent == shadow_notrap_nonpresent_pte)
				printk(KERN_ERR "audit: (%s) nontrapping pte"
				       " in nonleaf level: levels %d gva %lx"
				       " level %d pte %llx\n", audit_msg,
1786
				       vcpu->arch.mmu.root_level, va, level, ent);
1787

1788
			audit_mappings_page(vcpu, ent, va, level - 1);
1789
		} else {
1790
			gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
A
Avi Kivity 已提交
1791 1792
			struct page *page = gpa_to_page(vcpu, gpa);
			hpa_t hpa = page_to_phys(page);
1793

1794
			if (is_shadow_present_pte(ent)
1795
			    && (ent & PT64_BASE_ADDR_MASK) != hpa)
1796 1797
				printk(KERN_ERR "xx audit error: (%s) levels %d"
				       " gva %lx gpa %llx hpa %llx ent %llx %d\n",
1798
				       audit_msg, vcpu->arch.mmu.root_level,
M
Mike Day 已提交
1799 1800
				       va, gpa, hpa, ent,
				       is_shadow_present_pte(ent));
1801 1802 1803 1804
			else if (ent == shadow_notrap_nonpresent_pte
				 && !is_error_hpa(hpa))
				printk(KERN_ERR "audit: (%s) notrap shadow,"
				       " valid guest gva %lx\n", audit_msg, va);
1805
			kvm_release_page_clean(page);
1806

1807 1808 1809 1810 1811 1812
		}
	}
}

static void audit_mappings(struct kvm_vcpu *vcpu)
{
1813
	unsigned i;
1814

1815 1816
	if (vcpu->arch.mmu.root_level == 4)
		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
1817 1818
	else
		for (i = 0; i < 4; ++i)
1819
			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
1820
				audit_mappings_page(vcpu,
1821
						    vcpu->arch.mmu.pae_root[i],
1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835
						    i << 30,
						    2);
}

static int count_rmaps(struct kvm_vcpu *vcpu)
{
	int nmaps = 0;
	int i, j, k;

	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
		struct kvm_rmap_desc *d;

		for (j = 0; j < m->npages; ++j) {
1836
			unsigned long *rmapp = &m->rmap[j];
1837

1838
			if (!*rmapp)
1839
				continue;
1840
			if (!(*rmapp & 1)) {
1841 1842 1843
				++nmaps;
				continue;
			}
1844
			d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860
			while (d) {
				for (k = 0; k < RMAP_EXT; ++k)
					if (d->shadow_ptes[k])
						++nmaps;
					else
						break;
				d = d->more;
			}
		}
	}
	return nmaps;
}

static int count_writable_mappings(struct kvm_vcpu *vcpu)
{
	int nmaps = 0;
1861
	struct kvm_mmu_page *sp;
1862 1863
	int i;

1864
	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
1865
		u64 *pt = sp->spt;
1866

1867
		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894
			continue;

		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
			u64 ent = pt[i];

			if (!(ent & PT_PRESENT_MASK))
				continue;
			if (!(ent & PT_WRITABLE_MASK))
				continue;
			++nmaps;
		}
	}
	return nmaps;
}

static void audit_rmap(struct kvm_vcpu *vcpu)
{
	int n_rmap = count_rmaps(vcpu);
	int n_actual = count_writable_mappings(vcpu);

	if (n_rmap != n_actual)
		printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
		       __FUNCTION__, audit_msg, n_rmap, n_actual);
}

static void audit_write_protection(struct kvm_vcpu *vcpu)
{
1895
	struct kvm_mmu_page *sp;
1896 1897 1898
	struct kvm_memory_slot *slot;
	unsigned long *rmapp;
	gfn_t gfn;
1899

1900
	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
1901
		if (sp->role.metaphysical)
1902 1903
			continue;

1904 1905
		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
		gfn = unalias_gfn(vcpu->kvm, sp->gfn);
1906 1907
		rmapp = &slot->rmap[gfn - slot->base_gfn];
		if (*rmapp)
1908 1909
			printk(KERN_ERR "%s: (%s) shadow page has writable"
			       " mappings: gfn %lx role %x\n",
1910 1911
			       __FUNCTION__, audit_msg, sp->gfn,
			       sp->role.word);
1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927
	}
}

static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
{
	int olddbg = dbg;

	dbg = 0;
	audit_msg = msg;
	audit_rmap(vcpu);
	audit_write_protection(vcpu);
	audit_mappings(vcpu);
	dbg = olddbg;
}

#endif