/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "vmx.h"
#include "mmu.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/swap.h>

#include <asm/page.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>

/*
 * When this variable is set to true it enables Two-Dimensional-Paging,
 * where the hardware walks two page tables:
 * 1. the guest-virtual to guest-physical translation
 * 2. while doing 1., the guest-physical to host-physical translation
 * If the hardware supports that, we don't need to do shadow paging.
 */
static bool tdp_enabled = false;
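/*
 * Illustrative sketch (added for clarity, not part of the original file):
 * a hardware backend would flip this flag at module init time through
 * kvm_enable_tdp(), which is defined and exported near the end of this
 * file, once it has detected two-dimensional paging support.  The
 * capability check named below is a hypothetical placeholder:
 *
 *	if (cpu_has_two_dimensional_paging())	// hypothetical check
 *		kvm_enable_tdp();
 */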

#undef MMU_DEBUG

#undef AUDIT

#ifdef AUDIT
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
#else
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
#endif

#ifdef MMU_DEBUG

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)

#else

#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)

#endif

#if defined(MMU_DEBUG) || defined(AUDIT)
static int dbg = 1;
#endif

#ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0)
#else
#define ASSERT(x)							\
	if (!(x)) {							\
		printk(KERN_WARNING "assertion failed %s:%d: %s\n",	\
		       __FILE__, __LINE__, #x);				\
	}
#endif

#define PT64_PT_BITS 9
#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
#define PT32_PT_BITS 10
#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)

#define PT_WRITABLE_SHIFT 1

#define PT_PRESENT_MASK (1ULL << 0)
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
#define PT_USER_MASK (1ULL << 2)
#define PT_PWT_MASK (1ULL << 3)
#define PT_PCD_MASK (1ULL << 4)
#define PT_ACCESSED_MASK (1ULL << 5)
#define PT_DIRTY_MASK (1ULL << 6)
#define PT_PAGE_SIZE_MASK (1ULL << 7)
#define PT_PAT_MASK (1ULL << 7)
#define PT_GLOBAL_MASK (1ULL << 8)
#define PT64_NX_SHIFT 63
#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)

#define PT_PAT_SHIFT 7
#define PT_DIR_PAT_SHIFT 12
#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)

#define PT32_DIR_PSE36_SIZE 4
#define PT32_DIR_PSE36_SHIFT 13
#define PT32_DIR_PSE36_MASK \
	(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)


#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define VALID_PAGE(x) ((x) != INVALID_PAGE)

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)

#define PT64_LEVEL_MASK(level) \
		(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
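/*
 * Worked example (added for clarity, not in the original file): with
 * PAGE_SHIFT == 12 and PT64_LEVEL_BITS == 9, PT64_LEVEL_SHIFT(level)
 * evaluates to 12, 21, 30 and 39 for levels 1-4, so PT64_INDEX(addr, 2)
 * extracts bits 21..29 of the address, i.e. the page-directory index
 * of a 64-bit walk.
 */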


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)

#define PT32_LEVEL_MASK(level) \
		(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#define PT64_DIR_BASE_ADDR_MASK \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))

#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
			| PT64_NX_MASK)

#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
#define PFERR_USER_MASK (1U << 2)
#define PFERR_FETCH_MASK (1U << 4)

#define PT64_ROOT_LEVEL 4
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3

#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1

#define RMAP_EXT 4

#define ACC_EXEC_MASK    1
#define ACC_WRITE_MASK   PT_WRITABLE_MASK
#define ACC_USER_MASK    PT_USER_MASK
#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

struct kvm_rmap_desc {
	u64 *shadow_ptes[RMAP_EXT];
	struct kvm_rmap_desc *more;
};

static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache;

static u64 __read_mostly shadow_trap_nonpresent_pte;
static u64 __read_mostly shadow_notrap_nonpresent_pte;

void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
{
	shadow_trap_nonpresent_pte = trap_pte;
	shadow_notrap_nonpresent_pte = notrap_pte;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);

static int is_write_protection(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.cr0 & X86_CR0_WP;
}

static int is_cpuid_PSE36(void)
{
	return 1;
}

static int is_nx(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.shadow_efer & EFER_NX;
}

static int is_present_pte(unsigned long pte)
{
	return pte & PT_PRESENT_MASK;
}

static int is_shadow_present_pte(u64 pte)
{
	return pte != shadow_trap_nonpresent_pte
		&& pte != shadow_notrap_nonpresent_pte;
}

static int is_writeble_pte(unsigned long pte)
{
	return pte & PT_WRITABLE_MASK;
}

static int is_dirty_pte(unsigned long pte)
{
	return pte & PT_DIRTY_MASK;
}

static int is_rmap_pte(u64 pte)
{
	return is_shadow_present_pte(pte);
}

static gfn_t pse36_gfn_delta(u32 gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}
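/*
 * Worked example (added for clarity, not in the original file): with
 * PT32_DIR_PSE36_SHIFT == 13 and PAGE_SHIFT == 12 the shift above is
 * 32 - 13 - 12 = 7, so PDE bits 13..16 land at gfn bits 20..23, i.e.
 * physical address bits 32..35, as PSE-36 specifies.
 */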

static void set_shadow_pte(u64 *sptep, u64 spte)
{
#ifdef CONFIG_X86_64
	set_64bit((unsigned long *)sptep, spte);
#else
	set_64bit((unsigned long long *)sptep, spte);
#endif
}

static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				  struct kmem_cache *base_cache, int min)
{
	void *obj;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
		if (!obj)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = obj;
	}
	return 0;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
		kfree(mc->objects[--mc->nobjs]);
}

static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
				       int min)
{
	struct page *page;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		page = alloc_page(GFP_KERNEL);
		if (!page)
			return -ENOMEM;
		set_page_private(page, 0);
		cache->objects[cache->nobjs++] = page_address(page);
	}
	return 0;
}

static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
		free_page((unsigned long)mc->objects[--mc->nobjs]);
}

static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
{
	int r;

	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
				   pte_chain_cache, 4);
	if (r)
		goto out;
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
				   rmap_desc_cache, 1);
	if (r)
		goto out;
	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
	if (r)
		goto out;
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
				   mmu_page_header_cache, 4);
out:
	return r;
}

static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}

static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
				    size_t size)
{
	void *p;

	BUG_ON(!mc->nobjs);
	p = mc->objects[--mc->nobjs];
	memset(p, 0, size);
	return p;
}

static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
{
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
				      sizeof(struct kvm_pte_chain));
}

static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
{
	kfree(pc);
}

static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
{
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
				      sizeof(struct kvm_rmap_desc));
}

static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
{
	kfree(rd);
}

/*
 * Take gfn and return the reverse mapping to it.
 * Note: gfn must be unaliased before this function gets called.
 */

static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;

	slot = gfn_to_memslot(kvm, gfn);
	return &slot->rmap[gfn - slot->base_gfn];
}

/*
 * Reverse mapping data structures:
 *
 * If rmapp bit zero is zero, then rmapp points to the shadow page table
 * entry that points to page_address(page).
 *
 * If rmapp bit zero is one, then (rmapp & ~1) points to a struct
 * kvm_rmap_desc containing more mappings.
 */
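/*
 * Illustrative helpers (added for clarity, not part of the original file):
 * decoding the tagged rmapp word described above.  A minimal sketch; the
 * real code below open-codes these tests.
 */
static inline int rmapp_is_desc_list(unsigned long rmapp)
{
	return rmapp & 1;	/* bit zero set: points to a kvm_rmap_desc */
}

static inline struct kvm_rmap_desc *rmapp_to_desc(unsigned long rmapp)
{
	return (struct kvm_rmap_desc *)(rmapp & ~1ul);
}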
static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	struct kvm_rmap_desc *desc;
	unsigned long *rmapp;
	int i;

	if (!is_rmap_pte(*spte))
		return;
383
	gfn = unalias_gfn(vcpu->kvm, gfn);
384 385
	sp = page_header(__pa(spte));
	sp->gfns[spte - sp->spt] = gfn;
386 387
	rmapp = gfn_to_rmap(vcpu->kvm, gfn);
	if (!*rmapp) {
388
		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
389 390
		*rmapp = (unsigned long)spte;
	} else if (!(*rmapp & 1)) {
391
		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
392
		desc = mmu_alloc_rmap_desc(vcpu);
393
		desc->shadow_ptes[0] = (u64 *)*rmapp;
394
		desc->shadow_ptes[1] = spte;
395
		*rmapp = (unsigned long)desc | 1;
396 397
	} else {
		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
398
		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
399 400 401
		while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
			desc = desc->more;
		if (desc->shadow_ptes[RMAP_EXT-1]) {
402
			desc->more = mmu_alloc_rmap_desc(vcpu);
			desc = desc->more;
		}
		for (i = 0; desc->shadow_ptes[i]; ++i)
			;
		desc->shadow_ptes[i] = spte;
	}
}

411
static void rmap_desc_remove_entry(unsigned long *rmapp,
				   struct kvm_rmap_desc *desc,
				   int i,
				   struct kvm_rmap_desc *prev_desc)
{
	int j;

	for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
		;
	desc->shadow_ptes[i] = desc->shadow_ptes[j];
A
Al Viro 已提交
421
	desc->shadow_ptes[j] = NULL;
422 423 424
	if (j != 0)
		return;
	if (!prev_desc && !desc->more)
425
		*rmapp = (unsigned long)desc->shadow_ptes[0];
426 427 428 429
	else
		if (prev_desc)
			prev_desc->more = desc->more;
		else
430
			*rmapp = (unsigned long)desc->more | 1;
431
	mmu_free_rmap_desc(desc);
432 433
}

434
static void rmap_remove(struct kvm *kvm, u64 *spte)
435 436 437
{
	struct kvm_rmap_desc *desc;
	struct kvm_rmap_desc *prev_desc;
438
	struct kvm_mmu_page *sp;
A
Avi Kivity 已提交
439
	struct page *page;
440
	unsigned long *rmapp;
441 442 443 444
	int i;

	if (!is_rmap_pte(*spte))
		return;
445
	sp = page_header(__pa(spte));
A
Avi Kivity 已提交
446
	page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
447
	mark_page_accessed(page);
448
	if (is_writeble_pte(*spte))
A
Avi Kivity 已提交
449
		kvm_release_page_dirty(page);
450
	else
A
Avi Kivity 已提交
451
		kvm_release_page_clean(page);
452
	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
453
	if (!*rmapp) {
454 455
		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
		BUG();
456
	} else if (!(*rmapp & 1)) {
457
		rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
458
		if ((u64 *)*rmapp != spte) {
459 460 461 462
			printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
			       spte, *spte);
			BUG();
		}
463
		*rmapp = 0;
464 465
	} else {
		rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
466
		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
467 468 469 470
		prev_desc = NULL;
		while (desc) {
			for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
				if (desc->shadow_ptes[i] == spte) {
471
					rmap_desc_remove_entry(rmapp,
472
							       desc, i,
							       prev_desc);
					return;
				}
			prev_desc = desc;
			desc = desc->more;
		}
		BUG();
	}
}

483
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
484 485
{
	struct kvm_rmap_desc *desc;
	struct kvm_rmap_desc *prev_desc;
	u64 *prev_spte;
	int i;

	if (!*rmapp)
		return NULL;
	else if (!(*rmapp & 1)) {
		if (!spte)
			return (u64 *)*rmapp;
		return NULL;
	}
	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
	prev_desc = NULL;
	prev_spte = NULL;
	while (desc) {
		for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
			if (prev_spte == spte)
				return desc->shadow_ptes[i];
			prev_spte = desc->shadow_ptes[i];
		}
		desc = desc->more;
	}
	return NULL;
}

static void rmap_write_protect(struct kvm *kvm, u64 gfn)
{
513
	unsigned long *rmapp;
514
	u64 *spte;
515
	int write_protected = 0;
516

517 518
	gfn = unalias_gfn(kvm, gfn);
	rmapp = gfn_to_rmap(kvm, gfn);
519

520 521
	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
522 523 524
		BUG_ON(!spte);
		BUG_ON(!(*spte & PT_PRESENT_MASK));
		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
525
		if (is_writeble_pte(*spte)) {
526
			set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
527 528
			write_protected = 1;
		}
529
		spte = rmap_next(kvm, rmapp, spte);
530
	}
531 532
	if (write_protected)
		kvm_flush_remote_tlbs(kvm);
533 534
}

535
#ifdef MMU_DEBUG
536
static int is_empty_shadow_page(u64 *spt)
A
Avi Kivity 已提交
537
{
538 539 540
	u64 *pos;
	u64 *end;

541
	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
542
		if (*pos != shadow_trap_nonpresent_pte) {
543 544
			printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
			       pos, *pos);
A
Avi Kivity 已提交
545
			return 0;
546
		}
A
Avi Kivity 已提交
547 548
	return 1;
}
549
#endif
A
Avi Kivity 已提交
550

551
static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
552
{
553 554 555 556 557
	ASSERT(is_empty_shadow_page(sp->spt));
	list_del(&sp->link);
	__free_page(virt_to_page(sp->spt));
	__free_page(virt_to_page(sp->gfns));
	kfree(sp);
558
	++kvm->arch.n_free_mmu_pages;
559 560
}

561 562
static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
563
	return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
564 565
}

566 567
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
					       u64 *parent_pte)
A
Avi Kivity 已提交
568
{
569
	struct kvm_mmu_page *sp;
A
Avi Kivity 已提交
570

571 572 573
	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
	sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
574
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
575
	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
576 577 578 579
	ASSERT(is_empty_shadow_page(sp->spt));
	sp->slot_bitmap = 0;
	sp->multimapped = 0;
	sp->parent_pte = parent_pte;
580
	--vcpu->kvm->arch.n_free_mmu_pages;
581
	return sp;
A
Avi Kivity 已提交
582 583
}

584
static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
585
				    struct kvm_mmu_page *sp, u64 *parent_pte)
586 587 588 589 590 591 592
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	int i;

	if (!parent_pte)
		return;
593 594
	if (!sp->multimapped) {
		u64 *old = sp->parent_pte;
595 596

		if (!old) {
597
			sp->parent_pte = parent_pte;
598 599
			return;
		}
600
		sp->multimapped = 1;
601
		pte_chain = mmu_alloc_pte_chain(vcpu);
602 603
		INIT_HLIST_HEAD(&sp->parent_ptes);
		hlist_add_head(&pte_chain->link, &sp->parent_ptes);
604 605
		pte_chain->parent_ptes[0] = old;
	}
606
	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
		if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
			continue;
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
			if (!pte_chain->parent_ptes[i]) {
				pte_chain->parent_ptes[i] = parent_pte;
				return;
			}
	}
615
	pte_chain = mmu_alloc_pte_chain(vcpu);
616
	BUG_ON(!pte_chain);
617
	hlist_add_head(&pte_chain->link, &sp->parent_ptes);
618 619 620
	pte_chain->parent_ptes[0] = parent_pte;
}

621
static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
622 623 624 625 626 627
				       u64 *parent_pte)
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	int i;

628 629 630
	if (!sp->multimapped) {
		BUG_ON(sp->parent_pte != parent_pte);
		sp->parent_pte = NULL;
631 632
		return;
	}
633
	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
634 635 636 637 638
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
			if (!pte_chain->parent_ptes[i])
				break;
			if (pte_chain->parent_ptes[i] != parent_pte)
				continue;
639 640
			while (i + 1 < NR_PTE_CHAIN_ENTRIES
				&& pte_chain->parent_ptes[i + 1]) {
641 642 643 644 645
				pte_chain->parent_ptes[i]
					= pte_chain->parent_ptes[i + 1];
				++i;
			}
			pte_chain->parent_ptes[i] = NULL;
646 647
			if (i == 0) {
				hlist_del(&pte_chain->link);
648
				mmu_free_pte_chain(pte_chain);
649 650 651
				if (hlist_empty(&sp->parent_ptes)) {
					sp->multimapped = 0;
					sp->parent_pte = NULL;
652 653
				}
			}
654 655 656 657 658
			return;
		}
	BUG();
}

659
static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
660 661 662
{
	unsigned index;
	struct hlist_head *bucket;
663
	struct kvm_mmu_page *sp;
664 665 666
	struct hlist_node *node;

	pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
667
	index = kvm_page_table_hashfn(gfn);
668
	bucket = &kvm->arch.mmu_page_hash[index];
669 670
	hlist_for_each_entry(sp, node, bucket, hash_link)
		if (sp->gfn == gfn && !sp->role.metaphysical) {
671
			pgprintk("%s: found role %x\n",
672 673
				 __FUNCTION__, sp->role.word);
			return sp;
		}
	return NULL;
}

static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
					     gfn_t gfn,
					     gva_t gaddr,
					     unsigned level,
					     int metaphysical,
683
					     unsigned access,
684
					     u64 *parent_pte)
685 686 687 688 689
{
	union kvm_mmu_page_role role;
	unsigned index;
	unsigned quadrant;
	struct hlist_head *bucket;
690
	struct kvm_mmu_page *sp;
691 692 693
	struct hlist_node *node;

	role.word = 0;
694
	role.glevels = vcpu->arch.mmu.root_level;
695 696
	role.level = level;
	role.metaphysical = metaphysical;
697
	role.access = access;
698
	if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
699 700 701 702 703 704
		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
		role.quadrant = quadrant;
	}
	pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
		 gfn, role.word);
705
	index = kvm_page_table_hashfn(gfn);
706
	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
707 708 709
	hlist_for_each_entry(sp, node, bucket, hash_link)
		if (sp->gfn == gfn && sp->role.word == role.word) {
			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
710
			pgprintk("%s: found\n", __FUNCTION__);
711
			return sp;
712
		}
A
Avi Kivity 已提交
713
	++vcpu->kvm->stat.mmu_cache_miss;
714 715 716
	sp = kvm_mmu_alloc_page(vcpu, parent_pte);
	if (!sp)
		return sp;
717
	pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
718 719 720
	sp->gfn = gfn;
	sp->role = role;
	hlist_add_head(&sp->hash_link, bucket);
721
	vcpu->arch.mmu.prefetch_page(vcpu, sp);
722
	if (!metaphysical)
723
		rmap_write_protect(vcpu->kvm, gfn);
724
	return sp;
725 726
}

727
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
728
					 struct kvm_mmu_page *sp)
729
{
730 731 732 733
	unsigned i;
	u64 *pt;
	u64 ent;

734
	pt = sp->spt;
735

736
	if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
737
		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
738
			if (is_shadow_present_pte(pt[i]))
739
				rmap_remove(kvm, &pt[i]);
740
			pt[i] = shadow_trap_nonpresent_pte;
741
		}
742
		kvm_flush_remote_tlbs(kvm);
743 744 745 746 747 748
		return;
	}

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
		ent = pt[i];

749 750
		pt[i] = shadow_trap_nonpresent_pte;
		if (!is_shadow_present_pte(ent))
751 752
			continue;
		ent &= PT64_BASE_ADDR_MASK;
753
		mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
754
	}
755
	kvm_flush_remote_tlbs(kvm);
756 757
}

758
static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
759
{
760
	mmu_page_remove_parent_pte(sp, parent_pte);
761 762
}

763 764 765 766 767 768
static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
{
	int i;

	for (i = 0; i < KVM_MAX_VCPUS; ++i)
		if (kvm->vcpus[i])
769
			kvm->vcpus[i]->arch.last_pte_updated = NULL;
770 771
}

772
static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
773 774 775
{
	u64 *parent_pte;

A
Avi Kivity 已提交
776
	++kvm->stat.mmu_shadow_zapped;
777 778 779
	while (sp->multimapped || sp->parent_pte) {
		if (!sp->multimapped)
			parent_pte = sp->parent_pte;
780 781 782
		else {
			struct kvm_pte_chain *chain;

783
			chain = container_of(sp->parent_ptes.first,
784 785 786
					     struct kvm_pte_chain, link);
			parent_pte = chain->parent_ptes[0];
		}
787
		BUG_ON(!parent_pte);
788
		kvm_mmu_put_page(sp, parent_pte);
789
		set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
790
	}
791 792 793 794
	kvm_mmu_page_unlink_children(kvm, sp);
	if (!sp->root_count) {
		hlist_del(&sp->hash_link);
		kvm_mmu_free_page(kvm, sp);
A
Avi Kivity 已提交
795
	} else
796
		list_move(&sp->link, &kvm->arch.active_mmu_pages);
797
	kvm_mmu_reset_last_pte_updated(kvm);
798 799
}

/*
 * Changing the number of mmu pages allocated to the vm.
 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
 */
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
{
	/*
	 * If we set the number of mmu pages to be smaller than the
	 * number of active pages, we must free some mmu pages before we
	 * change the value.
	 */

812
	if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
813
	    kvm_nr_mmu_pages) {
814 815
		int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
				       - kvm->arch.n_free_mmu_pages;
816 817 818 819

		while (n_used_mmu_pages > kvm_nr_mmu_pages) {
			struct kvm_mmu_page *page;

820
			page = container_of(kvm->arch.active_mmu_pages.prev,
821 822 823 824
					    struct kvm_mmu_page, link);
			kvm_mmu_zap_page(kvm, page);
			n_used_mmu_pages--;
		}
825
		kvm->arch.n_free_mmu_pages = 0;
826 827
	}
	else
828 829
		kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
					 - kvm->arch.n_alloc_mmu_pages;
830

831
	kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
832 833
}

834
static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
835 836 837
{
	unsigned index;
	struct hlist_head *bucket;
838
	struct kvm_mmu_page *sp;
839 840 841 842 843
	struct hlist_node *node, *n;
	int r;

	pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
	r = 0;
844
	index = kvm_page_table_hashfn(gfn);
845
	bucket = &kvm->arch.mmu_page_hash[index];
846 847
	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
		if (sp->gfn == gfn && !sp->role.metaphysical) {
848
			pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
849 850
				 sp->role.word);
			kvm_mmu_zap_page(kvm, sp);
851 852 853
			r = 1;
		}
	return r;
854 855
}

856
static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
857
{
858
	struct kvm_mmu_page *sp;
859

860 861 862
	while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
		pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
		kvm_mmu_zap_page(kvm, sp);
863 864 865
	}
}

866
static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
A
Avi Kivity 已提交
867
{
868
	int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
869
	struct kvm_mmu_page *sp = page_header(__pa(pte));
A
Avi Kivity 已提交
870

871
	__set_bit(slot, &sp->slot_bitmap);
A
Avi Kivity 已提交
872 873
}

874 875
struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
{
876 877
	struct page *page;

878
	gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
879 880 881

	if (gpa == UNMAPPED_GVA)
		return NULL;
882 883 884 885 886 887

	down_read(&current->mm->mmap_sem);
	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
	up_read(&current->mm->mmap_sem);

	return page;
888 889
}

890 891 892
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
			 unsigned pt_access, unsigned pte_access,
			 int user_fault, int write_fault, int dirty,
893
			 int *ptwrite, gfn_t gfn, struct page *page)
894 895
{
	u64 spte;
896
	int was_rmapped = 0;
897
	int was_writeble = is_writeble_pte(*shadow_pte);
898
	hfn_t host_pfn = (*shadow_pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
899

900
	pgprintk("%s: spte %llx access %x write_fault %d"
901
		 " user_fault %d gfn %lx\n",
902
		 __FUNCTION__, *shadow_pte, pt_access,
903 904
		 write_fault, user_fault, gfn);

	if (is_rmap_pte(*shadow_pte)) {
		if (host_pfn != page_to_pfn(page)) {
			pgprintk("hfn old %lx new %lx\n",
				 host_pfn, page_to_pfn(page));
			rmap_remove(vcpu->kvm, shadow_pte);
		}
		else
			was_rmapped = 1;
	}

	/*
	 * We don't set the accessed bit, since we sometimes want to see
	 * whether the guest actually used the pte (in order to detect
	 * demand paging).
	 */
	spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
	if (!dirty)
		pte_access &= ~ACC_WRITE_MASK;
	if (!(pte_access & ACC_EXEC_MASK))
		spte |= PT64_NX_MASK;

	spte |= PT_PRESENT_MASK;
	if (pte_access & ACC_USER_MASK)
		spte |= PT_USER_MASK;

	spte |= page_to_phys(page);

	if ((pte_access & ACC_WRITE_MASK)
	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
		struct kvm_mmu_page *shadow;

		spte |= PT_WRITABLE_MASK;
		if (user_fault) {
			mmu_unshadow(vcpu->kvm, gfn);
			goto unshadowed;
		}

		shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
		if (shadow) {
			pgprintk("%s: found shadow page for %lx, marking ro\n",
				 __FUNCTION__, gfn);
			pte_access &= ~ACC_WRITE_MASK;
			if (is_writeble_pte(spte)) {
				spte &= ~PT_WRITABLE_MASK;
				kvm_x86_ops->tlb_flush(vcpu);
			}
			if (write_fault)
				*ptwrite = 1;
		}
	}

unshadowed:

	if (pte_access & ACC_WRITE_MASK)
		mark_page_dirty(vcpu->kvm, gfn);

	pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
	set_shadow_pte(shadow_pte, spte);
	page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
	if (!was_rmapped) {
		rmap_add(vcpu, shadow_pte, gfn);
		if (!is_rmap_pte(*shadow_pte))
			kvm_release_page_clean(page);
968 969 970 971 972
	} else {
		if (was_writeble)
			kvm_release_page_dirty(page);
		else
			kvm_release_page_clean(page);
973 974
	}
	if (!ptwrite || !*ptwrite)
975
		vcpu->arch.last_pte_updated = shadow_pte;
976 977
}

A
Avi Kivity 已提交
978 979 980 981
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}

982 983
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
			   gfn_t gfn, struct page *page, int level)
A
Avi Kivity 已提交
984
{
985
	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
986
	int pt_write = 0;

	for (; ; level--) {
		u32 index = PT64_INDEX(v, level);
		u64 *table;

		ASSERT(VALID_PAGE(table_addr));
		table = __va(table_addr);

		if (level == 1) {
996
			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
997
				     0, write, 1, &pt_write, gfn, page);
998
			return pt_write;
A
Avi Kivity 已提交
999 1000
		}

1001
		if (table[index] == shadow_trap_nonpresent_pte) {
1002
			struct kvm_mmu_page *new_table;
1003
			gfn_t pseudo_gfn;
A
Avi Kivity 已提交
1004

1005 1006 1007 1008
			pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
				>> PAGE_SHIFT;
			new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
						     v, level - 1,
1009
						     1, ACC_ALL, &table[index]);
1010
			if (!new_table) {
A
Avi Kivity 已提交
1011
				pgprintk("nonpaging_map: ENOMEM\n");
1012
				kvm_release_page_clean(page);
A
Avi Kivity 已提交
1013 1014 1015
				return -ENOMEM;
			}

1016
			table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
1017
				| PT_WRITABLE_MASK | PT_USER_MASK;
A
Avi Kivity 已提交
1018 1019 1020 1021 1022
		}
		table_addr = table[index] & PT64_BASE_ADDR_MASK;
	}
}

1023 1024 1025 1026
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
	int r;

1027 1028
	struct page *page;

1029 1030
	down_read(&vcpu->kvm->slots_lock);

1031 1032
	down_read(&current->mm->mmap_sem);
	page = gfn_to_page(vcpu->kvm, gfn);
1033
	up_read(&current->mm->mmap_sem);
1034

1035 1036 1037 1038 1039 1040 1041
	/* mmio */
	if (is_error_page(page)) {
		kvm_release_page_clean(page);
		up_read(&vcpu->kvm->slots_lock);
		return 1;
	}

1042
	spin_lock(&vcpu->kvm->mmu_lock);
1043
	kvm_mmu_free_some_pages(vcpu);
1044
	r = __direct_map(vcpu, v, write, gfn, page, PT32E_ROOT_LEVEL);
1045 1046
	spin_unlock(&vcpu->kvm->mmu_lock);

1047
	up_read(&vcpu->kvm->slots_lock);
1048

1049 1050 1051 1052
	return r;
}


static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp)
{
	int i;

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
		sp->spt[i] = shadow_trap_nonpresent_pte;
}

1062 1063 1064
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
	int i;
1065
	struct kvm_mmu_page *sp;
1066

1067
	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
A
Avi Kivity 已提交
1068
		return;
1069
	spin_lock(&vcpu->kvm->mmu_lock);
1070
#ifdef CONFIG_X86_64
1071 1072
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;
1073

1074 1075
		sp = page_header(root);
		--sp->root_count;
1076
		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1077
		spin_unlock(&vcpu->kvm->mmu_lock);
1078 1079 1080 1081
		return;
	}
#endif
	for (i = 0; i < 4; ++i) {
1082
		hpa_t root = vcpu->arch.mmu.pae_root[i];
1083

A
Avi Kivity 已提交
1084 1085
		if (root) {
			root &= PT64_BASE_ADDR_MASK;
1086 1087
			sp = page_header(root);
			--sp->root_count;
A
Avi Kivity 已提交
1088
		}
1089
		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1090
	}
1091
	spin_unlock(&vcpu->kvm->mmu_lock);
1092
	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1093 1094 1095 1096 1097
}

static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
	int i;
1098
	gfn_t root_gfn;
1099
	struct kvm_mmu_page *sp;
1100
	int metaphysical = 0;
1101

1102
	root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1103 1104

#ifdef CONFIG_X86_64
1105 1106
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;
1107 1108

		ASSERT(!VALID_PAGE(root));
1109 1110
		if (tdp_enabled)
			metaphysical = 1;
1111
		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1112 1113
				      PT64_ROOT_LEVEL, metaphysical,
				      ACC_ALL, NULL);
1114 1115
		root = __pa(sp->spt);
		++sp->root_count;
1116
		vcpu->arch.mmu.root_hpa = root;
1117 1118 1119
		return;
	}
#endif
1120 1121 1122
	metaphysical = !is_paging(vcpu);
	if (tdp_enabled)
		metaphysical = 1;
1123
	for (i = 0; i < 4; ++i) {
1124
		hpa_t root = vcpu->arch.mmu.pae_root[i];
1125 1126

		ASSERT(!VALID_PAGE(root));
1127 1128 1129
		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
			if (!is_present_pte(vcpu->arch.pdptrs[i])) {
				vcpu->arch.mmu.pae_root[i] = 0;
A
Avi Kivity 已提交
1130 1131
				continue;
			}
1132 1133
			root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
		} else if (vcpu->arch.mmu.root_level == 0)
1134
			root_gfn = 0;
1135
		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1136
				      PT32_ROOT_LEVEL, metaphysical,
1137
				      ACC_ALL, NULL);
1138 1139
		root = __pa(sp->spt);
		++sp->root_count;
1140
		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
1141
	}
1142
	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1143 1144
}

static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
	return vaddr;
}

static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
A
Avi Kivity 已提交
1151
				u32 error_code)
A
Avi Kivity 已提交
1152
{
1153
	gfn_t gfn;
1154
	int r;
A
Avi Kivity 已提交
1155

1156
	pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
1157 1158 1159
	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;
1160

A
Avi Kivity 已提交
1161
	ASSERT(vcpu);
1162
	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
A
Avi Kivity 已提交
1163

1164
	gfn = gva >> PAGE_SHIFT;
A
Avi Kivity 已提交
1165

1166 1167
	return nonpaging_map(vcpu, gva & PAGE_MASK,
			     error_code & PFERR_WRITE_MASK, gfn);
A
Avi Kivity 已提交
1168 1169
}

static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
				u32 error_code)
{
	struct page *page;
	int r;

	ASSERT(vcpu);
	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	down_read(&current->mm->mmap_sem);
	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
	if (is_error_page(page)) {
		kvm_release_page_clean(page);
		up_read(&current->mm->mmap_sem);
		return 1;
	}
	spin_lock(&vcpu->kvm->mmu_lock);
	kvm_mmu_free_some_pages(vcpu);
	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
			 gpa >> PAGE_SHIFT, page, TDP_ROOT_LEVEL);
	spin_unlock(&vcpu->kvm->mmu_lock);
	up_read(&current->mm->mmap_sem);

	return r;
}

A
Avi Kivity 已提交
1200 1201
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
1202
	mmu_free_roots(vcpu);
A
Avi Kivity 已提交
1203 1204 1205 1206
}

static int nonpaging_init_context(struct kvm_vcpu *vcpu)
{
1207
	struct kvm_mmu *context = &vcpu->arch.mmu;
A
Avi Kivity 已提交
1208 1209 1210 1211 1212

	context->new_cr3 = nonpaging_new_cr3;
	context->page_fault = nonpaging_page_fault;
	context->gva_to_gpa = nonpaging_gva_to_gpa;
	context->free = nonpaging_free;
1213
	context->prefetch_page = nonpaging_prefetch_page;
1214
	context->root_level = 0;
A
Avi Kivity 已提交
1215
	context->shadow_root_level = PT32E_ROOT_LEVEL;
A
Avi Kivity 已提交
1216
	context->root_hpa = INVALID_PAGE;
A
Avi Kivity 已提交
1217 1218 1219
	return 0;
}

1220
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
1221
{
A
Avi Kivity 已提交
1222
	++vcpu->stat.tlb_flush;
1223
	kvm_x86_ops->tlb_flush(vcpu);
A
Avi Kivity 已提交
1224 1225 1226 1227
}

static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
1228
	pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3);
1229
	mmu_free_roots(vcpu);
A
Avi Kivity 已提交
1230 1231 1232 1233 1234 1235
}

static void inject_page_fault(struct kvm_vcpu *vcpu,
			      u64 addr,
			      u32 err_code)
{
1236
	kvm_inject_page_fault(vcpu, addr, err_code);
}

static void paging_free(struct kvm_vcpu *vcpu)
{
	nonpaging_free(vcpu);
}

#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"
#undef PTTYPE

1252
static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
A
Avi Kivity 已提交
1253
{
1254
	struct kvm_mmu *context = &vcpu->arch.mmu;
A
Avi Kivity 已提交
1255 1256 1257 1258 1259

	ASSERT(is_pae(vcpu));
	context->new_cr3 = paging_new_cr3;
	context->page_fault = paging64_page_fault;
	context->gva_to_gpa = paging64_gva_to_gpa;
1260
	context->prefetch_page = paging64_prefetch_page;
A
Avi Kivity 已提交
1261
	context->free = paging_free;
1262 1263
	context->root_level = level;
	context->shadow_root_level = level;
A
Avi Kivity 已提交
1264
	context->root_hpa = INVALID_PAGE;
A
Avi Kivity 已提交
1265 1266 1267
	return 0;
}

1268 1269 1270 1271 1272
static int paging64_init_context(struct kvm_vcpu *vcpu)
{
	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
}

A
Avi Kivity 已提交
1273 1274
static int paging32_init_context(struct kvm_vcpu *vcpu)
{
1275
	struct kvm_mmu *context = &vcpu->arch.mmu;
A
Avi Kivity 已提交
1276 1277 1278 1279 1280

	context->new_cr3 = paging_new_cr3;
	context->page_fault = paging32_page_fault;
	context->gva_to_gpa = paging32_gva_to_gpa;
	context->free = paging_free;
1281
	context->prefetch_page = paging32_prefetch_page;
A
Avi Kivity 已提交
1282 1283
	context->root_level = PT32_ROOT_LEVEL;
	context->shadow_root_level = PT32E_ROOT_LEVEL;
A
Avi Kivity 已提交
1284
	context->root_hpa = INVALID_PAGE;
A
Avi Kivity 已提交
1285 1286 1287 1288 1289
	return 0;
}

static int paging32E_init_context(struct kvm_vcpu *vcpu)
{
1290
	return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
A
Avi Kivity 已提交
1291 1292
}

static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->arch.mmu;

	context->new_cr3 = nonpaging_new_cr3;
	context->page_fault = tdp_page_fault;
	context->free = nonpaging_free;
	context->prefetch_page = nonpaging_prefetch_page;
	context->shadow_root_level = TDP_ROOT_LEVEL;
	context->root_hpa = INVALID_PAGE;

	if (!is_paging(vcpu)) {
		context->gva_to_gpa = nonpaging_gva_to_gpa;
		context->root_level = 0;
	} else if (is_long_mode(vcpu)) {
		context->gva_to_gpa = paging64_gva_to_gpa;
		context->root_level = PT64_ROOT_LEVEL;
	} else if (is_pae(vcpu)) {
		context->gva_to_gpa = paging64_gva_to_gpa;
		context->root_level = PT32E_ROOT_LEVEL;
	} else {
		context->gva_to_gpa = paging32_gva_to_gpa;
		context->root_level = PT32_ROOT_LEVEL;
	}

	return 0;
}

static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
1322 1323
{
	ASSERT(vcpu);
1324
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
A
Avi Kivity 已提交
1325 1326 1327

	if (!is_paging(vcpu))
		return nonpaging_init_context(vcpu);
A
Avi Kivity 已提交
1328
	else if (is_long_mode(vcpu))
		return paging64_init_context(vcpu);
	else if (is_pae(vcpu))
		return paging32E_init_context(vcpu);
	else
		return paging32_init_context(vcpu);
}

static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
	if (tdp_enabled)
		return init_kvm_tdp_mmu(vcpu);
	else
		return init_kvm_softmmu(vcpu);
}

A
Avi Kivity 已提交
1344 1345 1346
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
1347 1348 1349
	if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
		vcpu->arch.mmu.free(vcpu);
		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
A
Avi Kivity 已提交
1350 1351 1352 1353
	}
}

int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
1354 1355 1356 1357
{
	destroy_kvm_mmu(vcpu);
	return init_kvm_mmu(vcpu);
}
1358
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
A
Avi Kivity 已提交
1359 1360

int kvm_mmu_load(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
1361
{
1362 1363
	int r;

1364
	r = mmu_topup_memory_caches(vcpu);
A
Avi Kivity 已提交
1365 1366
	if (r)
		goto out;
1367
	spin_lock(&vcpu->kvm->mmu_lock);
1368
	kvm_mmu_free_some_pages(vcpu);
A
Avi Kivity 已提交
1369
	mmu_alloc_roots(vcpu);
1370
	spin_unlock(&vcpu->kvm->mmu_lock);
1371
	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
A
Avi Kivity 已提交
1372
	kvm_mmu_flush_tlb(vcpu);
1373 1374
out:
	return r;
A
Avi Kivity 已提交
1375
}
EXPORT_SYMBOL_GPL(kvm_mmu_load);

void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}
A
Avi Kivity 已提交
1382

1383
static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1384
				  struct kvm_mmu_page *sp,
1385 1386 1387 1388 1389 1390
				  u64 *spte)
{
	u64 pte;
	struct kvm_mmu_page *child;

	pte = *spte;
1391
	if (is_shadow_present_pte(pte)) {
1392
		if (sp->role.level == PT_PAGE_TABLE_LEVEL)
1393
			rmap_remove(vcpu->kvm, spte);
1394 1395
		else {
			child = page_header(pte & PT64_BASE_ADDR_MASK);
1396
			mmu_page_remove_parent_pte(child, spte);
1397 1398
		}
	}
1399
	set_shadow_pte(spte, shadow_trap_nonpresent_pte);
1400 1401
}

1402
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1403
				  struct kvm_mmu_page *sp,
1404
				  u64 *spte,
1405
				  const void *new)
1406
{
1407
	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
A
Avi Kivity 已提交
1408
		++vcpu->kvm->stat.mmu_pde_zapped;
1409
		return;
A
Avi Kivity 已提交
1410
	}
1411

A
Avi Kivity 已提交
1412
	++vcpu->kvm->stat.mmu_pte_updated;
1413
	if (sp->role.glevels == PT32_ROOT_LEVEL)
1414
		paging32_update_pte(vcpu, sp, spte, new);
1415
	else
1416
		paging64_update_pte(vcpu, sp, spte, new);
1417 1418
}

static bool need_remote_flush(u64 old, u64 new)
{
	if (!is_shadow_present_pte(old))
		return false;
	if (!is_shadow_present_pte(new))
		return true;
	if ((old ^ new) & PT64_BASE_ADDR_MASK)
		return true;
	old ^= PT64_NX_MASK;
	new ^= PT64_NX_MASK;
	return (old & ~new & PT64_PERM_MASK) != 0;
}

static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
{
	if (need_remote_flush(old, new))
		kvm_flush_remote_tlbs(vcpu->kvm);
	else
		kvm_mmu_flush_tlb(vcpu);
}

1440 1441
static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
{
1442
	u64 *spte = vcpu->arch.last_pte_updated;
1443 1444 1445 1446

	return !!(spte && (*spte & PT_ACCESSED_MASK));
}

1447 1448 1449 1450 1451 1452
static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
					  const u8 *new, int bytes)
{
	gfn_t gfn;
	int r;
	u64 gpte = 0;
1453
	struct page *page;

	if (bytes != 4 && bytes != 8)
		return;

	/*
	 * Assume that the pte write is on a page table of the same type
	 * as the current vcpu paging mode.  This is nearly always true
	 * (might be false while changing modes).  Note it is verified later
	 * by update_pte().
	 */
	if (is_pae(vcpu)) {
		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
		if ((bytes == 4) && (gpa % 4 == 0)) {
			r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
			if (r)
				return;
			memcpy((void *)&gpte + (gpa % 8), new, 4);
		} else if ((bytes == 8) && (gpa % 8 == 0)) {
			memcpy((void *)&gpte, new, 8);
		}
	} else {
		if ((bytes == 4) && (gpa % 4 == 0))
			memcpy((void *)&gpte, new, 4);
	}
	if (!is_present_pte(gpte))
		return;
	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
1481

1482
	down_read(&vcpu->kvm->slots_lock);
1483
	page = gfn_to_page(vcpu->kvm, gfn);
1484
	up_read(&vcpu->kvm->slots_lock);
1485

1486 1487 1488 1489
	if (is_error_page(page)) {
		kvm_release_page_clean(page);
		return;
	}
1490
	vcpu->arch.update_pte.gfn = gfn;
1491
	vcpu->arch.update_pte.page = page;
1492 1493
}

1494
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1495
		       const u8 *new, int bytes)
1496
{
1497
	gfn_t gfn = gpa >> PAGE_SHIFT;
1498
	struct kvm_mmu_page *sp;
1499
	struct hlist_node *node, *n;
1500 1501
	struct hlist_head *bucket;
	unsigned index;
1502
	u64 entry, gentry;
1503 1504
	u64 *spte;
	unsigned offset = offset_in_page(gpa);
1505
	unsigned pte_size;
1506
	unsigned page_offset;
1507
	unsigned misaligned;
1508
	unsigned quadrant;
1509
	int level;
1510
	int flooded = 0;
1511
	int npte;
1512
	int r;
1513

1514
	pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1515
	mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
1516
	spin_lock(&vcpu->kvm->mmu_lock);
1517
	kvm_mmu_free_some_pages(vcpu);
A
Avi Kivity 已提交
1518
	++vcpu->kvm->stat.mmu_pte_write;
1519
	kvm_mmu_audit(vcpu, "pre pte write");
1520
	if (gfn == vcpu->arch.last_pt_write_gfn
1521
	    && !last_updated_pte_accessed(vcpu)) {
1522 1523
		++vcpu->arch.last_pt_write_count;
		if (vcpu->arch.last_pt_write_count >= 3)
1524 1525
			flooded = 1;
	} else {
1526 1527 1528
		vcpu->arch.last_pt_write_gfn = gfn;
		vcpu->arch.last_pt_write_count = 1;
		vcpu->arch.last_pte_updated = NULL;
1529
	}
1530
	index = kvm_page_table_hashfn(gfn);
1531
	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1532 1533
	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
		if (sp->gfn != gfn || sp->role.metaphysical)
1534
			continue;
1535
		pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1536
		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1537
		misaligned |= bytes < 4;
1538
		if (misaligned || flooded) {
1539 1540 1541 1542
			/*
			 * Misaligned accesses are too much trouble to fix
			 * up; also, they usually indicate a page is not used
			 * as a page table.
1543 1544 1545 1546 1547
			 *
			 * If we're seeing too many writes to a page,
			 * it may no longer be a page table, or we may be
			 * forking, in which case it is better to unmap the
			 * page.
1548 1549
			 */
			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1550 1551
				 gpa, bytes, sp->role.word);
			kvm_mmu_zap_page(vcpu->kvm, sp);
A
Avi Kivity 已提交
1552
			++vcpu->kvm->stat.mmu_flooded;
1553 1554
			continue;
		}
1555
		page_offset = offset;
1556
		level = sp->role.level;
1557
		npte = 1;
1558
		if (sp->role.glevels == PT32_ROOT_LEVEL) {
1559 1560 1561 1562 1563 1564 1565
			page_offset <<= 1;	/* 32->64 */
			/*
			 * A 32-bit pde maps 4MB while the shadow pdes map
			 * only 2MB.  So we need to double the offset again
			 * and zap two pdes instead of one.
			 */
			if (level == PT32_ROOT_LEVEL) {
1566
				page_offset &= ~7; /* kill rounding error */
1567 1568 1569
				page_offset <<= 1;
				npte = 2;
			}
1570
			quadrant = page_offset >> PAGE_SHIFT;
1571
			page_offset &= ~PAGE_MASK;
1572
			if (quadrant != sp->role.quadrant)
1573
				continue;
1574
		}
1575
		spte = &sp->spt[page_offset / sizeof(*spte)];
		if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
			gentry = 0;
			r = kvm_read_guest_atomic(vcpu->kvm,
						  gpa & ~(u64)(pte_size - 1),
						  &gentry, pte_size);
			new = (const void *)&gentry;
			if (r < 0)
				new = NULL;
		}
1585
		while (npte--) {
1586
			entry = *spte;
1587
			mmu_pte_write_zap_pte(vcpu, sp, spte);
1588 1589
			if (new)
				mmu_pte_write_new_pte(vcpu, sp, spte, new);
1590
			mmu_pte_write_flush_tlb(vcpu, entry, *spte);
1591
			++spte;
1592 1593
		}
	}
1594
	kvm_mmu_audit(vcpu, "post pte write");
1595
	spin_unlock(&vcpu->kvm->mmu_lock);
1596 1597 1598 1599
	if (vcpu->arch.update_pte.page) {
		kvm_release_page_clean(vcpu->arch.update_pte.page);
		vcpu->arch.update_pte.page = NULL;
	}
1600 1601
}

1602 1603
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
1604 1605
	gpa_t gpa;
	int r;
1606

1607
	down_read(&vcpu->kvm->slots_lock);
1608
	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
1609
	up_read(&vcpu->kvm->slots_lock);
1610

1611
	spin_lock(&vcpu->kvm->mmu_lock);
1612
	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1613
	spin_unlock(&vcpu->kvm->mmu_lock);
1614
	return r;
1615 1616
}

1617
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
1618
{
1619
	while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
1620
		struct kvm_mmu_page *sp;
A
Avi Kivity 已提交
1621

1622
		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
1623 1624
				  struct kvm_mmu_page, link);
		kvm_mmu_zap_page(vcpu->kvm, sp);
A
Avi Kivity 已提交
1625
		++vcpu->kvm->stat.mmu_recycled;
A
Avi Kivity 已提交
1626 1627 1628
	}
}

1629 1630 1631 1632 1633
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
{
	int r;
	enum emulation_result er;

1634
	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
	if (r < 0)
		goto out;

	if (!r) {
		r = 1;
		goto out;
	}

1643 1644 1645 1646
	r = mmu_topup_memory_caches(vcpu);
	if (r)
		goto out;

	er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);

	switch (er) {
	case EMULATE_DONE:
		return 1;
	case EMULATE_DO_MMIO:
		++vcpu->stat.mmio_exits;
		return 0;
	case EMULATE_FAIL:
		kvm_report_emulation_failure(vcpu, "pagetable");
		return 1;
	default:
		BUG();
	}
out:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);

void kvm_enable_tdp(void)
{
	tdp_enabled = true;
}
EXPORT_SYMBOL_GPL(kvm_enable_tdp);

A
Avi Kivity 已提交
1672 1673
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
1674
	struct kvm_mmu_page *sp;
A
Avi Kivity 已提交
1675

1676 1677
	while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
		sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
1678 1679
				  struct kvm_mmu_page, link);
		kvm_mmu_zap_page(vcpu->kvm, sp);
1680
	}
1681
	free_page((unsigned long)vcpu->arch.mmu.pae_root);
A
Avi Kivity 已提交
1682 1683 1684 1685
}

static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
1686
	struct page *page;
A
Avi Kivity 已提交
1687 1688 1689 1690
	int i;

	ASSERT(vcpu);

1691 1692 1693
	if (vcpu->kvm->arch.n_requested_mmu_pages)
		vcpu->kvm->arch.n_free_mmu_pages =
					vcpu->kvm->arch.n_requested_mmu_pages;
1694
	else
1695 1696
		vcpu->kvm->arch.n_free_mmu_pages =
					vcpu->kvm->arch.n_alloc_mmu_pages;
	/*
	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
	 * Therefore we need to allocate shadow page tables in the first
	 * 4GB of memory, which happens to fit the DMA32 zone.
	 */
	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
	if (!page)
		goto error_1;
1705
	vcpu->arch.mmu.pae_root = page_address(page);
1706
	for (i = 0; i < 4; ++i)
1707
		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1708

	return 0;

error_1:
	free_mmu_pages(vcpu);
	return -ENOMEM;
}

1716
int kvm_mmu_create(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
1717 1718
{
	ASSERT(vcpu);
1719
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
A
Avi Kivity 已提交
1720

1721 1722
	return alloc_mmu_pages(vcpu);
}
A
Avi Kivity 已提交
1723

1724 1725 1726
int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
1727
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1728

1729
	return init_kvm_mmu(vcpu);
}

void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);

	destroy_kvm_mmu(vcpu);
	free_mmu_pages(vcpu);
1738
	mmu_free_memory_caches(vcpu);
A
Avi Kivity 已提交
1739 1740
}

1741
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
A
Avi Kivity 已提交
1742
{
1743
	struct kvm_mmu_page *sp;
A
Avi Kivity 已提交
1744

1745
	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
A
Avi Kivity 已提交
1746 1747 1748
		int i;
		u64 *pt;

1749
		if (!test_bit(slot, &sp->slot_bitmap))
A
Avi Kivity 已提交
1750 1751
			continue;

1752
		pt = sp->spt;
A
Avi Kivity 已提交
1753 1754
		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
			/* avoid RMW */
1755
			if (pt[i] & PT_WRITABLE_MASK)
A
Avi Kivity 已提交
1756 1757 1758
				pt[i] &= ~PT_WRITABLE_MASK;
	}
}
1759

1760
void kvm_mmu_zap_all(struct kvm *kvm)
D
Dor Laor 已提交
1761
{
1762
	struct kvm_mmu_page *sp, *node;
D
Dor Laor 已提交
1763

1764
	spin_lock(&kvm->mmu_lock);
1765
	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
1766
		kvm_mmu_zap_page(kvm, sp);
1767
	spin_unlock(&kvm->mmu_lock);
D
Dor Laor 已提交
1768

1769
	kvm_flush_remote_tlbs(kvm);
D
Dor Laor 已提交
1770 1771
}

1772 1773 1774 1775 1776 1777
void kvm_mmu_module_exit(void)
{
	if (pte_chain_cache)
		kmem_cache_destroy(pte_chain_cache);
	if (rmap_desc_cache)
		kmem_cache_destroy(rmap_desc_cache);
1778 1779
	if (mmu_page_header_cache)
		kmem_cache_destroy(mmu_page_header_cache);
1780 1781 1782 1783 1784 1785
}

int kvm_mmu_module_init(void)
{
	pte_chain_cache = kmem_cache_create("kvm_pte_chain",
					    sizeof(struct kvm_pte_chain),
1786
					    0, 0, NULL);
1787 1788 1789 1790
	if (!pte_chain_cache)
		goto nomem;
	rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
					    sizeof(struct kvm_rmap_desc),
1791
					    0, 0, NULL);
1792 1793 1794
	if (!rmap_desc_cache)
		goto nomem;

1795 1796
	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
						  sizeof(struct kvm_mmu_page),
1797
						  0, 0, NULL);
1798 1799 1800
	if (!mmu_page_header_cache)
		goto nomem;

1801 1802 1803 1804 1805 1806 1807
	return 0;

nomem:
	kvm_mmu_module_exit();
	return -ENOMEM;
}

/*
 * Calculate the number of mmu pages needed for the vm.
 */
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
{
	int i;
	unsigned int nr_mmu_pages;
	unsigned int  nr_pages = 0;

	for (i = 0; i < kvm->nmemslots; i++)
		nr_pages += kvm->memslots[i].npages;

	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
	nr_mmu_pages = max(nr_mmu_pages,
			(unsigned int) KVM_MIN_ALLOC_MMU_PAGES);

	return nr_mmu_pages;
}
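/*
 * Worked example (added for clarity, not in the original file): for a
 * guest with 1,048,576 pages (4GB of memory), and assuming
 * KVM_PERMILLE_MMU_PAGES is 20, the formula above yields
 * 1048576 * 20 / 1000, i.e. roughly 20971 shadow pages, clamped to at
 * least KVM_MIN_ALLOC_MMU_PAGES.
 */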

#ifdef AUDIT

static const char *audit_msg;

static gva_t canonicalize(gva_t gva)
{
#ifdef CONFIG_X86_64
	gva = (long long)(gva << 16) >> 16;
#endif
	return gva;
}

static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
				gva_t va, int level)
{
	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
	int i;
	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
		u64 ent = pt[i];

1849
		if (ent == shadow_trap_nonpresent_pte)
1850 1851 1852
			continue;

		va = canonicalize(va);
1853 1854 1855 1856 1857
		if (level > 1) {
			if (ent == shadow_notrap_nonpresent_pte)
				printk(KERN_ERR "audit: (%s) nontrapping pte"
				       " in nonleaf level: levels %d gva %lx"
				       " level %d pte %llx\n", audit_msg,
1858
				       vcpu->arch.mmu.root_level, va, level, ent);
1859

1860
			audit_mappings_page(vcpu, ent, va, level - 1);
1861
		} else {
1862
			gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
A
Avi Kivity 已提交
1863 1864
			struct page *page = gpa_to_page(vcpu, gpa);
			hpa_t hpa = page_to_phys(page);
1865

1866
			if (is_shadow_present_pte(ent)
1867
			    && (ent & PT64_BASE_ADDR_MASK) != hpa)
1868 1869
				printk(KERN_ERR "xx audit error: (%s) levels %d"
				       " gva %lx gpa %llx hpa %llx ent %llx %d\n",
1870
				       audit_msg, vcpu->arch.mmu.root_level,
M
Mike Day 已提交
1871 1872
				       va, gpa, hpa, ent,
				       is_shadow_present_pte(ent));
1873 1874 1875 1876
			else if (ent == shadow_notrap_nonpresent_pte
				 && !is_error_hpa(hpa))
				printk(KERN_ERR "audit: (%s) notrap shadow,"
				       " valid guest gva %lx\n", audit_msg, va);
1877
			kvm_release_page_clean(page);
1878

1879 1880 1881 1882 1883 1884
		}
	}
}

static void audit_mappings(struct kvm_vcpu *vcpu)
{
1885
	unsigned i;
1886

1887 1888
	if (vcpu->arch.mmu.root_level == 4)
		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
1889 1890
	else
		for (i = 0; i < 4; ++i)
1891
			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
1892
				audit_mappings_page(vcpu,
1893
						    vcpu->arch.mmu.pae_root[i],
						    i << 30,
						    2);
}

static int count_rmaps(struct kvm_vcpu *vcpu)
{
	int nmaps = 0;
	int i, j, k;

	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
		struct kvm_rmap_desc *d;

		for (j = 0; j < m->npages; ++j) {
1908
			unsigned long *rmapp = &m->rmap[j];
1909

1910
			if (!*rmapp)
1911
				continue;
1912
			if (!(*rmapp & 1)) {
1913 1914 1915
				++nmaps;
				continue;
			}
1916
			d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
			while (d) {
				for (k = 0; k < RMAP_EXT; ++k)
					if (d->shadow_ptes[k])
						++nmaps;
					else
						break;
				d = d->more;
			}
		}
	}
	return nmaps;
}

static int count_writable_mappings(struct kvm_vcpu *vcpu)
{
	int nmaps = 0;
1933
	struct kvm_mmu_page *sp;
1934 1935
	int i;

1936
	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
1937
		u64 *pt = sp->spt;
1938

1939
		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
			continue;

		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
			u64 ent = pt[i];

			if (!(ent & PT_PRESENT_MASK))
				continue;
			if (!(ent & PT_WRITABLE_MASK))
				continue;
			++nmaps;
		}
	}
	return nmaps;
}

static void audit_rmap(struct kvm_vcpu *vcpu)
{
	int n_rmap = count_rmaps(vcpu);
	int n_actual = count_writable_mappings(vcpu);

	if (n_rmap != n_actual)
		printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
		       __FUNCTION__, audit_msg, n_rmap, n_actual);
}

static void audit_write_protection(struct kvm_vcpu *vcpu)
{
1967
	struct kvm_mmu_page *sp;
1968 1969 1970
	struct kvm_memory_slot *slot;
	unsigned long *rmapp;
	gfn_t gfn;
1971

1972
	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
1973
		if (sp->role.metaphysical)
1974 1975
			continue;

1976 1977
		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
		gfn = unalias_gfn(vcpu->kvm, sp->gfn);
1978 1979
		rmapp = &slot->rmap[gfn - slot->base_gfn];
		if (*rmapp)
1980 1981
			printk(KERN_ERR "%s: (%s) shadow page has writable"
			       " mappings: gfn %lx role %x\n",
1982 1983
			       __FUNCTION__, audit_msg, sp->gfn,
			       sp->role.word);
	}
}

static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
{
	int olddbg = dbg;

	dbg = 0;
	audit_msg = msg;
	audit_rmap(vcpu);
	audit_write_protection(vcpu);
	audit_mappings(vcpu);
	dbg = olddbg;
}

#endif