/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "vmx.h"
#include "kvm.h"
#include "x86.h"
#include "mmu.h"

#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/swap.h>

#include <asm/page.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>

#undef MMU_DEBUG

#undef AUDIT

#ifdef AUDIT
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
#else
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
#endif

#ifdef MMU_DEBUG

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)

#else

#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)

#endif

#if defined(MMU_DEBUG) || defined(AUDIT)
static int dbg = 1;
#endif

#ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0)
#else
#define ASSERT(x)							\
	if (!(x)) {							\
		printk(KERN_WARNING "assertion failed %s:%d: %s\n",	\
		       __FILE__, __LINE__, #x);				\
	}
#endif

#define PT64_PT_BITS 9
#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
#define PT32_PT_BITS 10
#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)

#define PT_WRITABLE_SHIFT 1

#define PT_PRESENT_MASK (1ULL << 0)
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
#define PT_USER_MASK (1ULL << 2)
#define PT_PWT_MASK (1ULL << 3)
#define PT_PCD_MASK (1ULL << 4)
#define PT_ACCESSED_MASK (1ULL << 5)
#define PT_DIRTY_MASK (1ULL << 6)
#define PT_PAGE_SIZE_MASK (1ULL << 7)
#define PT_PAT_MASK (1ULL << 7)
#define PT_GLOBAL_MASK (1ULL << 8)
#define PT64_NX_SHIFT 63
#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)

#define PT_PAT_SHIFT 7
#define PT_DIR_PAT_SHIFT 12
#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)

#define PT32_DIR_PSE36_SIZE 4
#define PT32_DIR_PSE36_SHIFT 13
#define PT32_DIR_PSE36_MASK \
	(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)


#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)

#define VALID_PAGE(x) ((x) != INVALID_PAGE)

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)

#define PT64_LEVEL_MASK(level) \
		(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)

#define PT32_LEVEL_MASK(level) \
		(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#define PT64_DIR_BASE_ADDR_MASK \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))

#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
			| PT64_NX_MASK)

#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
#define PFERR_USER_MASK (1U << 2)
#define PFERR_FETCH_MASK (1U << 4)

#define PT64_ROOT_LEVEL 4
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3

#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1

#define RMAP_EXT 4

#define ACC_EXEC_MASK    1
#define ACC_WRITE_MASK   PT_WRITABLE_MASK
#define ACC_USER_MASK    PT_USER_MASK
#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

struct kvm_rmap_desc {
	u64 *shadow_ptes[RMAP_EXT];
	struct kvm_rmap_desc *more;
};

static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache;

static u64 __read_mostly shadow_trap_nonpresent_pte;
static u64 __read_mostly shadow_notrap_nonpresent_pte;

void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
{
	shadow_trap_nonpresent_pte = trap_pte;
	shadow_notrap_nonpresent_pte = notrap_pte;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);

static int is_write_protection(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.cr0 & X86_CR0_WP;
}

static int is_cpuid_PSE36(void)
{
	return 1;
}

static int is_nx(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.shadow_efer & EFER_NX;
}

static int is_present_pte(unsigned long pte)
{
	return pte & PT_PRESENT_MASK;
}

static int is_shadow_present_pte(u64 pte)
{
	pte &= ~PT_SHADOW_IO_MARK;
	return pte != shadow_trap_nonpresent_pte
		&& pte != shadow_notrap_nonpresent_pte;
}

static int is_writeble_pte(unsigned long pte)
{
	return pte & PT_WRITABLE_MASK;
}

static int is_dirty_pte(unsigned long pte)
{
	return pte & PT_DIRTY_MASK;
}

static int is_io_pte(unsigned long pte)
{
	return pte & PT_SHADOW_IO_MARK;
}

static int is_rmap_pte(u64 pte)
{
	return pte != shadow_trap_nonpresent_pte
		&& pte != shadow_notrap_nonpresent_pte;
}

static gfn_t pse36_gfn_delta(u32 gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}

static void set_shadow_pte(u64 *sptep, u64 spte)
{
#ifdef CONFIG_X86_64
	set_64bit((unsigned long *)sptep, spte);
#else
	set_64bit((unsigned long long *)sptep, spte);
#endif
}

static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				  struct kmem_cache *base_cache, int min)
{
	void *obj;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
		if (!obj)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = obj;
	}
	return 0;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
		kfree(mc->objects[--mc->nobjs]);
}

static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
				       int min)
{
	struct page *page;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		page = alloc_page(GFP_KERNEL);
		if (!page)
			return -ENOMEM;
		set_page_private(page, 0);
		cache->objects[cache->nobjs++] = page_address(page);
	}
	return 0;
}

static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
		free_page((unsigned long)mc->objects[--mc->nobjs]);
}

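/*
 * Pre-fill the per-vcpu object caches (pte chains, rmap descriptors,
 * shadow page headers and whole pages) so the fault handling paths can
 * take objects via mmu_memory_cache_alloc() without allocating memory.
 */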
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
{
	int r;

	kvm_mmu_free_some_pages(vcpu);
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
				   pte_chain_cache, 4);
	if (r)
		goto out;
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
				   rmap_desc_cache, 1);
	if (r)
		goto out;
	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
	if (r)
		goto out;
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
				   mmu_page_header_cache, 4);
out:
	return r;
}

static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}

static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
				    size_t size)
{
	void *p;

	BUG_ON(!mc->nobjs);
	p = mc->objects[--mc->nobjs];
	memset(p, 0, size);
	return p;
}

static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
{
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
				      sizeof(struct kvm_pte_chain));
}

static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
{
	kfree(pc);
}

static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
{
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
				      sizeof(struct kvm_rmap_desc));
}

static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
{
	kfree(rd);
}

/*
 * Take gfn and return the reverse mapping to it.
 * Note: gfn must be unaliased before this function gets called
 */

static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;

	slot = gfn_to_memslot(kvm, gfn);
	return &slot->rmap[gfn - slot->base_gfn];
}

/*
 * Reverse mapping data structures:
 *
 * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
 * that points to page_address(page).
 *
 * If rmapp bit zero is one, (rmapp & ~1) points to a struct kvm_rmap_desc
 * containing more mappings.
 */
static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	struct kvm_rmap_desc *desc;
	unsigned long *rmapp;
	int i;

	if (!is_rmap_pte(*spte))
		return;
	gfn = unalias_gfn(vcpu->kvm, gfn);
	sp = page_header(__pa(spte));
	sp->gfns[spte - sp->spt] = gfn;
	rmapp = gfn_to_rmap(vcpu->kvm, gfn);
	if (!*rmapp) {
		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
		*rmapp = (unsigned long)spte;
	} else if (!(*rmapp & 1)) {
		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
		desc = mmu_alloc_rmap_desc(vcpu);
		desc->shadow_ptes[0] = (u64 *)*rmapp;
		desc->shadow_ptes[1] = spte;
		*rmapp = (unsigned long)desc | 1;
	} else {
		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
		while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
			desc = desc->more;
		if (desc->shadow_ptes[RMAP_EXT-1]) {
			desc->more = mmu_alloc_rmap_desc(vcpu);
			desc = desc->more;
		}
		for (i = 0; desc->shadow_ptes[i]; ++i)
			;
		desc->shadow_ptes[i] = spte;
	}
}

static void rmap_desc_remove_entry(unsigned long *rmapp,
				   struct kvm_rmap_desc *desc,
				   int i,
				   struct kvm_rmap_desc *prev_desc)
{
	int j;

	for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
		;
	desc->shadow_ptes[i] = desc->shadow_ptes[j];
	desc->shadow_ptes[j] = NULL;
	if (j != 0)
		return;
	if (!prev_desc && !desc->more)
		*rmapp = (unsigned long)desc->shadow_ptes[0];
	else
		if (prev_desc)
			prev_desc->more = desc->more;
		else
			*rmapp = (unsigned long)desc->more | 1;
	mmu_free_rmap_desc(desc);
}

static void rmap_remove(struct kvm *kvm, u64 *spte)
{
	struct kvm_rmap_desc *desc;
	struct kvm_rmap_desc *prev_desc;
	struct kvm_mmu_page *sp;
	struct page *page;
	unsigned long *rmapp;
	int i;

	if (!is_rmap_pte(*spte))
		return;
	sp = page_header(__pa(spte));
	page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
	mark_page_accessed(page);
	if (is_writeble_pte(*spte))
		kvm_release_page_dirty(page);
	else
		kvm_release_page_clean(page);
	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
	if (!*rmapp) {
		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
		BUG();
	} else if (!(*rmapp & 1)) {
		rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
		if ((u64 *)*rmapp != spte) {
			printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
			       spte, *spte);
			BUG();
		}
		*rmapp = 0;
	} else {
		rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
		prev_desc = NULL;
		while (desc) {
			for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
				if (desc->shadow_ptes[i] == spte) {
					rmap_desc_remove_entry(rmapp,
							       desc, i,
							       prev_desc);
					return;
				}
			prev_desc = desc;
			desc = desc->more;
		}
		BUG();
	}
}

static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
{
	struct kvm_rmap_desc *desc;
	struct kvm_rmap_desc *prev_desc;
	u64 *prev_spte;
	int i;

	if (!*rmapp)
		return NULL;
	else if (!(*rmapp & 1)) {
		if (!spte)
			return (u64 *)*rmapp;
		return NULL;
	}
	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
	prev_desc = NULL;
	prev_spte = NULL;
	while (desc) {
		for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
			if (prev_spte == spte)
				return desc->shadow_ptes[i];
			prev_spte = desc->shadow_ptes[i];
		}
		desc = desc->more;
	}
	return NULL;
}

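/*
 * Walk the reverse mapping of @gfn and clear the writable bit in every
 * shadow pte that maps it, flushing remote TLBs as entries change.
 */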
static void rmap_write_protect(struct kvm *kvm, u64 gfn)
{
	unsigned long *rmapp;
	u64 *spte;

	gfn = unalias_gfn(kvm, gfn);
	rmapp = gfn_to_rmap(kvm, gfn);

	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
		BUG_ON(!spte);
		BUG_ON(!(*spte & PT_PRESENT_MASK));
		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
		if (is_writeble_pte(*spte))
			set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
		kvm_flush_remote_tlbs(kvm);
		spte = rmap_next(kvm, rmapp, spte);
	}
}

#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
	u64 *pos;
	u64 *end;

	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
		if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
			printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
			       pos, *pos);
			return 0;
		}
	return 1;
}
#endif

static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	ASSERT(is_empty_shadow_page(sp->spt));
	list_del(&sp->link);
	__free_page(virt_to_page(sp->spt));
	__free_page(virt_to_page(sp->gfns));
	kfree(sp);
	++kvm->n_free_mmu_pages;
}

static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
	return gfn;
}

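/*
 * Allocate a shadow page, its pte page and gfn array from the per-vcpu
 * caches and put it on the active_mmu_pages list.  Returns NULL when the
 * free shadow page budget is exhausted.
 */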
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
					       u64 *parent_pte)
{
	struct kvm_mmu_page *sp;

	if (!vcpu->kvm->n_free_mmu_pages)
		return NULL;

	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
	sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
	list_add(&sp->link, &vcpu->kvm->active_mmu_pages);
	ASSERT(is_empty_shadow_page(sp->spt));
	sp->slot_bitmap = 0;
	sp->multimapped = 0;
	sp->parent_pte = parent_pte;
	--vcpu->kvm->n_free_mmu_pages;
	return sp;
}

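/*
 * Record @parent_pte as a parent of @sp.  A single parent is kept inline
 * in sp->parent_pte; once a page is multimapped, parents live in a list
 * of kvm_pte_chain structures hanging off sp->parent_ptes.
 */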
static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp, u64 *parent_pte)
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	int i;

	if (!parent_pte)
		return;
	if (!sp->multimapped) {
		u64 *old = sp->parent_pte;

		if (!old) {
			sp->parent_pte = parent_pte;
			return;
		}
		sp->multimapped = 1;
		pte_chain = mmu_alloc_pte_chain(vcpu);
		INIT_HLIST_HEAD(&sp->parent_ptes);
		hlist_add_head(&pte_chain->link, &sp->parent_ptes);
		pte_chain->parent_ptes[0] = old;
	}
	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
		if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
			continue;
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
			if (!pte_chain->parent_ptes[i]) {
				pte_chain->parent_ptes[i] = parent_pte;
				return;
			}
	}
	pte_chain = mmu_alloc_pte_chain(vcpu);
	BUG_ON(!pte_chain);
	hlist_add_head(&pte_chain->link, &sp->parent_ptes);
	pte_chain->parent_ptes[0] = parent_pte;
}

static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
				       u64 *parent_pte)
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	int i;

	if (!sp->multimapped) {
		BUG_ON(sp->parent_pte != parent_pte);
		sp->parent_pte = NULL;
		return;
	}
	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
			if (!pte_chain->parent_ptes[i])
				break;
			if (pte_chain->parent_ptes[i] != parent_pte)
				continue;
			while (i + 1 < NR_PTE_CHAIN_ENTRIES
				&& pte_chain->parent_ptes[i + 1]) {
				pte_chain->parent_ptes[i]
					= pte_chain->parent_ptes[i + 1];
				++i;
			}
			pte_chain->parent_ptes[i] = NULL;
			if (i == 0) {
				hlist_del(&pte_chain->link);
				mmu_free_pte_chain(pte_chain);
				if (hlist_empty(&sp->parent_ptes)) {
					sp->multimapped = 0;
					sp->parent_pte = NULL;
				}
			}
			return;
		}
	BUG();
}

static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
{
	unsigned index;
	struct hlist_head *bucket;
	struct kvm_mmu_page *sp;
	struct hlist_node *node;

	pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
	bucket = &kvm->mmu_page_hash[index];
	hlist_for_each_entry(sp, node, bucket, hash_link)
		if (sp->gfn == gfn && !sp->role.metaphysical) {
			pgprintk("%s: found role %x\n",
				 __FUNCTION__, sp->role.word);
			return sp;
		}
	return NULL;
}

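/*
 * Look up the shadow page for (gfn, role) in the hash table, attaching
 * @parent_pte to it, or allocate and initialize a new one.  Pages that
 * shadow real guest page tables (!metaphysical) are write-protected here.
 */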
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
					     gfn_t gfn,
					     gva_t gaddr,
					     unsigned level,
					     int metaphysical,
					     unsigned access,
					     u64 *parent_pte,
					     bool *new_page)
{
	union kvm_mmu_page_role role;
	unsigned index;
	unsigned quadrant;
	struct hlist_head *bucket;
	struct kvm_mmu_page *sp;
	struct hlist_node *node;

	role.word = 0;
	role.glevels = vcpu->arch.mmu.root_level;
	role.level = level;
	role.metaphysical = metaphysical;
	role.access = access;
	if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
		role.quadrant = quadrant;
	}
	pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
		 gfn, role.word);
	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
	bucket = &vcpu->kvm->mmu_page_hash[index];
	hlist_for_each_entry(sp, node, bucket, hash_link)
		if (sp->gfn == gfn && sp->role.word == role.word) {
			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
			pgprintk("%s: found\n", __FUNCTION__);
			return sp;
		}
	sp = kvm_mmu_alloc_page(vcpu, parent_pte);
	if (!sp)
		return sp;
	pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
	sp->gfn = gfn;
	sp->role = role;
	hlist_add_head(&sp->hash_link, bucket);
	vcpu->arch.mmu.prefetch_page(vcpu, sp);
	if (!metaphysical)
		rmap_write_protect(vcpu->kvm, gfn);
	if (new_page)
		*new_page = 1;
	return sp;
}

static void kvm_mmu_page_unlink_children(struct kvm *kvm,
					 struct kvm_mmu_page *sp)
{
	unsigned i;
	u64 *pt;
	u64 ent;

	pt = sp->spt;

	if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
			if (is_shadow_present_pte(pt[i]))
				rmap_remove(kvm, &pt[i]);
			pt[i] = shadow_trap_nonpresent_pte;
		}
		kvm_flush_remote_tlbs(kvm);
		return;
	}

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
		ent = pt[i];

		pt[i] = shadow_trap_nonpresent_pte;
		if (!is_shadow_present_pte(ent))
			continue;
		ent &= PT64_BASE_ADDR_MASK;
		mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
	}
	kvm_flush_remote_tlbs(kvm);
}

static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
{
	mmu_page_remove_parent_pte(sp, parent_pte);
}

static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
{
	int i;

	for (i = 0; i < KVM_MAX_VCPUS; ++i)
		if (kvm->vcpus[i])
			kvm->vcpus[i]->arch.last_pte_updated = NULL;
}

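/*
 * Tear down a shadow page: detach every parent pte pointing at it, unlink
 * its children (dropping their rmaps), and free it unless it is still in
 * use as a root, in which case it stays on the active list.
 */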
static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	u64 *parent_pte;

	++kvm->stat.mmu_shadow_zapped;
	while (sp->multimapped || sp->parent_pte) {
		if (!sp->multimapped)
			parent_pte = sp->parent_pte;
		else {
			struct kvm_pte_chain *chain;

			chain = container_of(sp->parent_ptes.first,
					     struct kvm_pte_chain, link);
			parent_pte = chain->parent_ptes[0];
		}
		BUG_ON(!parent_pte);
		kvm_mmu_put_page(sp, parent_pte);
		set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
	}
	kvm_mmu_page_unlink_children(kvm, sp);
	if (!sp->root_count) {
		hlist_del(&sp->hash_link);
		kvm_mmu_free_page(kvm, sp);
	} else
		list_move(&sp->link, &kvm->active_mmu_pages);
	kvm_mmu_reset_last_pte_updated(kvm);
}

/*
 * Changing the number of mmu pages allocated to the vm
 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock
 */
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
{
	/*
	 * If we set the number of mmu pages to be smaller than the
	 * number of active pages, we must free some mmu pages before we
	 * change the value
	 */

	if ((kvm->n_alloc_mmu_pages - kvm->n_free_mmu_pages) >
	    kvm_nr_mmu_pages) {
		int n_used_mmu_pages = kvm->n_alloc_mmu_pages
				       - kvm->n_free_mmu_pages;

		while (n_used_mmu_pages > kvm_nr_mmu_pages) {
			struct kvm_mmu_page *page;

			page = container_of(kvm->active_mmu_pages.prev,
					    struct kvm_mmu_page, link);
			kvm_mmu_zap_page(kvm, page);
			n_used_mmu_pages--;
		}
		kvm->n_free_mmu_pages = 0;
	} else
		kvm->n_free_mmu_pages += kvm_nr_mmu_pages
					 - kvm->n_alloc_mmu_pages;

	kvm->n_alloc_mmu_pages = kvm_nr_mmu_pages;
}

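/*
 * Zap every shadow page that shadows guest page tables at @gfn so the
 * frame is no longer write-protected.  Returns 1 if anything was zapped.
 */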
static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
{
	unsigned index;
	struct hlist_head *bucket;
	struct kvm_mmu_page *sp;
	struct hlist_node *node, *n;
	int r;

	pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
	r = 0;
	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
	bucket = &kvm->mmu_page_hash[index];
	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
		if (sp->gfn == gfn && !sp->role.metaphysical) {
			pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
				 sp->role.word);
			kvm_mmu_zap_page(kvm, sp);
			r = 1;
		}
	return r;
}

static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_mmu_page *sp;

	while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
		pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
		kvm_mmu_zap_page(kvm, sp);
	}
}

static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
{
	int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
	struct kvm_mmu_page *sp = page_header(__pa(pte));

	__set_bit(slot, &sp->slot_bitmap);
}

struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
{
	gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);

	if (gpa == UNMAPPED_GVA)
		return NULL;
	return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
}

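/*
 * Build and install a shadow pte: apply the guest access bits, map the
 * gfn to a host page, demote the mapping to read-only when the gfn is
 * itself shadowed as a page table, and keep the rmap up to date.
 */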
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
			 unsigned pt_access, unsigned pte_access,
			 int user_fault, int write_fault, int dirty,
			 int *ptwrite, gfn_t gfn)
{
	u64 spte;
	int was_rmapped = is_rmap_pte(*shadow_pte);
	struct page *page;

	pgprintk("%s: spte %llx access %x write_fault %d"
		 " user_fault %d gfn %lx\n",
		 __FUNCTION__, *shadow_pte, pt_access,
		 write_fault, user_fault, gfn);

	/*
	 * We don't set the accessed bit, since we sometimes want to see
	 * whether the guest actually used the pte (in order to detect
	 * demand paging).
	 */
	spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
	if (!dirty)
		pte_access &= ~ACC_WRITE_MASK;
	if (!(pte_access & ACC_EXEC_MASK))
		spte |= PT64_NX_MASK;

	page = gfn_to_page(vcpu->kvm, gfn);

	spte |= PT_PRESENT_MASK;
	if (pte_access & ACC_USER_MASK)
		spte |= PT_USER_MASK;

	if (is_error_page(page)) {
		set_shadow_pte(shadow_pte,
			       shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
		kvm_release_page_clean(page);
		return;
	}

	spte |= page_to_phys(page);

	if ((pte_access & ACC_WRITE_MASK)
	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
		struct kvm_mmu_page *shadow;

		spte |= PT_WRITABLE_MASK;
		if (user_fault) {
			mmu_unshadow(vcpu->kvm, gfn);
			goto unshadowed;
		}

		shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
		if (shadow) {
			pgprintk("%s: found shadow page for %lx, marking ro\n",
				 __FUNCTION__, gfn);
			pte_access &= ~ACC_WRITE_MASK;
			if (is_writeble_pte(spte)) {
				spte &= ~PT_WRITABLE_MASK;
				kvm_x86_ops->tlb_flush(vcpu);
			}
			if (write_fault)
				*ptwrite = 1;
		}
	}

unshadowed:

	if (pte_access & ACC_WRITE_MASK)
		mark_page_dirty(vcpu->kvm, gfn);

	pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
	set_shadow_pte(shadow_pte, spte);
	page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
	if (!was_rmapped) {
		rmap_add(vcpu, shadow_pte, gfn);
		if (!is_rmap_pte(*shadow_pte))
			kvm_release_page_clean(page);
	} else
		kvm_release_page_clean(page);
	if (!ptwrite || !*ptwrite)
		vcpu->arch.last_pte_updated = shadow_pte;
}

static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}

static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
	int level = PT32E_ROOT_LEVEL;
	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
	int pt_write = 0;

	for (; ; level--) {
		u32 index = PT64_INDEX(v, level);
		u64 *table;

		ASSERT(VALID_PAGE(table_addr));
		table = __va(table_addr);

		if (level == 1) {
			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
				     0, write, 1, &pt_write, gfn);
			return pt_write || is_io_pte(table[index]);
		}

		if (table[index] == shadow_trap_nonpresent_pte) {
			struct kvm_mmu_page *new_table;
			gfn_t pseudo_gfn;

			pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
				>> PAGE_SHIFT;
			new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
						     v, level - 1,
						     1, ACC_ALL, &table[index],
						     NULL);
			if (!new_table) {
				pgprintk("nonpaging_map: ENOMEM\n");
				return -ENOMEM;
			}

			table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
				| PT_WRITABLE_MASK | PT_USER_MASK;
		}
		table_addr = table[index] & PT64_BASE_ADDR_MASK;
	}
}

static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp)
{
	int i;

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
		sp->spt[i] = shadow_trap_nonpresent_pte;
}

static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
	int i;
	struct kvm_mmu_page *sp;

	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
		return;
#ifdef CONFIG_X86_64
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;

		sp = page_header(root);
		--sp->root_count;
		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
		return;
	}
#endif
	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->arch.mmu.pae_root[i];

		if (root) {
			root &= PT64_BASE_ADDR_MASK;
			sp = page_header(root);
			--sp->root_count;
		}
		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
	}
	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
}

static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
	int i;
	gfn_t root_gfn;
	struct kvm_mmu_page *sp;

	root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;

#ifdef CONFIG_X86_64
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;

		ASSERT(!VALID_PAGE(root));
		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
				      PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
		root = __pa(sp->spt);
		++sp->root_count;
		vcpu->arch.mmu.root_hpa = root;
		return;
	}
#endif
	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->arch.mmu.pae_root[i];

		ASSERT(!VALID_PAGE(root));
		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
			if (!is_present_pte(vcpu->arch.pdptrs[i])) {
				vcpu->arch.mmu.pae_root[i] = 0;
				continue;
			}
			root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
		} else if (vcpu->arch.mmu.root_level == 0)
			root_gfn = 0;
		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
				      PT32_ROOT_LEVEL, !is_paging(vcpu),
				      ACC_ALL, NULL, NULL);
		root = __pa(sp->spt);
		++sp->root_count;
		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
	}
	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
}

static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
	return vaddr;
}

static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
				u32 error_code)
{
	gfn_t gfn;
	int r;

	pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	ASSERT(vcpu);
	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));

	gfn = gva >> PAGE_SHIFT;

	return nonpaging_map(vcpu, gva & PAGE_MASK,
			     error_code & PFERR_WRITE_MASK, gfn);
}

static void nonpaging_free(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}

static int nonpaging_init_context(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->arch.mmu;

	context->new_cr3 = nonpaging_new_cr3;
	context->page_fault = nonpaging_page_fault;
	context->gva_to_gpa = nonpaging_gva_to_gpa;
	context->free = nonpaging_free;
	context->prefetch_page = nonpaging_prefetch_page;
	context->root_level = 0;
	context->shadow_root_level = PT32E_ROOT_LEVEL;
	context->root_hpa = INVALID_PAGE;
	return 0;
}

void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
	++vcpu->stat.tlb_flush;
	kvm_x86_ops->tlb_flush(vcpu);
}

static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
	pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
	mmu_free_roots(vcpu);
}

static void inject_page_fault(struct kvm_vcpu *vcpu,
			      u64 addr,
			      u32 err_code)
{
	kvm_inject_page_fault(vcpu, addr, err_code);
}

static void paging_free(struct kvm_vcpu *vcpu)
{
	nonpaging_free(vcpu);
}

#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"
#undef PTTYPE

static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
{
	struct kvm_mmu *context = &vcpu->arch.mmu;

	ASSERT(is_pae(vcpu));
	context->new_cr3 = paging_new_cr3;
	context->page_fault = paging64_page_fault;
	context->gva_to_gpa = paging64_gva_to_gpa;
	context->prefetch_page = paging64_prefetch_page;
	context->free = paging_free;
	context->root_level = level;
	context->shadow_root_level = level;
	context->root_hpa = INVALID_PAGE;
	return 0;
}

static int paging64_init_context(struct kvm_vcpu *vcpu)
{
	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
}

static int paging32_init_context(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->arch.mmu;

	context->new_cr3 = paging_new_cr3;
	context->page_fault = paging32_page_fault;
	context->gva_to_gpa = paging32_gva_to_gpa;
	context->free = paging_free;
	context->prefetch_page = paging32_prefetch_page;
	context->root_level = PT32_ROOT_LEVEL;
	context->shadow_root_level = PT32E_ROOT_LEVEL;
	context->root_hpa = INVALID_PAGE;
	return 0;
}

static int paging32E_init_context(struct kvm_vcpu *vcpu)
{
	return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
}

static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));

	if (!is_paging(vcpu))
		return nonpaging_init_context(vcpu);
	else if (is_long_mode(vcpu))
		return paging64_init_context(vcpu);
	else if (is_pae(vcpu))
		return paging32E_init_context(vcpu);
	else
		return paging32_init_context(vcpu);
}

static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
	if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
		vcpu->arch.mmu.free(vcpu);
		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
	}
}

int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
	destroy_kvm_mmu(vcpu);
	return init_kvm_mmu(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);

int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
	int r;

	mutex_lock(&vcpu->kvm->lock);
	r = mmu_topup_memory_caches(vcpu);
	if (r)
		goto out;
	mmu_alloc_roots(vcpu);
	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
	kvm_mmu_flush_tlb(vcpu);
out:
	mutex_unlock(&vcpu->kvm->lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_load);

void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}

static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *sp,
				  u64 *spte)
{
	u64 pte;
	struct kvm_mmu_page *child;

	pte = *spte;
	if (is_shadow_present_pte(pte)) {
		if (sp->role.level == PT_PAGE_TABLE_LEVEL)
			rmap_remove(vcpu->kvm, spte);
		else {
			child = page_header(pte & PT64_BASE_ADDR_MASK);
			mmu_page_remove_parent_pte(child, spte);
		}
	}
	set_shadow_pte(spte, shadow_trap_nonpresent_pte);
}

static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *sp,
				  u64 *spte,
				  const void *new, int bytes,
				  int offset_in_pte)
{
	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
		++vcpu->kvm->stat.mmu_pde_zapped;
		return;
	}

	++vcpu->kvm->stat.mmu_pte_updated;
	if (sp->role.glevels == PT32_ROOT_LEVEL)
		paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
	else
		paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
}

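/*
 * After a guest pte write, decide whether the old shadow pte may still be
 * cached in other vcpus' TLBs; if so a remote flush is needed instead of
 * a purely local one.
 */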
static bool need_remote_flush(u64 old, u64 new)
{
	if (!is_shadow_present_pte(old))
		return false;
	if (!is_shadow_present_pte(new))
		return true;
	if ((old ^ new) & PT64_BASE_ADDR_MASK)
		return true;
	old ^= PT64_NX_MASK;
	new ^= PT64_NX_MASK;
	return (old & ~new & PT64_PERM_MASK) != 0;
}

static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
{
	if (need_remote_flush(old, new))
		kvm_flush_remote_tlbs(vcpu->kvm);
	else
		kvm_mmu_flush_tlb(vcpu);
}

static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
{
	u64 *spte = vcpu->arch.last_pte_updated;

	return !!(spte && (*spte & PT_ACCESSED_MASK));
}

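/*
 * Called for emulated writes to write-protected guest page tables.
 * Updates the affected shadow ptes in place, and zaps shadow pages that
 * see misaligned or flooded writes, since those are usually no longer
 * page tables.
 */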
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
		       const u8 *new, int bytes)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct kvm_mmu_page *sp;
	struct hlist_node *node, *n;
	struct hlist_head *bucket;
	unsigned index;
	u64 entry;
	u64 *spte;
	unsigned offset = offset_in_page(gpa);
	unsigned pte_size;
	unsigned page_offset;
	unsigned misaligned;
	unsigned quadrant;
	int level;
	int flooded = 0;
	int npte;

	pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
	++vcpu->kvm->stat.mmu_pte_write;
	kvm_mmu_audit(vcpu, "pre pte write");
	if (gfn == vcpu->arch.last_pt_write_gfn
	    && !last_updated_pte_accessed(vcpu)) {
		++vcpu->arch.last_pt_write_count;
		if (vcpu->arch.last_pt_write_count >= 3)
			flooded = 1;
	} else {
		vcpu->arch.last_pt_write_gfn = gfn;
		vcpu->arch.last_pt_write_count = 1;
		vcpu->arch.last_pte_updated = NULL;
	}
	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
	bucket = &vcpu->kvm->mmu_page_hash[index];
	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
		if (sp->gfn != gfn || sp->role.metaphysical)
			continue;
		pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
		misaligned |= bytes < 4;
		if (misaligned || flooded) {
			/*
			 * Misaligned accesses are too much trouble to fix
			 * up; also, they usually indicate a page is not used
			 * as a page table.
			 *
			 * If we're seeing too many writes to a page,
			 * it may no longer be a page table, or we may be
			 * forking, in which case it is better to unmap the
			 * page.
			 */
			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
				 gpa, bytes, sp->role.word);
			kvm_mmu_zap_page(vcpu->kvm, sp);
			++vcpu->kvm->stat.mmu_flooded;
			continue;
		}
		page_offset = offset;
		level = sp->role.level;
		npte = 1;
		if (sp->role.glevels == PT32_ROOT_LEVEL) {
			page_offset <<= 1;	/* 32->64 */
			/*
			 * A 32-bit pde maps 4MB while the shadow pdes map
			 * only 2MB.  So we need to double the offset again
			 * and zap two pdes instead of one.
			 */
			if (level == PT32_ROOT_LEVEL) {
				page_offset &= ~7; /* kill rounding error */
				page_offset <<= 1;
				npte = 2;
			}
			quadrant = page_offset >> PAGE_SHIFT;
			page_offset &= ~PAGE_MASK;
			if (quadrant != sp->role.quadrant)
				continue;
		}
		spte = &sp->spt[page_offset / sizeof(*spte)];
		while (npte--) {
			entry = *spte;
			mmu_pte_write_zap_pte(vcpu, sp, spte);
			mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
					      page_offset & (pte_size - 1));
			mmu_pte_write_flush_tlb(vcpu, entry, *spte);
			++spte;
		}
	}
	kvm_mmu_audit(vcpu, "post pte write");
}

int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
	gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);

	return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
}

void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
	while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
		struct kvm_mmu_page *sp;

		sp = container_of(vcpu->kvm->active_mmu_pages.prev,
				  struct kvm_mmu_page, link);
		kvm_mmu_zap_page(vcpu->kvm, sp);
		++vcpu->kvm->stat.mmu_recycled;
	}
}

int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
{
	int r;
	enum emulation_result er;

	mutex_lock(&vcpu->kvm->lock);
	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
	if (r < 0)
		goto out;

	if (!r) {
		r = 1;
		goto out;
	}

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		goto out;

	er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
	mutex_unlock(&vcpu->kvm->lock);

	switch (er) {
	case EMULATE_DONE:
		return 1;
	case EMULATE_DO_MMIO:
		++vcpu->stat.mmio_exits;
		return 0;
	case EMULATE_FAIL:
		kvm_report_emulation_failure(vcpu, "pagetable");
		return 1;
	default:
		BUG();
	}
out:
	mutex_unlock(&vcpu->kvm->lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);

static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;

	while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
		sp = container_of(vcpu->kvm->active_mmu_pages.next,
				  struct kvm_mmu_page, link);
		kvm_mmu_zap_page(vcpu->kvm, sp);
	}
	free_page((unsigned long)vcpu->arch.mmu.pae_root);
}

static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
	struct page *page;
	int i;

	ASSERT(vcpu);

	if (vcpu->kvm->n_requested_mmu_pages)
		vcpu->kvm->n_free_mmu_pages = vcpu->kvm->n_requested_mmu_pages;
	else
		vcpu->kvm->n_free_mmu_pages = vcpu->kvm->n_alloc_mmu_pages;
	/*
	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
	 * Therefore we need to allocate shadow page tables in the first
	 * 4GB of memory, which happens to fit the DMA32 zone.
	 */
	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
	if (!page)
		goto error_1;
	vcpu->arch.mmu.pae_root = page_address(page);
	for (i = 0; i < 4; ++i)
		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;

	return 0;

error_1:
	free_mmu_pages(vcpu);
	return -ENOMEM;
}

int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));

	return alloc_mmu_pages(vcpu);
}

int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));

	return init_kvm_mmu(vcpu);
}

void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);

	destroy_kvm_mmu(vcpu);
	free_mmu_pages(vcpu);
	mmu_free_memory_caches(vcpu);
}

void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
	struct kvm_mmu_page *sp;

	list_for_each_entry(sp, &kvm->active_mmu_pages, link) {
		int i;
		u64 *pt;

		if (!test_bit(slot, &sp->slot_bitmap))
			continue;

		pt = sp->spt;
		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
			/* avoid RMW */
			if (pt[i] & PT_WRITABLE_MASK)
				pt[i] &= ~PT_WRITABLE_MASK;
	}
}

void kvm_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *sp, *node;

	list_for_each_entry_safe(sp, node, &kvm->active_mmu_pages, link)
		kvm_mmu_zap_page(kvm, sp);

	kvm_flush_remote_tlbs(kvm);
}

void kvm_mmu_module_exit(void)
{
	if (pte_chain_cache)
		kmem_cache_destroy(pte_chain_cache);
	if (rmap_desc_cache)
		kmem_cache_destroy(rmap_desc_cache);
	if (mmu_page_header_cache)
		kmem_cache_destroy(mmu_page_header_cache);
}

int kvm_mmu_module_init(void)
{
	pte_chain_cache = kmem_cache_create("kvm_pte_chain",
					    sizeof(struct kvm_pte_chain),
					    0, 0, NULL);
	if (!pte_chain_cache)
		goto nomem;
	rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
					    sizeof(struct kvm_rmap_desc),
					    0, 0, NULL);
	if (!rmap_desc_cache)
		goto nomem;

	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
						  sizeof(struct kvm_mmu_page),
						  0, 0, NULL);
	if (!mmu_page_header_cache)
		goto nomem;

	return 0;

nomem:
	kvm_mmu_module_exit();
	return -ENOMEM;
}

/*
 * Calculate mmu pages needed for kvm.
 */
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
{
	int i;
	unsigned int nr_mmu_pages;
	unsigned int  nr_pages = 0;

	for (i = 0; i < kvm->nmemslots; i++)
		nr_pages += kvm->memslots[i].npages;

	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
	nr_mmu_pages = max(nr_mmu_pages,
			(unsigned int) KVM_MIN_ALLOC_MMU_PAGES);

	return nr_mmu_pages;
}

#ifdef AUDIT

static const char *audit_msg;

static gva_t canonicalize(gva_t gva)
{
#ifdef CONFIG_X86_64
	gva = (long long)(gva << 16) >> 16;
#endif
	return gva;
}

static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
				gva_t va, int level)
{
	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
	int i;
	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
		u64 ent = pt[i];

		if (ent == shadow_trap_nonpresent_pte)
			continue;

		va = canonicalize(va);
		if (level > 1) {
			if (ent == shadow_notrap_nonpresent_pte)
				printk(KERN_ERR "audit: (%s) nontrapping pte"
				       " in nonleaf level: levels %d gva %lx"
				       " level %d pte %llx\n", audit_msg,
				       vcpu->arch.mmu.root_level, va, level, ent);

			audit_mappings_page(vcpu, ent, va, level - 1);
		} else {
			gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
			struct page *page = gpa_to_page(vcpu, gpa);
			hpa_t hpa = page_to_phys(page);

			if (is_shadow_present_pte(ent)
			    && (ent & PT64_BASE_ADDR_MASK) != hpa)
				printk(KERN_ERR "xx audit error: (%s) levels %d"
				       " gva %lx gpa %llx hpa %llx ent %llx %d\n",
				       audit_msg, vcpu->arch.mmu.root_level,
				       va, gpa, hpa, ent,
				       is_shadow_present_pte(ent));
			else if (ent == shadow_notrap_nonpresent_pte
				 && !is_error_hpa(hpa))
				printk(KERN_ERR "audit: (%s) notrap shadow,"
				       " valid guest gva %lx\n", audit_msg, va);
			kvm_release_page_clean(page);

		}
	}
}

static void audit_mappings(struct kvm_vcpu *vcpu)
{
	unsigned i;

	if (vcpu->arch.mmu.root_level == 4)
		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
	else
		for (i = 0; i < 4; ++i)
			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
				audit_mappings_page(vcpu,
						    vcpu->arch.mmu.pae_root[i],
						    i << 30,
						    2);
}

static int count_rmaps(struct kvm_vcpu *vcpu)
{
	int nmaps = 0;
	int i, j, k;

	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
		struct kvm_rmap_desc *d;

		for (j = 0; j < m->npages; ++j) {
			unsigned long *rmapp = &m->rmap[j];

			if (!*rmapp)
				continue;
			if (!(*rmapp & 1)) {
				++nmaps;
				continue;
			}
			d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
			while (d) {
				for (k = 0; k < RMAP_EXT; ++k)
					if (d->shadow_ptes[k])
						++nmaps;
					else
						break;
				d = d->more;
			}
		}
	}
	return nmaps;
}

static int count_writable_mappings(struct kvm_vcpu *vcpu)
{
	int nmaps = 0;
	struct kvm_mmu_page *sp;
	int i;

	list_for_each_entry(sp, &vcpu->kvm->active_mmu_pages, link) {
		u64 *pt = sp->spt;

		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
			continue;

		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
			u64 ent = pt[i];

			if (!(ent & PT_PRESENT_MASK))
				continue;
			if (!(ent & PT_WRITABLE_MASK))
				continue;
			++nmaps;
		}
	}
	return nmaps;
}

static void audit_rmap(struct kvm_vcpu *vcpu)
{
	int n_rmap = count_rmaps(vcpu);
	int n_actual = count_writable_mappings(vcpu);

	if (n_rmap != n_actual)
		printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
		       __FUNCTION__, audit_msg, n_rmap, n_actual);
}

static void audit_write_protection(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;
	struct kvm_memory_slot *slot;
	unsigned long *rmapp;
	gfn_t gfn;

	list_for_each_entry(sp, &vcpu->kvm->active_mmu_pages, link) {
		if (sp->role.metaphysical)
			continue;

		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
		gfn = unalias_gfn(vcpu->kvm, sp->gfn);
		rmapp = &slot->rmap[gfn - slot->base_gfn];
		if (*rmapp)
			printk(KERN_ERR "%s: (%s) shadow page has writable"
			       " mappings: gfn %lx role %x\n",
			       __FUNCTION__, audit_msg, sp->gfn,
			       sp->role.word);
	}
}

static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
{
	int olddbg = dbg;

	dbg = 0;
	audit_msg = msg;
	audit_rmap(vcpu);
	audit_write_protection(vcpu);
	audit_mappings(vcpu);
	dbg = olddbg;
}

#endif