/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/uaccess.h>

#include <asm/page.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
#include <asm/vmx.h>

/*
 * When setting this variable to true, it enables Two-Dimensional-Paging
 * where the hardware walks 2 page tables:
 * 1. the guest-virtual to guest-physical
 * 2. while doing 1. it walks guest-physical to host-physical
 * If the hardware supports that, we don't need to do shadow paging.
 */
bool tdp_enabled = false;

#undef MMU_DEBUG

#undef AUDIT

#ifdef AUDIT
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
#else
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
#endif

#ifdef MMU_DEBUG

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)

#else

#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)

#endif

#if defined(MMU_DEBUG) || defined(AUDIT)
static int dbg = 0;
module_param(dbg, bool, 0644);
#endif

static int oos_shadow = 1;
module_param(oos_shadow, bool, 0644);

#ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0)
#else
A
Avi Kivity 已提交
85 86 87 88 89
#define ASSERT(x)							\
	if (!(x)) {							\
		printk(KERN_WARNING "assertion failed %s:%d: %s\n",	\
		       __FILE__, __LINE__, #x);				\
	}
90
#endif
A
Avi Kivity 已提交
91 92 93 94 95 96 97

#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
M
Mike Day 已提交
98
		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
A
Avi Kivity 已提交
99 100 101 102 103 104 105 106 107 108 109

#define PT64_LEVEL_MASK(level) \
		(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
M
Mike Day 已提交
110
		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
A
Avi Kivity 已提交
111 112 113

#define PT32_LEVEL_MASK(level) \
		(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
114 115 116
#define PT32_LVL_OFFSET_MASK(level) \
	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT32_LEVEL_BITS))) - 1))
A
Avi Kivity 已提交
117 118 119 120 121

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


122
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
A
Avi Kivity 已提交
123 124
#define PT64_DIR_BASE_ADDR_MASK \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
125 126 127 128 129 130
#define PT64_LVL_ADDR_MASK(level) \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))
#define PT64_LVL_OFFSET_MASK(level) \
	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))
A
Avi Kivity 已提交
131 132 133 134

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
135 136 137
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
					    * PT32_LEVEL_BITS))) - 1))
A
Avi Kivity 已提交
138

139 140
#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
			| PT64_NX_MASK)
A
Avi Kivity 已提交
141

142 143
#define RMAP_EXT 4

144 145 146 147 148
#define ACC_EXEC_MASK    1
#define ACC_WRITE_MASK   PT_WRITABLE_MASK
#define ACC_USER_MASK    PT_USER_MASK
#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

149 150
#include <trace/events/kvm.h>

151 152 153
#define CREATE_TRACE_POINTS
#include "mmutrace.h"

154 155
#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)

156 157
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

158
struct kvm_rmap_desc {
	u64 *sptes[RMAP_EXT];
160 161 162
	struct kvm_rmap_desc *more;
};

163 164 165 166 167 168 169 170 171 172 173 174 175
struct kvm_shadow_walk_iterator {
	u64 addr;
	hpa_t shadow_addr;
	int level;
	u64 *sptep;
	unsigned index;
};

#define for_each_shadow_entry(_vcpu, _addr, _walker)    \
	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))

176
typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
M
Marcelo Tosatti 已提交
177

178 179
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
180
static struct kmem_cache *mmu_page_header_cache;
181

182 183
static u64 __read_mostly shadow_trap_nonpresent_pte;
static u64 __read_mostly shadow_notrap_nonpresent_pte;
static u64 __read_mostly shadow_base_present_pte;
static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
190

191 192 193 194 195
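/* Build a mask with bits s..e (inclusive) set; used for reserved-bit checks. */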
static inline u64 rsvd_bits(int s, int e)
{
	return ((1ULL << (e - s + 1)) - 1) << s;
}

196 197 198 199 200 201 202
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
{
	shadow_trap_nonpresent_pte = trap_pte;
	shadow_notrap_nonpresent_pte = notrap_pte;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);

void kvm_mmu_set_base_ptes(u64 base_pte)
{
	shadow_base_present_pte = base_pte;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);

void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
210
		u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
	shadow_user_mask = user_mask;
	shadow_accessed_mask = accessed_mask;
	shadow_dirty_mask = dirty_mask;
	shadow_nx_mask = nx_mask;
	shadow_x_mask = x_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);

220
static bool is_write_protection(struct kvm_vcpu *vcpu)
{
222
	return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
A
Avi Kivity 已提交
223 224 225 226 227 228 229
}

static int is_cpuid_PSE36(void)
{
	return 1;
}

230 231
static int is_nx(struct kvm_vcpu *vcpu)
{
232
	return vcpu->arch.efer & EFER_NX;
233 234
}

235 236 237 238 239 240
static int is_shadow_present_pte(u64 pte)
{
	return pte != shadow_trap_nonpresent_pte
		&& pte != shadow_notrap_nonpresent_pte;
}

M
Marcelo Tosatti 已提交
241 242 243 244 245
static int is_large_pte(u64 pte)
{
	return pte & PT_PAGE_SIZE_MASK;
}

246
static int is_writable_pte(unsigned long pte)
A
Avi Kivity 已提交
247 248 249 250
{
	return pte & PT_WRITABLE_MASK;
}

251
static int is_dirty_gpte(unsigned long pte)
252
{
	return pte & PT_DIRTY_MASK;
254 255
}

256
static int is_rmap_spte(u64 pte)
257
{
258
	return is_shadow_present_pte(pte);
259 260
}

261 262 263 264
static int is_last_spte(u64 pte, int level)
{
	if (level == PT_PAGE_TABLE_LEVEL)
		return 1;
265
	if (is_large_pte(pte))
266 267 268 269
		return 1;
	return 0;
}

270
static pfn_t spte_to_pfn(u64 pte)
271
{
272
	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
273 274
}

275 276 277 278 279 280 281
static gfn_t pse36_gfn_delta(u32 gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}

static void __set_spte(u64 *sptep, u64 spte)
283 284 285 286 287 288 289 290
{
#ifdef CONFIG_X86_64
	set_64bit((unsigned long *)sptep, spte);
#else
	set_64bit((unsigned long long *)sptep, spte);
#endif
}

291
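/*
 * Top up a fixed-size object cache so that later allocations from it can be
 * served without sleeping (the MMU allocates while holding mmu_lock).
 */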
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
292
				  struct kmem_cache *base_cache, int min)
293 294 295 296
{
	void *obj;

	if (cache->nobjs >= min)
297
		return 0;
298
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
299
		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
300
		if (!obj)
301
			return -ENOMEM;
302 303
		cache->objects[cache->nobjs++] = obj;
	}
304
	return 0;
305 306
}

307 308
static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
				  struct kmem_cache *cache)
309 310
{
	while (mc->nobjs)
311
		kmem_cache_free(cache, mc->objects[--mc->nobjs]);
312 313
}

static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
315
				       int min)
A
Avi Kivity 已提交
316 317 318 319 320 321
{
	struct page *page;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
322
		page = alloc_page(GFP_KERNEL);
A
Avi Kivity 已提交
323 324 325 326 327 328 329 330 331 332
		if (!page)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = page_address(page);
	}
	return 0;
}

static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
333
		free_page((unsigned long)mc->objects[--mc->nobjs]);
}

336
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
337
{
338 339
	int r;

340
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
341
				   pte_chain_cache, 4);
342 343
	if (r)
		goto out;
344
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
345
				   rmap_desc_cache, 4);
346 347
	if (r)
		goto out;
348
	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
349 350
	if (r)
		goto out;
351
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
352
				   mmu_page_header_cache, 4);
353 354
out:
	return r;
355 356 357 358
}

static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
359 360
	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
361
	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
362 363
	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
				mmu_page_header_cache);
364 365 366 367 368 369 370 371 372 373 374 375 376 377
}

static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
				    size_t size)
{
	void *p;

	BUG_ON(!mc->nobjs);
	p = mc->objects[--mc->nobjs];
	return p;
}

static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
{
378
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
379 380 381
				      sizeof(struct kvm_pte_chain));
}

382
static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
383
{
384
	kmem_cache_free(pte_chain_cache, pc);
385 386 387 388
}

static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
{
389
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
390 391 392
				      sizeof(struct kvm_rmap_desc));
}

393
static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
394
{
395
	kmem_cache_free(rmap_desc_cache, rd);
396 397
}

398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
	if (!sp->role.direct)
		return sp->gfns[index];

	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
}

static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
{
	if (sp->role.direct)
		BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
	else
		sp->gfns[index] = gfn;
}

/*
 * Return the pointer to the largepage write count for a given
 * gfn, handling slots that are not large page aligned.
 */
418 419 420
static int *slot_largepage_idx(gfn_t gfn,
			       struct kvm_memory_slot *slot,
			       int level)
M
Marcelo Tosatti 已提交
421 422 423
{
	unsigned long idx;

424 425
	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
	      (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
426
	return &slot->lpage_info[level - 2][idx].write_count;
M
Marcelo Tosatti 已提交
427 428 429 430
}

static void account_shadowed(struct kvm *kvm, gfn_t gfn)
{
431
	struct kvm_memory_slot *slot;
M
Marcelo Tosatti 已提交
432
	int *write_count;
433
	int i;
M
Marcelo Tosatti 已提交
434

	slot = gfn_to_memslot(kvm, gfn);
436 437 438 439 440
	for (i = PT_DIRECTORY_LEVEL;
	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
		write_count   = slot_largepage_idx(gfn, slot, i);
		*write_count += 1;
	}
M
Marcelo Tosatti 已提交
441 442 443 444
}

static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
{
445
	struct kvm_memory_slot *slot;
M
Marcelo Tosatti 已提交
446
	int *write_count;
447
	int i;
M
Marcelo Tosatti 已提交
448

A
Avi Kivity 已提交
449
	slot = gfn_to_memslot(kvm, gfn);
450 451 452 453 454 455
	for (i = PT_DIRECTORY_LEVEL;
	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
		write_count   = slot_largepage_idx(gfn, slot, i);
		*write_count -= 1;
		WARN_ON(*write_count < 0);
	}
M
Marcelo Tosatti 已提交
456 457
}

458 459 460
static int has_wrprotected_page(struct kvm *kvm,
				gfn_t gfn,
				int level)
M
Marcelo Tosatti 已提交
461
{
462
	struct kvm_memory_slot *slot;
M
Marcelo Tosatti 已提交
463 464
	int *largepage_idx;

A
Avi Kivity 已提交
465
	slot = gfn_to_memslot(kvm, gfn);
M
Marcelo Tosatti 已提交
466
	if (slot) {
467
		largepage_idx = slot_largepage_idx(gfn, slot, level);
M
Marcelo Tosatti 已提交
468 469 470 471 472 473
		return *largepage_idx;
	}

	return 1;
}

474
static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
M
Marcelo Tosatti 已提交
475
{
	unsigned long page_size;
477
	int i, ret = 0;
M
Marcelo Tosatti 已提交
478

J
Joerg Roedel 已提交
479
	page_size = kvm_host_page_size(kvm, gfn);
M
Marcelo Tosatti 已提交
480

481 482 483 484 485 486 487 488
	for (i = PT_PAGE_TABLE_LEVEL;
	     i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
		if (page_size >= KVM_HPAGE_SIZE(i))
			ret = i;
		else
			break;
	}

489
	return ret;
M
Marcelo Tosatti 已提交
490 491
}

492
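/*
 * Pick the largest page size usable for @large_gfn: limited by the host
 * mapping size, the hardware's largest page level and any write-protected
 * pages within the candidate range.
 */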
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
M
Marcelo Tosatti 已提交
493 494
{
	struct kvm_memory_slot *slot;
495
	int host_level, level, max_level;
M
Marcelo Tosatti 已提交
496 497 498

	slot = gfn_to_memslot(vcpu->kvm, large_gfn);
	if (slot && slot->dirty_bitmap)
499
		return PT_PAGE_TABLE_LEVEL;
M
Marcelo Tosatti 已提交
500

501 502 503 504 505
	host_level = host_mapping_level(vcpu->kvm, large_gfn);

	if (host_level == PT_PAGE_TABLE_LEVEL)
		return host_level;

506 507 508 509
	max_level = kvm_x86_ops->get_lpage_level() < host_level ?
		kvm_x86_ops->get_lpage_level() : host_level;

	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
510 511 512 513
		if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
			break;

	return level - 1;
M
Marcelo Tosatti 已提交
514 515
}

516 517 518 519
/*
 * Take gfn and return the reverse mapping to it.
 */

520
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
521 522
{
	struct kvm_memory_slot *slot;
M
Marcelo Tosatti 已提交
523
	unsigned long idx;
524 525

	slot = gfn_to_memslot(kvm, gfn);
526
	if (likely(level == PT_PAGE_TABLE_LEVEL))
M
Marcelo Tosatti 已提交
527 528
		return &slot->rmap[gfn - slot->base_gfn];

529 530
	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
		(slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
M
Marcelo Tosatti 已提交
531

532
	return &slot->lpage_info[level - 2][idx].rmap_pde;
533 534
}

/*
 * Reverse mapping data structures:
 *
 * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
 * that points to page_address(page).
 *
 * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
 * containing more mappings.
 *
 * Returns the number of rmap entries before the spte was added or zero if
 * the spte was not added.
 *
 */
548
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
549
{
550
	struct kvm_mmu_page *sp;
551
	struct kvm_rmap_desc *desc;
552
	unsigned long *rmapp;
553
	int i, count = 0;
554

555
	if (!is_rmap_spte(*spte))
556
		return count;
557
	sp = page_header(__pa(spte));
558
	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
559
	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
560
	if (!*rmapp) {
561
		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
562 563
		*rmapp = (unsigned long)spte;
	} else if (!(*rmapp & 1)) {
564
		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
565
		desc = mmu_alloc_rmap_desc(vcpu);
		desc->sptes[0] = (u64 *)*rmapp;
		desc->sptes[1] = spte;
568
		*rmapp = (unsigned long)desc | 1;
569 570
	} else {
		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
571
		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
A
Avi Kivity 已提交
572
		while (desc->sptes[RMAP_EXT-1] && desc->more) {
573
			desc = desc->more;
574 575
			count += RMAP_EXT;
		}
A
Avi Kivity 已提交
576
		if (desc->sptes[RMAP_EXT-1]) {
577
			desc->more = mmu_alloc_rmap_desc(vcpu);
578 579
			desc = desc->more;
		}
A
Avi Kivity 已提交
580
		for (i = 0; desc->sptes[i]; ++i)
581
			;
A
Avi Kivity 已提交
582
		desc->sptes[i] = spte;
583
	}
584
	return count;
585 586
}

587
static void rmap_desc_remove_entry(unsigned long *rmapp,
588 589 590 591 592 593
				   struct kvm_rmap_desc *desc,
				   int i,
				   struct kvm_rmap_desc *prev_desc)
{
	int j;

A
Avi Kivity 已提交
594
	for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
595
		;
A
Avi Kivity 已提交
596 597
	desc->sptes[i] = desc->sptes[j];
	desc->sptes[j] = NULL;
598 599 600
	if (j != 0)
		return;
	if (!prev_desc && !desc->more)
A
Avi Kivity 已提交
601
		*rmapp = (unsigned long)desc->sptes[0];
602 603 604 605
	else
		if (prev_desc)
			prev_desc->more = desc->more;
		else
606
			*rmapp = (unsigned long)desc->more | 1;
607
	mmu_free_rmap_desc(desc);
608 609
}

610
static void rmap_remove(struct kvm *kvm, u64 *spte)
611 612 613
{
	struct kvm_rmap_desc *desc;
	struct kvm_rmap_desc *prev_desc;
614
	struct kvm_mmu_page *sp;
615
	pfn_t pfn;
616
	gfn_t gfn;
617
	unsigned long *rmapp;
618 619
	int i;

620
	if (!is_rmap_spte(*spte))
621
		return;
622
	sp = page_header(__pa(spte));
623
	pfn = spte_to_pfn(*spte);
	if (*spte & shadow_accessed_mask)
625
		kvm_set_pfn_accessed(pfn);
626
	if (is_writable_pte(*spte))
627
		kvm_set_pfn_dirty(pfn);
628 629
	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
630
	if (!*rmapp) {
631 632
		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
		BUG();
633
	} else if (!(*rmapp & 1)) {
634
		rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
635
		if ((u64 *)*rmapp != spte) {
636 637 638 639
			printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
			       spte, *spte);
			BUG();
		}
640
		*rmapp = 0;
641 642
	} else {
		rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
643
		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
644 645
		prev_desc = NULL;
		while (desc) {
A
Avi Kivity 已提交
646 647
			for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
				if (desc->sptes[i] == spte) {
648
					rmap_desc_remove_entry(rmapp,
649
							       desc, i,
650 651 652 653 654 655
							       prev_desc);
					return;
				}
			prev_desc = desc;
			desc = desc->more;
		}
656
		pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
657 658 659 660
		BUG();
	}
}

static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
{
	rmap_remove(kvm, sptep);
	__set_spte(sptep, new_spte);
}

667
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
668 669
{
	struct kvm_rmap_desc *desc;
670 671 672 673 674 675 676 677 678 679 680 681 682
	u64 *prev_spte;
	int i;

	if (!*rmapp)
		return NULL;
	else if (!(*rmapp & 1)) {
		if (!spte)
			return (u64 *)*rmapp;
		return NULL;
	}
	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
	prev_spte = NULL;
	while (desc) {
A
Avi Kivity 已提交
683
		for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
684
			if (prev_spte == spte)
A
Avi Kivity 已提交
685 686
				return desc->sptes[i];
			prev_spte = desc->sptes[i];
687 688 689 690 691 692
		}
		desc = desc->more;
	}
	return NULL;
}

693
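/*
 * Remove write access from every spte that maps @gfn; large-page sptes are
 * dropped entirely.  Returns nonzero if anything changed so the caller can
 * flush remote TLBs.
 */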
static int rmap_write_protect(struct kvm *kvm, u64 gfn)
694
{
695
	unsigned long *rmapp;
696
	u64 *spte;
697
	int i, write_protected = 0;
698

699
	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
700

701 702
	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
703 704 705
		BUG_ON(!spte);
		BUG_ON(!(*spte & PT_PRESENT_MASK));
		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
706
		if (is_writable_pte(*spte)) {
A
Avi Kivity 已提交
707
			__set_spte(spte, *spte & ~PT_WRITABLE_MASK);
708 709
			write_protected = 1;
		}
710
		spte = rmap_next(kvm, rmapp, spte);
711
	}
712
	if (write_protected) {
713
		pfn_t pfn;
714 715

		spte = rmap_next(kvm, rmapp, NULL);
716 717
		pfn = spte_to_pfn(*spte);
		kvm_set_pfn_dirty(pfn);
718 719
	}

	/* check for huge page mappings */
721 722 723 724 725 726 727 728 729
	for (i = PT_DIRECTORY_LEVEL;
	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
		rmapp = gfn_to_rmap(kvm, gfn, i);
		spte = rmap_next(kvm, rmapp, NULL);
		while (spte) {
			BUG_ON(!spte);
			BUG_ON(!(*spte & PT_PRESENT_MASK));
			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
730
			if (is_writable_pte(*spte)) {
A
Avi Kivity 已提交
731 732
				drop_spte(kvm, spte,
					  shadow_trap_nonpresent_pte);
733 734 735 736 737
				--kvm->stat.lpages;
				spte = NULL;
				write_protected = 1;
			}
			spte = rmap_next(kvm, rmapp, spte);
M
Marcelo Tosatti 已提交
738 739 740
		}
	}

741
	return write_protected;
742 743
}

F
Frederik Deweerdt 已提交
744 745
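/*
 * Drop every spte on this rmap chain; returns nonzero if a TLB flush is
 * needed afterwards.
 */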
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
			   unsigned long data)
746 747 748 749 750 751 752
{
	u64 *spte;
	int need_tlb_flush = 0;

	while ((spte = rmap_next(kvm, rmapp, NULL))) {
		BUG_ON(!(*spte & PT_PRESENT_MASK));
		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
A
Avi Kivity 已提交
753
		drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
754 755 756 757 758
		need_tlb_flush = 1;
	}
	return need_tlb_flush;
}

F
Frederik Deweerdt 已提交
759 760
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
			     unsigned long data)
761 762 763 764 765 766 767 768 769 770 771 772 773 774
{
	int need_flush = 0;
	u64 *spte, new_spte;
	pte_t *ptep = (pte_t *)data;
	pfn_t new_pfn;

	WARN_ON(pte_huge(*ptep));
	new_pfn = pte_pfn(*ptep);
	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
		BUG_ON(!is_shadow_present_pte(*spte));
		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
		need_flush = 1;
		if (pte_write(*ptep)) {
A
Avi Kivity 已提交
775
			drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
776 777 778 779 780 781 782
			spte = rmap_next(kvm, rmapp, NULL);
		} else {
			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
			new_spte |= (u64)new_pfn << PAGE_SHIFT;

			new_spte &= ~PT_WRITABLE_MASK;
			new_spte &= ~SPTE_HOST_WRITEABLE;
783
			if (is_writable_pte(*spte))
784 785 786 787 788 789 790 791 792 793 794
				kvm_set_pfn_dirty(spte_to_pfn(*spte));
			__set_spte(spte, new_spte);
			spte = rmap_next(kvm, rmapp, spte);
		}
	}
	if (need_flush)
		kvm_flush_remote_tlbs(kvm);

	return 0;
}

F
Frederik Deweerdt 已提交
795 796
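/*
 * Apply @handler to every rmap chain (4k and huge-page) of each memslot that
 * contains @hva.
 */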
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  unsigned long data,
797
			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
F
Frederik Deweerdt 已提交
798
					 unsigned long data))
799
{
800
	int i, j;
801
	int ret;
802
	int retval = 0;
803 804
	struct kvm_memslots *slots;

805
	slots = kvm_memslots(kvm);
806

807 808
	for (i = 0; i < slots->nmemslots; i++) {
		struct kvm_memory_slot *memslot = &slots->memslots[i];
809 810 811 812 813 814
		unsigned long start = memslot->userspace_addr;
		unsigned long end;

		end = start + (memslot->npages << PAGE_SHIFT);
		if (hva >= start && hva < end) {
			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
815

816
			ret = handler(kvm, &memslot->rmap[gfn_offset], data);
817 818 819 820

			for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
				int idx = gfn_offset;
				idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
821
				ret |= handler(kvm,
822 823
					&memslot->lpage_info[j][idx].rmap_pde,
					data);
824
			}
825 826
			trace_kvm_age_page(hva, memslot, ret);
			retval |= ret;
827 828 829 830 831 832 833 834
		}
	}

	return retval;
}

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
835 836 837 838 839
	return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
}

void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
F
Frederik Deweerdt 已提交
840
	kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
841 842
}

F
Frederik Deweerdt 已提交
843 844
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
			 unsigned long data)
845 846 847 848
{
	u64 *spte;
	int young = 0;

849 850 851 852 853 854 855
	/*
	 * Emulate the accessed bit for EPT, by checking if this page has
	 * an EPT mapping, and clearing it if it does. On the next access,
	 * a new EPT mapping will be established.
	 * This has some overhead, but not as much as the cost of swapping
	 * out actively used pages or breaking up actively used hugepages.
	 */
856
	if (!shadow_accessed_mask)
857
		return kvm_unmap_rmapp(kvm, rmapp, data);
858

859 860 861 862 863 864 865 866 867 868 869 870 871 872 873
	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
		int _young;
		u64 _spte = *spte;
		BUG_ON(!(_spte & PT_PRESENT_MASK));
		_young = _spte & PT_ACCESSED_MASK;
		if (_young) {
			young = 1;
			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
		}
		spte = rmap_next(kvm, rmapp, spte);
	}
	return young;
}

874 875
#define RMAP_RECYCLE_THRESHOLD 1000

876
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
877 878
{
	unsigned long *rmapp;
879 880 881
	struct kvm_mmu_page *sp;

	sp = page_header(__pa(spte));
882

883
	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
884

885
	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
886 887 888
	kvm_flush_remote_tlbs(vcpu->kvm);
}

889 890
int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
891
	return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
892 893
}

894
#ifdef MMU_DEBUG
895
static int is_empty_shadow_page(u64 *spt)
{
897 898 899
	u64 *pos;
	u64 *end;

900
	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
901
		if (is_shadow_present_pte(*pos)) {
902
			printk(KERN_ERR "%s: %p %llx\n", __func__,
903
			       pos, *pos);
A
Avi Kivity 已提交
904
			return 0;
905
		}
A
Avi Kivity 已提交
906 907
	return 1;
}
908
#endif
A
Avi Kivity 已提交
909

910
static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
911
{
912
	ASSERT(is_empty_shadow_page(sp->spt));
913
	hlist_del(&sp->hash_link);
914 915
	list_del(&sp->link);
	__free_page(virt_to_page(sp->spt));
916 917
	if (!sp->role.direct)
		__free_page(virt_to_page(sp->gfns));
918
	kmem_cache_free(mmu_page_header_cache, sp);
919
	++kvm->arch.n_free_mmu_pages;
920 921
}

922 923
static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
924
	return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
925 926
}

927
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
928
					       u64 *parent_pte, int direct)
A
Avi Kivity 已提交
929
{
930
	struct kvm_mmu_page *sp;
A
Avi Kivity 已提交
931

932 933
	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
934 935 936
	if (!direct)
		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
						  PAGE_SIZE);
937
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
938
	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
939
	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
940 941
	sp->multimapped = 0;
	sp->parent_pte = parent_pte;
942
	--vcpu->kvm->arch.n_free_mmu_pages;
943
	return sp;
A
Avi Kivity 已提交
944 945
}

946
static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
947
				    struct kvm_mmu_page *sp, u64 *parent_pte)
948 949 950 951 952 953 954
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	int i;

	if (!parent_pte)
		return;
955 956
	if (!sp->multimapped) {
		u64 *old = sp->parent_pte;
957 958

		if (!old) {
959
			sp->parent_pte = parent_pte;
960 961
			return;
		}
962
		sp->multimapped = 1;
963
		pte_chain = mmu_alloc_pte_chain(vcpu);
964 965
		INIT_HLIST_HEAD(&sp->parent_ptes);
		hlist_add_head(&pte_chain->link, &sp->parent_ptes);
966 967
		pte_chain->parent_ptes[0] = old;
	}
968
	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
969 970 971 972 973 974 975 976
		if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
			continue;
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
			if (!pte_chain->parent_ptes[i]) {
				pte_chain->parent_ptes[i] = parent_pte;
				return;
			}
	}
977
	pte_chain = mmu_alloc_pte_chain(vcpu);
978
	BUG_ON(!pte_chain);
979
	hlist_add_head(&pte_chain->link, &sp->parent_ptes);
980 981 982
	pte_chain->parent_ptes[0] = parent_pte;
}

983
static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
984 985 986 987 988 989
				       u64 *parent_pte)
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	int i;

990 991 992
	if (!sp->multimapped) {
		BUG_ON(sp->parent_pte != parent_pte);
		sp->parent_pte = NULL;
993 994
		return;
	}
995
	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
996 997 998 999 1000
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
			if (!pte_chain->parent_ptes[i])
				break;
			if (pte_chain->parent_ptes[i] != parent_pte)
				continue;
1001 1002
			while (i + 1 < NR_PTE_CHAIN_ENTRIES
				&& pte_chain->parent_ptes[i + 1]) {
1003 1004 1005 1006 1007
				pte_chain->parent_ptes[i]
					= pte_chain->parent_ptes[i + 1];
				++i;
			}
			pte_chain->parent_ptes[i] = NULL;
1008 1009
			if (i == 0) {
				hlist_del(&pte_chain->link);
1010
				mmu_free_pte_chain(pte_chain);
1011 1012 1013
				if (hlist_empty(&sp->parent_ptes)) {
					sp->multimapped = 0;
					sp->parent_pte = NULL;
1014 1015
				}
			}
1016 1017 1018 1019 1020
			return;
		}
	BUG();
}

1021
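/* Walk all parent sptes that point to @sp and invoke @fn on each of them. */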
static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
M
Marcelo Tosatti 已提交
1022 1023 1024 1025 1026 1027 1028 1029
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	struct kvm_mmu_page *parent_sp;
	int i;

	if (!sp->multimapped && sp->parent_pte) {
		parent_sp = page_header(__pa(sp->parent_pte));
1030
		fn(parent_sp, sp->parent_pte);
M
Marcelo Tosatti 已提交
1031 1032
		return;
	}
1033

M
Marcelo Tosatti 已提交
1034 1035
	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1036 1037 1038
			u64 *spte = pte_chain->parent_ptes[i];

			if (!spte)
M
Marcelo Tosatti 已提交
1039
				break;
1040 1041
			parent_sp = page_header(__pa(spte));
			fn(parent_sp, spte);
M
Marcelo Tosatti 已提交
1042 1043 1044
		}
}

1045 1046
static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1047
{
1048
	mmu_parent_walk(sp, mark_unsync);
1049 1050
}

1051
static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
1052
{
1053
	unsigned int index;
1054

1055 1056
	index = spte - sp->spt;
	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1057
		return;
1058
	if (sp->unsync_children++)
1059
		return;
1060
	kvm_mmu_mark_parents_unsync(sp);
1061 1062
}

1063 1064 1065 1066 1067 1068 1069 1070 1071
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp)
{
	int i;

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
		sp->spt[i] = shadow_trap_nonpresent_pte;
}

1072
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1073
			       struct kvm_mmu_page *sp, bool clear_unsync)
1074 1075 1076 1077
{
	return 1;
}

M
Marcelo Tosatti 已提交
1078 1079 1080 1081
static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
}

1082 1083 1084 1085 1086 1087 1088 1089 1090 1091
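/*
 * Scratch structure used when walking unsync children: collects up to
 * KVM_PAGE_ARRAY_NR (shadow page, parent index) pairs per pass.
 */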
#define KVM_PAGE_ARRAY_NR 16

struct kvm_mmu_pages {
	struct mmu_page_and_offset {
		struct kvm_mmu_page *sp;
		unsigned int idx;
	} page[KVM_PAGE_ARRAY_NR];
	unsigned int nr;
};

1092 1093 1094 1095 1096
#define for_each_unsync_children(bitmap, idx)		\
	for (idx = find_first_bit(bitmap, 512);		\
	     idx < 512;					\
	     idx = find_next_bit(bitmap, 512, idx+1))

1097 1098
static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
			 int idx)
1099
{
1100
	int i;
1101

1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116
	if (sp->unsync)
		for (i=0; i < pvec->nr; i++)
			if (pvec->page[i].sp == sp)
				return 0;

	pvec->page[pvec->nr].sp = sp;
	pvec->page[pvec->nr].idx = idx;
	pvec->nr++;
	return (pvec->nr == KVM_PAGE_ARRAY_NR);
}

static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
			   struct kvm_mmu_pages *pvec)
{
	int i, ret, nr_unsync_leaf = 0;
1117

1118
	for_each_unsync_children(sp->unsync_child_bitmap, i) {
1119
		struct kvm_mmu_page *child;
1120 1121
		u64 ent = sp->spt[i];

1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150
		if (!is_shadow_present_pte(ent) || is_large_pte(ent))
			goto clear_child_bitmap;

		child = page_header(ent & PT64_BASE_ADDR_MASK);

		if (child->unsync_children) {
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;

			ret = __mmu_unsync_walk(child, pvec);
			if (!ret)
				goto clear_child_bitmap;
			else if (ret > 0)
				nr_unsync_leaf += ret;
			else
				return ret;
		} else if (child->unsync) {
			nr_unsync_leaf++;
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;
		} else
			 goto clear_child_bitmap;

		continue;

clear_child_bitmap:
		__clear_bit(i, sp->unsync_child_bitmap);
		sp->unsync_children--;
		WARN_ON((int)sp->unsync_children < 0);
1151 1152 1153
	}


1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164
	return nr_unsync_leaf;
}

static int mmu_unsync_walk(struct kvm_mmu_page *sp,
			   struct kvm_mmu_pages *pvec)
{
	if (!sp->unsync_children)
		return 0;

	mmu_pages_add(pvec, sp, 0);
	return __mmu_unsync_walk(sp, pvec);
1165 1166 1167 1168 1169
}

static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	WARN_ON(!sp->unsync);
1170
	trace_kvm_mmu_sync_page(sp);
1171 1172 1173 1174
	sp->unsync = 0;
	--kvm->stat.mmu_unsync;
}

1175 1176 1177 1178
static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				    struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				    struct list_head *invalid_list);
1179

1180 1181
#define for_each_gfn_sp(kvm, sp, gfn, pos)				\
  hlist_for_each_entry(sp, pos,						\
1182 1183 1184
   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
	if ((sp)->gfn != (gfn)) {} else

1185 1186
#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos)		\
  hlist_for_each_entry(sp, pos,						\
1187 1188 1189 1190
   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
		if ((sp)->gfn != (gfn) || (sp)->role.direct ||		\
			(sp)->role.invalid) {} else

1191
/* @sp->gfn should be write-protected at the call site */
1192
static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1193
			   struct list_head *invalid_list, bool clear_unsync)
1194
{
1195
	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1196
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1197 1198 1199
		return 1;
	}

1200
	if (clear_unsync)
1201 1202
		kvm_unlink_unsync_page(vcpu->kvm, sp);

1203
	if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
1204
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1205 1206 1207 1208 1209 1210 1211
		return 1;
	}

	kvm_mmu_flush_tlb(vcpu);
	return 0;
}

1212 1213 1214
static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
				   struct kvm_mmu_page *sp)
{
1215
	LIST_HEAD(invalid_list);
1216 1217
	int ret;

1218
	ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
1219
	if (ret)
1220 1221
		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);

1222 1223 1224
	return ret;
}

1225 1226
static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			 struct list_head *invalid_list)
1227
{
1228
	return __kvm_sync_page(vcpu, sp, invalid_list, true);
1229 1230
}

1231 1232 1233 1234
/* @gfn should be write-protected at the call site */
static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
{
	struct kvm_mmu_page *s;
1235
	struct hlist_node *node;
1236
	LIST_HEAD(invalid_list);
1237 1238
	bool flush = false;

1239
	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1240
		if (!s->unsync)
1241 1242 1243 1244
			continue;

		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
		if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1245
			(vcpu->arch.mmu.sync_page(vcpu, s, true))) {
1246
			kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1247 1248 1249 1250 1251 1252
			continue;
		}
		kvm_unlink_unsync_page(vcpu->kvm, s);
		flush = true;
	}

1253
	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1254 1255 1256 1257
	if (flush)
		kvm_mmu_flush_tlb(vcpu);
}

1258 1259 1260
struct mmu_page_path {
	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
	unsigned int idx[PT64_ROOT_LEVEL-1];
1261 1262
};

1263 1264 1265 1266 1267 1268
#define for_each_sp(pvec, sp, parents, i)			\
		for (i = mmu_pages_next(&pvec, &parents, -1),	\
			sp = pvec.page[i].sp;			\
			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
			i = mmu_pages_next(&pvec, &parents, i))

1269 1270 1271
static int mmu_pages_next(struct kvm_mmu_pages *pvec,
			  struct mmu_page_path *parents,
			  int i)
1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289
{
	int n;

	for (n = i+1; n < pvec->nr; n++) {
		struct kvm_mmu_page *sp = pvec->page[n].sp;

		if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
			parents->idx[0] = pvec->page[n].idx;
			return n;
		}

		parents->parent[sp->role.level-2] = sp;
		parents->idx[sp->role.level-1] = pvec->page[n].idx;
	}

	return n;
}

1290
static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1291
{
1292 1293 1294 1295 1296
	struct kvm_mmu_page *sp;
	unsigned int level = 0;

	do {
		unsigned int idx = parents->idx[level];
1297

1298 1299 1300 1301 1302 1303 1304 1305 1306
		sp = parents->parent[level];
		if (!sp)
			return;

		--sp->unsync_children;
		WARN_ON((int)sp->unsync_children < 0);
		__clear_bit(idx, sp->unsync_child_bitmap);
		level++;
	} while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
1307 1308
}

1309 1310 1311
static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
			       struct mmu_page_path *parents,
			       struct kvm_mmu_pages *pvec)
1312
{
1313 1314 1315
	parents->parent[parent->role.level-1] = NULL;
	pvec->nr = 0;
}
1316

1317 1318 1319 1320 1321 1322 1323
static void mmu_sync_children(struct kvm_vcpu *vcpu,
			      struct kvm_mmu_page *parent)
{
	int i;
	struct kvm_mmu_page *sp;
	struct mmu_page_path parents;
	struct kvm_mmu_pages pages;
1324
	LIST_HEAD(invalid_list);
1325 1326 1327

	kvm_mmu_pages_init(parent, &parents, &pages);
	while (mmu_unsync_walk(parent, &pages)) {
1328 1329 1330 1331 1332 1333 1334 1335
		int protected = 0;

		for_each_sp(pages, sp, parents, i)
			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);

		if (protected)
			kvm_flush_remote_tlbs(vcpu->kvm);

1336
		for_each_sp(pages, sp, parents, i) {
1337
			kvm_sync_page(vcpu, sp, &invalid_list);
1338 1339
			mmu_pages_clear_parents(&parents);
		}
1340
		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1341
		cond_resched_lock(&vcpu->kvm->mmu_lock);
1342 1343
		kvm_mmu_pages_init(parent, &parents, &pages);
	}
1344 1345
}

1346 1347 1348 1349
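/*
 * Find a shadow page for (gfn, role) in the hash table, syncing or zapping
 * stale unsync entries on the way; allocate, hash and account a new page on
 * a cache miss.
 */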
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
					     gfn_t gfn,
					     gva_t gaddr,
					     unsigned level,
1350
					     int direct,
1351
					     unsigned access,
1352
					     u64 *parent_pte)
1353 1354 1355
{
	union kvm_mmu_page_role role;
	unsigned quadrant;
1356
	struct kvm_mmu_page *sp;
1357
	struct hlist_node *node;
1358
	bool need_sync = false;
1359

1360
	role = vcpu->arch.mmu.base_role;
1361
	role.level = level;
1362
	role.direct = direct;
1363
	if (role.direct)
1364
		role.cr4_pae = 0;
1365
	role.access = access;
1366
	if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1367 1368 1369 1370
		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
		role.quadrant = quadrant;
	}
1371
	for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
1372 1373
		if (!need_sync && sp->unsync)
			need_sync = true;
1374

1375 1376
		if (sp->role.word != role.word)
			continue;
1377

1378 1379
		if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
			break;
1380

1381 1382
		mmu_page_add_parent_pte(vcpu, sp, parent_pte);
		if (sp->unsync_children) {
1383
			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1384 1385 1386
			kvm_mmu_mark_parents_unsync(sp);
		} else if (sp->unsync)
			kvm_mmu_mark_parents_unsync(sp);
1387

1388 1389 1390
		trace_kvm_mmu_get_page(sp, false);
		return sp;
	}
	++vcpu->kvm->stat.mmu_cache_miss;
1392
	sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
1393 1394 1395 1396
	if (!sp)
		return sp;
	sp->gfn = gfn;
	sp->role = role;
1397 1398
	hlist_add_head(&sp->hash_link,
		&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
1399
	if (!direct) {
1400 1401
		if (rmap_write_protect(vcpu->kvm, gfn))
			kvm_flush_remote_tlbs(vcpu->kvm);
1402 1403 1404
		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
			kvm_sync_pages(vcpu, gfn);

1405 1406
		account_shadowed(vcpu->kvm, gfn);
	}
1407 1408 1409 1410
	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
		vcpu->arch.mmu.prefetch_page(vcpu, sp);
	else
		nonpaging_prefetch_page(vcpu, sp);
A
Avi Kivity 已提交
1411
	trace_kvm_mmu_get_page(sp, true);
1412
	return sp;
1413 1414
}

1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434
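/*
 * Iterator over the shadow page-table levels for @addr, starting at the
 * current root (with the special case for PAE roots).
 */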
static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
			     struct kvm_vcpu *vcpu, u64 addr)
{
	iterator->addr = addr;
	iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
	iterator->level = vcpu->arch.mmu.shadow_root_level;
	if (iterator->level == PT32E_ROOT_LEVEL) {
		iterator->shadow_addr
			= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
		iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
		--iterator->level;
		if (!iterator->shadow_addr)
			iterator->level = 0;
	}
}

static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
{
	if (iterator->level < PT_PAGE_TABLE_LEVEL)
		return false;
1435 1436 1437 1438 1439

	if (iterator->level == PT_PAGE_TABLE_LEVEL)
		if (is_large_pte(*iterator->sptep))
			return false;

1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450
	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
	iterator->sptep	= ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
	return true;
}

static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
{
	iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
	--iterator->level;
}

1451
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1452
					 struct kvm_mmu_page *sp)
1453
{
1454 1455 1456 1457
	unsigned i;
	u64 *pt;
	u64 ent;

1458
	pt = sp->spt;
1459 1460 1461 1462

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
		ent = pt[i];

M
Marcelo Tosatti 已提交
1463
		if (is_shadow_present_pte(ent)) {
1464
			if (!is_last_spte(ent, sp->role.level)) {
M
Marcelo Tosatti 已提交
1465 1466 1467 1468
				ent &= PT64_BASE_ADDR_MASK;
				mmu_page_remove_parent_pte(page_header(ent),
							   &pt[i]);
			} else {
1469 1470
				if (is_large_pte(ent))
					--kvm->stat.lpages;
A
Avi Kivity 已提交
1471 1472
				drop_spte(kvm, &pt[i],
					  shadow_trap_nonpresent_pte);
M
Marcelo Tosatti 已提交
1473 1474
			}
		}
1475
		pt[i] = shadow_trap_nonpresent_pte;
1476
	}
1477 1478
}

1479
static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1480
{
1481
	mmu_page_remove_parent_pte(sp, parent_pte);
1482 1483
}

1484 1485 1486
static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
{
	int i;
1487
	struct kvm_vcpu *vcpu;
1488

1489 1490
	kvm_for_each_vcpu(i, vcpu, kvm)
		vcpu->arch.last_pte_updated = NULL;
1491 1492
}

1493
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1494 1495 1496
{
	u64 *parent_pte;

1497 1498 1499
	while (sp->multimapped || sp->parent_pte) {
		if (!sp->multimapped)
			parent_pte = sp->parent_pte;
1500 1501 1502
		else {
			struct kvm_pte_chain *chain;

1503
			chain = container_of(sp->parent_ptes.first,
1504 1505 1506
					     struct kvm_pte_chain, link);
			parent_pte = chain->parent_ptes[0];
		}
1507
		BUG_ON(!parent_pte);
1508
		kvm_mmu_put_page(sp, parent_pte);
A
Avi Kivity 已提交
1509
		__set_spte(parent_pte, shadow_trap_nonpresent_pte);
1510
	}
1511 1512
}

1513
static int mmu_zap_unsync_children(struct kvm *kvm,
1514 1515
				   struct kvm_mmu_page *parent,
				   struct list_head *invalid_list)
1516
{
1517 1518 1519
	int i, zapped = 0;
	struct mmu_page_path parents;
	struct kvm_mmu_pages pages;
1520

1521
	if (parent->role.level == PT_PAGE_TABLE_LEVEL)
1522
		return 0;
1523 1524 1525 1526 1527 1528

	kvm_mmu_pages_init(parent, &parents, &pages);
	while (mmu_unsync_walk(parent, &pages)) {
		struct kvm_mmu_page *sp;

		for_each_sp(pages, sp, parents, i) {
1529
			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
1530
			mmu_pages_clear_parents(&parents);
1531
			zapped++;
1532 1533 1534 1535 1536
		}
		kvm_mmu_pages_init(parent, &parents, &pages);
	}

	return zapped;
1537 1538
}

1539 1540
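/*
 * Unlink @sp (and any unsync children) from the page tables and queue it on
 * @invalid_list; freeing and the TLB flush are deferred to
 * kvm_mmu_commit_zap_page().  Returns the number of pages zapped.
 */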
static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				    struct list_head *invalid_list)
1541
{
1542
	int ret;
A
Avi Kivity 已提交
1543

1544
	trace_kvm_mmu_prepare_zap_page(sp);
1545
	++kvm->stat.mmu_shadow_zapped;
1546
	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
1547
	kvm_mmu_page_unlink_children(kvm, sp);
1548
	kvm_mmu_unlink_parents(kvm, sp);
1549
	if (!sp->role.invalid && !sp->role.direct)
A
Avi Kivity 已提交
1550
		unaccount_shadowed(kvm, sp->gfn);
1551 1552
	if (sp->unsync)
		kvm_unlink_unsync_page(kvm, sp);
1553
	if (!sp->root_count) {
1554 1555
		/* Count self */
		ret++;
1556
		list_move(&sp->link, invalid_list);
1557
	} else {
A
Avi Kivity 已提交
1558
		list_move(&sp->link, &kvm->arch.active_mmu_pages);
1559 1560
		kvm_reload_remote_mmus(kvm);
	}
1561 1562

	sp->role.invalid = 1;
1563
	kvm_mmu_reset_last_pte_updated(kvm);
1564
	return ret;
1565 1566
}

1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				    struct list_head *invalid_list)
{
	struct kvm_mmu_page *sp;

	if (list_empty(invalid_list))
		return;

	kvm_flush_remote_tlbs(kvm);

	do {
		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
		WARN_ON(!sp->role.invalid || sp->root_count);
		kvm_mmu_free_page(kvm, sp);
	} while (!list_empty(invalid_list));

}

/*
 * Changing the number of mmu pages allocated to the vm
 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock
 */
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
{
	int used_pages;
	LIST_HEAD(invalid_list);

	used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
	used_pages = max(0, used_pages);

	/*
	 * If we set the number of mmu pages to be smaller than the
	 * number of active pages, we must free some mmu pages before we
	 * change the value.
	 */

1603
	if (used_pages > kvm_nr_mmu_pages) {
1604 1605
		while (used_pages > kvm_nr_mmu_pages &&
			!list_empty(&kvm->arch.active_mmu_pages)) {
1606 1607
			struct kvm_mmu_page *page;

1608
			page = container_of(kvm->arch.active_mmu_pages.prev,
1609
					    struct kvm_mmu_page, link);
1610 1611
			used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
							       &invalid_list);
1612
		}
1613
		kvm_mmu_commit_zap_page(kvm, &invalid_list);
1614
		kvm_nr_mmu_pages = used_pages;
1615
		kvm->arch.n_free_mmu_pages = 0;
1616 1617
	}
	else
1618 1619
		kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
					 - kvm->arch.n_alloc_mmu_pages;
1620

1621
	kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
1622 1623
}

1624
static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1625
{
1626
	struct kvm_mmu_page *sp;
1627
	struct hlist_node *node;
1628
	LIST_HEAD(invalid_list);
1629 1630
	int r;

1631
	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1632
	r = 0;
1633 1634

	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1635 1636 1637
		pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
			 sp->role.word);
		r = 1;
1638
		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1639
	}
1640
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
1641
	return r;
1642 1643
}

1644
static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1645
{
1646
	struct kvm_mmu_page *sp;
1647
	struct hlist_node *node;
1648
	LIST_HEAD(invalid_list);
1649

1650
	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1651 1652
		pgprintk("%s: zap %lx %x\n",
			 __func__, gfn, sp->role.word);
1653
		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1654
	}
1655
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
1656 1657
}

1658
static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
A
Avi Kivity 已提交
1659
{
1660
	int slot = memslot_id(kvm, gfn);
1661
	struct kvm_mmu_page *sp = page_header(__pa(pte));
A
Avi Kivity 已提交
1662

1663
	__set_bit(slot, sp->slot_bitmap);
A
Avi Kivity 已提交
1664 1665
}

1666 1667 1668 1669 1670 1671 1672 1673 1674 1675
static void mmu_convert_notrap(struct kvm_mmu_page *sp)
{
	int i;
	u64 *pt = sp->spt;

	if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
		return;

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
		if (pt[i] == shadow_notrap_nonpresent_pte)
A
Avi Kivity 已提交
1676
			__set_spte(&pt[i], shadow_trap_nonpresent_pte);
1677 1678 1679
	}
}

1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772
/*
 * The function is based on mtrr_type_lookup() in
 * arch/x86/kernel/cpu/mtrr/generic.c
 */
static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
			 u64 start, u64 end)
{
	int i;
	u64 base, mask;
	u8 prev_match, curr_match;
	int num_var_ranges = KVM_NR_VAR_MTRR;

	if (!mtrr_state->enabled)
		return 0xFF;

	/* Make end inclusive, instead of exclusive */
	end--;

	/* Look in fixed ranges. Just return the type as per start */
	if (mtrr_state->have_fixed && (start < 0x100000)) {
		int idx;

		if (start < 0x80000) {
			idx = 0;
			idx += (start >> 16);
			return mtrr_state->fixed_ranges[idx];
		} else if (start < 0xC0000) {
			idx = 1 * 8;
			idx += ((start - 0x80000) >> 14);
			return mtrr_state->fixed_ranges[idx];
		} else if (start < 0x1000000) {
			idx = 3 * 8;
			idx += ((start - 0xC0000) >> 12);
			return mtrr_state->fixed_ranges[idx];
		}
	}

	/*
	 * Look in variable ranges
	 * Look for multiple ranges matching this address and pick type
	 * as per MTRR precedence
	 */
	if (!(mtrr_state->enabled & 2))
		return mtrr_state->def_type;

	prev_match = 0xFF;
	for (i = 0; i < num_var_ranges; ++i) {
		unsigned short start_state, end_state;

		if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
			continue;

		base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
		       (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
		mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
		       (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);

		start_state = ((start & mask) == (base & mask));
		end_state = ((end & mask) == (base & mask));
		if (start_state != end_state)
			return 0xFE;

		if ((start & mask) != (base & mask))
			continue;

		curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
		if (prev_match == 0xFF) {
			prev_match = curr_match;
			continue;
		}

		if (prev_match == MTRR_TYPE_UNCACHABLE ||
		    curr_match == MTRR_TYPE_UNCACHABLE)
			return MTRR_TYPE_UNCACHABLE;

		if ((prev_match == MTRR_TYPE_WRBACK &&
		     curr_match == MTRR_TYPE_WRTHROUGH) ||
		    (prev_match == MTRR_TYPE_WRTHROUGH &&
		     curr_match == MTRR_TYPE_WRBACK)) {
			prev_match = MTRR_TYPE_WRTHROUGH;
			curr_match = MTRR_TYPE_WRTHROUGH;
		}

		if (prev_match != curr_match)
			return MTRR_TYPE_UNCACHABLE;
	}

	if (prev_match != 0xFF)
		return prev_match;

	return mtrr_state->def_type;
}

1773
u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1774 1775 1776 1777 1778 1779 1780 1781 1782
{
	u8 mtrr;

	mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
			     (gfn << PAGE_SHIFT) + PAGE_SIZE);
	if (mtrr == 0xfe || mtrr == 0xff)
		mtrr = MTRR_TYPE_WRBACK;
	return mtrr;
}
1783
EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
1784

1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795
static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
	trace_kvm_mmu_unsync_page(sp);
	++vcpu->kvm->stat.mmu_unsync;
	sp->unsync = 1;

	kvm_mmu_mark_parents_unsync(sp);
	mmu_convert_notrap(sp);
}

static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
1796 1797
{
	struct kvm_mmu_page *s;
1798
	struct hlist_node *node;
1799

1800
	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1801
		if (s->unsync)
1802
			continue;
1803 1804
		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
		__kvm_unsync_page(vcpu, s);
1805 1806 1807 1808 1809 1810
	}
}

static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
				  bool can_unsync)
{
1811
	struct kvm_mmu_page *s;
1812
	struct hlist_node *node;
1813 1814
	bool need_unsync = false;

1815
	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1816 1817 1818
		if (!can_unsync)
			return 1;

1819
		if (s->role.level != PT_PAGE_TABLE_LEVEL)
1820
			return 1;
1821 1822

		if (!need_unsync && !s->unsync) {
1823
			if (!oos_shadow)
1824 1825 1826
				return 1;
			need_unsync = true;
		}
1827
	}
1828 1829
	if (need_unsync)
		kvm_unsync_pages(vcpu, gfn);
1830 1831 1832
	return 0;
}

A
Avi Kivity 已提交
1833
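/*
 * Build and install a shadow pte for @gfn/@pfn with the given access bits,
 * applying the NX/user/dirty masks and the write-protection policy.
 * Returns nonzero if the gfn had to stay write-protected, in which case the
 * caller may need to emulate the faulting access.
 */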
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
M
Marcelo Tosatti 已提交
1834
		    unsigned pte_access, int user_fault,
1835
		    int write_fault, int dirty, int level,
1836
		    gfn_t gfn, pfn_t pfn, bool speculative,
1837
		    bool can_unsync, bool reset_host_protection)
1838 1839
{
	u64 spte;
M
Marcelo Tosatti 已提交
1840
	int ret = 0;
S
Sheng Yang 已提交
1841

1842 1843 1844 1845 1846
	/*
	 * We don't set the accessed bit, since we sometimes want to see
	 * whether the guest actually used the pte (in order to detect
	 * demand paging).
	 */
S
Sheng Yang 已提交
1847
	spte = shadow_base_present_pte | shadow_dirty_mask;
1848
	if (!speculative)
1849
		spte |= shadow_accessed_mask;
1850 1851
	if (!dirty)
		pte_access &= ~ACC_WRITE_MASK;
S
Sheng Yang 已提交
1852 1853 1854 1855
	if (pte_access & ACC_EXEC_MASK)
		spte |= shadow_x_mask;
	else
		spte |= shadow_nx_mask;
1856
	if (pte_access & ACC_USER_MASK)
S
Sheng Yang 已提交
1857
		spte |= shadow_user_mask;
1858
	if (level > PT_PAGE_TABLE_LEVEL)
M
Marcelo Tosatti 已提交
1859
		spte |= PT_PAGE_SIZE_MASK;
1860 1861 1862
	if (tdp_enabled)
		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
			kvm_is_mmio_pfn(pfn));
1863

1864 1865 1866
	if (reset_host_protection)
		spte |= SPTE_HOST_WRITEABLE;

1867
	spte |= (u64)pfn << PAGE_SHIFT;
1868 1869

	if ((pte_access & ACC_WRITE_MASK)
1870 1871
	    || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
		&& !user_fault)) {
1872

1873 1874
		if (level > PT_PAGE_TABLE_LEVEL &&
		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
1875
			ret = 1;
A
Avi Kivity 已提交
1876 1877
			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
			goto done;
1878 1879
		}

1880 1881
		spte |= PT_WRITABLE_MASK;

1882 1883 1884
		if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
			spte &= ~PT_USER_MASK;

1885 1886 1887 1888 1889 1890
		/*
		 * Optimization: for pte sync, if spte was writable the hash
		 * lookup is unnecessary (and expensive). Write protection
		 * is responsibility of mmu_get_page / kvm_sync_page.
		 * Same reasoning can be applied to dirty page accounting.
		 */
1891
		if (!can_unsync && is_writable_pte(*sptep))
1892 1893
			goto set_pte;

1894
		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1895
			pgprintk("%s: found shadow page for %lx, marking ro\n",
1896
				 __func__, gfn);
M
Marcelo Tosatti 已提交
1897
			ret = 1;
1898
			pte_access &= ~ACC_WRITE_MASK;
1899
			if (is_writable_pte(spte))
1900 1901 1902 1903 1904 1905 1906
				spte &= ~PT_WRITABLE_MASK;
		}
	}

	if (pte_access & ACC_WRITE_MASK)
		mark_page_dirty(vcpu->kvm, gfn);

1907
set_pte:
A
Avi Kivity 已提交
1908
	__set_spte(sptep, spte);
A
Avi Kivity 已提交
1909
done:
M
Marcelo Tosatti 已提交
1910 1911 1912
	return ret;
}

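/*
 * Wrapper around set_spte() that also maintains the rmap, large-page
 * statistics and pfn references, and unlinks any previous mapping that
 * pointed at a different child page or pfn.
 */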
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
			 unsigned pt_access, unsigned pte_access,
			 int user_fault, int write_fault, int dirty,
			 int *ptwrite, int level, gfn_t gfn,
			 pfn_t pfn, bool speculative,
			 bool reset_host_protection)
{
	int was_rmapped = 0;
	int was_writable = is_writable_pte(*sptep);
	int rmap_count;

	pgprintk("%s: spte %llx access %x write_fault %d"
		 " user_fault %d gfn %lx\n",
		 __func__, *sptep, pt_access,
		 write_fault, user_fault, gfn);

	if (is_rmap_spte(*sptep)) {
		/*
		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
		 * the parent of the now unreachable PTE.
		 */
		if (level > PT_PAGE_TABLE_LEVEL &&
		    !is_large_pte(*sptep)) {
			struct kvm_mmu_page *child;
			u64 pte = *sptep;

			child = page_header(pte & PT64_BASE_ADDR_MASK);
			mmu_page_remove_parent_pte(child, sptep);
			__set_spte(sptep, shadow_trap_nonpresent_pte);
			kvm_flush_remote_tlbs(vcpu->kvm);
		} else if (pfn != spte_to_pfn(*sptep)) {
			pgprintk("hfn old %lx new %lx\n",
				 spte_to_pfn(*sptep), pfn);
			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
			kvm_flush_remote_tlbs(vcpu->kvm);
		} else
			was_rmapped = 1;
	}

	if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
		      dirty, level, gfn, pfn, speculative, true,
		      reset_host_protection)) {
		if (write_fault)
			*ptwrite = 1;
		kvm_mmu_flush_tlb(vcpu);
	}

	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
	pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
		 is_large_pte(*sptep)? "2MB" : "4kB",
		 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
		 *sptep, sptep);
	if (!was_rmapped && is_large_pte(*sptep))
		++vcpu->kvm->stat.lpages;

	page_header_update_slot(vcpu->kvm, sptep, gfn);
	if (!was_rmapped) {
		rmap_count = rmap_add(vcpu, sptep, gfn);
		kvm_release_pfn_clean(pfn);
		if (rmap_count > RMAP_RECYCLE_THRESHOLD)
			rmap_recycle(vcpu, sptep, gfn);
	} else {
		if (was_writable)
			kvm_release_pfn_dirty(pfn);
		else
			kvm_release_pfn_clean(pfn);
	}
	if (speculative) {
		vcpu->arch.last_pte_updated = sptep;
		vcpu->arch.last_pte_gfn = gfn;
	}
}

static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}

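/*
 * Map a guest frame when the guest physical address is used directly
 * (real mode or tdp): walk the shadow table, allocating intermediate
 * shadow pages as needed, and install the final spte at @level.
 */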
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
			int level, gfn_t gfn, pfn_t pfn)
{
	struct kvm_shadow_walk_iterator iterator;
	struct kvm_mmu_page *sp;
	int pt_write = 0;
	gfn_t pseudo_gfn;

	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
		if (iterator.level == level) {
			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
				     0, write, 1, &pt_write,
				     level, gfn, pfn, false, true);
			++vcpu->stat.pf_fixed;
			break;
		}

		if (*iterator.sptep == shadow_trap_nonpresent_pte) {
			u64 base_addr = iterator.addr;

			base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
			pseudo_gfn = base_addr >> PAGE_SHIFT;
			sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
					      iterator.level - 1,
					      1, ACC_ALL, iterator.sptep);
			if (!sp) {
				pgprintk("nonpaging_map: ENOMEM\n");
				kvm_release_pfn_clean(pfn);
				return -ENOMEM;
			}

			__set_spte(iterator.sptep,
				   __pa(sp->spt)
				   | PT_PRESENT_MASK | PT_WRITABLE_MASK
				   | shadow_user_mask | shadow_x_mask);
		}
	}
	return pt_write;
}

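/*
 * A hardware-poisoned host page is reported by touching its user mapping,
 * which makes the kernel deliver SIGBUS to the current task.
 */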
static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
{
	char buf[1];
	void __user *hva;
	int r;

	/* Touch the page, so send SIGBUS */
	hva = (void __user *)gfn_to_hva(kvm, gfn);
	r = copy_from_user(buf, hva, 1);
}

static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
{
	kvm_release_pfn_clean(pfn);
	if (is_hwpoison_pfn(pfn)) {
		kvm_send_hwpoison_signal(kvm, gfn);
		return 0;
	}
	return 1;
}

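/*
 * Fault handler for the non-paging case: gva == gpa, so the frame can be
 * mapped directly once the pfn has been looked up.
 */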
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
	int r;
	int level;
	pfn_t pfn;
	unsigned long mmu_seq;

	level = mapping_level(vcpu, gfn);

	/*
	 * This path builds a PAE pagetable - so we can map 2mb pages at
	 * maximum. Therefore check if the level is larger than that.
	 */
	if (level > PT_DIRECTORY_LEVEL)
		level = PT_DIRECTORY_LEVEL;

	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();
	pfn = gfn_to_pfn(vcpu->kvm, gfn);

	/* mmio */
	if (is_error_pfn(pfn))
		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);

	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu, mmu_seq))
		goto out_unlock;
	kvm_mmu_free_some_pages(vcpu);
	r = __direct_map(vcpu, v, write, level, gfn, pfn);
	spin_unlock(&vcpu->kvm->mmu_lock);

	return r;

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return 0;
}


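/*
 * Drop the reference(s) held on the active shadow root(s) and mark
 * root_hpa invalid; roots already flagged invalid are zapped once their
 * root count reaches zero.
 */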
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
	int i;
	struct kvm_mmu_page *sp;
	LIST_HEAD(invalid_list);

	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
		return;
	spin_lock(&vcpu->kvm->mmu_lock);
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;

		sp = page_header(root);
		--sp->root_count;
		if (!sp->root_count && sp->role.invalid) {
			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
			kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
		}
		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
		spin_unlock(&vcpu->kvm->mmu_lock);
		return;
	}
	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->arch.mmu.pae_root[i];

		if (root) {
			root &= PT64_BASE_ADDR_MASK;
			sp = page_header(root);
			--sp->root_count;
			if (!sp->root_count && sp->role.invalid)
				kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
							 &invalid_list);
		}
		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
	}
	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
	spin_unlock(&vcpu->kvm->mmu_lock);
	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
}

static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
{
	int ret = 0;

	if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		ret = 1;
	}

	return ret;
}

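/*
 * Allocate shadow root pages for the current mode: one PML4-level root
 * when the shadow level is PT64_ROOT_LEVEL, otherwise four PAE roots
 * (one per PDPTE slot).
 */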
static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
	int i;
	gfn_t root_gfn;
	struct kvm_mmu_page *sp;
	int direct = 0;
	u64 pdptr;

	root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;

	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;

		ASSERT(!VALID_PAGE(root));
		if (mmu_check_root(vcpu, root_gfn))
			return 1;
		if (tdp_enabled) {
			direct = 1;
			root_gfn = 0;
		}
		spin_lock(&vcpu->kvm->mmu_lock);
		kvm_mmu_free_some_pages(vcpu);
		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
				      PT64_ROOT_LEVEL, direct,
				      ACC_ALL, NULL);
		root = __pa(sp->spt);
		++sp->root_count;
		spin_unlock(&vcpu->kvm->mmu_lock);
		vcpu->arch.mmu.root_hpa = root;
		return 0;
	}
	direct = !is_paging(vcpu);
	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->arch.mmu.pae_root[i];

		ASSERT(!VALID_PAGE(root));
		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
			pdptr = kvm_pdptr_read(vcpu, i);
			if (!is_present_gpte(pdptr)) {
				vcpu->arch.mmu.pae_root[i] = 0;
				continue;
			}
			root_gfn = pdptr >> PAGE_SHIFT;
		} else if (vcpu->arch.mmu.root_level == 0)
			root_gfn = 0;
		if (mmu_check_root(vcpu, root_gfn))
			return 1;
		if (tdp_enabled) {
			direct = 1;
			root_gfn = i << 30;
		}
		spin_lock(&vcpu->kvm->mmu_lock);
		kvm_mmu_free_some_pages(vcpu);
		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
				      PT32_ROOT_LEVEL, direct,
				      ACC_ALL, NULL);
		root = __pa(sp->spt);
		++sp->root_count;
		spin_unlock(&vcpu->kvm->mmu_lock);

		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
	}
	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
	return 0;
}

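/*
 * Walk the current root(s) and resynchronize any unsync child pages;
 * callers hold mmu_lock.
 */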
static void mmu_sync_roots(struct kvm_vcpu *vcpu)
{
	int i;
	struct kvm_mmu_page *sp;

	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
		return;
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;
		sp = page_header(root);
		mmu_sync_children(vcpu, sp);
		return;
	}
	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->arch.mmu.pae_root[i];

		if (root && VALID_PAGE(root)) {
			root &= PT64_BASE_ADDR_MASK;
			sp = page_header(root);
			mmu_sync_children(vcpu, sp);
		}
	}
}

void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
{
	spin_lock(&vcpu->kvm->mmu_lock);
	mmu_sync_roots(vcpu);
	spin_unlock(&vcpu->kvm->mmu_lock);
}

static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
				  u32 access, u32 *error)
{
	if (error)
		*error = 0;
	return vaddr;
}

static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
				u32 error_code)
{
	gfn_t gfn;
	int r;

	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	ASSERT(vcpu);
	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));

	gfn = gva >> PAGE_SHIFT;

	return nonpaging_map(vcpu, gva & PAGE_MASK,
			     error_code & PFERR_WRITE_MASK, gfn);
}

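/*
 * Page fault handler used with two-dimensional paging: the faulting
 * address is a guest physical address, so it is mapped via __direct_map()
 * after checking the mmu_notifier sequence to avoid racing with host-side
 * invalidations.
 */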
static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
				u32 error_code)
{
	pfn_t pfn;
	int r;
	int level;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	unsigned long mmu_seq;

	ASSERT(vcpu);
	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	level = mapping_level(vcpu, gfn);

	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();
	pfn = gfn_to_pfn(vcpu->kvm, gfn);
	if (is_error_pfn(pfn))
		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu, mmu_seq))
		goto out_unlock;
	kvm_mmu_free_some_pages(vcpu);
	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
			 level, gfn, pfn);
	spin_unlock(&vcpu->kvm->mmu_lock);

	return r;

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return 0;
}

static void nonpaging_free(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}

static int nonpaging_init_context(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->arch.mmu;

	context->new_cr3 = nonpaging_new_cr3;
	context->page_fault = nonpaging_page_fault;
	context->gva_to_gpa = nonpaging_gva_to_gpa;
	context->free = nonpaging_free;
	context->prefetch_page = nonpaging_prefetch_page;
	context->sync_page = nonpaging_sync_page;
	context->invlpg = nonpaging_invlpg;
	context->root_level = 0;
	context->shadow_root_level = PT32E_ROOT_LEVEL;
	context->root_hpa = INVALID_PAGE;
	return 0;
}

void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
	++vcpu->stat.tlb_flush;
	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}

static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
	pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
	mmu_free_roots(vcpu);
}

static void inject_page_fault(struct kvm_vcpu *vcpu,
			      u64 addr,
			      u32 err_code)
{
	kvm_inject_page_fault(vcpu, addr, err_code);
}

static void paging_free(struct kvm_vcpu *vcpu)
{
	nonpaging_free(vcpu);
}

static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
{
	int bit7;

	bit7 = (gpte >> 7) & 1;
	return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
}

#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"
#undef PTTYPE

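/*
 * Precompute which guest pte bits are reserved for the given root level;
 * index [1] of rsvd_bits_mask covers large-page entries.
 */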
static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
{
	struct kvm_mmu *context = &vcpu->arch.mmu;
	int maxphyaddr = cpuid_maxphyaddr(vcpu);
	u64 exb_bit_rsvd = 0;

	if (!is_nx(vcpu))
		exb_bit_rsvd = rsvd_bits(63, 63);
	switch (level) {
	case PT32_ROOT_LEVEL:
		/* no rsvd bits for 2 level 4K page table entries */
		context->rsvd_bits_mask[0][1] = 0;
		context->rsvd_bits_mask[0][0] = 0;
		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];

		if (!is_pse(vcpu)) {
			context->rsvd_bits_mask[1][1] = 0;
			break;
		}

		if (is_cpuid_PSE36())
			/* 36bits PSE 4MB page */
			context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
		else
			/* 32 bits PSE 4MB page */
			context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
		break;
	case PT32E_ROOT_LEVEL:
		context->rsvd_bits_mask[0][2] =
			rsvd_bits(maxphyaddr, 63) |
			rsvd_bits(7, 8) | rsvd_bits(1, 2);	/* PDPTE */
2405
		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2406
			rsvd_bits(maxphyaddr, 62);	/* PDE */
		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 62); 	/* PTE */
		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 62) |
			rsvd_bits(13, 20);		/* large page */
2412
		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
		break;
	case PT64_ROOT_LEVEL:
		context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
		context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2420
			rsvd_bits(maxphyaddr, 51);
		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 51);
		context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
2424 2425 2426
		context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 51) |
			rsvd_bits(13, 29);
2427
		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 51) |
			rsvd_bits(13, 20);		/* large page */
2430
		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
		break;
	}
}

2435
static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
A
Avi Kivity 已提交
2436
{
2437
	struct kvm_mmu *context = &vcpu->arch.mmu;
A
Avi Kivity 已提交
2438 2439 2440 2441 2442

	ASSERT(is_pae(vcpu));
	context->new_cr3 = paging_new_cr3;
	context->page_fault = paging64_page_fault;
	context->gva_to_gpa = paging64_gva_to_gpa;
2443
	context->prefetch_page = paging64_prefetch_page;
2444
	context->sync_page = paging64_sync_page;
M
Marcelo Tosatti 已提交
2445
	context->invlpg = paging64_invlpg;
A
Avi Kivity 已提交
2446
	context->free = paging_free;
2447 2448
	context->root_level = level;
	context->shadow_root_level = level;
A
Avi Kivity 已提交
2449
	context->root_hpa = INVALID_PAGE;
	return 0;
}

2453 2454
static int paging64_init_context(struct kvm_vcpu *vcpu)
{
2455
	reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2456 2457 2458
	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
}

A
Avi Kivity 已提交
2459 2460
static int paging32_init_context(struct kvm_vcpu *vcpu)
{
2461
	struct kvm_mmu *context = &vcpu->arch.mmu;
A
Avi Kivity 已提交
2462

2463
	reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
A
Avi Kivity 已提交
2464 2465 2466 2467
	context->new_cr3 = paging_new_cr3;
	context->page_fault = paging32_page_fault;
	context->gva_to_gpa = paging32_gva_to_gpa;
	context->free = paging_free;
2468
	context->prefetch_page = paging32_prefetch_page;
2469
	context->sync_page = paging32_sync_page;
M
Marcelo Tosatti 已提交
2470
	context->invlpg = paging32_invlpg;
A
Avi Kivity 已提交
2471 2472
	context->root_level = PT32_ROOT_LEVEL;
	context->shadow_root_level = PT32E_ROOT_LEVEL;
A
Avi Kivity 已提交
2473
	context->root_hpa = INVALID_PAGE;
	return 0;
}

static int paging32E_init_context(struct kvm_vcpu *vcpu)
{
2479
	reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2480
	return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
A
Avi Kivity 已提交
2481 2482
}

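/*
 * MMU context used when hardware two-dimensional paging is active:
 * faults on guest physical addresses go to tdp_page_fault(), while
 * gva_to_gpa still follows the guest's own paging mode.
 */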
static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->arch.mmu;

	context->new_cr3 = nonpaging_new_cr3;
	context->page_fault = tdp_page_fault;
	context->free = nonpaging_free;
	context->prefetch_page = nonpaging_prefetch_page;
2491
	context->sync_page = nonpaging_sync_page;
	context->invlpg = nonpaging_invlpg;
2493
	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2494 2495 2496 2497 2498 2499
	context->root_hpa = INVALID_PAGE;

	if (!is_paging(vcpu)) {
		context->gva_to_gpa = nonpaging_gva_to_gpa;
		context->root_level = 0;
	} else if (is_long_mode(vcpu)) {
2500
		reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
		context->gva_to_gpa = paging64_gva_to_gpa;
		context->root_level = PT64_ROOT_LEVEL;
	} else if (is_pae(vcpu)) {
2504
		reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2505 2506 2507
		context->gva_to_gpa = paging64_gva_to_gpa;
		context->root_level = PT32E_ROOT_LEVEL;
	} else {
2508
		reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
		context->gva_to_gpa = paging32_gva_to_gpa;
		context->root_level = PT32_ROOT_LEVEL;
	}

	return 0;
}

static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
2517
{
2518 2519
	int r;

A
Avi Kivity 已提交
2520
	ASSERT(vcpu);
2521
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
A
Avi Kivity 已提交
2522 2523

	if (!is_paging(vcpu))
2524
		r = nonpaging_init_context(vcpu);
A
Avi Kivity 已提交
2525
	else if (is_long_mode(vcpu))
2526
		r = paging64_init_context(vcpu);
A
Avi Kivity 已提交
2527
	else if (is_pae(vcpu))
2528
		r = paging32E_init_context(vcpu);
A
Avi Kivity 已提交
2529
	else
2530 2531
		r = paging32_init_context(vcpu);

2532
	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2533
	vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
2534 2535

	return r;
A
Avi Kivity 已提交
2536 2537
}

2538 2539
static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
2540 2541
	vcpu->arch.update_pte.pfn = bad_pfn;

2542 2543 2544 2545 2546 2547
	if (tdp_enabled)
		return init_kvm_tdp_mmu(vcpu);
	else
		return init_kvm_softmmu(vcpu);
}

static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
2551 2552
	if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
		/* mmu.free() should set root_hpa = INVALID_PAGE */
2553
		vcpu->arch.mmu.free(vcpu);
A
Avi Kivity 已提交
2554 2555 2556
}

int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
	destroy_kvm_mmu(vcpu);
	return init_kvm_mmu(vcpu);
}
2561
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
A
Avi Kivity 已提交
2562 2563

int kvm_mmu_load(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
2564
{
2565 2566
	int r;

2567
	r = mmu_topup_memory_caches(vcpu);
A
Avi Kivity 已提交
2568 2569
	if (r)
		goto out;
2570
	r = mmu_alloc_roots(vcpu);
2571
	spin_lock(&vcpu->kvm->mmu_lock);
2572
	mmu_sync_roots(vcpu);
2573
	spin_unlock(&vcpu->kvm->mmu_lock);
2574 2575
	if (r)
		goto out;
2576
	/* set_cr3() should ensure TLB has been flushed */
2577
	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2578 2579
out:
	return r;
A
Avi Kivity 已提交
2580
}
EXPORT_SYMBOL_GPL(kvm_mmu_load);

void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}
A
Avi Kivity 已提交
2587

2588
static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2589
				  struct kvm_mmu_page *sp,
				  u64 *spte)
{
	u64 pte;
	struct kvm_mmu_page *child;

	pte = *spte;
2596
	if (is_shadow_present_pte(pte)) {
2597
		if (is_last_spte(pte, sp->role.level))
A
Avi Kivity 已提交
2598
			drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
2599 2600
		else {
			child = page_header(pte & PT64_BASE_ADDR_MASK);
2601
			mmu_page_remove_parent_pte(child, spte);
2602 2603
		}
	}
A
Avi Kivity 已提交
2604
	__set_spte(spte, shadow_trap_nonpresent_pte);
M
Marcelo Tosatti 已提交
2605 2606
	if (is_large_pte(pte))
		--vcpu->kvm->stat.lpages;
2607 2608
}

2609
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2610
				  struct kvm_mmu_page *sp,
2611
				  u64 *spte,
2612
				  const void *new)
2613
{
2614
	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
2615 2616
		++vcpu->kvm->stat.mmu_pde_zapped;
		return;
2617
	}
2618

A
Avi Kivity 已提交
2619
	++vcpu->kvm->stat.mmu_pte_updated;
2620
	if (!sp->role.cr4_pae)
2621
		paging32_update_pte(vcpu, sp, spte, new);
2622
	else
2623
		paging64_update_pte(vcpu, sp, spte, new);
2624 2625
}

static bool need_remote_flush(u64 old, u64 new)
{
	if (!is_shadow_present_pte(old))
		return false;
	if (!is_shadow_present_pte(new))
		return true;
	if ((old ^ new) & PT64_BASE_ADDR_MASK)
		return true;
	old ^= PT64_NX_MASK;
	new ^= PT64_NX_MASK;
	return (old & ~new & PT64_PERM_MASK) != 0;
}

2639 2640
static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
				    bool remote_flush, bool local_flush)
2641
{
2642 2643 2644 2645
	if (zap_page)
		return;

	if (remote_flush)
2646
		kvm_flush_remote_tlbs(vcpu->kvm);
2647
	else if (local_flush)
2648 2649 2650
		kvm_mmu_flush_tlb(vcpu);
}

2651 2652
static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
{
2653
	u64 *spte = vcpu->arch.last_pte_updated;
2654

S
Sheng Yang 已提交
2655
	return !!(spte && (*spte & shadow_accessed_mask));
2656 2657
}

2658
static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2659
					  u64 gpte)
2660 2661
{
	gfn_t gfn;
2662
	pfn_t pfn;
2663

2664
	if (!is_present_gpte(gpte))
2665 2666
		return;
	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
2667

2668
	vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
2669
	smp_rmb();
2670
	pfn = gfn_to_pfn(vcpu->kvm, gfn);
2671

2672 2673
	if (is_error_pfn(pfn)) {
		kvm_release_pfn_clean(pfn);
2674 2675
		return;
	}
2676
	vcpu->arch.update_pte.gfn = gfn;
2677
	vcpu->arch.update_pte.pfn = pfn;
2678 2679
}

static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	u64 *spte = vcpu->arch.last_pte_updated;

	if (spte
	    && vcpu->arch.last_pte_gfn == gfn
	    && shadow_accessed_mask
	    && !(*spte & shadow_accessed_mask)
	    && is_shadow_present_pte(*spte))
		set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
}

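/*
 * Intercepted write to a guest page table that is shadowed: zap the
 * affected sptes and, when the new gpte could be read, update them in
 * place.  Pages that take repeated or misaligned writes are assumed not
 * to be page tables any more and are zapped entirely.
 */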
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2693 2694
		       const u8 *new, int bytes,
		       bool guest_initiated)
2695
{
2696
	gfn_t gfn = gpa >> PAGE_SHIFT;
2697
	struct kvm_mmu_page *sp;
2698
	struct hlist_node *node;
2699
	LIST_HEAD(invalid_list);
2700
	u64 entry, gentry;
2701 2702
	u64 *spte;
	unsigned offset = offset_in_page(gpa);
2703
	unsigned pte_size;
2704
	unsigned page_offset;
2705
	unsigned misaligned;
2706
	unsigned quadrant;
2707
	int level;
2708
	int flooded = 0;
2709
	int npte;
2710
	int r;
2711
	int invlpg_counter;
2712 2713 2714
	bool remote_flush, local_flush, zap_page;

	zap_page = remote_flush = local_flush = false;
2715

2716
	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2717

2718
	invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);

	/*
	 * Assume that the pte write is on a page table of the same type
	 * as the current vcpu paging mode.  This is nearly always true
	 * (might be false while changing modes).  Note it is verified later
	 * by update_pte().
	 */
2726
	if ((is_pae(vcpu) && bytes == 4) || !new) {
2727
		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
		if (is_pae(vcpu)) {
			gpa &= ~(gpa_t)7;
			bytes = 8;
		}
		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
2733 2734
		if (r)
			gentry = 0;
		new = (const u8 *)&gentry;
	}

	switch (bytes) {
	case 4:
		gentry = *(const u32 *)new;
		break;
	case 8:
		gentry = *(const u64 *)new;
		break;
	default:
		gentry = 0;
		break;
	}

	mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
2751
	spin_lock(&vcpu->kvm->mmu_lock);
2752 2753
	if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
		gentry = 0;
2754
	kvm_mmu_access_page(vcpu, gfn);
2755
	kvm_mmu_free_some_pages(vcpu);
A
Avi Kivity 已提交
2756
	++vcpu->kvm->stat.mmu_pte_write;
2757
	kvm_mmu_audit(vcpu, "pre pte write");
	if (guest_initiated) {
		if (gfn == vcpu->arch.last_pt_write_gfn
		    && !last_updated_pte_accessed(vcpu)) {
			++vcpu->arch.last_pt_write_count;
			if (vcpu->arch.last_pt_write_count >= 3)
				flooded = 1;
		} else {
			vcpu->arch.last_pt_write_gfn = gfn;
			vcpu->arch.last_pt_write_count = 1;
			vcpu->arch.last_pte_updated = NULL;
		}
2769
	}
2770

2771
	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
2772
		pte_size = sp->role.cr4_pae ? 8 : 4;
2773
		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
2774
		misaligned |= bytes < 4;
2775
		if (misaligned || flooded) {
2776 2777 2778 2779
			/*
			 * Misaligned accesses are too much trouble to fix
			 * up; also, they usually indicate a page is not used
			 * as a page table.
2780 2781 2782 2783 2784
			 *
			 * If we're seeing too many writes to a page,
			 * it may no longer be a page table, or we may be
			 * forking, in which case it is better to unmap the
			 * page.
2785 2786
			 */
			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
2787
				 gpa, bytes, sp->role.word);
2788
			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2789
						     &invalid_list);
A
Avi Kivity 已提交
2790
			++vcpu->kvm->stat.mmu_flooded;
2791 2792
			continue;
		}
2793
		page_offset = offset;
2794
		level = sp->role.level;
2795
		npte = 1;
2796
		if (!sp->role.cr4_pae) {
			page_offset <<= 1;	/* 32->64 */
			/*
			 * A 32-bit pde maps 4MB while the shadow pdes map
			 * only 2MB.  So we need to double the offset again
			 * and zap two pdes instead of one.
			 */
			if (level == PT32_ROOT_LEVEL) {
2804
				page_offset &= ~7; /* kill rounding error */
2805 2806 2807
				page_offset <<= 1;
				npte = 2;
			}
2808
			quadrant = page_offset >> PAGE_SHIFT;
2809
			page_offset &= ~PAGE_MASK;
2810
			if (quadrant != sp->role.quadrant)
2811
				continue;
2812
		}
2813
		local_flush = true;
2814
		spte = &sp->spt[page_offset / sizeof(*spte)];
2815
		while (npte--) {
2816
			entry = *spte;
2817
			mmu_pte_write_zap_pte(vcpu, sp, spte);
2818 2819
			if (gentry)
				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
2820 2821
			if (!remote_flush && need_remote_flush(entry, *spte))
				remote_flush = true;
2822
			++spte;
2823 2824
		}
	}
2825
	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
2826
	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2827
	kvm_mmu_audit(vcpu, "post pte write");
2828
	spin_unlock(&vcpu->kvm->mmu_lock);
2829 2830 2831
	if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
		kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
		vcpu->arch.update_pte.pfn = bad_pfn;
2832
	}
2833 2834
}

2835 2836
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
2837 2838
	gpa_t gpa;
	int r;
2839

	if (tdp_enabled)
		return 0;

2843
	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2844

2845
	spin_lock(&vcpu->kvm->mmu_lock);
2846
	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2847
	spin_unlock(&vcpu->kvm->mmu_lock);
2848
	return r;
2849
}
2850
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2851

2852
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
2853
{
2854
	int free_pages;
2855
	LIST_HEAD(invalid_list);

	free_pages = vcpu->kvm->arch.n_free_mmu_pages;
	while (free_pages < KVM_REFILL_PAGES &&
2859
	       !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2860
		struct kvm_mmu_page *sp;
A
Avi Kivity 已提交
2861

2862
		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
2863
				  struct kvm_mmu_page, link);
2864 2865
		free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
						       &invalid_list);
A
Avi Kivity 已提交
2866
		++vcpu->kvm->stat.mmu_recycled;
A
Avi Kivity 已提交
2867
	}
2868
	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
A
Avi Kivity 已提交
2869 2870
}

int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
{
	int r;
	enum emulation_result er;

2876
	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
2877 2878 2879 2880 2881 2882 2883 2884
	if (r < 0)
		goto out;

	if (!r) {
		r = 1;
		goto out;
	}

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		goto out;

A
Avi Kivity 已提交
2889
	er = emulate_instruction(vcpu, cr2, error_code, 0);
2890 2891 2892 2893 2894 2895

	switch (er) {
	case EMULATE_DONE:
		return 1;
	case EMULATE_DO_MMIO:
		++vcpu->stat.mmio_exits;
2896
		/* fall through */
2897
	case EMULATE_FAIL:
2898
		return 0;
	default:
		BUG();
	}
out:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);

void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
	vcpu->arch.mmu.invlpg(vcpu, gva);
	kvm_mmu_flush_tlb(vcpu);
	++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);

void kvm_enable_tdp(void)
{
	tdp_enabled = true;
}
EXPORT_SYMBOL_GPL(kvm_enable_tdp);

void kvm_disable_tdp(void)
{
	tdp_enabled = false;
}
EXPORT_SYMBOL_GPL(kvm_disable_tdp);

A
Avi Kivity 已提交
2927 2928
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
2929
	free_page((unsigned long)vcpu->arch.mmu.pae_root);
}

static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
2934
	struct page *page;
	int i;

	ASSERT(vcpu);

	/*
	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
	 * Therefore we need to allocate shadow page tables in the first
	 * 4GB of memory, which happens to fit the DMA32 zone.
	 */
	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
	if (!page)
2946 2947
		return -ENOMEM;

2948
	vcpu->arch.mmu.pae_root = page_address(page);
2949
	for (i = 0; i < 4; ++i)
2950
		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2951

A
Avi Kivity 已提交
2952 2953 2954
	return 0;
}

2955
int kvm_mmu_create(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
2956 2957
{
	ASSERT(vcpu);
2958
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
A
Avi Kivity 已提交
2959

2960 2961
	return alloc_mmu_pages(vcpu);
}
A
Avi Kivity 已提交
2962

2963 2964 2965
int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
2966
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2967

2968
	return init_kvm_mmu(vcpu);
}

void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);

	destroy_kvm_mmu(vcpu);
	free_mmu_pages(vcpu);
2977
	mmu_free_memory_caches(vcpu);
A
Avi Kivity 已提交
2978 2979
}

2980
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
A
Avi Kivity 已提交
2981
{
2982
	struct kvm_mmu_page *sp;
A
Avi Kivity 已提交
2983

2984
	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
		int i;
		u64 *pt;

2988
		if (!test_bit(slot, sp->slot_bitmap))
A
Avi Kivity 已提交
2989 2990
			continue;

2991
		pt = sp->spt;
A
Avi Kivity 已提交
2992 2993
		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
			/* avoid RMW */
2994
			if (is_writable_pte(pt[i]))
A
Avi Kivity 已提交
2995 2996
				pt[i] &= ~PT_WRITABLE_MASK;
	}
2997
	kvm_flush_remote_tlbs(kvm);
A
Avi Kivity 已提交
2998
}
2999

3000
void kvm_mmu_zap_all(struct kvm *kvm)
{
3002
	struct kvm_mmu_page *sp, *node;
3003
	LIST_HEAD(invalid_list);
D
Dor Laor 已提交
3004

3005
	spin_lock(&kvm->mmu_lock);
3006
restart:
3007
	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
3008
		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
3009 3010
			goto restart;

3011
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
3012
	spin_unlock(&kvm->mmu_lock);
D
Dor Laor 已提交
3013 3014
}

3015 3016
static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
					       struct list_head *invalid_list)
{
	struct kvm_mmu_page *page;

	page = container_of(kvm->arch.active_mmu_pages.prev,
			    struct kvm_mmu_page, link);
3022
	return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
3023 3024
}

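/*
 * Registered memory-shrinker callback: under host memory pressure, zap
 * some shadow pages from one VM on vm_list and return the number of
 * shadow pages still allocated across all VMs.
 */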
static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
{
	struct kvm *kvm;
	struct kvm *kvm_freed = NULL;
	int cache_count = 0;

	spin_lock(&kvm_lock);

	list_for_each_entry(kvm, &vm_list, vm_list) {
G
Gui Jianfeng 已提交
3034
		int npages, idx, freed_pages;
3035
		LIST_HEAD(invalid_list);
3036

3037
		idx = srcu_read_lock(&kvm->srcu);
3038 3039 3040 3041 3042
		spin_lock(&kvm->mmu_lock);
		npages = kvm->arch.n_alloc_mmu_pages -
			 kvm->arch.n_free_mmu_pages;
		cache_count += npages;
		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
3043 3044
			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
							  &invalid_list);
G
Gui Jianfeng 已提交
3045
			cache_count -= freed_pages;
3046 3047 3048 3049
			kvm_freed = kvm;
		}
		nr_to_scan--;

3050
		kvm_mmu_commit_zap_page(kvm, &invalid_list);
3051
		spin_unlock(&kvm->mmu_lock);
3052
		srcu_read_unlock(&kvm->srcu, idx);
	}
	if (kvm_freed)
		list_move_tail(&kvm_freed->vm_list, &vm_list);

	spin_unlock(&kvm_lock);

	return cache_count;
}

static struct shrinker mmu_shrinker = {
	.shrink = mmu_shrink,
	.seeks = DEFAULT_SEEKS * 10,
};

static void mmu_destroy_caches(void)
3068 3069 3070 3071 3072
{
	if (pte_chain_cache)
		kmem_cache_destroy(pte_chain_cache);
	if (rmap_desc_cache)
		kmem_cache_destroy(rmap_desc_cache);
3073 3074
	if (mmu_page_header_cache)
		kmem_cache_destroy(mmu_page_header_cache);
3075 3076
}

3077 3078 3079 3080 3081 3082
void kvm_mmu_module_exit(void)
{
	mmu_destroy_caches();
	unregister_shrinker(&mmu_shrinker);
}

int kvm_mmu_module_init(void)
{
	pte_chain_cache = kmem_cache_create("kvm_pte_chain",
					    sizeof(struct kvm_pte_chain),
3087
					    0, 0, NULL);
	if (!pte_chain_cache)
		goto nomem;
	rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
					    sizeof(struct kvm_rmap_desc),
3092
					    0, 0, NULL);
3093 3094 3095
	if (!rmap_desc_cache)
		goto nomem;

3096 3097
	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
						  sizeof(struct kvm_mmu_page),
3098
						  0, 0, NULL);
3099 3100 3101
	if (!mmu_page_header_cache)
		goto nomem;

3102 3103
	register_shrinker(&mmu_shrinker);

3104 3105 3106
	return 0;

nomem:
3107
	mmu_destroy_caches();
3108 3109 3110
	return -ENOMEM;
}

/*
 * Calculate mmu pages needed for kvm.
 */
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
{
	int i;
	unsigned int nr_mmu_pages;
	unsigned int  nr_pages = 0;
3119
	struct kvm_memslots *slots;
3120

3121 3122
	slots = kvm_memslots(kvm);

3123 3124
	for (i = 0; i < slots->nmemslots; i++)
		nr_pages += slots->memslots[i].npages;

	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
	nr_mmu_pages = max(nr_mmu_pages,
			(unsigned int) KVM_MIN_ALLOC_MMU_PAGES);

	return nr_mmu_pages;
}

static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
				unsigned len)
{
	if (len > buffer->len)
		return NULL;
	return buffer->ptr;
}

static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
				unsigned len)
{
	void *ret;

	ret = pv_mmu_peek_buffer(buffer, len);
	if (!ret)
		return ret;
	buffer->ptr += len;
	buffer->len -= len;
	buffer->processed += len;
	return ret;
}

static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
			     gpa_t addr, gpa_t value)
{
	int bytes = 8;
	int r;

	if (!is_long_mode(vcpu) && !is_pae(vcpu))
		bytes = 4;

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

3168
	if (!emulator_write_phys(vcpu, addr, &value, bytes))
		return -EFAULT;

	return 1;
}

static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
3176
	(void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
	return 1;
}

static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
{
	spin_lock(&vcpu->kvm->mmu_lock);
	mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
	spin_unlock(&vcpu->kvm->mmu_lock);
	return 1;
}

static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
			     struct kvm_pv_mmu_op_buffer *buffer)
{
	struct kvm_mmu_op_header *header;

	header = pv_mmu_peek_buffer(buffer, sizeof *header);
	if (!header)
		return 0;
	switch (header->op) {
	case KVM_MMU_OP_WRITE_PTE: {
		struct kvm_mmu_op_write_pte *wpte;

		wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
		if (!wpte)
			return 0;
		return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
					wpte->pte_val);
	}
	case KVM_MMU_OP_FLUSH_TLB: {
		struct kvm_mmu_op_flush_tlb *ftlb;

		ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
		if (!ftlb)
			return 0;
		return kvm_pv_mmu_flush_tlb(vcpu);
	}
	case KVM_MMU_OP_RELEASE_PT: {
		struct kvm_mmu_op_release_pt *rpt;

		rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
		if (!rpt)
			return 0;
		return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
	}
	default: return 0;
	}
}

int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
		  gpa_t addr, unsigned long *ret)
{
	int r;
3230
	struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
3231

3232 3233 3234
	buffer->ptr = buffer->buf;
	buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
	buffer->processed = 0;
3235

3236
	r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
3237 3238 3239
	if (r)
		goto out;

3240 3241
	while (buffer->len) {
		r = kvm_pv_mmu_op_one(vcpu, buffer);
3242 3243 3244 3245 3246 3247 3248 3249
		if (r < 0)
			goto out;
		if (r == 0)
			break;
	}

	r = 1;
out:
3250
	*ret = buffer->processed;
	return r;
}

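/*
 * Record the spte seen at each level of the shadow walk for @addr into
 * sptes[] and return how many levels were filled in.
 */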
int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
{
	struct kvm_shadow_walk_iterator iterator;
	int nr_sptes = 0;

	spin_lock(&vcpu->kvm->mmu_lock);
	for_each_shadow_entry(vcpu, addr, iterator) {
		sptes[iterator.level-1] = *iterator.sptep;
		nr_sptes++;
		if (!is_shadow_present_pte(*iterator.sptep))
			break;
	}
	spin_unlock(&vcpu->kvm->mmu_lock);

	return nr_sptes;
}
EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);

#ifdef AUDIT

static const char *audit_msg;

static gva_t canonicalize(gva_t gva)
{
#ifdef CONFIG_X86_64
	gva = (long long)(gva << 16) >> 16;
#endif
	return gva;
}

3284

3285
typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
3286 3287 3288 3289 3290 3291 3292 3293 3294 3295

static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
			    inspect_spte_fn fn)
{
	int i;

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
		u64 ent = sp->spt[i];

		if (is_shadow_present_pte(ent)) {
3296
			if (!is_last_spte(ent, sp->role.level)) {
3297 3298 3299
				struct kvm_mmu_page *child;
				child = page_header(ent & PT64_BASE_ADDR_MASK);
				__mmu_spte_walk(kvm, child, fn);
3300
			} else
3301
				fn(kvm, &sp->spt[i]);
		}
	}
}

static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
{
	int i;
	struct kvm_mmu_page *sp;

	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
		return;
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;
		sp = page_header(root);
		__mmu_spte_walk(vcpu->kvm, sp, fn);
		return;
	}
	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->arch.mmu.pae_root[i];

		if (root && VALID_PAGE(root)) {
			root &= PT64_BASE_ADDR_MASK;
			sp = page_header(root);
			__mmu_spte_walk(vcpu->kvm, sp, fn);
		}
	}
	return;
}

static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
				gva_t va, int level)
{
	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
	int i;
	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
		u64 ent = pt[i];

3341
		if (ent == shadow_trap_nonpresent_pte)
3342 3343 3344
			continue;

		va = canonicalize(va);
3345 3346 3347
		if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
			audit_mappings_page(vcpu, ent, va, level - 1);
		else {
3348
			gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
			gfn_t gfn = gpa >> PAGE_SHIFT;
			pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
			hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3352

3353 3354 3355 3356 3357
			if (is_error_pfn(pfn)) {
				kvm_release_pfn_clean(pfn);
				continue;
			}

3358
			if (is_shadow_present_pte(ent)
3359
			    && (ent & PT64_BASE_ADDR_MASK) != hpa)
3360 3361
				printk(KERN_ERR "xx audit error: (%s) levels %d"
				       " gva %lx gpa %llx hpa %llx ent %llx %d\n",
3362
				       audit_msg, vcpu->arch.mmu.root_level,
				       va, gpa, hpa, ent,
				       is_shadow_present_pte(ent));
3365 3366 3367 3368
			else if (ent == shadow_notrap_nonpresent_pte
				 && !is_error_hpa(hpa))
				printk(KERN_ERR "audit: (%s) notrap shadow,"
				       " valid guest gva %lx\n", audit_msg, va);
3369
			kvm_release_pfn_clean(pfn);
3370

3371 3372 3373 3374 3375 3376
		}
	}
}

static void audit_mappings(struct kvm_vcpu *vcpu)
{
3377
	unsigned i;
3378

3379 3380
	if (vcpu->arch.mmu.root_level == 4)
		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
3381 3382
	else
		for (i = 0; i < 4; ++i)
3383
			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
3384
				audit_mappings_page(vcpu,
3385
						    vcpu->arch.mmu.pae_root[i],
3386 3387 3388 3389 3390 3391
						    i << 30,
						    2);
}

static int count_rmaps(struct kvm_vcpu *vcpu)
{
3392 3393
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memslots *slots;
3394
	int nmaps = 0;
3395
	int i, j, k, idx;
3396

3397
	idx = srcu_read_lock(&kvm->srcu);
3398
	slots = kvm_memslots(kvm);
3399
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3400
		struct kvm_memory_slot *m = &slots->memslots[i];
3401 3402 3403
		struct kvm_rmap_desc *d;

		for (j = 0; j < m->npages; ++j) {
3404
			unsigned long *rmapp = &m->rmap[j];
3405

3406
			if (!*rmapp)
3407
				continue;
3408
			if (!(*rmapp & 1)) {
3409 3410 3411
				++nmaps;
				continue;
			}
3412
			d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
3413 3414
			while (d) {
				for (k = 0; k < RMAP_EXT; ++k)
A
Avi Kivity 已提交
3415
					if (d->sptes[k])
						++nmaps;
					else
						break;
				d = d->more;
			}
		}
	}
3423
	srcu_read_unlock(&kvm->srcu, idx);
3424 3425 3426
	return nmaps;
}

3427
void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3428 3429 3430 3431 3432
{
	unsigned long *rmapp;
	struct kvm_mmu_page *rev_sp;
	gfn_t gfn;

3433
	if (is_writable_pte(*sptep)) {
3434
		rev_sp = page_header(__pa(sptep));
3435
		gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);

		if (!gfn_to_memslot(kvm, gfn)) {
			if (!printk_ratelimit())
				return;
			printk(KERN_ERR "%s: no memslot for gfn %ld\n",
					 audit_msg, gfn);
			printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3443
			       audit_msg, (long int)(sptep - rev_sp->spt),
3444 3445 3446 3447 3448
					rev_sp->gfn);
			dump_stack();
			return;
		}

3449
		rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
		if (!*rmapp) {
			if (!printk_ratelimit())
				return;
			printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
					 audit_msg, *sptep);
			dump_stack();
		}
	}

}

void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
{
	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
}

static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3467
{
3468
	struct kvm_mmu_page *sp;
3469 3470
	int i;

3471
	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3472
		u64 *pt = sp->spt;
3473

3474
		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
			continue;

		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
			u64 ent = pt[i];

			if (!(ent & PT_PRESENT_MASK))
				continue;
3482
			if (!is_writable_pte(ent))
3483
				continue;
3484
			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3485 3486
		}
	}
3487
	return;
3488 3489 3490 3491
}

static void audit_rmap(struct kvm_vcpu *vcpu)
{
3492 3493
	check_writable_mappings_rmap(vcpu);
	count_rmaps(vcpu);
3494 3495 3496 3497
}

static void audit_write_protection(struct kvm_vcpu *vcpu)
{
3498
	struct kvm_mmu_page *sp;
3499 3500
	struct kvm_memory_slot *slot;
	unsigned long *rmapp;
3501
	u64 *spte;
3502
	gfn_t gfn;
3503

3504
	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3505
		if (sp->role.direct)
3506
			continue;
3507 3508
		if (sp->unsync)
			continue;
3509

A
Avi Kivity 已提交
3510
		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
3511
		rmapp = &slot->rmap[gfn - slot->base_gfn];
3512 3513 3514

		spte = rmap_next(vcpu->kvm, rmapp, NULL);
		while (spte) {
3515
			if (is_writable_pte(*spte))
3516 3517
				printk(KERN_ERR "%s: (%s) shadow page has "
				"writable mappings: gfn %lx role %x\n",
3518
			       __func__, audit_msg, sp->gfn,
3519
			       sp->role.word);
3520 3521
			spte = rmap_next(vcpu->kvm, rmapp, spte);
		}
	}
}

static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
{
	int olddbg = dbg;

	dbg = 0;
	audit_msg = msg;
	audit_rmap(vcpu);
	audit_write_protection(vcpu);
3533 3534
	if (strcmp("pre pte write", audit_msg) != 0)
		audit_mappings(vcpu);
3535
	audit_writable_sptes_have_rmaps(vcpu);
	dbg = olddbg;
}

#endif