/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/uaccess.h>

#include <asm/page.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
#include <asm/vmx.h>

/*
 * When setting this variable to true it enables Two-Dimensional-Paging
 * where the hardware walks 2 page tables:
 * 1. the guest-virtual to guest-physical
 * 2. while doing 1. it walks guest-physical to host-physical
 * If the hardware supports that, we don't need to do shadow paging.
 */
bool tdp_enabled = false;
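
/*
 * With TDP the hardware performs a two-dimensional walk: every
 * guest-physical reference made while walking the guest page tables is
 * itself translated through the nested page tables, so a worst-case
 * 4-level/4-level TLB miss can touch up to 24 page-table entries.
 * Without TDP, KVM builds shadow page tables instead, which is what most
 * of the code below implements.
 */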

#undef MMU_DEBUG

#undef AUDIT

#ifdef AUDIT
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
#else
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
#endif

#ifdef MMU_DEBUG

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)

#else

#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)

#endif

#if defined(MMU_DEBUG) || defined(AUDIT)
static int dbg = 0;
module_param(dbg, bool, 0644);
#endif

static int oos_shadow = 1;
module_param(oos_shadow, bool, 0644);

#ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0)
#else
#define ASSERT(x)							\
	if (!(x)) {							\
		printk(KERN_WARNING "assertion failed %s:%d: %s\n",	\
		       __FILE__, __LINE__, #x);				\
	}
#endif

#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)

#define PT64_LEVEL_MASK(level) \
		(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)

#define PT32_LEVEL_MASK(level) \
		(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
#define PT32_LVL_OFFSET_MASK(level) \
	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT32_LEVEL_BITS))) - 1))

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#define PT64_DIR_BASE_ADDR_MASK \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
#define PT64_LVL_ADDR_MASK(level) \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))
#define PT64_LVL_OFFSET_MASK(level) \
	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
					    * PT32_LEVEL_BITS))) - 1))

#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
			| PT64_NX_MASK)

#define RMAP_EXT 4

#define ACC_EXEC_MASK    1
#define ACC_WRITE_MASK   PT_WRITABLE_MASK
#define ACC_USER_MASK    PT_USER_MASK
#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

#include <trace/events/kvm.h>

#define CREATE_TRACE_POINTS
#include "mmutrace.h"

#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)

#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

struct kvm_rmap_desc {
	u64 *sptes[RMAP_EXT];
	struct kvm_rmap_desc *more;
};

struct kvm_shadow_walk_iterator {
	u64 addr;
	hpa_t shadow_addr;
	int level;
	u64 *sptep;
	unsigned index;
};

#define for_each_shadow_entry(_vcpu, _addr, _walker)    \
	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))
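
/*
 * for_each_shadow_entry() walks the shadow page table for @_addr, starting
 * at the current shadow root and yielding the spte pointer at each level on
 * the way down to the 4k level.
 */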

typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);

static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache;

static u64 __read_mostly shadow_trap_nonpresent_pte;
static u64 __read_mostly shadow_notrap_nonpresent_pte;
static u64 __read_mostly shadow_base_present_pte;
static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;

static inline u64 rsvd_bits(int s, int e)
{
	return ((1ULL << (e - s + 1)) - 1) << s;
}
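/* For example, rsvd_bits(11, 12) == 0x1800ULL, a mask with bits 11..12 set. */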

void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
{
	shadow_trap_nonpresent_pte = trap_pte;
	shadow_notrap_nonpresent_pte = notrap_pte;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);

void kvm_mmu_set_base_ptes(u64 base_pte)
{
	shadow_base_present_pte = base_pte;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);

void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
		u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
	shadow_user_mask = user_mask;
	shadow_accessed_mask = accessed_mask;
	shadow_dirty_mask = dirty_mask;
	shadow_nx_mask = nx_mask;
	shadow_x_mask = x_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);

static bool is_write_protection(struct kvm_vcpu *vcpu)
{
	return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
}

static int is_cpuid_PSE36(void)
{
	return 1;
}

static int is_nx(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.efer & EFER_NX;
}

static int is_shadow_present_pte(u64 pte)
{
	return pte != shadow_trap_nonpresent_pte
		&& pte != shadow_notrap_nonpresent_pte;
}

static int is_large_pte(u64 pte)
{
	return pte & PT_PAGE_SIZE_MASK;
}

static int is_writable_pte(unsigned long pte)
{
	return pte & PT_WRITABLE_MASK;
}

static int is_dirty_gpte(unsigned long pte)
{
	return pte & PT_DIRTY_MASK;
}

static int is_rmap_spte(u64 pte)
{
	return is_shadow_present_pte(pte);
}

static int is_last_spte(u64 pte, int level)
{
	if (level == PT_PAGE_TABLE_LEVEL)
		return 1;
	if (is_large_pte(pte))
		return 1;
	return 0;
}

static pfn_t spte_to_pfn(u64 pte)
{
	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
}

static gfn_t pse36_gfn_delta(u32 gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}

static void __set_spte(u64 *sptep, u64 spte)
{
	set_64bit(sptep, spte);
}

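/*
 * Atomically exchange an spte: a plain xchg on 64-bit hosts, a cmpxchg64
 * loop on 32-bit hosts so the 64-bit value (including any accessed/dirty
 * bits the CPU may set concurrently) is swapped without being torn.
 */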
static u64 __xchg_spte(u64 *sptep, u64 new_spte)
{
#ifdef CONFIG_X86_64
	return xchg(sptep, new_spte);
#else
	u64 old_spte;

	do {
		old_spte = *sptep;
	} while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);

	return old_spte;
#endif
}

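/*
 * Write a new spte without losing a hardware-set accessed bit: when the old
 * entry may carry an accessed bit that the new value lacks, the update is
 * done with an atomic exchange and the backing page is marked accessed.
 */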
static void update_spte(u64 *sptep, u64 new_spte)
{
	u64 old_spte;

	if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) ||
	      !is_rmap_spte(*sptep))
		__set_spte(sptep, new_spte);
	else {
		old_spte = __xchg_spte(sptep, new_spte);
		if (old_spte & shadow_accessed_mask)
			mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
	}
}

static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				  struct kmem_cache *base_cache, int min)
{
	void *obj;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
		if (!obj)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = obj;
	}
	return 0;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
				  struct kmem_cache *cache)
{
	while (mc->nobjs)
		kmem_cache_free(cache, mc->objects[--mc->nobjs]);
}

static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
				       int min)
{
	struct page *page;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		page = alloc_page(GFP_KERNEL);
		if (!page)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = page_address(page);
	}
	return 0;
}

static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
		free_page((unsigned long)mc->objects[--mc->nobjs]);
}

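/*
 * Top up the per-vcpu object caches (pte chains, rmap descriptors, shadow
 * page tables and page headers) up front, so that later MMU operations can
 * take objects from the caches instead of allocating at an awkward time.
 */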
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
362
{
363 364
	int r;

365
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
366
				   pte_chain_cache, 4);
367 368
	if (r)
		goto out;
369
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
370
				   rmap_desc_cache, 4);
371 372
	if (r)
		goto out;
373
	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
374 375
	if (r)
		goto out;
376
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
377
				   mmu_page_header_cache, 4);
378 379
out:
	return r;
380 381 382 383
}

static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
384 385
	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
386
	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
387 388
	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
				mmu_page_header_cache);
389 390 391 392 393 394 395 396 397 398 399 400 401 402
}

static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
				    size_t size)
{
	void *p;

	BUG_ON(!mc->nobjs);
	p = mc->objects[--mc->nobjs];
	return p;
}

static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
{
403
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
404 405 406
				      sizeof(struct kvm_pte_chain));
}

407
static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
408
{
409
	kmem_cache_free(pte_chain_cache, pc);
410 411 412 413
}

static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
{
414
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
415 416 417
				      sizeof(struct kvm_rmap_desc));
}

418
static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
419
{
420
	kmem_cache_free(rmap_desc_cache, rd);
421 422
}

423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
	if (!sp->role.direct)
		return sp->gfns[index];

	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
}

static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
{
	if (sp->role.direct)
		BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
	else
		sp->gfns[index] = gfn;
}

/*
 * Return the pointer to the largepage write count for a given
 * gfn, handling slots that are not large page aligned.
 */
443 444 445
static int *slot_largepage_idx(gfn_t gfn,
			       struct kvm_memory_slot *slot,
			       int level)
{
	unsigned long idx;

449 450
	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
	      (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
451
	return &slot->lpage_info[level - 2][idx].write_count;
}

static void account_shadowed(struct kvm *kvm, gfn_t gfn)
{
456
	struct kvm_memory_slot *slot;
	int *write_count;
458
	int i;

	slot = gfn_to_memslot(kvm, gfn);
461 462 463 464 465
	for (i = PT_DIRECTORY_LEVEL;
	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
		write_count   = slot_largepage_idx(gfn, slot, i);
		*write_count += 1;
	}
}

static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
{
470
	struct kvm_memory_slot *slot;
	int *write_count;
472
	int i;

	slot = gfn_to_memslot(kvm, gfn);
475 476 477 478 479 480
	for (i = PT_DIRECTORY_LEVEL;
	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
		write_count   = slot_largepage_idx(gfn, slot, i);
		*write_count -= 1;
		WARN_ON(*write_count < 0);
	}
}

483 484 485
static int has_wrprotected_page(struct kvm *kvm,
				gfn_t gfn,
				int level)
{
487
	struct kvm_memory_slot *slot;
	int *largepage_idx;

	slot = gfn_to_memslot(kvm, gfn);
	if (slot) {
492
		largepage_idx = slot_largepage_idx(gfn, slot, level);
		return *largepage_idx;
	}

	return 1;
}

499
static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
	unsigned long page_size;
502
	int i, ret = 0;

	page_size = kvm_host_page_size(kvm, gfn);

506 507 508 509 510 511 512 513
	for (i = PT_PAGE_TABLE_LEVEL;
	     i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
		if (page_size >= KVM_HPAGE_SIZE(i))
			ret = i;
		else
			break;
	}

514
	return ret;
}

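/*
 * Pick the largest page size usable for @large_gfn: slots under dirty
 * logging get 4k pages only; otherwise start from the host mapping level,
 * clamp it to the vendor's maximum large-page level, and back off while the
 * candidate range contains a write-protected page.
 */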
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
	struct kvm_memory_slot *slot;
520
	int host_level, level, max_level;

	slot = gfn_to_memslot(vcpu->kvm, large_gfn);
	if (slot && slot->dirty_bitmap)
524
		return PT_PAGE_TABLE_LEVEL;

526 527 528 529 530
	host_level = host_mapping_level(vcpu->kvm, large_gfn);

	if (host_level == PT_PAGE_TABLE_LEVEL)
		return host_level;

531 532 533 534
	max_level = kvm_x86_ops->get_lpage_level() < host_level ?
		kvm_x86_ops->get_lpage_level() : host_level;

	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
535 536 537 538
		if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
			break;

	return level - 1;
}

541 542 543 544
/*
 * Take gfn and return the reverse mapping to it.
 */

545
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
546 547
{
	struct kvm_memory_slot *slot;
	unsigned long idx;
549 550

	slot = gfn_to_memslot(kvm, gfn);
551
	if (likely(level == PT_PAGE_TABLE_LEVEL))
		return &slot->rmap[gfn - slot->base_gfn];

554 555
	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
		(slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));

557
	return &slot->lpage_info[level - 2][idx].rmap_pde;
558 559
}

/*
 * Reverse mapping data structures:
 *
 * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
 * that points to page_address(page).
 *
 * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
 * containing more mappings.
 *
 * Returns the number of rmap entries before the spte was added or zero if
 * the spte was not added.
 *
 */
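/*
 * Encoding examples:
 *   no mappings:  *rmapp == 0
 *   one spte:     *rmapp == (unsigned long)spte        (bit zero clear)
 *   many sptes:   *rmapp == (unsigned long)desc | 1    (bit zero set)
 */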
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
574
{
575
	struct kvm_mmu_page *sp;
576
	struct kvm_rmap_desc *desc;
577
	unsigned long *rmapp;
578
	int i, count = 0;
579

580
	if (!is_rmap_spte(*spte))
581
		return count;
582
	sp = page_header(__pa(spte));
583
	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
584
	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
585
	if (!*rmapp) {
586
		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
587 588
		*rmapp = (unsigned long)spte;
	} else if (!(*rmapp & 1)) {
589
		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
590
		desc = mmu_alloc_rmap_desc(vcpu);
		desc->sptes[0] = (u64 *)*rmapp;
		desc->sptes[1] = spte;
593
		*rmapp = (unsigned long)desc | 1;
594 595
	} else {
		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
596
		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
		while (desc->sptes[RMAP_EXT-1] && desc->more) {
598
			desc = desc->more;
599 600
			count += RMAP_EXT;
		}
		if (desc->sptes[RMAP_EXT-1]) {
602
			desc->more = mmu_alloc_rmap_desc(vcpu);
603 604
			desc = desc->more;
		}
		for (i = 0; desc->sptes[i]; ++i)
606
			;
		desc->sptes[i] = spte;
608
	}
609
	return count;
610 611
}

612
static void rmap_desc_remove_entry(unsigned long *rmapp,
613 614 615 616 617 618
				   struct kvm_rmap_desc *desc,
				   int i,
				   struct kvm_rmap_desc *prev_desc)
{
	int j;

	for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
620
		;
	desc->sptes[i] = desc->sptes[j];
	desc->sptes[j] = NULL;
623 624 625
	if (j != 0)
		return;
	if (!prev_desc && !desc->more)
		*rmapp = (unsigned long)desc->sptes[0];
627 628 629 630
	else
		if (prev_desc)
			prev_desc->more = desc->more;
		else
631
			*rmapp = (unsigned long)desc->more | 1;
632
	mmu_free_rmap_desc(desc);
633 634
}

635
static void rmap_remove(struct kvm *kvm, u64 *spte)
636 637 638
{
	struct kvm_rmap_desc *desc;
	struct kvm_rmap_desc *prev_desc;
639
	struct kvm_mmu_page *sp;
640
	gfn_t gfn;
641
	unsigned long *rmapp;
642 643
	int i;

644
	sp = page_header(__pa(spte));
645 646
	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
647
	if (!*rmapp) {
648 649
		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
		BUG();
650
	} else if (!(*rmapp & 1)) {
651
		rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
652
		if ((u64 *)*rmapp != spte) {
653 654 655 656
			printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
			       spte, *spte);
			BUG();
		}
657
		*rmapp = 0;
658 659
	} else {
		rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
660
		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
661 662
		prev_desc = NULL;
		while (desc) {
			for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
				if (desc->sptes[i] == spte) {
665
					rmap_desc_remove_entry(rmapp,
666
							       desc, i,
667 668 669 670 671 672
							       prev_desc);
					return;
				}
			prev_desc = desc;
			desc = desc->more;
		}
673
		pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
674 675 676 677
		BUG();
	}
}

678
static void set_spte_track_bits(u64 *sptep, u64 new_spte)
{
680
	pfn_t pfn;
681 682 683 684 685 686 687
	u64 old_spte = *sptep;

	if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) ||
	      old_spte & shadow_accessed_mask) {
		__set_spte(sptep, new_spte);
	} else
		old_spte = __xchg_spte(sptep, new_spte);
688

689
	if (!is_rmap_spte(old_spte))
690
		return;
691
	pfn = spte_to_pfn(old_spte);
692
	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
693
		kvm_set_pfn_accessed(pfn);
694
	if (is_writable_pte(old_spte))
695
		kvm_set_pfn_dirty(pfn);
696 697 698 699 700
}

static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
{
	set_spte_track_bits(sptep, new_spte);
	rmap_remove(kvm, sptep);
}

704
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
705 706
{
	struct kvm_rmap_desc *desc;
707 708 709 710 711 712 713 714 715 716 717 718 719
	u64 *prev_spte;
	int i;

	if (!*rmapp)
		return NULL;
	else if (!(*rmapp & 1)) {
		if (!spte)
			return (u64 *)*rmapp;
		return NULL;
	}
	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
	prev_spte = NULL;
	while (desc) {
		for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
721
			if (prev_spte == spte)
				return desc->sptes[i];
			prev_spte = desc->sptes[i];
724 725 726 727 728 729
		}
		desc = desc->more;
	}
	return NULL;
}

730
static int rmap_write_protect(struct kvm *kvm, u64 gfn)
731
{
732
	unsigned long *rmapp;
733
	u64 *spte;
734
	int i, write_protected = 0;
735

736
	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
737

738 739
	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
740 741 742
		BUG_ON(!spte);
		BUG_ON(!(*spte & PT_PRESENT_MASK));
		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
743
		if (is_writable_pte(*spte)) {
744
			update_spte(spte, *spte & ~PT_WRITABLE_MASK);
745 746
			write_protected = 1;
		}
747
		spte = rmap_next(kvm, rmapp, spte);
748
	}
749
	if (write_protected) {
750
		pfn_t pfn;
751 752

		spte = rmap_next(kvm, rmapp, NULL);
753 754
		pfn = spte_to_pfn(*spte);
		kvm_set_pfn_dirty(pfn);
755 756
	}

	/* check for huge page mappings */
758 759 760 761 762 763 764 765 766
	for (i = PT_DIRECTORY_LEVEL;
	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
		rmapp = gfn_to_rmap(kvm, gfn, i);
		spte = rmap_next(kvm, rmapp, NULL);
		while (spte) {
			BUG_ON(!spte);
			BUG_ON(!(*spte & PT_PRESENT_MASK));
			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
767
			if (is_writable_pte(*spte)) {
				drop_spte(kvm, spte,
					  shadow_trap_nonpresent_pte);
770 771 772 773 774
				--kvm->stat.lpages;
				spte = NULL;
				write_protected = 1;
			}
			spte = rmap_next(kvm, rmapp, spte);
		}
	}

778
	return write_protected;
779 780
}

static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
			   unsigned long data)
783 784 785 786 787 788 789
{
	u64 *spte;
	int need_tlb_flush = 0;

	while ((spte = rmap_next(kvm, rmapp, NULL))) {
		BUG_ON(!(*spte & PT_PRESENT_MASK));
		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
		drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
791 792 793 794 795
		need_tlb_flush = 1;
	}
	return need_tlb_flush;
}

static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
			     unsigned long data)
798 799
{
	int need_flush = 0;
800
	u64 *spte, new_spte;
801 802 803 804 805 806 807 808 809 810 811
	pte_t *ptep = (pte_t *)data;
	pfn_t new_pfn;

	WARN_ON(pte_huge(*ptep));
	new_pfn = pte_pfn(*ptep);
	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
		BUG_ON(!is_shadow_present_pte(*spte));
		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
		need_flush = 1;
		if (pte_write(*ptep)) {
			drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
813 814 815 816 817 818 819
			spte = rmap_next(kvm, rmapp, NULL);
		} else {
			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
			new_spte |= (u64)new_pfn << PAGE_SHIFT;

			new_spte &= ~PT_WRITABLE_MASK;
			new_spte &= ~SPTE_HOST_WRITEABLE;
820
			new_spte &= ~shadow_accessed_mask;
821
			set_spte_track_bits(spte, new_spte);
822 823 824 825 826 827 828 829 830
			spte = rmap_next(kvm, rmapp, spte);
		}
	}
	if (need_flush)
		kvm_flush_remote_tlbs(kvm);

	return 0;
}

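/*
 * Apply @handler to every rmap chain that can map @hva: find the memslot
 * containing the address, then run the handler on the 4k rmap entry for
 * that gfn and on the rmap_pde of each larger page size.
 */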
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  unsigned long data,
833
			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
					 unsigned long data))
835
{
836
	int i, j;
837
	int ret;
838
	int retval = 0;
839 840
	struct kvm_memslots *slots;

841
	slots = kvm_memslots(kvm);
842

843 844
	for (i = 0; i < slots->nmemslots; i++) {
		struct kvm_memory_slot *memslot = &slots->memslots[i];
845 846 847 848 849 850
		unsigned long start = memslot->userspace_addr;
		unsigned long end;

		end = start + (memslot->npages << PAGE_SHIFT);
		if (hva >= start && hva < end) {
			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
851

852
			ret = handler(kvm, &memslot->rmap[gfn_offset], data);
853 854

			for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
855 856 857 858 859 860
				unsigned long idx;
				int sh;

				sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
				idx = ((memslot->base_gfn+gfn_offset) >> sh) -
					(memslot->base_gfn >> sh);
861
				ret |= handler(kvm,
862 863
					&memslot->lpage_info[j][idx].rmap_pde,
					data);
864
			}
865 866
			trace_kvm_age_page(hva, memslot, ret);
			retval |= ret;
867 868 869 870 871 872 873 874
		}
	}

	return retval;
}

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
875 876 877 878 879
	return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
}

void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
881 882
}

static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
			 unsigned long data)
885 886 887 888
{
	u64 *spte;
	int young = 0;

889 890 891 892 893 894 895
	/*
	 * Emulate the accessed bit for EPT, by checking if this page has
	 * an EPT mapping, and clearing it if it does. On the next access,
	 * a new EPT mapping will be established.
	 * This has some overhead, but not as much as the cost of swapping
	 * out actively used pages or breaking up actively used hugepages.
	 */
896
	if (!shadow_accessed_mask)
897
		return kvm_unmap_rmapp(kvm, rmapp, data);
898

899 900 901 902 903 904 905 906 907 908 909 910 911 912 913
	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
		int _young;
		u64 _spte = *spte;
		BUG_ON(!(_spte & PT_PRESENT_MASK));
		_young = _spte & PT_ACCESSED_MASK;
		if (_young) {
			young = 1;
			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
		}
		spte = rmap_next(kvm, rmapp, spte);
	}
	return young;
}

914 915
#define RMAP_RECYCLE_THRESHOLD 1000

916
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
917 918
{
	unsigned long *rmapp;
919 920 921
	struct kvm_mmu_page *sp;

	sp = page_header(__pa(spte));
922

923
	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
924

925
	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
926 927 928
	kvm_flush_remote_tlbs(vcpu->kvm);
}

929 930
int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
931
	return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
932 933
}

934
#ifdef MMU_DEBUG
935
static int is_empty_shadow_page(u64 *spt)
{
937 938 939
	u64 *pos;
	u64 *end;

940
	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
941
		if (is_shadow_present_pte(*pos)) {
942
			printk(KERN_ERR "%s: %p %llx\n", __func__,
943
			       pos, *pos);
			return 0;
945
		}
	return 1;
}
948
#endif

950
static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
951
{
952
	ASSERT(is_empty_shadow_page(sp->spt));
953
	hlist_del(&sp->hash_link);
954 955
	list_del(&sp->link);
	__free_page(virt_to_page(sp->spt));
956 957
	if (!sp->role.direct)
		__free_page(virt_to_page(sp->gfns));
958
	kmem_cache_free(mmu_page_header_cache, sp);
959
	++kvm->arch.n_free_mmu_pages;
960 961
}

962 963
static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
964
	return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
965 966
}

967
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
968
					       u64 *parent_pte, int direct)
{
970
	struct kvm_mmu_page *sp;

972 973
	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
974 975 976
	if (!direct)
		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
						  PAGE_SIZE);
977
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
978
	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
979
	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
980 981
	sp->multimapped = 0;
	sp->parent_pte = parent_pte;
982
	--vcpu->kvm->arch.n_free_mmu_pages;
983
	return sp;
}

986
static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
987
				    struct kvm_mmu_page *sp, u64 *parent_pte)
988 989 990 991 992 993 994
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	int i;

	if (!parent_pte)
		return;
995 996
	if (!sp->multimapped) {
		u64 *old = sp->parent_pte;
997 998

		if (!old) {
999
			sp->parent_pte = parent_pte;
1000 1001
			return;
		}
1002
		sp->multimapped = 1;
1003
		pte_chain = mmu_alloc_pte_chain(vcpu);
1004 1005
		INIT_HLIST_HEAD(&sp->parent_ptes);
		hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1006 1007
		pte_chain->parent_ptes[0] = old;
	}
1008
	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
1009 1010 1011 1012 1013 1014 1015 1016
		if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
			continue;
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
			if (!pte_chain->parent_ptes[i]) {
				pte_chain->parent_ptes[i] = parent_pte;
				return;
			}
	}
1017
	pte_chain = mmu_alloc_pte_chain(vcpu);
1018
	BUG_ON(!pte_chain);
1019
	hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1020 1021 1022
	pte_chain->parent_ptes[0] = parent_pte;
}

1023
static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1024 1025 1026 1027 1028 1029
				       u64 *parent_pte)
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	int i;

1030 1031 1032
	if (!sp->multimapped) {
		BUG_ON(sp->parent_pte != parent_pte);
		sp->parent_pte = NULL;
1033 1034
		return;
	}
1035
	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1036 1037 1038 1039 1040
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
			if (!pte_chain->parent_ptes[i])
				break;
			if (pte_chain->parent_ptes[i] != parent_pte)
				continue;
1041 1042
			while (i + 1 < NR_PTE_CHAIN_ENTRIES
				&& pte_chain->parent_ptes[i + 1]) {
1043 1044 1045 1046 1047
				pte_chain->parent_ptes[i]
					= pte_chain->parent_ptes[i + 1];
				++i;
			}
			pte_chain->parent_ptes[i] = NULL;
1048 1049
			if (i == 0) {
				hlist_del(&pte_chain->link);
1050
				mmu_free_pte_chain(pte_chain);
1051 1052 1053
				if (hlist_empty(&sp->parent_ptes)) {
					sp->multimapped = 0;
					sp->parent_pte = NULL;
1054 1055
				}
			}
1056 1057 1058 1059 1060
			return;
		}
	BUG();
}

1061
static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	struct kvm_mmu_page *parent_sp;
	int i;

	if (!sp->multimapped && sp->parent_pte) {
		parent_sp = page_header(__pa(sp->parent_pte));
1070
		fn(parent_sp, sp->parent_pte);
		return;
	}
1073

	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1076 1077 1078
			u64 *spte = pte_chain->parent_ptes[i];

			if (!spte)
				break;
1080 1081
			parent_sp = page_header(__pa(spte));
			fn(parent_sp, spte);
		}
}

1085 1086
static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1087
{
1088
	mmu_parent_walk(sp, mark_unsync);
1089 1090
}

1091
static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
1092
{
1093
	unsigned int index;
1094

1095 1096
	index = spte - sp->spt;
	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1097
		return;
1098
	if (sp->unsync_children++)
1099
		return;
1100
	kvm_mmu_mark_parents_unsync(sp);
1101 1102
}

1103 1104 1105 1106 1107 1108 1109 1110 1111
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp)
{
	int i;

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
		sp->spt[i] = shadow_trap_nonpresent_pte;
}

1112
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1113
			       struct kvm_mmu_page *sp, bool clear_unsync)
1114 1115 1116 1117
{
	return 1;
}

static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
}

1122 1123 1124 1125 1126 1127 1128 1129 1130 1131
#define KVM_PAGE_ARRAY_NR 16

struct kvm_mmu_pages {
	struct mmu_page_and_offset {
		struct kvm_mmu_page *sp;
		unsigned int idx;
	} page[KVM_PAGE_ARRAY_NR];
	unsigned int nr;
};

1132 1133 1134 1135 1136
#define for_each_unsync_children(bitmap, idx)		\
	for (idx = find_first_bit(bitmap, 512);		\
	     idx < 512;					\
	     idx = find_next_bit(bitmap, 512, idx+1))

1137 1138
static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
			 int idx)
1139
{
1140
	int i;
1141

1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156
	if (sp->unsync)
		for (i=0; i < pvec->nr; i++)
			if (pvec->page[i].sp == sp)
				return 0;

	pvec->page[pvec->nr].sp = sp;
	pvec->page[pvec->nr].idx = idx;
	pvec->nr++;
	return (pvec->nr == KVM_PAGE_ARRAY_NR);
}

static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
			   struct kvm_mmu_pages *pvec)
{
	int i, ret, nr_unsync_leaf = 0;
1157

1158
	for_each_unsync_children(sp->unsync_child_bitmap, i) {
1159
		struct kvm_mmu_page *child;
1160 1161
		u64 ent = sp->spt[i];

1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190
		if (!is_shadow_present_pte(ent) || is_large_pte(ent))
			goto clear_child_bitmap;

		child = page_header(ent & PT64_BASE_ADDR_MASK);

		if (child->unsync_children) {
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;

			ret = __mmu_unsync_walk(child, pvec);
			if (!ret)
				goto clear_child_bitmap;
			else if (ret > 0)
				nr_unsync_leaf += ret;
			else
				return ret;
		} else if (child->unsync) {
			nr_unsync_leaf++;
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;
		} else
			 goto clear_child_bitmap;

		continue;

clear_child_bitmap:
		__clear_bit(i, sp->unsync_child_bitmap);
		sp->unsync_children--;
		WARN_ON((int)sp->unsync_children < 0);
1191 1192 1193
	}


1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204
	return nr_unsync_leaf;
}

static int mmu_unsync_walk(struct kvm_mmu_page *sp,
			   struct kvm_mmu_pages *pvec)
{
	if (!sp->unsync_children)
		return 0;

	mmu_pages_add(pvec, sp, 0);
	return __mmu_unsync_walk(sp, pvec);
1205 1206 1207 1208 1209
}

static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	WARN_ON(!sp->unsync);
1210
	trace_kvm_mmu_sync_page(sp);
1211 1212 1213 1214
	sp->unsync = 0;
	--kvm->stat.mmu_unsync;
}

1215 1216 1217 1218
static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				    struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				    struct list_head *invalid_list);
1219

1220 1221
#define for_each_gfn_sp(kvm, sp, gfn, pos)				\
  hlist_for_each_entry(sp, pos,						\
1222 1223 1224
   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
	if ((sp)->gfn != (gfn)) {} else

1225 1226
#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos)		\
  hlist_for_each_entry(sp, pos,						\
1227 1228 1229 1230
   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
		if ((sp)->gfn != (gfn) || (sp)->role.direct ||		\
			(sp)->role.invalid) {} else

1231
/* @sp->gfn should be write-protected at the call site */
1232
static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1233
			   struct list_head *invalid_list, bool clear_unsync)
1234
{
1235
	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1236
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1237 1238 1239
		return 1;
	}

1240
	if (clear_unsync)
1241 1242
		kvm_unlink_unsync_page(vcpu->kvm, sp);

1243
	if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
1244
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1245 1246 1247 1248 1249 1250 1251
		return 1;
	}

	kvm_mmu_flush_tlb(vcpu);
	return 0;
}

1252 1253 1254
static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
				   struct kvm_mmu_page *sp)
{
1255
	LIST_HEAD(invalid_list);
1256 1257
	int ret;

1258
	ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
1259
	if (ret)
1260 1261
		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);

1262 1263 1264
	return ret;
}

1265 1266
static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			 struct list_head *invalid_list)
1267
{
1268
	return __kvm_sync_page(vcpu, sp, invalid_list, true);
1269 1270
}

1271 1272 1273 1274
/* @gfn should be write-protected at the call site */
static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
{
	struct kvm_mmu_page *s;
1275
	struct hlist_node *node;
1276
	LIST_HEAD(invalid_list);
1277 1278
	bool flush = false;

1279
	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1280
		if (!s->unsync)
1281 1282 1283 1284
			continue;

		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
		if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1285
			(vcpu->arch.mmu.sync_page(vcpu, s, true))) {
1286
			kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1287 1288 1289 1290 1291 1292
			continue;
		}
		kvm_unlink_unsync_page(vcpu->kvm, s);
		flush = true;
	}

1293
	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1294 1295 1296 1297
	if (flush)
		kvm_mmu_flush_tlb(vcpu);
}

1298 1299 1300
struct mmu_page_path {
	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
	unsigned int idx[PT64_ROOT_LEVEL-1];
1301 1302
};

1303 1304 1305 1306 1307 1308
#define for_each_sp(pvec, sp, parents, i)			\
		for (i = mmu_pages_next(&pvec, &parents, -1),	\
			sp = pvec.page[i].sp;			\
			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
			i = mmu_pages_next(&pvec, &parents, i))

1309 1310 1311
static int mmu_pages_next(struct kvm_mmu_pages *pvec,
			  struct mmu_page_path *parents,
			  int i)
1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329
{
	int n;

	for (n = i+1; n < pvec->nr; n++) {
		struct kvm_mmu_page *sp = pvec->page[n].sp;

		if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
			parents->idx[0] = pvec->page[n].idx;
			return n;
		}

		parents->parent[sp->role.level-2] = sp;
		parents->idx[sp->role.level-1] = pvec->page[n].idx;
	}

	return n;
}

1330
static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1331
{
1332 1333 1334 1335 1336
	struct kvm_mmu_page *sp;
	unsigned int level = 0;

	do {
		unsigned int idx = parents->idx[level];
1337

1338 1339 1340 1341 1342 1343 1344 1345 1346
		sp = parents->parent[level];
		if (!sp)
			return;

		--sp->unsync_children;
		WARN_ON((int)sp->unsync_children < 0);
		__clear_bit(idx, sp->unsync_child_bitmap);
		level++;
	} while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
1347 1348
}

1349 1350 1351
static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
			       struct mmu_page_path *parents,
			       struct kvm_mmu_pages *pvec)
1352
{
1353 1354 1355
	parents->parent[parent->role.level-1] = NULL;
	pvec->nr = 0;
}
1356

1357 1358 1359 1360 1361 1362 1363
static void mmu_sync_children(struct kvm_vcpu *vcpu,
			      struct kvm_mmu_page *parent)
{
	int i;
	struct kvm_mmu_page *sp;
	struct mmu_page_path parents;
	struct kvm_mmu_pages pages;
1364
	LIST_HEAD(invalid_list);
1365 1366 1367

	kvm_mmu_pages_init(parent, &parents, &pages);
	while (mmu_unsync_walk(parent, &pages)) {
1368 1369 1370 1371 1372 1373 1374 1375
		int protected = 0;

		for_each_sp(pages, sp, parents, i)
			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);

		if (protected)
			kvm_flush_remote_tlbs(vcpu->kvm);

1376
		for_each_sp(pages, sp, parents, i) {
1377
			kvm_sync_page(vcpu, sp, &invalid_list);
1378 1379
			mmu_pages_clear_parents(&parents);
		}
1380
		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1381
		cond_resched_lock(&vcpu->kvm->mmu_lock);
1382 1383
		kvm_mmu_pages_init(parent, &parents, &pages);
	}
1384 1385
}

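/*
 * Find a shadow page for @gfn with a matching role in the hash table,
 * reusing (and possibly re-syncing) it if one exists; otherwise allocate a
 * new page, add it to the hash and write-protect the gfn for indirect pages.
 */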
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
					     gfn_t gfn,
					     gva_t gaddr,
					     unsigned level,
1390
					     int direct,
1391
					     unsigned access,
1392
					     u64 *parent_pte)
1393 1394 1395
{
	union kvm_mmu_page_role role;
	unsigned quadrant;
1396
	struct kvm_mmu_page *sp;
1397
	struct hlist_node *node;
1398
	bool need_sync = false;
1399

1400
	role = vcpu->arch.mmu.base_role;
1401
	role.level = level;
1402
	role.direct = direct;
1403
	if (role.direct)
1404
		role.cr4_pae = 0;
1405
	role.access = access;
1406
	if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1407 1408 1409 1410
		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
		role.quadrant = quadrant;
	}
1411
	for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
1412 1413
		if (!need_sync && sp->unsync)
			need_sync = true;
1414

1415 1416
		if (sp->role.word != role.word)
			continue;
1417

1418 1419
		if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
			break;
1420

1421 1422
		mmu_page_add_parent_pte(vcpu, sp, parent_pte);
		if (sp->unsync_children) {
1423
			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1424 1425 1426
			kvm_mmu_mark_parents_unsync(sp);
		} else if (sp->unsync)
			kvm_mmu_mark_parents_unsync(sp);
1427

1428 1429 1430
		trace_kvm_mmu_get_page(sp, false);
		return sp;
	}
	++vcpu->kvm->stat.mmu_cache_miss;
1432
	sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
1433 1434 1435 1436
	if (!sp)
		return sp;
	sp->gfn = gfn;
	sp->role = role;
1437 1438
	hlist_add_head(&sp->hash_link,
		&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
1439
	if (!direct) {
1440 1441
		if (rmap_write_protect(vcpu->kvm, gfn))
			kvm_flush_remote_tlbs(vcpu->kvm);
1442 1443 1444
		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
			kvm_sync_pages(vcpu, gfn);

1445 1446
		account_shadowed(vcpu->kvm, gfn);
	}
1447 1448 1449 1450
	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
		vcpu->arch.mmu.prefetch_page(vcpu, sp);
	else
		nonpaging_prefetch_page(vcpu, sp);
	trace_kvm_mmu_get_page(sp, true);
1452
	return sp;
1453 1454
}

1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474
static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
			     struct kvm_vcpu *vcpu, u64 addr)
{
	iterator->addr = addr;
	iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
	iterator->level = vcpu->arch.mmu.shadow_root_level;
	if (iterator->level == PT32E_ROOT_LEVEL) {
		iterator->shadow_addr
			= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
		iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
		--iterator->level;
		if (!iterator->shadow_addr)
			iterator->level = 0;
	}
}

static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
{
	if (iterator->level < PT_PAGE_TABLE_LEVEL)
		return false;
1475 1476 1477 1478 1479

	if (iterator->level == PT_PAGE_TABLE_LEVEL)
		if (is_large_pte(*iterator->sptep))
			return false;

1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490
	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
	iterator->sptep	= ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
	return true;
}

static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
{
	iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
	--iterator->level;
}

1491 1492 1493 1494 1495 1496 1497
static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
{
	u64 spte;

	spte = __pa(sp->spt)
		| PT_PRESENT_MASK | PT_ACCESSED_MASK
		| PT_WRITABLE_MASK | PT_USER_MASK;
1498
	__set_spte(sptep, spte);
1499 1500
}

1501 1502 1503 1504 1505 1506 1507 1508
static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
{
	if (is_large_pte(*sptep)) {
		drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
		kvm_flush_remote_tlbs(vcpu->kvm);
	}
}

static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
				   unsigned direct_access)
{
	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
		struct kvm_mmu_page *child;

		/*
		 * For the direct sp, if the guest pte's dirty bit
		 * changed from clean to dirty, it will corrupt the
		 * sp's access: allow writable in the read-only sp,
		 * so we should update the spte at this point to get
		 * a new sp with the correct access.
		 */
		child = page_header(*sptep & PT64_BASE_ADDR_MASK);
		if (child->role.access == direct_access)
			return;

		mmu_page_remove_parent_pte(child, sptep);
		__set_spte(sptep, shadow_trap_nonpresent_pte);
		kvm_flush_remote_tlbs(vcpu->kvm);
	}
}

1532
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1533
					 struct kvm_mmu_page *sp)
1534
{
1535 1536 1537 1538
	unsigned i;
	u64 *pt;
	u64 ent;

1539
	pt = sp->spt;
1540 1541 1542 1543

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
		ent = pt[i];

		if (is_shadow_present_pte(ent)) {
1545
			if (!is_last_spte(ent, sp->role.level)) {
				ent &= PT64_BASE_ADDR_MASK;
				mmu_page_remove_parent_pte(page_header(ent),
							   &pt[i]);
			} else {
1550 1551
				if (is_large_pte(ent))
					--kvm->stat.lpages;
				drop_spte(kvm, &pt[i],
					  shadow_trap_nonpresent_pte);
			}
		}
1556
		pt[i] = shadow_trap_nonpresent_pte;
1557
	}
1558 1559
}

1560
static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1561
{
1562
	mmu_page_remove_parent_pte(sp, parent_pte);
1563 1564
}

1565 1566 1567
static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
{
	int i;
1568
	struct kvm_vcpu *vcpu;
1569

1570 1571
	kvm_for_each_vcpu(i, vcpu, kvm)
		vcpu->arch.last_pte_updated = NULL;
1572 1573
}

1574
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1575 1576 1577
{
	u64 *parent_pte;

1578 1579 1580
	while (sp->multimapped || sp->parent_pte) {
		if (!sp->multimapped)
			parent_pte = sp->parent_pte;
1581 1582 1583
		else {
			struct kvm_pte_chain *chain;

1584
			chain = container_of(sp->parent_ptes.first,
1585 1586 1587
					     struct kvm_pte_chain, link);
			parent_pte = chain->parent_ptes[0];
		}
1588
		BUG_ON(!parent_pte);
1589
		kvm_mmu_put_page(sp, parent_pte);
		__set_spte(parent_pte, shadow_trap_nonpresent_pte);
1591
	}
1592 1593
}

1594
static int mmu_zap_unsync_children(struct kvm *kvm,
1595 1596
				   struct kvm_mmu_page *parent,
				   struct list_head *invalid_list)
1597
{
1598 1599 1600
	int i, zapped = 0;
	struct mmu_page_path parents;
	struct kvm_mmu_pages pages;
1601

1602
	if (parent->role.level == PT_PAGE_TABLE_LEVEL)
1603
		return 0;
1604 1605 1606 1607 1608 1609

	kvm_mmu_pages_init(parent, &parents, &pages);
	while (mmu_unsync_walk(parent, &pages)) {
		struct kvm_mmu_page *sp;

		for_each_sp(pages, sp, parents, i) {
1610
			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
1611
			mmu_pages_clear_parents(&parents);
1612
			zapped++;
1613 1614 1615 1616 1617
		}
		kvm_mmu_pages_init(parent, &parents, &pages);
	}

	return zapped;
1618 1619
}

1620 1621
static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				    struct list_head *invalid_list)
1622
{
1623
	int ret;

1625
	trace_kvm_mmu_prepare_zap_page(sp);
1626
	++kvm->stat.mmu_shadow_zapped;
1627
	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
1628
	kvm_mmu_page_unlink_children(kvm, sp);
1629
	kvm_mmu_unlink_parents(kvm, sp);
1630
	if (!sp->role.invalid && !sp->role.direct)
		unaccount_shadowed(kvm, sp->gfn);
1632 1633
	if (sp->unsync)
		kvm_unlink_unsync_page(kvm, sp);
1634
	if (!sp->root_count) {
1635 1636
		/* Count self */
		ret++;
1637
		list_move(&sp->link, invalid_list);
1638
	} else {
		list_move(&sp->link, &kvm->arch.active_mmu_pages);
1640 1641
		kvm_reload_remote_mmus(kvm);
	}
1642 1643

	sp->role.invalid = 1;
1644
	kvm_mmu_reset_last_pte_updated(kvm);
1645
	return ret;
1646 1647
}

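/*
 * Zapping is done in two phases: kvm_mmu_prepare_zap_page() unlinks pages
 * and collects them on @invalid_list, then kvm_mmu_commit_zap_page()
 * flushes remote TLBs once and frees everything on the list.
 */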
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				    struct list_head *invalid_list)
{
	struct kvm_mmu_page *sp;

	if (list_empty(invalid_list))
		return;

	kvm_flush_remote_tlbs(kvm);

	do {
		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
		WARN_ON(!sp->role.invalid || sp->root_count);
		kvm_mmu_free_page(kvm, sp);
	} while (!list_empty(invalid_list));

}

/*
 * Changing the number of mmu pages allocated to the vm
 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock
 */
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
{
1672
	int used_pages;
1673
	LIST_HEAD(invalid_list);
1674 1675 1676 1677

	used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
	used_pages = max(0, used_pages);

	/*
	 * If we set the number of mmu pages to be smaller than the
	 * number of active pages, we must free some mmu pages before we
	 * change the value.
	 */

	if (used_pages > kvm_nr_mmu_pages) {
1685 1686
		while (used_pages > kvm_nr_mmu_pages &&
			!list_empty(&kvm->arch.active_mmu_pages)) {
1687 1688
			struct kvm_mmu_page *page;

1689
			page = container_of(kvm->arch.active_mmu_pages.prev,
1690
					    struct kvm_mmu_page, link);
1691 1692
			used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
							       &invalid_list);
1693
		}
1694
		kvm_mmu_commit_zap_page(kvm, &invalid_list);
1695
		kvm_nr_mmu_pages = used_pages;
1696
		kvm->arch.n_free_mmu_pages = 0;
1697 1698
	}
	else
1699 1700
		kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
					 - kvm->arch.n_alloc_mmu_pages;
1701

1702
	kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
1703 1704
}

1705
static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1706
{
1707
	struct kvm_mmu_page *sp;
1708
	struct hlist_node *node;
1709
	LIST_HEAD(invalid_list);
1710 1711
	int r;

1712
	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1713
	r = 0;
1714 1715

	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1716 1717 1718
		pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
			 sp->role.word);
		r = 1;
1719
		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1720
	}
1721
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
1722
	return r;
1723 1724
}

1725
static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1726
{
1727
	struct kvm_mmu_page *sp;
1728
	struct hlist_node *node;
1729
	LIST_HEAD(invalid_list);
1730

1731
	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1732 1733
		pgprintk("%s: zap %lx %x\n",
			 __func__, gfn, sp->role.word);
1734
		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1735
	}
1736
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
1737 1738
}

1739
static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
{
1741
	int slot = memslot_id(kvm, gfn);
1742
	struct kvm_mmu_page *sp = page_header(__pa(pte));

1744
	__set_bit(slot, sp->slot_bitmap);
}

1747 1748 1749 1750 1751 1752 1753 1754 1755 1756
static void mmu_convert_notrap(struct kvm_mmu_page *sp)
{
	int i;
	u64 *pt = sp->spt;

	if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
		return;

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
		if (pt[i] == shadow_notrap_nonpresent_pte)
			__set_spte(&pt[i], shadow_trap_nonpresent_pte);
1758 1759 1760
	}
}

/*
 * The function is based on mtrr_type_lookup() in
 * arch/x86/kernel/cpu/mtrr/generic.c
 */
static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
			 u64 start, u64 end)
{
	int i;
	u64 base, mask;
	u8 prev_match, curr_match;
	int num_var_ranges = KVM_NR_VAR_MTRR;

	if (!mtrr_state->enabled)
		return 0xFF;

	/* Make end inclusive, instead of exclusive */
	end--;

	/* Look in fixed ranges. Just return the type as per start */
	if (mtrr_state->have_fixed && (start < 0x100000)) {
		int idx;

		if (start < 0x80000) {
			idx = 0;
			idx += (start >> 16);
			return mtrr_state->fixed_ranges[idx];
		} else if (start < 0xC0000) {
			idx = 1 * 8;
			idx += ((start - 0x80000) >> 14);
			return mtrr_state->fixed_ranges[idx];
		} else if (start < 0x1000000) {
			idx = 3 * 8;
			idx += ((start - 0xC0000) >> 12);
			return mtrr_state->fixed_ranges[idx];
		}
	}

	/*
	 * Look in variable ranges
	 * Look for multiple ranges matching this address and pick type
	 * as per MTRR precedence
	 */
	if (!(mtrr_state->enabled & 2))
		return mtrr_state->def_type;

	prev_match = 0xFF;
	for (i = 0; i < num_var_ranges; ++i) {
		unsigned short start_state, end_state;

		if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
			continue;

		base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
		       (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
		mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
		       (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);

		start_state = ((start & mask) == (base & mask));
		end_state = ((end & mask) == (base & mask));
		if (start_state != end_state)
			return 0xFE;

		if ((start & mask) != (base & mask))
			continue;

		curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
		if (prev_match == 0xFF) {
			prev_match = curr_match;
			continue;
		}

		if (prev_match == MTRR_TYPE_UNCACHABLE ||
		    curr_match == MTRR_TYPE_UNCACHABLE)
			return MTRR_TYPE_UNCACHABLE;

		if ((prev_match == MTRR_TYPE_WRBACK &&
		     curr_match == MTRR_TYPE_WRTHROUGH) ||
		    (prev_match == MTRR_TYPE_WRTHROUGH &&
		     curr_match == MTRR_TYPE_WRBACK)) {
			prev_match = MTRR_TYPE_WRTHROUGH;
			curr_match = MTRR_TYPE_WRTHROUGH;
		}

		if (prev_match != curr_match)
			return MTRR_TYPE_UNCACHABLE;
	}

	if (prev_match != 0xFF)
		return prev_match;

	return mtrr_state->def_type;
}

1854
u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1855 1856 1857 1858 1859 1860 1861 1862 1863
{
	u8 mtrr;

	mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
			     (gfn << PAGE_SHIFT) + PAGE_SIZE);
	if (mtrr == 0xfe || mtrr == 0xff)
		mtrr = MTRR_TYPE_WRBACK;
	return mtrr;
}
1864
EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
1865

1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876
static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
	trace_kvm_mmu_unsync_page(sp);
	++vcpu->kvm->stat.mmu_unsync;
	sp->unsync = 1;

	kvm_mmu_mark_parents_unsync(sp);
	mmu_convert_notrap(sp);
}

static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	struct kvm_mmu_page *s;
	struct hlist_node *node;

	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
		if (s->unsync)
			continue;
		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
		__kvm_unsync_page(vcpu, s);
	}
}

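/*
 * Decide whether a guest page that is being shadowed must be write
 * protected.  Returns 1 if write protection is required; returns 0 and
 * marks the matching shadow pages unsync when out-of-sync shadowing is
 * allowed (can_unsync, last-level pages only, oos_shadow enabled).
 */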
static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
				  bool can_unsync)
{
	struct kvm_mmu_page *s;
	struct hlist_node *node;
	bool need_unsync = false;

	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
		if (!can_unsync)
			return 1;

		if (s->role.level != PT_PAGE_TABLE_LEVEL)
			return 1;

		if (!need_unsync && !s->unsync) {
			if (!oos_shadow)
				return 1;
			need_unsync = true;
		}
	}
	if (need_unsync)
		kvm_unsync_pages(vcpu, gfn);
	return 0;
}

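/*
 * Build and install a shadow pte for the given gfn/pfn translation.
 * Returns non-zero if the new spte could not be made writable (the gfn
 * has to remain write protected); the caller then flushes the TLB and,
 * on a write fault, falls back to emulating the access.
 */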
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
		    unsigned pte_access, int user_fault,
		    int write_fault, int dirty, int level,
		    gfn_t gfn, pfn_t pfn, bool speculative,
		    bool can_unsync, bool reset_host_protection)
{
	u64 spte;
	int ret = 0;

	/*
	 * We don't set the accessed bit, since we sometimes want to see
	 * whether the guest actually used the pte (in order to detect
	 * demand paging).
	 */
	spte = shadow_base_present_pte | shadow_dirty_mask;
	if (!speculative)
		spte |= shadow_accessed_mask;
	if (!dirty)
		pte_access &= ~ACC_WRITE_MASK;
	if (pte_access & ACC_EXEC_MASK)
		spte |= shadow_x_mask;
	else
		spte |= shadow_nx_mask;
	if (pte_access & ACC_USER_MASK)
		spte |= shadow_user_mask;
	if (level > PT_PAGE_TABLE_LEVEL)
		spte |= PT_PAGE_SIZE_MASK;
	if (tdp_enabled)
		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
			kvm_is_mmio_pfn(pfn));

	if (reset_host_protection)
		spte |= SPTE_HOST_WRITEABLE;

	spte |= (u64)pfn << PAGE_SHIFT;

	if ((pte_access & ACC_WRITE_MASK)
	    || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
		&& !user_fault)) {

		if (level > PT_PAGE_TABLE_LEVEL &&
		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
			ret = 1;
			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
			goto done;
		}

		spte |= PT_WRITABLE_MASK;

		if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
			spte &= ~PT_USER_MASK;

		/*
		 * Optimization: for pte sync, if spte was writable the hash
		 * lookup is unnecessary (and expensive). Write protection
		 * is the responsibility of mmu_get_page / kvm_sync_page.
		 * Same reasoning can be applied to dirty page accounting.
		 */
		if (!can_unsync && is_writable_pte(*sptep))
			goto set_pte;

		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
			pgprintk("%s: found shadow page for %lx, marking ro\n",
				 __func__, gfn);
			ret = 1;
			pte_access &= ~ACC_WRITE_MASK;
			if (is_writable_pte(spte))
				spte &= ~PT_WRITABLE_MASK;
		}
	}

	if (pte_access & ACC_WRITE_MASK)
		mark_page_dirty(vcpu->kvm, gfn);

set_pte:
	if (is_writable_pte(*sptep) && !is_writable_pte(spte))
		kvm_set_pfn_dirty(pfn);
	update_spte(sptep, spte);
done:
	return ret;
}

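/*
 * Wrapper around set_spte() that also tears down any previous mapping
 * at this slot (e.g. a pte page being replaced by a large pte, or a
 * pfn change), maintains the rmap and large-page statistics, and
 * records the last speculatively updated pte.
 */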
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
			 unsigned pt_access, unsigned pte_access,
			 int user_fault, int write_fault, int dirty,
			 int *ptwrite, int level, gfn_t gfn,
			 pfn_t pfn, bool speculative,
			 bool reset_host_protection)
{
	int was_rmapped = 0;
	int rmap_count;

	pgprintk("%s: spte %llx access %x write_fault %d"
		 " user_fault %d gfn %lx\n",
		 __func__, *sptep, pt_access,
		 write_fault, user_fault, gfn);

	if (is_rmap_spte(*sptep)) {
		/*
		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
		 * the parent of the now unreachable PTE.
		 */
		if (level > PT_PAGE_TABLE_LEVEL &&
		    !is_large_pte(*sptep)) {
			struct kvm_mmu_page *child;
			u64 pte = *sptep;

			child = page_header(pte & PT64_BASE_ADDR_MASK);
			mmu_page_remove_parent_pte(child, sptep);
			__set_spte(sptep, shadow_trap_nonpresent_pte);
			kvm_flush_remote_tlbs(vcpu->kvm);
		} else if (pfn != spte_to_pfn(*sptep)) {
			pgprintk("hfn old %lx new %lx\n",
				 spte_to_pfn(*sptep), pfn);
			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
			kvm_flush_remote_tlbs(vcpu->kvm);
		} else
			was_rmapped = 1;
	}

	if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
		      dirty, level, gfn, pfn, speculative, true,
		      reset_host_protection)) {
		if (write_fault)
			*ptwrite = 1;
		kvm_mmu_flush_tlb(vcpu);
	}

	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
	pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
		 is_large_pte(*sptep)? "2MB" : "4kB",
		 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
		 *sptep, sptep);
	if (!was_rmapped && is_large_pte(*sptep))
		++vcpu->kvm->stat.lpages;

	page_header_update_slot(vcpu->kvm, sptep, gfn);
	if (!was_rmapped) {
		rmap_count = rmap_add(vcpu, sptep, gfn);
		if (rmap_count > RMAP_RECYCLE_THRESHOLD)
			rmap_recycle(vcpu, sptep, gfn);
	}
	kvm_release_pfn_clean(pfn);
	if (speculative) {
		vcpu->arch.last_pte_updated = sptep;
		vcpu->arch.last_pte_gfn = gfn;
	}
}

static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}

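/*
 * Walk the shadow (or TDP) page table for a guest physical address and
 * install the final translation at the requested level, allocating
 * intermediate shadow pages as needed.  Returns the pt_write result to
 * be propagated to the page fault handler, or -ENOMEM.
 */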
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
			int level, gfn_t gfn, pfn_t pfn)
{
	struct kvm_shadow_walk_iterator iterator;
	struct kvm_mmu_page *sp;
	int pt_write = 0;
	gfn_t pseudo_gfn;

	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
		if (iterator.level == level) {
			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
				     0, write, 1, &pt_write,
				     level, gfn, pfn, false, true);
			++vcpu->stat.pf_fixed;
			break;
		}

		if (*iterator.sptep == shadow_trap_nonpresent_pte) {
			u64 base_addr = iterator.addr;

			base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
			pseudo_gfn = base_addr >> PAGE_SHIFT;
			sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
					      iterator.level - 1,
					      1, ACC_ALL, iterator.sptep);
			if (!sp) {
				pgprintk("nonpaging_map: ENOMEM\n");
				kvm_release_pfn_clean(pfn);
				return -ENOMEM;
			}

			__set_spte(iterator.sptep,
				   __pa(sp->spt)
				   | PT_PRESENT_MASK | PT_WRITABLE_MASK
				   | shadow_user_mask | shadow_x_mask);
		}
	}
	return pt_write;
}

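/*
 * Error-pfn handling for guest faults: a hardware-poisoned page is
 * reported by touching its user mapping so the kernel delivers SIGBUS,
 * while a plain fault pfn is reported as -EFAULT to the caller.
 */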
static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
{
	char buf[1];
	void __user *hva;
	int r;

	/* Touch the page, so a SIGBUS is sent */
	hva = (void __user *)gfn_to_hva(kvm, gfn);
	r = copy_from_user(buf, hva, 1);
}

static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
{
	kvm_release_pfn_clean(pfn);
	if (is_hwpoison_pfn(pfn)) {
		kvm_send_hwpoison_signal(kvm, gfn);
		return 0;
	} else if (is_fault_pfn(pfn))
		return -EFAULT;

	return 1;
}

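/*
 * Handle a fault while the guest runs unpaged: the faulting address is
 * treated directly as a guest frame number, the backing pfn is looked
 * up, and the mapping is installed under mmu_lock with an mmu_notifier
 * retry check.
 */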
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
	int r;
	int level;
	pfn_t pfn;
	unsigned long mmu_seq;

	level = mapping_level(vcpu, gfn);

	/*
	 * This path builds a PAE pagetable - so we can map 2mb pages at
	 * maximum. Therefore check if the level is larger than that.
	 */
	if (level > PT_DIRECTORY_LEVEL)
		level = PT_DIRECTORY_LEVEL;

	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();
	pfn = gfn_to_pfn(vcpu->kvm, gfn);

	/* mmio */
	if (is_error_pfn(pfn))
		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);

	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu, mmu_seq))
		goto out_unlock;
	kvm_mmu_free_some_pages(vcpu);
	r = __direct_map(vcpu, v, write, level, gfn, pfn);
	spin_unlock(&vcpu->kvm->mmu_lock);

	return r;

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return 0;
}


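/*
 * Drop the reference(s) this vcpu holds on its shadow root pages (one
 * PML4 root or four PAE roots) and zap roots that have become invalid,
 * leaving root_hpa marked INVALID_PAGE.
 */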
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
	int i;
	struct kvm_mmu_page *sp;
	LIST_HEAD(invalid_list);

	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
		return;
	spin_lock(&vcpu->kvm->mmu_lock);
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;

		sp = page_header(root);
		--sp->root_count;
		if (!sp->root_count && sp->role.invalid) {
			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
			kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
		}
		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
		spin_unlock(&vcpu->kvm->mmu_lock);
		return;
	}
	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->arch.mmu.pae_root[i];

		if (root) {
			root &= PT64_BASE_ADDR_MASK;
			sp = page_header(root);
			--sp->root_count;
			if (!sp->root_count && sp->role.invalid)
				kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
							 &invalid_list);
		}
		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
	}
	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
	spin_unlock(&vcpu->kvm->mmu_lock);
	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
}

static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
{
	int ret = 0;

	if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		ret = 1;
	}

	return ret;
}

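/*
 * Allocate shadow root pages for the current paging mode: a single
 * PML4-level root, or four PAE roots built from the guest PDPTEs.
 * Returns 1 if a root gfn is not backed by a visible memslot (a triple
 * fault has been requested), 0 on success.
 */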
static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
	int i;
	gfn_t root_gfn;
	struct kvm_mmu_page *sp;
	int direct = 0;
	u64 pdptr;

	root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;

	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;

		ASSERT(!VALID_PAGE(root));
		if (mmu_check_root(vcpu, root_gfn))
			return 1;
		if (tdp_enabled) {
			direct = 1;
			root_gfn = 0;
		}
		spin_lock(&vcpu->kvm->mmu_lock);
		kvm_mmu_free_some_pages(vcpu);
		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
				      PT64_ROOT_LEVEL, direct,
				      ACC_ALL, NULL);
		root = __pa(sp->spt);
		++sp->root_count;
		spin_unlock(&vcpu->kvm->mmu_lock);
		vcpu->arch.mmu.root_hpa = root;
		return 0;
	}
	direct = !is_paging(vcpu);
	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->arch.mmu.pae_root[i];

		ASSERT(!VALID_PAGE(root));
		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
			pdptr = kvm_pdptr_read(vcpu, i);
			if (!is_present_gpte(pdptr)) {
				vcpu->arch.mmu.pae_root[i] = 0;
				continue;
			}
			root_gfn = pdptr >> PAGE_SHIFT;
		} else if (vcpu->arch.mmu.root_level == 0)
			root_gfn = 0;
		if (mmu_check_root(vcpu, root_gfn))
			return 1;
		if (tdp_enabled) {
			direct = 1;
			root_gfn = i << 30;
		}
		spin_lock(&vcpu->kvm->mmu_lock);
		kvm_mmu_free_some_pages(vcpu);
		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
				      PT32_ROOT_LEVEL, direct,
				      ACC_ALL, NULL);
		root = __pa(sp->spt);
		++sp->root_count;
		spin_unlock(&vcpu->kvm->mmu_lock);

		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
	}
	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
	return 0;
}

static void mmu_sync_roots(struct kvm_vcpu *vcpu)
{
	int i;
	struct kvm_mmu_page *sp;

	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
		return;
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;
		sp = page_header(root);
		mmu_sync_children(vcpu, sp);
		return;
	}
	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->arch.mmu.pae_root[i];

		if (root && VALID_PAGE(root)) {
			root &= PT64_BASE_ADDR_MASK;
			sp = page_header(root);
			mmu_sync_children(vcpu, sp);
		}
	}
}

void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
{
	spin_lock(&vcpu->kvm->mmu_lock);
	mmu_sync_roots(vcpu);
	spin_unlock(&vcpu->kvm->mmu_lock);
}

static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
				  u32 access, u32 *error)
{
	if (error)
		*error = 0;
	return vaddr;
}

static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
				u32 error_code)
{
	gfn_t gfn;
	int r;

	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	ASSERT(vcpu);
	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));

	gfn = gva >> PAGE_SHIFT;

	return nonpaging_map(vcpu, gva & PAGE_MASK,
			     error_code & PFERR_WRITE_MASK, gfn);
}

static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
				u32 error_code)
{
	pfn_t pfn;
	int r;
	int level;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	unsigned long mmu_seq;

	ASSERT(vcpu);
	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	level = mapping_level(vcpu, gfn);

	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();
	pfn = gfn_to_pfn(vcpu->kvm, gfn);
	if (is_error_pfn(pfn))
		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu, mmu_seq))
		goto out_unlock;
	kvm_mmu_free_some_pages(vcpu);
	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
			 level, gfn, pfn);
	spin_unlock(&vcpu->kvm->mmu_lock);

	return r;

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return 0;
}

static void nonpaging_free(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}

static int nonpaging_init_context(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->arch.mmu;

	context->new_cr3 = nonpaging_new_cr3;
	context->page_fault = nonpaging_page_fault;
	context->gva_to_gpa = nonpaging_gva_to_gpa;
	context->free = nonpaging_free;
	context->prefetch_page = nonpaging_prefetch_page;
	context->sync_page = nonpaging_sync_page;
	context->invlpg = nonpaging_invlpg;
	context->root_level = 0;
	context->shadow_root_level = PT32E_ROOT_LEVEL;
	context->root_hpa = INVALID_PAGE;
	return 0;
}

void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
	++vcpu->stat.tlb_flush;
	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}

static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
	pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
	mmu_free_roots(vcpu);
}

static void inject_page_fault(struct kvm_vcpu *vcpu,
			      u64 addr,
			      u32 err_code)
{
	kvm_inject_page_fault(vcpu, addr, err_code);
}

static void paging_free(struct kvm_vcpu *vcpu)
{
	nonpaging_free(vcpu);
}

static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
{
	int bit7;

	bit7 = (gpte >> 7) & 1;
	return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
}

#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"
#undef PTTYPE

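/*
 * Precompute, for each guest page table level, the bits that must be
 * zero in a guest pte.  Index [0] covers ordinary entries and [1]
 * large/PSE entries.  As an illustration, with 2-level 32-bit paging
 * and PSE36 a 4MB PDE reserves bits 21:17, while in long mode with a
 * 40-bit MAXPHYADDR every entry reserves bits 51:40, plus bit 63 when
 * NX is not enabled.
 */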
static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
{
	struct kvm_mmu *context = &vcpu->arch.mmu;
	int maxphyaddr = cpuid_maxphyaddr(vcpu);
	u64 exb_bit_rsvd = 0;

	if (!is_nx(vcpu))
		exb_bit_rsvd = rsvd_bits(63, 63);
	switch (level) {
	case PT32_ROOT_LEVEL:
		/* no rsvd bits for 2 level 4K page table entries */
		context->rsvd_bits_mask[0][1] = 0;
		context->rsvd_bits_mask[0][0] = 0;
		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];

		if (!is_pse(vcpu)) {
			context->rsvd_bits_mask[1][1] = 0;
			break;
		}

		if (is_cpuid_PSE36())
			/* 36bits PSE 4MB page */
			context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
		else
			/* 32 bits PSE 4MB page */
			context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
		break;
	case PT32E_ROOT_LEVEL:
		context->rsvd_bits_mask[0][2] =
			rsvd_bits(maxphyaddr, 63) |
			rsvd_bits(7, 8) | rsvd_bits(1, 2);	/* PDPTE */
		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 62);	/* PDE */
		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 62);	/* PTE */
		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 62) |
			rsvd_bits(13, 20);		/* large page */
		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
		break;
	case PT64_ROOT_LEVEL:
		context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
		context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 51);
		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 51);
		context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
		context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 51) |
			rsvd_bits(13, 29);
		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 51) |
			rsvd_bits(13, 20);		/* large page */
		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
		break;
	}
}

static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
{
	struct kvm_mmu *context = &vcpu->arch.mmu;

	ASSERT(is_pae(vcpu));
	context->new_cr3 = paging_new_cr3;
	context->page_fault = paging64_page_fault;
	context->gva_to_gpa = paging64_gva_to_gpa;
	context->prefetch_page = paging64_prefetch_page;
	context->sync_page = paging64_sync_page;
	context->invlpg = paging64_invlpg;
	context->free = paging_free;
	context->root_level = level;
	context->shadow_root_level = level;
	context->root_hpa = INVALID_PAGE;
	return 0;
}

static int paging64_init_context(struct kvm_vcpu *vcpu)
{
	reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
}

static int paging32_init_context(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->arch.mmu;

	reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
	context->new_cr3 = paging_new_cr3;
	context->page_fault = paging32_page_fault;
	context->gva_to_gpa = paging32_gva_to_gpa;
	context->free = paging_free;
	context->prefetch_page = paging32_prefetch_page;
	context->sync_page = paging32_sync_page;
	context->invlpg = paging32_invlpg;
	context->root_level = PT32_ROOT_LEVEL;
	context->shadow_root_level = PT32E_ROOT_LEVEL;
	context->root_hpa = INVALID_PAGE;
	return 0;
}

static int paging32E_init_context(struct kvm_vcpu *vcpu)
{
	reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
	return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
}

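/*
 * Set up the MMU context when two dimensional paging (EPT/NPT) is in
 * use: faults are handled by tdp_page_fault, the shadow level comes
 * from the hardware, and gva_to_gpa still follows the guest's current
 * paging mode so emulation and reserved-bit checks keep working.
 */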
static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->arch.mmu;

	context->new_cr3 = nonpaging_new_cr3;
	context->page_fault = tdp_page_fault;
	context->free = nonpaging_free;
	context->prefetch_page = nonpaging_prefetch_page;
	context->sync_page = nonpaging_sync_page;
	context->invlpg = nonpaging_invlpg;
	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
	context->root_hpa = INVALID_PAGE;

	if (!is_paging(vcpu)) {
		context->gva_to_gpa = nonpaging_gva_to_gpa;
		context->root_level = 0;
	} else if (is_long_mode(vcpu)) {
		reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
		context->gva_to_gpa = paging64_gva_to_gpa;
		context->root_level = PT64_ROOT_LEVEL;
	} else if (is_pae(vcpu)) {
		reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
		context->gva_to_gpa = paging64_gva_to_gpa;
		context->root_level = PT32E_ROOT_LEVEL;
	} else {
		reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
		context->gva_to_gpa = paging32_gva_to_gpa;
		context->root_level = PT32_ROOT_LEVEL;
	}

	return 0;
}

static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
	int r;

	ASSERT(vcpu);
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));

	if (!is_paging(vcpu))
		r = nonpaging_init_context(vcpu);
	else if (is_long_mode(vcpu))
		r = paging64_init_context(vcpu);
	else if (is_pae(vcpu))
		r = paging32E_init_context(vcpu);
	else
		r = paging32_init_context(vcpu);

	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
	vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);

	return r;
}

static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
	vcpu->arch.update_pte.pfn = bad_pfn;

	if (tdp_enabled)
		return init_kvm_tdp_mmu(vcpu);
	else
		return init_kvm_softmmu(vcpu);
}

static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
	if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
		/* mmu.free() should set root_hpa = INVALID_PAGE */
		vcpu->arch.mmu.free(vcpu);
}

int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
	destroy_kvm_mmu(vcpu);
	return init_kvm_mmu(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);

int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
	int r;

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		goto out;
	r = mmu_alloc_roots(vcpu);
	spin_lock(&vcpu->kvm->mmu_lock);
	mmu_sync_roots(vcpu);
	spin_unlock(&vcpu->kvm->mmu_lock);
	if (r)
		goto out;
	/* set_cr3() should ensure TLB has been flushed */
	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
out:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_load);

void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}

static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *sp,
				  u64 *spte)
{
	u64 pte;
	struct kvm_mmu_page *child;

	pte = *spte;
	if (is_shadow_present_pte(pte)) {
		if (is_last_spte(pte, sp->role.level))
			drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
		else {
			child = page_header(pte & PT64_BASE_ADDR_MASK);
			mmu_page_remove_parent_pte(child, spte);
		}
	}
	__set_spte(spte, shadow_trap_nonpresent_pte);
	if (is_large_pte(pte))
		--vcpu->kvm->stat.lpages;
}

static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *sp,
				  u64 *spte,
				  const void *new)
{
	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
		++vcpu->kvm->stat.mmu_pde_zapped;
		return;
	}

	if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
		return;

	++vcpu->kvm->stat.mmu_pte_updated;
	if (!sp->role.cr4_pae)
		paging32_update_pte(vcpu, sp, spte, new);
	else
		paging64_update_pte(vcpu, sp, spte, new);
}

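/*
 * After a shadow pte has been overwritten, other vcpus only need a TLB
 * flush if a previously present translation was removed, repointed, or
 * lost a permission bit (present, write, user or exec).
 */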
static bool need_remote_flush(u64 old, u64 new)
{
	if (!is_shadow_present_pte(old))
		return false;
	if (!is_shadow_present_pte(new))
		return true;
	if ((old ^ new) & PT64_BASE_ADDR_MASK)
		return true;
	old ^= PT64_NX_MASK;
	new ^= PT64_NX_MASK;
	return (old & ~new & PT64_PERM_MASK) != 0;
}

static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
				    bool remote_flush, bool local_flush)
{
	if (zap_page)
		return;

	if (remote_flush)
		kvm_flush_remote_tlbs(vcpu->kvm);
	else if (local_flush)
		kvm_mmu_flush_tlb(vcpu);
}

static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
{
	u64 *spte = vcpu->arch.last_pte_updated;

	return !!(spte && (*spte & shadow_accessed_mask));
}

static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
					  u64 gpte)
{
	gfn_t gfn;
	pfn_t pfn;

	if (!is_present_gpte(gpte))
		return;
	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;

	vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();
	pfn = gfn_to_pfn(vcpu->kvm, gfn);

	if (is_error_pfn(pfn)) {
		kvm_release_pfn_clean(pfn);
		return;
	}
	vcpu->arch.update_pte.gfn = gfn;
	vcpu->arch.update_pte.pfn = pfn;
}

static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	u64 *spte = vcpu->arch.last_pte_updated;

	if (spte
	    && vcpu->arch.last_pte_gfn == gfn
	    && shadow_accessed_mask
	    && !(*spte & shadow_accessed_mask)
	    && is_shadow_present_pte(*spte))
		set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
}

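/*
 * Called when the guest writes to a page that is shadowed as a page
 * table.  The write is applied to the affected sptes in place when it
 * looks like a real pte update; misaligned or repeated ("flooded")
 * writes instead cause the shadow page to be zapped, since the page is
 * probably no longer being used as a page table.
 */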
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
		       const u8 *new, int bytes,
		       bool guest_initiated)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	union kvm_mmu_page_role mask = { .word = 0 };
	struct kvm_mmu_page *sp;
	struct hlist_node *node;
	LIST_HEAD(invalid_list);
	u64 entry, gentry;
	u64 *spte;
	unsigned offset = offset_in_page(gpa);
	unsigned pte_size;
	unsigned page_offset;
	unsigned misaligned;
	unsigned quadrant;
	int level;
	int flooded = 0;
	int npte;
	int r;
	int invlpg_counter;
	bool remote_flush, local_flush, zap_page;

	zap_page = remote_flush = local_flush = false;

	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);

	invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);

	/*
	 * Assume that the pte write is on a page table of the same type
	 * as the current vcpu paging mode.  This is nearly always true
	 * (might be false while changing modes).  Note it is verified later
	 * by update_pte().
	 */
	if ((is_pae(vcpu) && bytes == 4) || !new) {
		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
		if (is_pae(vcpu)) {
			gpa &= ~(gpa_t)7;
			bytes = 8;
		}
		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
		if (r)
			gentry = 0;
		new = (const u8 *)&gentry;
	}

	switch (bytes) {
	case 4:
		gentry = *(const u32 *)new;
		break;
	case 8:
		gentry = *(const u64 *)new;
		break;
	default:
		gentry = 0;
		break;
	}

	mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
	spin_lock(&vcpu->kvm->mmu_lock);
	if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
		gentry = 0;
	kvm_mmu_access_page(vcpu, gfn);
	kvm_mmu_free_some_pages(vcpu);
	++vcpu->kvm->stat.mmu_pte_write;
	kvm_mmu_audit(vcpu, "pre pte write");
	if (guest_initiated) {
		if (gfn == vcpu->arch.last_pt_write_gfn
		    && !last_updated_pte_accessed(vcpu)) {
			++vcpu->arch.last_pt_write_count;
			if (vcpu->arch.last_pt_write_count >= 3)
				flooded = 1;
		} else {
			vcpu->arch.last_pt_write_gfn = gfn;
			vcpu->arch.last_pt_write_count = 1;
			vcpu->arch.last_pte_updated = NULL;
		}
	}

	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
		pte_size = sp->role.cr4_pae ? 8 : 4;
		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
		misaligned |= bytes < 4;
		if (misaligned || flooded) {
			/*
			 * Misaligned accesses are too much trouble to fix
			 * up; also, they usually indicate a page is not used
			 * as a page table.
			 *
			 * If we're seeing too many writes to a page,
			 * it may no longer be a page table, or we may be
			 * forking, in which case it is better to unmap the
			 * page.
			 */
			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
				 gpa, bytes, sp->role.word);
			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
						     &invalid_list);
			++vcpu->kvm->stat.mmu_flooded;
			continue;
		}
		page_offset = offset;
		level = sp->role.level;
		npte = 1;
		if (!sp->role.cr4_pae) {
			page_offset <<= 1;	/* 32->64 */
			/*
			 * A 32-bit pde maps 4MB while the shadow pdes map
			 * only 2MB.  So we need to double the offset again
			 * and zap two pdes instead of one.
			 */
			if (level == PT32_ROOT_LEVEL) {
				page_offset &= ~7; /* kill rounding error */
				page_offset <<= 1;
				npte = 2;
			}
			quadrant = page_offset >> PAGE_SHIFT;
			page_offset &= ~PAGE_MASK;
			if (quadrant != sp->role.quadrant)
				continue;
		}
		local_flush = true;
		spte = &sp->spt[page_offset / sizeof(*spte)];
		while (npte--) {
			entry = *spte;
			mmu_pte_write_zap_pte(vcpu, sp, spte);
			if (gentry &&
			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
			      & mask.word))
				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
			if (!remote_flush && need_remote_flush(entry, *spte))
				remote_flush = true;
			++spte;
		}
	}
	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
	kvm_mmu_audit(vcpu, "post pte write");
	spin_unlock(&vcpu->kvm->mmu_lock);
	if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
		kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
		vcpu->arch.update_pte.pfn = bad_pfn;
	}
}

int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
	gpa_t gpa;
	int r;

	if (tdp_enabled)
		return 0;

	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);

	spin_lock(&vcpu->kvm->mmu_lock);
	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
	spin_unlock(&vcpu->kvm->mmu_lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);

void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
	int free_pages;
	LIST_HEAD(invalid_list);

	free_pages = vcpu->kvm->arch.n_free_mmu_pages;
	while (free_pages < KVM_REFILL_PAGES &&
	       !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
		struct kvm_mmu_page *sp;

		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
				  struct kvm_mmu_page, link);
		free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
						       &invalid_list);
		++vcpu->kvm->stat.mmu_recycled;
	}
	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}

int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
{
	int r;
	enum emulation_result er;

	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
	if (r < 0)
		goto out;

	if (!r) {
		r = 1;
		goto out;
	}

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		goto out;

	er = emulate_instruction(vcpu, cr2, error_code, 0);

	switch (er) {
	case EMULATE_DONE:
		return 1;
	case EMULATE_DO_MMIO:
		++vcpu->stat.mmio_exits;
		/* fall through */
	case EMULATE_FAIL:
		return 0;
	default:
		BUG();
	}
out:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);

void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
	vcpu->arch.mmu.invlpg(vcpu, gva);
	kvm_mmu_flush_tlb(vcpu);
	++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);

void kvm_enable_tdp(void)
{
	tdp_enabled = true;
}
EXPORT_SYMBOL_GPL(kvm_enable_tdp);

void kvm_disable_tdp(void)
{
	tdp_enabled = false;
}
EXPORT_SYMBOL_GPL(kvm_disable_tdp);

static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
	free_page((unsigned long)vcpu->arch.mmu.pae_root);
}

static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
	struct page *page;
	int i;

	ASSERT(vcpu);

	/*
	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
	 * Therefore we need to allocate shadow page tables in the first
	 * 4GB of memory, which happens to fit the DMA32 zone.
	 */
	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
	if (!page)
		return -ENOMEM;

	vcpu->arch.mmu.pae_root = page_address(page);
	for (i = 0; i < 4; ++i)
		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;

	return 0;
}

int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));

	return alloc_mmu_pages(vcpu);
}

int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));

	return init_kvm_mmu(vcpu);
}

void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);

	destroy_kvm_mmu(vcpu);
	free_mmu_pages(vcpu);
	mmu_free_memory_caches(vcpu);
}

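/*
 * Write protect every spte that maps a page belonging to the given
 * memory slot, then flush remote TLBs.  Typically used when dirty
 * logging is enabled for the slot.
 */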
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
	struct kvm_mmu_page *sp;

	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
		int i;
		u64 *pt;

		if (!test_bit(slot, sp->slot_bitmap))
			continue;

		pt = sp->spt;
		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
			/* avoid RMW */
			if (is_writable_pte(pt[i]))
				pt[i] &= ~PT_WRITABLE_MASK;
	}
	kvm_flush_remote_tlbs(kvm);
}

void kvm_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *sp, *node;
	LIST_HEAD(invalid_list);

	spin_lock(&kvm->mmu_lock);
restart:
	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
			goto restart;

	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	spin_unlock(&kvm->mmu_lock);
}

static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
					       struct list_head *invalid_list)
{
	struct kvm_mmu_page *page;

	page = container_of(kvm->arch.active_mmu_pages.prev,
			    struct kvm_mmu_page, link);
	return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
}

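/*
 * Memory shrinker callback: when the host is under pressure, pick one
 * VM off vm_list, zap a shadow page from it, and rotate it to the tail
 * so successive shrink calls spread the cost across VMs.  Returns the
 * total number of in-use shadow pages as the cache size.
 */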
static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
{
	struct kvm *kvm;
	struct kvm *kvm_freed = NULL;
	int cache_count = 0;

	spin_lock(&kvm_lock);

	list_for_each_entry(kvm, &vm_list, vm_list) {
		int npages, idx, freed_pages;
		LIST_HEAD(invalid_list);

		idx = srcu_read_lock(&kvm->srcu);
		spin_lock(&kvm->mmu_lock);
		npages = kvm->arch.n_alloc_mmu_pages -
			 kvm->arch.n_free_mmu_pages;
		cache_count += npages;
		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
							  &invalid_list);
			cache_count -= freed_pages;
			kvm_freed = kvm;
		}
		nr_to_scan--;

		kvm_mmu_commit_zap_page(kvm, &invalid_list);
		spin_unlock(&kvm->mmu_lock);
		srcu_read_unlock(&kvm->srcu, idx);
	}
	if (kvm_freed)
		list_move_tail(&kvm_freed->vm_list, &vm_list);

	spin_unlock(&kvm_lock);

	return cache_count;
}

static struct shrinker mmu_shrinker = {
	.shrink = mmu_shrink,
	.seeks = DEFAULT_SEEKS * 10,
};

static void mmu_destroy_caches(void)
{
	if (pte_chain_cache)
		kmem_cache_destroy(pte_chain_cache);
	if (rmap_desc_cache)
		kmem_cache_destroy(rmap_desc_cache);
	if (mmu_page_header_cache)
		kmem_cache_destroy(mmu_page_header_cache);
}

void kvm_mmu_module_exit(void)
{
	mmu_destroy_caches();
	unregister_shrinker(&mmu_shrinker);
}

int kvm_mmu_module_init(void)
{
	pte_chain_cache = kmem_cache_create("kvm_pte_chain",
					    sizeof(struct kvm_pte_chain),
					    0, 0, NULL);
	if (!pte_chain_cache)
		goto nomem;
	rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
					    sizeof(struct kvm_rmap_desc),
					    0, 0, NULL);
	if (!rmap_desc_cache)
		goto nomem;

	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
						  sizeof(struct kvm_mmu_page),
						  0, 0, NULL);
	if (!mmu_page_header_cache)
		goto nomem;

	register_shrinker(&mmu_shrinker);

	return 0;

nomem:
	mmu_destroy_caches();
	return -ENOMEM;
}

/*
 * Calculate mmu pages needed for kvm.
 */
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
{
	int i;
	unsigned int nr_mmu_pages;
	unsigned int  nr_pages = 0;
	struct kvm_memslots *slots;

	slots = kvm_memslots(kvm);

	for (i = 0; i < slots->nmemslots; i++)
		nr_pages += slots->memslots[i].npages;

	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
	nr_mmu_pages = max(nr_mmu_pages,
			(unsigned int) KVM_MIN_ALLOC_MMU_PAGES);

	return nr_mmu_pages;
}

static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
				unsigned len)
{
	if (len > buffer->len)
		return NULL;
	return buffer->ptr;
}

static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
				unsigned len)
{
	void *ret;

	ret = pv_mmu_peek_buffer(buffer, len);
	if (!ret)
		return ret;
	buffer->ptr += len;
	buffer->len -= len;
	buffer->processed += len;
	return ret;
}

static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
			     gpa_t addr, gpa_t value)
{
	int bytes = 8;
	int r;

	if (!is_long_mode(vcpu) && !is_pae(vcpu))
		bytes = 4;

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	if (!emulator_write_phys(vcpu, addr, &value, bytes))
		return -EFAULT;

	return 1;
}

static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
	(void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
	return 1;
}

static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
{
	spin_lock(&vcpu->kvm->mmu_lock);
	mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
	spin_unlock(&vcpu->kvm->mmu_lock);
	return 1;
}

static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
			     struct kvm_pv_mmu_op_buffer *buffer)
{
	struct kvm_mmu_op_header *header;

	header = pv_mmu_peek_buffer(buffer, sizeof *header);
	if (!header)
		return 0;
	switch (header->op) {
	case KVM_MMU_OP_WRITE_PTE: {
		struct kvm_mmu_op_write_pte *wpte;

		wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
		if (!wpte)
			return 0;
		return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
					wpte->pte_val);
	}
	case KVM_MMU_OP_FLUSH_TLB: {
		struct kvm_mmu_op_flush_tlb *ftlb;

		ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
		if (!ftlb)
			return 0;
		return kvm_pv_mmu_flush_tlb(vcpu);
	}
	case KVM_MMU_OP_RELEASE_PT: {
		struct kvm_mmu_op_release_pt *rpt;

		rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
		if (!rpt)
			return 0;
		return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
	}
	default: return 0;
	}
}

int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
		  gpa_t addr, unsigned long *ret)
{
	int r;
	struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;

	buffer->ptr = buffer->buf;
	buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
	buffer->processed = 0;

	r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
	if (r)
		goto out;

	while (buffer->len) {
		r = kvm_pv_mmu_op_one(vcpu, buffer);
		if (r < 0)
			goto out;
		if (r == 0)
			break;
	}

	r = 1;
out:
	*ret = buffer->processed;
	return r;
}

int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
{
	struct kvm_shadow_walk_iterator iterator;
	int nr_sptes = 0;

	spin_lock(&vcpu->kvm->mmu_lock);
	for_each_shadow_entry(vcpu, addr, iterator) {
		sptes[iterator.level-1] = *iterator.sptep;
		nr_sptes++;
		if (!is_shadow_present_pte(*iterator.sptep))
			break;
	}
	spin_unlock(&vcpu->kvm->mmu_lock);

	return nr_sptes;
}
EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);

#ifdef AUDIT

static const char *audit_msg;

static gva_t canonicalize(gva_t gva)
{
#ifdef CONFIG_X86_64
	gva = (long long)(gva << 16) >> 16;
#endif
	return gva;
}


typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);

static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
			    inspect_spte_fn fn)
{
	int i;

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
		u64 ent = sp->spt[i];

		if (is_shadow_present_pte(ent)) {
			if (!is_last_spte(ent, sp->role.level)) {
				struct kvm_mmu_page *child;
				child = page_header(ent & PT64_BASE_ADDR_MASK);
				__mmu_spte_walk(kvm, child, fn);
			} else
				fn(kvm, &sp->spt[i]);
		}
	}
}

static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
{
	int i;
	struct kvm_mmu_page *sp;

	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
		return;
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;
		sp = page_header(root);
		__mmu_spte_walk(vcpu->kvm, sp, fn);
		return;
	}
	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->arch.mmu.pae_root[i];

		if (root && VALID_PAGE(root)) {
			root &= PT64_BASE_ADDR_MASK;
			sp = page_header(root);
			__mmu_spte_walk(vcpu->kvm, sp, fn);
		}
	}
	return;
}

static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
				gva_t va, int level)
{
	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
	int i;
	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
		u64 ent = pt[i];

		if (ent == shadow_trap_nonpresent_pte)
			continue;

		va = canonicalize(va);
		if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
			audit_mappings_page(vcpu, ent, va, level - 1);
		else {
			gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
			gfn_t gfn = gpa >> PAGE_SHIFT;
			pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
			hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;

			if (is_error_pfn(pfn)) {
				kvm_release_pfn_clean(pfn);
				continue;
			}

			if (is_shadow_present_pte(ent)
			    && (ent & PT64_BASE_ADDR_MASK) != hpa)
				printk(KERN_ERR "xx audit error: (%s) levels %d"
				       " gva %lx gpa %llx hpa %llx ent %llx %d\n",
				       audit_msg, vcpu->arch.mmu.root_level,
				       va, gpa, hpa, ent,
				       is_shadow_present_pte(ent));
			else if (ent == shadow_notrap_nonpresent_pte
				 && !is_error_hpa(hpa))
				printk(KERN_ERR "audit: (%s) notrap shadow,"
				       " valid guest gva %lx\n", audit_msg, va);
			kvm_release_pfn_clean(pfn);

	}
}

static void audit_mappings(struct kvm_vcpu *vcpu)
{
3463
	unsigned i;

	if (vcpu->arch.mmu.root_level == 4)
		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
	else
		for (i = 0; i < 4; ++i)
			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
				audit_mappings_page(vcpu,
						    vcpu->arch.mmu.pae_root[i],
						    i << 30,
						    2);
}

static int count_rmaps(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memslots *slots;
	int nmaps = 0;
	int i, j, k, idx;

	idx = srcu_read_lock(&kvm->srcu);
	slots = kvm_memslots(kvm);
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *m = &slots->memslots[i];
		struct kvm_rmap_desc *d;

		for (j = 0; j < m->npages; ++j) {
			unsigned long *rmapp = &m->rmap[j];

			if (!*rmapp)
				continue;
			if (!(*rmapp & 1)) {
				++nmaps;
				continue;
			}
			d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
			while (d) {
				for (k = 0; k < RMAP_EXT; ++k)
					if (d->sptes[k])
						++nmaps;
					else
						break;
				d = d->more;
			}
		}
	}
	srcu_read_unlock(&kvm->srcu, idx);
	return nmaps;
}

void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
{
	unsigned long *rmapp;
	struct kvm_mmu_page *rev_sp;
	gfn_t gfn;

	if (is_writable_pte(*sptep)) {
		rev_sp = page_header(__pa(sptep));
		gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);

		if (!gfn_to_memslot(kvm, gfn)) {
			if (!printk_ratelimit())
				return;
			printk(KERN_ERR "%s: no memslot for gfn %ld\n",
					 audit_msg, gfn);
			printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
			       audit_msg, (long int)(sptep - rev_sp->spt),
					rev_sp->gfn);
			dump_stack();
			return;
		}

		rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
		if (!*rmapp) {
			if (!printk_ratelimit())
				return;
			printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
					 audit_msg, *sptep);
			dump_stack();
		}
	}

}

void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
{
	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
}

static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;
	int i;

	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
		u64 *pt = sp->spt;

		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
			continue;

		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
			u64 ent = pt[i];

			if (!(ent & PT_PRESENT_MASK))
				continue;
			if (!is_writable_pte(ent))
				continue;
			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
		}
	}
	return;
}

static void audit_rmap(struct kvm_vcpu *vcpu)
{
	check_writable_mappings_rmap(vcpu);
	count_rmaps(vcpu);
}

static void audit_write_protection(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;
	struct kvm_memory_slot *slot;
	unsigned long *rmapp;
	u64 *spte;
	gfn_t gfn;

	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
		if (sp->role.direct)
			continue;
		if (sp->unsync)
			continue;

		gfn = sp->gfn;
		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
		rmapp = &slot->rmap[gfn - slot->base_gfn];

		spte = rmap_next(vcpu->kvm, rmapp, NULL);
		while (spte) {
			if (is_writable_pte(*spte))
				printk(KERN_ERR "%s: (%s) shadow page has "
				"writable mappings: gfn %lx role %x\n",
			       __func__, audit_msg, sp->gfn,
			       sp->role.word);
			spte = rmap_next(vcpu->kvm, rmapp, spte);
		}
	}
}

static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
{
	int olddbg = dbg;

	dbg = 0;
	audit_msg = msg;
	audit_rmap(vcpu);
	audit_write_protection(vcpu);
	if (strcmp("pre pte write", audit_msg) != 0)
		audit_mappings(vcpu);
	audit_writable_sptes_have_rmaps(vcpu);
	dbg = olddbg;
}

#endif