mmu.c 89.2 KB
Newer Older
A
Avi Kivity 已提交
1 2 3 4 5 6 7 8 9
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
N
Nicolas Kaiser 已提交
10
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
A
Avi Kivity 已提交
11 12 13 14 15 16 17 18 19
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */
A
Avi Kivity 已提交
20

21
#include "irq.h"
22
#include "mmu.h"
23
#include "x86.h"
A
Avi Kivity 已提交
24
#include "kvm_cache_regs.h"
25
#include "x86.h"
A
Avi Kivity 已提交
26

27
#include <linux/kvm_host.h>
A
Avi Kivity 已提交
28 29 30 31 32
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/module.h>
33
#include <linux/swap.h>
M
Marcelo Tosatti 已提交
34
#include <linux/hugetlb.h>
35
#include <linux/compiler.h>
36
#include <linux/srcu.h>
37
#include <linux/slab.h>
38
#include <linux/uaccess.h>
A
Avi Kivity 已提交
39

A
Avi Kivity 已提交
40 41
#include <asm/page.h>
#include <asm/cmpxchg.h>
42
#include <asm/io.h>
43
#include <asm/vmx.h>
A
Avi Kivity 已提交
44

45 46 47 48 49 50 51
/*
 * When setting this variable to true it enables Two-Dimensional-Paging
 * where the hardware walks 2 page tables:
 * 1. the guest-virtual to guest-physical
 * 2. while doing 1. it walks guest-physical to host-physical
 * If the hardware supports that we don't need to do shadow paging.
 */
52
bool tdp_enabled = false;
53

54 55 56 57
enum {
	AUDIT_PRE_PAGE_FAULT,
	AUDIT_POST_PAGE_FAULT,
	AUDIT_PRE_PTE_WRITE,
58 59 60
	AUDIT_POST_PTE_WRITE,
	AUDIT_PRE_SYNC,
	AUDIT_POST_SYNC
61
};
62

63 64 65 66
char *audit_point_name[] = {
	"pre page fault",
	"post page fault",
	"pre pte write",
67 68 69
	"post pte write",
	"pre sync",
	"post sync"
70
};
71

72
#undef MMU_DEBUG
73 74 75 76 77 78 79 80 81 82 83 84 85

#ifdef MMU_DEBUG

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)

#else

#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)

#endif

86
#ifdef MMU_DEBUG
87 88
static int dbg = 0;
module_param(dbg, bool, 0644);
89
#endif
A
Avi Kivity 已提交
90

91 92 93
static int oos_shadow = 1;
module_param(oos_shadow, bool, 0644);

94 95 96
#ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0)
#else
A
Avi Kivity 已提交
97 98 99 100 101
#define ASSERT(x)							\
	if (!(x)) {							\
		printk(KERN_WARNING "assertion failed %s:%d: %s\n",	\
		       __FILE__, __LINE__, #x);				\
	}
102
#endif
A
Avi Kivity 已提交
103

104 105
#define PTE_PREFETCH_NUM		8

A
Avi Kivity 已提交
106 107 108 109 110 111
#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
M
Mike Day 已提交
112
		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
A
Avi Kivity 已提交
113 114 115 116 117 118 119 120 121 122 123

#define PT64_LEVEL_MASK(level) \
		(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
M
Mike Day 已提交
124
		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
A
Avi Kivity 已提交
125 126 127

#define PT32_LEVEL_MASK(level) \
		(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
128 129 130
#define PT32_LVL_OFFSET_MASK(level) \
	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT32_LEVEL_BITS))) - 1))
A
Avi Kivity 已提交
131 132 133 134 135

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


136
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
A
Avi Kivity 已提交
137 138
#define PT64_DIR_BASE_ADDR_MASK \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
139 140 141 142 143 144
#define PT64_LVL_ADDR_MASK(level) \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))
#define PT64_LVL_OFFSET_MASK(level) \
	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))
A
Avi Kivity 已提交
145 146 147 148

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
149 150 151
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
					    * PT32_LEVEL_BITS))) - 1))
A
Avi Kivity 已提交
152

153 154
#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
			| PT64_NX_MASK)
A
Avi Kivity 已提交
155

156 157
#define RMAP_EXT 4

158 159 160 161 162
#define ACC_EXEC_MASK    1
#define ACC_WRITE_MASK   PT_WRITABLE_MASK
#define ACC_USER_MASK    PT_USER_MASK
#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

163 164
#include <trace/events/kvm.h>

165 166 167
#define CREATE_TRACE_POINTS
#include "mmutrace.h"

168 169
#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)

170 171
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

172
struct kvm_rmap_desc {
A
Avi Kivity 已提交
173
	u64 *sptes[RMAP_EXT];
174 175 176
	struct kvm_rmap_desc *more;
};

177 178 179 180 181 182 183 184 185 186 187 188 189
struct kvm_shadow_walk_iterator {
	u64 addr;
	hpa_t shadow_addr;
	int level;
	u64 *sptep;
	unsigned index;
};

#define for_each_shadow_entry(_vcpu, _addr, _walker)    \
	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))

190
typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
M
Marcelo Tosatti 已提交
191

192 193
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
194
static struct kmem_cache *mmu_page_header_cache;
195
static struct percpu_counter kvm_total_used_mmu_pages;
196

197 198
static u64 __read_mostly shadow_trap_nonpresent_pte;
static u64 __read_mostly shadow_notrap_nonpresent_pte;
S
Sheng Yang 已提交
199 200 201 202 203 204
static u64 __read_mostly shadow_base_present_pte;
static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
205

206 207 208 209 210
static inline u64 rsvd_bits(int s, int e)
{
	return ((1ULL << (e - s + 1)) - 1) << s;
}

211 212 213 214 215 216 217
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
{
	shadow_trap_nonpresent_pte = trap_pte;
	shadow_notrap_nonpresent_pte = notrap_pte;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);

S
Sheng Yang 已提交
218 219 220 221 222 223 224
void kvm_mmu_set_base_ptes(u64 base_pte)
{
	shadow_base_present_pte = base_pte;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);

void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
225
		u64 dirty_mask, u64 nx_mask, u64 x_mask)
S
Sheng Yang 已提交
226 227 228 229 230 231 232 233 234
{
	shadow_user_mask = user_mask;
	shadow_accessed_mask = accessed_mask;
	shadow_dirty_mask = dirty_mask;
	shadow_nx_mask = nx_mask;
	shadow_x_mask = x_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);

235
static bool is_write_protection(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
236
{
237
	return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
A
Avi Kivity 已提交
238 239 240 241 242 243 244
}

static int is_cpuid_PSE36(void)
{
	return 1;
}

245 246
static int is_nx(struct kvm_vcpu *vcpu)
{
247
	return vcpu->arch.efer & EFER_NX;
248 249
}

250 251 252 253 254 255
static int is_shadow_present_pte(u64 pte)
{
	return pte != shadow_trap_nonpresent_pte
		&& pte != shadow_notrap_nonpresent_pte;
}

M
Marcelo Tosatti 已提交
256 257 258 259 260
static int is_large_pte(u64 pte)
{
	return pte & PT_PAGE_SIZE_MASK;
}

261
static int is_writable_pte(unsigned long pte)
A
Avi Kivity 已提交
262 263 264 265
{
	return pte & PT_WRITABLE_MASK;
}

266
static int is_dirty_gpte(unsigned long pte)
267
{
A
Avi Kivity 已提交
268
	return pte & PT_DIRTY_MASK;
269 270
}

271
static int is_rmap_spte(u64 pte)
272
{
273
	return is_shadow_present_pte(pte);
274 275
}

276 277 278 279
static int is_last_spte(u64 pte, int level)
{
	if (level == PT_PAGE_TABLE_LEVEL)
		return 1;
280
	if (is_large_pte(pte))
281 282 283 284
		return 1;
	return 0;
}

285
static pfn_t spte_to_pfn(u64 pte)
286
{
287
	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
288 289
}

290 291 292 293 294 295 296
static gfn_t pse36_gfn_delta(u32 gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}

A
Avi Kivity 已提交
297
static void __set_spte(u64 *sptep, u64 spte)
298
{
299
	set_64bit(sptep, spte);
300 301
}

302 303 304 305 306 307 308 309 310 311 312 313 314 315 316
static u64 __xchg_spte(u64 *sptep, u64 new_spte)
{
#ifdef CONFIG_X86_64
	return xchg(sptep, new_spte);
#else
	u64 old_spte;

	do {
		old_spte = *sptep;
	} while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);

	return old_spte;
#endif
}

317 318 319 320 321 322 323 324
static bool spte_has_volatile_bits(u64 spte)
{
	if (!shadow_accessed_mask)
		return false;

	if (!is_shadow_present_pte(spte))
		return false;

325 326
	if ((spte & shadow_accessed_mask) &&
	      (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
327 328 329 330 331
		return false;

	return true;
}

332 333 334 335 336
static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
{
	return (old_spte & bit_mask) && !(new_spte & bit_mask);
}

337 338
static void update_spte(u64 *sptep, u64 new_spte)
{
339 340 341
	u64 mask, old_spte = *sptep;

	WARN_ON(!is_rmap_spte(new_spte));
342

343 344 345 346 347 348 349
	new_spte |= old_spte & shadow_dirty_mask;

	mask = shadow_accessed_mask;
	if (is_writable_pte(old_spte))
		mask |= shadow_dirty_mask;

	if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
350
		__set_spte(sptep, new_spte);
351
	else
352
		old_spte = __xchg_spte(sptep, new_spte);
353 354 355 356 357 358 359 360

	if (!shadow_accessed_mask)
		return;

	if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
	if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
361 362
}

363
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
364
				  struct kmem_cache *base_cache, int min)
365 366 367 368
{
	void *obj;

	if (cache->nobjs >= min)
369
		return 0;
370
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
371
		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
372
		if (!obj)
373
			return -ENOMEM;
374 375
		cache->objects[cache->nobjs++] = obj;
	}
376
	return 0;
377 378
}

379 380
static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
				  struct kmem_cache *cache)
381 382
{
	while (mc->nobjs)
383
		kmem_cache_free(cache, mc->objects[--mc->nobjs]);
384 385
}

A
Avi Kivity 已提交
386
static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
387
				       int min)
A
Avi Kivity 已提交
388 389 390 391 392 393
{
	struct page *page;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
394
		page = alloc_page(GFP_KERNEL);
A
Avi Kivity 已提交
395 396 397 398 399 400 401 402 403 404
		if (!page)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = page_address(page);
	}
	return 0;
}

static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
405
		free_page((unsigned long)mc->objects[--mc->nobjs]);
A
Avi Kivity 已提交
406 407
}

408
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
409
{
410 411
	int r;

412
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
413
				   pte_chain_cache, 4);
414 415
	if (r)
		goto out;
416
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
417
				   rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
418 419
	if (r)
		goto out;
420
	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
421 422
	if (r)
		goto out;
423
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
424
				   mmu_page_header_cache, 4);
425 426
out:
	return r;
427 428 429 430
}

static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
431 432
	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
433
	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
434 435
	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
				mmu_page_header_cache);
436 437 438 439 440 441 442 443 444 445 446 447 448 449
}

static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
				    size_t size)
{
	void *p;

	BUG_ON(!mc->nobjs);
	p = mc->objects[--mc->nobjs];
	return p;
}

static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
{
450
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
451 452 453
				      sizeof(struct kvm_pte_chain));
}

454
static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
455
{
456
	kmem_cache_free(pte_chain_cache, pc);
457 458 459 460
}

static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
{
461
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
462 463 464
				      sizeof(struct kvm_rmap_desc));
}

465
static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
466
{
467
	kmem_cache_free(rmap_desc_cache, rd);
468 469
}

470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
	if (!sp->role.direct)
		return sp->gfns[index];

	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
}

static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
{
	if (sp->role.direct)
		BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
	else
		sp->gfns[index] = gfn;
}

M
Marcelo Tosatti 已提交
486 487 488 489
/*
 * Return the pointer to the largepage write count for a given
 * gfn, handling slots that are not large page aligned.
 */
490 491 492
static int *slot_largepage_idx(gfn_t gfn,
			       struct kvm_memory_slot *slot,
			       int level)
M
Marcelo Tosatti 已提交
493 494 495
{
	unsigned long idx;

496 497
	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
	      (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
498
	return &slot->lpage_info[level - 2][idx].write_count;
M
Marcelo Tosatti 已提交
499 500 501 502
}

static void account_shadowed(struct kvm *kvm, gfn_t gfn)
{
503
	struct kvm_memory_slot *slot;
M
Marcelo Tosatti 已提交
504
	int *write_count;
505
	int i;
M
Marcelo Tosatti 已提交
506

A
Avi Kivity 已提交
507
	slot = gfn_to_memslot(kvm, gfn);
508 509 510 511 512
	for (i = PT_DIRECTORY_LEVEL;
	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
		write_count   = slot_largepage_idx(gfn, slot, i);
		*write_count += 1;
	}
M
Marcelo Tosatti 已提交
513 514 515 516
}

static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
{
517
	struct kvm_memory_slot *slot;
M
Marcelo Tosatti 已提交
518
	int *write_count;
519
	int i;
M
Marcelo Tosatti 已提交
520

A
Avi Kivity 已提交
521
	slot = gfn_to_memslot(kvm, gfn);
522 523 524 525 526 527
	for (i = PT_DIRECTORY_LEVEL;
	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
		write_count   = slot_largepage_idx(gfn, slot, i);
		*write_count -= 1;
		WARN_ON(*write_count < 0);
	}
M
Marcelo Tosatti 已提交
528 529
}

530 531 532
static int has_wrprotected_page(struct kvm *kvm,
				gfn_t gfn,
				int level)
M
Marcelo Tosatti 已提交
533
{
534
	struct kvm_memory_slot *slot;
M
Marcelo Tosatti 已提交
535 536
	int *largepage_idx;

A
Avi Kivity 已提交
537
	slot = gfn_to_memslot(kvm, gfn);
M
Marcelo Tosatti 已提交
538
	if (slot) {
539
		largepage_idx = slot_largepage_idx(gfn, slot, level);
M
Marcelo Tosatti 已提交
540 541 542 543 544 545
		return *largepage_idx;
	}

	return 1;
}

546
static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
M
Marcelo Tosatti 已提交
547
{
J
Joerg Roedel 已提交
548
	unsigned long page_size;
549
	int i, ret = 0;
M
Marcelo Tosatti 已提交
550

J
Joerg Roedel 已提交
551
	page_size = kvm_host_page_size(kvm, gfn);
M
Marcelo Tosatti 已提交
552

553 554 555 556 557 558 559 560
	for (i = PT_PAGE_TABLE_LEVEL;
	     i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
		if (page_size >= KVM_HPAGE_SIZE(i))
			ret = i;
		else
			break;
	}

561
	return ret;
M
Marcelo Tosatti 已提交
562 563
}

564
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
M
Marcelo Tosatti 已提交
565 566
{
	struct kvm_memory_slot *slot;
567
	int host_level, level, max_level;
M
Marcelo Tosatti 已提交
568 569 570

	slot = gfn_to_memslot(vcpu->kvm, large_gfn);
	if (slot && slot->dirty_bitmap)
571
		return PT_PAGE_TABLE_LEVEL;
M
Marcelo Tosatti 已提交
572

573 574 575 576 577
	host_level = host_mapping_level(vcpu->kvm, large_gfn);

	if (host_level == PT_PAGE_TABLE_LEVEL)
		return host_level;

578 579 580 581
	max_level = kvm_x86_ops->get_lpage_level() < host_level ?
		kvm_x86_ops->get_lpage_level() : host_level;

	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
582 583 584 585
		if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
			break;

	return level - 1;
M
Marcelo Tosatti 已提交
586 587
}

588 589 590 591
/*
 * Take gfn and return the reverse mapping to it.
 */

592
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
593 594
{
	struct kvm_memory_slot *slot;
M
Marcelo Tosatti 已提交
595
	unsigned long idx;
596 597

	slot = gfn_to_memslot(kvm, gfn);
598
	if (likely(level == PT_PAGE_TABLE_LEVEL))
M
Marcelo Tosatti 已提交
599 600
		return &slot->rmap[gfn - slot->base_gfn];

601 602
	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
		(slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
M
Marcelo Tosatti 已提交
603

604
	return &slot->lpage_info[level - 2][idx].rmap_pde;
605 606
}

607 608 609
/*
 * Reverse mapping data structures:
 *
610 611
 * If rmapp bit zero is zero, then rmapp point to the shadw page table entry
 * that points to page_address(page).
612
 *
613 614
 * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
 * containing more mappings.
615 616 617 618
 *
 * Returns the number of rmap entries before the spte was added or zero if
 * the spte was not added.
 *
619
 */
620
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
621
{
622
	struct kvm_mmu_page *sp;
623
	struct kvm_rmap_desc *desc;
624
	unsigned long *rmapp;
625
	int i, count = 0;
626

627
	if (!is_rmap_spte(*spte))
628
		return count;
629
	sp = page_header(__pa(spte));
630
	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
631
	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
632
	if (!*rmapp) {
633
		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
634 635
		*rmapp = (unsigned long)spte;
	} else if (!(*rmapp & 1)) {
636
		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
637
		desc = mmu_alloc_rmap_desc(vcpu);
A
Avi Kivity 已提交
638 639
		desc->sptes[0] = (u64 *)*rmapp;
		desc->sptes[1] = spte;
640
		*rmapp = (unsigned long)desc | 1;
641
		++count;
642 643
	} else {
		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
644
		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
A
Avi Kivity 已提交
645
		while (desc->sptes[RMAP_EXT-1] && desc->more) {
646
			desc = desc->more;
647 648
			count += RMAP_EXT;
		}
A
Avi Kivity 已提交
649
		if (desc->sptes[RMAP_EXT-1]) {
650
			desc->more = mmu_alloc_rmap_desc(vcpu);
651 652
			desc = desc->more;
		}
A
Avi Kivity 已提交
653
		for (i = 0; desc->sptes[i]; ++i)
654
			++count;
A
Avi Kivity 已提交
655
		desc->sptes[i] = spte;
656
	}
657
	return count;
658 659
}

660
static void rmap_desc_remove_entry(unsigned long *rmapp,
661 662 663 664 665 666
				   struct kvm_rmap_desc *desc,
				   int i,
				   struct kvm_rmap_desc *prev_desc)
{
	int j;

A
Avi Kivity 已提交
667
	for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
668
		;
A
Avi Kivity 已提交
669 670
	desc->sptes[i] = desc->sptes[j];
	desc->sptes[j] = NULL;
671 672 673
	if (j != 0)
		return;
	if (!prev_desc && !desc->more)
A
Avi Kivity 已提交
674
		*rmapp = (unsigned long)desc->sptes[0];
675 676 677 678
	else
		if (prev_desc)
			prev_desc->more = desc->more;
		else
679
			*rmapp = (unsigned long)desc->more | 1;
680
	mmu_free_rmap_desc(desc);
681 682
}

683
static void rmap_remove(struct kvm *kvm, u64 *spte)
684 685 686
{
	struct kvm_rmap_desc *desc;
	struct kvm_rmap_desc *prev_desc;
687
	struct kvm_mmu_page *sp;
688
	gfn_t gfn;
689
	unsigned long *rmapp;
690 691
	int i;

692
	sp = page_header(__pa(spte));
693 694
	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
695
	if (!*rmapp) {
696
		printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
697
		BUG();
698
	} else if (!(*rmapp & 1)) {
699
		rmap_printk("rmap_remove:  %p 1->0\n", spte);
700
		if ((u64 *)*rmapp != spte) {
701
			printk(KERN_ERR "rmap_remove:  %p 1->BUG\n", spte);
702 703
			BUG();
		}
704
		*rmapp = 0;
705
	} else {
706
		rmap_printk("rmap_remove:  %p many->many\n", spte);
707
		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
708 709
		prev_desc = NULL;
		while (desc) {
A
Avi Kivity 已提交
710 711
			for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
				if (desc->sptes[i] == spte) {
712
					rmap_desc_remove_entry(rmapp,
713
							       desc, i,
714 715 716 717 718 719
							       prev_desc);
					return;
				}
			prev_desc = desc;
			desc = desc->more;
		}
720
		pr_err("rmap_remove: %p many->many\n", spte);
721 722 723 724
		BUG();
	}
}

725
static int set_spte_track_bits(u64 *sptep, u64 new_spte)
A
Avi Kivity 已提交
726
{
727
	pfn_t pfn;
728 729
	u64 old_spte = *sptep;

730
	if (!spte_has_volatile_bits(old_spte))
731
		__set_spte(sptep, new_spte);
732
	else
733
		old_spte = __xchg_spte(sptep, new_spte);
734

735
	if (!is_rmap_spte(old_spte))
736
		return 0;
737

738
	pfn = spte_to_pfn(old_spte);
739
	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
740
		kvm_set_pfn_accessed(pfn);
741
	if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
742
		kvm_set_pfn_dirty(pfn);
743
	return 1;
744 745 746 747
}

static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
{
748 749
	if (set_spte_track_bits(sptep, new_spte))
		rmap_remove(kvm, sptep);
A
Avi Kivity 已提交
750 751
}

752
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
753 754
{
	struct kvm_rmap_desc *desc;
755 756 757 758 759 760 761 762 763 764 765 766 767
	u64 *prev_spte;
	int i;

	if (!*rmapp)
		return NULL;
	else if (!(*rmapp & 1)) {
		if (!spte)
			return (u64 *)*rmapp;
		return NULL;
	}
	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
	prev_spte = NULL;
	while (desc) {
A
Avi Kivity 已提交
768
		for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
769
			if (prev_spte == spte)
A
Avi Kivity 已提交
770 771
				return desc->sptes[i];
			prev_spte = desc->sptes[i];
772 773 774 775 776 777
		}
		desc = desc->more;
	}
	return NULL;
}

778
static int rmap_write_protect(struct kvm *kvm, u64 gfn)
779
{
780
	unsigned long *rmapp;
781
	u64 *spte;
782
	int i, write_protected = 0;
783

784
	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
785

786 787
	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
788 789 790
		BUG_ON(!spte);
		BUG_ON(!(*spte & PT_PRESENT_MASK));
		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
791
		if (is_writable_pte(*spte)) {
792
			update_spte(spte, *spte & ~PT_WRITABLE_MASK);
793 794
			write_protected = 1;
		}
795
		spte = rmap_next(kvm, rmapp, spte);
796
	}
797

M
Marcelo Tosatti 已提交
798
	/* check for huge page mappings */
799 800 801 802 803 804 805 806 807
	for (i = PT_DIRECTORY_LEVEL;
	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
		rmapp = gfn_to_rmap(kvm, gfn, i);
		spte = rmap_next(kvm, rmapp, NULL);
		while (spte) {
			BUG_ON(!spte);
			BUG_ON(!(*spte & PT_PRESENT_MASK));
			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
808
			if (is_writable_pte(*spte)) {
A
Avi Kivity 已提交
809 810
				drop_spte(kvm, spte,
					  shadow_trap_nonpresent_pte);
811 812 813 814 815
				--kvm->stat.lpages;
				spte = NULL;
				write_protected = 1;
			}
			spte = rmap_next(kvm, rmapp, spte);
M
Marcelo Tosatti 已提交
816 817 818
		}
	}

819
	return write_protected;
820 821
}

F
Frederik Deweerdt 已提交
822 823
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
			   unsigned long data)
824 825 826 827 828 829 830
{
	u64 *spte;
	int need_tlb_flush = 0;

	while ((spte = rmap_next(kvm, rmapp, NULL))) {
		BUG_ON(!(*spte & PT_PRESENT_MASK));
		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
A
Avi Kivity 已提交
831
		drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
832 833 834 835 836
		need_tlb_flush = 1;
	}
	return need_tlb_flush;
}

F
Frederik Deweerdt 已提交
837 838
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
			     unsigned long data)
839 840
{
	int need_flush = 0;
841
	u64 *spte, new_spte;
842 843 844 845 846 847 848 849 850 851 852
	pte_t *ptep = (pte_t *)data;
	pfn_t new_pfn;

	WARN_ON(pte_huge(*ptep));
	new_pfn = pte_pfn(*ptep);
	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
		BUG_ON(!is_shadow_present_pte(*spte));
		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
		need_flush = 1;
		if (pte_write(*ptep)) {
A
Avi Kivity 已提交
853
			drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
854 855 856 857 858 859 860
			spte = rmap_next(kvm, rmapp, NULL);
		} else {
			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
			new_spte |= (u64)new_pfn << PAGE_SHIFT;

			new_spte &= ~PT_WRITABLE_MASK;
			new_spte &= ~SPTE_HOST_WRITEABLE;
861
			new_spte &= ~shadow_accessed_mask;
862
			set_spte_track_bits(spte, new_spte);
863 864 865 866 867 868 869 870 871
			spte = rmap_next(kvm, rmapp, spte);
		}
	}
	if (need_flush)
		kvm_flush_remote_tlbs(kvm);

	return 0;
}

F
Frederik Deweerdt 已提交
872 873
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  unsigned long data,
874
			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
F
Frederik Deweerdt 已提交
875
					 unsigned long data))
876
{
877
	int i, j;
878
	int ret;
879
	int retval = 0;
880 881
	struct kvm_memslots *slots;

882
	slots = kvm_memslots(kvm);
883

884 885
	for (i = 0; i < slots->nmemslots; i++) {
		struct kvm_memory_slot *memslot = &slots->memslots[i];
886 887 888 889 890 891
		unsigned long start = memslot->userspace_addr;
		unsigned long end;

		end = start + (memslot->npages << PAGE_SHIFT);
		if (hva >= start && hva < end) {
			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
892

893
			ret = handler(kvm, &memslot->rmap[gfn_offset], data);
894 895

			for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
896 897 898 899 900 901
				unsigned long idx;
				int sh;

				sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
				idx = ((memslot->base_gfn+gfn_offset) >> sh) -
					(memslot->base_gfn >> sh);
902
				ret |= handler(kvm,
903 904
					&memslot->lpage_info[j][idx].rmap_pde,
					data);
905
			}
906 907
			trace_kvm_age_page(hva, memslot, ret);
			retval |= ret;
908 909 910 911 912 913 914 915
		}
	}

	return retval;
}

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
916 917 918 919 920
	return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
}

void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
F
Frederik Deweerdt 已提交
921
	kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
922 923
}

F
Frederik Deweerdt 已提交
924 925
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
			 unsigned long data)
926 927 928 929
{
	u64 *spte;
	int young = 0;

930 931 932 933 934 935 936
	/*
	 * Emulate the accessed bit for EPT, by checking if this page has
	 * an EPT mapping, and clearing it if it does. On the next access,
	 * a new EPT mapping will be established.
	 * This has some overhead, but not as much as the cost of swapping
	 * out actively used pages or breaking up actively used hugepages.
	 */
937
	if (!shadow_accessed_mask)
938
		return kvm_unmap_rmapp(kvm, rmapp, data);
939

940 941 942 943 944 945 946 947 948 949 950 951 952 953 954
	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
		int _young;
		u64 _spte = *spte;
		BUG_ON(!(_spte & PT_PRESENT_MASK));
		_young = _spte & PT_ACCESSED_MASK;
		if (_young) {
			young = 1;
			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
		}
		spte = rmap_next(kvm, rmapp, spte);
	}
	return young;
}

955 956
#define RMAP_RECYCLE_THRESHOLD 1000

957
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
958 959
{
	unsigned long *rmapp;
960 961 962
	struct kvm_mmu_page *sp;

	sp = page_header(__pa(spte));
963

964
	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
965

966
	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
967 968 969
	kvm_flush_remote_tlbs(vcpu->kvm);
}

970 971
int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
972
	return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
973 974
}

975
#ifdef MMU_DEBUG
976
static int is_empty_shadow_page(u64 *spt)
A
Avi Kivity 已提交
977
{
978 979 980
	u64 *pos;
	u64 *end;

981
	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
982
		if (is_shadow_present_pte(*pos)) {
983
			printk(KERN_ERR "%s: %p %llx\n", __func__,
984
			       pos, *pos);
A
Avi Kivity 已提交
985
			return 0;
986
		}
A
Avi Kivity 已提交
987 988
	return 1;
}
989
#endif
A
Avi Kivity 已提交
990

991 992 993 994 995 996 997 998 999 1000 1001 1002
/*
 * This value is the sum of all of the kvm instances's
 * kvm->arch.n_used_mmu_pages values.  We need a global,
 * aggregate version in order to make the slab shrinker
 * faster
 */
static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
{
	kvm->arch.n_used_mmu_pages += nr;
	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}

1003
static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1004
{
1005
	ASSERT(is_empty_shadow_page(sp->spt));
1006
	hlist_del(&sp->hash_link);
1007 1008
	list_del(&sp->link);
	__free_page(virt_to_page(sp->spt));
1009 1010
	if (!sp->role.direct)
		__free_page(virt_to_page(sp->gfns));
1011
	kmem_cache_free(mmu_page_header_cache, sp);
1012
	kvm_mod_used_mmu_pages(kvm, -1);
1013 1014
}

1015 1016
static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
1017
	return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
1018 1019
}

1020
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1021
					       u64 *parent_pte, int direct)
A
Avi Kivity 已提交
1022
{
1023
	struct kvm_mmu_page *sp;
A
Avi Kivity 已提交
1024

1025 1026
	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
1027 1028 1029
	if (!direct)
		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
						  PAGE_SIZE);
1030
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1031
	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1032
	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
1033 1034
	sp->multimapped = 0;
	sp->parent_pte = parent_pte;
1035
	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1036
	return sp;
A
Avi Kivity 已提交
1037 1038
}

1039
static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
1040
				    struct kvm_mmu_page *sp, u64 *parent_pte)
1041 1042 1043 1044 1045 1046 1047
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	int i;

	if (!parent_pte)
		return;
1048 1049
	if (!sp->multimapped) {
		u64 *old = sp->parent_pte;
1050 1051

		if (!old) {
1052
			sp->parent_pte = parent_pte;
1053 1054
			return;
		}
1055
		sp->multimapped = 1;
1056
		pte_chain = mmu_alloc_pte_chain(vcpu);
1057 1058
		INIT_HLIST_HEAD(&sp->parent_ptes);
		hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1059 1060
		pte_chain->parent_ptes[0] = old;
	}
1061
	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
1062 1063 1064 1065 1066 1067 1068 1069
		if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
			continue;
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
			if (!pte_chain->parent_ptes[i]) {
				pte_chain->parent_ptes[i] = parent_pte;
				return;
			}
	}
1070
	pte_chain = mmu_alloc_pte_chain(vcpu);
1071
	BUG_ON(!pte_chain);
1072
	hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1073 1074 1075
	pte_chain->parent_ptes[0] = parent_pte;
}

1076
static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1077 1078 1079 1080 1081 1082
				       u64 *parent_pte)
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	int i;

1083 1084 1085
	if (!sp->multimapped) {
		BUG_ON(sp->parent_pte != parent_pte);
		sp->parent_pte = NULL;
1086 1087
		return;
	}
1088
	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1089 1090 1091 1092 1093
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
			if (!pte_chain->parent_ptes[i])
				break;
			if (pte_chain->parent_ptes[i] != parent_pte)
				continue;
1094 1095
			while (i + 1 < NR_PTE_CHAIN_ENTRIES
				&& pte_chain->parent_ptes[i + 1]) {
1096 1097 1098 1099 1100
				pte_chain->parent_ptes[i]
					= pte_chain->parent_ptes[i + 1];
				++i;
			}
			pte_chain->parent_ptes[i] = NULL;
1101 1102
			if (i == 0) {
				hlist_del(&pte_chain->link);
1103
				mmu_free_pte_chain(pte_chain);
1104 1105 1106
				if (hlist_empty(&sp->parent_ptes)) {
					sp->multimapped = 0;
					sp->parent_pte = NULL;
1107 1108
				}
			}
1109 1110 1111 1112 1113
			return;
		}
	BUG();
}

1114
static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
M
Marcelo Tosatti 已提交
1115 1116 1117 1118 1119 1120 1121 1122
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	struct kvm_mmu_page *parent_sp;
	int i;

	if (!sp->multimapped && sp->parent_pte) {
		parent_sp = page_header(__pa(sp->parent_pte));
1123
		fn(parent_sp, sp->parent_pte);
M
Marcelo Tosatti 已提交
1124 1125
		return;
	}
1126

M
Marcelo Tosatti 已提交
1127 1128
	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1129 1130 1131
			u64 *spte = pte_chain->parent_ptes[i];

			if (!spte)
M
Marcelo Tosatti 已提交
1132
				break;
1133 1134
			parent_sp = page_header(__pa(spte));
			fn(parent_sp, spte);
M
Marcelo Tosatti 已提交
1135 1136 1137
		}
}

1138 1139
static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1140
{
1141
	mmu_parent_walk(sp, mark_unsync);
1142 1143
}

1144
static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
1145
{
1146
	unsigned int index;
1147

1148 1149
	index = spte - sp->spt;
	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1150
		return;
1151
	if (sp->unsync_children++)
1152
		return;
1153
	kvm_mmu_mark_parents_unsync(sp);
1154 1155
}

1156 1157 1158 1159 1160 1161 1162 1163 1164
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp)
{
	int i;

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
		sp->spt[i] = shadow_trap_nonpresent_pte;
}

1165
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1166
			       struct kvm_mmu_page *sp, bool clear_unsync)
1167 1168 1169 1170
{
	return 1;
}

M
Marcelo Tosatti 已提交
1171 1172 1173 1174
static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
}

1175 1176 1177 1178 1179 1180 1181 1182 1183 1184
#define KVM_PAGE_ARRAY_NR 16

struct kvm_mmu_pages {
	struct mmu_page_and_offset {
		struct kvm_mmu_page *sp;
		unsigned int idx;
	} page[KVM_PAGE_ARRAY_NR];
	unsigned int nr;
};

1185 1186 1187 1188 1189
#define for_each_unsync_children(bitmap, idx)		\
	for (idx = find_first_bit(bitmap, 512);		\
	     idx < 512;					\
	     idx = find_next_bit(bitmap, 512, idx+1))

1190 1191
static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
			 int idx)
1192
{
1193
	int i;
1194

1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209
	if (sp->unsync)
		for (i=0; i < pvec->nr; i++)
			if (pvec->page[i].sp == sp)
				return 0;

	pvec->page[pvec->nr].sp = sp;
	pvec->page[pvec->nr].idx = idx;
	pvec->nr++;
	return (pvec->nr == KVM_PAGE_ARRAY_NR);
}

static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
			   struct kvm_mmu_pages *pvec)
{
	int i, ret, nr_unsync_leaf = 0;
1210

1211
	for_each_unsync_children(sp->unsync_child_bitmap, i) {
1212
		struct kvm_mmu_page *child;
1213 1214
		u64 ent = sp->spt[i];

1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243
		if (!is_shadow_present_pte(ent) || is_large_pte(ent))
			goto clear_child_bitmap;

		child = page_header(ent & PT64_BASE_ADDR_MASK);

		if (child->unsync_children) {
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;

			ret = __mmu_unsync_walk(child, pvec);
			if (!ret)
				goto clear_child_bitmap;
			else if (ret > 0)
				nr_unsync_leaf += ret;
			else
				return ret;
		} else if (child->unsync) {
			nr_unsync_leaf++;
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;
		} else
			 goto clear_child_bitmap;

		continue;

clear_child_bitmap:
		__clear_bit(i, sp->unsync_child_bitmap);
		sp->unsync_children--;
		WARN_ON((int)sp->unsync_children < 0);
1244 1245 1246
	}


1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257
	return nr_unsync_leaf;
}

static int mmu_unsync_walk(struct kvm_mmu_page *sp,
			   struct kvm_mmu_pages *pvec)
{
	if (!sp->unsync_children)
		return 0;

	mmu_pages_add(pvec, sp, 0);
	return __mmu_unsync_walk(sp, pvec);
1258 1259 1260 1261 1262
}

static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	WARN_ON(!sp->unsync);
1263
	trace_kvm_mmu_sync_page(sp);
1264 1265 1266 1267
	sp->unsync = 0;
	--kvm->stat.mmu_unsync;
}

1268 1269 1270 1271
static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				    struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				    struct list_head *invalid_list);
1272

1273 1274
#define for_each_gfn_sp(kvm, sp, gfn, pos)				\
  hlist_for_each_entry(sp, pos,						\
1275 1276 1277
   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
	if ((sp)->gfn != (gfn)) {} else

1278 1279
#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos)		\
  hlist_for_each_entry(sp, pos,						\
1280 1281 1282 1283
   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
		if ((sp)->gfn != (gfn) || (sp)->role.direct ||		\
			(sp)->role.invalid) {} else

1284
/* @sp->gfn should be write-protected at the call site */
1285
static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1286
			   struct list_head *invalid_list, bool clear_unsync)
1287
{
1288
	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1289
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1290 1291 1292
		return 1;
	}

1293
	if (clear_unsync)
1294 1295
		kvm_unlink_unsync_page(vcpu->kvm, sp);

1296
	if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
1297
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1298 1299 1300 1301 1302 1303 1304
		return 1;
	}

	kvm_mmu_flush_tlb(vcpu);
	return 0;
}

1305 1306 1307
static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
				   struct kvm_mmu_page *sp)
{
1308
	LIST_HEAD(invalid_list);
1309 1310
	int ret;

1311
	ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
1312
	if (ret)
1313 1314
		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);

1315 1316 1317
	return ret;
}

1318 1319
static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			 struct list_head *invalid_list)
1320
{
1321
	return __kvm_sync_page(vcpu, sp, invalid_list, true);
1322 1323
}

1324 1325 1326 1327
/* @gfn should be write-protected at the call site */
static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
{
	struct kvm_mmu_page *s;
1328
	struct hlist_node *node;
1329
	LIST_HEAD(invalid_list);
1330 1331
	bool flush = false;

1332
	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1333
		if (!s->unsync)
1334 1335 1336 1337
			continue;

		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
		if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1338
			(vcpu->arch.mmu.sync_page(vcpu, s, true))) {
1339
			kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1340 1341 1342 1343 1344 1345
			continue;
		}
		kvm_unlink_unsync_page(vcpu->kvm, s);
		flush = true;
	}

1346
	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1347 1348 1349 1350
	if (flush)
		kvm_mmu_flush_tlb(vcpu);
}

1351 1352 1353
struct mmu_page_path {
	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
	unsigned int idx[PT64_ROOT_LEVEL-1];
1354 1355
};

1356 1357 1358 1359 1360 1361
#define for_each_sp(pvec, sp, parents, i)			\
		for (i = mmu_pages_next(&pvec, &parents, -1),	\
			sp = pvec.page[i].sp;			\
			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
			i = mmu_pages_next(&pvec, &parents, i))

1362 1363 1364
static int mmu_pages_next(struct kvm_mmu_pages *pvec,
			  struct mmu_page_path *parents,
			  int i)
1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382
{
	int n;

	for (n = i+1; n < pvec->nr; n++) {
		struct kvm_mmu_page *sp = pvec->page[n].sp;

		if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
			parents->idx[0] = pvec->page[n].idx;
			return n;
		}

		parents->parent[sp->role.level-2] = sp;
		parents->idx[sp->role.level-1] = pvec->page[n].idx;
	}

	return n;
}

1383
static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1384
{
1385 1386 1387 1388 1389
	struct kvm_mmu_page *sp;
	unsigned int level = 0;

	do {
		unsigned int idx = parents->idx[level];
1390

1391 1392 1393 1394 1395 1396 1397 1398 1399
		sp = parents->parent[level];
		if (!sp)
			return;

		--sp->unsync_children;
		WARN_ON((int)sp->unsync_children < 0);
		__clear_bit(idx, sp->unsync_child_bitmap);
		level++;
	} while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
1400 1401
}

1402 1403 1404
static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
			       struct mmu_page_path *parents,
			       struct kvm_mmu_pages *pvec)
1405
{
1406 1407 1408
	parents->parent[parent->role.level-1] = NULL;
	pvec->nr = 0;
}
1409

1410 1411 1412 1413 1414 1415 1416
static void mmu_sync_children(struct kvm_vcpu *vcpu,
			      struct kvm_mmu_page *parent)
{
	int i;
	struct kvm_mmu_page *sp;
	struct mmu_page_path parents;
	struct kvm_mmu_pages pages;
1417
	LIST_HEAD(invalid_list);
1418 1419 1420

	kvm_mmu_pages_init(parent, &parents, &pages);
	while (mmu_unsync_walk(parent, &pages)) {
1421 1422 1423 1424 1425 1426 1427 1428
		int protected = 0;

		for_each_sp(pages, sp, parents, i)
			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);

		if (protected)
			kvm_flush_remote_tlbs(vcpu->kvm);

1429
		for_each_sp(pages, sp, parents, i) {
1430
			kvm_sync_page(vcpu, sp, &invalid_list);
1431 1432
			mmu_pages_clear_parents(&parents);
		}
1433
		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1434
		cond_resched_lock(&vcpu->kvm->mmu_lock);
1435 1436
		kvm_mmu_pages_init(parent, &parents, &pages);
	}
1437 1438
}

1439 1440 1441 1442
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
					     gfn_t gfn,
					     gva_t gaddr,
					     unsigned level,
1443
					     int direct,
1444
					     unsigned access,
1445
					     u64 *parent_pte)
1446 1447 1448
{
	union kvm_mmu_page_role role;
	unsigned quadrant;
1449
	struct kvm_mmu_page *sp;
1450
	struct hlist_node *node;
1451
	bool need_sync = false;
1452

1453
	role = vcpu->arch.mmu.base_role;
1454
	role.level = level;
1455
	role.direct = direct;
1456
	if (role.direct)
1457
		role.cr4_pae = 0;
1458
	role.access = access;
1459 1460
	if (!vcpu->arch.mmu.direct_map
	    && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1461 1462 1463 1464
		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
		role.quadrant = quadrant;
	}
1465
	for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
1466 1467
		if (!need_sync && sp->unsync)
			need_sync = true;
1468

1469 1470
		if (sp->role.word != role.word)
			continue;
1471

1472 1473
		if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
			break;
1474

1475 1476
		mmu_page_add_parent_pte(vcpu, sp, parent_pte);
		if (sp->unsync_children) {
1477
			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1478 1479 1480
			kvm_mmu_mark_parents_unsync(sp);
		} else if (sp->unsync)
			kvm_mmu_mark_parents_unsync(sp);
1481

1482 1483 1484
		trace_kvm_mmu_get_page(sp, false);
		return sp;
	}
A
Avi Kivity 已提交
1485
	++vcpu->kvm->stat.mmu_cache_miss;
1486
	sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
1487 1488 1489 1490
	if (!sp)
		return sp;
	sp->gfn = gfn;
	sp->role = role;
1491 1492
	hlist_add_head(&sp->hash_link,
		&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
1493
	if (!direct) {
1494 1495
		if (rmap_write_protect(vcpu->kvm, gfn))
			kvm_flush_remote_tlbs(vcpu->kvm);
1496 1497 1498
		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
			kvm_sync_pages(vcpu, gfn);

1499 1500
		account_shadowed(vcpu->kvm, gfn);
	}
1501 1502 1503 1504
	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
		vcpu->arch.mmu.prefetch_page(vcpu, sp);
	else
		nonpaging_prefetch_page(vcpu, sp);
A
Avi Kivity 已提交
1505
	trace_kvm_mmu_get_page(sp, true);
1506
	return sp;
1507 1508
}

1509 1510 1511 1512 1513 1514
static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
			     struct kvm_vcpu *vcpu, u64 addr)
{
	iterator->addr = addr;
	iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
	iterator->level = vcpu->arch.mmu.shadow_root_level;
1515 1516 1517 1518 1519 1520

	if (iterator->level == PT64_ROOT_LEVEL &&
	    vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
	    !vcpu->arch.mmu.direct_map)
		--iterator->level;

1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534
	if (iterator->level == PT32E_ROOT_LEVEL) {
		iterator->shadow_addr
			= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
		iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
		--iterator->level;
		if (!iterator->shadow_addr)
			iterator->level = 0;
	}
}

static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
{
	if (iterator->level < PT_PAGE_TABLE_LEVEL)
		return false;
1535 1536 1537 1538 1539

	if (iterator->level == PT_PAGE_TABLE_LEVEL)
		if (is_large_pte(*iterator->sptep))
			return false;

1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550
	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
	iterator->sptep	= ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
	return true;
}

static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
{
	iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
	--iterator->level;
}

1551 1552 1553 1554 1555 1556 1557
static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
{
	u64 spte;

	spte = __pa(sp->spt)
		| PT_PRESENT_MASK | PT_ACCESSED_MASK
		| PT_WRITABLE_MASK | PT_USER_MASK;
1558
	__set_spte(sptep, spte);
1559 1560
}

1561 1562 1563 1564 1565 1566 1567 1568
static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
{
	if (is_large_pte(*sptep)) {
		drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
		kvm_flush_remote_tlbs(vcpu->kvm);
	}
}

1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591
static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
				   unsigned direct_access)
{
	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
		struct kvm_mmu_page *child;

		/*
		 * For the direct sp, if the guest pte's dirty bit
		 * changed form clean to dirty, it will corrupt the
		 * sp's access: allow writable in the read-only sp,
		 * so we should update the spte at this point to get
		 * a new sp with the correct access.
		 */
		child = page_header(*sptep & PT64_BASE_ADDR_MASK);
		if (child->role.access == direct_access)
			return;

		mmu_page_remove_parent_pte(child, sptep);
		__set_spte(sptep, shadow_trap_nonpresent_pte);
		kvm_flush_remote_tlbs(vcpu->kvm);
	}
}

1592
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1593
					 struct kvm_mmu_page *sp)
1594
{
1595 1596 1597 1598
	unsigned i;
	u64 *pt;
	u64 ent;

1599
	pt = sp->spt;
1600 1601 1602 1603

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
		ent = pt[i];

M
Marcelo Tosatti 已提交
1604
		if (is_shadow_present_pte(ent)) {
1605
			if (!is_last_spte(ent, sp->role.level)) {
M
Marcelo Tosatti 已提交
1606 1607 1608 1609
				ent &= PT64_BASE_ADDR_MASK;
				mmu_page_remove_parent_pte(page_header(ent),
							   &pt[i]);
			} else {
1610 1611
				if (is_large_pte(ent))
					--kvm->stat.lpages;
A
Avi Kivity 已提交
1612 1613
				drop_spte(kvm, &pt[i],
					  shadow_trap_nonpresent_pte);
M
Marcelo Tosatti 已提交
1614 1615
			}
		}
1616
		pt[i] = shadow_trap_nonpresent_pte;
1617
	}
1618 1619
}

1620
static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1621
{
1622
	mmu_page_remove_parent_pte(sp, parent_pte);
1623 1624
}

1625 1626 1627
static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
{
	int i;
1628
	struct kvm_vcpu *vcpu;
1629

1630 1631
	kvm_for_each_vcpu(i, vcpu, kvm)
		vcpu->arch.last_pte_updated = NULL;
1632 1633
}

1634
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1635 1636 1637
{
	u64 *parent_pte;

1638 1639 1640
	while (sp->multimapped || sp->parent_pte) {
		if (!sp->multimapped)
			parent_pte = sp->parent_pte;
1641 1642 1643
		else {
			struct kvm_pte_chain *chain;

1644
			chain = container_of(sp->parent_ptes.first,
1645 1646 1647
					     struct kvm_pte_chain, link);
			parent_pte = chain->parent_ptes[0];
		}
1648
		BUG_ON(!parent_pte);
1649
		kvm_mmu_put_page(sp, parent_pte);
A
Avi Kivity 已提交
1650
		__set_spte(parent_pte, shadow_trap_nonpresent_pte);
1651
	}
1652 1653
}

1654
static int mmu_zap_unsync_children(struct kvm *kvm,
1655 1656
				   struct kvm_mmu_page *parent,
				   struct list_head *invalid_list)
1657
{
1658 1659 1660
	int i, zapped = 0;
	struct mmu_page_path parents;
	struct kvm_mmu_pages pages;
1661

1662
	if (parent->role.level == PT_PAGE_TABLE_LEVEL)
1663
		return 0;
1664 1665 1666 1667 1668 1669

	kvm_mmu_pages_init(parent, &parents, &pages);
	while (mmu_unsync_walk(parent, &pages)) {
		struct kvm_mmu_page *sp;

		for_each_sp(pages, sp, parents, i) {
1670
			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
1671
			mmu_pages_clear_parents(&parents);
1672
			zapped++;
1673 1674 1675 1676 1677
		}
		kvm_mmu_pages_init(parent, &parents, &pages);
	}

	return zapped;
1678 1679
}

1680 1681
static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				    struct list_head *invalid_list)
1682
{
1683
	int ret;
A
Avi Kivity 已提交
1684

1685
	trace_kvm_mmu_prepare_zap_page(sp);
1686
	++kvm->stat.mmu_shadow_zapped;
1687
	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
1688
	kvm_mmu_page_unlink_children(kvm, sp);
1689
	kvm_mmu_unlink_parents(kvm, sp);
1690
	if (!sp->role.invalid && !sp->role.direct)
A
Avi Kivity 已提交
1691
		unaccount_shadowed(kvm, sp->gfn);
1692 1693
	if (sp->unsync)
		kvm_unlink_unsync_page(kvm, sp);
1694
	if (!sp->root_count) {
1695 1696
		/* Count self */
		ret++;
1697
		list_move(&sp->link, invalid_list);
1698
	} else {
A
Avi Kivity 已提交
1699
		list_move(&sp->link, &kvm->arch.active_mmu_pages);
1700 1701
		kvm_reload_remote_mmus(kvm);
	}
1702 1703

	sp->role.invalid = 1;
1704
	kvm_mmu_reset_last_pte_updated(kvm);
1705
	return ret;
1706 1707
}

1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				    struct list_head *invalid_list)
{
	struct kvm_mmu_page *sp;

	if (list_empty(invalid_list))
		return;

	kvm_flush_remote_tlbs(kvm);

	do {
		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
		WARN_ON(!sp->role.invalid || sp->root_count);
		kvm_mmu_free_page(kvm, sp);
	} while (!list_empty(invalid_list));

}

1726 1727
/*
 * Changing the number of mmu pages allocated to the vm
1728
 * Note: if goal_nr_mmu_pages is too small, you will get dead lock
1729
 */
1730
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
1731
{
1732
	LIST_HEAD(invalid_list);
1733 1734 1735 1736 1737 1738
	/*
	 * If we set the number of mmu pages to be smaller be than the
	 * number of actived pages , we must to free some mmu pages before we
	 * change the value
	 */

1739 1740
	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
		while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
1741
			!list_empty(&kvm->arch.active_mmu_pages)) {
1742 1743
			struct kvm_mmu_page *page;

1744
			page = container_of(kvm->arch.active_mmu_pages.prev,
1745
					    struct kvm_mmu_page, link);
1746 1747
			kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
			kvm_mmu_commit_zap_page(kvm, &invalid_list);
1748
		}
1749
		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
1750 1751
	}

1752
	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
1753 1754
}

1755
static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1756
{
1757
	struct kvm_mmu_page *sp;
1758
	struct hlist_node *node;
1759
	LIST_HEAD(invalid_list);
1760 1761
	int r;

1762
	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
1763
	r = 0;
1764 1765

	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1766
		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
1767 1768
			 sp->role.word);
		r = 1;
1769
		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1770
	}
1771
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
1772
	return r;
1773 1774
}

1775
static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1776
{
1777
	struct kvm_mmu_page *sp;
1778
	struct hlist_node *node;
1779
	LIST_HEAD(invalid_list);
1780

1781
	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1782
		pgprintk("%s: zap %llx %x\n",
1783
			 __func__, gfn, sp->role.word);
1784
		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1785
	}
1786
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
1787 1788
}

1789
static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
A
Avi Kivity 已提交
1790
{
1791
	int slot = memslot_id(kvm, gfn);
1792
	struct kvm_mmu_page *sp = page_header(__pa(pte));
A
Avi Kivity 已提交
1793

1794
	__set_bit(slot, sp->slot_bitmap);
A
Avi Kivity 已提交
1795 1796
}

1797 1798 1799 1800 1801 1802 1803 1804 1805 1806
static void mmu_convert_notrap(struct kvm_mmu_page *sp)
{
	int i;
	u64 *pt = sp->spt;

	if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
		return;

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
		if (pt[i] == shadow_notrap_nonpresent_pte)
A
Avi Kivity 已提交
1807
			__set_spte(&pt[i], shadow_trap_nonpresent_pte);
1808 1809 1810
	}
}

1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903
/*
 * The function is based on mtrr_type_lookup() in
 * arch/x86/kernel/cpu/mtrr/generic.c
 */
static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
			 u64 start, u64 end)
{
	int i;
	u64 base, mask;
	u8 prev_match, curr_match;
	int num_var_ranges = KVM_NR_VAR_MTRR;

	if (!mtrr_state->enabled)
		return 0xFF;

	/* Make end inclusive end, instead of exclusive */
	end--;

	/* Look in fixed ranges. Just return the type as per start */
	if (mtrr_state->have_fixed && (start < 0x100000)) {
		int idx;

		if (start < 0x80000) {
			idx = 0;
			idx += (start >> 16);
			return mtrr_state->fixed_ranges[idx];
		} else if (start < 0xC0000) {
			idx = 1 * 8;
			idx += ((start - 0x80000) >> 14);
			return mtrr_state->fixed_ranges[idx];
		} else if (start < 0x1000000) {
			idx = 3 * 8;
			idx += ((start - 0xC0000) >> 12);
			return mtrr_state->fixed_ranges[idx];
		}
	}

	/*
	 * Look in variable ranges
	 * Look of multiple ranges matching this address and pick type
	 * as per MTRR precedence
	 */
	if (!(mtrr_state->enabled & 2))
		return mtrr_state->def_type;

	prev_match = 0xFF;
	for (i = 0; i < num_var_ranges; ++i) {
		unsigned short start_state, end_state;

		if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
			continue;

		base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
		       (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
		mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
		       (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);

		start_state = ((start & mask) == (base & mask));
		end_state = ((end & mask) == (base & mask));
		if (start_state != end_state)
			return 0xFE;

		if ((start & mask) != (base & mask))
			continue;

		curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
		if (prev_match == 0xFF) {
			prev_match = curr_match;
			continue;
		}

		if (prev_match == MTRR_TYPE_UNCACHABLE ||
		    curr_match == MTRR_TYPE_UNCACHABLE)
			return MTRR_TYPE_UNCACHABLE;

		if ((prev_match == MTRR_TYPE_WRBACK &&
		     curr_match == MTRR_TYPE_WRTHROUGH) ||
		    (prev_match == MTRR_TYPE_WRTHROUGH &&
		     curr_match == MTRR_TYPE_WRBACK)) {
			prev_match = MTRR_TYPE_WRTHROUGH;
			curr_match = MTRR_TYPE_WRTHROUGH;
		}

		if (prev_match != curr_match)
			return MTRR_TYPE_UNCACHABLE;
	}

	if (prev_match != 0xFF)
		return prev_match;

	return mtrr_state->def_type;
}

u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	u8 mtrr;

	mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
			     (gfn << PAGE_SHIFT) + PAGE_SIZE);
	if (mtrr == 0xfe || mtrr == 0xff)
		mtrr = MTRR_TYPE_WRBACK;
	return mtrr;
}
EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);

static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
	trace_kvm_mmu_unsync_page(sp);
	++vcpu->kvm->stat.mmu_unsync;
	sp->unsync = 1;

	kvm_mmu_mark_parents_unsync(sp);
	mmu_convert_notrap(sp);
}

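/*
 * Mark every valid indirect shadow page for this gfn as unsync, so guest
 * writes to the page table can go through without write protection until
 * the next sync.
 */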
static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
{
	struct kvm_mmu_page *s;
	struct hlist_node *node;

	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
		if (s->unsync)
			continue;
		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
		__kvm_unsync_page(vcpu, s);
	}
}

static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
				  bool can_unsync)
{
	struct kvm_mmu_page *s;
	struct hlist_node *node;
	bool need_unsync = false;

	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
		if (!can_unsync)
			return 1;

		if (s->role.level != PT_PAGE_TABLE_LEVEL)
			return 1;

		if (!need_unsync && !s->unsync) {
			if (!oos_shadow)
				return 1;
			need_unsync = true;
		}
	}
	if (need_unsync)
		kvm_unsync_pages(vcpu, gfn);
	return 0;
}

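/*
 * Compute and install a shadow pte for the given gfn/pfn. Returns non-zero
 * when the spte could not be made writable (a large page overlapping a
 * write-protected gfn, or a shadowed page table that must stay read-only),
 * in which case the caller flushes the TLB and may need to emulate the
 * access.
 */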
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
		    unsigned pte_access, int user_fault,
		    int write_fault, int dirty, int level,
		    gfn_t gfn, pfn_t pfn, bool speculative,
		    bool can_unsync, bool reset_host_protection)
{
	u64 spte;
	int ret = 0;

	/*
	 * We don't set the accessed bit, since we sometimes want to see
	 * whether the guest actually used the pte (in order to detect
	 * demand paging).
	 */
	spte = shadow_base_present_pte;
	if (!speculative)
		spte |= shadow_accessed_mask;
	if (!dirty)
		pte_access &= ~ACC_WRITE_MASK;
	if (pte_access & ACC_EXEC_MASK)
		spte |= shadow_x_mask;
	else
		spte |= shadow_nx_mask;
	if (pte_access & ACC_USER_MASK)
		spte |= shadow_user_mask;
	if (level > PT_PAGE_TABLE_LEVEL)
		spte |= PT_PAGE_SIZE_MASK;
	if (tdp_enabled)
		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
			kvm_is_mmio_pfn(pfn));

	if (reset_host_protection)
		spte |= SPTE_HOST_WRITEABLE;

	spte |= (u64)pfn << PAGE_SHIFT;

	if ((pte_access & ACC_WRITE_MASK)
	    || (!vcpu->arch.mmu.direct_map && write_fault
		&& !is_write_protection(vcpu) && !user_fault)) {

		if (level > PT_PAGE_TABLE_LEVEL &&
		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
			ret = 1;
			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
			goto done;
		}

		spte |= PT_WRITABLE_MASK;

		if (!vcpu->arch.mmu.direct_map
		    && !(pte_access & ACC_WRITE_MASK))
			spte &= ~PT_USER_MASK;

		/*
		 * Optimization: for pte sync, if spte was writable the hash
		 * lookup is unnecessary (and expensive). Write protection
		 * is the responsibility of mmu_get_page / kvm_sync_page.
		 * Same reasoning can be applied to dirty page accounting.
		 */
		if (!can_unsync && is_writable_pte(*sptep))
			goto set_pte;

		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
			pgprintk("%s: found shadow page for %llx, marking ro\n",
				 __func__, gfn);
			ret = 1;
			pte_access &= ~ACC_WRITE_MASK;
			if (is_writable_pte(spte))
				spte &= ~PT_WRITABLE_MASK;
		}
	}

	if (pte_access & ACC_WRITE_MASK)
		mark_page_dirty(vcpu->kvm, gfn);

set_pte:
	update_spte(sptep, spte);
done:
	return ret;
}

static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
			 unsigned pt_access, unsigned pte_access,
			 int user_fault, int write_fault, int dirty,
			 int *ptwrite, int level, gfn_t gfn,
			 pfn_t pfn, bool speculative,
			 bool reset_host_protection)
{
	int was_rmapped = 0;
	int rmap_count;

	pgprintk("%s: spte %llx access %x write_fault %d"
		 " user_fault %d gfn %llx\n",
		 __func__, *sptep, pt_access,
		 write_fault, user_fault, gfn);

	if (is_rmap_spte(*sptep)) {
		/*
		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
		 * the parent of the now unreachable PTE.
		 */
		if (level > PT_PAGE_TABLE_LEVEL &&
		    !is_large_pte(*sptep)) {
			struct kvm_mmu_page *child;
			u64 pte = *sptep;

			child = page_header(pte & PT64_BASE_ADDR_MASK);
			mmu_page_remove_parent_pte(child, sptep);
			__set_spte(sptep, shadow_trap_nonpresent_pte);
			kvm_flush_remote_tlbs(vcpu->kvm);
		} else if (pfn != spte_to_pfn(*sptep)) {
			pgprintk("hfn old %llx new %llx\n",
				 spte_to_pfn(*sptep), pfn);
			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
			kvm_flush_remote_tlbs(vcpu->kvm);
		} else
			was_rmapped = 1;
	}

	if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
		      dirty, level, gfn, pfn, speculative, true,
		      reset_host_protection)) {
		if (write_fault)
			*ptwrite = 1;
		kvm_mmu_flush_tlb(vcpu);
	}

	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
		 is_large_pte(*sptep)? "2MB" : "4kB",
		 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
		 *sptep, sptep);
	if (!was_rmapped && is_large_pte(*sptep))
		++vcpu->kvm->stat.lpages;

	page_header_update_slot(vcpu->kvm, sptep, gfn);
	if (!was_rmapped) {
		rmap_count = rmap_add(vcpu, sptep, gfn);
		if (rmap_count > RMAP_RECYCLE_THRESHOLD)
			rmap_recycle(vcpu, sptep, gfn);
	}
	kvm_release_pfn_clean(pfn);
	if (speculative) {
		vcpu->arch.last_pte_updated = sptep;
		vcpu->arch.last_pte_gfn = gfn;
	}
}

static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}

static struct kvm_memory_slot *
pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
{
	struct kvm_memory_slot *slot;

	slot = gfn_to_memslot(vcpu->kvm, gfn);
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
	      (no_dirty_log && slot->dirty_bitmap))
		slot = NULL;

	return slot;
}

static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
				     bool no_dirty_log)
{
	struct kvm_memory_slot *slot;
	unsigned long hva;

	slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
	if (!slot) {
		get_page(bad_page);
		return page_to_pfn(bad_page);
	}

	hva = gfn_to_hva_memslot(slot, gfn);

	return hva_to_pfn_atomic(vcpu->kvm, hva);
}

static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp,
				    u64 *start, u64 *end)
{
	struct page *pages[PTE_PREFETCH_NUM];
	unsigned access = sp->role.access;
	int i, ret;
	gfn_t gfn;

	gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
	if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK))
		return -1;

	ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
	if (ret <= 0)
		return -1;

	for (i = 0; i < ret; i++, gfn++, start++)
		mmu_set_spte(vcpu, start, ACC_ALL,
			     access, 0, 0, 1, NULL,
			     sp->role.level, gfn,
			     page_to_pfn(pages[i]), true, true);

	return 0;
}

static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *sp, u64 *sptep)
{
	u64 *spte, *start = NULL;
	int i;

	WARN_ON(!sp->role.direct);

	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
	spte = sp->spt + i;

	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
		if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
			if (!start)
				continue;
			if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
				break;
			start = NULL;
		} else if (!start)
			start = spte;
	}
}

static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
{
	struct kvm_mmu_page *sp;

	/*
	 * Since there is no accessed bit on EPT, there is no way to
	 * distinguish between actually accessed translations
	 * and prefetched ones, so disable pte prefetch if EPT is
	 * enabled.
	 */
	if (!shadow_accessed_mask)
		return;

	sp = page_header(__pa(sptep));
	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
		return;

	__direct_pte_prefetch(vcpu, sp, sptep);
}

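/*
 * Walk the shadow page table for a direct (tdp or non-paging) mapping and
 * install the final spte at @level, allocating intermediate shadow pages
 * with kvm_mmu_get_page() as needed along the way.
 */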
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
			int level, gfn_t gfn, pfn_t pfn)
{
	struct kvm_shadow_walk_iterator iterator;
	struct kvm_mmu_page *sp;
	int pt_write = 0;
	gfn_t pseudo_gfn;

	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
		if (iterator.level == level) {
			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
				     0, write, 1, &pt_write,
				     level, gfn, pfn, false, true);
			direct_pte_prefetch(vcpu, iterator.sptep);
			++vcpu->stat.pf_fixed;
			break;
		}

		if (*iterator.sptep == shadow_trap_nonpresent_pte) {
			u64 base_addr = iterator.addr;

			base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
			pseudo_gfn = base_addr >> PAGE_SHIFT;
			sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
					      iterator.level - 1,
					      1, ACC_ALL, iterator.sptep);
			if (!sp) {
				pgprintk("nonpaging_map: ENOMEM\n");
				kvm_release_pfn_clean(pfn);
				return -ENOMEM;
			}

			__set_spte(iterator.sptep,
				   __pa(sp->spt)
				   | PT_PRESENT_MASK | PT_WRITABLE_MASK
				   | shadow_user_mask | shadow_x_mask
				   | shadow_accessed_mask);
		}
	}
	return pt_write;
}

static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo	= SIGBUS;
	info.si_errno	= 0;
	info.si_code	= BUS_MCEERR_AR;
	info.si_addr	= (void __user *)address;
	info.si_addr_lsb = PAGE_SHIFT;

	send_sig_info(SIGBUS, &info, tsk);
}

static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
{
	kvm_release_pfn_clean(pfn);
	if (is_hwpoison_pfn(pfn)) {
		kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
		return 0;
	} else if (is_fault_pfn(pfn))
		return -EFAULT;

	return 1;
}

static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
	int r;
	int level;
	pfn_t pfn;
	unsigned long mmu_seq;

	level = mapping_level(vcpu, gfn);

	/*
	 * This path builds a PAE pagetable - so we can map 2mb pages at
	 * maximum. Therefore check if the level is larger than that.
	 */
	if (level > PT_DIRECTORY_LEVEL)
		level = PT_DIRECTORY_LEVEL;

	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();
	pfn = gfn_to_pfn(vcpu->kvm, gfn);

	/* mmio */
	if (is_error_pfn(pfn))
		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);

	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu, mmu_seq))
		goto out_unlock;
	kvm_mmu_free_some_pages(vcpu);
	r = __direct_map(vcpu, v, write, level, gfn, pfn);
	spin_unlock(&vcpu->kvm->mmu_lock);

	return r;

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return 0;
}


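/*
 * Drop the reference this vcpu holds on its shadow root(s). Roots that were
 * already marked invalid and have no remaining users are zapped here.
 */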
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
	int i;
	struct kvm_mmu_page *sp;
	LIST_HEAD(invalid_list);

	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
		return;
	spin_lock(&vcpu->kvm->mmu_lock);
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
	    (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
	     vcpu->arch.mmu.direct_map)) {
		hpa_t root = vcpu->arch.mmu.root_hpa;

		sp = page_header(root);
		--sp->root_count;
		if (!sp->root_count && sp->role.invalid) {
			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
			kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
		}
		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
		spin_unlock(&vcpu->kvm->mmu_lock);
		return;
	}
	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->arch.mmu.pae_root[i];

		if (root) {
			root &= PT64_BASE_ADDR_MASK;
			sp = page_header(root);
			--sp->root_count;
			if (!sp->root_count && sp->role.invalid)
				kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
							 &invalid_list);
		}
		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
	}
	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
	spin_unlock(&vcpu->kvm->mmu_lock);
	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
}

static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
{
	int ret = 0;

	if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		ret = 1;
	}

	return ret;
}

static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;
	unsigned i;

	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		spin_lock(&vcpu->kvm->mmu_lock);
		kvm_mmu_free_some_pages(vcpu);
		sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
				      1, ACC_ALL, NULL);
		++sp->root_count;
		spin_unlock(&vcpu->kvm->mmu_lock);
		vcpu->arch.mmu.root_hpa = __pa(sp->spt);
	} else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
		for (i = 0; i < 4; ++i) {
			hpa_t root = vcpu->arch.mmu.pae_root[i];

			ASSERT(!VALID_PAGE(root));
			spin_lock(&vcpu->kvm->mmu_lock);
			kvm_mmu_free_some_pages(vcpu);
			sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
					      i << 30,
					      PT32_ROOT_LEVEL, 1, ACC_ALL,
					      NULL);
			root = __pa(sp->spt);
			++sp->root_count;
			spin_unlock(&vcpu->kvm->mmu_lock);
			vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
		}
		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
	} else
		BUG();

	return 0;
}

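/*
 * Allocate shadow roots for a guest that uses paging: a single
 * write-protected root for a 64-bit guest, or four PAE roots (one per
 * pdpte, or per 1GB of address space for a 2-level guest) otherwise. When
 * a 32-bit guest runs under a long-mode shadow, an extra lm_root level is
 * stacked on top of the pae_root page.
 */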
static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;
	u64 pdptr, pm_mask;
	gfn_t root_gfn;
	int i;

	root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;

	if (mmu_check_root(vcpu, root_gfn))
		return 1;

	/*
	 * Do we shadow a long mode page table? If so we need to
	 * write-protect the guest's page table root.
	 */
	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;

		ASSERT(!VALID_PAGE(root));

		spin_lock(&vcpu->kvm->mmu_lock);
		kvm_mmu_free_some_pages(vcpu);
		sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
				      0, ACC_ALL, NULL);
		root = __pa(sp->spt);
		++sp->root_count;
		spin_unlock(&vcpu->kvm->mmu_lock);
		vcpu->arch.mmu.root_hpa = root;
		return 0;
	}

	/*
	 * We shadow a 32 bit page table. This may be a legacy 2-level
	 * or a PAE 3-level page table. In either case we need to be aware that
	 * the shadow page table may be a PAE or a long mode page table.
	 */
	pm_mask = PT_PRESENT_MASK;
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;

	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->arch.mmu.pae_root[i];

		ASSERT(!VALID_PAGE(root));
		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
			pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
			if (!is_present_gpte(pdptr)) {
				vcpu->arch.mmu.pae_root[i] = 0;
				continue;
			}
			root_gfn = pdptr >> PAGE_SHIFT;
			if (mmu_check_root(vcpu, root_gfn))
				return 1;
		}
		spin_lock(&vcpu->kvm->mmu_lock);
		kvm_mmu_free_some_pages(vcpu);
		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
				      PT32_ROOT_LEVEL, 0,
				      ACC_ALL, NULL);
		root = __pa(sp->spt);
		++sp->root_count;
		spin_unlock(&vcpu->kvm->mmu_lock);

		vcpu->arch.mmu.pae_root[i] = root | pm_mask;
	}
	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);

	/*
	 * If we shadow a 32 bit page table with a long mode page
	 * table we enter this path.
	 */
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		if (vcpu->arch.mmu.lm_root == NULL) {
			/*
			 * The additional page necessary for this is only
			 * allocated on demand.
			 */

			u64 *lm_root;

			lm_root = (void*)get_zeroed_page(GFP_KERNEL);
			if (lm_root == NULL)
				return 1;

			lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;

			vcpu->arch.mmu.lm_root = lm_root;
		}

		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
	}

	return 0;
}

static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.mmu.direct_map)
		return mmu_alloc_direct_roots(vcpu);
	else
		return mmu_alloc_shadow_roots(vcpu);
}

static void mmu_sync_roots(struct kvm_vcpu *vcpu)
{
	int i;
	struct kvm_mmu_page *sp;

	if (vcpu->arch.mmu.direct_map)
		return;

	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
		return;

	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;
		sp = page_header(root);
		mmu_sync_children(vcpu, sp);
		return;
	}
	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->arch.mmu.pae_root[i];

		if (root && VALID_PAGE(root)) {
			root &= PT64_BASE_ADDR_MASK;
			sp = page_header(root);
			mmu_sync_children(vcpu, sp);
		}
	}
	trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
}

void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
{
	spin_lock(&vcpu->kvm->mmu_lock);
	mmu_sync_roots(vcpu);
	spin_unlock(&vcpu->kvm->mmu_lock);
}

static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
				  u32 access, u32 *error)
{
	if (error)
		*error = 0;
	return vaddr;
}

static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
					 u32 access, u32 *error)
{
	if (error)
		*error = 0;
	return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
}

static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
				u32 error_code, bool no_apf)
{
	gfn_t gfn;
	int r;

	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	ASSERT(vcpu);
	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));

	gfn = gva >> PAGE_SHIFT;

	return nonpaging_map(vcpu, gva & PAGE_MASK,
			     error_code & PFERR_WRITE_MASK, gfn);
}

static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
{
	struct kvm_arch_async_pf arch;
	arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
	arch.gfn = gfn;

	return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
}

static bool can_do_async_pf(struct kvm_vcpu *vcpu)
{
	if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
		     kvm_event_needs_reinjection(vcpu)))
		return false;

	return kvm_x86_ops->interrupt_allowed(vcpu);
}

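/*
 * Translate gfn to pfn, preferring the non-blocking path. If the page is
 * not immediately available and async page faults are usable, queue an
 * async fault and return true so the caller can let the guest run;
 * otherwise fall back to the blocking gfn_to_pfn().
 */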
static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
			 gva_t gva, pfn_t *pfn)
{
	bool async;

	*pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async);

	if (!async)
		return false; /* *pfn has correct page already */

	put_page(pfn_to_page(*pfn));

	if (!no_apf && can_do_async_pf(vcpu)) {
		trace_kvm_try_async_get_page(async, *pfn);
		if (kvm_find_async_pf_gfn(vcpu, gfn)) {
			trace_kvm_async_pf_doublefault(gva, gfn);
			kvm_make_request(KVM_REQ_APF_HALT, vcpu);
			return true;
		} else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
			return true;
	}

	*pfn = gfn_to_pfn(vcpu->kvm, gfn);

	return false;
}

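/* Page fault handler for two-dimensional paging (EPT/NPT) faults. */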
static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
			  bool no_apf)
{
	pfn_t pfn;
	int r;
	int level;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	unsigned long mmu_seq;

	ASSERT(vcpu);
	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	level = mapping_level(vcpu, gfn);

	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();

	if (try_async_pf(vcpu, no_apf, gfn, gpa, &pfn))
		return 0;

	/* mmio */
	if (is_error_pfn(pfn))
		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu, mmu_seq))
		goto out_unlock;
	kvm_mmu_free_some_pages(vcpu);
	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
			 level, gfn, pfn);
	spin_unlock(&vcpu->kvm->mmu_lock);

	return r;

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return 0;
}

static void nonpaging_free(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}

static int nonpaging_init_context(struct kvm_vcpu *vcpu,
				  struct kvm_mmu *context)
{
	context->new_cr3 = nonpaging_new_cr3;
	context->page_fault = nonpaging_page_fault;
	context->gva_to_gpa = nonpaging_gva_to_gpa;
	context->free = nonpaging_free;
	context->prefetch_page = nonpaging_prefetch_page;
	context->sync_page = nonpaging_sync_page;
	context->invlpg = nonpaging_invlpg;
	context->root_level = 0;
	context->shadow_root_level = PT32E_ROOT_LEVEL;
	context->root_hpa = INVALID_PAGE;
	context->direct_map = true;
	context->nx = false;
	return 0;
}

void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
	++vcpu->stat.tlb_flush;
	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}

static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
	pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
	mmu_free_roots(vcpu);
}

static unsigned long get_cr3(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.cr3;
}

static void inject_page_fault(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu.inject_page_fault(vcpu);
}

static void paging_free(struct kvm_vcpu *vcpu)
{
	nonpaging_free(vcpu);
}

static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
{
	int bit7;

	bit7 = (gpte >> 7) & 1;
	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
}

#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"
#undef PTTYPE

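/*
 * Fill in rsvd_bits_mask[x][level-1] for the guest paging mode: row 0 is
 * used for non-leaf/4K entries and row 1 for large-page entries, so a
 * guest pte is checked against the mask selected by its bit 7.
 */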
static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
				  struct kvm_mmu *context,
				  int level)
{
	int maxphyaddr = cpuid_maxphyaddr(vcpu);
	u64 exb_bit_rsvd = 0;

	if (!context->nx)
		exb_bit_rsvd = rsvd_bits(63, 63);
	switch (level) {
	case PT32_ROOT_LEVEL:
		/* no rsvd bits for 2 level 4K page table entries */
		context->rsvd_bits_mask[0][1] = 0;
		context->rsvd_bits_mask[0][0] = 0;
		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];

		if (!is_pse(vcpu)) {
			context->rsvd_bits_mask[1][1] = 0;
			break;
		}

		if (is_cpuid_PSE36())
			/* 36bits PSE 4MB page */
			context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
		else
			/* 32 bits PSE 4MB page */
			context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
		break;
	case PT32E_ROOT_LEVEL:
		context->rsvd_bits_mask[0][2] =
			rsvd_bits(maxphyaddr, 63) |
			rsvd_bits(7, 8) | rsvd_bits(1, 2);	/* PDPTE */
		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 62);	/* PDE */
		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 62);	/* PTE */
		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 62) |
			rsvd_bits(13, 20);		/* large page */
		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
		break;
	case PT64_ROOT_LEVEL:
		context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
		context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 51);
		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 51);
		context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
		context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 51) |
			rsvd_bits(13, 29);
		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
			rsvd_bits(maxphyaddr, 51) |
			rsvd_bits(13, 20);		/* large page */
		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
		break;
	}
}

static int paging64_init_context_common(struct kvm_vcpu *vcpu,
					struct kvm_mmu *context,
					int level)
{
	context->nx = is_nx(vcpu);

	reset_rsvds_bits_mask(vcpu, context, level);

	ASSERT(is_pae(vcpu));
	context->new_cr3 = paging_new_cr3;
	context->page_fault = paging64_page_fault;
	context->gva_to_gpa = paging64_gva_to_gpa;
	context->prefetch_page = paging64_prefetch_page;
	context->sync_page = paging64_sync_page;
	context->invlpg = paging64_invlpg;
	context->free = paging_free;
	context->root_level = level;
	context->shadow_root_level = level;
	context->root_hpa = INVALID_PAGE;
	context->direct_map = false;
	return 0;
}

static int paging64_init_context(struct kvm_vcpu *vcpu,
				 struct kvm_mmu *context)
{
	return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
}

static int paging32_init_context(struct kvm_vcpu *vcpu,
				 struct kvm_mmu *context)
{
	context->nx = false;

	reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);

	context->new_cr3 = paging_new_cr3;
	context->page_fault = paging32_page_fault;
	context->gva_to_gpa = paging32_gva_to_gpa;
	context->free = paging_free;
	context->prefetch_page = paging32_prefetch_page;
	context->sync_page = paging32_sync_page;
	context->invlpg = paging32_invlpg;
	context->root_level = PT32_ROOT_LEVEL;
	context->shadow_root_level = PT32E_ROOT_LEVEL;
	context->root_hpa = INVALID_PAGE;
	context->direct_map = false;
	return 0;
}

static int paging32E_init_context(struct kvm_vcpu *vcpu,
				  struct kvm_mmu *context)
{
	return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
}

static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = vcpu->arch.walk_mmu;

	context->new_cr3 = nonpaging_new_cr3;
	context->page_fault = tdp_page_fault;
	context->free = nonpaging_free;
	context->prefetch_page = nonpaging_prefetch_page;
	context->sync_page = nonpaging_sync_page;
	context->invlpg = nonpaging_invlpg;
	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
	context->root_hpa = INVALID_PAGE;
	context->direct_map = true;
	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
	context->get_cr3 = get_cr3;
	context->inject_page_fault = kvm_inject_page_fault;
	context->nx = is_nx(vcpu);

	if (!is_paging(vcpu)) {
		context->nx = false;
		context->gva_to_gpa = nonpaging_gva_to_gpa;
		context->root_level = 0;
	} else if (is_long_mode(vcpu)) {
		context->nx = is_nx(vcpu);
		reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
		context->gva_to_gpa = paging64_gva_to_gpa;
		context->root_level = PT64_ROOT_LEVEL;
	} else if (is_pae(vcpu)) {
		context->nx = is_nx(vcpu);
		reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
		context->gva_to_gpa = paging64_gva_to_gpa;
		context->root_level = PT32E_ROOT_LEVEL;
	} else {
		context->nx = false;
		reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
		context->gva_to_gpa = paging32_gva_to_gpa;
		context->root_level = PT32_ROOT_LEVEL;
	}

	return 0;
}

int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
{
	int r;
	ASSERT(vcpu);
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));

	if (!is_paging(vcpu))
		r = nonpaging_init_context(vcpu, context);
	else if (is_long_mode(vcpu))
		r = paging64_init_context(vcpu, context);
	else if (is_pae(vcpu))
		r = paging32E_init_context(vcpu, context);
	else
		r = paging32_init_context(vcpu, context);

	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);

	return r;
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);

static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
	int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);

	vcpu->arch.walk_mmu->set_cr3           = kvm_x86_ops->set_cr3;
	vcpu->arch.walk_mmu->get_cr3           = get_cr3;
	vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;

	return r;
}

static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;

	g_context->get_cr3           = get_cr3;
	g_context->inject_page_fault = kvm_inject_page_fault;

	/*
	 * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
	 * translation of l2_gpa to l1_gpa addresses is done using the
	 * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
	 * functions between mmu and nested_mmu are swapped.
	 */
	if (!is_paging(vcpu)) {
		g_context->nx = false;
		g_context->root_level = 0;
		g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
	} else if (is_long_mode(vcpu)) {
		g_context->nx = is_nx(vcpu);
		reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
		g_context->root_level = PT64_ROOT_LEVEL;
		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
	} else if (is_pae(vcpu)) {
		g_context->nx = is_nx(vcpu);
		reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
		g_context->root_level = PT32E_ROOT_LEVEL;
		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
	} else {
		g_context->nx = false;
		reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
		g_context->root_level = PT32_ROOT_LEVEL;
		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
	}

	return 0;
}

static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
	vcpu->arch.update_pte.pfn = bad_pfn;

	if (mmu_is_nested(vcpu))
		return init_kvm_nested_mmu(vcpu);
	else if (tdp_enabled)
		return init_kvm_tdp_mmu(vcpu);
	else
		return init_kvm_softmmu(vcpu);
}

static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
	if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
		/* mmu.free() should set root_hpa = INVALID_PAGE */
		vcpu->arch.mmu.free(vcpu);
}

int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
	destroy_kvm_mmu(vcpu);
	return init_kvm_mmu(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);

int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
	int r;

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		goto out;
	r = mmu_alloc_roots(vcpu);
	spin_lock(&vcpu->kvm->mmu_lock);
	mmu_sync_roots(vcpu);
	spin_unlock(&vcpu->kvm->mmu_lock);
	if (r)
		goto out;
	/* set_cr3() should ensure TLB has been flushed */
	vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
out:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_load);

void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_mmu_unload);

static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *sp,
				  u64 *spte)
{
	u64 pte;
	struct kvm_mmu_page *child;

	pte = *spte;
	if (is_shadow_present_pte(pte)) {
		if (is_last_spte(pte, sp->role.level))
			drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
		else {
			child = page_header(pte & PT64_BASE_ADDR_MASK);
			mmu_page_remove_parent_pte(child, spte);
		}
	}
	__set_spte(spte, shadow_trap_nonpresent_pte);
	if (is_large_pte(pte))
		--vcpu->kvm->stat.lpages;
}

static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *sp,
				  u64 *spte,
				  const void *new)
{
	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
		++vcpu->kvm->stat.mmu_pde_zapped;
		return;
	}

	if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
		return;

	++vcpu->kvm->stat.mmu_pte_updated;
	if (!sp->role.cr4_pae)
		paging32_update_pte(vcpu, sp, spte, new);
	else
		paging64_update_pte(vcpu, sp, spte, new);
}

static bool need_remote_flush(u64 old, u64 new)
{
	if (!is_shadow_present_pte(old))
		return false;
	if (!is_shadow_present_pte(new))
		return true;
	if ((old ^ new) & PT64_BASE_ADDR_MASK)
		return true;
	old ^= PT64_NX_MASK;
	new ^= PT64_NX_MASK;
	return (old & ~new & PT64_PERM_MASK) != 0;
}

static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
				    bool remote_flush, bool local_flush)
{
	if (zap_page)
		return;

	if (remote_flush)
		kvm_flush_remote_tlbs(vcpu->kvm);
	else if (local_flush)
		kvm_mmu_flush_tlb(vcpu);
}

static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
{
	u64 *spte = vcpu->arch.last_pte_updated;

	return !!(spte && (*spte & shadow_accessed_mask));
}

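/*
 * Speculatively look up the pfn for the gpte being written so the new
 * mapping can be installed during the pte-write update without taking the
 * fault path later; the result is stashed in vcpu->arch.update_pte.
 */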
static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
					  u64 gpte)
{
	gfn_t gfn;
	pfn_t pfn;

	if (!is_present_gpte(gpte))
		return;
	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;

	vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();
	pfn = gfn_to_pfn(vcpu->kvm, gfn);

	if (is_error_pfn(pfn)) {
		kvm_release_pfn_clean(pfn);
		return;
	}
	vcpu->arch.update_pte.gfn = gfn;
	vcpu->arch.update_pte.pfn = pfn;
}

static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	u64 *spte = vcpu->arch.last_pte_updated;

	if (spte
	    && vcpu->arch.last_pte_gfn == gfn
	    && shadow_accessed_mask
	    && !(*spte & shadow_accessed_mask)
	    && is_shadow_present_pte(*spte))
		set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
}

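/*
 * Emulated write to a guest page table page. Shadow pages for the written
 * gfn are updated in place when the write is well aligned; a misaligned
 * write, or a gfn that keeps getting rewritten (write flood), causes the
 * shadow page to be zapped instead, on the assumption that the page is no
 * longer being used as a page table.
 */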
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
		       const u8 *new, int bytes,
		       bool guest_initiated)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	union kvm_mmu_page_role mask = { .word = 0 };
	struct kvm_mmu_page *sp;
	struct hlist_node *node;
	LIST_HEAD(invalid_list);
	u64 entry, gentry;
	u64 *spte;
	unsigned offset = offset_in_page(gpa);
	unsigned pte_size;
	unsigned page_offset;
	unsigned misaligned;
	unsigned quadrant;
	int level;
	int flooded = 0;
	int npte;
	int r;
	int invlpg_counter;
	bool remote_flush, local_flush, zap_page;

	zap_page = remote_flush = local_flush = false;

	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);

	invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);

	/*
	 * Assume that the pte write is on a page table of the same type
	 * as the current vcpu paging mode.  This is nearly always true
	 * (might be false while changing modes).  Note it is verified later
	 * by update_pte().
	 */
	if ((is_pae(vcpu) && bytes == 4) || !new) {
		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
		if (is_pae(vcpu)) {
			gpa &= ~(gpa_t)7;
			bytes = 8;
		}
		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
		if (r)
			gentry = 0;
		new = (const u8 *)&gentry;
	}

	switch (bytes) {
	case 4:
		gentry = *(const u32 *)new;
		break;
	case 8:
		gentry = *(const u64 *)new;
		break;
	default:
		gentry = 0;
		break;
	}

	mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
	spin_lock(&vcpu->kvm->mmu_lock);
	if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
		gentry = 0;
	kvm_mmu_access_page(vcpu, gfn);
	kvm_mmu_free_some_pages(vcpu);
	++vcpu->kvm->stat.mmu_pte_write;
	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
	if (guest_initiated) {
		if (gfn == vcpu->arch.last_pt_write_gfn
		    && !last_updated_pte_accessed(vcpu)) {
			++vcpu->arch.last_pt_write_count;
			if (vcpu->arch.last_pt_write_count >= 3)
				flooded = 1;
		} else {
			vcpu->arch.last_pt_write_gfn = gfn;
			vcpu->arch.last_pt_write_count = 1;
			vcpu->arch.last_pte_updated = NULL;
		}
	}

	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
		pte_size = sp->role.cr4_pae ? 8 : 4;
		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
		misaligned |= bytes < 4;
		if (misaligned || flooded) {
			/*
			 * Misaligned accesses are too much trouble to fix
			 * up; also, they usually indicate a page is not used
			 * as a page table.
			 *
			 * If we're seeing too many writes to a page,
			 * it may no longer be a page table, or we may be
			 * forking, in which case it is better to unmap the
			 * page.
			 */
			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
				 gpa, bytes, sp->role.word);
			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
						     &invalid_list);
			++vcpu->kvm->stat.mmu_flooded;
			continue;
		}
		page_offset = offset;
		level = sp->role.level;
		npte = 1;
		if (!sp->role.cr4_pae) {
			page_offset <<= 1;	/* 32->64 */
			/*
			 * A 32-bit pde maps 4MB while the shadow pdes map
			 * only 2MB.  So we need to double the offset again
			 * and zap two pdes instead of one.
			 */
			if (level == PT32_ROOT_LEVEL) {
				page_offset &= ~7; /* kill rounding error */
				page_offset <<= 1;
				npte = 2;
			}
			quadrant = page_offset >> PAGE_SHIFT;
			page_offset &= ~PAGE_MASK;
			if (quadrant != sp->role.quadrant)
				continue;
		}
		local_flush = true;
		spte = &sp->spt[page_offset / sizeof(*spte)];
		while (npte--) {
			entry = *spte;
			mmu_pte_write_zap_pte(vcpu, sp, spte);
			if (gentry &&
			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
			      & mask.word))
				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
			if (!remote_flush && need_remote_flush(entry, *spte))
				remote_flush = true;
			++spte;
		}
	}
	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
	spin_unlock(&vcpu->kvm->mmu_lock);
	if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
		kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
		vcpu->arch.update_pte.pfn = bad_pfn;
	}
}

int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
	gpa_t gpa;
	int r;

	if (vcpu->arch.mmu.direct_map)
		return 0;

	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);

	spin_lock(&vcpu->kvm->mmu_lock);
	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
	spin_unlock(&vcpu->kvm->mmu_lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);

void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
	LIST_HEAD(invalid_list);

	while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
	       !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
		struct kvm_mmu_page *sp;

		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
				  struct kvm_mmu_page, link);
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
		++vcpu->kvm->stat.mmu_recycled;
	}
}

int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
{
	int r;
	enum emulation_result er;

	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
	if (r < 0)
		goto out;

	if (!r) {
		r = 1;
		goto out;
	}

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		goto out;

	er = emulate_instruction(vcpu, cr2, error_code, 0);

	switch (er) {
	case EMULATE_DONE:
		return 1;
	case EMULATE_DO_MMIO:
		++vcpu->stat.mmio_exits;
		/* fall through */
	case EMULATE_FAIL:
		return 0;
	default:
		BUG();
	}
out:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);

void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
	vcpu->arch.mmu.invlpg(vcpu, gva);
	kvm_mmu_flush_tlb(vcpu);
	++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);

void kvm_enable_tdp(void)
{
	tdp_enabled = true;
}
EXPORT_SYMBOL_GPL(kvm_enable_tdp);

void kvm_disable_tdp(void)
{
	tdp_enabled = false;
}
EXPORT_SYMBOL_GPL(kvm_disable_tdp);

static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
	free_page((unsigned long)vcpu->arch.mmu.pae_root);
	if (vcpu->arch.mmu.lm_root != NULL)
		free_page((unsigned long)vcpu->arch.mmu.lm_root);
}

static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
	struct page *page;
	int i;

	ASSERT(vcpu);

	/*
	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
	 * Therefore we need to allocate shadow page tables in the first
	 * 4GB of memory, which happens to fit the DMA32 zone.
	 */
	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
	if (!page)
		return -ENOMEM;

	vcpu->arch.mmu.pae_root = page_address(page);
	for (i = 0; i < 4; ++i)
		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;

	return 0;
}

int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));

	return alloc_mmu_pages(vcpu);
}

int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));

	return init_kvm_mmu(vcpu);
}

void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
	struct kvm_mmu_page *sp;

	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
		int i;
		u64 *pt;

		if (!test_bit(slot, sp->slot_bitmap))
			continue;

		pt = sp->spt;
		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
			/* avoid RMW */
			if (is_writable_pte(pt[i]))
				pt[i] &= ~PT_WRITABLE_MASK;
	}
	kvm_flush_remote_tlbs(kvm);
}

void kvm_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *sp, *node;
	LIST_HEAD(invalid_list);

	spin_lock(&kvm->mmu_lock);
restart:
	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
			goto restart;

	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	spin_unlock(&kvm->mmu_lock);
}

static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
					       struct list_head *invalid_list)
{
	struct kvm_mmu_page *page;

	page = container_of(kvm->arch.active_mmu_pages.prev,
			    struct kvm_mmu_page, link);
	return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
}

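/*
 * Memory shrinker callback: when asked to scan, pick one VM and zap a
 * shadow page from it, then rotate that VM to the tail of vm_list so the
 * pressure is spread across guests. Returns the global count of shadow
 * pages currently in use.
 */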
static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
{
	struct kvm *kvm;
	struct kvm *kvm_freed = NULL;

	if (nr_to_scan == 0)
		goto out;

	spin_lock(&kvm_lock);

	list_for_each_entry(kvm, &vm_list, vm_list) {
		int idx, freed_pages;
		LIST_HEAD(invalid_list);

		idx = srcu_read_lock(&kvm->srcu);
		spin_lock(&kvm->mmu_lock);
		if (!kvm_freed && nr_to_scan > 0 &&
		    kvm->arch.n_used_mmu_pages > 0) {
			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
							  &invalid_list);
			kvm_freed = kvm;
		}
		nr_to_scan--;

		kvm_mmu_commit_zap_page(kvm, &invalid_list);
		spin_unlock(&kvm->mmu_lock);
		srcu_read_unlock(&kvm->srcu, idx);
	}
	if (kvm_freed)
		list_move_tail(&kvm_freed->vm_list, &vm_list);

	spin_unlock(&kvm_lock);

out:
	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
}

static struct shrinker mmu_shrinker = {
	.shrink = mmu_shrink,
	.seeks = DEFAULT_SEEKS * 10,
};

static void mmu_destroy_caches(void)
{
	if (pte_chain_cache)
		kmem_cache_destroy(pte_chain_cache);
	if (rmap_desc_cache)
		kmem_cache_destroy(rmap_desc_cache);
	if (mmu_page_header_cache)
		kmem_cache_destroy(mmu_page_header_cache);
}

void kvm_mmu_module_exit(void)
{
	mmu_destroy_caches();
	percpu_counter_destroy(&kvm_total_used_mmu_pages);
	unregister_shrinker(&mmu_shrinker);
}

int kvm_mmu_module_init(void)
{
	pte_chain_cache = kmem_cache_create("kvm_pte_chain",
					    sizeof(struct kvm_pte_chain),
					    0, 0, NULL);
	if (!pte_chain_cache)
		goto nomem;
	rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
					    sizeof(struct kvm_rmap_desc),
					    0, 0, NULL);
	if (!rmap_desc_cache)
		goto nomem;

	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
						  sizeof(struct kvm_mmu_page),
						  0, 0, NULL);
	if (!mmu_page_header_cache)
		goto nomem;

	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
		goto nomem;

	register_shrinker(&mmu_shrinker);

	return 0;

nomem:
	mmu_destroy_caches();
	return -ENOMEM;
}

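/*
 * Sizing example (assuming the usual defaults of KVM_PERMILLE_MMU_PAGES
 * being 20 and KVM_MIN_ALLOC_MMU_PAGES being 64): a guest with 1,048,576
 * pages of memory (4GB) is allowed 1048576 * 20 / 1000 = 20971 shadow pages.
 */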
/*
 * Calculate mmu pages needed for kvm.
 */
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
{
	int i;
	unsigned int nr_mmu_pages;
	unsigned int  nr_pages = 0;
	struct kvm_memslots *slots;

	slots = kvm_memslots(kvm);

	for (i = 0; i < slots->nmemslots; i++)
		nr_pages += slots->memslots[i].npages;

	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
	nr_mmu_pages = max(nr_mmu_pages,
			(unsigned int) KVM_MIN_ALLOC_MMU_PAGES);

	return nr_mmu_pages;
}

static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
				unsigned len)
{
	if (len > buffer->len)
		return NULL;
	return buffer->ptr;
}

static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
				unsigned len)
{
	void *ret;

	ret = pv_mmu_peek_buffer(buffer, len);
	if (!ret)
		return ret;
	buffer->ptr += len;
	buffer->len -= len;
	buffer->processed += len;
	return ret;
}

static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
			     gpa_t addr, gpa_t value)
{
	int bytes = 8;
	int r;

	if (!is_long_mode(vcpu) && !is_pae(vcpu))
		bytes = 4;

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	if (!emulator_write_phys(vcpu, addr, &value, bytes))
		return -EFAULT;

	return 1;
}

static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
	(void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
	return 1;
}

static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
{
	spin_lock(&vcpu->kvm->mmu_lock);
	mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
	spin_unlock(&vcpu->kvm->mmu_lock);
	return 1;
}

static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
			     struct kvm_pv_mmu_op_buffer *buffer)
{
	struct kvm_mmu_op_header *header;

	header = pv_mmu_peek_buffer(buffer, sizeof *header);
	if (!header)
		return 0;
	switch (header->op) {
	case KVM_MMU_OP_WRITE_PTE: {
		struct kvm_mmu_op_write_pte *wpte;

		wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
		if (!wpte)
			return 0;
		return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
					wpte->pte_val);
	}
	case KVM_MMU_OP_FLUSH_TLB: {
		struct kvm_mmu_op_flush_tlb *ftlb;

		ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
		if (!ftlb)
			return 0;
		return kvm_pv_mmu_flush_tlb(vcpu);
	}
	case KVM_MMU_OP_RELEASE_PT: {
		struct kvm_mmu_op_release_pt *rpt;

		rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
		if (!rpt)
			return 0;
		return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
	}
	default: return 0;
	}
}

int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
		  gpa_t addr, unsigned long *ret)
{
	int r;
	struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;

	buffer->ptr = buffer->buf;
	buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
	buffer->processed = 0;

	r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
	if (r)
		goto out;

	while (buffer->len) {
		r = kvm_pv_mmu_op_one(vcpu, buffer);
		if (r < 0)
			goto out;
		if (r == 0)
			break;
	}

	r = 1;
out:
	*ret = buffer->processed;
	return r;
}

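/*
 * Copy the sptes encountered while walking the shadow page table for @addr
 * into @sptes (indexed by level - 1) and return how many levels had an
 * entry; the walk stops at the first non-present spte.
 */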
int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
{
	struct kvm_shadow_walk_iterator iterator;
	int nr_sptes = 0;

	spin_lock(&vcpu->kvm->mmu_lock);
	for_each_shadow_entry(vcpu, addr, iterator) {
		sptes[iterator.level-1] = *iterator.sptep;
		nr_sptes++;
		if (!is_shadow_present_pte(*iterator.sptep))
			break;
	}
	spin_unlock(&vcpu->kvm->mmu_lock);

	return nr_sptes;
}
EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);

#ifdef CONFIG_KVM_MMU_AUDIT
#include "mmu_audit.c"
#else
static void mmu_audit_disable(void) { }
#endif

void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);

	destroy_kvm_mmu(vcpu);
	free_mmu_pages(vcpu);
	mmu_free_memory_caches(vcpu);
	mmu_audit_disable();
}