// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

18
#include "irq.h"
19
#include "ioapic.h"
20
#include "mmu.h"
21
#include "mmu_internal.h"
22
#include "tdp_mmu.h"
23
#include "x86.h"
A
Avi Kivity 已提交
24
#include "kvm_cache_regs.h"
25
#include "kvm_emulate.h"
26
#include "cpuid.h"
27
#include "spte.h"
A
Avi Kivity 已提交
28

29
#include <linux/kvm_host.h>
A
Avi Kivity 已提交
30 31 32 33
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
34 35
#include <linux/moduleparam.h>
#include <linux/export.h>
36
#include <linux/swap.h>
M
Marcelo Tosatti 已提交
37
#include <linux/hugetlb.h>
38
#include <linux/compiler.h>
39
#include <linux/srcu.h>
40
#include <linux/slab.h>
41
#include <linux/sched/signal.h>
42
#include <linux/uaccess.h>
43
#include <linux/hash.h>
44
#include <linux/kern_levels.h>
45
#include <linux/kthread.h>
A
Avi Kivity 已提交
46

A
Avi Kivity 已提交
47
#include <asm/page.h>
48
#include <asm/memtype.h>
A
Avi Kivity 已提交
49
#include <asm/cmpxchg.h>
50
#include <asm/io.h>
51
#include <asm/set_memory.h>
52
#include <asm/vmx.h>
53
#include <asm/kvm_page_track.h>
54
#include "trace.h"
A
Avi Kivity 已提交
55

56 57
#include "paging.h"

P
Paolo Bonzini 已提交
58 59
extern bool itlb_multihit_kvm_mitigation;

60
int __read_mostly nx_huge_pages = -1;
61 62 63 64
#ifdef CONFIG_PREEMPT_RT
/* Recovery can cause latency spikes, disable it for PREEMPT_RT.  */
static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
#else
65
static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
66
#endif
P
Paolo Bonzini 已提交
67 68

static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
69
static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
P
Paolo Bonzini 已提交
70

71
static const struct kernel_param_ops nx_huge_pages_ops = {
P
Paolo Bonzini 已提交
72 73 74 75
	.set = set_nx_huge_pages,
	.get = param_get_bool,
};

76
static const struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
77 78 79 80
	.set = set_nx_huge_pages_recovery_ratio,
	.get = param_get_uint,
};

P
Paolo Bonzini 已提交
81 82
module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
__MODULE_PARM_TYPE(nx_huge_pages, "bool");
83 84 85
module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
		&nx_huge_pages_recovery_ratio, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
P
Paolo Bonzini 已提交
86

87 88 89
static bool __read_mostly force_flush_and_sync_on_reuse;
module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);

90 91 92 93 94 95 96
/*
 * When setting this variable to true it enables Two-Dimensional-Paging
 * where the hardware walks 2 page tables:
 * 1. the guest-virtual to guest-physical
 * 2. while doing 1. it walks guest-physical to host-physical
 * If the hardware supports that we don't need to do shadow paging.
 */
97
bool tdp_enabled = false;
98

99
static int max_huge_page_level __read_mostly;
100
static int max_tdp_level __read_mostly;
101

102 103 104 105
/* Audit points: a PRE/POST pair for page fault, PTE write and sync. */
enum {
	AUDIT_PRE_PAGE_FAULT,
	AUDIT_POST_PAGE_FAULT,
	AUDIT_PRE_PTE_WRITE,
	AUDIT_POST_PTE_WRITE,
	AUDIT_PRE_SYNC,
	AUDIT_POST_SYNC
};
110 111

#ifdef MMU_DEBUG
/* Debug switch exposed as the "dbg" module parameter; only built with MMU_DEBUG. */
bool dbg = false;
module_param(dbg, bool, 0644);
#endif
A
Avi Kivity 已提交
115

116 117
#define PTE_PREFETCH_NUM		8

A
Avi Kivity 已提交
118 119 120
/* Number of address bits consumed per level by the PT32_* helpers. */
#define PT32_LEVEL_BITS 10

/*
 * Shift that brings the index for @level down to bit 0.  @level is
 * parenthesized so that an argument containing a lower-precedence operator
 * (e.g. a conditional expression) is evaluated before the subtraction.
 */
#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + ((level) - 1) * PT32_LEVEL_BITS)
A
Avi Kivity 已提交
122

123 124 125
/*
 * Mask of the bits below the shift for @level, restricted to
 * PT32_BASE_ADDR_MASK.
 */
#define PT32_LVL_OFFSET_MASK(level) \
	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT32_LEVEL_BITS))) - 1))
A
Avi Kivity 已提交
126 127 128 129 130 131 132 133

/* Extract the PT32_LEVEL_BITS-wide index for @level from @address. */
#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


/*
 * Address masks for the PT32_* helpers: the base mask is PAGE_MASK, the
 * directory and per-level masks additionally clear the bits below the
 * corresponding level shift.
 */
#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
					    * PT32_LEVEL_BITS))) - 1))
A
Avi Kivity 已提交
137

138 139
#include <trace/events/kvm.h>

140
/* make pte_list_desc fit well in cache lines */
141
#define PTE_LIST_EXT 14
142

143 144 145 146 147
/*
 * Slight optimization of cacheline layout, by putting `more' and `spte_count'
 * at the start; then accessing it will only use one single cacheline for
 * either full (entries==PTE_LIST_EXT) case or entries<=6.
 */
148 149
struct pte_list_desc {
	struct pte_list_desc *more;
150 151 152 153 154 155
	/*
	 * Stores number of entries stored in the pte_list_desc.  No need to be
	 * u64 but just for easier alignment.  When PTE_LIST_EXT, means full.
	 */
	u64 spte_count;
	u64 *sptes[PTE_LIST_EXT];
156 157
};

158 159 160 161
/* Cursor state used by the for_each_shadow_entry*() walkers. */
struct kvm_shadow_walk_iterator {
	u64 addr;
	hpa_t shadow_addr;
	u64 *sptep;
	int level;
	unsigned index;
};

166 167 168 169 170 171 172
/*
 * Iterate @_walker over the shadow entries for @_addr, starting from the
 * explicitly supplied @_root.  Built from shadow_walk_init_using_root(),
 * shadow_walk_okay() and shadow_walk_next().
 */
#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
	for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
					 (_root), (_addr));                \
	     shadow_walk_okay(&(_walker));			           \
	     shadow_walk_next(&(_walker)))

/*
 * Iterate @_walker over the shadow entries for @_addr.  Built from
 * shadow_walk_init(), shadow_walk_okay() and shadow_walk_next().
 */
#define for_each_shadow_entry(_vcpu, _addr, _walker)            \
	for (shadow_walk_init(&(_walker), (_vcpu), (_addr));	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))

177 178 179 180 181 182
/*
 * Like for_each_shadow_entry(), but on every step the current entry is read
 * with mmu_spte_get_lockless() into @spte, and that value is what
 * __shadow_walk_next() is given to advance.  All macro arguments are
 * parenthesized so that non-trivial expressions expand correctly.
 */
#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
	for (shadow_walk_init(&(_walker), (_vcpu), (_addr));		\
	     shadow_walk_okay(&(_walker)) &&				\
		({ (spte) = mmu_spte_get_lockless((_walker).sptep); 1; });	\
	     __shadow_walk_next(&(_walker), (spte)))

183
static struct kmem_cache *pte_list_desc_cache;
184
struct kmem_cache *mmu_page_header_cache;
185
static struct percpu_counter kvm_total_used_mmu_pages;
186

187
static void mmu_spte_set(u64 *sptep, u64 spte);
188 189
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
190

191 192 193 194 195 196
/*
 * Snapshot of the CR0, CR4 and EFER values that are used as input when
 * computing an MMU role.  All members are const, so an instance must be
 * fully initialized when it is created.
 */
struct kvm_mmu_role_regs {
	const unsigned long cr0;
	const unsigned long cr4;
	const u64 efer;
};

197 198 199
#define CREATE_TRACE_POINTS
#include "mmutrace.h"

200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
/*
 * Yes, lots of underscores.  They're a hint that you probably shouldn't be
 * reading from the role_regs.  Once the mmu_role is constructed, it becomes
 * the single source of truth for the MMU's state.
 *
 * Each invocation below generates ____is_<reg>_<name>(), which returns true
 * when @flag is set in regs-><reg>.
 */
#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)			\
static inline bool ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\
{									\
	return !!(regs->reg & flag);					\
}
BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);

221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241
/*
 * The MMU itself (with a valid role) is the single source of truth for the
 * MMU.  Do not use the regs used to build the MMU/role, nor the vCPU.  The
 * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
 * and the vCPU may be incorrect/irrelevant.
 *
 * Each invocation below generates is_<reg>_<name>(), which returns the
 * <reg>_<name> field of either mmu->mmu_role.base or mmu->mmu_role.ext as
 * a bool.
 */
#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)		\
static inline bool is_##reg##_##name(struct kvm_mmu *mmu)	\
{								\
	return !!(mmu->mmu_role. base_or_ext . reg##_##name);	\
}
BUILD_MMU_ROLE_ACCESSOR(ext,  cr0, pg);
BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pse);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pae);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smep);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smap);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pke);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, la57);
BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);

242 243 244 245 246 247 248 249 250 251
static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_role_regs regs = {
		.cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
		.cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
		.efer = vcpu->arch.efer,
	};

	return regs;
}
252

253 254 255 256 257 258 259 260 261 262 263 264
/*
 * Map a register snapshot to a root level: 0 when CR0.PG is clear, a 5- or
 * 4-level 64-bit root when EFER.LMA is set (selected by CR4.LA57), otherwise
 * the PAE or plain 32-bit root depending on CR4.PAE.
 */
static int role_regs_to_root_level(struct kvm_mmu_role_regs *regs)
{
	if (!____is_cr0_pg(regs))
		return 0;

	if (____is_efer_lma(regs)) {
		if (____is_cr4_la57(regs))
			return PT64_ROOT_5LEVEL;
		return PT64_ROOT_4LEVEL;
	}

	return ____is_cr4_pae(regs) ? PT32E_ROOT_LEVEL : PT32_ROOT_LEVEL;
}
265 266 267

static inline bool kvm_available_flush_tlb_with_range(void)
{
268
	return kvm_x86_ops.tlb_remote_flush_with_range;
269 270 271 272 273 274 275
}

static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
		struct kvm_tlb_range *range)
{
	int ret = -ENOTSUPP;

276
	if (range && kvm_x86_ops.tlb_remote_flush_with_range)
277
		ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);
278 279 280 281 282

	if (ret)
		kvm_flush_remote_tlbs(kvm);
}

283
/*
 * Convenience wrapper: package @start_gfn and @pages into a kvm_tlb_range
 * and hand it to kvm_flush_remote_tlbs_with_range().
 */
void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
		u64 start_gfn, u64 pages)
{
	struct kvm_tlb_range range;

	range.start_gfn = start_gfn;
	range.pages = pages;

	kvm_flush_remote_tlbs_with_range(kvm, &range);
}

294 295 296
/*
 * Build an MMIO spte for @gfn/@access with make_mmio_spte(), emit the
 * tracepoint, and install it at @sptep via mmu_spte_set().
 */
static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
			   unsigned int access)
{
	u64 spte = make_mmio_spte(vcpu, gfn, access);

	trace_mark_mmio_spte(sptep, gfn, spte);
	mmu_spte_set(sptep, spte);
}

/*
 * Recover the gfn stored in an MMIO spte: the low part is taken with
 * shadow_nonpresent_or_rsvd_lower_gfn_mask, the remaining bits are shifted
 * down by SHADOW_NONPRESENT_OR_RSVD_MASK_LEN and masked with
 * shadow_nonpresent_or_rsvd_mask.  The combined gpa is converted to a gfn.
 */
static gfn_t get_mmio_spte_gfn(u64 spte)
{
	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;

	gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
	       & shadow_nonpresent_or_rsvd_mask;

	return gpa >> PAGE_SHIFT;
}

/* Returns the bits of @spte selected by shadow_mmio_access_mask. */
static unsigned get_mmio_spte_access(u64 spte)
{
	return spte & shadow_mmio_access_mask;
}

318
/*
 * Check whether the generation stored in an MMIO @spte matches the current
 * memslots generation (masked with MMIO_SPTE_GEN_MASK).  Returns false while
 * a memslot update is in progress.
 */
static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
	u64 kvm_gen, spte_gen, gen;

	gen = kvm_vcpu_memslots(vcpu)->generation;
	if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
		return false;

	kvm_gen = gen & MMIO_SPTE_GEN_MASK;
	spte_gen = get_mmio_spte_generation(spte);

	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
	return likely(kvm_gen == spte_gen);
}

333 334 335
/*
 * Identity gpa translation with a legality check.  Returns @gpa unchanged,
 * or UNMAPPED_GVA with PFERR_RSVD_MASK or'ed into @exception->error_code
 * when the address is illegal for the vCPU.  @access is unused here.
 */
static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
			   struct x86_exception *exception)
{
	/* Check if guest physical address doesn't exceed guest maximum */
	if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) {
		exception->error_code |= PFERR_RSVD_MASK;
		return UNMAPPED_GVA;
	}

	return gpa;
}

A
Avi Kivity 已提交
345 346 347 348 349
/* PSE36 is unconditionally reported as available; always returns 1. */
static int is_cpuid_PSE36(void)
{
	const int pse36_available = 1;

	return pse36_available;
}

350 351 352 353 354 355 356
/*
 * Take the PT32_DIR_PSE36_MASK bits of @gpte and shift them left by
 * (32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT) to form a gfn delta.
 */
static gfn_t pse36_gfn_delta(u32 gpte)
{
	return (gpte & PT32_DIR_PSE36_MASK) <<
	       (32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT);
}

357
#ifdef CONFIG_X86_64
A
Avi Kivity 已提交
358
/* 64-bit build: store @spte to *@sptep with a single WRITE_ONCE(). */
static void __set_spte(u64 *sptep, u64 spte)
{
	WRITE_ONCE(*sptep, spte);
}

363
/*
 * 64-bit build: overwrite *@sptep with @spte using WRITE_ONCE(); the old
 * value is not returned.
 */
static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	WRITE_ONCE(*sptep, spte);
}

/*
 * 64-bit build: exchange *@sptep with @spte via xchg() and return the value
 * that was previously stored.
 */
static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	u64 prev = xchg(sptep, spte);

	return prev;
}
372 373 374

/* 64-bit build: read *@sptep with a single READ_ONCE(). */
static u64 __get_spte_lockless(u64 *sptep)
{
	return READ_ONCE(*sptep);
}
377
#else
378 379 380 381 382 383 384
/*
 * View of a 64-bit spte as two 32-bit halves, used by the non-64-bit
 * implementations below to access the low and high words separately.
 */
union split_spte {
	struct {
		u32 spte_low;
		u32 spte_high;
	};
	u64 spte;
};
385

386 387
/*
 * Bump the owning shadow page's clear_spte_count when @spte, the value just
 * written to @sptep, is not shadow-present.  Does nothing for a present
 * @spte.
 */
static void count_spte_clear(u64 *sptep, u64 spte)
{
	struct kvm_mmu_page *sp =  sptep_to_sp(sptep);

	if (is_shadow_present_pte(spte))
		return;

	/* Ensure the spte is completely set before we increase the count */
	smp_wmb();
	sp->clear_spte_count++;
}

398 399 400
/*
 * Split (two 32-bit stores) version of __set_spte(): the high word is
 * written first, then the low word after a write barrier.
 */
static void __set_spte(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	ssptep->spte_high = sspte.spte_high;

	/*
	 * If we map the spte from nonpresent to present, We should store
	 * the high bits firstly, then set present bit, so cpu can not
	 * fetch this spte while we are setting the spte.
	 */
	smp_wmb();

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
}

417 418 419 420 421 422 423
/*
 * Split version of __update_clear_spte_fast(): the low word is written
 * first, then the high word after a write barrier, and finally
 * count_spte_clear() is called for the new value.
 */
static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);

	/*
	 * If we map the spte from present to nonpresent, we should clear
	 * present bit firstly to avoid vcpu fetch the old high bits.
	 */
	smp_wmb();

	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);
}

/*
 * Split version of __update_clear_spte_slow(): the low word is exchanged
 * with xchg(), the high word is then read and overwritten, and
 * count_spte_clear() is called.  Returns the old 64-bit value assembled from
 * the two halves.
 */
static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte, orig;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	/* xchg acts as a barrier before the setting of the high bits */
	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
	orig.spte_high = ssptep->spte_high;
	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);

	return orig.spte;
}
451 452 453

/*
 * The idea of reading the spte in this lightweight way on x86_32 comes from
 * gup_get_pte (mm/gup.c).
 *
 * An spte tlb flush may be pending, because kvm_set_pte_rmapp
 * coalesces them and we are running out of the MMU lock.  Therefore
 * we need to protect against in-progress updates of the spte.
 *
 * Reading the spte while an update is in progress may get the old value
 * for the high part of the spte.  The race is fine for a present->non-present
 * change (because the high part of the spte is ignored for non-present spte),
 * but for a present->present change we must reread the spte.
 *
 * All such changes are done in two steps (present->non-present and
 * non-present->present), hence it is enough to count the number of
 * present->non-present updates: if it changed while reading the spte,
 * we might have hit the race.  This is done using clear_spte_count.
 */
static u64 __get_spte_lockless(u64 *sptep)
{
	struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
	union split_spte spte, *orig = (union split_spte *)sptep;
	int count;

retry:
	count = sp->clear_spte_count;
	smp_rmb();

	spte.spte_low = orig->spte_low;
	smp_rmb();

	spte.spte_high = orig->spte_high;
	smp_rmb();

	/* Retry if the low word or the clear count changed during the read. */
	if (unlikely(spte.spte_low != orig->spte_low ||
	      count != sp->clear_spte_count))
		goto retry;

	return spte.spte;
}
492 493
#endif

494 495
/*
 * Returns true when @spte must be updated atomically: it is shadow-present
 * and either can be made writable locklessly, is an access-track spte, or
 * has A/D bits enabled with the accessed bit clear (or the dirty bit clear
 * on a writable spte).
 */
static bool spte_has_volatile_bits(u64 spte)
{
	if (!is_shadow_present_pte(spte))
		return false;

	/*
	 * Always atomically update spte if it can be updated
	 * out of mmu-lock, it can ensure dirty bit is not lost,
	 * also, it can help us to get a stable is_writable_pte()
	 * to ensure tlb flush is not missed.
	 */
	if (spte_can_locklessly_be_made_writable(spte) ||
	    is_access_track_spte(spte))
		return true;

	if (spte_ad_enabled(spte)) {
		if ((spte & shadow_accessed_mask) == 0 ||
		    (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
			return true;
	}

	return false;
}

518 519 520 521 522 523 524 525 526 527 528 529
/* Rules for using mmu_spte_set:
 * Set the sptep from nonpresent to present.
 * Note: the sptep being assigned *must* be either not present
 * or in a state where the hardware will not attempt to update
 * the spte.
 *
 * Warns if the current value at @sptep is shadow-present, then stores
 * @new_spte with __set_spte().
 */
static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
	WARN_ON(is_shadow_present_pte(*sptep));
	__set_spte(sptep, new_spte);
}

530 531 532
/*
 * Update the SPTE (excluding the PFN), but do not track changes in its
 * accessed/dirty status.
533
 */
534
static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
535
{
536
	u64 old_spte = *sptep;
537

538
	WARN_ON(!is_shadow_present_pte(new_spte));
539

540 541
	if (!is_shadow_present_pte(old_spte)) {
		mmu_spte_set(sptep, new_spte);
542
		return old_spte;
543
	}
544

545
	if (!spte_has_volatile_bits(old_spte))
546
		__update_clear_spte_fast(sptep, new_spte);
547
	else
548
		old_spte = __update_clear_spte_slow(sptep, new_spte);
549

550 551
	WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));

552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573
	return old_spte;
}

/* Rules for using mmu_spte_update:
 * Update the state bits, it means the mapped pfn is not changed.
 *
 * Whenever we overwrite a writable spte with a read-only one we
 * should flush remote TLBs. Otherwise rmap_write_protect
 * will find a read-only spte, even though the writable spte
 * might be cached on a CPU's TLB, the return value indicates this
 * case.
 *
 * Returns true if the TLB needs to be flushed
 */
static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
	bool flush = false;
	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);

	if (!is_shadow_present_pte(old_spte))
		return false;

	/*
	 * For the spte updated out of mmu-lock is safe, since
	 * we always atomically update it, see the comments in
	 * spte_has_volatile_bits().
	 */
	if (spte_can_locklessly_be_made_writable(old_spte) &&
	      !is_writable_pte(new_spte))
		flush = true;

	/*
	 * Flush TLB when accessed/dirty states are changed in the page tables,
	 * to guarantee consistency between TLB and page tables.
	 */

	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
	}

	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
	}

	return flush;
}

601 602 603 604
/*
 * Rules for using mmu_spte_clear_track_bits:
 * It sets the sptep from present to nonpresent, and track the
 * state bits, it is used to clear the last level sptep.
605
 * Returns the old PTE.
606
 */
607
static u64 mmu_spte_clear_track_bits(u64 *sptep)
608
{
D
Dan Williams 已提交
609
	kvm_pfn_t pfn;
610 611 612
	u64 old_spte = *sptep;

	if (!spte_has_volatile_bits(old_spte))
613
		__update_clear_spte_fast(sptep, 0ull);
614
	else
615
		old_spte = __update_clear_spte_slow(sptep, 0ull);
616

617
	if (!is_shadow_present_pte(old_spte))
618
		return old_spte;
619 620

	pfn = spte_to_pfn(old_spte);
621 622 623 624 625 626

	/*
	 * KVM does not hold the refcount of the page used by
	 * kvm mmu, before reclaiming the page, we should
	 * unmap it from mmu first.
	 */
627
	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
628

629
	if (is_accessed_spte(old_spte))
630
		kvm_set_pfn_accessed(pfn);
631 632

	if (is_dirty_spte(old_spte))
633
		kvm_set_pfn_dirty(pfn);
634

635
	return old_spte;
636 637 638 639 640 641 642 643 644
}

/*
 * Rules for using mmu_spte_clear_no_track:
 * Directly clear spte without caring the state bits of sptep,
 * it is used to set the upper level spte.
 */
static void mmu_spte_clear_no_track(u64 *sptep)
{
	__update_clear_spte_fast(sptep, 0ull);
}

648 649 650 651 652
/* Read the spte at @sptep via __get_spte_lockless() and return its value. */
static u64 mmu_spte_get_lockless(u64 *sptep)
{
	return __get_spte_lockless(sptep);
}

653 654 655 656
/*
 * Restore an acc-track PTE back to a regular PTE.
 *
 * The saved bits are extracted from @spte, the access-track mask and the
 * saved-bits field are cleared, and the saved bits are or'ed back in at
 * their original position.  Warns (once) if @spte has A/D bits enabled or
 * is not an access-track spte.
 */
static u64 restore_acc_track_spte(u64 spte)
{
	u64 new_spte = spte;
	u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
			 & SHADOW_ACC_TRACK_SAVED_BITS_MASK;

	WARN_ON_ONCE(spte_ad_enabled(spte));
	WARN_ON_ONCE(!is_access_track_spte(spte));

	new_spte &= ~shadow_acc_track_mask;
	new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
		      SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
	new_spte |= saved_bits;

	return new_spte;
}

671 672 673 674 675 676 677 678
/*
 * Returns the Accessed status of the PTE and resets it at the same time.
 *
 * With A/D bits enabled the accessed bit is cleared in place; otherwise the
 * spte is converted to an access-track spte.
 */
static bool mmu_spte_age(u64 *sptep)
{
	u64 spte = mmu_spte_get_lockless(sptep);

	if (!is_accessed_spte(spte))
		return false;

	if (spte_ad_enabled(spte)) {
		clear_bit((ffs(shadow_accessed_mask) - 1),
			  (unsigned long *)sptep);
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(spte))
			kvm_set_pfn_dirty(spte_to_pfn(spte));

		spte = mark_spte_for_access_track(spte);
		mmu_spte_update_no_track(sptep, spte);
	}

	return true;
}

697 698
static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
699 700 701 702 703 704 705 706
	if (is_tdp_mmu(vcpu->arch.mmu)) {
		kvm_tdp_mmu_walk_lockless_begin();
	} else {
		/*
		 * Prevent page table teardown by making any free-er wait during
		 * kvm_flush_remote_tlbs() IPI to all active vcpus.
		 */
		local_irq_disable();
707

708 709 710 711 712 713
		/*
		 * Make sure a following spte read is not reordered ahead of the write
		 * to vcpu->mode.
		 */
		smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
	}
714 715 716 717
}

static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
718 719 720 721 722 723 724 725 726 727 728
	if (is_tdp_mmu(vcpu->arch.mmu)) {
		kvm_tdp_mmu_walk_lockless_end();
	} else {
		/*
		 * Make sure the write to vcpu->mode is not reordered in front of
		 * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
		 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
		 */
		smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
		local_irq_enable();
	}
729 730
}

731
/*
 * Top up the vCPU's MMU memory caches.  The gfn array cache is only topped
 * up when @maybe_indirect is true.  Returns 0 on success, or the first
 * non-zero value returned by kvm_mmu_topup_memory_cache().
 */
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
{
	int r;

	/* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
	if (r)
		return r;
	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
				       PT64_ROOT_MAX_LEVEL);
	if (r)
		return r;
	if (maybe_indirect) {
		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache,
					       PT64_ROOT_MAX_LEVEL);
		if (r)
			return r;
	}
	return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
					  PT64_ROOT_MAX_LEVEL);
}

/* Free the four per-vCPU MMU memory caches filled by mmu_topup_memory_caches(). */
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}

762
static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
763
{
764
	return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
765 766
}

767
static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
768
{
769
	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
770 771
}

772 773 774 775 776 777 778 779 780 781
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
	if (!sp->role.direct)
		return sp->gfns[index];

	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
}

/*
 * Record @gfn for entry @index of shadow page @sp.
 *
 * Non-direct pages store the gfn in sp->gfns[].  Direct pages store nothing;
 * @gfn is only checked against the value kvm_mmu_page_get_gfn() computes,
 * and a mismatch triggers a warning plus a rate-limited error message.
 */
static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
{
	if (!sp->role.direct) {
		sp->gfns[index] = gfn;
		return;
	}

	if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
		pr_err_ratelimited("gfn mismatch under direct page %llx "
				   "(expected %llx, got %llx)\n",
				   sp->gfn,
				   kvm_mmu_page_get_gfn(sp, index), gfn);
}

M
Marcelo Tosatti 已提交
794
/*
795 796
 * Return the pointer to the large page information for a given gfn,
 * handling slots that are not large page aligned.
M
Marcelo Tosatti 已提交
797
 */
798
static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
799
		const struct kvm_memory_slot *slot, int level)
M
Marcelo Tosatti 已提交
800 801 802
{
	unsigned long idx;

803
	idx = gfn_to_index(gfn, slot->base_gfn, level);
804
	return &slot->arch.lpage_info[level - 2][idx];
M
Marcelo Tosatti 已提交
805 806
}

807
static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
808 809 810 811 812
					    gfn_t gfn, int count)
{
	struct kvm_lpage_info *linfo;
	int i;

813
	for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
814 815 816 817 818 819
		linfo = lpage_info_slot(gfn, slot, i);
		linfo->disallow_lpage += count;
		WARN_ON(linfo->disallow_lpage < 0);
	}
}

820
/* Increment the disallow_lpage counters for @gfn in @slot by one. */
void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
{
	update_gfn_disallow_lpage_count(slot, gfn, 1);
}

825
/* Decrement the disallow_lpage counters for @gfn in @slot by one. */
void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
{
	update_gfn_disallow_lpage_count(slot, gfn, -1);
}

830
static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
M
Marcelo Tosatti 已提交
831
{
832
	struct kvm_memslots *slots;
833
	struct kvm_memory_slot *slot;
834
	gfn_t gfn;
M
Marcelo Tosatti 已提交
835

836
	kvm->arch.indirect_shadow_pages++;
837
	gfn = sp->gfn;
838 839
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);
840 841

	/* the non-leaf shadow pages are keeping readonly. */
842
	if (sp->role.level > PG_LEVEL_4K)
843 844 845
		return kvm_slot_page_track_add_page(kvm, slot, gfn,
						    KVM_PAGE_TRACK_WRITE);

846
	kvm_mmu_gfn_disallow_lpage(slot, gfn);
M
Marcelo Tosatti 已提交
847 848
}

849
void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
P
Paolo Bonzini 已提交
850 851 852 853 854
{
	if (sp->lpage_disallowed)
		return;

	++kvm->stat.nx_lpage_splits;
855 856
	list_add_tail(&sp->lpage_disallowed_link,
		      &kvm->arch.lpage_disallowed_mmu_pages);
P
Paolo Bonzini 已提交
857 858 859
	sp->lpage_disallowed = true;
}

860
static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
M
Marcelo Tosatti 已提交
861
{
862
	struct kvm_memslots *slots;
863
	struct kvm_memory_slot *slot;
864
	gfn_t gfn;
M
Marcelo Tosatti 已提交
865

866
	kvm->arch.indirect_shadow_pages--;
867
	gfn = sp->gfn;
868 869
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);
870
	if (sp->role.level > PG_LEVEL_4K)
871 872 873
		return kvm_slot_page_track_remove_page(kvm, slot, gfn,
						       KVM_PAGE_TRACK_WRITE);

874
	kvm_mmu_gfn_allow_lpage(slot, gfn);
M
Marcelo Tosatti 已提交
875 876
}

877
void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
P
Paolo Bonzini 已提交
878 879 880
{
	--kvm->stat.nx_lpage_splits;
	sp->lpage_disallowed = false;
881
	list_del(&sp->lpage_disallowed_link);
P
Paolo Bonzini 已提交
882 883
}

884 885 886
/*
 * Look up the memslot for @gfn.  Returns NULL if there is no slot, the slot
 * is flagged KVM_MEMSLOT_INVALID, or @no_dirty_log is set and dirty tracking
 * is enabled on the slot.
 */
static struct kvm_memory_slot *
gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
			    bool no_dirty_log)
{
	struct kvm_memory_slot *slot;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return NULL;
	if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
		return NULL;

	return slot;
}

899
/*
900
 * About rmap_head encoding:
901
 *
902 903
 * If the bit zero of rmap_head->val is clear, then it points to the only spte
 * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
904
 * pte_list_desc containing more mappings.
905 906 907 908
 */

/*
 * Returns the number of pointers in the rmap chain, not counting the new one.
909
 */
910
static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
911
			struct kvm_rmap_head *rmap_head)
912
{
913
	struct pte_list_desc *desc;
914
	int count = 0;
915

916
	if (!rmap_head->val) {
917
		rmap_printk("%p %llx 0->1\n", spte, *spte);
918 919
		rmap_head->val = (unsigned long)spte;
	} else if (!(rmap_head->val & 1)) {
920
		rmap_printk("%p %llx 1->many\n", spte, *spte);
921
		desc = mmu_alloc_pte_list_desc(vcpu);
922
		desc->sptes[0] = (u64 *)rmap_head->val;
A
Avi Kivity 已提交
923
		desc->sptes[1] = spte;
924
		desc->spte_count = 2;
925
		rmap_head->val = (unsigned long)desc | 1;
926
		++count;
927
	} else {
928
		rmap_printk("%p %llx many->many\n", spte, *spte);
929
		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
930
		while (desc->spte_count == PTE_LIST_EXT) {
931
			count += PTE_LIST_EXT;
932 933 934
			if (!desc->more) {
				desc->more = mmu_alloc_pte_list_desc(vcpu);
				desc = desc->more;
935
				desc->spte_count = 0;
936 937
				break;
			}
938 939
			desc = desc->more;
		}
940 941
		count += desc->spte_count;
		desc->sptes[desc->spte_count++] = spte;
942
	}
943
	return count;
944 945
}

946
static void
947 948 949
pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
			   struct pte_list_desc *desc, int i,
			   struct pte_list_desc *prev_desc)
950
{
951
	int j = desc->spte_count - 1;
952

A
Avi Kivity 已提交
953 954
	desc->sptes[i] = desc->sptes[j];
	desc->sptes[j] = NULL;
955 956
	desc->spte_count--;
	if (desc->spte_count)
957 958
		return;
	if (!prev_desc && !desc->more)
959
		rmap_head->val = 0;
960 961 962 963
	else
		if (prev_desc)
			prev_desc->more = desc->more;
		else
964
			rmap_head->val = (unsigned long)desc->more | 1;
965
	mmu_free_pte_list_desc(desc);
966 967
}

968
/*
 * Remove @spte from the rmap chain at @rmap_head.  BUG()s if the chain is
 * empty or does not contain @spte.
 */
static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	struct pte_list_desc *prev_desc;
	int i;

	if (!rmap_head->val) {
		pr_err("%s: %p 0->BUG\n", __func__, spte);
		BUG();
	} else if (!(rmap_head->val & 1)) {
		rmap_printk("%p 1->0\n", spte);
		if ((u64 *)rmap_head->val != spte) {
			pr_err("%s:  %p 1->BUG\n", __func__, spte);
			BUG();
		}
		rmap_head->val = 0;
	} else {
		rmap_printk("%p many->many\n", spte);
		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
		prev_desc = NULL;
		while (desc) {
			for (i = 0; i < desc->spte_count; ++i) {
				if (desc->sptes[i] == spte) {
					pte_list_desc_remove_entry(rmap_head,
							desc, i, prev_desc);
					return;
				}
			}
			prev_desc = desc;
			desc = desc->more;
		}
		pr_err("%s: %p many->many\n", __func__, spte);
		BUG();
	}
}

1004 1005 1006 1007 1008 1009
/*
 * Clear the spte at @sptep (tracking its accessed/dirty bits) and then drop
 * it from the rmap chain at @rmap_head.
 */
static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
{
	mmu_spte_clear_track_bits(sptep);
	__pte_list_remove(sptep, rmap_head);
}

P
Peter Xu 已提交
1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037
/*
 * Tear down the whole rmap chain at @rmap_head: every spte it references is
 * cleared with mmu_spte_clear_track_bits(), any descriptors are freed, and
 * the head is reset to 0.
 *
 * Return true if rmap existed, false otherwise.
 */
static bool pte_list_destroy(struct kvm_rmap_head *rmap_head)
{
	if (!rmap_head->val)
		return false;

	if (rmap_head->val & 1) {
		struct pte_list_desc *cur, *following;
		int idx;

		cur = (struct pte_list_desc *)(rmap_head->val & ~1ul);
		while (cur) {
			for (idx = 0; idx < cur->spte_count; idx++)
				mmu_spte_clear_track_bits(cur->sptes[idx]);
			following = cur->more;
			mmu_free_pte_list_desc(cur);
			cur = following;
		}
	} else {
		/* Single-entry chain: the head itself is the spte pointer. */
		mmu_spte_clear_track_bits((u64 *)rmap_head->val);
	}

	/* rmap_head is meaningless now, remember to reset it */
	rmap_head->val = 0;
	return true;
}

1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057
/*
 * Return the number of sptes linked from @rmap_head: 0 for an empty head,
 * 1 for a direct spte pointer, otherwise the sum of spte_count over the
 * descriptor chain.
 */
unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	unsigned int total = 0;

	if (!rmap_head->val)
		return 0;

	if (!(rmap_head->val & 1))
		return 1;

	for (desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
	     desc; desc = desc->more)
		total += desc->spte_count;

	return total;
}

1058 1059
/* Return the rmap head tracking @gfn at page-table @level within @slot. */
static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
					 const struct kvm_memory_slot *slot)
{
	unsigned long idx = gfn_to_index(gfn, slot->base_gfn, level);

	/* The per-slot rmap arrays are indexed from PG_LEVEL_4K upwards. */
	return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
}

1067 1068
static bool rmap_can_add(struct kvm_vcpu *vcpu)
{
1069
	struct kvm_mmu_memory_cache *mc;
1070

1071
	mc = &vcpu->arch.mmu_pte_list_desc_cache;
1072
	return kvm_mmu_memory_cache_nr_free_objects(mc);
1073 1074
}

1075 1076
/*
 * Record @gfn for @spte in its shadow page, then link @spte into the rmap
 * of @gfn at the shadow page's level.  Returns the pte_list_add() result.
 */
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_mmu_page *sp = sptep_to_sp(spte);
	struct kvm_memory_slot *slot;

	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);

	return pte_list_add(vcpu, spte,
			    gfn_to_rmap(gfn, sp->role.level, slot));
}

1088

1089 1090
/* Unlink @spte from the rmap of the gfn it maps. */
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
	struct kvm_mmu_page *sp = sptep_to_sp(spte);
	gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;

	/*
	 * Unlike rmap_add and rmap_recycle, rmap_remove does not run in the
	 * context of a vCPU, so the memslots to use have to be determined
	 * from the context information in sp->role.
	 */
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);

	__pte_list_remove(spte, gfn_to_rmap(gfn, sp->role.level, slot));
}

1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125
/*
 * Cursor used by rmap_get_first()/rmap_get_next() to iterate through the
 * sptes linked by a rmap.  All fields are private to those functions and
 * must not be relied upon elsewhere.
 */
struct rmap_iterator {
	/* private fields */
	struct pte_list_desc *desc;	/* current descriptor; NULL if the rmap head points directly at a single sptep */
	int pos;			/* index of the current sptep within desc->sptes */
};

/*
 * Iteration must be started by this function.  This should also be used after
 * removing/dropping sptes from the rmap link because in such cases the
M
Miaohe Lin 已提交
1126
 * information in the iterator may not be valid.
1127 1128 1129
 *
 * Returns sptep if found, NULL otherwise.
 */
1130 1131
static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
			   struct rmap_iterator *iter)
1132
{
1133 1134
	u64 *sptep;

1135
	if (!rmap_head->val)
1136 1137
		return NULL;

1138
	if (!(rmap_head->val & 1)) {
1139
		iter->desc = NULL;
1140 1141
		sptep = (u64 *)rmap_head->val;
		goto out;
1142 1143
	}

1144
	iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1145
	iter->pos = 0;
1146 1147 1148 1149
	sptep = iter->desc->sptes[iter->pos];
out:
	BUG_ON(!is_shadow_present_pte(*sptep));
	return sptep;
1150 1151 1152 1153 1154 1155 1156 1157 1158
}

/*
 * Must be used with a valid iterator: e.g. after rmap_get_first().
 *
 * Returns sptep if found, NULL otherwise.
 */
static u64 *rmap_get_next(struct rmap_iterator *iter)
{
1159 1160
	u64 *sptep;

1161 1162 1163 1164 1165
	if (iter->desc) {
		if (iter->pos < PTE_LIST_EXT - 1) {
			++iter->pos;
			sptep = iter->desc->sptes[iter->pos];
			if (sptep)
1166
				goto out;
1167 1168 1169 1170 1171 1172 1173
		}

		iter->desc = iter->desc->more;

		if (iter->desc) {
			iter->pos = 0;
			/* desc->sptes[0] cannot be NULL */
1174 1175
			sptep = iter->desc->sptes[iter->pos];
			goto out;
1176 1177 1178 1179
		}
	}

	return NULL;
1180 1181 1182
out:
	BUG_ON(!is_shadow_present_pte(*sptep));
	return sptep;
1183 1184
}

1185 1186
/*
 * Walk every sptep linked from @_rmap_head_, using @_iter_ (a pointer to a
 * struct rmap_iterator) as the cursor and @_spte_ as the loop variable.
 */
#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)			\
	for (_spte_ = rmap_get_first(_rmap_head_, _iter_);		\
	     _spte_; _spte_ = rmap_get_next(_iter_))
1188

1189
/*
 * Clear @sptep and, if the old value was a present shadow pte, remove it
 * from its rmap as well.
 */
static void drop_spte(struct kvm *kvm, u64 *sptep)
{
	u64 old_spte = mmu_spte_clear_track_bits(sptep);

	if (!is_shadow_present_pte(old_spte))
		return;

	rmap_remove(kvm, sptep);
}

1197 1198 1199 1200

/*
 * Drop @sptep if it is a large pte and account for it in kvm->stat.lpages.
 * Returns true if a large spte was dropped, false otherwise.
 */
static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
{
	if (!is_large_pte(*sptep))
		return false;

	WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
	drop_spte(kvm, sptep);
	--kvm->stat.lpages;
	return true;
}

/*
 * Drop @sptep if it is a large pte and flush the remote TLBs for the gfn
 * range covered by its shadow page's level.
 */
static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
{
	struct kvm_mmu_page *sp;

	if (!__drop_large_spte(vcpu->kvm, sptep))
		return;

	sp = sptep_to_sp(sptep);
	kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
		KVM_PAGES_PER_HPAGE(sp->role.level));
}

/*
1221
 * Write-protect on the specified @sptep, @pt_protect indicates whether
1222
 * spte write-protection is caused by protecting shadow page table.
1223
 *
T
Tiejun Chen 已提交
1224
 * Note: write protection is difference between dirty logging and spte
1225 1226 1227 1228 1229
 * protection:
 * - for dirty logging, the spte can be set to writable at anytime if
 *   its dirty bitmap is properly set.
 * - for spte protection, the spte can be writable only after unsync-ing
 *   shadow page.
1230
 *
1231
 * Return true if tlb need be flushed.
1232
 */
1233
static bool spte_write_protect(u64 *sptep, bool pt_protect)
1234 1235 1236
{
	u64 spte = *sptep;

1237
	if (!is_writable_pte(spte) &&
1238
	      !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
1239 1240
		return false;

1241
	rmap_printk("spte %p %llx\n", sptep, *sptep);
1242

1243
	if (pt_protect)
1244
		spte &= ~shadow_mmu_writable_mask;
1245
	spte = spte & ~PT_WRITABLE_MASK;
1246

1247
	return mmu_spte_update(sptep, spte);
1248 1249
}

1250 1251
/*
 * Write-protect every spte linked from @rmap_head.  Returns true if any
 * spte_write_protect() call asked for a TLB flush.
 */
static bool __rmap_write_protect(struct kvm *kvm,
				 struct kvm_rmap_head *rmap_head,
				 bool pt_protect)
{
	struct rmap_iterator iter;
	bool flush = false;
	u64 *sptep;

	for_each_rmap_spte(rmap_head, &iter, sptep) {
		if (spte_write_protect(sptep, pt_protect))
			flush = true;
	}

	return flush;
}

1264
/*
 * Clear the dirty bit (shadow_dirty_mask) in @sptep.  Returns the
 * mmu_spte_update() result.
 */
static bool spte_clear_dirty(u64 *sptep)
{
	u64 spte = *sptep;

	rmap_printk("spte %p %llx\n", sptep, *sptep);

	MMU_WARN_ON(!spte_ad_enabled(spte));

	return mmu_spte_update(sptep, spte & ~shadow_dirty_mask);
}

1275
/*
 * Atomically clear the writable bit of @sptep.  If the spte was writable
 * and does not have A/D bits enabled, its pfn is marked dirty.
 * Returns true if the spte was writable.
 */
static bool spte_wrprot_for_clear_dirty(u64 *sptep)
{
	if (!test_and_clear_bit(PT_WRITABLE_SHIFT, (unsigned long *)sptep))
		return false;

	if (!spte_ad_enabled(*sptep))
		kvm_set_pfn_dirty(spte_to_pfn(*sptep));

	return true;
}

/*
 * Gets the GFN ready for another round of dirty logging by clearing the
 *	- D bit on ad-enabled SPTEs, and
 *	- W bit on ad-disabled SPTEs.
 * Returns true iff any D or W bits were cleared.
 */
1291
static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1292
			       const struct kvm_memory_slot *slot)
1293 1294 1295 1296 1297
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

1298
	for_each_rmap_spte(rmap_head, &iter, sptep)
1299 1300
		if (spte_ad_need_write_protect(*sptep))
			flush |= spte_wrprot_for_clear_dirty(sptep);
1301
		else
1302
			flush |= spte_clear_dirty(sptep);
1303 1304 1305 1306

	return flush;
}

1307
/**
1308
 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1309 1310 1311 1312 1313
 * @kvm: kvm instance
 * @slot: slot to protect
 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 * @mask: indicates which pages we should protect
 *
1314
 * Used when we do not need to care about huge page mappings.
1315
 */
1316
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1317 1318
				     struct kvm_memory_slot *slot,
				     gfn_t gfn_offset, unsigned long mask)
1319
{
1320
	struct kvm_rmap_head *rmap_head;
1321

1322
	if (is_tdp_mmu_enabled(kvm))
1323 1324
		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
				slot->base_gfn + gfn_offset, mask, true);
1325 1326 1327 1328

	if (!kvm_memslots_have_rmaps(kvm))
		return;

1329
	while (mask) {
1330 1331
		rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
					PG_LEVEL_4K, slot);
1332
		__rmap_write_protect(kvm, rmap_head, false);
M
Marcelo Tosatti 已提交
1333

1334 1335 1336
		/* clear the first set bit */
		mask &= mask - 1;
	}
1337 1338
}

1339
/**
1340 1341
 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
 * protect the page if the D-bit isn't supported.
1342 1343 1344 1345 1346 1347 1348
 * @kvm: kvm instance
 * @slot: slot to clear D-bit
 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 * @mask: indicates which pages we should clear D-bit
 *
 * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
 */
1349 1350 1351
static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
					 struct kvm_memory_slot *slot,
					 gfn_t gfn_offset, unsigned long mask)
1352
{
1353
	struct kvm_rmap_head *rmap_head;
1354

1355
	if (is_tdp_mmu_enabled(kvm))
1356 1357
		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
				slot->base_gfn + gfn_offset, mask, false);
1358 1359 1360 1361

	if (!kvm_memslots_have_rmaps(kvm))
		return;

1362
	while (mask) {
1363 1364
		rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
					PG_LEVEL_4K, slot);
1365
		__rmap_clear_dirty(kvm, rmap_head, slot);
1366 1367 1368 1369 1370 1371

		/* clear the first set bit */
		mask &= mask - 1;
	}
}

1372 1373 1374 1375 1376 1377 1378
/**
 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
 * PT level pages.
 * @kvm: kvm instance
 * @slot: slot containing the pages
 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 * @mask: indicates which pages dirty logging is (re-)enabled for
 *
 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
 * enable dirty logging for them, or kvm_mmu_clear_dirty_pt_masked when
 * kvm_x86_ops.cpu_dirty_log_size is non-zero.
 *
 * We need to care about huge page mappings: e.g. during dirty logging we may
 * have such mappings.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
				struct kvm_memory_slot *slot,
				gfn_t gfn_offset, unsigned long mask)
{
	/*
	 * Huge pages are NOT write protected when we start dirty logging in
	 * initially-all-set mode; must write protect them here so that they
	 * are split to 4K on the first write.
	 *
	 * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
	 * of memslot has no such restriction, so the range can cross two large
	 * pages.
	 */
	if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
		gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
		gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
		/* Mask selecting the 2M-page frame a gfn belongs to. */
		gfn_t hpage_mask = ~((gfn_t)(PMD_SIZE >> PAGE_SHIFT) - 1);

		kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);

		/*
		 * Cross two large pages?  Compare the rounded-DOWN frames:
		 * rounding up would treat an unaligned start and an end that
		 * is exactly the first gfn of the next large page as being in
		 * the same page, leaving that next page writable.
		 */
		if ((start & hpage_mask) != (end & hpage_mask))
			kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
						       PG_LEVEL_2M);
	}

	/* Now handle 4K PTEs.  */
	if (kvm_x86_ops.cpu_dirty_log_size)
		kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
	else
		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}

1415 1416
int kvm_cpu_dirty_log_size(void)
{
1417
	return kvm_x86_ops.cpu_dirty_log_size;
1418 1419
}

1420
/*
 * Write-protect @gfn in @slot at every level from @min_level up to
 * KVM_MAX_HUGEPAGE_LEVEL, in the rmaps (when the memslots have them) and in
 * the TDP MMU (when enabled).  Returns true if anything was write-protected.
 */
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
				    struct kvm_memory_slot *slot, u64 gfn,
				    int min_level)
{
	bool write_protected = false;
	int level;

	if (kvm_memslots_have_rmaps(kvm)) {
		for (level = min_level; level <= KVM_MAX_HUGEPAGE_LEVEL; ++level)
			write_protected |= __rmap_write_protect(kvm,
					gfn_to_rmap(gfn, level, slot), true);
	}

	if (is_tdp_mmu_enabled(kvm))
		write_protected |=
			kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);

	return write_protected;
}

1442 1443 1444 1445 1446
/*
 * Write-protect @gfn at all levels, starting from PG_LEVEL_4K, in the
 * memslot that @vcpu resolves @gfn to.
 */
static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
{
	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);

	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
}

1450
/*
 * Zap every spte linked from @rmap_head.  @kvm and @slot are not used.
 * Returns true if the rmap was non-empty.
 */
static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			  const struct kvm_memory_slot *slot)
{
	return pte_list_destroy(rmap_head);
}

1456 1457 1458
/*
 * rmap_handler_t adapter around kvm_zap_rmapp(); @gfn, @level and the pte
 * argument are ignored.
 */
static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			    struct kvm_memory_slot *slot, gfn_t gfn, int level,
			    pte_t unused)
{
	return kvm_zap_rmapp(kvm, rmap_head, slot);
}

1463 1464 1465
/*
 * rmap handler applying a changed host @pte to every spte in @rmap_head.
 *
 * If @pte is writable the sptes are removed from the rmap; otherwise each
 * spte is rebuilt to point at the new pfn.  When a ranged TLB flush is
 * available it is issued here and false is returned; otherwise the return
 * value tells the caller whether a flush is needed.
 */
static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			      struct kvm_memory_slot *slot, gfn_t gfn, int level,
			      pte_t pte)
{
	struct rmap_iterator iter;
	bool need_flush = false;
	kvm_pfn_t new_pfn;
	u64 new_spte;
	u64 *sptep;

	WARN_ON(pte_huge(pte));
	new_pfn = pte_pfn(pte);

restart:
	for_each_rmap_spte(rmap_head, &iter, sptep) {
		rmap_printk("spte %p %llx gfn %llx (%d)\n",
			    sptep, *sptep, gfn, level);

		need_flush = true;

		if (pte_write(pte)) {
			/* Removal invalidates the iterator: start over. */
			pte_list_remove(rmap_head, sptep);
			goto restart;
		}

		new_spte = kvm_mmu_changed_pte_notifier_make_spte(*sptep,
								  new_pfn);

		mmu_spte_clear_track_bits(sptep);
		mmu_spte_set(sptep, new_spte);
	}

	if (!need_flush)
		return false;

	if (kvm_available_flush_tlb_with_range()) {
		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
		return false;
	}

	return true;
}

1503 1504
struct slot_rmap_walk_iterator {
	/* input fields. */
1505
	const struct kvm_memory_slot *slot;
1506 1507 1508 1509 1510 1511 1512
	gfn_t start_gfn;
	gfn_t end_gfn;
	int start_level;
	int end_level;

	/* output fields. */
	gfn_t gfn;
1513
	struct kvm_rmap_head *rmap;
1514 1515 1516
	int level;

	/* private field. */
1517
	struct kvm_rmap_head *end_rmap;
1518 1519 1520 1521 1522 1523 1524
};

/*
 * Position @iterator at the first rmap head of @level and record the last
 * rmap head (the one for end_gfn) of that level.
 */
static void
rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
{
	const struct kvm_memory_slot *slot = iterator->slot;

	iterator->level = level;
	iterator->gfn = iterator->start_gfn;
	iterator->rmap = gfn_to_rmap(iterator->gfn, level, slot);
	iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, slot);
}

/*
 * Store the walk parameters in @iterator and position it at the first rmap
 * head of @start_level.
 */
static void
slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
		    const struct kvm_memory_slot *slot, int start_level,
		    int end_level, gfn_t start_gfn, gfn_t end_gfn)
{
	iterator->slot = slot;
	iterator->start_gfn = start_gfn;
	iterator->end_gfn = end_gfn;
	iterator->start_level = start_level;
	iterator->end_level = end_level;

	rmap_walk_init_level(iterator, iterator->start_level);
}

static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
{
	return !!iterator->rmap;
}

/*
 * Advance @iterator to the next rmap head of the current level, or to the
 * start of the next level once the current one is exhausted.  The rmap
 * pointer is set to NULL when all levels have been walked.
 */
static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
{
	iterator->rmap++;
	if (iterator->rmap <= iterator->end_rmap) {
		iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
		return;
	}

	iterator->level++;
	if (iterator->level > iterator->end_level) {
		iterator->rmap = NULL;
		return;
	}

	rmap_walk_init_level(iterator, iterator->level);
}

/*
 * Iterate over the rmap heads of @_slot_ covering gfns @_start_gfn through
 * @_end_gfn (inclusive), for every level from @_start_level_ to
 * @_end_level_.  @_iter_ is a pointer to a struct slot_rmap_walk_iterator
 * whose output fields (gfn, rmap, level) describe the current position.
 */
#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,	\
	   _start_gfn, _end_gfn, _iter_)				\
	for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,		\
				 _end_level_, _start_gfn, _end_gfn);	\
	     slot_rmap_walk_okay(_iter_);				\
	     slot_rmap_walk_next(_iter_))

1570 1571 1572
/*
 * Callback invoked by kvm_handle_gfn_range() for each rmap head in a gfn
 * range; the boolean results of all invocations are OR-ed together.
 */
typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			       struct kvm_memory_slot *slot, gfn_t gfn,
			       int level, pte_t pte);
1573

1574 1575 1576
/*
 * Call @handler on every rmap head of @range->slot covering
 * [range->start, range->end) at all levels from PG_LEVEL_4K up to
 * KVM_MAX_HUGEPAGE_LEVEL.  Returns true if any invocation returned true.
 */
static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
						 struct kvm_gfn_range *range,
						 rmap_handler_t handler)
{
	struct slot_rmap_walk_iterator iterator;
	bool ret = false;

	/* The walk takes an inclusive end gfn, hence "range->end - 1". */
	for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
				 range->start, range->end - 1, &iterator) {
		if (handler(kvm, iterator.rmap, range->slot, iterator.gfn,
			    iterator.level, range->pte))
			ret = true;
	}

	return ret;
}

1589
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1590
{
1591
	bool flush = false;
1592

1593 1594
	if (kvm_memslots_have_rmaps(kvm))
		flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
1595

1596
	if (is_tdp_mmu_enabled(kvm))
1597
		flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
1598

1599
	return flush;
1600 1601
}

1602
bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1603
{
1604
	bool flush = false;
1605

1606 1607
	if (kvm_memslots_have_rmaps(kvm))
		flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
1608

1609
	if (is_tdp_mmu_enabled(kvm))
1610
		flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
1611

1612
	return flush;
1613 1614
}

1615 1616 1617
/*
 * rmap handler calling mmu_spte_age() on every spte in @rmap_head.
 * Returns the OR of all mmu_spte_age() results.
 */
static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			  struct kvm_memory_slot *slot, gfn_t gfn, int level,
			  pte_t unused)
{
	struct rmap_iterator iter;
	int young = 0;
	u64 *sptep;

	for_each_rmap_spte(rmap_head, &iter, sptep) {
		young |= mmu_spte_age(sptep);
	}

	return young;
}

1629 1630 1631
/*
 * rmap handler returning true as soon as one spte in @rmap_head is found
 * to be accessed, false if none is.
 */
static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			       struct kvm_memory_slot *slot, gfn_t gfn,
			       int level, pte_t unused)
{
	struct rmap_iterator iter;
	u64 *sptep;

	for_each_rmap_spte(rmap_head, &iter, sptep) {
		if (is_accessed_spte(*sptep))
			return true;
	}

	return false;
}

1642 1643
/* NOTE(review): presumably the rmap length that triggers rmap_recycle(); the user of this constant is outside this section - confirm at the call site. */
#define RMAP_RECYCLE_THRESHOLD 1000

1644
/*
 * Zap every spte in the rmap that @spte's gfn/level maps to, then flush the
 * remote TLBs for the range covered by the shadow page's level.
 */
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_mmu_page *sp = sptep_to_sp(spte);
	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	struct kvm_rmap_head *rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);

	/* kvm_unmap_rmapp() does not use its slot argument, so pass NULL. */
	kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
	kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
			KVM_PAGES_PER_HPAGE(sp->role.level));
}

1659
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1660
{
1661
	bool young = false;
1662

1663 1664
	if (kvm_memslots_have_rmaps(kvm))
		young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
1665

1666
	if (is_tdp_mmu_enabled(kvm))
1667
		young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
1668 1669

	return young;
1670 1671
}

1672
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
A
Andrea Arcangeli 已提交
1673
{
1674
	bool young = false;
1675

1676 1677
	if (kvm_memslots_have_rmaps(kvm))
		young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
1678

1679
	if (is_tdp_mmu_enabled(kvm))
1680
		young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
1681 1682

	return young;
A
Andrea Arcangeli 已提交
1683 1684
}

1685
#ifdef MMU_DEBUG
/*
 * Return 1 if none of the entries in the page at @spt is a present shadow
 * pte.  Otherwise log the first present entry and return 0.
 */
static int is_empty_shadow_page(u64 *spt)
{
	u64 *end = spt + PAGE_SIZE / sizeof(u64);
	u64 *pos;

	for (pos = spt; pos != end; pos++) {
		if (!is_shadow_present_pte(*pos))
			continue;

		printk(KERN_ERR "%s: %p %llx\n", __func__,
		       pos, *pos);
		return 0;
	}
	return 1;
}
#endif
A
Avi Kivity 已提交
1700

1701 1702 1703 1704 1705 1706
/*
 * This value is the sum of all of the kvm instances's
 * kvm->arch.n_used_mmu_pages values.  We need a global,
 * aggregate version in order to make the slab shrinker
 * faster
 */
1707
static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
1708 1709 1710 1711 1712
{
	kvm->arch.n_used_mmu_pages += nr;
	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}

1713
static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
1714
{
1715
	MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
1716
	hlist_del(&sp->hash_link);
1717 1718
	list_del(&sp->link);
	free_page((unsigned long)sp->spt);
1719 1720
	if (!sp->role.direct)
		free_page((unsigned long)sp->gfns);
1721
	kmem_cache_free(mmu_page_header_cache, sp);
1722 1723
}

1724 1725
/* Hash @gfn into a KVM_MMU_HASH_SHIFT-bit index for the shadow page hash. */
static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
	return hash_64(gfn, KVM_MMU_HASH_SHIFT);
}

1729
/* Link @parent_pte into @sp's parent_ptes list; a NULL @parent_pte is a no-op. */
static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp, u64 *parent_pte)
{
	if (parent_pte)
		pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
}

1738
static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1739 1740
				       u64 *parent_pte)
{
1741
	__pte_list_remove(parent_pte, &sp->parent_ptes);
1742 1743
}

1744 1745 1746 1747
/*
 * Unlink @parent_pte from @sp's parent_ptes list and then clear the pte
 * itself via mmu_spte_clear_no_track().
 */
static void drop_parent_pte(struct kvm_mmu_page *sp,
			    u64 *parent_pte)
{
	mmu_page_remove_parent_pte(sp, parent_pte);
	mmu_spte_clear_no_track(parent_pte);
}

1751
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
M
Marcelo Tosatti 已提交
1752
{
1753
	struct kvm_mmu_page *sp;
1754

1755 1756
	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
1757
	if (!direct)
1758
		sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
1759
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1760 1761 1762 1763 1764 1765

	/*
	 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
	 * depends on valid pages being added to the head of the list.  See
	 * comments in kvm_zap_obsolete_pages().
	 */
1766
	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
1767 1768 1769
	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
	return sp;
M
Marcelo Tosatti 已提交
1770 1771
}

1772
static void mark_unsync(u64 *spte);
1773
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1774
{
1775 1776 1777 1778 1779 1780
	u64 *sptep;
	struct rmap_iterator iter;

	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
		mark_unsync(sptep);
	}
1781 1782
}

1783
/*
 * Set the unsync-child bit for @spte in its shadow page.  Propagation to the
 * page's own parents happens only when the bit was newly set and the page
 * had no unsync children before.
 */
static void mark_unsync(u64 *spte)
{
	struct kvm_mmu_page *sp = sptep_to_sp(spte);
	unsigned int index = spte - sp->spt;

	/* Already marked: nothing more to do. */
	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
		return;

	/* The page already had unsync children before this one. */
	if (sp->unsync_children++)
		return;

	kvm_mmu_mark_parents_unsync(sp);
}

1797
/* sync_page implementation that does nothing and always returns 0. */
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
			       struct kvm_mmu_page *sp)
{
	return 0;
}

1803 1804 1805 1806 1807 1808 1809 1810 1811 1812
/* Capacity of the page vector filled by mmu_pages_add(). */
#define KVM_PAGE_ARRAY_NR 16

/*
 * Fixed-size vector of shadow pages collected by the unsync walk, each with
 * the index passed to mmu_pages_add() for it.
 */
struct kvm_mmu_pages {
	struct mmu_page_and_offset {
		struct kvm_mmu_page *sp;
		unsigned int idx;
	} page[KVM_PAGE_ARRAY_NR];
	unsigned int nr;	/* number of entries of page[] in use */
};

1813 1814
static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
			 int idx)
1815
{
1816
	int i;
1817

1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828
	if (sp->unsync)
		for (i=0; i < pvec->nr; i++)
			if (pvec->page[i].sp == sp)
				return 0;

	pvec->page[pvec->nr].sp = sp;
	pvec->page[pvec->nr].idx = idx;
	pvec->nr++;
	return (pvec->nr == KVM_PAGE_ARRAY_NR);
}

1829 1830 1831 1832 1833 1834 1835
/*
 * Drop one unsync child from @sp: decrement the counter (warning if it goes
 * negative) and clear bit @idx in the unsync child bitmap.
 */
static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
{
	--sp->unsync_children;
	WARN_ON((int)sp->unsync_children < 0);
	__clear_bit(idx, sp->unsync_child_bitmap);
}

1836 1837 1838 1839
static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
			   struct kvm_mmu_pages *pvec)
{
	int i, ret, nr_unsync_leaf = 0;
1840

1841
	for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
1842
		struct kvm_mmu_page *child;
1843 1844
		u64 ent = sp->spt[i];

1845 1846 1847 1848
		if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
			clear_unsync_child_bit(sp, i);
			continue;
		}
1849

1850
		child = to_shadow_page(ent & PT64_BASE_ADDR_MASK);
1851 1852 1853 1854 1855 1856

		if (child->unsync_children) {
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;

			ret = __mmu_unsync_walk(child, pvec);
1857 1858 1859 1860
			if (!ret) {
				clear_unsync_child_bit(sp, i);
				continue;
			} else if (ret > 0) {
1861
				nr_unsync_leaf += ret;
1862
			} else
1863 1864 1865 1866 1867 1868
				return ret;
		} else if (child->unsync) {
			nr_unsync_leaf++;
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;
		} else
1869
			clear_unsync_child_bit(sp, i);
1870 1871
	}

1872 1873 1874
	return nr_unsync_leaf;
}

1875 1876
/* Index recorded by mmu_unsync_walk() for the page the walk starts from. */
#define INVALID_INDEX (-1)

1877 1878 1879
static int mmu_unsync_walk(struct kvm_mmu_page *sp,
			   struct kvm_mmu_pages *pvec)
{
P
Paolo Bonzini 已提交
1880
	pvec->nr = 0;
1881 1882 1883
	if (!sp->unsync_children)
		return 0;

1884
	mmu_pages_add(pvec, sp, INVALID_INDEX);
1885
	return __mmu_unsync_walk(sp, pvec);
1886 1887 1888 1889 1890
}

/*
 * Clear @sp's unsync flag and decrement the VM's mmu_unsync statistic.
 * Warns if @sp was not unsync.
 */
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	WARN_ON(!sp->unsync);
	trace_kvm_mmu_sync_page(sp);

	sp->unsync = 0;
	--kvm->stat.mmu_unsync;
}

1896 1897
/* Forward declarations; both functions are defined elsewhere in this file. */
static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				     struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				    struct list_head *invalid_list);
1900

1901 1902
/*
 * Iterate over the shadow pages on hash list @_list, skipping obsolete ones.
 * The empty-if/else form makes the statement following the macro the body
 * that runs for non-obsolete pages only.
 */
#define for_each_valid_sp(_kvm, _sp, _list)				\
	hlist_for_each_entry(_sp, _list, hash_link)			\
		if (is_obsolete_sp((_kvm), (_sp))) {			\
		} else
1905 1906

/*
 * Iterate over the non-obsolete, indirect (role.direct clear) shadow pages
 * in the hash bucket of @_gfn whose gfn equals @_gfn.
 */
#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)			\
	for_each_valid_sp(_kvm, _sp,					\
	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])	\
		if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
1910

1911 1912
static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			 struct list_head *invalid_list)
1913
{
1914
	if (vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
1915
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1916
		return false;
1917 1918
	}

1919
	return true;
1920 1921
}

1922 1923 1924 1925
/*
 * Commit the zap of the pages on @invalid_list if there are any; otherwise
 * flush the remote TLBs when @remote_flush is set.
 *
 * Returns true if either action was taken, false if there was nothing to do.
 */
static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
					struct list_head *invalid_list,
					bool remote_flush)
{
	if (!list_empty(invalid_list)) {
		kvm_mmu_commit_zap_page(kvm, invalid_list);
		return true;
	}

	if (!remote_flush)
		return false;

	kvm_flush_remote_tlbs(kvm);
	return true;
}

1936 1937 1938
/*
 * Try kvm_mmu_remote_flush_or_zap() first; only if it did nothing and
 * @local_flush is set, request a flush of the current TLB context on @vcpu.
 */
static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
				 struct list_head *invalid_list,
				 bool remote_flush, bool local_flush)
{
	bool done = kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list,
						remote_flush);

	if (!done && local_flush)
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}

1947 1948 1949 1950 1951 1952 1953
/*
 * MMU auditing: include the audit implementation when CONFIG_KVM_MMU_AUDIT
 * is set, otherwise provide empty stubs so callers need no #ifdefs.
 */
#ifdef CONFIG_KVM_MMU_AUDIT
#include "mmu_audit.c"
#else
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
static void mmu_audit_disable(void) { }
#endif

1954 1955
static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
1956 1957
	return sp->role.invalid ||
	       unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
1958 1959
}

1960
struct mmu_page_path {
1961 1962
	struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
	unsigned int idx[PT64_ROOT_MAX_LEVEL];
1963 1964
};

1965
/*
 * Iterate over the pages of @pvec selected by mmu_pages_first() and
 * mmu_pages_next(), keeping @parents up to date.  @pvec and @parents are
 * objects (not pointers); @sp receives the current page and @i its index.
 */
#define for_each_sp(pvec, sp, parents, i)			\
		for (i = mmu_pages_first(&pvec, &parents);	\
			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
			i = mmu_pages_next(&pvec, &parents, i))

1970 1971 1972
static int mmu_pages_next(struct kvm_mmu_pages *pvec,
			  struct mmu_page_path *parents,
			  int i)
1973 1974 1975 1976 1977
{
	int n;

	for (n = i+1; n < pvec->nr; n++) {
		struct kvm_mmu_page *sp = pvec->page[n].sp;
P
Paolo Bonzini 已提交
1978 1979
		unsigned idx = pvec->page[n].idx;
		int level = sp->role.level;
1980

P
Paolo Bonzini 已提交
1981
		parents->idx[level-1] = idx;
1982
		if (level == PG_LEVEL_4K)
P
Paolo Bonzini 已提交
1983
			break;
1984

P
Paolo Bonzini 已提交
1985
		parents->parent[level-2] = sp;
1986 1987 1988 1989 1990
	}

	return n;
}

P
Paolo Bonzini 已提交
1991 1992 1993 1994 1995 1996 1997 1998 1999
static int mmu_pages_first(struct kvm_mmu_pages *pvec,
			   struct mmu_page_path *parents)
{
	struct kvm_mmu_page *sp;
	int level;

	if (pvec->nr == 0)
		return 0;

2000 2001
	WARN_ON(pvec->page[0].idx != INVALID_INDEX);

P
Paolo Bonzini 已提交
2002 2003
	sp = pvec->page[0].sp;
	level = sp->role.level;
2004
	WARN_ON(level == PG_LEVEL_4K);
P
Paolo Bonzini 已提交
2005 2006 2007 2008 2009 2010 2011 2012 2013 2014

	parents->parent[level-2] = sp;

	/* Also set up a sentinel.  Further entries in pvec are all
	 * children of sp, so this element is never overwritten.
	 */
	parents->parent[level-1] = NULL;
	return mmu_pages_next(pvec, parents, 0);
}

2015
static void mmu_pages_clear_parents(struct mmu_page_path *parents)
2016
{
2017 2018 2019 2020 2021 2022 2023 2024 2025
	struct kvm_mmu_page *sp;
	unsigned int level = 0;

	do {
		unsigned int idx = parents->idx[level];
		sp = parents->parent[level];
		if (!sp)
			return;

2026
		WARN_ON(idx == INVALID_INDEX);
2027
		clear_unsync_child_bit(sp, idx);
2028
		level++;
P
Paolo Bonzini 已提交
2029
	} while (!sp->unsync_children);
2030
}
2031

2032 2033 2034 2035 2036 2037 2038
static void mmu_sync_children(struct kvm_vcpu *vcpu,
			      struct kvm_mmu_page *parent)
{
	int i;
	struct kvm_mmu_page *sp;
	struct mmu_page_path parents;
	struct kvm_mmu_pages pages;
2039
	LIST_HEAD(invalid_list);
2040
	bool flush = false;
2041 2042

	while (mmu_unsync_walk(parent, &pages)) {
2043
		bool protected = false;
2044 2045

		for_each_sp(pages, sp, parents, i)
2046
			protected |= rmap_write_protect(vcpu, sp->gfn);
2047

2048
		if (protected) {
2049
			kvm_flush_remote_tlbs(vcpu->kvm);
2050 2051
			flush = false;
		}
2052

2053
		for_each_sp(pages, sp, parents, i) {
2054
			kvm_unlink_unsync_page(vcpu->kvm, sp);
2055
			flush |= kvm_sync_page(vcpu, sp, &invalid_list);
2056 2057
			mmu_pages_clear_parents(&parents);
		}
2058
		if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
2059
			kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2060
			cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
2061 2062
			flush = false;
		}
2063
	}
2064 2065

	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2066 2067
}

2068 2069
static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
{
2070
	atomic_set(&sp->write_flooding_count,  0);
2071 2072 2073 2074
}

/*
 * Reset the write-flooding counter of the shadow page that contains the
 * SPTE pointed to by @spte.
 */
static void clear_sp_write_flooding_count(u64 *spte)
{
	struct kvm_mmu_page *sp = sptep_to_sp(spte);

	__clear_sp_write_flooding_count(sp);
}

2078 2079 2080 2081
/*
 * Find or create the shadow page for @gfn with the role derived from the
 * current MMU's base role plus @level, @direct and @access (and, for
 * non-direct MMUs whose root level is at most PT32_ROOT_LEVEL, the quadrant
 * computed from @gaddr).
 *
 * The hash bucket for @gfn is searched first.  A matching page is reused,
 * after being synced if it is unsync on a non-direct MMU.  If nothing
 * matches, or syncing the match fails, a new page is allocated, hashed and,
 * when not direct, accounted as shadowed.
 *
 * Returns the shadow page.  Any pages queued for zapping along the way are
 * committed before returning, and the hash-collision high-water mark is
 * updated.
 */
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
					     gfn_t gfn,
					     gva_t gaddr,
					     unsigned level,
					     int direct,
					     unsigned int access)
{
	bool direct_mmu = vcpu->arch.mmu->direct_map;
	union kvm_mmu_page_role role;
	struct hlist_head *sp_list;
	struct kvm_mmu_page *sp;
	LIST_HEAD(invalid_list);
	int nr_collisions = 0;
	unsigned quad;

	/* Build the role that an existing page must match exactly. */
	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = direct;
	if (role.direct)
		role.gpte_is_8_bytes = true;
	role.access = access;
	if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
		quad = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
		quad &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
		role.quadrant = quad;
	}

	sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
	for_each_valid_sp(vcpu->kvm, sp, sp_list) {
		if (sp->gfn != gfn) {
			nr_collisions++;
			continue;
		}

		if (sp->role.word != role.word) {
			/*
			 * If the guest is creating an upper-level page, zap
			 * unsync pages for the same gfn.  While it's possible
			 * the guest is using recursive page tables, in all
			 * likelihood the guest has stopped using the unsync
			 * page and is installing a completely unrelated page.
			 * Unsync pages must not be left as is, because the new
			 * upper-level page will be write-protected.
			 */
			if (level > PG_LEVEL_4K && sp->unsync)
				kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
							 &invalid_list);
			continue;
		}

		/* Direct MMUs reuse the page as is, no sync bookkeeping. */
		if (!direct_mmu) {
			if (sp->unsync) {
				/*
				 * The page is good, but is stale.
				 * kvm_sync_page() gets the latest guest state
				 * but does not write-protect the page or mark
				 * it synchronized.  This way the validity of
				 * the mapping is ensured, but the overhead of
				 * write protection is not incurred until the
				 * guest invalidates the TLB mapping.  This
				 * allows multiple SPs for a single gfn to be
				 * unsync.
				 *
				 * If the sync fails, the page is zapped.  If
				 * so, break in order to rebuild it.
				 */
				if (!kvm_sync_page(vcpu, sp, &invalid_list))
					break;

				WARN_ON(!list_empty(&invalid_list));
				kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT,
						 vcpu);
			}

			if (sp->unsync_children)
				kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);

			__clear_sp_write_flooding_count(sp);
		}

		trace_kvm_mmu_get_page(sp, false);
		goto out;
	}

	/* Cache miss: allocate and publish a fresh shadow page. */
	++vcpu->kvm->stat.mmu_cache_miss;

	sp = kvm_mmu_alloc_page(vcpu, direct);

	sp->gfn = gfn;
	sp->role = role;
	hlist_add_head(&sp->hash_link, sp_list);
	if (!direct) {
		account_shadowed(vcpu->kvm, sp);
		if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn))
			kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
	}
	trace_kvm_mmu_get_page(sp, true);
out:
	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);

	if (nr_collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
		vcpu->kvm->stat.max_mmu_page_hash_collisions = nr_collisions;
	return sp;
}

2182 2183 2184
/*
 * Initialize @iterator for a shadow page table walk of @addr that starts
 * from the root at physical address @root.
 *
 * The starting level is the MMU's shadow root level, lowered by one when a
 * 4-level shadow root backs a guest with fewer than 4 levels on a
 * non-direct MMU.  For a PAE root the walk starts from the pae_root entry
 * selected by bits 31:30 of @addr; if that entry is empty the level is set
 * to 0, which makes shadow_walk_okay() fail immediately.
 */
static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
					struct kvm_vcpu *vcpu, hpa_t root,
					u64 addr)
{
	iterator->addr = addr;
	iterator->shadow_addr = root;
	iterator->level = vcpu->arch.mmu->shadow_root_level;

	if (iterator->level == PT64_ROOT_4LEVEL &&
	    vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
	    !vcpu->arch.mmu->direct_map)
		iterator->level--;

	if (iterator->level != PT32E_ROOT_LEVEL)
		return;

	/*
	 * prev_root is currently only used for 64-bit hosts. So only
	 * the active root_hpa is valid here.
	 */
	BUG_ON(root != vcpu->arch.mmu->root_hpa);

	iterator->shadow_addr = vcpu->arch.mmu->pae_root[(addr >> 30) & 3] &
				PT64_BASE_ADDR_MASK;
	if (iterator->shadow_addr)
		iterator->level--;
	else
		iterator->level = 0;
}

2211 2212 2213
/*
 * Initialize @iterator for a walk of @addr starting from the vCPU's
 * currently active root (mmu->root_hpa).
 */
static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
			     struct kvm_vcpu *vcpu, u64 addr)
{
	shadow_walk_init_using_root(iterator, vcpu,
				    vcpu->arch.mmu->root_hpa, addr);
}

2218 2219
static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
{
2220
	if (iterator->level < PG_LEVEL_4K)
2221
		return false;
2222

2223 2224 2225 2226 2227
	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
	iterator->sptep	= ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
	return true;
}

2228 2229
/*
 * Advance @iterator one level down using the already-read entry @spte.
 * A last-level entry terminates the walk by setting the level to 0;
 * otherwise the walk descends into the table the entry points at.
 */
static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
			       u64 spte)
{
	if (!is_last_spte(spte, iterator->level)) {
		iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
		iterator->level--;
	} else {
		iterator->level = 0;
	}
}

2240 2241
static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
{
2242
	__shadow_walk_next(iterator, *iterator->sptep);
2243 2244
}

2245 2246 2247 2248 2249 2250 2251 2252 2253
/*
 * Install a non-leaf SPTE at @sptep that points at the page table of @sp,
 * record @sptep as a parent of @sp, and propagate unsync state upwards if
 * @sp is unsync or has unsync children.
 */
static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
			     struct kvm_mmu_page *sp)
{
	u64 nonleaf;

	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);

	nonleaf = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
	mmu_spte_set(sptep, nonleaf);

	mmu_page_add_parent_pte(vcpu, sp, sptep);

	if (sp->unsync || sp->unsync_children)
		mark_unsync(sptep);
}

2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274
/*
 * If @sptep is a present, non-large entry whose child shadow page was
 * created with access bits different from @direct_access, unlink that child
 * from @sptep and flush the remote TLBs for the child's gfn.  Entries that
 * are not present, are large, or already match are left untouched.
 */
static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
				   unsigned direct_access)
{
	struct kvm_mmu_page *child;

	if (!is_shadow_present_pte(*sptep) || is_large_pte(*sptep))
		return;

	/*
	 * For the direct sp, if the guest pte's dirty bit
	 * changed from clean to dirty, it will corrupt the
	 * sp's access: allow writable in the read-only sp,
	 * so we should update the spte at this point to get
	 * a new sp with the correct access.
	 */
	child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK);
	if (child->role.access == direct_access)
		return;

	drop_parent_pte(child, sptep);
	kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
}

2284 2285 2286
/*
 * Zap the single entry @spte of shadow page @sp.
 *
 * - MMIO entries are cleared without tracking.
 * - Present last-level entries are dropped (adjusting the large page
 *   statistic when the entry was large).
 * - Present non-leaf entries are unlinked from their child page.
 *
 * Returns the number of zapped non-leaf child shadow pages.
 * NOTE(review): the only non-zero return comes from
 * kvm_mmu_prepare_zap_page(), which returns bool, so the value is 0 or 1.
 */
static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
			    u64 *spte, struct list_head *invalid_list)
{
	struct kvm_mmu_page *child;
	u64 pte = *spte;

	if (!is_shadow_present_pte(pte)) {
		if (is_mmio_spte(pte))
			mmu_spte_clear_no_track(spte);
		return 0;
	}

	if (is_last_spte(pte, sp->role.level)) {
		drop_spte(kvm, spte);
		if (is_large_pte(pte))
			--kvm->stat.lpages;
		return 0;
	}

	child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
	drop_parent_pte(child, spte);

	/*
	 * Recursively zap nested TDP SPs, parentless SPs are
	 * unlikely to be used again in the near future.  This
	 * avoids retaining a large number of stale nested SPs.
	 */
	if (!tdp_enabled || !invalid_list ||
	    !child->role.guest_mode || child->parent_ptes.val)
		return 0;

	return kvm_mmu_prepare_zap_page(kvm, child, invalid_list);
}

2317 2318 2319
static int kvm_mmu_page_unlink_children(struct kvm *kvm,
					struct kvm_mmu_page *sp,
					struct list_head *invalid_list)
2320
{
2321
	int zapped = 0;
2322 2323
	unsigned i;

2324
	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2325 2326 2327
		zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);

	return zapped;
2328 2329
}

2330
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
2331
{
2332 2333
	u64 *sptep;
	struct rmap_iterator iter;
2334

2335
	while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2336
		drop_parent_pte(sp, sptep);
2337 2338
}

2339
static int mmu_zap_unsync_children(struct kvm *kvm,
2340 2341
				   struct kvm_mmu_page *parent,
				   struct list_head *invalid_list)
2342
{
2343 2344 2345
	int i, zapped = 0;
	struct mmu_page_path parents;
	struct kvm_mmu_pages pages;
2346

2347
	if (parent->role.level == PG_LEVEL_4K)
2348
		return 0;
2349 2350 2351 2352 2353

	while (mmu_unsync_walk(parent, &pages)) {
		struct kvm_mmu_page *sp;

		for_each_sp(pages, sp, parents, i) {
2354
			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2355
			mmu_pages_clear_parents(&parents);
2356
			zapped++;
2357 2358 2359 2360
		}
	}

	return zapped;
2361 2362
}

2363 2364 2365 2366
static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
				       struct kvm_mmu_page *sp,
				       struct list_head *invalid_list,
				       int *nr_zapped)
2367
{
2368
	bool list_unstable;
A
Avi Kivity 已提交
2369

2370
	trace_kvm_mmu_prepare_zap_page(sp);
2371
	++kvm->stat.mmu_shadow_zapped;
2372
	*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2373
	*nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
2374
	kvm_mmu_unlink_parents(kvm, sp);
2375

2376 2377 2378
	/* Zapping children means active_mmu_pages has become unstable. */
	list_unstable = *nr_zapped;

2379
	if (!sp->role.invalid && !sp->role.direct)
2380
		unaccount_shadowed(kvm, sp);
2381

2382 2383
	if (sp->unsync)
		kvm_unlink_unsync_page(kvm, sp);
2384
	if (!sp->root_count) {
2385
		/* Count self */
2386
		(*nr_zapped)++;
2387 2388 2389 2390 2391 2392 2393 2394 2395 2396

		/*
		 * Already invalid pages (previously active roots) are not on
		 * the active page list.  See list_del() in the "else" case of
		 * !sp->root_count.
		 */
		if (sp->role.invalid)
			list_add(&sp->link, invalid_list);
		else
			list_move(&sp->link, invalid_list);
2397
		kvm_mod_used_mmu_pages(kvm, -1);
2398
	} else {
2399 2400 2401 2402 2403
		/*
		 * Remove the active root from the active page list, the root
		 * will be explicitly freed when the root_count hits zero.
		 */
		list_del(&sp->link);
2404

2405 2406 2407 2408 2409 2410
		/*
		 * Obsolete pages cannot be used on any vCPUs, see the comment
		 * in kvm_mmu_zap_all_fast().  Note, is_obsolete_sp() also
		 * treats invalid shadow pages as being obsolete.
		 */
		if (!is_obsolete_sp(kvm, sp))
2411
			kvm_reload_remote_mmus(kvm);
2412
	}
2413

P
Paolo Bonzini 已提交
2414 2415 2416
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

2417
	sp->role.invalid = 1;
2418 2419 2420 2421 2422 2423 2424 2425 2426 2427
	return list_unstable;
}

/*
 * Convenience wrapper around __kvm_mmu_prepare_zap_page() for callers that
 * do not need the exact count.  Returns true if at least one page
 * (@sp itself or a child) was zapped.
 */
static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				     struct list_head *invalid_list)
{
	int count;

	__kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &count);
	return count != 0;
}

2430 2431 2432
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				    struct list_head *invalid_list)
{
2433
	struct kvm_mmu_page *sp, *nsp;
2434 2435 2436 2437

	if (list_empty(invalid_list))
		return;

2438
	/*
2439 2440 2441 2442 2443 2444 2445
	 * We need to make sure everyone sees our modifications to
	 * the page tables and see changes to vcpu->mode here. The barrier
	 * in the kvm_flush_remote_tlbs() achieves this. This pairs
	 * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
	 *
	 * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
	 * guest mode and/or lockless shadow page table walks.
2446 2447
	 */
	kvm_flush_remote_tlbs(kvm);
2448

2449
	list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2450
		WARN_ON(!sp->role.invalid || sp->root_count);
2451
		kvm_mmu_free_page(sp);
2452
	}
2453 2454
}

2455 2456
static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
						  unsigned long nr_to_zap)
2457
{
2458 2459
	unsigned long total_zapped = 0;
	struct kvm_mmu_page *sp, *tmp;
2460
	LIST_HEAD(invalid_list);
2461 2462
	bool unstable;
	int nr_zapped;
2463 2464

	if (list_empty(&kvm->arch.active_mmu_pages))
2465 2466
		return 0;

2467
restart:
2468
	list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479
		/*
		 * Don't zap active root pages, the page itself can't be freed
		 * and zapping it will just force vCPUs to realloc and reload.
		 */
		if (sp->root_count)
			continue;

		unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
						      &nr_zapped);
		total_zapped += nr_zapped;
		if (total_zapped >= nr_to_zap)
2480 2481
			break;

2482 2483
		if (unstable)
			goto restart;
2484
	}
2485

2486 2487 2488 2489 2490 2491
	kvm_mmu_commit_zap_page(kvm, &invalid_list);

	kvm->stat.mmu_recycled += total_zapped;
	return total_zapped;
}

2492 2493 2494 2495 2496 2497 2498
/*
 * Number of shadow pages that can still be allocated before hitting
 * n_max_mmu_pages; 0 if the limit is already reached or exceeded.
 */
static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
{
	if (kvm->arch.n_max_mmu_pages <= kvm->arch.n_used_mmu_pages)
		return 0;

	return kvm->arch.n_max_mmu_pages - kvm->arch.n_used_mmu_pages;
}

2501 2502
static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
{
2503
	unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
2504

2505
	if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
2506 2507
		return 0;

2508
	kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
2509

2510 2511 2512 2513 2514
	/*
	 * Note, this check is intentionally soft, it only guarantees that one
	 * page is available, while the caller may end up allocating as many as
	 * four pages, e.g. for PAE roots or for 5-level paging.  Temporarily
	 * exceeding the (arbitrary by default) limit will not harm the host,
I
Ingo Molnar 已提交
2515
	 * being too aggressive may unnecessarily kill the guest, and getting an
2516 2517 2518
	 * exact count is far more trouble than it's worth, especially in the
	 * page fault paths.
	 */
2519 2520 2521 2522 2523
	if (!kvm_mmu_available_pages(vcpu->kvm))
		return -ENOSPC;
	return 0;
}

2524 2525
/*
 * Changing the number of mmu pages allocated to the vm
2526
 * Note: if goal_nr_mmu_pages is too small, you will get dead lock
2527
 */
2528
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2529
{
2530
	write_lock(&kvm->mmu_lock);
2531

2532
	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2533 2534
		kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
						  goal_nr_mmu_pages);
2535

2536
		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2537 2538
	}

2539
	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2540

2541
	write_unlock(&kvm->mmu_lock);
2542 2543
}

2544
/*
 * Zap every valid indirect shadow page for @gfn, under mmu_lock held for
 * write.
 *
 * Returns 1 if at least one such shadow page existed, 0 otherwise.
 */
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	LIST_HEAD(invalid_list);
	int found = 0;

	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);

	write_lock(&kvm->mmu_lock);
	for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
			 sp->role.word);
		found = 1;
		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
	}
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	write_unlock(&kvm->mmu_lock);

	return found;
}
2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578

/*
 * Translate guest virtual address @gva with a read access and unprotect the
 * guest page it maps to.  Returns 0 without doing anything on a direct-map
 * MMU; otherwise returns the result of kvm_mmu_unprotect_page().
 *
 * NOTE(review): the translation result is not checked for failure before
 * being shifted into a gfn - confirm that this is intended.
 */
static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
	gpa_t gpa;

	if (vcpu->arch.mmu->direct_map)
		return 0;

	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);

	return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
}
2579

2580
static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
2581 2582 2583 2584 2585 2586 2587 2588
{
	trace_kvm_mmu_unsync_page(sp);
	++vcpu->kvm->stat.mmu_unsync;
	sp->unsync = 1;

	kvm_mmu_mark_parents_unsync(sp);
}

2589 2590 2591 2592 2593 2594 2595
/*
 * Attempt to unsync any shadow pages that can be reached by the specified gfn,
 * KVM is creating a writable mapping for said gfn.  Returns 0 if all pages
 * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
 * be write-protected.
 */
int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
{
	struct kvm_mmu_page *sp;
	bool lock_taken = false;

	/*
	 * Force write-protection if the page is being tracked.  Note, the page
	 * track machinery is used to write-protect upper-level shadow pages,
	 * i.e. this guards the role.level == 4K assertion below!
	 */
	if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
		return -EPERM;

	/*
	 * The page is not write-tracked, mark existing shadow pages unsync
	 * unless KVM is synchronizing an unsync SP (can_unsync = false).  In
	 * that case, KVM must complete emulation of the guest TLB flush before
	 * allowing shadow pages to become unsync (writable by the guest).
	 */
	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
		if (!can_unsync)
			return -EPERM;

		if (sp->unsync)
			continue;

		/*
		 * TDP MMU page faults require an additional spinlock as they
		 * run with mmu_lock held for read, not write, and the unsync
		 * logic is not thread safe.  Take the spinlock regardless of
		 * the MMU type to avoid extra conditionals/parameters, there's
		 * no meaningful penalty if mmu_lock is held for write.
		 */
		if (!lock_taken) {
			spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock);
			lock_taken = true;

			/*
			 * Recheck after taking the spinlock, a different vCPU
			 * may have since marked the page unsync.  Seeing a
			 * stale "unsync" on the unprotected check above is not
			 * possible as clearing sp->unsync _must_ hold mmu_lock
			 * for write, i.e. unsync cannot transition from 1->0
			 * while this CPU holds mmu_lock for read (or write).
			 */
			if (READ_ONCE(sp->unsync))
				continue;
		}

		WARN_ON(sp->role.level != PG_LEVEL_4K);
		kvm_unsync_page(vcpu, sp);
	}

	if (lock_taken)
		spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock);

	/*
	 * We need to ensure that the marking of unsync pages is visible
	 * before the SPTE is updated to allow writes because
	 * kvm_mmu_sync_roots() checks the unsync flags without holding
	 * the MMU lock and so can race with this. If the SPTE was updated
	 * before the page had been marked as unsync-ed, something like the
	 * following could happen:
	 *
	 * CPU 1                    CPU 2
	 * ---------------------------------------------------------------------
	 * 1.2 Host updates SPTE
	 *     to be writable
	 *                      2.1 Guest writes a GPTE for GVA X.
	 *                          (GPTE being in the guest page table shadowed
	 *                           by the SP from CPU 1.)
	 *                          This reads SPTE during the page table walk.
	 *                          Since SPTE.W is read as 1, there is no
	 *                          fault.
	 *
	 *                      2.2 Guest issues TLB flush.
	 *                          That causes a VM Exit.
	 *
	 *                      2.3 Walking of unsync pages sees sp->unsync is
	 *                          false and skips the page.
	 *
	 *                      2.4 Guest accesses GVA X.
	 *                          Since the mapping in the SP was not updated,
	 *                          the old mapping for GVA X incorrectly
	 *                          gets used.
	 * 1.1 Host marks SP
	 *     as unsync
	 *     (sp->unsync = true)
	 *
	 * The write barrier below ensures that 1.1 happens before 1.2 and thus
	 * the situation in 2.4 does not arise. The implicit barrier in 2.2
	 * pairs with this write barrier.
	 */
	smp_wmb();

	return 0;
}

2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708
/*
 * Compute the new SPTE value for @sptep via make_spte() and install it.
 *
 * The gfn is marked dirty when the new value is writable.  The return value
 * is make_spte()'s result, OR'ed with SET_SPTE_SPURIOUS when the new value
 * equals the current one (nothing is written), or with
 * SET_SPTE_NEED_REMOTE_TLB_FLUSH when mmu_spte_update() reports that a
 * flush is required.
 */
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
		    unsigned int pte_access, int level,
		    gfn_t gfn, kvm_pfn_t pfn, bool speculative,
		    bool can_unsync, bool host_writable)
{
	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
	u64 new_spte;
	int flags;

	flags = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep,
			  speculative, can_unsync, host_writable,
			  sp_ad_disabled(sp), &new_spte);

	if (new_spte & PT_WRITABLE_MASK)
		kvm_vcpu_mark_page_dirty(vcpu, gfn);

	if (*sptep == new_spte)
		flags |= SET_SPTE_SPURIOUS;
	else if (mmu_spte_update(sptep, new_spte))
		flags |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;

	return flags;
}

2716
static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2717
			unsigned int pte_access, bool write_fault, int level,
2718 2719
			gfn_t gfn, kvm_pfn_t pfn, bool speculative,
			bool host_writable)
M
Marcelo Tosatti 已提交
2720 2721
{
	int was_rmapped = 0;
2722
	int rmap_count;
2723
	int set_spte_ret;
2724
	int ret = RET_PF_FIXED;
2725
	bool flush = false;
M
Marcelo Tosatti 已提交
2726

2727 2728
	pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
		 *sptep, write_fault, gfn);
M
Marcelo Tosatti 已提交
2729

2730 2731 2732 2733 2734
	if (unlikely(is_noslot_pfn(pfn))) {
		mark_mmio_spte(vcpu, sptep, gfn, pte_access);
		return RET_PF_EMULATE;
	}

2735
	if (is_shadow_present_pte(*sptep)) {
M
Marcelo Tosatti 已提交
2736 2737 2738 2739
		/*
		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
		 * the parent of the now unreachable PTE.
		 */
2740
		if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
M
Marcelo Tosatti 已提交
2741
			struct kvm_mmu_page *child;
A
Avi Kivity 已提交
2742
			u64 pte = *sptep;
M
Marcelo Tosatti 已提交
2743

2744
			child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
2745
			drop_parent_pte(child, sptep);
2746
			flush = true;
A
Avi Kivity 已提交
2747
		} else if (pfn != spte_to_pfn(*sptep)) {
2748
			pgprintk("hfn old %llx new %llx\n",
A
Avi Kivity 已提交
2749
				 spte_to_pfn(*sptep), pfn);
2750
			drop_spte(vcpu->kvm, sptep);
2751
			flush = true;
2752 2753
		} else
			was_rmapped = 1;
M
Marcelo Tosatti 已提交
2754
	}
2755

2756 2757 2758
	set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
				speculative, true, host_writable);
	if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
M
Marcelo Tosatti 已提交
2759
		if (write_fault)
2760
			ret = RET_PF_EMULATE;
2761
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
2762
	}
2763

2764
	if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
2765 2766
		kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
				KVM_PAGES_PER_HPAGE(level));
M
Marcelo Tosatti 已提交
2767

2768 2769 2770 2771 2772 2773 2774 2775 2776
	/*
	 * The fault is fully spurious if and only if the new SPTE and old SPTE
	 * are identical, and emulation is not required.
	 */
	if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) {
		WARN_ON_ONCE(!was_rmapped);
		return RET_PF_SPURIOUS;
	}

A
Avi Kivity 已提交
2777
	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2778
	trace_kvm_mmu_set_spte(level, gfn, sptep);
A
Avi Kivity 已提交
2779
	if (!was_rmapped && is_large_pte(*sptep))
M
Marcelo Tosatti 已提交
2780 2781
		++vcpu->kvm->stat.lpages;

2782 2783 2784
	if (is_shadow_present_pte(*sptep)) {
		if (!was_rmapped) {
			rmap_count = rmap_add(vcpu, sptep, gfn);
2785 2786
			if (rmap_count > vcpu->kvm->stat.max_mmu_rmap_size)
				vcpu->kvm->stat.max_mmu_rmap_size = rmap_count;
2787 2788 2789
			if (rmap_count > RMAP_RECYCLE_THRESHOLD)
				rmap_recycle(vcpu, sptep, gfn);
		}
2790
	}
2791

2792
	return ret;
2793 2794
}

D
Dan Williams 已提交
2795
static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2796 2797 2798 2799
				     bool no_dirty_log)
{
	struct kvm_memory_slot *slot;

2800
	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
2801
	if (!slot)
2802
		return KVM_PFN_ERR_FAULT;
2803

2804
	return gfn_to_pfn_memslot_atomic(slot, gfn);
2805 2806 2807 2808 2809 2810 2811
}

/*
 * Prefetch mappings for the SPTEs in [@start, @end) of shadow page @sp.
 *
 * The backing pages are looked up atomically in one batch; an SPTE is then
 * installed for each page that was obtained, using @sp's access bits, and
 * the page reference is dropped again.
 *
 * Returns 0 if at least one page was obtained, -1 if there is no usable
 * memslot or no page could be obtained.
 */
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp,
				    u64 *start, u64 *end)
{
	struct page *pages[PTE_PREFETCH_NUM];
	unsigned int access = sp->role.access;
	struct kvm_memory_slot *slot;
	gfn_t gfn;
	int nr_pages;
	int i;

	gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
	if (!slot)
		return -1;

	nr_pages = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
	if (nr_pages <= 0)
		return -1;

	for (i = 0; i < nr_pages; i++) {
		mmu_set_spte(vcpu, start + i, access, false, sp->role.level,
			     gfn + i, page_to_pfn(pages[i]), true, true);
		put_page(pages[i]);
	}

	return 0;
}

/*
 * Prefetch the not-present SPTEs surrounding @sptep in direct shadow page
 * @sp.
 *
 * The PTE_PREFETCH_NUM-aligned window of entries containing @sptep is
 * scanned.  Each maximal run of not-present entries (the faulting entry
 * @sptep itself always ends a run) is handed to direct_pte_prefetch_many().
 * Prefetching stops at the first failure.
 *
 * A run that extends to the end of the window has no terminating entry
 * inside the loop, so it is prefetched by a final call after the loop.
 * Without that call the entries following the faulting one would never be
 * prefetched when all of them are not-present.
 */
static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *sp, u64 *sptep)
{
	u64 *spte, *start = NULL;
	int i;

	WARN_ON(!sp->role.direct);

	/* Round the faulting index down to the start of its window. */
	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
	spte = sp->spt + i;

	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
		if (is_shadow_present_pte(*spte) || spte == sptep) {
			if (!start)
				continue;
			/*
			 * Return rather than break so that the failed run is
			 * not retried by the trailing call below.
			 */
			if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
				return;
			start = NULL;
		} else if (!start)
			start = spte;
	}

	/* Prefetch the trailing run, if any; spte is one past the window. */
	if (start)
		direct_pte_prefetch_many(vcpu, sp, start, spte);
}

/*
 * Opportunistically prefetch the SPTEs around @sptep.  Prefetching is
 * skipped when accessed/dirty bits are disabled for the page, when the page
 * is above the 4K level, or while an MMU notifier invalidation is in
 * progress.
 */
static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
{
	struct kvm_mmu_page *sp = sptep_to_sp(sptep);

	/*
	 * Without accessed bits, there's no way to distinguish between
	 * actually accessed translations and prefetched, so disable pte
	 * prefetch if accessed bits aren't available.  Only 4K-level pages
	 * are prefetched.
	 */
	if (sp_ad_disabled(sp) || sp->role.level > PG_LEVEL_4K)
		return;

	/*
	 * If addresses are being invalidated, skip prefetching to avoid
	 * accidentally prefetching those addresses.
	 */
	if (unlikely(vcpu->kvm->mmu_notifier_count))
		return;

	__direct_pte_prefetch(vcpu, sp, sptep);
}

2885
/*
 * Determine the page-table level at which the host maps @gfn of @slot.
 *
 * Returns PG_LEVEL_4K if @pfn is neither part of a compound page nor a zone
 * device pfn, or if no host mapping is found for the address; otherwise the
 * level reported by lookup_address_in_mm().
 */
static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
				  const struct kvm_memory_slot *slot)
{
	unsigned long hva;
	int level;
	pte_t *pte;

	if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
		return PG_LEVEL_4K;

	/*
	 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
	 * is not solely for performance, it's also necessary to avoid the
	 * "writable" check in __gfn_to_hva_many(), which will always fail on
	 * read-only memslots due to gfn_to_hva() assuming writes.  Earlier
	 * page fault steps have already verified the guest isn't writing a
	 * read-only memslot.
	 */
	hva = __gfn_to_hva_memslot(slot, gfn);
	pte = lookup_address_in_mm(kvm->mm, hva, &level);

	if (unlikely(!pte))
		level = PG_LEVEL_4K;

	return level;
}

2912 2913 2914
/*
 * Compute the mapping level to use for @gfn in @slot.
 *
 * @max_level is first capped at max_huge_page_level and then lowered until
 * a level is found at which large pages are not disallowed for @gfn.  If
 * that ends at PG_LEVEL_4K, 4K is returned; otherwise the result is the
 * level at which the host maps the page.
 */
int kvm_mmu_max_mapping_level(struct kvm *kvm,
			      const struct kvm_memory_slot *slot, gfn_t gfn,
			      kvm_pfn_t pfn, int max_level)
{
	struct kvm_lpage_info *linfo;

	max_level = min(max_level, max_huge_page_level);

	while (max_level > PG_LEVEL_4K) {
		linfo = lpage_info_slot(gfn, slot, max_level);
		if (!linfo->disallow_lpage)
			break;
		max_level--;
	}

	if (max_level == PG_LEVEL_4K)
		return PG_LEVEL_4K;

	return host_pfn_mapping_level(kvm, gfn, pfn, slot);
}

B
Ben Gardon 已提交
2931 2932 2933
/*
 * Decide at which level the fault on @gfn should be mapped and, for a huge
 * mapping, align *@pfnp down to the start of the huge page.
 *
 * *@req_level receives the level that would be used if huge pages were
 * allowed; it is PG_LEVEL_4K on every early-exit path that precedes the
 * level computation.
 *
 * Returns the level to map at.  PG_LEVEL_4K is returned when @max_level is
 * 4K, the pfn is an error/noslot/reserved pfn, no memslot is found, only a
 * 4K mapping is possible, or @huge_page_disallowed is set.
 */
int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
			    int max_level, kvm_pfn_t *pfnp,
			    bool huge_page_disallowed, int *req_level)
{
	kvm_pfn_t pfn = *pfnp;
	struct kvm_memory_slot *slot;
	kvm_pfn_t offset_mask;
	int level;

	*req_level = PG_LEVEL_4K;

	if (unlikely(max_level == PG_LEVEL_4K))
		return PG_LEVEL_4K;

	if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn))
		return PG_LEVEL_4K;

	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true);
	if (!slot)
		return PG_LEVEL_4K;

	level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level);
	if (level == PG_LEVEL_4K)
		return level;

	level = min(level, max_level);
	*req_level = level;

	/*
	 * Enforce the iTLB multihit workaround after capturing the requested
	 * level, which will be used to do precise, accurate accounting.
	 */
	if (huge_page_disallowed)
		return PG_LEVEL_4K;

	/*
	 * mmu_notifier_retry() was successful and mmu_lock is held, so
	 * the pmd can't be split from under us.
	 */
	offset_mask = KVM_PAGES_PER_HPAGE(level) - 1;
	VM_BUG_ON((gfn & offset_mask) != (pfn & offset_mask));
	*pfnp = pfn & ~offset_mask;

	return level;
}

B
Ben Gardon 已提交
2976 2977
/*
 * If the walk has reached the goal level (above 4K) and found a present,
 * non-large SPTE there, step the goal down one level and patch *@pfnp with
 * the bits of @gfn that select the smaller page inside the huge page.
 * Otherwise nothing is changed.
 */
void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
				kvm_pfn_t *pfnp, int *goal_levelp)
{
	int goal = *goal_levelp;
	u64 page_mask;

	if (cur_level != goal || goal <= PG_LEVEL_4K)
		return;

	if (!is_shadow_present_pte(spte) || is_large_pte(spte))
		return;

	/*
	 * A small SPTE exists for this pfn, but FNAME(fetch)
	 * and __direct_map would like to create a large PTE
	 * instead: just force them to go down another level,
	 * patching back for them into pfn the next 9 bits of
	 * the address.
	 */
	page_mask = KVM_PAGES_PER_HPAGE(goal) -
		    KVM_PAGES_PER_HPAGE(goal - 1);
	*pfnp |= gfn & page_mask;
	(*goal_levelp)--;
}

2998
/*
 * Map @gpa -> @pfn in the direct-map shadow page tables.
 *
 * The target level is chosen by kvm_mmu_hugepage_adjust().  The shadow page
 * tables are then walked down to that level, allocating and linking
 * intermediate shadow pages as needed, and the final SPTE is installed with
 * mmu_set_spte().  Unless the fault turned out to be spurious, neighbouring
 * SPTEs are prefetched and the pf_fixed statistic is bumped.
 *
 * Returns the RET_PF_* value produced by mmu_set_spte().
 */
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
			int map_writable, int max_level, kvm_pfn_t pfn,
			bool prefault, bool is_tdp)
{
	bool nx_workaround = is_nx_huge_page_enabled();
	bool write = error_code & PFERR_WRITE_MASK;
	bool exec = error_code & PFERR_FETCH_MASK;
	bool huge_page_disallowed = exec && nx_workaround;
	struct kvm_shadow_walk_iterator it;
	struct kvm_mmu_page *sp;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	gfn_t base_gfn = gfn;
	int level, req_level, ret;

	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
					huge_page_disallowed, &req_level);

	trace_kvm_mmu_spte_requested(gpa, level, pfn);
	for_each_shadow_entry(vcpu, gpa, it) {
		/*
		 * We cannot overwrite existing page tables with an NX
		 * large page, as the leaf could be executable.
		 */
		if (nx_workaround)
			disallowed_hugepage_adjust(*it.sptep, gfn, it.level,
						   &pfn, &level);

		base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
		if (it.level == level)
			break;

		drop_large_spte(vcpu, it.sptep);

		/* Only populate the entry if no page table is linked yet. */
		if (!is_shadow_present_pte(*it.sptep)) {
			sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
					      it.level - 1, true, ACC_ALL);

			link_shadow_page(vcpu, it.sptep, sp);
			if (is_tdp && huge_page_disallowed &&
			    req_level >= it.level)
				account_huge_nx_page(vcpu->kvm, sp);
		}
	}

	ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
			   write, level, base_gfn, pfn, prefault,
			   map_writable);
	if (ret == RET_PF_SPURIOUS)
		return ret;

	direct_pte_prefetch(vcpu, it.sptep);
	++vcpu->stat.pf_fixed;
	return ret;
}

H
Huang Ying 已提交
3053
static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
3054
{
3055
	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
3056 3057
}

D
Dan Williams 已提交
3058
/*
 * Translate an error pfn into a page fault return code:
 *  - KVM_PFN_ERR_RO_FAULT -> RET_PF_EMULATE
 *  - KVM_PFN_ERR_HWPOISON -> signal the current task, then RET_PF_RETRY
 *  - anything else        -> -EFAULT
 */
static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
{
	unsigned long hva;

	/*
	 * Do not cache, in the spte, the mmio info produced by a write to a
	 * read-only gfn; otherwise a later read of that read-only gfn could
	 * also raise an mmio page fault and be treated as an mmio access.
	 */
	if (pfn == KVM_PFN_ERR_RO_FAULT)
		return RET_PF_EMULATE;

	if (pfn != KVM_PFN_ERR_HWPOISON)
		return -EFAULT;

	hva = kvm_vcpu_gfn_to_hva(vcpu, gfn);
	kvm_send_hwpoison_signal(hva, current);

	return RET_PF_RETRY;
}

3076
static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
3077 3078
				kvm_pfn_t pfn, unsigned int access,
				int *ret_val)
3079 3080
{
	/* The pfn is invalid, report the error! */
3081
	if (unlikely(is_error_pfn(pfn))) {
3082
		*ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
3083
		return true;
3084 3085
	}

3086
	if (unlikely(is_noslot_pfn(pfn))) {
3087 3088
		vcpu_cache_mmio_info(vcpu, gva, gfn,
				     access & shadow_mmio_access_mask);
3089 3090 3091 3092 3093 3094 3095 3096 3097 3098
		/*
		 * If MMIO caching is disabled, emulate immediately without
		 * touching the shadow page tables as attempting to install an
		 * MMIO SPTE will just be an expensive nop.
		 */
		if (unlikely(!shadow_mmio_value)) {
			*ret_val = RET_PF_EMULATE;
			return true;
		}
	}
3099

3100
	return false;
3101 3102
}

3103
/*
 * Decide from the fault error code alone whether fast_page_fault() is worth
 * attempting.
 *
 * A #PF can be fast if:
 * 1. The shadow page table entry is not present, which could mean the fault
 *    is caused by access tracking (if enabled).
 * 2. The shadow page table entry is present and the fault is caused by
 *    write-protection, so only the W bit of the spte needs to change, which
 *    can be done outside mmu-lock.
 *
 * With access tracking disabled, a non-present page must be a genuine fault
 * that requires a new SPTE, so only writes to a present page qualify.
 */
static bool page_fault_can_be_fast(u32 error_code)
{
	const u32 nx_violation = PFERR_FETCH_MASK | PFERR_PRESENT_MASK;
	const u32 wp_violation = PFERR_WRITE_MASK | PFERR_PRESENT_MASK;

	/*
	 * An mmio spte with an invalid generation number must be updated by
	 * the slow page fault path, never fixed here.
	 */
	if (unlikely(error_code & PFERR_RSVD_MASK))
		return false;

	/* Faults due to an NX violation are not handled in the fast path. */
	if (unlikely((error_code & nx_violation) == nx_violation))
		return false;

	if (shadow_acc_track_mask != 0)
		return true;

	return (error_code & wp_violation) == wp_violation;
}

3136 3137 3138 3139
/*
 * Returns true if the SPTE was fixed successfully. Otherwise,
 * someone else modified the SPTE from its original value.
 */
3140
static bool
3141
fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
3142
			u64 *sptep, u64 old_spte, u64 new_spte)
3143 3144 3145 3146 3147
{
	gfn_t gfn;

	WARN_ON(!sp->role.direct);

3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159
	/*
	 * Theoretically we could also set dirty bit (and flush TLB) here in
	 * order to eliminate unnecessary PML logging. See comments in
	 * set_spte. But fast_page_fault is very unlikely to happen with PML
	 * enabled, so we do not do this. This might result in the same GPA
	 * to be logged in PML buffer again when the write really happens, and
	 * eventually to be called by mark_page_dirty twice. But it's also no
	 * harm. This also avoids the TLB flush needed after setting dirty bit
	 * so non-PML cases won't be impacted.
	 *
	 * Compare with set_spte where instead shadow_dirty_mask is set.
	 */
3160
	if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
3161 3162
		return false;

3163
	if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
3164 3165 3166 3167 3168 3169 3170
		/*
		 * The gfn of direct spte is stable since it is
		 * calculated by sp->gfn.
		 */
		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
		kvm_vcpu_mark_page_dirty(vcpu, gfn);
	}
3171 3172 3173 3174

	return true;
}

3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186
/*
 * Report whether @spte already permits the access described by
 * @fault_err_code.  Fetch is checked first, then write; any other fault is
 * treated as a read, which only needs the present bit.
 */
static bool is_access_allowed(u32 fault_err_code, u64 spte)
{
	bool allowed;

	if (fault_err_code & PFERR_FETCH_MASK)
		allowed = is_executable_pte(spte);
	else if (fault_err_code & PFERR_WRITE_MASK)
		allowed = is_writable_pte(spte);
	else
		allowed = spte & PT_PRESENT_MASK;	/* read access */

	return allowed;
}

3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212
/*
 * Walk the shadow page tables for @gpa without holding the lock and return a
 * pointer to the last spte visited, storing its value in *spte.  That spte
 * may be non-present.  If the walk visits no entry at all, NULL is returned
 * and *spte is not written.
 *
 * Contract:
 *  - Must be called between walk_shadow_page_lockless_{begin,end}.
 *  - The returned sptep must not be used after walk_shadow_page_lockless_end.
 */
static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
{
	struct kvm_shadow_walk_iterator walk;
	u64 *last = NULL;
	u64 cur;

	for_each_shadow_entry_lockless(vcpu, gpa, walk, cur) {
		last = walk.sptep;
		*spte = cur;

		/* A non-present entry ends the walk. */
		if (!is_shadow_present_pte(cur))
			break;
	}

	return last;
}

3213
/*
3214
 * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
3215
 */
3216
static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
3217
{
3218
	struct kvm_mmu_page *sp;
3219
	int ret = RET_PF_INVALID;
3220
	u64 spte = 0ull;
3221
	u64 *sptep = NULL;
3222
	uint retry_count = 0;
3223

3224
	if (!page_fault_can_be_fast(error_code))
3225
		return ret;
3226 3227 3228

	walk_shadow_page_lockless_begin(vcpu);

3229
	do {
3230
		u64 new_spte;
3231

3232 3233 3234 3235
		if (is_tdp_mmu(vcpu->arch.mmu))
			sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte);
		else
			sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte);
3236

3237 3238 3239
		if (!is_shadow_present_pte(spte))
			break;

3240
		sp = sptep_to_sp(sptep);
3241 3242
		if (!is_last_spte(spte, sp->role.level))
			break;
3243

3244
		/*
3245 3246 3247 3248 3249
		 * Check whether the memory access that caused the fault would
		 * still cause it if it were to be performed right now. If not,
		 * then this is a spurious fault caused by TLB lazily flushed,
		 * or some other CPU has already fixed the PTE after the
		 * current CPU took the fault.
3250 3251 3252 3253
		 *
		 * Need not check the access of upper level table entries since
		 * they are always ACC_ALL.
		 */
3254
		if (is_access_allowed(error_code, spte)) {
3255
			ret = RET_PF_SPURIOUS;
3256 3257
			break;
		}
3258

3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269
		new_spte = spte;

		if (is_access_track_spte(spte))
			new_spte = restore_acc_track_spte(new_spte);

		/*
		 * Currently, to simplify the code, write-protection can
		 * be removed in the fast path only if the SPTE was
		 * write-protected for dirty-logging or access tracking.
		 */
		if ((error_code & PFERR_WRITE_MASK) &&
3270
		    spte_can_locklessly_be_made_writable(spte)) {
3271
			new_spte |= PT_WRITABLE_MASK;
3272 3273

			/*
3274 3275 3276 3277 3278 3279 3280 3281 3282
			 * Do not fix write-permission on the large spte.  Since
			 * we only dirty the first page into the dirty-bitmap in
			 * fast_pf_fix_direct_spte(), other pages are missed
			 * if its slot has dirty logging enabled.
			 *
			 * Instead, we let the slow page fault path create a
			 * normal spte to fix the access.
			 *
			 * See the comments in kvm_arch_commit_memory_region().
3283
			 */
3284
			if (sp->role.level > PG_LEVEL_4K)
3285
				break;
3286
		}
3287

3288
		/* Verify that the fault can be handled in the fast path */
3289 3290
		if (new_spte == spte ||
		    !is_access_allowed(error_code, new_spte))
3291 3292 3293 3294 3295
			break;

		/*
		 * Currently, fast page fault only works for direct mapping
		 * since the gfn is not stable for indirect shadow page. See
3296
		 * Documentation/virt/kvm/locking.rst to get more detail.
3297
		 */
3298
		if (fast_pf_fix_direct_spte(vcpu, sp, sptep, spte, new_spte)) {
3299
			ret = RET_PF_FIXED;
3300
			break;
3301
		}
3302 3303 3304 3305 3306 3307 3308 3309

		if (++retry_count > 4) {
			printk_once(KERN_WARNING
				"kvm: Fast #PF retrying more than 4 times.\n");
			break;
		}

	} while (true);
3310

3311
	trace_fast_page_fault(vcpu, gpa, error_code, sptep, spte, ret);
3312 3313
	walk_shadow_page_lockless_end(vcpu);

3314
	return ret;
3315 3316
}

3317 3318
/*
 * Drop the reference that *root_hpa holds on its shadow page and mark the
 * slot INVALID_PAGE.  Does nothing if *root_hpa is not a valid page.
 *
 * TDP MMU roots are released through kvm_tdp_mmu_put_root().  Other roots
 * have root_count decremented and, once it reaches zero on a page whose role
 * is marked invalid, are queued on @invalid_list for zapping.
 */
static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
			       struct list_head *invalid_list)
{
	struct kvm_mmu_page *root_sp;

	if (!VALID_PAGE(*root_hpa))
		return;

	root_sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);

	if (is_tdp_mmu_page(root_sp)) {
		kvm_tdp_mmu_put_root(kvm, root_sp, false);
	} else {
		--root_sp->root_count;
		if (!root_sp->root_count && root_sp->role.invalid)
			kvm_mmu_prepare_zap_page(kvm, root_sp, invalid_list);
	}

	*root_hpa = INVALID_PAGE;
}

3335
/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
3336 3337
void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			ulong roots_to_free)
3338
{
3339
	struct kvm *kvm = vcpu->kvm;
3340 3341
	int i;
	LIST_HEAD(invalid_list);
3342
	bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
3343

3344
	BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3345

3346
	/* Before acquiring the MMU lock, see if we need to do any real work. */
3347 3348 3349 3350 3351 3352 3353 3354 3355
	if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
			if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
			    VALID_PAGE(mmu->prev_roots[i].hpa))
				break;

		if (i == KVM_MMU_NUM_PREV_ROOTS)
			return;
	}
3356

3357
	write_lock(&kvm->mmu_lock);
3358

3359 3360
	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
		if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3361
			mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
3362
					   &invalid_list);
3363

3364 3365 3366
	if (free_active_root) {
		if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
		    (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
3367
			mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list);
3368
		} else if (mmu->pae_root) {
3369 3370 3371 3372 3373 3374 3375 3376
			for (i = 0; i < 4; ++i) {
				if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
					continue;

				mmu_free_root_page(kvm, &mmu->pae_root[i],
						   &invalid_list);
				mmu->pae_root[i] = INVALID_PAE_ROOT;
			}
3377
		}
3378
		mmu->root_hpa = INVALID_PAGE;
3379
		mmu->root_pgd = 0;
3380
	}
3381

3382
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
3383
	write_unlock(&kvm->mmu_lock);
3384
}
3385
EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3386

3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413
void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
{
	unsigned long roots_to_free = 0;
	hpa_t root_hpa;
	int i;

	/*
	 * This should not be called while L2 is active, L2 can't invalidate
	 * _only_ its own roots, e.g. INVVPID unconditionally exits.
	 */
	WARN_ON_ONCE(mmu->mmu_role.base.guest_mode);

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
		root_hpa = mmu->prev_roots[i].hpa;
		if (!VALID_PAGE(root_hpa))
			continue;

		if (!to_shadow_page(root_hpa) ||
			to_shadow_page(root_hpa)->role.guest_mode)
			roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
	}

	kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);


3414 3415 3416 3417
/*
 * Check that @root_gfn is visible to the vCPU.  Returns 0 if it is.
 * Otherwise requests KVM_REQ_TRIPLE_FAULT on the vCPU and returns 1.
 */
static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
{
	if (kvm_vcpu_is_visible_gfn(vcpu, root_gfn))
		return 0;

	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	return 1;
}

3426 3427
/*
 * Obtain the shadow page for (@gfn, @gva, @level, @direct) with ACC_ALL
 * access, take a root reference on it and return the physical address of its
 * spt table.
 */
static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
			    u8 level, bool direct)
{
	struct kvm_mmu_page *root_sp =
		kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);

	root_sp->root_count++;

	return __pa(root_sp->spt);
}

static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
{
3439 3440
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	u8 shadow_root_level = mmu->shadow_root_level;
3441
	hpa_t root;
3442
	unsigned i;
3443 3444 3445 3446 3447 3448
	int r;

	write_lock(&vcpu->kvm->mmu_lock);
	r = make_mmu_pages_available(vcpu);
	if (r < 0)
		goto out_unlock;
3449

3450
	if (is_tdp_mmu_enabled(vcpu->kvm)) {
3451
		root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
3452
		mmu->root_hpa = root;
3453
	} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
3454
		root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
3455
		mmu->root_hpa = root;
3456
	} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
3457 3458 3459 3460
		if (WARN_ON_ONCE(!mmu->pae_root)) {
			r = -EIO;
			goto out_unlock;
		}
3461

3462
		for (i = 0; i < 4; ++i) {
3463
			WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3464

3465 3466
			root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
					      i << 30, PT32_ROOT_LEVEL, true);
3467 3468
			mmu->pae_root[i] = root | PT_PRESENT_MASK |
					   shadow_me_mask;
3469
		}
3470
		mmu->root_hpa = __pa(mmu->pae_root);
3471 3472
	} else {
		WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
3473 3474
		r = -EIO;
		goto out_unlock;
3475
	}
3476

3477
	/* root_pgd is ignored for direct MMUs. */
3478
	mmu->root_pgd = 0;
3479 3480 3481
out_unlock:
	write_unlock(&vcpu->kvm->mmu_lock);
	return r;
3482 3483 3484
}

static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3485
{
3486
	struct kvm_mmu *mmu = vcpu->arch.mmu;
3487
	u64 pdptrs[4], pm_mask;
3488
	gfn_t root_gfn, root_pgd;
3489
	hpa_t root;
3490 3491
	unsigned i;
	int r;
3492

3493
	root_pgd = mmu->get_guest_pgd(vcpu);
3494
	root_gfn = root_pgd >> PAGE_SHIFT;
3495

3496 3497 3498
	if (mmu_check_root(vcpu, root_gfn))
		return 1;

3499 3500 3501 3502
	/*
	 * On SVM, reading PDPTRs might access guest memory, which might fault
	 * and thus might sleep.  Grab the PDPTRs before acquiring mmu_lock.
	 */
3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513
	if (mmu->root_level == PT32E_ROOT_LEVEL) {
		for (i = 0; i < 4; ++i) {
			pdptrs[i] = mmu->get_pdptr(vcpu, i);
			if (!(pdptrs[i] & PT_PRESENT_MASK))
				continue;

			if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
				return 1;
		}
	}

3514 3515 3516 3517
	r = alloc_all_memslots_rmaps(vcpu->kvm);
	if (r)
		return r;

3518 3519 3520 3521 3522
	write_lock(&vcpu->kvm->mmu_lock);
	r = make_mmu_pages_available(vcpu);
	if (r < 0)
		goto out_unlock;

3523 3524 3525 3526
	/*
	 * Do we shadow a long mode page table? If so we need to
	 * write-protect the guests page table root.
	 */
3527
	if (mmu->root_level >= PT64_ROOT_4LEVEL) {
3528
		root = mmu_alloc_root(vcpu, root_gfn, 0,
3529 3530
				      mmu->shadow_root_level, false);
		mmu->root_hpa = root;
3531
		goto set_root_pgd;
3532
	}
3533

3534 3535 3536 3537
	if (WARN_ON_ONCE(!mmu->pae_root)) {
		r = -EIO;
		goto out_unlock;
	}
3538

3539 3540
	/*
	 * We shadow a 32 bit page table. This may be a legacy 2-level
3541 3542
	 * or a PAE 3-level page table. In either case we need to be aware that
	 * the shadow page table may be a PAE or a long mode page table.
3543
	 */
3544
	pm_mask = PT_PRESENT_MASK | shadow_me_mask;
3545
	if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
3546 3547
		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;

3548
		if (WARN_ON_ONCE(!mmu->pml4_root)) {
3549 3550 3551
			r = -EIO;
			goto out_unlock;
		}
3552

3553
		mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
3554 3555
	}

3556
	for (i = 0; i < 4; ++i) {
3557
		WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3558

3559
		if (mmu->root_level == PT32E_ROOT_LEVEL) {
3560
			if (!(pdptrs[i] & PT_PRESENT_MASK)) {
3561
				mmu->pae_root[i] = INVALID_PAE_ROOT;
A
Avi Kivity 已提交
3562 3563
				continue;
			}
3564
			root_gfn = pdptrs[i] >> PAGE_SHIFT;
3565
		}
3566

3567 3568
		root = mmu_alloc_root(vcpu, root_gfn, i << 30,
				      PT32_ROOT_LEVEL, false);
3569
		mmu->pae_root[i] = root | pm_mask;
3570
	}
3571

3572
	if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
3573
		mmu->root_hpa = __pa(mmu->pml4_root);
3574 3575
	else
		mmu->root_hpa = __pa(mmu->pae_root);
3576

3577
set_root_pgd:
3578
	mmu->root_pgd = root_pgd;
3579 3580
out_unlock:
	write_unlock(&vcpu->kvm->mmu_lock);
3581

3582
	return 0;
3583 3584
}

3585 3586 3587
static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
3588
	u64 *pml4_root, *pae_root;
3589 3590

	/*
3591 3592 3593 3594
	 * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
	 * tables are allocated and initialized at root creation as there is no
	 * equivalent level in the guest's NPT to shadow.  Allocate the tables
	 * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
3595
	 */
3596 3597 3598
	if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL ||
	    mmu->shadow_root_level < PT64_ROOT_4LEVEL)
		return 0;
3599

3600 3601 3602 3603 3604 3605
	/*
	 * This mess only works with 4-level paging and needs to be updated to
	 * work with 5-level paging.
	 */
	if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
		return -EIO;
3606

3607
	if (mmu->pae_root && mmu->pml4_root)
3608
		return 0;
3609

3610 3611 3612 3613
	/*
	 * The special roots should always be allocated in concert.  Yell and
	 * bail if KVM ends up in a state where only one of the roots is valid.
	 */
3614
	if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root))
3615
		return -EIO;
3616

3617 3618 3619 3620
	/*
	 * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
	 * doesn't need to be decrypted.
	 */
3621 3622 3623
	pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
	if (!pae_root)
		return -ENOMEM;
3624

3625 3626
	pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
	if (!pml4_root) {
3627 3628
		free_page((unsigned long)pae_root);
		return -ENOMEM;
3629 3630
	}

3631
	mmu->pae_root = pae_root;
3632
	mmu->pml4_root = pml4_root;
3633

3634
	return 0;
3635 3636
}

3637
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3638 3639 3640 3641
{
	int i;
	struct kvm_mmu_page *sp;

3642
	if (vcpu->arch.mmu->direct_map)
3643 3644
		return;

3645
	if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3646
		return;
3647

3648
	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3649

3650 3651
	if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
		hpa_t root = vcpu->arch.mmu->root_hpa;
3652
		sp = to_shadow_page(root);
3653 3654 3655 3656 3657 3658 3659 3660

		/*
		 * Even if another CPU was marking the SP as unsync-ed
		 * simultaneously, any guest page table changes are not
		 * guaranteed to be visible anyway until this VCPU issues a TLB
		 * flush strictly after those changes are made. We only need to
		 * ensure that the other CPU sets these flags before any actual
		 * changes to the page tables are made. The comments in
3661 3662
		 * mmu_try_to_unsync_pages() describe what could go wrong if
		 * this requirement isn't satisfied.
3663 3664 3665 3666 3667
		 */
		if (!smp_load_acquire(&sp->unsync) &&
		    !smp_load_acquire(&sp->unsync_children))
			return;

3668
		write_lock(&vcpu->kvm->mmu_lock);
3669 3670
		kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);

3671
		mmu_sync_children(vcpu, sp);
3672

3673
		kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3674
		write_unlock(&vcpu->kvm->mmu_lock);
3675 3676
		return;
	}
3677

3678
	write_lock(&vcpu->kvm->mmu_lock);
3679 3680
	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);

3681
	for (i = 0; i < 4; ++i) {
3682
		hpa_t root = vcpu->arch.mmu->pae_root[i];
3683

3684
		if (IS_VALID_PAE_ROOT(root)) {
3685
			root &= PT64_BASE_ADDR_MASK;
3686
			sp = to_shadow_page(root);
3687 3688 3689 3690
			mmu_sync_children(vcpu, sp);
		}
	}

3691
	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3692
	write_unlock(&vcpu->kvm->mmu_lock);
3693 3694
}

3695
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
3696
				  u32 access, struct x86_exception *exception)
A
Avi Kivity 已提交
3697
{
3698 3699
	if (exception)
		exception->error_code = 0;
A
Avi Kivity 已提交
3700 3701 3702
	return vaddr;
}

3703
static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
3704 3705
					 u32 access,
					 struct x86_exception *exception)
3706
{
3707 3708
	if (exception)
		exception->error_code = 0;
3709
	return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
3710 3711
}

3712
/*
 * Report whether @addr matches the vCPU's cached MMIO info.  @addr is
 * matched as a GPA when @direct is true and as a GVA otherwise.
 */
static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
	/*
	 * A nested guest cannot use the MMIO cache if it is using nested
	 * page tables, because cr2 is a nGPA while the cache stores GPAs.
	 */
	if (mmu_is_nested(vcpu))
		return false;

	return direct ? vcpu_match_mmio_gpa(vcpu, addr) :
			vcpu_match_mmio_gva(vcpu, addr);
}

3727 3728 3729
/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
3730 3731
 *
 * Must be called between walk_shadow_page_lockless_{begin,end}.
3732
 */
3733
static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
3734 3735
{
	struct kvm_shadow_walk_iterator iterator;
3736
	int leaf = -1;
3737
	u64 spte;
3738

3739 3740
	for (shadow_walk_init(&iterator, vcpu, addr),
	     *root_level = iterator.level;
3741 3742
	     shadow_walk_okay(&iterator);
	     __shadow_walk_next(&iterator, spte)) {
3743
		leaf = iterator.level;
3744 3745
		spte = mmu_spte_get_lockless(iterator.sptep);

3746
		sptes[leaf] = spte;
3747

3748 3749
		if (!is_shadow_present_pte(spte))
			break;
3750 3751 3752 3753 3754
	}

	return leaf;
}

3755
/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
3756 3757
static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
{
3758
	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
3759
	struct rsvd_bits_validate *rsvd_check;
3760
	int root, leaf, level;
3761 3762
	bool reserved = false;

3763 3764
	walk_shadow_page_lockless_begin(vcpu);

3765
	if (is_tdp_mmu(vcpu->arch.mmu))
3766
		leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
3767
	else
3768
		leaf = get_walk(vcpu, addr, sptes, &root);
3769

3770 3771
	walk_shadow_page_lockless_end(vcpu);

3772 3773 3774 3775 3776
	if (unlikely(leaf < 0)) {
		*sptep = 0ull;
		return reserved;
	}

3777 3778 3779 3780 3781 3782 3783 3784 3785 3786
	*sptep = sptes[leaf];

	/*
	 * Skip reserved bits checks on the terminal leaf if it's not a valid
	 * SPTE.  Note, this also (intentionally) skips MMIO SPTEs, which, by
	 * design, always have reserved bits set.  The purpose of the checks is
	 * to detect reserved bits on non-MMIO SPTEs. i.e. buggy SPTEs.
	 */
	if (!is_shadow_present_pte(sptes[leaf]))
		leaf++;
3787 3788 3789

	rsvd_check = &vcpu->arch.mmu->shadow_zero_check;

3790
	for (level = root; level >= leaf; level--)
3791
		reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
3792 3793

	if (reserved) {
3794
		pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
3795
		       __func__, addr);
3796
		for (level = root; level >= leaf; level--)
3797 3798
			pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
			       sptes[level], level,
3799
			       get_rsvd_bits(rsvd_check, sptes[level], level));
3800
	}
3801

3802
	return reserved;
3803 3804
}

P
Paolo Bonzini 已提交
3805
/*
 * Handle a fault that may target MMIO.
 *
 * Returns RET_PF_EMULATE when @addr hits the MMIO cache or a valid MMIO SPTE,
 * RET_PF_INVALID when the MMIO SPTE fails check_mmio_spte(), -EINVAL when
 * reserved bits are found on a non-MMIO SPTE, and RET_PF_RETRY otherwise.
 */
static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
	unsigned int access;
	bool reserved;
	gfn_t gfn;
	u64 spte;

	if (mmio_info_in_cache(vcpu, addr, direct))
		return RET_PF_EMULATE;

	reserved = get_mmio_spte(vcpu, addr, &spte);
	if (WARN_ON(reserved))
		return -EINVAL;

	/*
	 * If the page table is zapped by other cpus, let CPU fault again on
	 * the address.
	 */
	if (!is_mmio_spte(spte))
		return RET_PF_RETRY;

	gfn = get_mmio_spte_gfn(spte);
	access = get_mmio_spte_access(spte);

	if (!check_mmio_spte(vcpu, spte))
		return RET_PF_INVALID;

	/* For direct faults the address is traced and cached as 0. */
	if (direct)
		addr = 0;

	trace_handle_mmio_page_fault(addr, gfn, access);
	vcpu_cache_mmio_info(vcpu, addr, gfn, access);

	return RET_PF_EMULATE;
}

3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858
/*
 * Returns true when the fault is a write to a present page whose gfn is under
 * active write tracking, and false for every other fault.
 */
static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
					 u32 error_code, gfn_t gfn)
{
	bool present = error_code & PFERR_PRESENT_MASK;
	bool write = error_code & PFERR_WRITE_MASK;

	if (unlikely(error_code & PFERR_RSVD_MASK))
		return false;

	if (!present || !write)
		return false;

	/*
	 * The guest is writing a write-tracked page, which the page fault
	 * handler cannot fix.
	 */
	return kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE);
}

3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872
/*
 * Walk the shadow page tables for @addr without the lock and call
 * clear_sp_write_flooding_count() on every entry visited, stopping after the
 * first non-present one.
 */
static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
{
	struct kvm_shadow_walk_iterator walk;
	u64 cur;

	walk_shadow_page_lockless_begin(vcpu);

	for_each_shadow_entry_lockless(vcpu, addr, walk, cur) {
		clear_sp_write_flooding_count(walk.sptep);

		if (!is_shadow_present_pte(cur))
			break;
	}

	walk_shadow_page_lockless_end(vcpu);
}

3873 3874
/*
 * Fill in the arch-specific async page fault descriptor for @gfn and queue
 * the request through kvm_setup_async_pf(), whose result is returned.
 */
static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
				    gfn_t gfn)
{
	struct kvm_arch_async_pf arch;
	unsigned long hva;

	/* Token: per-vCPU sequence number above bit 12, vcpu_id below. */
	arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
	arch.gfn = gfn;
	arch.direct_map = vcpu->arch.mmu->direct_map;
	arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);

	hva = kvm_vcpu_gfn_to_hva(vcpu, gfn);

	return kvm_setup_async_pf(vcpu, cr2_or_gpa, hva, &arch);
}

3887
static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3888
			 gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
3889
			 bool write, bool *writable, int *r)
3890
{
3891
	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3892 3893
	bool async;

3894 3895 3896 3897 3898 3899
	/*
	 * Retry the page fault if the gfn hit a memslot that is being deleted
	 * or moved.  This ensures any existing SPTEs for the old memslot will
	 * be zapped before KVM inserts a new MMIO SPTE for the gfn.
	 */
	if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
3900
		goto out_retry;
3901

3902 3903
	/* Don't expose private memslots to L2. */
	if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
3904
		*pfn = KVM_PFN_NOSLOT;
3905
		*writable = false;
3906 3907 3908
		return false;
	}

3909
	async = false;
3910 3911
	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
				    write, writable, hva);
3912 3913 3914
	if (!async)
		return false; /* *pfn has correct page already */

3915
	if (!prefault && kvm_can_do_async_pf(vcpu)) {
3916
		trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
3917
		if (kvm_find_async_pf_gfn(vcpu, gfn)) {
3918
			trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
3919
			kvm_make_request(KVM_REQ_APF_HALT, vcpu);
3920
			goto out_retry;
3921
		} else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
3922
			goto out_retry;
3923 3924
	}

3925 3926
	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
				    write, writable, hva);
3927 3928 3929 3930

out_retry:
	*r = RET_PF_RETRY;
	return true;
3931 3932
}

3933 3934
/*
 * Common page fault handler for direct-mapped MMUs.
 *
 * Order of attempts: page-track emulation, the lockless fast path, then the
 * slow path, which resolves the pfn and installs the mapping under mmu_lock.
 * The lock is taken for read on a TDP MMU fault and for write otherwise.
 * Returns a RET_PF_* code or an error value from one of the helpers.
 */
static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
			     bool prefault, int max_level, bool is_tdp)
{
	bool tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
	bool write = error_code & PFERR_WRITE_MASK;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	unsigned long mmu_seq;
	bool map_writable;
	kvm_pfn_t pfn;
	hva_t hva;
	int r;

	if (page_fault_handle_page_track(vcpu, error_code, gfn))
		return RET_PF_EMULATE;

	r = fast_page_fault(vcpu, gpa, error_code);
	if (r != RET_PF_INVALID)
		return r;

	r = mmu_topup_memory_caches(vcpu, false);
	if (r)
		return r;

	/* Sample the notifier sequence before looking up the pfn. */
	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();

	if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva,
			    write, &map_writable, &r))
		return r;

	if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
		return r;

	r = RET_PF_RETRY;

	if (tdp_mmu_fault)
		read_lock(&vcpu->kvm->mmu_lock);
	else
		write_lock(&vcpu->kvm->mmu_lock);

	/* Retry if the notifier check reports the hva changed meanwhile. */
	if (!is_noslot_pfn(pfn) &&
	    mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
		goto out_unlock;

	r = make_mmu_pages_available(vcpu);
	if (r)
		goto out_unlock;

	if (tdp_mmu_fault)
		r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable,
				    max_level, pfn, prefault);
	else
		r = __direct_map(vcpu, gpa, error_code, map_writable,
				 max_level, pfn, prefault, is_tdp);

out_unlock:
	if (tdp_mmu_fault)
		read_unlock(&vcpu->kvm->mmu_lock);
	else
		write_unlock(&vcpu->kvm->mmu_lock);

	kvm_release_pfn_clean(pfn);
	return r;
}

3996 3997 3998 3999 4000 4001 4002
/*
 * Page fault handler for the non-paging MMU context: forwards the
 * page-aligned address to direct_page_fault() with is_tdp == false.
 */
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
				u32 error_code, bool prefault)
{
	gpa_t aligned_gpa = gpa & PAGE_MASK;

	pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);

	/* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
	return direct_page_fault(vcpu, aligned_gpa, error_code, prefault,
				 PG_LEVEL_2M, false);
}

4006
/*
 * Entry point for a page fault intercepted from the guest.
 *
 * With no host async-PF flags pending the fault is passed to
 * kvm_mmu_page_fault() and its result returned.  With
 * KVM_PV_REASON_PAGE_NOT_PRESENT set, the flags are cleared and the task
 * waits in kvm_async_pf_task_wait_schedule(); 1 is returned.  Any other flag
 * value triggers a one-time warning and also returns 1.
 */
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
				u64 fault_address, char *insn, int insn_len)
{
	u32 flags = vcpu->arch.apf.host_apf_flags;

#ifndef CONFIG_X86_64
	/* A 64-bit CR2 should be impossible on 32-bit KVM. */
	if (WARN_ON_ONCE(fault_address >> 32))
		return -EFAULT;
#endif

	vcpu->arch.l1tf_flush_l1d = true;

	if (!flags) {
		trace_kvm_page_fault(fault_address, error_code);

		if (kvm_event_needs_reinjection(vcpu))
			kvm_mmu_unprotect_page_virt(vcpu, fault_address);

		return kvm_mmu_page_fault(vcpu, fault_address, error_code,
					  insn, insn_len);
	}

	if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
		vcpu->arch.apf.host_apf_flags = 0;
		local_irq_disable();
		kvm_async_pf_task_wait_schedule(fault_address);
		local_irq_enable();
	} else {
		WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
	}

	return 1;
}
EXPORT_SYMBOL_GPL(kvm_handle_page_fault);

4039 4040
/*
 * TDP page fault handler.  Starting at KVM_MAX_HUGEPAGE_LEVEL, lower the
 * maximum mapping level until kvm_mtrr_check_gfn_range_consistency() accepts
 * the gfn range covered by a page of that level (or PG_LEVEL_4K is reached),
 * then hand the fault to direct_page_fault() with is_tdp == true.
 */
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
		       bool prefault)
{
	int max_level = KVM_MAX_HUGEPAGE_LEVEL;

	while (max_level > PG_LEVEL_4K) {
		int page_num = KVM_PAGES_PER_HPAGE(max_level);
		gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1);

		if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
			break;

		max_level--;
	}

	return direct_page_fault(vcpu, gpa, error_code, prefault,
				 max_level, true);
}

4058
static void nonpaging_init_context(struct kvm_mmu *context)
A
Avi Kivity 已提交
4059 4060 4061
{
	context->page_fault = nonpaging_page_fault;
	context->gva_to_gpa = nonpaging_gva_to_gpa;
4062
	context->sync_page = nonpaging_sync_page;
4063
	context->invlpg = NULL;
4064
	context->direct_map = true;
A
Avi Kivity 已提交
4065 4066
}

4067
/*
 * Return true if @root can serve as the root for @pgd/@role: the pgd must
 * match unless the role is direct, the root's hpa must be valid and backed by
 * a shadow page, and that shadow page's role must equal @role.
 */
static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
				  union kvm_mmu_page_role role)
{
	if (!role.direct && pgd != root->pgd)
		return false;

	if (!VALID_PAGE(root->hpa) || !to_shadow_page(root->hpa))
		return false;

	return to_shadow_page(root->hpa)->role.word == role.word;
}

4075
/*
4076
 * Find out if a previously cached root matching the new pgd/role is available.
4077 4078 4079 4080 4081 4082
 * The current root is also inserted into the cache.
 * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
 * returned.
 * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
 * false is returned. This root should now be freed by the caller.
 */
4083
static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_pgd,
4084 4085 4086 4087
				  union kvm_mmu_page_role new_role)
{
	uint i;
	struct kvm_mmu_root_info root;
4088
	struct kvm_mmu *mmu = vcpu->arch.mmu;
4089

4090
	root.pgd = mmu->root_pgd;
4091 4092
	root.hpa = mmu->root_hpa;

4093
	if (is_root_usable(&root, new_pgd, new_role))
4094 4095
		return true;

4096 4097 4098
	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
		swap(root, mmu->prev_roots[i]);

4099
		if (is_root_usable(&root, new_pgd, new_role))
4100 4101 4102 4103
			break;
	}

	mmu->root_hpa = root.hpa;
4104
	mmu->root_pgd = root.pgd;
4105 4106 4107 4108

	return i < KVM_MMU_NUM_PREV_ROOTS;
}

4109
/*
 * Try to switch to @new_pgd/@new_role by reusing a cached root.  Returns the
 * result of cached_root_available(), or false when the fast path is not
 * attempted at all.
 */
static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd,
			    union kvm_mmu_page_role new_role)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;

	/*
	 * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
	 * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
	 * later if necessary.
	 */
	if (mmu->shadow_root_level < PT64_ROOT_4LEVEL ||
	    mmu->root_level < PT64_ROOT_4LEVEL)
		return false;

	return cached_root_available(vcpu, new_pgd, new_role);
}

4126
/*
 * Switch the vCPU's current MMU to @new_pgd with @new_role.
 *
 * If no cached root can be reused, the current root is freed and nothing else
 * is done.  Otherwise a pgd load is requested (plus a sync and TLB flush when
 * force_flush_and_sync_on_reuse is set), cached MMIO info is dropped, and the
 * write-flooding count of a non-direct root is cleared.
 */
static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
			      union kvm_mmu_page_role new_role)
{
	bool reused = fast_pgd_switch(vcpu, new_pgd, new_role);

	if (!reused) {
		kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT);
		return;
	}

	/*
	 * The reused root may be obsolete due to an MMU generation change.
	 * That is fine: a generation change always comes with
	 * KVM_REQ_MMU_RELOAD, which frees the root chosen here and allocates
	 * a fresh one.
	 */
	kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);

	if (force_flush_and_sync_on_reuse) {
		kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/*
	 * The vCPU caches the GVA and GPA of the last MMIO access.  After a
	 * CR3 switch that GVA->GPA mapping may be stale, so drop it even when
	 * the shadow page tables do not need to be synced.
	 */
	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);

	/* A direct root page has no write flooding count to clear. */
	if (new_role.direct)
		return;

	__clear_sp_write_flooding_count(
			to_shadow_page(vcpu->arch.mmu->root_hpa));
}

4164
/*
 * Switch the vCPU to @new_pgd, using the root page role computed by
 * kvm_mmu_calc_root_page_role() for its current state.
 */
void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
{
	union kvm_mmu_page_role role = kvm_mmu_calc_root_page_role(vcpu);

	__kvm_mmu_new_pgd(vcpu, new_pgd, role);
}
4168
EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
4169

4170 4171
/* ->get_guest_pgd callback: return the vCPU's CR3 as read by kvm_read_cr3(). */
static unsigned long get_cr3(struct kvm_vcpu *vcpu)
{
	unsigned long cr3 = kvm_read_cr3(vcpu);

	return cr3;
}

4175
/*
 * Sync handling for an MMIO SPTE.
 *
 * Returns false, touching nothing, if *@sptep is not an MMIO SPTE.  Otherwise
 * returns true after either clearing the SPTE (its recorded gfn differs from
 * @gfn) or re-marking it for @gfn/@access and bumping *@nr_present.
 */
static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
			   unsigned int access, int *nr_present)
{
	if (likely(!is_mmio_spte(*sptep)))
		return false;

	if (gfn == get_mmio_spte_gfn(*sptep)) {
		(*nr_present)++;
		mark_mmio_spte(vcpu, sptep, gfn, access);
	} else {
		mmu_spte_clear_no_track(sptep);
	}

	return true;
}

4192 4193 4194 4195 4196
#define PTTYPE_EPT 18 /* arbitrary */
#define PTTYPE PTTYPE_EPT
#include "paging_tmpl.h"
#undef PTTYPE

A
Avi Kivity 已提交
4197 4198 4199 4200 4201 4202 4203 4204
#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"
#undef PTTYPE

4205
static void
4206
__reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
4207
			u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
4208
			bool pse, bool amd)
4209
{
4210
	u64 gbpages_bit_rsvd = 0;
4211
	u64 nonleaf_bit8_rsvd = 0;
4212
	u64 high_bits_rsvd;
4213

4214
	rsvd_check->bad_mt_xwr = 0;
4215

4216
	if (!gbpages)
4217
		gbpages_bit_rsvd = rsvd_bits(7, 7);
4218

4219 4220 4221 4222 4223 4224 4225 4226 4227
	if (level == PT32E_ROOT_LEVEL)
		high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
	else
		high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);

	/* Note, NX doesn't exist in PDPTEs, this is handled below. */
	if (!nx)
		high_bits_rsvd |= rsvd_bits(63, 63);

4228 4229 4230 4231
	/*
	 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
	 * leaf entries) on AMD CPUs only.
	 */
4232
	if (amd)
4233 4234
		nonleaf_bit8_rsvd = rsvd_bits(8, 8);

4235
	switch (level) {
4236 4237
	case PT32_ROOT_LEVEL:
		/* no rsvd bits for 2 level 4K page table entries */
4238 4239 4240 4241
		rsvd_check->rsvd_bits_mask[0][1] = 0;
		rsvd_check->rsvd_bits_mask[0][0] = 0;
		rsvd_check->rsvd_bits_mask[1][0] =
			rsvd_check->rsvd_bits_mask[0][0];
4242

4243
		if (!pse) {
4244
			rsvd_check->rsvd_bits_mask[1][1] = 0;
4245 4246 4247
			break;
		}

4248 4249
		if (is_cpuid_PSE36())
			/* 36bits PSE 4MB page */
4250
			rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4251 4252
		else
			/* 32 bits PSE 4MB page */
4253
			rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4254 4255
		break;
	case PT32E_ROOT_LEVEL:
4256 4257 4258 4259 4260 4261 4262 4263
		rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
						   high_bits_rsvd |
						   rsvd_bits(5, 8) |
						   rsvd_bits(1, 2);	/* PDPTE */
		rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;	/* PDE */
		rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;	/* PTE */
		rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
						   rsvd_bits(13, 20);	/* large page */
4264 4265
		rsvd_check->rsvd_bits_mask[1][0] =
			rsvd_check->rsvd_bits_mask[0][0];
4266
		break;
4267
	case PT64_ROOT_5LEVEL:
4268 4269 4270
		rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
						   nonleaf_bit8_rsvd |
						   rsvd_bits(7, 7);
4271 4272
		rsvd_check->rsvd_bits_mask[1][4] =
			rsvd_check->rsvd_bits_mask[0][4];
4273
		fallthrough;
4274
	case PT64_ROOT_4LEVEL:
4275 4276 4277 4278 4279 4280 4281
		rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
						   nonleaf_bit8_rsvd |
						   rsvd_bits(7, 7);
		rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
						   gbpages_bit_rsvd;
		rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
		rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4282 4283
		rsvd_check->rsvd_bits_mask[1][3] =
			rsvd_check->rsvd_bits_mask[0][3];
4284 4285 4286 4287 4288
		rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
						   gbpages_bit_rsvd |
						   rsvd_bits(13, 29);
		rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
						   rsvd_bits(13, 20); /* large page */
4289 4290
		rsvd_check->rsvd_bits_mask[1][0] =
			rsvd_check->rsvd_bits_mask[0][0];
4291 4292 4293 4294
		break;
	}
}

4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309
/*
 * Report whether GBPAGES should be treated as usable by the guest: the host
 * CPU feature decides when TDP is enabled, guest CPUID decides otherwise.
 */
static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
{
	/*
	 * With TDP enabled, follow hardware: the hardware page walker cannot
	 * be told to treat GBPAGES as reserved, and KVM does not redo the
	 * GVA->GPA walk (for performance and complexity reasons, and because
	 * those walks are invisible to KVM once a TDP translation is
	 * installed).  Mimicking hardware keeps KVM's behavior consistent,
	 * i.e. it doesn't randomly inject #PF.
	 */
	if (tdp_enabled)
		return boot_cpu_has(X86_FEATURE_GBPAGES);

	return guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
}

4310 4311 4312
static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
				  struct kvm_mmu *context)
{
4313
	__reset_rsvds_bits_mask(&context->guest_rsvd_check,
4314
				vcpu->arch.reserved_gpa_bits,
4315
				context->root_level, is_efer_nx(context),
4316
				guest_can_use_gbpages(vcpu),
4317
				is_cr4_pse(context),
4318
				guest_cpuid_is_amd_or_hygon(vcpu));
4319 4320
}

4321 4322
static void
__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4323
			    u64 pa_bits_rsvd, bool execonly)
4324
{
4325
	u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4326
	u64 bad_mt_xwr;
4327

4328 4329 4330 4331 4332
	rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
	rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
	rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6);
	rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6);
	rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4333 4334

	/* large page */
4335
	rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4336
	rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4337 4338
	rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29);
	rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20);
4339
	rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4340

4341 4342 4343 4344 4345 4346 4347 4348
	bad_mt_xwr = 0xFFull << (2 * 8);	/* bits 3..5 must not be 2 */
	bad_mt_xwr |= 0xFFull << (3 * 8);	/* bits 3..5 must not be 3 */
	bad_mt_xwr |= 0xFFull << (7 * 8);	/* bits 3..5 must not be 7 */
	bad_mt_xwr |= REPEAT_BYTE(1ull << 2);	/* bits 0..2 must not be 010 */
	bad_mt_xwr |= REPEAT_BYTE(1ull << 6);	/* bits 0..2 must not be 110 */
	if (!execonly) {
		/* bits 0..2 must not be 100 unless VMX capabilities allow it */
		bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4349
	}
4350
	rsvd_check->bad_mt_xwr = bad_mt_xwr;
4351 4352
}

4353 4354 4355 4356
/*
 * Recompute @context->guest_rsvd_check in EPT format from the vCPU's reserved
 * GPA bits.
 */
static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
		struct kvm_mmu *context, bool execonly)
{
	struct rsvd_bits_validate *rsvd_check = &context->guest_rsvd_check;

	__reset_rsvds_bits_mask_ept(rsvd_check, vcpu->arch.reserved_gpa_bits,
				    execonly);
}

4360 4361 4362 4363 4364
/* Mask of the bits from shadow_phys_bits up to and including bit 63. */
static inline u64 reserved_hpa_bits(void)
{
	u64 rsvd = rsvd_bits(shadow_phys_bits, 63);

	return rsvd;
}

4365 4366 4367 4368 4369
/*
 * the page table on host is the shadow page table for the page
 * table in guest or amd nested guest, its mmu features completely
 * follow the features in guest.
 */
4370 4371
static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
					struct kvm_mmu *context)
4372
{
4373 4374 4375 4376 4377 4378 4379 4380
	/*
	 * KVM uses NX when TDP is disabled to handle a variety of scenarios,
	 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
	 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
	 * The iTLB multi-hit workaround can be toggled at any time, so assume
	 * NX can be used by any non-nested shadow MMU to avoid having to reset
	 * MMU contexts.  Note, KVM forces EFER.NX=1 when TDP is disabled.
	 */
4381
	bool uses_nx = is_efer_nx(context) || !tdp_enabled;
4382 4383 4384 4385 4386

	/* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */
	bool is_amd = true;
	/* KVM doesn't use 2-level page tables for the shadow MMU. */
	bool is_pse = false;
4387 4388
	struct rsvd_bits_validate *shadow_zero_check;
	int i;
4389

4390 4391
	WARN_ON_ONCE(context->shadow_root_level < PT32E_ROOT_LEVEL);

4392
	shadow_zero_check = &context->shadow_zero_check;
4393
	__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
4394
				context->shadow_root_level, uses_nx,
4395
				guest_can_use_gbpages(vcpu), is_pse, is_amd);
4396 4397 4398 4399 4400 4401 4402 4403 4404

	if (!shadow_me_mask)
		return;

	for (i = context->shadow_root_level; --i >= 0;) {
		shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
		shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
	}

4405 4406
}

4407 4408 4409 4410 4411 4412
/*
 * Returns true when shadow_x_mask is zero.  Warns once if called while TDP is
 * disabled.
 */
static inline bool boot_cpu_is_amd(void)
{
	WARN_ON_ONCE(!tdp_enabled);

	return !shadow_x_mask;
}

4413 4414 4415 4416 4417 4418 4419 4420
/*
 * the direct page table on host, use as much mmu features as
 * possible, however, kvm currently does not do execution-protection.
 */
static void
reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
				struct kvm_mmu *context)
{
4421 4422 4423 4424 4425
	struct rsvd_bits_validate *shadow_zero_check;
	int i;

	shadow_zero_check = &context->shadow_zero_check;

4426
	if (boot_cpu_is_amd())
4427
		__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
4428
					context->shadow_root_level, false,
4429
					boot_cpu_has(X86_FEATURE_GBPAGES),
4430
					false, true);
4431
	else
4432
		__reset_rsvds_bits_mask_ept(shadow_zero_check,
4433
					    reserved_hpa_bits(), false);
4434

4435 4436 4437 4438 4439 4440 4441
	if (!shadow_me_mask)
		return;

	for (i = context->shadow_root_level; --i >= 0;) {
		shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
		shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
	}
4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452
}

/*
 * Same idea as reset_shadow_zero_bits_mask(), but for the shadow page table
 * of an Intel nested guest: the mask is built in EPT format from the reserved
 * host physical address bits.
 */
static void
reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
				struct kvm_mmu *context, bool execonly)
{
	struct rsvd_bits_validate *shadow_zero_check =
		&context->shadow_zero_check;

	__reset_rsvds_bits_mask_ept(shadow_zero_check, reserved_hpa_bits(),
				    execonly);
}

4456 4457 4458 4459 4460 4461 4462 4463 4464 4465
/*
 * BYTE_MASK(access) builds an 8-bit mask in which bit i (for i = 1..7) is set
 * iff (i & access) is non-zero.  Bit 0 is never set.
 */
#define BYTE_MASK_BIT(idx, access) \
	(((idx) & (access)) ? (1 << (idx)) : 0)

#define BYTE_MASK(access) \
	(BYTE_MASK_BIT(1, access) | \
	 BYTE_MASK_BIT(2, access) | \
	 BYTE_MASK_BIT(3, access) | \
	 BYTE_MASK_BIT(4, access) | \
	 BYTE_MASK_BIT(5, access) | \
	 BYTE_MASK_BIT(6, access) | \
	 BYTE_MASK_BIT(7, access))


4466
/*
 * Rebuild mmu->permissions[].
 *
 * Each array slot corresponds to one page-fault error code (slot index << 1)
 * and holds a byte with one bit per UWX access combination; a set bit means
 * that combination faults for that error code.  With @ept set, only the plain
 * write/user/fetch checks are applied; otherwise EFER.NX, CR0.WP, CR4.SMEP
 * and CR4.SMAP from @mmu are factored in as well.
 */
static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
{
	const u8 x = BYTE_MASK(ACC_EXEC_MASK);
	const u8 w = BYTE_MASK(ACC_WRITE_MASK);
	const u8 u = BYTE_MASK(ACC_USER_MASK);

	bool efer_nx = is_efer_nx(mmu);
	bool cr0_wp = is_cr0_wp(mmu);
	bool cr4_smap = is_cr4_smap(mmu);
	bool cr4_smep = is_cr4_smep(mmu);
	unsigned idx;

	for (idx = 0; idx < ARRAY_SIZE(mmu->permissions); idx++) {
		unsigned pfec = idx << 1;
		bool is_write = pfec & PFERR_WRITE_MASK;
		bool is_user = pfec & PFERR_USER_MASK;
		bool is_fetch = pfec & PFERR_FETCH_MASK;
		bool is_rsvd = pfec & PFERR_RSVD_MASK;

		/*
		 * Each "*f" variable has a 1 bit for each UWX value
		 * that causes a fault with the given PFEC.
		 */

		/* Faults from writes to non-writable pages */
		u8 wf = is_write ? (u8)~w : 0;
		/* Faults from user mode accesses to supervisor pages */
		u8 uf = is_user ? (u8)~u : 0;
		/* Faults from fetches of non-executable pages */
		u8 ff = is_fetch ? (u8)~x : 0;
		/* Faults from kernel mode fetches of user pages */
		u8 smepf = 0;
		/* Faults from kernel mode accesses of user pages */
		u8 smapf = 0;

		if (!ept) {
			/* Faults from kernel mode accesses to user pages */
			u8 kf = is_user ? 0 : u;

			/* Not really needed: !nx will cause pte.nx to fault */
			if (!efer_nx)
				ff = 0;

			/* Allow supervisor writes if !cr0.wp */
			if (!cr0_wp && !is_user)
				wf = 0;

			/* Disallow supervisor fetches of user code if cr4.smep */
			if (cr4_smep && is_fetch)
				smepf = kf;

			/*
			 * SMAP: kernel-mode data accesses to user-mode
			 * mappings should fault.  A fault counts as a SMAP
			 * violation if all of the following hold:
			 *   - X86_CR4_SMAP is set in CR4
			 *   - A user page is accessed
			 *   - The access is not a fetch
			 *   - Page fault in kernel mode
			 *   - if CPL = 3 or X86_EFLAGS_AC is clear
			 *
			 * Here, we cover the first three conditions.
			 * The fourth is computed dynamically in
			 * permission_fault(); PFERR_RSVD_MASK bit will be set
			 * in PFEC if the access is *not* subject to SMAP
			 * restrictions.
			 */
			if (cr4_smap && !is_rsvd && !is_fetch)
				smapf = kf;
		}

		mmu->permissions[idx] = ff | uf | wf | smepf | smapf;
	}
}

4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561
/*
* PKU is an additional mechanism by which the paging controls access to
* user-mode addresses based on the value in the PKRU register.  Protection
* key violations are reported through a bit in the page fault error code.
* Unlike other bits of the error code, the PK bit is not known at the
* call site of e.g. gva_to_gpa; it must be computed directly in
* permission_fault based on two bits of PKRU, on some machine state (CR4,
* CR0, EFER, CPL), and on other bits of the error code and the page tables.
*
* In particular the following conditions come from the error code, the
* page tables and the machine state:
* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
* - PK is always zero if U=0 in the page tables
* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
*
* The PKRU bitmask caches the result of these four conditions.  The error
* code (minus the P bit) and the page table's U bit form an index into the
* PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
* with the two bits of the PKRU register corresponding to the protection key.
* For the first three conditions above the bits will be 00, thus masking
* away both AD and WD.  For all reads or if the last condition holds, WD
* only will be masked away.
*/
4562
static void update_pkru_bitmask(struct kvm_mmu *mmu)
4563 4564 4565 4566
{
	unsigned bit;
	bool wp;

4567
	if (!is_cr4_pke(mmu)) {
4568 4569 4570 4571
		mmu->pkru_mask = 0;
		return;
	}

4572
	wp = is_cr0_wp(mmu);
4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605

	for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
		unsigned pfec, pkey_bits;
		bool check_pkey, check_write, ff, uf, wf, pte_user;

		pfec = bit << 1;
		ff = pfec & PFERR_FETCH_MASK;
		uf = pfec & PFERR_USER_MASK;
		wf = pfec & PFERR_WRITE_MASK;

		/* PFEC.RSVD is replaced by ACC_USER_MASK. */
		pte_user = pfec & PFERR_RSVD_MASK;

		/*
		 * Only need to check the access which is not an
		 * instruction fetch and is to a user page.
		 */
		check_pkey = (!ff && pte_user);
		/*
		 * write access is controlled by PKRU if it is a
		 * user access or CR0.WP = 1.
		 */
		check_write = check_pkey && wf && (uf || wp);

		/* PKRU.AD stops both read and write access. */
		pkey_bits = !!check_pkey;
		/* PKRU.WD stops write access. */
		pkey_bits |= (!!check_write) << 1;

		mmu->pkru_mask |= (pkey_bits & 3) << pfec;
	}
}

4606 4607
static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
					struct kvm_mmu *mmu)
A
Avi Kivity 已提交
4608
{
4609 4610
	if (!is_cr0_pg(mmu))
		return;
4611

4612 4613 4614
	reset_rsvds_bits_mask(vcpu, mmu);
	update_permission_bitmask(mmu, false);
	update_pkru_bitmask(mmu);
A
Avi Kivity 已提交
4615 4616
}

4617
static void paging64_init_context(struct kvm_mmu *context)
A
Avi Kivity 已提交
4618 4619 4620
{
	context->page_fault = paging64_page_fault;
	context->gva_to_gpa = paging64_gva_to_gpa;
4621
	context->sync_page = paging64_sync_page;
M
Marcelo Tosatti 已提交
4622
	context->invlpg = paging64_invlpg;
4623
	context->direct_map = false;
A
Avi Kivity 已提交
4624 4625
}

4626
static void paging32_init_context(struct kvm_mmu *context)
A
Avi Kivity 已提交
4627 4628 4629
{
	context->page_fault = paging32_page_fault;
	context->gva_to_gpa = paging32_gva_to_gpa;
4630
	context->sync_page = paging32_sync_page;
M
Marcelo Tosatti 已提交
4631
	context->invlpg = paging32_invlpg;
4632
	context->direct_map = false;
A
Avi Kivity 已提交
4633 4634
}

4635 4636
static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu,
							 struct kvm_mmu_role_regs *regs)
4637 4638 4639
{
	union kvm_mmu_extended_role ext = {0};

4640 4641 4642 4643 4644 4645
	if (____is_cr0_pg(regs)) {
		ext.cr0_pg = 1;
		ext.cr4_pae = ____is_cr4_pae(regs);
		ext.cr4_smep = ____is_cr4_smep(regs);
		ext.cr4_smap = ____is_cr4_smap(regs);
		ext.cr4_pse = ____is_cr4_pse(regs);
4646 4647 4648 4649

		/* PKEY and LA57 are active iff long mode is active. */
		ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
		ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
4650
	}
4651 4652 4653 4654 4655 4656

	ext.valid = 1;

	return ext;
}

4657
static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
4658
						   struct kvm_mmu_role_regs *regs,
4659 4660 4661 4662 4663
						   bool base_only)
{
	union kvm_mmu_role role = {0};

	role.base.access = ACC_ALL;
4664 4665 4666 4667
	if (____is_cr0_pg(regs)) {
		role.base.efer_nx = ____is_efer_nx(regs);
		role.base.cr0_wp = ____is_cr0_wp(regs);
	}
4668 4669 4670 4671 4672 4673
	role.base.smm = is_smm(vcpu);
	role.base.guest_mode = is_guest_mode(vcpu);

	if (base_only)
		return role;

4674
	role.ext = kvm_calc_mmu_role_ext(vcpu, regs);
4675 4676 4677 4678

	return role;
}

4679 4680 4681
static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
{
	/* Use 5-level TDP if and only if it's useful/necessary. */
4682
	if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
4683 4684
		return 4;

4685
	return max_tdp_level;
4686 4687
}

4688
static union kvm_mmu_role
4689 4690
kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
				struct kvm_mmu_role_regs *regs, bool base_only)
4691
{
4692
	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
4693

4694
	role.base.ad_disabled = (shadow_accessed_mask == 0);
4695
	role.base.level = kvm_mmu_get_tdp_level(vcpu);
4696
	role.base.direct = true;
4697
	role.base.gpte_is_8_bytes = true;
4698 4699 4700 4701

	return role;
}

4702
static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
4703
{
4704
	struct kvm_mmu *context = &vcpu->arch.root_mmu;
4705
	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
4706
	union kvm_mmu_role new_role =
4707
		kvm_calc_tdp_mmu_root_page_role(vcpu, &regs, false);
4708

4709 4710 4711 4712
	if (new_role.as_u64 == context->mmu_role.as_u64)
		return;

	context->mmu_role.as_u64 = new_role.as_u64;
4713
	context->page_fault = kvm_tdp_page_fault;
4714
	context->sync_page = nonpaging_sync_page;
4715
	context->invlpg = NULL;
4716
	context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
4717
	context->direct_map = true;
4718
	context->get_guest_pgd = get_cr3;
4719
	context->get_pdptr = kvm_pdptr_read;
4720
	context->inject_page_fault = kvm_inject_page_fault;
4721
	context->root_level = role_regs_to_root_level(&regs);
4722

4723
	if (!is_cr0_pg(context))
4724
		context->gva_to_gpa = nonpaging_gva_to_gpa;
4725
	else if (is_cr4_pae(context))
4726
		context->gva_to_gpa = paging64_gva_to_gpa;
4727
	else
4728
		context->gva_to_gpa = paging32_gva_to_gpa;
4729

4730
	reset_guest_paging_metadata(vcpu, context);
4731
	reset_tdp_shadow_zero_bits_mask(vcpu, context);
4732 4733
}

4734
static union kvm_mmu_role
4735 4736
kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu,
				      struct kvm_mmu_role_regs *regs, bool base_only)
4737
{
4738
	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
4739

4740 4741
	role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs);
	role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs);
4742
	role.base.gpte_is_8_bytes = ____is_cr0_pg(regs) && ____is_cr4_pae(regs);
4743

4744 4745 4746 4747
	return role;
}

static union kvm_mmu_role
4748 4749
kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
				   struct kvm_mmu_role_regs *regs, bool base_only)
4750 4751
{
	union kvm_mmu_role role =
4752
		kvm_calc_shadow_root_page_role_common(vcpu, regs, base_only);
4753

4754
	role.base.direct = !____is_cr0_pg(regs);
4755

4756
	if (!____is_efer_lma(regs))
4757
		role.base.level = PT32E_ROOT_LEVEL;
4758
	else if (____is_cr4_la57(regs))
4759
		role.base.level = PT64_ROOT_5LEVEL;
4760
	else
4761
		role.base.level = PT64_ROOT_4LEVEL;
4762 4763 4764 4765

	return role;
}

4766
static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
4767
				    struct kvm_mmu_role_regs *regs,
4768
				    union kvm_mmu_role new_role)
4769
{
4770 4771
	if (new_role.as_u64 == context->mmu_role.as_u64)
		return;
4772

4773
	context->mmu_role.as_u64 = new_role.as_u64;
4774

4775
	if (!is_cr0_pg(context))
4776
		nonpaging_init_context(context);
4777
	else if (is_cr4_pae(context))
4778
		paging64_init_context(context);
A
Avi Kivity 已提交
4779
	else
4780
		paging32_init_context(context);
4781
	context->root_level = role_regs_to_root_level(regs);
4782

4783
	reset_guest_paging_metadata(vcpu, context);
4784 4785
	context->shadow_root_level = new_role.base.level;

4786
	reset_shadow_zero_bits_mask(vcpu, context);
4787
}
4788

4789 4790
static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
				struct kvm_mmu_role_regs *regs)
4791
{
4792
	struct kvm_mmu *context = &vcpu->arch.root_mmu;
4793
	union kvm_mmu_role new_role =
4794
		kvm_calc_shadow_mmu_root_page_role(vcpu, regs, false);
4795

4796
	shadow_mmu_init_context(vcpu, context, regs, new_role);
4797 4798
}

4799
static union kvm_mmu_role
4800 4801
kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
				   struct kvm_mmu_role_regs *regs)
4802 4803
{
	union kvm_mmu_role role =
4804
		kvm_calc_shadow_root_page_role_common(vcpu, regs, false);
4805 4806

	role.base.direct = false;
4807
	role.base.level = kvm_mmu_get_tdp_level(vcpu);
4808 4809 4810 4811

	return role;
}

4812 4813
/*
 * Set up vcpu->arch.guest_mmu as a shadow MMU for the given @cr0/@cr4/@efer
 * values and switch to @nested_cr3 as the new pgd.
 */
void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
			     unsigned long cr4, u64 efer, gpa_t nested_cr3)
{
	struct kvm_mmu_role_regs regs = {
		.cr0 = cr0,
		.cr4 = cr4,
		.efer = efer,
	};
	union kvm_mmu_role new_role =
		kvm_calc_shadow_npt_root_page_role(vcpu, &regs);
	struct kvm_mmu *context = &vcpu->arch.guest_mmu;

	__kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base);

	shadow_mmu_init_context(vcpu, context, &regs, new_role);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
4830

4831 4832
/*
 * Compute the role used by kvm_init_shadow_ept_mmu().  The role is built from
 * the arguments alone, apart from the SMM flag taken from root_mmu.
 */
static union kvm_mmu_role
kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
				   bool execonly, u8 level)
{
	union kvm_mmu_role role = {0};

	role.base.access = ACC_ALL;
	role.base.guest_mode = true;
	role.base.direct = false;
	role.base.gpte_is_8_bytes = true;
	role.base.level = level;
	role.base.ad_disabled = !accessed_dirty;

	/* SMM flag is inherited from root_mmu */
	role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;

	/* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
	role.ext.word = 0;
	role.ext.valid = 1;
	role.ext.execonly = execonly;

	return role;
}

4855
/*
 * Set up vcpu->arch.guest_mmu with the ept_* callbacks and switch to
 * @new_eptp as the new pgd.  The context itself is left untouched when the
 * newly computed role equals the stored one.
 */
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
			     bool accessed_dirty, gpa_t new_eptp)
{
	u8 level = vmx_eptp_page_walk_level(new_eptp);
	union kvm_mmu_role new_role =
		kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
						   execonly, level);
	struct kvm_mmu *context = &vcpu->arch.guest_mmu;

	__kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base);

	if (context->mmu_role.as_u64 == new_role.as_u64)
		return;

	context->mmu_role.as_u64 = new_role.as_u64;

	context->root_level = level;
	context->shadow_root_level = level;
	context->direct_map = false;
	context->ept_ad = accessed_dirty;

	context->page_fault = ept_page_fault;
	context->gva_to_gpa = ept_gva_to_gpa;
	context->sync_page = ept_sync_page;
	context->invlpg = ept_invlpg;

	update_permission_bitmask(context, true);
	update_pkru_bitmask(context);
	reset_rsvds_bits_mask_ept(vcpu, context, execonly);
	reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);

4888
static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
4889
{
4890
	struct kvm_mmu *context = &vcpu->arch.root_mmu;
4891
	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
4892

4893
	kvm_init_shadow_mmu(vcpu, &regs);
4894

4895
	context->get_guest_pgd     = get_cr3;
4896 4897
	context->get_pdptr         = kvm_pdptr_read;
	context->inject_page_fault = kvm_inject_page_fault;
A
Avi Kivity 已提交
4898 4899
}

4900 4901
static union kvm_mmu_role
kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, struct kvm_mmu_role_regs *regs)
4902
{
4903 4904 4905
	union kvm_mmu_role role;

	role = kvm_calc_shadow_root_page_role_common(vcpu, regs, false);
4906 4907 4908 4909 4910 4911 4912

	/*
	 * Nested MMUs are used only for walking L2's gva->gpa, they never have
	 * shadow pages of their own and so "direct" has no meaning.   Set it
	 * to "true" to try to detect bogus usage of the nested MMU.
	 */
	role.base.direct = true;
4913
	role.base.level = role_regs_to_root_level(regs);
4914 4915 4916
	return role;
}

4917
static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
4918
{
4919 4920
	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
	union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu, &regs);
4921 4922
	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;

4923 4924 4925 4926
	if (new_role.as_u64 == g_context->mmu_role.as_u64)
		return;

	g_context->mmu_role.as_u64 = new_role.as_u64;
4927
	g_context->get_guest_pgd     = get_cr3;
4928
	g_context->get_pdptr         = kvm_pdptr_read;
4929
	g_context->inject_page_fault = kvm_inject_page_fault;
4930
	g_context->root_level        = new_role.base.level;
4931

4932 4933 4934 4935 4936 4937
	/*
	 * L2 page tables are never shadowed, so there is no need to sync
	 * SPTEs.
	 */
	g_context->invlpg            = NULL;

4938
	/*
4939
	 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
4940 4941 4942 4943 4944
	 * L1's nested page tables (e.g. EPT12). The nested translation
	 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
	 * L2's page tables as the first level of translation and L1's
	 * nested page tables as the second level of translation. Basically
	 * the gva_to_gpa functions between mmu and nested_mmu are swapped.
4945
	 */
4946
	if (!is_paging(vcpu))
4947
		g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
4948
	else if (is_long_mode(vcpu))
4949
		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
4950
	else if (is_pae(vcpu))
4951
		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
4952
	else
4953 4954
		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;

4955
	reset_guest_paging_metadata(vcpu, g_context);
4956 4957
}

4958
void kvm_init_mmu(struct kvm_vcpu *vcpu)
4959
{
4960
	if (mmu_is_nested(vcpu))
4961
		init_kvm_nested_mmu(vcpu);
4962
	else if (tdp_enabled)
4963
		init_kvm_tdp_mmu(vcpu);
4964
	else
4965
		init_kvm_softmmu(vcpu);
4966
}
4967
EXPORT_SYMBOL_GPL(kvm_init_mmu);
4968

4969 4970 4971
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
{
4972
	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
4973 4974
	union kvm_mmu_role role;

4975
	if (tdp_enabled)
4976
		role = kvm_calc_tdp_mmu_root_page_role(vcpu, &regs, true);
4977
	else
4978
		role = kvm_calc_shadow_mmu_root_page_role(vcpu, &regs, true);
4979 4980

	return role.base;
4981
}
4982

4983 4984 4985 4986 4987 4988 4989 4990 4991 4992
/*
 * Refresh all MMU state that depends on guest CPUID, and warn if CPUID is
 * being changed after the vCPU has already run.
 */
void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
	/*
	 * CPUID information feeds into the reserved bit calculations, so mark
	 * every MMU role invalid to force a full reinitialization.
	 */
	vcpu->arch.root_mmu.mmu_role.ext.valid = 0;
	vcpu->arch.guest_mmu.mmu_role.ext.valid = 0;
	vcpu->arch.nested_mmu.mmu_role.ext.valid = 0;
	kvm_mmu_reset_context(vcpu);

	/* The vCPU has never entered the guest, nothing to complain about. */
	if (vcpu->arch.last_vmentry_cpu == -1)
		return;

	/*
	 * KVM does not correctly handle guest CPUID changing after KVM_RUN:
	 * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc. are not
	 * tracked in kvm_mmu_page_role, so KVM may miss guest page faults due
	 * to reusing SPs/SPTEs.  Alert userspace, but otherwise sweep the
	 * problem under the rug.
	 *
	 * KVM's CPUID ABI makes the problem all but impossible to solve, as
	 * correctly handling multiple vCPU models (with respect to paging and
	 * physical address properties) in a single VM would require tracking
	 * all relevant CPUID information in kvm_mmu_page_role.  That is very
	 * undesirable as it would double the memory requirements for
	 * gfn_track (see struct kvm_mmu_page_role comments), and in practice
	 * no sane VMM mucks with the core vCPU model on the fly.
	 */
	pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n");
	pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n");
}

5015
/*
 * Tear down the vCPU's current MMU roots and then reinitialize the MMU
 * context for the vCPU's present mode.
 */
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
	/* Unload first so that no stale root survives the re-init. */
	kvm_mmu_unload(vcpu);

	kvm_init_mmu(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
A
Avi Kivity 已提交
5021 5022

int kvm_mmu_load(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
5023
{
5024 5025
	int r;

5026
	r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
A
Avi Kivity 已提交
5027 5028
	if (r)
		goto out;
5029
	r = mmu_alloc_special_roots(vcpu);
A
Avi Kivity 已提交
5030 5031
	if (r)
		goto out;
5032
	if (vcpu->arch.mmu->direct_map)
5033 5034 5035
		r = mmu_alloc_direct_roots(vcpu);
	else
		r = mmu_alloc_shadow_roots(vcpu);
5036 5037
	if (r)
		goto out;
5038 5039 5040

	kvm_mmu_sync_roots(vcpu);

5041
	kvm_mmu_load_pgd(vcpu);
5042
	static_call(kvm_x86_tlb_flush_current)(vcpu);
5043 5044
out:
	return r;
A
Avi Kivity 已提交
5045
}
A
Avi Kivity 已提交
5046 5047 5048

void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
5049 5050 5051 5052
	kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
	WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
	WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
A
Avi Kivity 已提交
5053
}
A
Avi Kivity 已提交
5054

5055 5056 5057 5058 5059 5060 5061 5062
/*
 * Decide whether replacing SPTE @old with @new requires a remote TLB flush.
 *
 * A flush is needed when a present SPTE becomes non-present, when the base
 * address changes, or when a permission covered by PT64_PERM_MASK is lost.
 */
static bool need_remote_flush(u64 old, u64 new)
{
	u64 old_perms, new_perms;

	/* Nothing was mapped before, so there is nothing stale to flush. */
	if (!is_shadow_present_pte(old))
		return false;

	/* A present mapping went away. */
	if (!is_shadow_present_pte(new))
		return true;

	/* The mapping now points somewhere else. */
	if ((old ^ new) & PT64_BASE_ADDR_MASK)
		return true;

	/*
	 * Toggle the NX bit(s) in both values before comparing, then flush
	 * if any permission bit set in the old value is clear in the new.
	 */
	old_perms = old ^ shadow_nx_mask;
	new_perms = new ^ shadow_nx_mask;

	return !!(old_perms & ~new_perms & PT64_PERM_MASK);
}

5068
/*
 * Read back the guest PTE that was just written at *@gpa.
 *
 * *@gpa and *@bytes may be adjusted to cover a full 8-byte gpte.  Returns the
 * gpte, or 0 if the access size is neither 4 nor 8 bytes or the read fails.
 */
static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
				    int *bytes)
{
	u64 gentry = 0;

	/*
	 * Assume the written page table is of the same type as the vCPU's
	 * current paging mode, as sptes are only updated when the modes
	 * match.  A 4-byte write by a PAE guest is half of a 64-bit gpte, so
	 * widen the access to the whole, 8-byte aligned entry.
	 */
	if (is_pae(vcpu) && *bytes == 4) {
		*gpa &= ~(gpa_t)7;
		*bytes = 8;
	}

	if (*bytes != 4 && *bytes != 8)
		return 0;

	if (kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes))
		return 0;

	return gentry;
}

/*
 * If we're seeing too many writes to a page, it may no longer be a page table,
 * or we may be forking, in which case it is better to unmap the page.
 */
5098
static bool detect_write_flooding(struct kvm_mmu_page *sp)
5099
{
5100 5101 5102 5103
	/*
	 * Skip write-flooding detected for the sp whose level is 1, because
	 * it can become unsync, then the guest page is not write-protected.
	 */
5104
	if (sp->role.level == PG_LEVEL_4K)
5105
		return false;
5106

5107 5108
	atomic_inc(&sp->write_flooding_count);
	return atomic_read(&sp->write_flooding_count) >= 3;
5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123
}

/*
 * Misaligned accesses are too much trouble to fix up; they also usually
 * indicate that the page is not being used as a page table.
 *
 * Returns true if the write at @gpa of @bytes bytes straddles a gpte boundary
 * or is smaller than 4 bytes, except for the single-byte case noted below.
 */
static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
				    int bytes)
{
	unsigned first, last, pte_size;

	pgprintk("misaligned: gpa %llx bytes %d role %x\n",
		 gpa, bytes, sp->role.word);

	first = offset_in_page(gpa);
	pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;

	/*
	 * An OS sometimes writes just one byte to update status bits, e.g.
	 * Linux uses the andb instruction in clear_bit().  Tolerate such a
	 * write when it lands on the first byte of a gpte.
	 */
	if (bytes == 1 && !(first & (pte_size - 1)))
		return false;

	/* Offset of the final byte touched by the write. */
	last = first + bytes - 1;

	/* First and last byte must fall within the same gpte. */
	if ((first ^ last) & ~(pte_size - 1))
		return true;

	return bytes < 4;
}

/*
 * Locate the SPTE(s) in @sp that correspond to a guest write at @gpa.
 *
 * Stores the number of affected SPTEs (1 or 2) in *@nspte and returns a
 * pointer to the first one, or NULL if the write falls into a quadrant that
 * @sp does not shadow.
 */
static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
{
	unsigned off = offset_in_page(gpa);
	unsigned quadrant;

	*nspte = 1;

	if (!sp->role.gpte_is_8_bytes) {
		/* 4-byte gptes are shadowed by 8-byte sptes: 32->64. */
		off <<= 1;

		/*
		 * A 32-bit pde maps 4MB while a shadow pde maps only 2MB, so
		 * double the offset once more and zap two pdes, not one.
		 */
		if (sp->role.level == PT32_ROOT_LEVEL) {
			off &= ~7; /* kill rounding error */
			off <<= 1;
			*nspte = 2;
		}

		quadrant = off >> PAGE_SHIFT;
		off &= ~PAGE_MASK;
		if (quadrant != sp->role.quadrant)
			return NULL;
	}

	return &sp->spt[off / sizeof(*sp->spt)];
}

5170
static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
5171 5172
			      const u8 *new, int bytes,
			      struct kvm_page_track_notifier_node *node)
5173 5174 5175 5176 5177 5178
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct kvm_mmu_page *sp;
	LIST_HEAD(invalid_list);
	u64 entry, gentry, *spte;
	int npte;
5179
	bool remote_flush, local_flush;
5180 5181 5182 5183 5184

	/*
	 * If we don't have indirect shadow pages, it means no page is
	 * write-protected, so we can exit simply.
	 */
5185
	if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5186 5187
		return;

5188
	remote_flush = local_flush = false;
5189 5190 5191 5192 5193

	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);

	/*
	 * No need to care whether allocation memory is successful
I
Ingo Molnar 已提交
5194
	 * or not since pte prefetch is skipped if it does not have
5195 5196
	 * enough objects in the cache.
	 */
5197
	mmu_topup_memory_caches(vcpu, true);
5198

5199
	write_lock(&vcpu->kvm->mmu_lock);
5200 5201 5202

	gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);

5203
	++vcpu->kvm->stat.mmu_pte_write;
5204
	kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
5205

5206
	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
5207
		if (detect_write_misaligned(sp, gpa, bytes) ||
5208
		      detect_write_flooding(sp)) {
5209
			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
A
Avi Kivity 已提交
5210
			++vcpu->kvm->stat.mmu_flooded;
5211 5212
			continue;
		}
5213 5214 5215 5216 5217

		spte = get_written_sptes(sp, gpa, &npte);
		if (!spte)
			continue;

5218
		local_flush = true;
5219
		while (npte--) {
5220
			entry = *spte;
5221
			mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
5222 5223
			if (gentry && sp->role.level != PG_LEVEL_4K)
				++vcpu->kvm->stat.mmu_pde_zapped;
G
Gleb Natapov 已提交
5224
			if (need_remote_flush(entry, *spte))
5225
				remote_flush = true;
5226
			++spte;
5227 5228
		}
	}
5229
	kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
5230
	kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
5231
	write_unlock(&vcpu->kvm->mmu_lock);
5232 5233
}

5234
/*
 * Top-level handler for a guest page fault at @cr2_or_gpa.
 *
 * Returns a negative error code on failure, 1 when the fault was handled and
 * the guest can be resumed, or otherwise whatever x86_emulate_instruction()
 * returns when the faulting instruction has to be emulated.
 */
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
		       void *insn, int insn_len)
{
	int r = RET_PF_INVALID;
	int emul_type = EMULTYPE_PF;
	bool direct = vcpu->arch.mmu->direct_map;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;

	/* Reserved-bit faults are first tried as MMIO accesses. */
	if (unlikely(error_code & PFERR_RSVD_MASK)) {
		r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
		if (r == RET_PF_EMULATE)
			goto emulate;
	}

	if (r == RET_PF_INVALID) {
		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
					  lower_32_bits(error_code), false);
		if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
			return -EIO;
	}

	if (r < 0)
		return r;
	if (r != RET_PF_EMULATE)
		return 1;

	/*
	 * Before emulating, check whether the error code reports a RO
	 * violation that occurred while translating the guest page.  This can
	 * happen with nested virtualization when both guests use nested
	 * paging.  If so, simply unprotect the page and resume the guest.
	 */
	if (vcpu->arch.mmu->direct_map &&
	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
		return 1;
	}

	/*
	 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but KVM can still
	 * optimistically try to unprotect the page and let the processor
	 * re-execute the faulting instruction.  Retrying is not allowed for
	 * MMIO emulation: it is pointless and could loop forever, because the
	 * processor would keep faulting on the non-existent MMIO address.
	 * Retrying an instruction from a nested guest is likewise pointless
	 * and dangerous, as only L1's page tables are explicitly shadowed,
	 * i.e. unprotecting something for L1 will not fix whatever caused L2
	 * to fail.
	 */
	if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
		emul_type |= EMULTYPE_ALLOW_RETRY_PF;
emulate:
	return x86_emulate_instruction(vcpu, cr2_or_gpa, emul_type, insn,
				       insn_len);
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);

5294 5295
/*
 * Invalidate the translation for @gva in @mmu.
 *
 * If @root_hpa is INVALID_PAGE, the current root and all valid previous roots
 * are processed; otherwise only the given root is.  Does nothing beyond the
 * hardware flush when @mmu has no invlpg hook.
 */
void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			    gva_t gva, hpa_t root_hpa)
{
	int i;

	/* For vcpu->arch.guest_mmu, the "gva" is actually a GPA. */
	if (mmu != &vcpu->arch.guest_mmu) {
		/* INVLPG on a non-canonical address is a NOP according to the SDM. */
		if (is_noncanonical_address(gva, vcpu))
			return;

		static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
	}

	if (!mmu->invlpg)
		return;

	/* The caller named a specific root, handle only that one. */
	if (root_hpa != INVALID_PAGE) {
		mmu->invlpg(vcpu, gva, root_hpa);
		return;
	}

	mmu->invlpg(vcpu, gva, mmu->root_hpa);

	/*
	 * INVLPG is required to invalidate any global mappings for the VA,
	 * irrespective of PCID.  Working out whether any prev_root mapping of
	 * the VA is marked global costs roughly as much as syncing it
	 * blindly, so always sync.
	 *
	 * Mappings not reachable via the current cr3 or the prev_roots will
	 * be synced when switching to that cr3, so nothing needs to be done
	 * here for them.
	 */
	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
		if (VALID_PAGE(mmu->prev_roots[i].hpa))
			mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
	}
}
5332

5333 5334 5335
/*
 * Invalidate @gva in the vCPU's active MMU, covering the current root and all
 * cached previous roots, and account the event in the invlpg statistic.
 */
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
	/* INVALID_PAGE asks for the current root plus all previous roots. */
	kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE);

	++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);

5340

5341 5342
/*
 * Invalidate the translation for @gva in every root of the active MMU that is
 * tagged with @pcid, i.e. the current root if @pcid is the active PCID and any
 * valid previous root whose pgd carries @pcid.
 *
 * The MMU's invlpg hook is optional (kvm_mmu_invalidate_gva() checks it for
 * NULL as well), so it is only invoked when present.  The hardware TLB flush
 * for @gva is still issued whenever a matching root was found, regardless of
 * whether the hook exists.
 */
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	bool tlb_flush = false;
	uint i;

	if (pcid == kvm_get_active_pcid(vcpu)) {
		/* Calling a NULL hook would be a NULL pointer dereference. */
		if (mmu->invlpg)
			mmu->invlpg(vcpu, gva, mmu->root_hpa);
		tlb_flush = true;
	}

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
		if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
		    pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
			if (mmu->invlpg)
				mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
			tlb_flush = true;
		}
	}

	if (tlb_flush)
		static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);

	++vcpu->stat.invlpg;

	/*
	 * Mappings not reachable via the current cr3 or the prev_roots will be
	 * synced when switching to that cr3, so nothing needs to be done here
	 * for them.
	 */
}

5372 5373
/*
 * Record the MMU configuration: whether TDP is enabled, the maximum TDP root
 * level, and the largest huge page level KVM's MMU may use.
 */
void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
		       int tdp_huge_page_level)
{
	tdp_enabled = enable_tdp;
	max_tdp_level = tdp_max_root_level;

	/*
	 * max_huge_page_level reflects KVM's MMU capabilities irrespective
	 * of kernel support, e.g. KVM may be capable of using 1GB pages when
	 * the kernel is not.  But, KVM never creates a page size greater than
	 * what is used by the kernel for any given HVA, i.e. the kernel's
	 * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
	 */
	if (tdp_enabled) {
		max_huge_page_level = tdp_huge_page_level;
		return;
	}

	max_huge_page_level = boot_cpu_has(X86_FEATURE_GBPAGES) ? PG_LEVEL_1G :
								  PG_LEVEL_2M;
}
EXPORT_SYMBOL_GPL(kvm_configure_mmu);
5393 5394

/* The return value indicates if tlb flush on all vcpus is needed. */
5395 5396 5397
typedef bool (*slot_level_handler) (struct kvm *kvm,
				    struct kvm_rmap_head *rmap_head,
				    const struct kvm_memory_slot *slot);
5398 5399 5400

/* The caller should hold mmu-lock before calling this function. */
static __always_inline bool
5401
slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5402
			slot_level_handler fn, int start_level, int end_level,
5403 5404
			gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
			bool flush)
5405 5406 5407 5408 5409 5410
{
	struct slot_rmap_walk_iterator iterator;

	for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
			end_gfn, &iterator) {
		if (iterator.rmap)
5411
			flush |= fn(kvm, iterator.rmap, memslot);
5412

5413
		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
5414
			if (flush && flush_on_yield) {
5415 5416 5417
				kvm_flush_remote_tlbs_with_address(kvm,
						start_gfn,
						iterator.gfn - start_gfn + 1);
5418 5419
				flush = false;
			}
5420
			cond_resched_rwlock_write(&kvm->mmu_lock);
5421 5422 5423 5424 5425 5426 5427
		}
	}

	return flush;
}

static __always_inline bool
5428
slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5429
		  slot_level_handler fn, int start_level, int end_level,
5430
		  bool flush_on_yield)
5431 5432 5433 5434
{
	return slot_handle_level_range(kvm, memslot, fn, start_level,
			end_level, memslot->base_gfn,
			memslot->base_gfn + memslot->npages - 1,
5435
			flush_on_yield, false);
5436 5437 5438
}

static __always_inline bool
5439
slot_handle_leaf(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5440
		 slot_level_handler fn, bool flush_on_yield)
5441
{
5442
	return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
5443
				 PG_LEVEL_4K, flush_on_yield);
5444 5445
}

5446
static void free_mmu_pages(struct kvm_mmu *mmu)
A
Avi Kivity 已提交
5447
{
5448 5449
	if (!tdp_enabled && mmu->pae_root)
		set_memory_encrypted((unsigned long)mmu->pae_root, 1);
5450
	free_page((unsigned long)mmu->pae_root);
5451
	free_page((unsigned long)mmu->pml4_root);
A
Avi Kivity 已提交
5452 5453
}

5454
static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
A
Avi Kivity 已提交
5455
{
5456
	struct page *page;
A
Avi Kivity 已提交
5457 5458
	int i;

5459 5460 5461 5462 5463 5464
	mmu->root_hpa = INVALID_PAGE;
	mmu->root_pgd = 0;
	mmu->translate_gpa = translate_gpa;
	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
		mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;

5465
	/*
5466 5467 5468 5469
	 * When using PAE paging, the four PDPTEs are treated as 'root' pages,
	 * while the PDP table is a per-vCPU construct that's allocated at MMU
	 * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
	 * x86_64.  Therefore we need to allocate the PDP table in the first
5470 5471 5472 5473 5474
	 * 4GB of memory, which happens to fit the DMA32 zone.  TDP paging
	 * generally doesn't use PAE paging and can skip allocating the PDP
	 * table.  The main exception, handled here, is SVM's 32-bit NPT.  The
	 * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
	 * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots().
5475
	 */
5476
	if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
5477 5478
		return 0;

5479
	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5480
	if (!page)
5481 5482
		return -ENOMEM;

5483
	mmu->pae_root = page_address(page);
5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497

	/*
	 * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
	 * get the CPU to treat the PDPTEs as encrypted.  Decrypt the page so
	 * that KVM's writes and the CPU's reads get along.  Note, this is
	 * only necessary when using shadow paging, as 64-bit NPT can get at
	 * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
	 * by 32-bit kernels (when KVM itself uses 32-bit NPT).
	 */
	if (!tdp_enabled)
		set_memory_decrypted((unsigned long)mmu->pae_root, 1);
	else
		WARN_ON_ONCE(shadow_me_mask);

5498
	for (i = 0; i < 4; ++i)
5499
		mmu->pae_root[i] = INVALID_PAE_ROOT;
5500

A
Avi Kivity 已提交
5501 5502 5503
	return 0;
}

5504
int kvm_mmu_create(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
5505
{
5506
	int ret;
5507

5508
	vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
5509 5510
	vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;

5511
	vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
5512
	vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
5513

5514 5515
	vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;

5516 5517
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
A
Avi Kivity 已提交
5518

5519
	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
5520

5521
	ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
5522 5523 5524
	if (ret)
		return ret;

5525
	ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
5526 5527 5528 5529 5530 5531 5532
	if (ret)
		goto fail_allocate_root;

	return ret;
 fail_allocate_root:
	free_mmu_pages(&vcpu->arch.guest_mmu);
	return ret;
A
Avi Kivity 已提交
5533 5534
}

5535
#define BATCH_ZAP_PAGES	10
5536 5537 5538
static void kvm_zap_obsolete_pages(struct kvm *kvm)
{
	struct kvm_mmu_page *sp, *node;
5539
	int nr_zapped, batch = 0;
5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551

restart:
	list_for_each_entry_safe_reverse(sp, node,
	      &kvm->arch.active_mmu_pages, link) {
		/*
		 * No obsolete valid page exists before a newly created page
		 * since active_mmu_pages is a FIFO list.
		 */
		if (!is_obsolete_sp(kvm, sp))
			break;

		/*
5552 5553 5554
		 * Invalid pages should never land back on the list of active
		 * pages.  Skip the bogus page, otherwise we'll get stuck in an
		 * infinite loop if the page gets put back on the list (again).
5555
		 */
5556
		if (WARN_ON(sp->role.invalid))
5557 5558
			continue;

5559 5560 5561 5562 5563 5564
		/*
		 * No need to flush the TLB since we're only zapping shadow
		 * pages with an obsolete generation number and all vCPUS have
		 * loaded a new root, i.e. the shadow pages being zapped cannot
		 * be in active use by the guest.
		 */
5565
		if (batch >= BATCH_ZAP_PAGES &&
5566
		    cond_resched_rwlock_write(&kvm->mmu_lock)) {
5567
			batch = 0;
5568 5569 5570
			goto restart;
		}

5571 5572
		if (__kvm_mmu_prepare_zap_page(kvm, sp,
				&kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
5573
			batch += nr_zapped;
5574
			goto restart;
5575
		}
5576 5577
	}

5578 5579 5580 5581 5582
	/*
	 * Trigger a remote TLB flush before freeing the page tables to ensure
	 * KVM is not in the middle of a lockless shadow page table walk, which
	 * may reference the pages.
	 */
5583
	kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596
}

/*
 * Fast invalidate all shadow pages and use lock-break technique
 * to zap obsolete pages.
 *
 * It's required when memslot is being deleted or VM is being
 * destroyed, in these cases, we should ensure that KVM MMU does
 * not use any resource of the being-deleted slot or all slots
 * after calling the function.
 */
static void kvm_mmu_zap_all_fast(struct kvm *kvm)
{
5597 5598
	lockdep_assert_held(&kvm->slots_lock);

5599
	write_lock(&kvm->mmu_lock);
5600
	trace_kvm_mmu_zap_all_fast(kvm);
5601 5602 5603 5604 5605 5606 5607 5608 5609

	/*
	 * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
	 * held for the entire duration of zapping obsolete pages, it's
	 * impossible for there to be multiple invalid generations associated
	 * with *valid* shadow pages at any given time, i.e. there is exactly
	 * one valid generation and (at most) one invalid generation.
	 */
	kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
5610

5611 5612 5613 5614 5615 5616 5617 5618 5619
	/* In order to ensure all threads see this change when
	 * handling the MMU reload signal, this must happen in the
	 * same critical section as kvm_reload_remote_mmus, and
	 * before kvm_zap_obsolete_pages as kvm_zap_obsolete_pages
	 * could drop the MMU lock and yield.
	 */
	if (is_tdp_mmu_enabled(kvm))
		kvm_tdp_mmu_invalidate_all_roots(kvm);

5620 5621 5622 5623 5624 5625 5626 5627 5628 5629
	/*
	 * Notify all vcpus to reload its shadow page table and flush TLB.
	 * Then all vcpus will switch to new shadow page table with the new
	 * mmu_valid_gen.
	 *
	 * Note: we need to do this under the protection of mmu_lock,
	 * otherwise, vcpu would purge shadow page but miss tlb flush.
	 */
	kvm_reload_remote_mmus(kvm);

5630
	kvm_zap_obsolete_pages(kvm);
5631

5632
	write_unlock(&kvm->mmu_lock);
5633 5634 5635 5636 5637 5638

	if (is_tdp_mmu_enabled(kvm)) {
		read_lock(&kvm->mmu_lock);
		kvm_tdp_mmu_zap_invalidated_roots(kvm);
		read_unlock(&kvm->mmu_lock);
	}
5639 5640
}

5641 5642 5643 5644 5645
/* Returns true if the VM's list of zapped obsolete pages is non-empty. */
static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
{
	struct list_head *zapped = &kvm->arch.zapped_obsolete_pages;

	return unlikely(!list_empty_careful(zapped));
}

5646
/*
 * Page-track flush-slot notifier.  @slot and @node are unused: all shadow
 * pages of the VM are invalidated, not just those of the given memslot.
 */
static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
			struct kvm_memory_slot *slot,
			struct kvm_page_track_notifier_node *node)
{
	kvm_mmu_zap_all_fast(kvm);
}

5653
void kvm_mmu_init_vm(struct kvm *kvm)
5654
{
5655
	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5656

5657 5658
	spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);

5659 5660 5661 5662 5663 5664 5665
	if (!kvm_mmu_init_tdp_mmu(kvm))
		/*
		 * No smp_load/store wrappers needed here as we are in
		 * VM init and there cannot be any memslots / other threads
		 * accessing this struct kvm yet.
		 */
		kvm->arch.memslots_have_rmaps = true;
5666

5667
	node->track_write = kvm_mmu_pte_write;
5668
	node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
5669
	kvm_page_track_register_notifier(kvm, node);
5670 5671
}

5672
void kvm_mmu_uninit_vm(struct kvm *kvm)
5673
{
5674
	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5675

5676
	kvm_page_track_unregister_notifier(kvm, node);
5677 5678

	kvm_mmu_uninit_tdp_mmu(kvm);
5679 5680
}

5681 5682 5683 5684
/*
 * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end
 * (not including it)
 */
X
Xiao Guangrong 已提交
5685 5686 5687 5688
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
5689
	int i;
5690
	bool flush = false;
X
Xiao Guangrong 已提交
5691

5692 5693
	write_lock(&kvm->mmu_lock);

5694 5695
	kvm_inc_notifier_count(kvm, gfn_start, gfn_end);

5696 5697 5698 5699 5700 5701 5702 5703 5704 5705
	if (kvm_memslots_have_rmaps(kvm)) {
		for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
			slots = __kvm_memslots(kvm, i);
			kvm_for_each_memslot(memslot, slots) {
				gfn_t start, end;

				start = max(gfn_start, memslot->base_gfn);
				end = min(gfn_end, memslot->base_gfn + memslot->npages);
				if (start >= end)
					continue;
X
Xiao Guangrong 已提交
5706

5707 5708
				flush = slot_handle_level_range(kvm,
						(const struct kvm_memory_slot *) memslot,
5709 5710 5711 5712
						kvm_zap_rmapp, PG_LEVEL_4K,
						KVM_MAX_HUGEPAGE_LEVEL, start,
						end - 1, true, flush);
			}
5713
		}
5714
		if (flush)
5715 5716
			kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
							   gfn_end - gfn_start);
X
Xiao Guangrong 已提交
5717 5718
	}

5719
	if (is_tdp_mmu_enabled(kvm)) {
5720 5721
		for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
			flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
5722
							  gfn_end, flush);
5723 5724 5725
		if (flush)
			kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
							   gfn_end - gfn_start);
5726
	}
5727 5728 5729 5730

	if (flush)
		kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);

5731 5732
	kvm_dec_notifier_count(kvm, gfn_start, gfn_end);

5733
	write_unlock(&kvm->mmu_lock);
X
Xiao Guangrong 已提交
5734 5735
}

5736
static bool slot_rmap_write_protect(struct kvm *kvm,
5737
				    struct kvm_rmap_head *rmap_head,
5738
				    const struct kvm_memory_slot *slot)
5739
{
5740
	return __rmap_write_protect(kvm, rmap_head, false);
5741 5742
}

5743
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
5744
				      const struct kvm_memory_slot *memslot,
5745
				      int start_level)
A
Avi Kivity 已提交
5746
{
5747
	bool flush = false;
A
Avi Kivity 已提交
5748

5749 5750 5751 5752 5753 5754 5755
	if (kvm_memslots_have_rmaps(kvm)) {
		write_lock(&kvm->mmu_lock);
		flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
					  start_level, KVM_MAX_HUGEPAGE_LEVEL,
					  false);
		write_unlock(&kvm->mmu_lock);
	}
5756

5757 5758 5759 5760 5761 5762
	if (is_tdp_mmu_enabled(kvm)) {
		read_lock(&kvm->mmu_lock);
		flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
		read_unlock(&kvm->mmu_lock);
	}

5763 5764 5765 5766 5767 5768 5769
	/*
	 * We can flush all the TLBs out of the mmu lock without TLB
	 * corruption since we just change the spte from writable to
	 * readonly so that we only need to care the case of changing
	 * spte from present to present (changing the spte from present
	 * to nonpresent will flush all the TLBs immediately), in other
	 * words, the only case we care is mmu_spte_update() where we
5770 5771 5772
	 * have checked Host-writable | MMU-writable instead of
	 * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK
	 * anymore.
5773
	 */
5774
	if (flush)
5775
		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
A
Avi Kivity 已提交
5776
}
5777

5778
static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
5779
					 struct kvm_rmap_head *rmap_head,
5780
					 const struct kvm_memory_slot *slot)
5781 5782 5783 5784
{
	u64 *sptep;
	struct rmap_iterator iter;
	int need_tlb_flush = 0;
D
Dan Williams 已提交
5785
	kvm_pfn_t pfn;
5786 5787
	struct kvm_mmu_page *sp;

5788
restart:
5789
	for_each_rmap_spte(rmap_head, &iter, sptep) {
5790
		sp = sptep_to_sp(sptep);
5791 5792 5793
		pfn = spte_to_pfn(*sptep);

		/*
5794 5795 5796 5797 5798
		 * We cannot do huge page mapping for indirect shadow pages,
		 * which are found on the last rmap (level = 1) when not using
		 * tdp; such shadow pages are synced with the page table in
		 * the guest, and the guest page table is using 4K page size
		 * mapping if the indirect sp has level = 1.
5799
		 */
5800
		if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
5801 5802
		    sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
							       pfn, PG_LEVEL_NUM)) {
5803
			pte_list_remove(rmap_head, sptep);
5804 5805 5806 5807 5808 5809 5810

			if (kvm_available_flush_tlb_with_range())
				kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
					KVM_PAGES_PER_HPAGE(sp->role.level));
			else
				need_tlb_flush = 1;

5811 5812
			goto restart;
		}
5813 5814 5815 5816 5817 5818
	}

	return need_tlb_flush;
}

void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
5819
				   const struct kvm_memory_slot *slot)
5820
{
5821
	bool flush = false;
5822

5823 5824 5825 5826 5827 5828 5829
	if (kvm_memslots_have_rmaps(kvm)) {
		write_lock(&kvm->mmu_lock);
		flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
		if (flush)
			kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
		write_unlock(&kvm->mmu_lock);
	}
5830 5831 5832 5833 5834 5835 5836 5837

	if (is_tdp_mmu_enabled(kvm)) {
		read_lock(&kvm->mmu_lock);
		flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush);
		if (flush)
			kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
		read_unlock(&kvm->mmu_lock);
	}
5838 5839
}

5840
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
5841
					const struct kvm_memory_slot *memslot)
5842 5843
{
	/*
5844
	 * All current use cases for flushing the TLBs for a specific memslot
5845
	 * related to dirty logging, and many do the TLB flush out of mmu_lock.
5846 5847 5848
	 * The interaction between the various operations on memslot must be
	 * serialized by slots_locks to ensure the TLB flush from one operation
	 * is observed by any other operation on the same memslot.
5849 5850
	 */
	lockdep_assert_held(&kvm->slots_lock);
5851 5852
	kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
					   memslot->npages);
5853 5854
}

5855
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
5856
				   const struct kvm_memory_slot *memslot)
5857
{
5858
	bool flush = false;
5859

5860 5861 5862 5863 5864 5865
	if (kvm_memslots_have_rmaps(kvm)) {
		write_lock(&kvm->mmu_lock);
		flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty,
					 false);
		write_unlock(&kvm->mmu_lock);
	}
5866

5867 5868 5869 5870 5871 5872
	if (is_tdp_mmu_enabled(kvm)) {
		read_lock(&kvm->mmu_lock);
		flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
		read_unlock(&kvm->mmu_lock);
	}

5873 5874 5875 5876 5877 5878 5879
	/*
	 * It's also safe to flush TLBs out of mmu lock here as currently this
	 * function is only used for dirty logging, in which case flushing TLB
	 * out of mmu lock also guarantees no dirty pages will be lost in
	 * dirty_bitmap.
	 */
	if (flush)
5880
		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
5881 5882
}

5883
void kvm_mmu_zap_all(struct kvm *kvm)
5884 5885
{
	struct kvm_mmu_page *sp, *node;
5886
	LIST_HEAD(invalid_list);
5887
	int ign;
5888

5889
	write_lock(&kvm->mmu_lock);
5890
restart:
5891
	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
5892
		if (WARN_ON(sp->role.invalid))
5893
			continue;
5894
		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
5895
			goto restart;
5896
		if (cond_resched_rwlock_write(&kvm->mmu_lock))
5897 5898 5899
			goto restart;
	}

5900
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
5901

5902
	if (is_tdp_mmu_enabled(kvm))
5903 5904
		kvm_tdp_mmu_zap_all(kvm);

5905
	write_unlock(&kvm->mmu_lock);
5906 5907
}

5908
/*
 * Called with the new memslot generation @gen; zaps all shadow pages when
 * the MMIO generation derived from it has wrapped around to zero.
 */
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
{
	/* The caller must not pass a generation that is still being updated. */
	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);

	/*
	 * Keep only the bits that make up the MMIO generation, and at the
	 * same time strip what is effectively the address space modifier:
	 * generation numbers are incremented in multiples of the number of
	 * address spaces in order to provide unique generations across all
	 * address spaces, so dropping the low bits lets a wrap in any
	 * address space be detected.
	 */
	gen &= MMIO_SPTE_GEN_MASK & ~((u64)KVM_ADDRESS_SPACE_NUM - 1);

	/*
	 * The very rare case: the MMIO generation number has wrapped, so zap
	 * all shadow pages.
	 */
	if (unlikely(gen == 0)) {
		kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
		kvm_mmu_zap_all_fast(kvm);
	}
}

5933 5934
static unsigned long
mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
5935 5936
{
	struct kvm *kvm;
5937
	int nr_to_scan = sc->nr_to_scan;
5938
	unsigned long freed = 0;
5939

J
Junaid Shahid 已提交
5940
	mutex_lock(&kvm_lock);
5941 5942

	list_for_each_entry(kvm, &vm_list, vm_list) {
5943
		int idx;
5944
		LIST_HEAD(invalid_list);
5945

5946 5947 5948 5949 5950 5951 5952 5953
		/*
		 * Never scan more than sc->nr_to_scan VM instances.
		 * Will not hit this condition practically since we do not try
		 * to shrink more than one VM and it is very unlikely to see
		 * !n_used_mmu_pages so many times.
		 */
		if (!nr_to_scan--)
			break;
5954 5955 5956 5957 5958 5959
		/*
		 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
		 * here. We may skip a VM instance errorneosly, but we do not
		 * want to shrink a VM that only started to populate its MMU
		 * anyway.
		 */
5960 5961
		if (!kvm->arch.n_used_mmu_pages &&
		    !kvm_has_zapped_obsolete_pages(kvm))
5962 5963
			continue;

5964
		idx = srcu_read_lock(&kvm->srcu);
5965
		write_lock(&kvm->mmu_lock);
5966

5967 5968 5969 5970 5971 5972
		if (kvm_has_zapped_obsolete_pages(kvm)) {
			kvm_mmu_commit_zap_page(kvm,
			      &kvm->arch.zapped_obsolete_pages);
			goto unlock;
		}

5973
		freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
5974

5975
unlock:
5976
		write_unlock(&kvm->mmu_lock);
5977
		srcu_read_unlock(&kvm->srcu, idx);
5978

5979 5980 5981 5982 5983
		/*
		 * unfair on small ones
		 * per-vm shrinkers cry out
		 * sadness comes quickly
		 */
5984 5985
		list_move_tail(&kvm->vm_list, &vm_list);
		break;
5986 5987
	}

J
Junaid Shahid 已提交
5988
	mutex_unlock(&kvm_lock);
5989 5990 5991 5992 5993 5994
	return freed;
}

static unsigned long
mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
5995
	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
5996 5997 5998
}

static struct shrinker mmu_shrinker = {
5999 6000
	.count_objects = mmu_shrink_count,
	.scan_objects = mmu_shrink_scan,
6001 6002 6003
	.seeks = DEFAULT_SEEKS * 10,
};

I
Ingo Molnar 已提交
6004
static void mmu_destroy_caches(void)
6005
{
6006 6007
	kmem_cache_destroy(pte_list_desc_cache);
	kmem_cache_destroy(mmu_page_header_cache);
6008 6009
}

P
Paolo Bonzini 已提交
6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043
/*
 * Resolve the "auto" setting of nx_huge_pages: the workaround is enabled
 * only when the CPU has the iTLB multihit bug and CPU mitigations have not
 * been turned off.
 */
static bool get_nx_auto_mode(void)
{
	bool cpu_affected = boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT);

	return cpu_affected && !cpu_mitigations_off();
}

/*
 * Store @val in both nx_huge_pages and itlb_multihit_kvm_mitigation so the
 * two variables always carry the same setting.
 */
static void __set_nx_huge_pages(bool val)
{
	itlb_multihit_kvm_mitigation = val;
	nx_huge_pages = val;
}

/*
 * Parameter setter for nx_huge_pages.
 *
 * Accepts "off", "force", "auto", or any value understood by strtobool().
 * When the effective setting changes, every VM has its shadow pages zapped
 * and its NX huge page recovery thread woken up.
 *
 * Returns 0 on success, -EINVAL if @val cannot be parsed.
 */
static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
{
	bool was_enabled = nx_huge_pages;
	bool enable;
	struct kvm *kvm;

	/* In "auto" mode deploy workaround only if CPU has the bug. */
	if (sysfs_streq(val, "off")) {
		enable = 0;
	} else if (sysfs_streq(val, "force")) {
		enable = 1;
	} else if (sysfs_streq(val, "auto")) {
		enable = get_nx_auto_mode();
	} else if (strtobool(val, &enable) < 0) {
		return -EINVAL;
	}

	__set_nx_huge_pages(enable);

	/* Nothing else to do when the effective setting did not change. */
	if (enable == was_enabled)
		return 0;

	mutex_lock(&kvm_lock);

	list_for_each_entry(kvm, &vm_list, vm_list) {
		/* The fast zap is done under the VM's slots_lock. */
		mutex_lock(&kvm->slots_lock);
		kvm_mmu_zap_all_fast(kvm);
		mutex_unlock(&kvm->slots_lock);

		wake_up_process(kvm->arch.nx_lpage_recovery_thread);
	}

	mutex_unlock(&kvm_lock);

	return 0;
}

6056 6057
int kvm_mmu_module_init(void)
{
6058 6059
	int ret = -ENOMEM;

P
Paolo Bonzini 已提交
6060 6061 6062
	if (nx_huge_pages == -1)
		__set_nx_huge_pages(get_nx_auto_mode());

6063 6064 6065 6066 6067 6068 6069 6070 6071 6072
	/*
	 * MMU roles use union aliasing which is, generally speaking, an
	 * undefined behavior. However, we supposedly know how compilers behave
	 * and the current status quo is unlikely to change. Guardians below are
	 * supposed to let us know if the assumption becomes false.
	 */
	BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
	BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
	BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));

6073
	kvm_mmu_reset_all_pte_masks();
6074

6075 6076
	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
					    sizeof(struct pte_list_desc),
6077
					    0, SLAB_ACCOUNT, NULL);
6078
	if (!pte_list_desc_cache)
6079
		goto out;
6080

6081 6082
	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
						  sizeof(struct kvm_mmu_page),
6083
						  0, SLAB_ACCOUNT, NULL);
6084
	if (!mmu_page_header_cache)
6085
		goto out;
6086

6087
	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
6088
		goto out;
6089

6090 6091 6092
	ret = register_shrinker(&mmu_shrinker);
	if (ret)
		goto out;
6093

6094 6095
	return 0;

6096
out:
6097
	mmu_destroy_caches();
6098
	return ret;
6099 6100
}

6101
/*
P
Peng Hao 已提交
6102
 * Calculate mmu pages needed for kvm.
6103
 */
6104
unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
6105
{
6106 6107
	unsigned long nr_mmu_pages;
	unsigned long nr_pages = 0;
6108
	struct kvm_memslots *slots;
6109
	struct kvm_memory_slot *memslot;
6110
	int i;
6111

6112 6113
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		slots = __kvm_memslots(kvm, i);
6114

6115 6116 6117
		kvm_for_each_memslot(memslot, slots)
			nr_pages += memslot->npages;
	}
6118 6119

	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
6120
	nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
6121 6122 6123 6124

	return nr_mmu_pages;
}

6125 6126
/*
 * Tear down the per-vCPU MMU state: unload the MMU, free the pages of both
 * the root MMU and the guest MMU, then release the vCPU's MMU memory caches.
 */
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_mmu_unload(vcpu);

	free_mmu_pages(&vcpu->arch.root_mmu);
	free_mmu_pages(&vcpu->arch.guest_mmu);

	mmu_free_memory_caches(vcpu);
}

/*
 * Module unload counterpart of kvm_mmu_module_init().
 *
 * Resources are released in the reverse order of their setup.  The shrinker
 * goes first because its callbacks read kvm_total_used_mmu_pages; it must no
 * longer be registered by the time the counter and the caches are destroyed.
 */
void kvm_mmu_module_exit(void)
{
	unregister_shrinker(&mmu_shrinker);
	percpu_counter_destroy(&kvm_total_used_mmu_pages);
	mmu_destroy_caches();
	mmu_audit_disable();
}
6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167

/*
 * Parameter setter for nx_huge_pages_recovery_ratio.
 *
 * Stores the new value via param_set_uint().  If NX huge pages are enabled
 * and the ratio went from zero to non-zero, the recovery thread of every VM
 * is woken up.
 *
 * Returns 0 on success or the error reported by param_set_uint().
 */
static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
{
	unsigned int prev_ratio = nx_huge_pages_recovery_ratio;
	struct kvm *kvm;
	int err;

	err = param_set_uint(val, kp);
	if (err)
		return err;

	/* Only a zero -> non-zero transition with NX huge pages on matters. */
	if (!READ_ONCE(nx_huge_pages) || prev_ratio ||
	    !nx_huge_pages_recovery_ratio)
		return 0;

	mutex_lock(&kvm_lock);

	list_for_each_entry(kvm, &vm_list, vm_list)
		wake_up_process(kvm->arch.nx_lpage_recovery_thread);

	mutex_unlock(&kvm_lock);

	return 0;
}

static void kvm_recover_nx_lpages(struct kvm *kvm)
{
6168
	unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
6169 6170 6171 6172
	int rcu_idx;
	struct kvm_mmu_page *sp;
	unsigned int ratio;
	LIST_HEAD(invalid_list);
6173
	bool flush = false;
6174 6175 6176
	ulong to_zap;

	rcu_idx = srcu_read_lock(&kvm->srcu);
6177
	write_lock(&kvm->mmu_lock);
6178 6179

	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6180
	to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
6181 6182 6183 6184
	for ( ; to_zap; --to_zap) {
		if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
			break;

6185 6186 6187 6188 6189 6190 6191 6192 6193
		/*
		 * We use a separate list instead of just using active_mmu_pages
		 * because the number of lpage_disallowed pages is expected to
		 * be relatively small compared to the total.
		 */
		sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
				      struct kvm_mmu_page,
				      lpage_disallowed_link);
		WARN_ON_ONCE(!sp->lpage_disallowed);
6194
		if (is_tdp_mmu_page(sp)) {
6195
			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
6196
		} else {
6197 6198 6199
			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
			WARN_ON_ONCE(sp->lpage_disallowed);
		}
6200

6201
		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
6202
			kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
6203
			cond_resched_rwlock_write(&kvm->mmu_lock);
6204
			flush = false;
6205 6206
		}
	}
6207
	kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
6208

6209
	write_unlock(&kvm->mmu_lock);
6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262
	srcu_read_unlock(&kvm->srcu, rcu_idx);
}

/*
 * Return how many jiffies remain until 60 * HZ jiffies have elapsed since
 * @start_time, or MAX_SCHEDULE_TIMEOUT when recovery is inactive (NX huge
 * pages disabled or a recovery ratio of zero).
 */
static long get_nx_lpage_recovery_timeout(u64 start_time)
{
	if (!READ_ONCE(nx_huge_pages) ||
	    !READ_ONCE(nx_huge_pages_recovery_ratio))
		return MAX_SCHEDULE_TIMEOUT;

	return start_time + 60 * HZ - get_jiffies_64();
}

/*
 * Body of the per-VM NX huge page recovery thread.
 *
 * Repeatedly sleeps until the timeout computed by
 * get_nx_lpage_recovery_timeout() has expired and then runs one recovery
 * pass.  Returns 0 once the thread has been asked to stop.  @data is unused.
 */
static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
{
	u64 period_start;
	long timeout;

	for (;;) {
		period_start = get_jiffies_64();
		timeout = get_nx_lpage_recovery_timeout(period_start);

		/*
		 * The task state is set before each check of the stop request
		 * and of the remaining time.  The timeout is recomputed after
		 * every wakeup because the module parameters it depends on may
		 * have changed in the meantime.
		 */
		set_current_state(TASK_INTERRUPTIBLE);
		while (!kthread_should_stop() && timeout > 0) {
			schedule_timeout(timeout);
			timeout = get_nx_lpage_recovery_timeout(period_start);
			set_current_state(TASK_INTERRUPTIBLE);
		}

		set_current_state(TASK_RUNNING);

		if (kthread_should_stop())
			return 0;

		kvm_recover_nx_lpages(kvm);
	}
}

/*
 * Create the VM's NX huge page recovery worker thread, store it in
 * kvm->arch.nx_lpage_recovery_thread and unpark it.
 *
 * Returns 0 on success or the error from kvm_vm_create_worker_thread().
 */
int kvm_mmu_post_init_vm(struct kvm *kvm)
{
	int err;

	err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
					  "kvm-nx-lpage-recovery",
					  &kvm->arch.nx_lpage_recovery_thread);
	if (err)
		return err;

	kthread_unpark(kvm->arch.nx_lpage_recovery_thread);

	return 0;
}

/*
 * Stop the VM's NX huge page recovery thread, if one was created.
 */
void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
{
	if (!kvm->arch.nx_lpage_recovery_thread)
		return;

	kthread_stop(kvm->arch.nx_lpage_recovery_thread);
}