// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

/* Initializes the TDP MMU for the VM, if enabled. */
bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return false;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);

	return true;
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}
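
/*
 * For example, the root iteration macros below fold the lockdep assertion
 * into their filter condition, which only works because the helper always
 * returns true:
 *
 *	if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&
 *	    kvm_mmu_page_as_id(_root) != _as_id) {
 *	} else
 */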

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.
	 */
	rcu_barrier();
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared);

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	WARN_ON(!root->tdp_mmu_page);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	/*
	 * A TLB flush is not necessary as KVM performs a local TLB flush when
	 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
	 * to a different pCPU.  Note, the local TLB flush on reuse also
	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
	 * intermediate paging structures, that may be zapped, as such entries
	 * are associated with the ASID on both VMX and SVM.
	 */
	(void)zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);

	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL).  A reference to the returned root is acquired, and the reference to
 * @prev_root is released (the caller obviously must hold a reference to
 * @prev_root if it's non-NULL).
 *
 * If @only_valid is true, invalid roots are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool shared, bool only_valid)
{
	struct kvm_mmu_page *next_root;

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root) {
		if ((!only_valid || !next_root->role.invalid) &&
		    kvm_tdp_mmu_get_root(next_root))
			break;

		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);
	}

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode. In the unlikely event that this thread must free a root, the lock
 * will be temporarily dropped and reacquired in write mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
		    kvm_mmu_page_as_id(_root) != _as_id) {			\
		} else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)

/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 */
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
		    kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else
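
/*
 * Illustrative usage, mirroring the callers later in this file (the helpers
 * named here are those real callers, nothing new): flows holding mmu_lock
 * for write walk roots with the plain iterator, while read-lock flows use
 * the yield-safe variant so the walk can drop the lock:
 *
 *	for_each_tdp_mmu_root(kvm, root, slot->as_id)
 *		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
 *
 *	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
 *		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
 *					     slot->base_gfn + slot->npages,
 *					     min_level);
 */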

static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

	return sp;
}

static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
{
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;
	sp->gfn = gfn;
	sp->ptep = sptep;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);
}

static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
	struct kvm_mmu_page *parent_sp;
	union kvm_mmu_page_role role;

	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

	role = parent_sp->role;
	role.level--;

	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * Check for an existing root before allocating a new one.  Note, the
	 * role check prevents consuming an invalid root.
	 */
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word &&
		    kvm_tdp_mmu_get_root(root))
			goto out;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, NULL, 0, role);

	refcount_set(&root->tdp_mmu_root_count, 1);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

/**
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		u64 *sptep = rcu_dereference(pt) + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_child_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * set the SPTE until it is set from some other
			 * value to the removed SPTE value.
			 */
			for (;;) {
				old_child_spte = xchg(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_child_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_child_spte = READ_ONCE(*sptep);
			if (!is_shadow_present_pte(old_child_spte))
				continue;

			/*
			 * Marking the SPTE as a removed SPTE is not
			 * strictly necessary here as the MMU lock will
			 * stop other threads from concurrently modifying
			 * this SPTE. Using the removed SPTE value keeps
			 * the two branches consistent and simplifies
			 * the function.
			 */
			WRITE_ONCE(*sptep, REMOVED_SPTE);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_child_spte, REMOVED_SPTE, level,
				    shared);
	}

	kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
					   KVM_PAGES_PER_HPAGE(level + 1));

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.  Note the WARN on the PFN changing without the
	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
	 * pages are kernel allocations and should never be migrated.
	 */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.  Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter,
					  u64 new_spte)
{
	u64 *sptep = rcu_dereference(iter->sptep);
	u64 old_spte;

	WARN_ON_ONCE(iter->yielded);

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Do not change removed SPTEs. Only the thread that froze the SPTE
	 * may modify it.
	 */
	if (is_removed_spte(iter->old_spte))
		return -EBUSY;

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.
	 */
	old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
	if (old_spte != iter->old_spte) {
		/*
		 * The page table entry was modified by a different logical
		 * CPU. Refresh iter->old_spte with the current value so the
		 * caller operates on fresh data, e.g. if it retries
		 * tdp_mmu_set_spte_atomic().
		 */
		iter->old_spte = old_spte;
		return -EBUSY;
	}

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, true);
	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

	return 0;
}
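
/*
 * Illustrative retry pattern for tdp_mmu_set_spte_atomic(), matching the
 * read-lock flows later in this file (e.g. wrprot_gfn_range()); nothing new
 * is introduced here:
 *
 *	retry:
 *		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *			goto retry;
 *
 * On failure iter.old_spte has already been refreshed, so the retry operates
 * on the current SPTE value.
 */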

static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter)
{
	int ret;

	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
	if (ret)
		return ret;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they
	 * must either wait on the MMU lock or use
	 * tdp_mmu_set_spte_atomic which will not overwrite the
	 * special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present
	 * to non-present.
	 */
	kvm_tdp_mmu_write_spte(iter->sptep, 0);

	return 0;
}


/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:	      KVM instance
 * @as_id:	      Address space ID, i.e. regular vs. SMM
 * @sptep:	      Pointer to the SPTE
 * @old_spte:	      The current value of the SPTE
 * @new_spte:	      The new value that will be set for the SPTE
 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
 * @level:	      The level _containing_ the SPTE (its parent PT's level)
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 */
static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			       u64 old_spte, u64 new_spte, gfn_t gfn, int level,
			       bool record_acc_track, bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));

	kvm_tdp_mmu_write_spte(sptep, new_spte);

	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);

	if (record_acc_track)
		handle_changed_spte_acc_track(old_spte, new_spte, level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
					      new_spte, level);
}

static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				     u64 new_spte, bool record_acc_track,
				     bool record_dirty_log)
{
	WARN_ON_ONCE(iter->yielded);

	__tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
			   new_spte, iter->gfn, iter->level,
			   record_acc_track, record_dirty_log);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	WARN_ON(iter->yielded);

	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		rcu_read_unlock();

		if (flush)
			kvm_flush_remote_tlbs(kvm);

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		iter->yielded = true;
	}

	return iter->yielded;
}
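
/*
 * Illustrative loop structure (the pattern used by zap_gfn_range() and the
 * other yielding walkers below; no new helpers are introduced):
 *
 *	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
 *		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
 *			flush = false;
 *			continue;
 *		}
 *		...
 *	}
 *
 * When the walk yields, iter.yielded forces the caller to "continue" so that
 * tdp_iter_next() can restart the traversal from the paging structure root.
 */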

bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	u64 old_spte;

	/*
	 * This helper intentionally doesn't allow zapping a root shadow page,
	 * which doesn't have a parent page table and thus no associated entry.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	rcu_read_lock();

	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) {
		rcu_read_unlock();
		return false;
	}

	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
			   sp->gfn, sp->role.level + 1, true, true);

	rcu_read_unlock();

	return true;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 *
 * If shared is true, this thread holds the MMU lock in read mode and must
 * account for the possibility that other threads are modifying the paging
 * structures concurrently. If shared is false, this thread should hold the
 * MMU lock in write mode.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared)
{
	gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool zap_all = (start == 0 && end >= max_gfn_host);
	struct tdp_iter iter;

	/*
	 * No need to try to step down in the iterator when zapping all SPTEs,
	 * zapping the top-level non-leaf SPTEs will recurse on their children.
	 */
	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;

	/*
	 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
	 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
	 * and so KVM will never install a SPTE for such addresses.
	 */
	end = min(end, max_gfn_host);

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level, except when zapping all SPTEs.
		 */
		if (!zap_all &&
		    (iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		if (!shared) {
			tdp_mmu_set_spte(kvm, &iter, 0);
			flush = true;
		} else if (tdp_mmu_zap_spte_atomic(kvm, &iter)) {
			goto retry;
		}
	}

	rcu_read_unlock();
	return flush;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
				 gfn_t end, bool can_yield, bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
				      false);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	bool flush = false;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
						  struct kvm_mmu_page *prev_root)
{
	struct kvm_mmu_page *next_root;

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root && !(next_root->role.invalid &&
			      refcount_read(&next_root->tdp_mmu_root_count)))
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &next_root->link,
						  typeof(*next_root), link);

	return next_root;
}

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.  Since kvm_tdp_mmu_invalidate_all_roots() has acquired a
 * reference to each invalidated root, roots will not be freed until after this
 * function drops the gifted reference, e.g. so that vCPUs don't get stuck with
 * tearing down paging structures.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *next_root;
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);

	rcu_read_lock();

	root = next_invalidated_root(kvm, NULL);

	while (root) {
		next_root = next_invalidated_root(kvm, root);

		rcu_read_unlock();

		/*
		 * A TLB flush is unnecessary, invalidated roots are guaranteed
		 * to be unreachable by the guest (see kvm_tdp_mmu_put_root()
		 * for more details), and unlike the legacy MMU, no vCPU kick
		 * is needed to play nice with lockless shadow walks as the TDP
		 * MMU protects its paging structures via RCU.  Note, zapping
		 * will still flush on yield, but that's a minor performance
		 * blip and not a functional issue.
		 */
		(void)zap_gfn_range(kvm, root, 0, -1ull, true, false, true);

		/*
		 * Put the reference acquired in
		 * kvm_tdp_mmu_invalidate_all_roots().
		 */
		kvm_tdp_mmu_put_root(kvm, root, true);

		root = next_root;

		rcu_read_lock();
	}

	rcu_read_unlock();
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update.  The caller is
 * responsible for invoking kvm_tdp_mmu_zap_invalidated_roots() to do the actual
 * zapping.
 *
 * Take a reference on all roots to prevent the root from being freed before it
 * is zapped by this thread.  Freeing a root is not a correctness issue, but if
 * a vCPU drops the last reference to a root prior to the root being zapped, it
 * will get stuck with tearing down the entire paging structure.
 *
 * Get a reference even if the root is already invalid,
 * kvm_tdp_mmu_zap_invalidated_roots() assumes it was gifted a reference to all
 * invalid roots, e.g. there's no epoch to identify roots that were invalidated
 * by a previous call.  Roots stay on the list until the last reference is
 * dropped, so even though all invalid roots are zapped, a root may not go away
 * for quite some time, e.g. if a vCPU blocks across multiple memslot updates.
 *
 * Because mmu_lock is held for write, it should be impossible to observe a
 * root with zero refcount, i.e. the list of roots cannot be stale.
 *
 * This has essentially the same effect for the TDP MMU
 * as updating mmu_valid_gen does for the shadow MMU.
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		if (!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
			root->role.invalid = true;
	}
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					  struct kvm_page_fault *fault,
					  struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
	u64 new_spte;
	int ret = RET_PF_FIXED;
	bool wrprot = false;

	WARN_ON(sp->role.level != fault->goal_level);
	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
					 fault->pfn, iter->old_spte, fault->prefetch, true,
					 fault->map_writable, &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (wrprot) {
		if (fault->write)
			ret = RET_PF_EMULATE;
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	/*
	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
	 * consistent with legacy MMU behavior.
	 */
	if (ret != RET_PF_SPURIOUS)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @account_nx: True if this page table is being installed to split a
 *              non-executable huge page.
 * @shared: True if this operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed. Non-0 if the page table
 *          could not be installed (e.g. the atomic compare-exchange failed).
 */
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool account_nx,
			   bool shared)
{
	u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
	int ret = 0;

	if (shared) {
		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
		if (ret)
			return ret;
	} else {
		tdp_mmu_set_spte(kvm, iter, spte);
	}

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	if (account_nx)
		account_huge_nx_page(kvm, sp);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	return 0;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	int ret;

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

		if (iter.level == fault->goal_level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
				break;

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			bool account_nx = fault->huge_page_disallowed &&
					  fault->req_level >= iter.level;

			/*
			 * If SPTE has been frozen by another thread, just
			 * give up and retry, avoiding unnecessary page table
			 * allocation and free.
			 */
			if (is_removed_spte(iter.old_spte))
				break;

			sp = tdp_mmu_alloc_sp(vcpu);
			tdp_mmu_init_child_sp(sp, &iter);

			if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
				tdp_mmu_free_sp(sp);
				break;
			}
		}
	}

	if (iter.level != fault->goal_level) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
	rcu_read_unlock();

	return ret;
}

bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
					   range->end, range->may_block, flush);
}

typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.
	 */
	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
		rcu_read_lock();

		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			ret |= handler(kvm, &iter, range);

		rcu_read_unlock();
	}

	return ret;
}
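
/*
 * The tdp_handler_t callbacks below plug into this helper; for example,
 * kvm_tdp_mmu_age_gfn_range() is simply:
 *
 *	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
 *
 * with age_gfn_range() invoked once per leaf SPTE in the range.
 */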

/*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte = 0;

	/* If we have a non-accessed entry we don't need to change the pte. */
	if (!is_accessed_spte(iter->old_spte))
		return false;

	new_spte = iter->old_spte;

	if (spte_ad_enabled(new_spte)) {
		new_spte &= ~shadow_accessed_mask;
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(new_spte))
			kvm_set_pfn_dirty(spte_to_pfn(new_spte));

		new_spte = mark_spte_for_access_track(new_spte);
	}

	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);

	return true;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	return is_accessed_spte(iter->old_spte);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* Huge pages aren't expected to be modified without first being zapped. */
	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);

	if (iter->level != PG_LEVEL_4K ||
	    !is_shadow_present_pte(iter->old_spte))
		return false;

	/*
	 * Note, when changing a read-only SPTE, it's not strictly necessary to
	 * zero the SPTE before setting the new PFN, but doing so preserves the
	 * invariant that the PFN of a present leaf SPTE can never change.
	 * See __handle_changed_spte().
	 */
	tdp_mmu_set_spte(kvm, iter, 0);

	if (!pte_write(range->pte)) {
		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
								  pte_pfn(range->pte));

		tdp_mmu_set_spte(kvm, iter, new_spte);
	}

	return true;
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	/*
	 * No need to handle the remote TLB flush under RCU protection, the
	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
	 * shadow page.  See the WARN on pfn_changed in __handle_changed_spte().
	 */
	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
{
	struct kvm_mmu_page *sp;

	gfp |= __GFP_ZERO;

	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
	if (!sp)
		return NULL;

	sp->spt = (void *)__get_free_page(gfp);
	if (!sp->spt) {
		kmem_cache_free(mmu_page_header_cache, sp);
		return NULL;
	}

	return sp;
}

static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
						       struct tdp_iter *iter,
						       bool shared)
{
	struct kvm_mmu_page *sp;

	/*
	 * Since we are allocating while under the MMU lock we have to be
	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
	 * reclaim and to avoid making any filesystem callbacks (which can end
	 * up invoking KVM MMU notifiers, resulting in a deadlock).
	 *
	 * If this allocation fails we drop the lock and retry with reclaim
	 * allowed.
	 */
	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
	if (sp)
		return sp;

	rcu_read_unlock();

	if (shared)
		read_unlock(&kvm->mmu_lock);
	else
		write_unlock(&kvm->mmu_lock);

	iter->yielded = true;
	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);

	if (shared)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);

	rcu_read_lock();

	return sp;
}

static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared)
{
	const u64 huge_spte = iter->old_spte;
	const int level = iter->level;
	int ret, i;

	tdp_mmu_init_child_sp(sp, iter);

	/*
	 * No need for atomics when writing to sp->spt since the page table has
	 * not been linked in yet and thus is not reachable from any other CPU.
	 */
	for (i = 0; i < PT64_ENT_PER_PAGE; i++)
		sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);

	/*
	 * Replace the huge spte with a pointer to the populated lower level
	 * page table. Since we are making this change without a TLB flush vCPUs
	 * will see a mix of the split mappings and the original huge mapping,
	 * depending on what's currently in their TLB. This is fine from a
	 * correctness standpoint since the translation will be the same either
	 * way.
	 */
	ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
	if (ret)
		goto out;

	/*
	 * tdp_mmu_link_sp() will handle subtracting the huge page we
	 * are overwriting from the page stats. But we have to manually update
	 * the page stats with the new present child pages.
	 */
	kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);

out:
	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
	return ret;
}

static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
{
	struct kvm_mmu_page *sp = NULL;
	struct tdp_iter iter;
	int ret = 0;

	rcu_read_lock();

	/*
	 * Traverse the page table splitting all huge pages above the target
	 * level into one lower level. For example, if we encounter a 1GB page
	 * we split it into 512 2MB pages.
	 *
	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
	 * to visit an SPTE before ever visiting its children, which means we
	 * will correctly recursively split huge pages that are more than one
	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
	 * and then splitting each of those to 512 4KB pages).
	 */
	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
			continue;

		if (!sp) {
			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
			if (!sp) {
				ret = -ENOMEM;
				trace_kvm_mmu_split_huge_page(iter.gfn,
							      iter.old_spte,
							      iter.level, ret);
				break;
			}

			if (iter.yielded)
				continue;
		}

		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
			goto retry;

		sp = NULL;
	}

	rcu_read_unlock();

	/*
	 * It's possible to exit the loop having never used the last sp if, for
	 * example, a vCPU doing HugePage NX splitting wins the race and
	 * installs its own sp in place of the last sp we tried to split.
	 */
	if (sp)
		tdp_mmu_free_sp(sp);

	return ret;
}


/*
 * Try to split all huge pages mapped by the TDP MMU down to the target level.
 */
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
				      const struct kvm_memory_slot *slot,
				      gfn_t start, gfn_t end,
				      int target_level, bool shared)
{
	struct kvm_mmu_page *root;
	int r = 0;

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
		if (r) {
			kvm_tdp_mmu_put_root(kvm, root, shared);
			break;
		}
	}
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			   gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
				  const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				    gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

/*
 * Clear leaf entries which could be replaced by large mappings, for
 * GFNs within the slot.
 */
static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       const struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	kvm_pfn_t pfn;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
							    pfn, PG_LEVEL_NUM))
			continue;

		/* Note, a successful atomic zap also does a remote TLB flush. */
		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
			goto retry;
	}

	rcu_read_unlock();
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		zap_collapsible_spte_range(kvm, root, slot);
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		if (new_spte == iter.old_spte)
			break;

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn,
				   int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->shadow_root_level;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	return leaf;
}
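
/*
 * Illustrative caller sketch (the in-tree user is the MMIO walk in mmu.c,
 * e.g. get_mmio_spte(); shown here only to make the locking contract
 * concrete, with sptes[] and root_level being the caller's buffers):
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *	kvm_tdp_mmu_walk_lockless_end();
 */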

/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
					u64 *spte)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	tdp_ptep_t sptep = NULL;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		*spte = iter.old_spte;
		sptep = iter.sptep;
	}

	/*
	 * Perform the rcu_dereference to get the raw spte pointer value since
	 * we are passing it up to fast_page_fault, which is shared with the
	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
	 * annotation.
	 *
	 * This is safe since fast_page_fault obeys the contracts of this
	 * function as well as all TDP MMU contracts around modifying SPTEs
	 * outside of mmu_lock.
	 */
	return rcu_dereference(sptep);
}