/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

/*
 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
 * so the code in this file is compiled twice, once per pte size.
 */

/*
 * Per-PTTYPE parameters.  Each branch selects the guest pte width, the
 * name prefix applied by FNAME(), the address/index helpers, the position
 * of the guest accessed/dirty bits and the cmpxchg primitive used to
 * update a guest pte.
 */
#if PTTYPE == 64
	#define pt_element_t u64
	#define guest_walker guest_walker64
	#define FNAME(name) paging##64_##name
	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_BITS PT64_LEVEL_BITS
	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
	#define PT_HAVE_ACCESSED_DIRTY(mmu) true
	#ifdef CONFIG_X86_64
	#define PT_MAX_FULL_LEVELS 4
	#define CMPXCHG cmpxchg
	#else
	/* 64-bit ptes on a 32-bit host need the 64-bit cmpxchg variant. */
	#define CMPXCHG cmpxchg64
	#define PT_MAX_FULL_LEVELS 2
	#endif
#elif PTTYPE == 32
	#define pt_element_t u32
	#define guest_walker guest_walker32
	#define FNAME(name) paging##32_##name
	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
	#define PT_LEVEL_BITS PT32_LEVEL_BITS
	#define PT_MAX_FULL_LEVELS 2
	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
	#define PT_HAVE_ACCESSED_DIRTY(mmu) true
	#define CMPXCHG cmpxchg
#elif PTTYPE == PTTYPE_EPT
	#define pt_element_t u64
	#define guest_walker guest_walkerEPT
	#define FNAME(name) ept_##name
	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_BITS PT64_LEVEL_BITS
	/* EPT keeps accessed/dirty in bits 8/9 and only if the mmu enables them. */
	#define PT_GUEST_DIRTY_SHIFT 9
	#define PT_GUEST_ACCESSED_SHIFT 8
	#define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad)
	#define CMPXCHG cmpxchg64
	#define PT_MAX_FULL_LEVELS 4
#else
	#error Invalid PTTYPE value
#endif

#define PT_GUEST_DIRTY_MASK    (1 << PT_GUEST_DIRTY_SHIFT)
#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)

/* gpte_to_gfn() extracts the frame number of a last-level (4K) gpte. */
#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)

A
Avi Kivity 已提交
80 81 82 83 84 85
/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
 */
struct guest_walker {
	int level;
86
	unsigned max_level;
87
	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
88
	pt_element_t ptes[PT_MAX_FULL_LEVELS];
89
	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
90
	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
91
	pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
92
	bool pte_writable[PT_MAX_FULL_LEVELS];
93 94
	unsigned pt_access;
	unsigned pte_access;
95
	gfn_t gfn;
96
	struct x86_exception fault;
A
Avi Kivity 已提交
97 98
};

99
static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
100
{
101
	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
102 103
}

104 105
static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access,
					     unsigned gpte)
106 107 108
{
	unsigned mask;

109
	/* dirty bit is not supported, so no need to track it */
110
	if (!PT_HAVE_ACCESSED_DIRTY(mmu))
111 112
		return;

113 114 115 116
	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);

	mask = (unsigned)~ACC_WRITE_MASK;
	/* Allow write access to dirty gptes */
117 118
	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
		PT_WRITABLE_MASK;
119 120 121 122 123
	*access &= mask;
}

static inline int FNAME(is_present_gpte)(unsigned long pte)
{
124
#if PTTYPE != PTTYPE_EPT
B
Bandan Das 已提交
125
	return pte & PT_PRESENT_MASK;
126 127 128
#else
	return pte & 7;
#endif
129 130
}

131
static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
132 133
			       pt_element_t __user *ptep_user, unsigned index,
			       pt_element_t orig_pte, pt_element_t new_pte)
134
{
135
	int npages;
136 137 138 139
	pt_element_t ret;
	pt_element_t *table;
	struct page *page;

140
	npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page);
141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169
	if (likely(npages == 1)) {
		table = kmap_atomic(page);
		ret = CMPXCHG(&table[index], orig_pte, new_pte);
		kunmap_atomic(table);

		kvm_release_page_dirty(page);
	} else {
		struct vm_area_struct *vma;
		unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK;
		unsigned long pfn;
		unsigned long paddr;

		down_read(&current->mm->mmap_sem);
		vma = find_vma_intersection(current->mm, vaddr, vaddr + PAGE_SIZE);
		if (!vma || !(vma->vm_flags & VM_PFNMAP)) {
			up_read(&current->mm->mmap_sem);
			return -EFAULT;
		}
		pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
		paddr = pfn << PAGE_SHIFT;
		table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB);
		if (!table) {
			up_read(&current->mm->mmap_sem);
			return -EFAULT;
		}
		ret = CMPXCHG(&table[index], orig_pte, new_pte);
		memunmap(table);
		up_read(&current->mm->mmap_sem);
	}
170 171 172 173

	return (ret != orig_pte);
}

174 175 176 177
static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *sp, u64 *spte,
				  u64 gpte)
{
178
	if (is_rsvd_bits_set(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
179 180 181 182 183
		goto no_present;

	if (!FNAME(is_present_gpte)(gpte))
		goto no_present;

184
	/* if accessed bit is not supported prefetch non accessed gpte */
185 186
	if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
	    !(gpte & PT_GUEST_ACCESSED_MASK))
187 188 189 190 191 192 193 194 195
		goto no_present;

	return false;

no_present:
	drop_spte(vcpu->kvm, spte);
	return true;
}

196 197 198 199 200 201
/*
 * For PTTYPE_EPT, a page table can be executable but not readable
 * on supported processors. Therefore, set_spte does not automatically
 * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
 * to signify readability since it isn't used in the EPT case
 */
202
static inline unsigned FNAME(gpte_access)(u64 gpte)
203 204
{
	unsigned access;
205 206 207
#if PTTYPE == PTTYPE_EPT
	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
208
		((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
209
#else
210 211 212 213 214
	BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
	BUILD_BUG_ON(ACC_EXEC_MASK != 1);
	access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
	/* Combine NX with P (which is set here) to get ACC_EXEC_MASK.  */
	access ^= (gpte >> PT64_NX_SHIFT);
215
#endif
216 217 218 219

	return access;
}

220 221 222 223 224 225 226 227 228 229 230
static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
					     struct kvm_mmu *mmu,
					     struct guest_walker *walker,
					     int write_fault)
{
	unsigned level, index;
	pt_element_t pte, orig_pte;
	pt_element_t __user *ptep_user;
	gfn_t table_gfn;
	int ret;

231
	/* dirty/accessed bits are not supported, so no need to update them */
232
	if (!PT_HAVE_ACCESSED_DIRTY(mmu))
233 234
		return 0;

235 236 237 238 239
	for (level = walker->max_level; level >= walker->level; --level) {
		pte = orig_pte = walker->ptes[level - 1];
		table_gfn = walker->table_gfn[level - 1];
		ptep_user = walker->ptep_user[level - 1];
		index = offset_in_page(ptep_user) / sizeof(pt_element_t);
240
		if (!(pte & PT_GUEST_ACCESSED_MASK)) {
241
			trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
242
			pte |= PT_GUEST_ACCESSED_MASK;
243
		}
244
		if (level == walker->level && write_fault &&
245
				!(pte & PT_GUEST_DIRTY_MASK)) {
246
			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
247 248 249 250
#if PTTYPE == PTTYPE_EPT
			if (kvm_arch_write_log_dirty(vcpu))
				return -EINVAL;
#endif
251
			pte |= PT_GUEST_DIRTY_MASK;
252 253 254 255
		}
		if (pte == orig_pte)
			continue;

256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271
		/*
		 * If the slot is read-only, simply do not process the accessed
		 * and dirty bits.  This is the correct thing to do if the slot
		 * is ROM, and page tables in read-as-ROM/write-as-MMIO slots
		 * are only supported if the accessed and dirty bits are already
		 * set in the ROM (so that MMIO writes are never needed).
		 *
		 * Note that NPT does not allow this at all and faults, since
		 * it always wants nested page table entries for the guest
		 * page tables to be writable.  And EPT works but will simply
		 * overwrite the read-only memory to set the accessed and dirty
		 * bits.
		 */
		if (unlikely(!walker->pte_writable[level - 1]))
			continue;

272 273 274 275
		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
		if (ret)
			return ret;

276
		kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
277
		walker->ptes[level - 1] = pte;
278 279 280 281
	}
	return 0;
}

282 283 284 285 286 287 288 289 290 291 292
static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
{
	unsigned pkeys = 0;
#if PTTYPE == 64
	pte_t pte = {.pte = gpte};

	pkeys = pte_flags_pkey(pte_flags(pte));
#endif
	return pkeys;
}

293
/*
294
 * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
295
 */
296 297
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
298
				    gpa_t addr, u32 access)
A
Avi Kivity 已提交
299
{
300
	int ret;
301
	pt_element_t pte;
302
	pt_element_t __user *uninitialized_var(ptep_user);
303
	gfn_t table_gfn;
304 305
	u64 pt_access, pte_access;
	unsigned index, accessed_dirty, pte_pkey;
306
	unsigned nested_access;
307
	gpa_t pte_gpa;
308
	bool have_ad;
309
	int offset;
310
	u64 walk_nx_mask = 0;
311 312 313 314
	const int write_fault = access & PFERR_WRITE_MASK;
	const int user_fault  = access & PFERR_USER_MASK;
	const int fetch_fault = access & PFERR_FETCH_MASK;
	u16 errcode = 0;
315 316
	gpa_t real_gpa;
	gfn_t gfn;
A
Avi Kivity 已提交
317

318
	trace_kvm_mmu_pagetable_walk(addr, access);
319
retry_walk:
320 321
	walker->level = mmu->root_level;
	pte           = mmu->get_cr3(vcpu);
322
	have_ad       = PT_HAVE_ACCESSED_DIRTY(mmu);
323

324
#if PTTYPE == 64
325
	walk_nx_mask = 1ULL << PT64_NX_SHIFT;
326
	if (walker->level == PT32E_ROOT_LEVEL) {
327
		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
328
		trace_kvm_mmu_paging_element(pte, walker->level);
329
		if (!FNAME(is_present_gpte)(pte))
330
			goto error;
331 332 333
		--walker->level;
	}
#endif
334
	walker->max_level = walker->level;
335
	ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));
A
Avi Kivity 已提交
336

337 338 339 340 341 342 343
	/*
	 * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
	 * by the MOV to CR instruction are treated as reads and do not cause the
	 * processor to set the dirty flag in any EPT paging-structure entry.
	 */
	nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;

344
	pte_access = ~0;
345
	++walker->level;
346

347
	do {
348 349 350
		gfn_t real_gfn;
		unsigned long host_addr;

351
		pt_access = pte_access;
352 353
		--walker->level;

354
		index = PT_INDEX(addr, walker->level);
355
		table_gfn = gpte_to_gfn(pte);
356 357
		offset    = index * sizeof(pt_element_t);
		pte_gpa   = gfn_to_gpa(table_gfn) + offset;
358 359

		BUG_ON(walker->level < 1);
360
		walker->table_gfn[walker->level - 1] = table_gfn;
361
		walker->pte_gpa[walker->level - 1] = pte_gpa;
362

363
		real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
364
					      nested_access,
365
					      &walker->fault);
366 367 368 369 370 371 372 373 374 375 376

		/*
		 * FIXME: This can happen if emulation (for of an INS/OUTS
		 * instruction) triggers a nested page fault.  The exit
		 * qualification / exit info field will incorrectly have
		 * "guest page access" as the nested page fault's cause,
		 * instead of "guest page structure access".  To fix this,
		 * the x86_exception struct should be augmented with enough
		 * information to fix the exit_qualification or exit_info_1
		 * fields.
		 */
377
		if (unlikely(real_gfn == UNMAPPED_GVA))
378
			return 0;
379

380 381
		real_gfn = gpa_to_gfn(real_gfn);

382
		host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, real_gfn,
383
					    &walker->pte_writable[walker->level - 1]);
384 385
		if (unlikely(kvm_is_error_hva(host_addr)))
			goto error;
386 387

		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
388 389
		if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
			goto error;
390
		walker->ptep_user[walker->level - 1] = ptep_user;
391

392
		trace_kvm_mmu_paging_element(pte, walker->level);
393

394 395 396 397 398 399
		/*
		 * Inverting the NX it lets us AND it like other
		 * permission bits.
		 */
		pte_access = pt_access & (pte ^ walk_nx_mask);

400
		if (unlikely(!FNAME(is_present_gpte)(pte)))
401
			goto error;
402

403
		if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) {
404
			errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
405
			goto error;
406
		}
407

408
		walker->ptes[walker->level - 1] = pte;
A
Avi Kivity 已提交
409
	} while (!is_last_gpte(mmu, walker->level, pte));
410

411
	pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
412 413 414
	accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;

	/* Convert to ACC_*_MASK flags for struct guest_walker.  */
415 416
	walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
	walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
417
	errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
418
	if (unlikely(errcode))
419 420
		goto error;

421 422 423 424 425 426
	gfn = gpte_to_gfn_lvl(pte, walker->level);
	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;

	if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
		gfn += pse36_gfn_delta(pte);

427
	real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault);
428 429 430 431 432
	if (real_gpa == UNMAPPED_GVA)
		return 0;

	walker->gfn = real_gpa >> PAGE_SHIFT;

433
	if (!write_fault)
434
		FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
435 436
	else
		/*
437 438 439
		 * On a write fault, fold the dirty bit into accessed_dirty.
		 * For modes without A/D bits support accessed_dirty will be
		 * always clear.
440
		 */
441 442
		accessed_dirty &= pte >>
			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
443 444 445 446 447 448 449 450

	if (unlikely(!accessed_dirty)) {
		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
		if (unlikely(ret < 0))
			goto error;
		else if (ret)
			goto retry_walk;
	}
451

452
	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
453
		 __func__, (u64)pte, walker->pte_access, walker->pt_access);
454 455
	return 1;

456
error:
457
	errcode |= write_fault | user_fault;
458 459
	if (fetch_fault && (mmu->nx ||
			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
460
		errcode |= PFERR_FETCH_MASK;
461

462 463 464
	walker->fault.vector = PF_VECTOR;
	walker->fault.error_code_valid = true;
	walker->fault.error_code = errcode;
465 466 467 468 469 470 471 472

#if PTTYPE == PTTYPE_EPT
	/*
	 * Use PFERR_RSVD_MASK in error_code to to tell if EPT
	 * misconfiguration requires to be injected. The detection is
	 * done by is_rsvd_bits_set() above.
	 *
	 * We set up the value of exit_qualification to inject:
473 474
	 * [2:0] - Derive from the access bits. The exit_qualification might be
	 *         out of date if it is serving an EPT misconfiguration.
475 476 477 478 479 480
	 * [5:3] - Calculated by the page walk of the guest EPT page tables
	 * [7:8] - Derived from [7:8] of real exit_qualification
	 *
	 * The other bits are set to 0.
	 */
	if (!(errcode & PFERR_RSVD_MASK)) {
481 482 483 484 485 486 487
		vcpu->arch.exit_qualification &= 0x180;
		if (write_fault)
			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
		if (user_fault)
			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
		if (fetch_fault)
			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
488
		vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3;
489 490
	}
#endif
491 492
	walker->fault.address = addr;
	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
493

494
	trace_kvm_mmu_walker_error(walker->fault.error_code);
495
	return 0;
A
Avi Kivity 已提交
496 497
}

498
static int FNAME(walk_addr)(struct guest_walker *walker,
499
			    struct kvm_vcpu *vcpu, gpa_t addr, u32 access)
500
{
501
	return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
502
					access);
503 504
}

#if PTTYPE != PTTYPE_EPT
/* Walk the guest page tables described by vcpu->arch.nested_mmu. */
static int FNAME(walk_addr_nested)(struct guest_walker *walker,
				   struct kvm_vcpu *vcpu, gva_t addr,
				   u32 access)
{
	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
					addr, access);
}
#endif

515 516 517
static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
		     u64 *spte, pt_element_t gpte, bool no_dirty_log)
518
{
519
	unsigned pte_access;
520
	gfn_t gfn;
D
Dan Williams 已提交
521
	kvm_pfn_t pfn;
522

523
	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
524
		return false;
525

526
	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
527 528

	gfn = gpte_to_gfn(gpte);
529
	pte_access = sp->role.access & FNAME(gpte_access)(gpte);
530
	FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
531 532
	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
			no_dirty_log && (pte_access & ACC_WRITE_MASK));
533
	if (is_error_pfn(pfn))
534
		return false;
535

536
	/*
537 538
	 * we call mmu_set_spte() with host_writable = true because
	 * pte_prefetch_gfn_to_pfn always gets a writable pfn.
539
	 */
540 541
	mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn,
		     true, true);
542

543
	kvm_release_pfn_clean(pfn);
544 545 546 547 548 549 550 551 552
	return true;
}

/*
 * Refresh @spte from a new guest pte value.  @pte points to a
 * pt_element_t-sized gpte; dirty logging is not bypassed
 * (no_dirty_log == false).
 */
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			      u64 *spte, const void *pte)
{
	pt_element_t gpte = *(const pt_element_t *)pte;

	FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
}

A
Avi Kivity 已提交
555 556 557 558
static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
				struct guest_walker *gw, int level)
{
	pt_element_t curr_pte;
559 560 561 562 563 564 565 566 567
	gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
	u64 mask;
	int r, index;

	if (level == PT_PAGE_TABLE_LEVEL) {
		mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
		base_gpa = pte_gpa & ~mask;
		index = (pte_gpa - base_gpa) / sizeof(pt_element_t);

568
		r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa,
569 570 571
				gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
		curr_pte = gw->prefetch_ptes[index];
	} else
572
		r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa,
A
Avi Kivity 已提交
573
				  &curr_pte, sizeof(curr_pte));
574

A
Avi Kivity 已提交
575 576 577
	return r || curr_pte != gw->ptes[level - 1];
}

578 579
static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
				u64 *sptep)
580 581
{
	struct kvm_mmu_page *sp;
582
	pt_element_t *gptep = gw->prefetch_ptes;
583
	u64 *spte;
584
	int i;
585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600

	sp = page_header(__pa(sptep));

	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
		return;

	if (sp->role.direct)
		return __direct_pte_prefetch(vcpu, sp, sptep);

	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
	spte = sp->spt + i;

	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
		if (spte == sptep)
			continue;

601
		if (is_shadow_present_pte(*spte))
602 603
			continue;

604
		if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
605 606 607 608
			break;
	}
}

A
Avi Kivity 已提交
609 610
/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
611 612
 * If the guest tries to write a write-protected page, we need to
 * emulate this operation, return 1 to indicate this case.
A
Avi Kivity 已提交
613
 */
614
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
615
			 struct guest_walker *gw,
616
			 int write_fault, int hlevel, int max_level,
P
Paolo Bonzini 已提交
617 618
			 kvm_pfn_t pfn, bool map_writable, bool prefault,
			 bool lpage_disallowed)
A
Avi Kivity 已提交
619
{
620
	struct kvm_mmu_page *sp = NULL;
621
	struct kvm_shadow_walk_iterator it;
622
	unsigned direct_access, access = gw->pt_access;
623
	int top_level, ret;
P
Paolo Bonzini 已提交
624
	gfn_t gfn, base_gfn;
625

626
	direct_access = gw->pte_access;
627

628
	top_level = vcpu->arch.mmu->root_level;
629 630 631 632 633 634 635 636 637 638 639
	if (top_level == PT32E_ROOT_LEVEL)
		top_level = PT32_ROOT_LEVEL;
	/*
	 * Verify that the top-level gpte is still there.  Since the page
	 * is a root page, it is either write protected (and cannot be
	 * changed from now on) or it is invalid (in which case, we don't
	 * really care if it changes underneath us after this point).
	 */
	if (FNAME(gpte_changed)(vcpu, gw, top_level))
		goto out_gpte_changed;

640
	if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
641 642
		goto out_gpte_changed;

643 644 645
	for (shadow_walk_init(&it, vcpu, addr);
	     shadow_walk_okay(&it) && it.level > gw->level;
	     shadow_walk_next(&it)) {
646 647
		gfn_t table_gfn;

648
		clear_sp_write_flooding_count(it.sptep);
649
		drop_large_spte(vcpu, it.sptep);
650

651
		sp = NULL;
652 653 654
		if (!is_shadow_present_pte(*it.sptep)) {
			table_gfn = gw->table_gfn[it.level - 2];
			sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
655
					      false, access);
656
		}
657 658 659 660 661

		/*
		 * Verify that the gpte in the page we've just write
		 * protected is still there.
		 */
662
		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
663
			goto out_gpte_changed;
664

665
		if (sp)
666
			link_shadow_page(vcpu, it.sptep, sp);
667
	}
A
Avi Kivity 已提交
668

P
Paolo Bonzini 已提交
669 670 671 672 673 674
	/*
	 * FNAME(page_fault) might have clobbered the bottom bits of
	 * gw->gfn, restore them from the virtual address.
	 */
	gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
	base_gfn = gfn;
675

676 677 678
	if (max_level > PT_PAGE_TABLE_LEVEL)
		transparent_hugepage_adjust(vcpu, gw->gfn, &pfn, &hlevel);

679 680
	trace_kvm_mmu_spte_requested(addr, gw->level, pfn);

681
	for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
682
		clear_sp_write_flooding_count(it.sptep);
P
Paolo Bonzini 已提交
683 684 685 686 687 688 689 690

		/*
		 * We cannot overwrite existing page tables with an NX
		 * large page, as the leaf could be executable.
		 */
		disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);

		base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
691 692 693
		if (it.level == hlevel)
			break;

694
		validate_direct_spte(vcpu, it.sptep, direct_access);
695

696
		drop_large_spte(vcpu, it.sptep);
697

698 699 700 701
		if (!is_shadow_present_pte(*it.sptep)) {
			sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
					      it.level - 1, true, direct_access);
			link_shadow_page(vcpu, it.sptep, sp);
P
Paolo Bonzini 已提交
702 703
			if (lpage_disallowed)
				account_huge_nx_page(vcpu->kvm, sp);
704
		}
705 706
	}

707
	ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
708
			   it.level, base_gfn, pfn, prefault, map_writable);
709
	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
710
	++vcpu->stat.pf_fixed;
711
	return ret;
712 713

out_gpte_changed:
714
	return RET_PF_RETRY;
A
Avi Kivity 已提交
715 716
}

717 718 719 720 721 722 723 724 725 726
 /*
 * To see whether the mapped gfn can write its page table in the current
 * mapping.
 *
 * It is the helper function of FNAME(page_fault). When guest uses large page
 * size to map the writable gfn which is used as current page table, we should
 * force kvm to use small page size to map it because new shadow page will be
 * created when kvm establishes shadow page table that stop kvm using large
 * page size. Do it early can avoid unnecessary #PF and emulation.
 *
727 728 729
 * @write_fault_to_shadow_pgtable will return true if the fault gfn is
 * currently used as its page table.
 *
730 731 732 733 734 735
 * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
 * since the PDPT is always shadowed, that means, we can not use large page
 * size to map the gfn which is used as PDPT.
 */
static bool
FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
736 737
			      struct guest_walker *walker, int user_fault,
			      bool *write_fault_to_shadow_pgtable)
738 739 740
{
	int level;
	gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
741
	bool self_changed = false;
742 743 744 745 746

	if (!(walker->pte_access & ACC_WRITE_MASK ||
	      (!is_write_protection(vcpu) && !user_fault)))
		return false;

747 748 749 750 751 752
	for (level = walker->level; level <= walker->max_level; level++) {
		gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];

		self_changed |= !(gfn & mask);
		*write_fault_to_shadow_pgtable |= !gfn;
	}
753

754
	return self_changed;
755 756
}

A
Avi Kivity 已提交
757 758 759 760 761 762 763 764 765 766 767
/*
 * Page fault handler.  There are several causes for a page fault:
 *   - there is no shadow pte for the guest pte
 *   - write access through a shadow pte marked read only so that we can set
 *     the dirty bit
 *   - write access to a shadow pte marked read only so we can update the page
 *     dirty bitmap, when userspace requests it
 *   - mmio access; in this case we will never install a present shadow pte
 *   - normal guest page fault due to the guest pte marked not present, not
 *     writable, or not executable
 *
768 769
 *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
 *           a negative value on error.
A
Avi Kivity 已提交
770
 */
771
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
772
			     bool prefault)
A
Avi Kivity 已提交
773 774 775 776
{
	int write_fault = error_code & PFERR_WRITE_MASK;
	int user_fault = error_code & PFERR_USER_MASK;
	struct guest_walker walker;
777
	int r;
D
Dan Williams 已提交
778
	kvm_pfn_t pfn;
779
	int level;
780
	unsigned long mmu_seq;
781
	bool map_writable, is_self_change_mapping;
P
Paolo Bonzini 已提交
782 783
	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
				is_nx_huge_page_enabled();
784
	int max_level;
A
Avi Kivity 已提交
785

786
	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
787

788 789 790
	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;
791

792 793 794 795 796 797
	/*
	 * If PFEC.RSVD is set, this is a shadow page fault.
	 * The bit needs to be cleared before walking guest page tables.
	 */
	error_code &= ~PFERR_RSVD_MASK;

A
Avi Kivity 已提交
798
	/*
799
	 * Look up the guest pte for the faulting address.
A
Avi Kivity 已提交
800
	 */
801
	r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
A
Avi Kivity 已提交
802 803 804 805

	/*
	 * The page is not mapped by the guest.  Let the guest handle it.
	 */
806
	if (!r) {
807
		pgprintk("%s: guest page fault\n", __func__);
808
		if (!prefault)
X
Xiao Guangrong 已提交
809
			inject_page_fault(vcpu, &walker.fault);
810

811
		return RET_PF_RETRY;
A
Avi Kivity 已提交
812 813
	}

814 815
	if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
		shadow_page_table_clear_flood(vcpu, addr);
816
		return RET_PF_EMULATE;
817
	}
818

819 820 821 822 823
	vcpu->arch.write_fault_to_shadow_pgtable = false;

	is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
	      &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);

824
	if (lpage_disallowed || is_self_change_mapping)
825
		max_level = PT_PAGE_TABLE_LEVEL;
826 827 828 829 830 831
	else
		max_level = walker.level;

	level = mapping_level(vcpu, walker.gfn, &max_level);
	if (level > PT_PAGE_TABLE_LEVEL)
		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
832

833
	mmu_seq = vcpu->kvm->mmu_notifier_seq;
834
	smp_rmb();
835

836
	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
837
			 &map_writable))
838
		return RET_PF_RETRY;
839

840
	if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
841 842
		return r;

843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862
	/*
	 * Do not change pte_access if the pfn is a mmio page, otherwise
	 * we will cache the incorrect access into mmio spte.
	 */
	if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
	     !is_write_protection(vcpu) && !user_fault &&
	      !is_noslot_pfn(pfn)) {
		walker.pte_access |= ACC_WRITE_MASK;
		walker.pte_access &= ~ACC_USER_MASK;

		/*
		 * If we converted a user page to a kernel page,
		 * so that the kernel can write to it when cr0.wp=0,
		 * then we should prevent the kernel from executing it
		 * if SMEP is enabled.
		 */
		if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
			walker.pte_access &= ~ACC_EXEC_MASK;
	}

863
	r = RET_PF_RETRY;
864
	spin_lock(&vcpu->kvm->mmu_lock);
865
	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
866
		goto out_unlock;
867

868
	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
869 870
	if (make_mmu_pages_available(vcpu) < 0)
		goto out_unlock;
871 872
	r = FNAME(fetch)(vcpu, addr, &walker, write_fault, level, max_level,
			 pfn, map_writable, prefault, lpage_disallowed);
873
	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
874 875 876 877

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
878
	return r;
A
Avi Kivity 已提交
879 880
}

X
Xiao Guangrong 已提交
881 882 883 884
static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
{
	int offset = 0;

885
	WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
X
Xiao Guangrong 已提交
886 887 888 889 890 891 892

	if (PTTYPE == 32)
		offset = sp->role.quadrant << PT64_LEVEL_BITS;

	return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
}

893
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
M
Marcelo Tosatti 已提交
894
{
895
	struct kvm_shadow_walk_iterator iterator;
896
	struct kvm_mmu_page *sp;
897 898 899
	int level;
	u64 *sptep;

900 901
	vcpu_clear_mmio_info(vcpu, gva);

902 903 904 905 906
	/*
	 * No need to check return value here, rmap_can_add() can
	 * help us to skip pte prefetch later.
	 */
	mmu_topup_memory_caches(vcpu);
M
Marcelo Tosatti 已提交
907

908
	if (!VALID_PAGE(root_hpa)) {
909 910 911 912
		WARN_ON(1);
		return;
	}

913
	spin_lock(&vcpu->kvm->mmu_lock);
914
	for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
915 916
		level = iterator.level;
		sptep = iterator.sptep;
917

918
		sp = page_header(__pa(sptep));
X
Xiao Guangrong 已提交
919
		if (is_last_spte(*sptep, level)) {
920 921 922
			pt_element_t gpte;
			gpa_t pte_gpa;

923 924 925
			if (!sp->unsync)
				break;

X
Xiao Guangrong 已提交
926
			pte_gpa = FNAME(get_level1_sp_gpa)(sp);
927
			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
928

X
Xiao Guangrong 已提交
929
			if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
930 931
				kvm_flush_remote_tlbs_with_address(vcpu->kvm,
					sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
932 933 934 935

			if (!rmap_can_add(vcpu))
				break;

936 937
			if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
						       sizeof(pt_element_t)))
938 939 940
				break;

			FNAME(update_pte)(vcpu, sp, sptep, &gpte);
941
		}
M
Marcelo Tosatti 已提交
942

943
		if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
944 945
			break;
	}
946
	spin_unlock(&vcpu->kvm->mmu_lock);
M
Marcelo Tosatti 已提交
947 948
}

949 950
/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access,
951
			       struct x86_exception *exception)
A
Avi Kivity 已提交
952 953
{
	struct guest_walker walker;
A
Avi Kivity 已提交
954 955
	gpa_t gpa = UNMAPPED_GVA;
	int r;
A
Avi Kivity 已提交
956

957
	r = FNAME(walk_addr)(&walker, vcpu, addr, access);
A
Avi Kivity 已提交
958

A
Avi Kivity 已提交
959
	if (r) {
A
Avi Kivity 已提交
960
		gpa = gfn_to_gpa(walker.gfn);
961
		gpa |= addr & ~PAGE_MASK;
962 963
	} else if (exception)
		*exception = walker.fault;
A
Avi Kivity 已提交
964 965 966 967

	return gpa;
}

#if PTTYPE != PTTYPE_EPT
/*
 * Translate @vaddr to a guest physical address using the nested walker
 * (walk_addr_nested) with the requested @access.
 *
 * Note, gva_to_gpa_nested() is only used to translate L2 GVAs.
 *
 * Returns the gpa of the frame found by the walk combined with the in-page
 * offset of @vaddr.  If the walk fails, UNMAPPED_GVA is returned and, when
 * @exception is non-NULL, *@exception is set to the fault recorded by the
 * walker.
 */
static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
				      u32 access,
				      struct x86_exception *exception)
{
	struct guest_walker walker;
	gpa_t gpa = UNMAPPED_GVA;
	int r;

#ifndef CONFIG_X86_64
	/* A 64-bit GVA should be impossible on 32-bit KVM. */
	WARN_ON_ONCE(vaddr >> 32);
#endif

	r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);

	if (r) {
		gpa = gfn_to_gpa(walker.gfn);
		/* Keep the offset within the page. */
		gpa |= vaddr & ~PAGE_MASK;
	} else if (exception)
		*exception = walker.fault;

	return gpa;
}
#endif

995 996 997 998
/*
 * Using the cached information from sp->gfns is safe because:
 * - The spte has a reference to the struct page, so the pfn for a given gfn
 *   can't change unless all sptes pointing to it are nuked first.
999 1000 1001 1002 1003 1004 1005
 *
 * Note:
 *   We should flush all tlbs if spte is dropped even though guest is
 *   responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page
 *   and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
 *   used by guest then tlbs are not flushed, so guest is allowed to access the
 *   freed pages.
1006
 *   And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
1007
 */
1008
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1009
{
X
Xiao Guangrong 已提交
1010
	int i, nr_present = 0;
1011
	bool host_writable;
1012
	gpa_t first_pte_gpa;
1013
	int set_spte_ret = 0;
1014

1015 1016 1017
	/* direct kvm_mmu_page can not be unsync. */
	BUG_ON(sp->role.direct);

X
Xiao Guangrong 已提交
1018
	first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
1019

1020 1021 1022 1023
	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		unsigned pte_access;
		pt_element_t gpte;
		gpa_t pte_gpa;
1024
		gfn_t gfn;
1025

1026
		if (!sp->spt[i])
1027 1028
			continue;

1029
		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
1030

1031 1032
		if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
					       sizeof(pt_element_t)))
1033
			return 0;
1034

1035
		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
1036 1037 1038 1039 1040 1041
			/*
			 * Update spte before increasing tlbs_dirty to make
			 * sure no tlb flush is lost after spte is zapped; see
			 * the comments in kvm_flush_remote_tlbs().
			 */
			smp_wmb();
1042
			vcpu->kvm->tlbs_dirty++;
1043 1044 1045
			continue;
		}

1046 1047
		gfn = gpte_to_gfn(gpte);
		pte_access = sp->role.access;
1048
		pte_access &= FNAME(gpte_access)(gpte);
1049
		FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
1050

1051
		if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
1052
		      &nr_present))
1053 1054
			continue;

1055
		if (gfn != sp->gfns[i]) {
1056
			drop_spte(vcpu->kvm, &sp->spt[i]);
1057 1058 1059 1060 1061
			/*
			 * The same as above where we are doing
			 * prefetch_invalid_gpte().
			 */
			smp_wmb();
1062
			vcpu->kvm->tlbs_dirty++;
1063 1064 1065 1066
			continue;
		}

		nr_present++;
1067

1068 1069
		host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;

1070 1071 1072 1073
		set_spte_ret |= set_spte(vcpu, &sp->spt[i],
					 pte_access, PT_PAGE_TABLE_LEVEL,
					 gfn, spte_to_pfn(sp->spt[i]),
					 true, false, host_writable);
1074 1075
	}

1076 1077 1078
	if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH)
		kvm_flush_remote_tlbs(vcpu->kvm);

1079
	return nr_present;
1080 1081
}

/*
 * Drop the per-PTTYPE template macros so that this file can be compiled
 * again for another pte size.
 */
#undef pt_element_t
#undef guest_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef PT_LVL_ADDR_MASK
#undef PT_LVL_OFFSET_MASK
#undef PT_LEVEL_BITS
#undef PT_MAX_FULL_LEVELS
#undef gpte_to_gfn
#undef gpte_to_gfn_lvl
#undef CMPXCHG
#undef PT_GUEST_ACCESSED_MASK
#undef PT_GUEST_DIRTY_MASK
#undef PT_GUEST_DIRTY_SHIFT
#undef PT_GUEST_ACCESSED_SHIFT
#undef PT_HAVE_ACCESSED_DIRTY