/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

/*
 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
 * so the code in this file is compiled twice, once per pte size.
 */

#if PTTYPE == 64
	#define pt_element_t u64
	#define guest_walker guest_walker64
	#define FNAME(name) paging##64_##name
	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_BITS PT64_LEVEL_BITS
	#ifdef CONFIG_X86_64
	#define PT_MAX_FULL_LEVELS 4
	#define CMPXCHG cmpxchg
	#else
	#define CMPXCHG cmpxchg64
	#define PT_MAX_FULL_LEVELS 2
	#endif
#elif PTTYPE == 32
	#define pt_element_t u32
	#define guest_walker guest_walker32
	#define FNAME(name) paging##32_##name
	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
	#define PT_LEVEL_BITS PT32_LEVEL_BITS
	#define PT_MAX_FULL_LEVELS 2
	#define CMPXCHG cmpxchg
#else
	#error Invalid PTTYPE value
#endif

#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)

/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
 */
struct guest_walker {
	int level;
	unsigned max_level;
	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
	pt_element_t ptes[PT_MAX_FULL_LEVELS];
	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
	pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
	unsigned pt_access;
	unsigned pte_access;
	gfn_t gfn;
	struct x86_exception fault;
};

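/* Extract the gfn that a guest pte points to, for the given page-table level. */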
static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
{
	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}

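/*
 * Atomically update a guest pte in guest memory: pin and map the host page
 * backing it, cmpxchg orig_pte -> new_pte, and report whether the pte had
 * already changed under us (in which case the caller must retry its walk).
 * Returns 0 on success, 1 if the pte changed, -EFAULT if the pte could not
 * be accessed.
 */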
static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			       pt_element_t __user *ptep_user, unsigned index,
			       pt_element_t orig_pte, pt_element_t new_pte)
{
	int npages;
	pt_element_t ret;
	pt_element_t *table;
	struct page *page;

	npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
	/* Check if the user is doing something meaningless. */
	if (unlikely(npages != 1))
		return -EFAULT;

	table = kmap_atomic(page);
	ret = CMPXCHG(&table[index], orig_pte, new_pte);
	kunmap_atomic(table);

	kvm_release_page_dirty(page);

	return (ret != orig_pte);
}

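/*
 * Propagate accessed/dirty bits into the guest page tables: set the accessed
 * bit in every pte of the walk and, on a write fault, the dirty bit in the
 * final pte.  Returns 0 on success, a positive value if a gpte changed under
 * us (the caller should restart the walk), and a negative value on error.
 */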
static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
					     struct kvm_mmu *mmu,
					     struct guest_walker *walker,
					     int write_fault)
{
	unsigned level, index;
	pt_element_t pte, orig_pte;
	pt_element_t __user *ptep_user;
	gfn_t table_gfn;
	int ret;

	for (level = walker->max_level; level >= walker->level; --level) {
		pte = orig_pte = walker->ptes[level - 1];
		table_gfn = walker->table_gfn[level - 1];
		ptep_user = walker->ptep_user[level - 1];
		index = offset_in_page(ptep_user) / sizeof(pt_element_t);
		if (!(pte & PT_ACCESSED_MASK)) {
			trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
			pte |= PT_ACCESSED_MASK;
		}
		if (level == walker->level && write_fault && !is_dirty_gpte(pte)) {
			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
			pte |= PT_DIRTY_MASK;
		}
		if (pte == orig_pte)
			continue;

		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
		if (ret)
			return ret;

		mark_page_dirty(vcpu->kvm, table_gfn);
		walker->ptes[level - 1] = pte;
	}
	return 0;
}

/*
 * Fetch a guest pte for a guest virtual address
 */
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
				    gva_t addr, u32 access)
{
	int ret;
	pt_element_t pte;
	pt_element_t __user *uninitialized_var(ptep_user);
	gfn_t table_gfn;
	unsigned index, pt_access, pte_access, accessed_dirty;
	gpa_t pte_gpa;
	int offset;
	const int write_fault = access & PFERR_WRITE_MASK;
	const int user_fault  = access & PFERR_USER_MASK;
	const int fetch_fault = access & PFERR_FETCH_MASK;
	u16 errcode = 0;
	gpa_t real_gpa;
	gfn_t gfn;

	trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
	walker->level = mmu->root_level;
	pte           = mmu->get_cr3(vcpu);

#if PTTYPE == 64
	if (walker->level == PT32E_ROOT_LEVEL) {
		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
		trace_kvm_mmu_paging_element(pte, walker->level);
		if (!is_present_gpte(pte))
			goto error;
		--walker->level;
	}
#endif
	walker->max_level = walker->level;
	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
	       (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);

	accessed_dirty = PT_ACCESSED_MASK;
	pt_access = pte_access = ACC_ALL;
	++walker->level;

	do {
		gfn_t real_gfn;
		unsigned long host_addr;

		pt_access &= pte_access;
		--walker->level;

		index = PT_INDEX(addr, walker->level);

		table_gfn = gpte_to_gfn(pte);
		offset    = index * sizeof(pt_element_t);
		pte_gpa   = gfn_to_gpa(table_gfn) + offset;
		walker->table_gfn[walker->level - 1] = table_gfn;
		walker->pte_gpa[walker->level - 1] = pte_gpa;

		real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
					      PFERR_USER_MASK|PFERR_WRITE_MASK);
		if (unlikely(real_gfn == UNMAPPED_GVA))
			goto error;
		real_gfn = gpa_to_gfn(real_gfn);

		host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
		if (unlikely(kvm_is_error_hva(host_addr)))
			goto error;

		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
		if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
			goto error;
		walker->ptep_user[walker->level - 1] = ptep_user;

		trace_kvm_mmu_paging_element(pte, walker->level);

		if (unlikely(!is_present_gpte(pte)))
			goto error;

		if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
					      walker->level))) {
			errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
			goto error;
		}

		accessed_dirty &= pte;
		pte_access = pt_access & gpte_access(vcpu, pte);

		walker->ptes[walker->level - 1] = pte;
	} while (!is_last_gpte(mmu, walker->level, pte));

	if (unlikely(permission_fault(mmu, pte_access, access))) {
		errcode |= PFERR_PRESENT_MASK;
		goto error;
	}

	gfn = gpte_to_gfn_lvl(pte, walker->level);
	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;

	if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
		gfn += pse36_gfn_delta(pte);

	real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access);
	if (real_gpa == UNMAPPED_GVA)
		return 0;

	walker->gfn = real_gpa >> PAGE_SHIFT;

	if (!write_fault)
		protect_clean_gpte(&pte_access, pte);
	else
		/*
		 * On a write fault, fold the dirty bit into accessed_dirty by
		 * shifting it one place right.
		 */
		accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);

	if (unlikely(!accessed_dirty)) {
		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
		if (unlikely(ret < 0))
			goto error;
		else if (ret)
			goto retry_walk;
	}

	walker->pt_access = pt_access;
	walker->pte_access = pte_access;
	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
		 __func__, (u64)pte, pte_access, pt_access);
	return 1;

error:
	errcode |= write_fault | user_fault;
	if (fetch_fault && (mmu->nx ||
			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
		errcode |= PFERR_FETCH_MASK;

	walker->fault.vector = PF_VECTOR;
	walker->fault.error_code_valid = true;
	walker->fault.error_code = errcode;
	walker->fault.address = addr;
	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;

	trace_kvm_mmu_walker_error(walker->fault.error_code);
	return 0;
}

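/* Walk the guest page tables using the vcpu's primary MMU (vcpu->arch.mmu). */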
static int FNAME(walk_addr)(struct guest_walker *walker,
			    struct kvm_vcpu *vcpu, gva_t addr, u32 access)
{
	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
					access);
}

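/*
 * Walk the guest page tables using the nested MMU, for addresses that belong
 * to a nested (L2) guest.
 */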
static int FNAME(walk_addr_nested)(struct guest_walker *walker,
				   struct kvm_vcpu *vcpu, gva_t addr,
				   u32 access)
{
	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
					addr, access);
}

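/*
 * Install a shadow pte for a single guest pte.  Returns false if the gpte
 * cannot be used (e.g. it is not present or its pfn cannot be resolved),
 * true once the spte has been set.
 */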
static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
		     u64 *spte, pt_element_t gpte, bool no_dirty_log)
{
	unsigned pte_access;
	gfn_t gfn;
	pfn_t pfn;

	if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
		return false;

	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);

	gfn = gpte_to_gfn(gpte);
	pte_access = sp->role.access & gpte_access(vcpu, gpte);
	protect_clean_gpte(&pte_access, gpte);
	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
			no_dirty_log && (pte_access & ACC_WRITE_MASK));
	if (is_error_pfn(pfn))
		return false;

	/*
	 * we call mmu_set_spte() with host_writable = true because
	 * pte_prefetch_gfn_to_pfn always gets a writable pfn.
	 */
	mmu_set_spte(vcpu, spte, pte_access, 0, NULL, PT_PAGE_TABLE_LEVEL,
		     gfn, pfn, true, true);

	return true;
}

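/*
 * Refresh a single shadow pte from the given guest pte, e.g. after the guest
 * has written a new value into its page table.
 */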
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			      u64 *spte, const void *pte)
{
	pt_element_t gpte = *(const pt_element_t *)pte;

	FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
}

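/*
 * Re-read the guest pte for the given level and check whether it has changed
 * since the walk cached it.  At the last level the whole prefetch window is
 * read into gw->prefetch_ptes so that FNAME(pte_prefetch) can reuse it.
 * Returns true if the read failed or the pte changed.
 */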
static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
				struct guest_walker *gw, int level)
{
	pt_element_t curr_pte;
	gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
	u64 mask;
	int r, index;

	if (level == PT_PAGE_TABLE_LEVEL) {
		mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
		base_gpa = pte_gpa & ~mask;
		index = (pte_gpa - base_gpa) / sizeof(pt_element_t);

		r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
				gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
		curr_pte = gw->prefetch_ptes[index];
	} else
		r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
				  &curr_pte, sizeof(curr_pte));

	return r || curr_pte != gw->ptes[level - 1];
}

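/*
 * Speculatively install sptes for the guest ptes that surround the faulting
 * one, using the gptes cached in gw->prefetch_ptes, to avoid future faults.
 */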
static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
				u64 *sptep)
{
	struct kvm_mmu_page *sp;
	pt_element_t *gptep = gw->prefetch_ptes;
	u64 *spte;
	int i;

	sp = page_header(__pa(sptep));

	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
		return;

	if (sp->role.direct)
		return __direct_pte_prefetch(vcpu, sp, sptep);

	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
	spte = sp->spt + i;

	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
		if (spte == sptep)
			continue;

		if (is_shadow_present_pte(*spte))
			continue;

		if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
			break;
	}
}

/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
 * If the guest tries to write a write-protected page, we need to
 * emulate this operation; return 1 to indicate this case.
 */
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
			 struct guest_walker *gw,
			 int write_fault, int hlevel,
			 pfn_t pfn, bool map_writable, bool prefault)
{
	struct kvm_mmu_page *sp = NULL;
	struct kvm_shadow_walk_iterator it;
	unsigned direct_access, access = gw->pt_access;
	int top_level, emulate = 0;

	direct_access = gw->pte_access;

	top_level = vcpu->arch.mmu.root_level;
	if (top_level == PT32E_ROOT_LEVEL)
		top_level = PT32_ROOT_LEVEL;
	/*
	 * Verify that the top-level gpte is still there.  Since the page
	 * is a root page, it is either write protected (and cannot be
	 * changed from now on) or it is invalid (in which case, we don't
	 * really care if it changes underneath us after this point).
	 */
	if (FNAME(gpte_changed)(vcpu, gw, top_level))
		goto out_gpte_changed;

426 427 428
	for (shadow_walk_init(&it, vcpu, addr);
	     shadow_walk_okay(&it) && it.level > gw->level;
	     shadow_walk_next(&it)) {
		gfn_t table_gfn;

		clear_sp_write_flooding_count(it.sptep);
		drop_large_spte(vcpu, it.sptep);

		sp = NULL;
		if (!is_shadow_present_pte(*it.sptep)) {
			table_gfn = gw->table_gfn[it.level - 2];
			sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
					      false, access, it.sptep);
		}

		/*
		 * Verify that the gpte in the page we've just write
		 * protected is still there.
		 */
		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
			goto out_gpte_changed;

		if (sp)
			link_shadow_page(it.sptep, sp);
	}

	for (;
	     shadow_walk_okay(&it) && it.level > hlevel;
	     shadow_walk_next(&it)) {
		gfn_t direct_gfn;

		clear_sp_write_flooding_count(it.sptep);
		validate_direct_spte(vcpu, it.sptep, direct_access);

		drop_large_spte(vcpu, it.sptep);

		if (is_shadow_present_pte(*it.sptep))
			continue;

		direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);

		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
				      true, direct_access, it.sptep);
		link_shadow_page(it.sptep, sp);
	}

	clear_sp_write_flooding_count(it.sptep);
	mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, &emulate,
		     it.level, gw->gfn, pfn, prefault, map_writable);
	FNAME(pte_prefetch)(vcpu, gw, it.sptep);

	return emulate;

out_gpte_changed:
	if (sp)
		kvm_mmu_put_page(sp, it.sptep);
	kvm_release_pfn_clean(pfn);
	return 0;
}

/*
 * Check whether the gfn being mapped can be used to write one of its own
 * page tables through the current mapping.
 *
 * This is a helper for FNAME(page_fault).  When the guest uses a large page
 * to map a writable gfn that is currently in use as a page table, we must
 * force KVM to map it with a small page: a new shadow page will be created
 * when KVM shadows that page table anyway, which prevents the large mapping.
 * Doing this early avoids unnecessary #PFs and emulation.
 *
 * @write_fault_to_shadow_pgtable is set to true if the faulting gfn is
 * currently used as a page table of the faulting translation itself.
 *
 * Note: the PDPT is not checked for a PAE 32-bit guest.  That is fine, since
 * the PDPT is always shadowed, which means we can never use a large page to
 * map the gfn that holds the PDPT.
 */
static bool
FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
			      struct guest_walker *walker, int user_fault,
			      bool *write_fault_to_shadow_pgtable)
{
	int level;
	gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
	bool self_changed = false;

	if (!(walker->pte_access & ACC_WRITE_MASK ||
	      (!is_write_protection(vcpu) && !user_fault)))
		return false;

	for (level = walker->level; level <= walker->max_level; level++) {
		gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];

		self_changed |= !(gfn & mask);
		*write_fault_to_shadow_pgtable |= !gfn;
	}

	return self_changed;
}

/*
 * Page fault handler.  There are several causes for a page fault:
 *   - there is no shadow pte for the guest pte
 *   - write access through a shadow pte marked read only so that we can set
 *     the dirty bit
 *   - write access to a shadow pte marked read only so we can update the page
 *     dirty bitmap, when userspace requests it
 *   - mmio access; in this case we will never install a present shadow pte
 *   - normal guest page fault due to the guest pte marked not present, not
 *     writable, or not executable
 *
 *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
 *           a negative value on error.
 */
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
			     bool prefault)
{
	int write_fault = error_code & PFERR_WRITE_MASK;
	int user_fault = error_code & PFERR_USER_MASK;
	struct guest_walker walker;
	int r;
	pfn_t pfn;
	int level = PT_PAGE_TABLE_LEVEL;
	int force_pt_level;
	unsigned long mmu_seq;
	bool map_writable, is_self_change_mapping;

	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);

	if (unlikely(error_code & PFERR_RSVD_MASK))
		return handle_mmio_page_fault(vcpu, addr, error_code,
					      mmu_is_nested(vcpu));

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	/*
	 * Look up the guest pte for the faulting address.
	 */
	r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);

	/*
	 * The page is not mapped by the guest.  Let the guest handle it.
	 */
	if (!r) {
		pgprintk("%s: guest page fault\n", __func__);
		if (!prefault)
			inject_page_fault(vcpu, &walker.fault);

		return 0;
	}

	vcpu->arch.write_fault_to_shadow_pgtable = false;

	is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
	      &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);

	if (walker.level >= PT_DIRECTORY_LEVEL)
		force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
		   || is_self_change_mapping;
	else
		force_pt_level = 1;
	if (!force_pt_level) {
		level = min(walker.level, mapping_level(vcpu, walker.gfn));
		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
	}

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();

	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
			 &map_writable))
		return 0;

	if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
				walker.gfn, pfn, walker.pte_access, &r))
		return r;

	/*
	 * Do not change pte_access if the pfn is an MMIO page, otherwise
	 * we would cache the wrong access bits in the MMIO spte.
	 */
	if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
	     !is_write_protection(vcpu) && !user_fault &&
	      !is_noslot_pfn(pfn)) {
		walker.pte_access |= ACC_WRITE_MASK;
		walker.pte_access &= ~ACC_USER_MASK;

		/*
		 * If we converted a user page to a kernel page,
		 * so that the kernel can write to it when cr0.wp=0,
		 * then we should prevent the kernel from executing it
		 * if SMEP is enabled.
		 */
		if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
			walker.pte_access &= ~ACC_EXEC_MASK;
	}

	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
		goto out_unlock;

	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
	if (!force_pt_level)
		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
	r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
			 level, pfn, map_writable, prefault);
	++vcpu->stat.pf_fixed;
	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
	spin_unlock(&vcpu->kvm->mmu_lock);

	return r;

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return 0;
}

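/*
 * Return the guest physical address of the guest page table shadowed by a
 * last-level shadow page, taking the quadrant into account for 32-bit ptes.
 */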
static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
{
	int offset = 0;

	WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);

	if (PTTYPE == 32)
		offset = sp->role.quadrant << PT64_LEVEL_BITS;

	return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
}

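/*
 * Handle a guest INVLPG: if the last-level shadow page mapping @gva is
 * unsync, zap the corresponding shadow pte and refresh it from the current
 * guest pte.
 */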
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
	struct kvm_shadow_walk_iterator iterator;
	struct kvm_mmu_page *sp;
	int level;
	u64 *sptep;

	vcpu_clear_mmio_info(vcpu, gva);

	/*
	 * No need to check the return value here; if the topup failed,
	 * rmap_can_add() will make us skip the pte prefetch later.
	 */
	mmu_topup_memory_caches(vcpu);

	spin_lock(&vcpu->kvm->mmu_lock);
	for_each_shadow_entry(vcpu, gva, iterator) {
		level = iterator.level;
		sptep = iterator.sptep;

		sp = page_header(__pa(sptep));
		if (is_last_spte(*sptep, level)) {
			pt_element_t gpte;
			gpa_t pte_gpa;

			if (!sp->unsync)
				break;

			pte_gpa = FNAME(get_level1_sp_gpa)(sp);
			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);

			if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
				kvm_flush_remote_tlbs(vcpu->kvm);

			if (!rmap_can_add(vcpu))
				break;

			if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
						  sizeof(pt_element_t)))
				break;

			FNAME(update_pte)(vcpu, sp, sptep, &gpte);
		}

		if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
			break;
	}
	spin_unlock(&vcpu->kvm->mmu_lock);
}

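/*
 * Translate a guest virtual address to a guest physical address by walking
 * the guest page tables; on failure, fill *exception for the caller to
 * inject.
 */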
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
			       struct x86_exception *exception)
{
	struct guest_walker walker;
	gpa_t gpa = UNMAPPED_GVA;
	int r;

	r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);

	if (r) {
		gpa = gfn_to_gpa(walker.gfn);
		gpa |= vaddr & ~PAGE_MASK;
	} else if (exception)
		*exception = walker.fault;

	return gpa;
}

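/*
 * Same as FNAME(gva_to_gpa), but translate an address of a nested (L2) guest
 * through the nested MMU.
 */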
static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
				      u32 access,
				      struct x86_exception *exception)
{
	struct guest_walker walker;
	gpa_t gpa = UNMAPPED_GVA;
	int r;

	r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);

	if (r) {
		gpa = gfn_to_gpa(walker.gfn);
		gpa |= vaddr & ~PAGE_MASK;
	} else if (exception)
		*exception = walker.fault;

	return gpa;
}

/*
 * Using the cached information from sp->gfns is safe because:
 * - The spte has a reference to the struct page, so the pfn for a given gfn
 *   can't change unless all sptes pointing to it are nuked first.
 *
 * Note:
 *   We should flush all TLBs if an spte is dropped even though the guest is
 *   responsible for it.  If we don't, kvm_mmu_notifier_invalidate_page and
 *   kvm_mmu_notifier_invalidate_range_start will see that the page is no
 *   longer mapped by the guest and skip the TLB flush, allowing the guest to
 *   keep accessing the freed pages.
 *   Instead of flushing immediately, we increase kvm->tlbs_dirty to delay
 *   the TLB flush in this case.
 */
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
	int i, nr_present = 0;
	bool host_writable;
	gpa_t first_pte_gpa;

	/* direct kvm_mmu_page can not be unsync. */
	BUG_ON(sp->role.direct);

	first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		unsigned pte_access;
		pt_element_t gpte;
		gpa_t pte_gpa;
		gfn_t gfn;

		if (!sp->spt[i])
			continue;

		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);

		if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
					  sizeof(pt_element_t)))
			return -EINVAL;

		if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) {
			vcpu->kvm->tlbs_dirty++;
			continue;
		}

		gfn = gpte_to_gfn(gpte);
		pte_access = sp->role.access;
		pte_access &= gpte_access(vcpu, gpte);
		protect_clean_gpte(&pte_access, gpte);

		if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
			continue;

		if (gfn != sp->gfns[i]) {
			drop_spte(vcpu->kvm, &sp->spt[i]);
			vcpu->kvm->tlbs_dirty++;
			continue;
		}

		nr_present++;

		host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;

		set_spte(vcpu, &sp->spt[i], pte_access,
			 PT_PAGE_TABLE_LEVEL, gfn,
			 spte_to_pfn(sp->spt[i]), true, false,
			 host_writable);
	}

	return !nr_present;
}

#undef pt_element_t
#undef guest_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef PT_LVL_ADDR_MASK
#undef PT_LVL_OFFSET_MASK
#undef PT_LEVEL_BITS
#undef PT_MAX_FULL_LEVELS
#undef gpte_to_gfn
#undef gpte_to_gfn_lvl
#undef CMPXCHG