/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

/*
 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
 * so the code in this file is compiled twice, once per pte size.
 */

#if PTTYPE == 64
	#define pt_element_t u64
	#define guest_walker guest_walker64
	#define FNAME(name) paging##64_##name
	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_BITS PT64_LEVEL_BITS
	#ifdef CONFIG_X86_64
	#define PT_MAX_FULL_LEVELS 4
	#define CMPXCHG cmpxchg
	#else
	#define CMPXCHG cmpxchg64
	#define PT_MAX_FULL_LEVELS 2
	#endif
#elif PTTYPE == 32
	#define pt_element_t u32
	#define guest_walker guest_walker32
	#define FNAME(name) paging##32_##name
	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
	#define PT_LEVEL_BITS PT32_LEVEL_BITS
	#define PT_MAX_FULL_LEVELS 2
	#define CMPXCHG cmpxchg
#else
	#error Invalid PTTYPE value
#endif

#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)

/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
 */
struct guest_walker {
	int level;
	unsigned max_level;
	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
	pt_element_t ptes[PT_MAX_FULL_LEVELS];
	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
	pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
	unsigned pt_access;
	unsigned pte_access;
	gfn_t gfn;
	struct x86_exception fault;
};

static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
{
	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}

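/*
 * Atomically update a guest pte through its user-space mapping, so that
 * accessed/dirty bits are only set if the pte has not been changed (for
 * example by another vcpu) since it was read during the walk.  Returns a
 * negative value if the pte cannot be mapped, a non-zero value if the pte
 * was modified underneath us, and 0 on success.
 */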
static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			       pt_element_t __user *ptep_user, unsigned index,
			       pt_element_t orig_pte, pt_element_t new_pte)
{
	int npages;
	pt_element_t ret;
	pt_element_t *table;
	struct page *page;

	npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
	/* Check if the user is doing something meaningless. */
	if (unlikely(npages != 1))
		return -EFAULT;

	table = kmap_atomic(page);
	ret = CMPXCHG(&table[index], orig_pte, new_pte);
	kunmap_atomic(table);

	kvm_release_page_dirty(page);

	return (ret != orig_pte);
}

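/*
 * Set the accessed bit on every pte visited by the walk and, on a write
 * fault, the dirty bit on the final-level pte, using the values cached in
 * the walker.  Returns a negative value on error, a positive value if a
 * pte changed under us (the caller then retries the walk), and 0 on
 * success.
 */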
static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
					     struct kvm_mmu *mmu,
					     struct guest_walker *walker,
					     int write_fault)
{
	unsigned level, index;
	pt_element_t pte, orig_pte;
	pt_element_t __user *ptep_user;
	gfn_t table_gfn;
	int ret;

	for (level = walker->max_level; level >= walker->level; --level) {
		pte = orig_pte = walker->ptes[level - 1];
		table_gfn = walker->table_gfn[level - 1];
		ptep_user = walker->ptep_user[level - 1];
		index = offset_in_page(ptep_user) / sizeof(pt_element_t);
		if (!(pte & PT_ACCESSED_MASK)) {
			trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
			pte |= PT_ACCESSED_MASK;
		}
		if (level == walker->level && write_fault && !is_dirty_gpte(pte)) {
			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
			pte |= PT_DIRTY_MASK;
		}
		if (pte == orig_pte)
			continue;

		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
		if (ret)
			return ret;

		mark_page_dirty(vcpu->kvm, table_gfn);
		walker->ptes[level - 1] = pte;
	}
	return 0;
}

/*
 * Fetch a guest pte for a guest virtual address
 */
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
				    gva_t addr, u32 access)
{
	int ret;
	pt_element_t pte;
	pt_element_t __user *uninitialized_var(ptep_user);
	gfn_t table_gfn;
	unsigned index, pt_access, pte_access, accessed_dirty;
	gpa_t pte_gpa;
	int offset;
	const int write_fault = access & PFERR_WRITE_MASK;
	const int user_fault  = access & PFERR_USER_MASK;
	const int fetch_fault = access & PFERR_FETCH_MASK;
	u16 errcode = 0;
	gpa_t real_gpa;
	gfn_t gfn;

	trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
	walker->level = mmu->root_level;
	pte           = mmu->get_cr3(vcpu);

#if PTTYPE == 64
	if (walker->level == PT32E_ROOT_LEVEL) {
		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
		trace_kvm_mmu_paging_element(pte, walker->level);
		if (!is_present_gpte(pte))
			goto error;
		--walker->level;
	}
#endif
	walker->max_level = walker->level;
	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
	       (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);

	accessed_dirty = PT_ACCESSED_MASK;
	pt_access = pte_access = ACC_ALL;
	++walker->level;

	do {
		gfn_t real_gfn;
		unsigned long host_addr;

		pt_access &= pte_access;
		--walker->level;

		index = PT_INDEX(addr, walker->level);

		table_gfn = gpte_to_gfn(pte);
		offset    = index * sizeof(pt_element_t);
		pte_gpa   = gfn_to_gpa(table_gfn) + offset;
		walker->table_gfn[walker->level - 1] = table_gfn;
		walker->pte_gpa[walker->level - 1] = pte_gpa;

		real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
					      PFERR_USER_MASK|PFERR_WRITE_MASK);
		if (unlikely(real_gfn == UNMAPPED_GVA))
			goto error;
		real_gfn = gpa_to_gfn(real_gfn);

		host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
		if (unlikely(kvm_is_error_hva(host_addr)))
			goto error;

		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
		if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
			goto error;
		walker->ptep_user[walker->level - 1] = ptep_user;

		trace_kvm_mmu_paging_element(pte, walker->level);

		if (unlikely(!is_present_gpte(pte)))
			goto error;

		if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
					      walker->level))) {
			errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
			goto error;
		}

		accessed_dirty &= pte;
		pte_access = pt_access & gpte_access(vcpu, pte);

		walker->ptes[walker->level - 1] = pte;
	} while (!is_last_gpte(mmu, walker->level, pte));

	if (unlikely(permission_fault(mmu, pte_access, access))) {
		errcode |= PFERR_PRESENT_MASK;
		goto error;
	}

	gfn = gpte_to_gfn_lvl(pte, walker->level);
	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;

	if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
		gfn += pse36_gfn_delta(pte);

	real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access);
	if (real_gpa == UNMAPPED_GVA)
		return 0;

	walker->gfn = real_gpa >> PAGE_SHIFT;

	if (!write_fault)
		protect_clean_gpte(&pte_access, pte);
	else
		/*
		 * On a write fault, fold the dirty bit into accessed_dirty by
		 * shifting it one place right.
		 */
		accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);

	if (unlikely(!accessed_dirty)) {
		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
		if (unlikely(ret < 0))
			goto error;
		else if (ret)
			goto retry_walk;
	}

	walker->pt_access = pt_access;
	walker->pte_access = pte_access;
	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
		 __func__, (u64)pte, pte_access, pt_access);
	return 1;

error:
	errcode |= write_fault | user_fault;
	if (fetch_fault && (mmu->nx ||
			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
		errcode |= PFERR_FETCH_MASK;

	walker->fault.vector = PF_VECTOR;
	walker->fault.error_code_valid = true;
	walker->fault.error_code = errcode;
	walker->fault.address = addr;
	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;

	trace_kvm_mmu_walker_error(walker->fault.error_code);
	return 0;
}

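/*
 * FNAME(walk_addr) walks the guest page tables of the vcpu's current MMU;
 * FNAME(walk_addr_nested) performs the same walk through
 * vcpu->arch.nested_mmu for a nested guest.
 */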
static int FNAME(walk_addr)(struct guest_walker *walker,
			    struct kvm_vcpu *vcpu, gva_t addr, u32 access)
{
	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
					access);
}

static int FNAME(walk_addr_nested)(struct guest_walker *walker,
				   struct kvm_vcpu *vcpu, gva_t addr,
				   u32 access)
{
	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
					addr, access);
}

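/*
 * Build a last-level shadow pte for a single guest pte.  Shared by pte
 * prefetch and by FNAME(update_pte); returns false if the gpte cannot be
 * used (it is no longer valid, or its pfn cannot be mapped).
 */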
static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
		     u64 *spte, pt_element_t gpte, bool no_dirty_log)
{
	unsigned pte_access;
	gfn_t gfn;
	pfn_t pfn;

	if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
		return false;

	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);

	gfn = gpte_to_gfn(gpte);
	pte_access = sp->role.access & gpte_access(vcpu, gpte);
	protect_clean_gpte(&pte_access, gpte);
	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
			no_dirty_log && (pte_access & ACC_WRITE_MASK));
	if (is_error_pfn(pfn))
		return false;

	/*
	 * we call mmu_set_spte() with host_writable = true because
	 * pte_prefetch_gfn_to_pfn always gets a writable pfn.
	 */
	mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0,
		     NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true);

	return true;
}

static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			      u64 *spte, const void *pte)
{
	pt_element_t gpte = *(const pt_element_t *)pte;

	FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
}

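/*
 * Re-read the guest pte at the given walker level and compare it with the
 * value cached during the walk.  At the last level the read also fills
 * gw->prefetch_ptes, which FNAME(pte_prefetch) uses later.
 */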
static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
				struct guest_walker *gw, int level)
{
	pt_element_t curr_pte;
	gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
	u64 mask;
	int r, index;

	if (level == PT_PAGE_TABLE_LEVEL) {
		mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
		base_gpa = pte_gpa & ~mask;
		index = (pte_gpa - base_gpa) / sizeof(pt_element_t);

		r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
				gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
		curr_pte = gw->prefetch_ptes[index];
	} else
		r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
				  &curr_pte, sizeof(curr_pte));

	return r || curr_pte != gw->ptes[level - 1];
}

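/*
 * Opportunistically create sptes for the gptes surrounding the faulting
 * one (previously read into gw->prefetch_ptes), to avoid further faults
 * on adjacent pages.  Direct shadow pages are handled by
 * __direct_pte_prefetch() instead.
 */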
static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
				u64 *sptep)
{
	struct kvm_mmu_page *sp;
	pt_element_t *gptep = gw->prefetch_ptes;
	u64 *spte;
	int i;

	sp = page_header(__pa(sptep));

	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
		return;

	if (sp->role.direct)
		return __direct_pte_prefetch(vcpu, sp, sptep);

	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
	spte = sp->spt + i;

	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
		if (spte == sptep)
			continue;

		if (is_shadow_present_pte(*spte))
			continue;

		if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
			break;
	}
}

/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
 * If the guest tries to write a write-protected page, we need to
 * emulate this operation; return 1 to indicate this case.
 */
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
			 struct guest_walker *gw,
			 int write_fault, int hlevel,
			 pfn_t pfn, bool map_writable, bool prefault)
{
	struct kvm_mmu_page *sp = NULL;
	struct kvm_shadow_walk_iterator it;
	unsigned direct_access, access = gw->pt_access;
	int top_level, emulate = 0;

	direct_access = gw->pte_access;

	top_level = vcpu->arch.mmu.root_level;
	if (top_level == PT32E_ROOT_LEVEL)
		top_level = PT32_ROOT_LEVEL;
	/*
	 * Verify that the top-level gpte is still there.  Since the page
	 * is a root page, it is either write protected (and cannot be
	 * changed from now on) or it is invalid (in which case, we don't
	 * really care if it changes underneath us after this point).
	 */
	if (FNAME(gpte_changed)(vcpu, gw, top_level))
		goto out_gpte_changed;

	for (shadow_walk_init(&it, vcpu, addr);
	     shadow_walk_okay(&it) && it.level > gw->level;
	     shadow_walk_next(&it)) {
		gfn_t table_gfn;

		clear_sp_write_flooding_count(it.sptep);
		drop_large_spte(vcpu, it.sptep);

		sp = NULL;
		if (!is_shadow_present_pte(*it.sptep)) {
			table_gfn = gw->table_gfn[it.level - 2];
			sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
					      false, access, it.sptep);
		}

		/*
		 * Verify that the gpte in the page we've just write
		 * protected is still there.
		 */
		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
			goto out_gpte_changed;

		if (sp)
			link_shadow_page(it.sptep, sp);
	}

	for (;
	     shadow_walk_okay(&it) && it.level > hlevel;
	     shadow_walk_next(&it)) {
		gfn_t direct_gfn;

		clear_sp_write_flooding_count(it.sptep);
		validate_direct_spte(vcpu, it.sptep, direct_access);

		drop_large_spte(vcpu, it.sptep);

		if (is_shadow_present_pte(*it.sptep))
			continue;

		direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);

		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
				      true, direct_access, it.sptep);
		link_shadow_page(it.sptep, sp);
	}

	clear_sp_write_flooding_count(it.sptep);
	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
		     write_fault, &emulate, it.level,
		     gw->gfn, pfn, prefault, map_writable);
	FNAME(pte_prefetch)(vcpu, gw, it.sptep);

	return emulate;

out_gpte_changed:
	if (sp)
		kvm_mmu_put_page(sp, it.sptep);
	kvm_release_pfn_clean(pfn);
	return 0;
}

/*
 * Check whether the mapped gfn can write its own page table in the current
 * mapping.
 *
 * This is a helper for FNAME(page_fault). When the guest uses a large page
 * to map a writable gfn that is itself in use as a page table, we should
 * force kvm to map it with a small page, because a new shadow page will be
 * created when kvm establishes the shadow page table, which in turn stops
 * kvm from using a large page for it. Doing this early avoids an
 * unnecessary #PF and emulation.
 *
 * @write_fault_to_shadow_pgtable will return true if the fault gfn is
 * currently used as its page table.
 *
 * Note: the PDPT page table is not checked for PAE-32 bit guests. That is ok
 * since the PDPT is always shadowed, which means we can not use a large page
 * to map the gfn that is used as the PDPT.
 */
static bool
FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
			      struct guest_walker *walker, int user_fault,
			      bool *write_fault_to_shadow_pgtable)
{
	int level;
	gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
	bool self_changed = false;

	if (!(walker->pte_access & ACC_WRITE_MASK ||
	      (!is_write_protection(vcpu) && !user_fault)))
		return false;

	for (level = walker->level; level <= walker->max_level; level++) {
		gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];

		self_changed |= !(gfn & mask);
		*write_fault_to_shadow_pgtable |= !gfn;
	}

	return self_changed;
}

/*
 * Page fault handler.  There are several causes for a page fault:
 *   - there is no shadow pte for the guest pte
 *   - write access through a shadow pte marked read only so that we can set
 *     the dirty bit
 *   - write access to a shadow pte marked read only so we can update the page
 *     dirty bitmap, when userspace requests it
 *   - mmio access; in this case we will never install a present shadow pte
 *   - normal guest page fault due to the guest pte marked not present, not
 *     writable, or not executable
 *
 *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
 *           a negative value on error.
 */
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
			     bool prefault)
{
	int write_fault = error_code & PFERR_WRITE_MASK;
	int user_fault = error_code & PFERR_USER_MASK;
	struct guest_walker walker;
	int r;
	pfn_t pfn;
	int level = PT_PAGE_TABLE_LEVEL;
	int force_pt_level;
	unsigned long mmu_seq;
	bool map_writable, is_self_change_mapping;

	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);

	if (unlikely(error_code & PFERR_RSVD_MASK))
		return handle_mmio_page_fault(vcpu, addr, error_code,
					      mmu_is_nested(vcpu));

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	/*
	 * Look up the guest pte for the faulting address.
	 */
	r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);

	/*
	 * The page is not mapped by the guest.  Let the guest handle it.
	 */
	if (!r) {
		pgprintk("%s: guest page fault\n", __func__);
		if (!prefault)
			inject_page_fault(vcpu, &walker.fault);

		return 0;
	}

	vcpu->arch.write_fault_to_shadow_pgtable = false;

	is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
	      &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);

	if (walker.level >= PT_DIRECTORY_LEVEL)
		force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
		   || is_self_change_mapping;
	else
		force_pt_level = 1;
	if (!force_pt_level) {
		level = min(walker.level, mapping_level(vcpu, walker.gfn));
		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
	}

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();

	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
			 &map_writable))
		return 0;

	if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
				walker.gfn, pfn, walker.pte_access, &r))
		return r;

	/*
	 * Do not change pte_access if the pfn is a mmio page, otherwise
	 * we will cache the incorrect access into mmio spte.
	 */
	if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
	     !is_write_protection(vcpu) && !user_fault &&
	      !is_noslot_pfn(pfn)) {
		walker.pte_access |= ACC_WRITE_MASK;
		walker.pte_access &= ~ACC_USER_MASK;

		/*
		 * If we converted a user page to a kernel page,
		 * so that the kernel can write to it when cr0.wp=0,
		 * then we should prevent the kernel from executing it
		 * if SMEP is enabled.
		 */
		if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
			walker.pte_access &= ~ACC_EXEC_MASK;
	}

	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
		goto out_unlock;

	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
	kvm_mmu_free_some_pages(vcpu);
	if (!force_pt_level)
		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
	r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
			 level, pfn, map_writable, prefault);
	++vcpu->stat.pf_fixed;
	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
	spin_unlock(&vcpu->kvm->mmu_lock);

	return r;

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return 0;
}

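/*
 * Return the guest physical address of the first pte covered by a
 * last-level shadow page, taking the quadrant into account for 32-bit
 * guest page tables.
 */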
static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
{
	int offset = 0;

	WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);

	if (PTTYPE == 32)
		offset = sp->role.quadrant << PT64_LEVEL_BITS;

	return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
}

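/*
 * INVLPG emulation: zap the shadow pte that maps @gva and, if the shadow
 * page is unsync, immediately refresh it from the current guest pte.
 */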
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
	struct kvm_shadow_walk_iterator iterator;
	struct kvm_mmu_page *sp;
	int level;
	u64 *sptep;

	vcpu_clear_mmio_info(vcpu, gva);

	/*
	 * No need to check return value here, rmap_can_add() can
	 * help us to skip pte prefetch later.
	 */
	mmu_topup_memory_caches(vcpu);

	spin_lock(&vcpu->kvm->mmu_lock);
	for_each_shadow_entry(vcpu, gva, iterator) {
		level = iterator.level;
		sptep = iterator.sptep;

		sp = page_header(__pa(sptep));
		if (is_last_spte(*sptep, level)) {
			pt_element_t gpte;
			gpa_t pte_gpa;

			if (!sp->unsync)
				break;

			pte_gpa = FNAME(get_level1_sp_gpa)(sp);
			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);

			if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
				kvm_flush_remote_tlbs(vcpu->kvm);

			if (!rmap_can_add(vcpu))
				break;

			if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
						  sizeof(pt_element_t)))
				break;

			FNAME(update_pte)(vcpu, sp, sptep, &gpte);
		}

		if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
			break;
	}
	spin_unlock(&vcpu->kvm->mmu_lock);
}

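/*
 * Translate a guest virtual address to a guest physical address by walking
 * the guest page tables; on failure the walk's fault information is
 * returned through @exception.  The _nested variant below translates an L2
 * address through the nested MMU.
 */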
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
			       struct x86_exception *exception)
{
	struct guest_walker walker;
	gpa_t gpa = UNMAPPED_GVA;
	int r;

	r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);

	if (r) {
		gpa = gfn_to_gpa(walker.gfn);
		gpa |= vaddr & ~PAGE_MASK;
	} else if (exception)
		*exception = walker.fault;

	return gpa;
}

static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
				      u32 access,
				      struct x86_exception *exception)
{
	struct guest_walker walker;
	gpa_t gpa = UNMAPPED_GVA;
	int r;

	r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);

	if (r) {
		gpa = gfn_to_gpa(walker.gfn);
		gpa |= vaddr & ~PAGE_MASK;
	} else if (exception)
		*exception = walker.fault;

	return gpa;
}

/*
 * Using the cached information from sp->gfns is safe because:
 * - The spte has a reference to the struct page, so the pfn for a given gfn
 *   can't change unless all sptes pointing to it are nuked first.
 *
 * Note:
 *   We should flush all tlbs if the spte is dropped even though the guest is
 *   responsible for it. If we don't, kvm_mmu_notifier_invalidate_page and
 *   kvm_mmu_notifier_invalidate_range_start see that the mapping page is no
 *   longer used by the guest and skip the tlb flush, so the guest could still
 *   access the freed pages.
 *   We increase kvm->tlbs_dirty to delay the tlb flush in this case.
 */
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
	int i, nr_present = 0;
	bool host_writable;
	gpa_t first_pte_gpa;

	/* direct kvm_mmu_page can not be unsync. */
	BUG_ON(sp->role.direct);

	first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		unsigned pte_access;
		pt_element_t gpte;
		gpa_t pte_gpa;
		gfn_t gfn;

		if (!sp->spt[i])
			continue;

		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);

		if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
					  sizeof(pt_element_t)))
			return -EINVAL;

		if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) {
			vcpu->kvm->tlbs_dirty++;
			continue;
		}

		gfn = gpte_to_gfn(gpte);
		pte_access = sp->role.access;
		pte_access &= gpte_access(vcpu, gpte);
		protect_clean_gpte(&pte_access, gpte);

		if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
			continue;

		if (gfn != sp->gfns[i]) {
			drop_spte(vcpu->kvm, &sp->spt[i]);
			vcpu->kvm->tlbs_dirty++;
			continue;
		}

		nr_present++;

		host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;

		set_spte(vcpu, &sp->spt[i], pte_access,
			 PT_PAGE_TABLE_LEVEL, gfn,
			 spte_to_pfn(sp->spt[i]), true, false,
			 host_writable);
	}

	return !nr_present;
}

#undef pt_element_t
#undef guest_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef PT_LVL_ADDR_MASK
#undef PT_LVL_OFFSET_MASK
#undef PT_LEVEL_BITS
#undef PT_MAX_FULL_LEVELS
#undef gpte_to_gfn
#undef gpte_to_gfn_lvl
#undef CMPXCHG