/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

/*
 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
 * so the code in this file is compiled twice, once per pte size.
 */

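/*
 * mmu.c includes this template once for each guest pte width, roughly:
 *
 *	#define PTTYPE 64
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 *
 *	#define PTTYPE 32
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 *
 * so every FNAME() function below is emitted as both a paging64_ and a
 * paging32_ variant; the mmu context picks the right one at runtime.
 */
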
#if PTTYPE == 64
	#define pt_element_t u64
	#define guest_walker guest_walker64
	#define FNAME(name) paging##64_##name
	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_BITS PT64_LEVEL_BITS
	#ifdef CONFIG_X86_64
	#define PT_MAX_FULL_LEVELS 4
	#define CMPXCHG cmpxchg
	#else
	#define CMPXCHG cmpxchg64
	#define PT_MAX_FULL_LEVELS 2
	#endif
#elif PTTYPE == 32
	#define pt_element_t u32
	#define guest_walker guest_walker32
	#define FNAME(name) paging##32_##name
	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
	#define PT_LEVEL_BITS PT32_LEVEL_BITS
	#define PT_MAX_FULL_LEVELS 2
	#define CMPXCHG cmpxchg
#else
	#error Invalid PTTYPE value
#endif

#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)

/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
 */
struct guest_walker {
	int level;
	unsigned max_level;
	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
	pt_element_t ptes[PT_MAX_FULL_LEVELS];
	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
	pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
	unsigned pt_access;
	unsigned pte_access;
	gfn_t gfn;
	struct x86_exception fault;
};

static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
{
	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}

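/*
 * Atomically update a gpte in guest memory, used to set the accessed and
 * dirty bits without losing concurrent updates made by the guest.  Returns
 * -EFAULT if the gpte cannot be mapped, 0 if it was updated, and a positive
 * value if it changed under us (the caller then retries the walk).
 */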
static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			       pt_element_t __user *ptep_user, unsigned index,
			       pt_element_t orig_pte, pt_element_t new_pte)
{
	int npages;
	pt_element_t ret;
	pt_element_t *table;
	struct page *page;

	npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
	/* Check if the user is doing something meaningless. */
	if (unlikely(npages != 1))
		return -EFAULT;

	table = kmap_atomic(page);
	ret = CMPXCHG(&table[index], orig_pte, new_pte);
	kunmap_atomic(table);

	kvm_release_page_dirty(page);

	return (ret != orig_pte);
}

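/*
 * Return true if this gpte maps a final page frame at the current walker
 * level: either a 4k pte, or a large pte at a level where the guest paging
 * mode permits large pages.
 */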
static bool FNAME(is_last_gpte)(struct guest_walker *walker,
				struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
				pt_element_t gpte)
{
	if (walker->level == PT_PAGE_TABLE_LEVEL)
		return true;

	if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) &&
	    (PTTYPE == 64 || is_pse(vcpu)))
		return true;

	if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) &&
	    (mmu->root_level == PT64_ROOT_LEVEL))
		return true;

	return false;
}

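/*
 * Set the accessed bit in every gpte the walk traversed and, on a write
 * fault, the dirty bit in the final gpte, mirroring what the hardware
 * walker would do.  Returns 0 on success, a negative value on error, and
 * a positive value if a gpte changed under us and the walk must be retried.
 */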
static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
					     struct kvm_mmu *mmu,
					     struct guest_walker *walker,
					     int write_fault)
{
	unsigned level, index;
	pt_element_t pte, orig_pte;
	pt_element_t __user *ptep_user;
	gfn_t table_gfn;
	int ret;

	for (level = walker->max_level; level >= walker->level; --level) {
		pte = orig_pte = walker->ptes[level - 1];
		table_gfn = walker->table_gfn[level - 1];
		ptep_user = walker->ptep_user[level - 1];
		index = offset_in_page(ptep_user) / sizeof(pt_element_t);
		if (!(pte & PT_ACCESSED_MASK)) {
			trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
			pte |= PT_ACCESSED_MASK;
		}
		if (level == walker->level && write_fault && !is_dirty_gpte(pte)) {
			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
			pte |= PT_DIRTY_MASK;
		}
		if (pte == orig_pte)
			continue;

		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
		if (ret)
			return ret;

		mark_page_dirty(vcpu->kvm, table_gfn);
		walker->ptes[level - 1] = pte;
	}
	return 0;
}

/*
 * Fetch a guest pte for a guest virtual address
 */
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
				    gva_t addr, u32 access)
{
	int ret;
	pt_element_t pte;
	pt_element_t __user *uninitialized_var(ptep_user);
	gfn_t table_gfn;
	unsigned index, pt_access, uninitialized_var(pte_access);
	gpa_t pte_gpa;
	bool eperm, last_gpte;
	int offset;
	const int write_fault = access & PFERR_WRITE_MASK;
	const int user_fault  = access & PFERR_USER_MASK;
	const int fetch_fault = access & PFERR_FETCH_MASK;
	u16 errcode = 0;

	trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
	eperm = false;
	walker->level = mmu->root_level;
	pte           = mmu->get_cr3(vcpu);

#if PTTYPE == 64
	if (walker->level == PT32E_ROOT_LEVEL) {
		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
		trace_kvm_mmu_paging_element(pte, walker->level);
		if (!is_present_gpte(pte))
			goto error;
		--walker->level;
	}
#endif
	walker->max_level = walker->level;
	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
	       (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);

	pt_access = ACC_ALL;

	for (;;) {
		gfn_t real_gfn;
		unsigned long host_addr;

		index = PT_INDEX(addr, walker->level);

		table_gfn = gpte_to_gfn(pte);
		offset    = index * sizeof(pt_element_t);
		pte_gpa   = gfn_to_gpa(table_gfn) + offset;
		walker->table_gfn[walker->level - 1] = table_gfn;
		walker->pte_gpa[walker->level - 1] = pte_gpa;

		real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
					      PFERR_USER_MASK|PFERR_WRITE_MASK);
		if (unlikely(real_gfn == UNMAPPED_GVA))
			goto error;
		real_gfn = gpa_to_gfn(real_gfn);

		host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
		if (unlikely(kvm_is_error_hva(host_addr)))
			goto error;

		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
		if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
			goto error;
		walker->ptep_user[walker->level - 1] = ptep_user;

		trace_kvm_mmu_paging_element(pte, walker->level);

		if (unlikely(!is_present_gpte(pte)))
			goto error;

		if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
					      walker->level))) {
			errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
			goto error;
		}

		if (!check_write_user_access(vcpu, write_fault, user_fault,
					  pte))
			eperm = true;

#if PTTYPE == 64
		if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
			eperm = true;
#endif

		last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
		if (last_gpte) {
			pte_access = pt_access & gpte_access(vcpu, pte);
			/* check if the kernel is fetching from a user page */
			if (unlikely(pte_access & PT_USER_MASK) &&
			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
				if (fetch_fault && !user_fault)
					eperm = true;
		}

		walker->ptes[walker->level - 1] = pte;

		if (last_gpte) {
			int lvl = walker->level;
			gpa_t real_gpa;
			gfn_t gfn;
			u32 ac;

			gfn = gpte_to_gfn_lvl(pte, lvl);
			gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;

			if (PTTYPE == 32 &&
			    walker->level == PT_DIRECTORY_LEVEL &&
			    is_cpuid_PSE36())
				gfn += pse36_gfn_delta(pte);

			ac = write_fault | fetch_fault | user_fault;

			real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
						      ac);
			if (real_gpa == UNMAPPED_GVA)
				return 0;

			walker->gfn = real_gpa >> PAGE_SHIFT;

			break;
		}

		pt_access &= gpte_access(vcpu, pte);
		--walker->level;
	}

	if (unlikely(eperm)) {
		errcode |= PFERR_PRESENT_MASK;
		goto error;
	}

	if (!write_fault)
		protect_clean_gpte(&pte_access, pte);

	ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
	if (unlikely(ret < 0))
		goto error;
	else if (ret)
		goto retry_walk;

	walker->pt_access = pt_access;
	walker->pte_access = pte_access;
	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
		 __func__, (u64)pte, pte_access, pt_access);
	return 1;

error:
	errcode |= write_fault | user_fault;
	if (fetch_fault && (mmu->nx ||
			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
		errcode |= PFERR_FETCH_MASK;

	walker->fault.vector = PF_VECTOR;
	walker->fault.error_code_valid = true;
	walker->fault.error_code = errcode;
	walker->fault.address = addr;
	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;

	trace_kvm_mmu_walker_error(walker->fault.error_code);
	return 0;
}

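/*
 * FNAME(walk_addr) walks the vcpu's ordinary guest page tables, while
 * FNAME(walk_addr_nested) walks through the nested mmu context, which is
 * what translates L2 virtual addresses when a nested guest is running.
 */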
static int FNAME(walk_addr)(struct guest_walker *walker,
			    struct kvm_vcpu *vcpu, gva_t addr, u32 access)
{
	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
					access);
}

static int FNAME(walk_addr_nested)(struct guest_walker *walker,
				   struct kvm_vcpu *vcpu, gva_t addr,
				   u32 access)
{
	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
					addr, access);
}

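/*
 * Return true (and drop the spte) if the gpte should not be propagated
 * into the shadow page tables: it has reserved bits set, is not present,
 * or does not have its accessed bit set yet.
 */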
static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp, u64 *spte,
				    pt_element_t gpte)
{
	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
		goto no_present;

	if (!is_present_gpte(gpte))
		goto no_present;

	if (!(gpte & PT_ACCESSED_MASK))
		goto no_present;

	return false;

no_present:
	drop_spte(vcpu->kvm, spte);
	return true;
}

static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			      u64 *spte, const void *pte)
{
	pt_element_t gpte;
	unsigned pte_access;
	pfn_t pfn;

	gpte = *(const pt_element_t *)pte;
	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
		return;

	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
	pte_access = sp->role.access & gpte_access(vcpu, gpte);
	protect_clean_gpte(&pte_access, gpte);
	pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
	if (mmu_invalid_pfn(pfn))
		return;

	/*
	 * we call mmu_set_spte() with host_writable = true because
	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
	 */
	mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
		     NULL, PT_PAGE_TABLE_LEVEL,
		     gpte_to_gfn(gpte), pfn, true, true);
}

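/*
 * Re-read the gpte for the given walker level and report whether it still
 * matches the value seen during the walk.  The last-level read also fills
 * gw->prefetch_ptes, which FNAME(pte_prefetch) reuses below.
 */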
static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
				struct guest_walker *gw, int level)
{
	pt_element_t curr_pte;
	gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
	u64 mask;
	int r, index;

	if (level == PT_PAGE_TABLE_LEVEL) {
		mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
		base_gpa = pte_gpa & ~mask;
		index = (pte_gpa - base_gpa) / sizeof(pt_element_t);

		r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
				gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
		curr_pte = gw->prefetch_ptes[index];
	} else
		r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
				  &curr_pte, sizeof(curr_pte));

	return r || curr_pte != gw->ptes[level - 1];
}

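/*
 * Opportunistically install sptes for the gptes surrounding the faulting
 * one, using the copy cached in gw->prefetch_ptes, so that nearby accesses
 * do not fault again.
 */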
static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
				u64 *sptep)
{
	struct kvm_mmu_page *sp;
	pt_element_t *gptep = gw->prefetch_ptes;
	u64 *spte;
	int i;

	sp = page_header(__pa(sptep));

	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
		return;

	if (sp->role.direct)
		return __direct_pte_prefetch(vcpu, sp, sptep);

	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
	spte = sp->spt + i;

	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
		pt_element_t gpte;
		unsigned pte_access;
		gfn_t gfn;
		pfn_t pfn;

		if (spte == sptep)
			continue;

		if (is_shadow_present_pte(*spte))
			continue;

		gpte = gptep[i];

		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
			continue;

		pte_access = sp->role.access & gpte_access(vcpu, gpte);
		protect_clean_gpte(&pte_access, gpte);
		gfn = gpte_to_gfn(gpte);
		pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
				      pte_access & ACC_WRITE_MASK);
		if (mmu_invalid_pfn(pfn))
			break;

		mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
			     NULL, PT_PAGE_TABLE_LEVEL, gfn,
			     pfn, true, true);
	}
}

/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
 */
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
			 struct guest_walker *gw,
			 int user_fault, int write_fault, int hlevel,
			 int *emulate, pfn_t pfn, bool map_writable,
			 bool prefault)
{
	unsigned access = gw->pt_access;
	struct kvm_mmu_page *sp = NULL;
	int top_level;
	unsigned direct_access;
	struct kvm_shadow_walk_iterator it;

	if (!is_present_gpte(gw->ptes[gw->level - 1]))
		return NULL;

	direct_access = gw->pte_access;

	top_level = vcpu->arch.mmu.root_level;
	if (top_level == PT32E_ROOT_LEVEL)
		top_level = PT32_ROOT_LEVEL;
	/*
	 * Verify that the top-level gpte is still there.  Since the page
	 * is a root page, it is either write protected (and cannot be
	 * changed from now on) or it is invalid (in which case, we don't
	 * really care if it changes underneath us after this point).
	 */
	if (FNAME(gpte_changed)(vcpu, gw, top_level))
		goto out_gpte_changed;

	for (shadow_walk_init(&it, vcpu, addr);
	     shadow_walk_okay(&it) && it.level > gw->level;
	     shadow_walk_next(&it)) {
		gfn_t table_gfn;

		clear_sp_write_flooding_count(it.sptep);
		drop_large_spte(vcpu, it.sptep);

		sp = NULL;
		if (!is_shadow_present_pte(*it.sptep)) {
			table_gfn = gw->table_gfn[it.level - 2];
			sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
					      false, access, it.sptep);
		}

		/*
		 * Verify that the gpte in the page we've just write
		 * protected is still there.
		 */
		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
			goto out_gpte_changed;

		if (sp)
			link_shadow_page(it.sptep, sp);
	}

	for (;
	     shadow_walk_okay(&it) && it.level > hlevel;
	     shadow_walk_next(&it)) {
		gfn_t direct_gfn;

		clear_sp_write_flooding_count(it.sptep);
		validate_direct_spte(vcpu, it.sptep, direct_access);

		drop_large_spte(vcpu, it.sptep);

		if (is_shadow_present_pte(*it.sptep))
			continue;

		direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);

		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
				      true, direct_access, it.sptep);
		link_shadow_page(it.sptep, sp);
	}

	clear_sp_write_flooding_count(it.sptep);
	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
		     user_fault, write_fault, emulate, it.level,
		     gw->gfn, pfn, prefault, map_writable);
	FNAME(pte_prefetch)(vcpu, gw, it.sptep);

	return it.sptep;

out_gpte_changed:
	if (sp)
		kvm_mmu_put_page(sp, it.sptep);
	kvm_release_pfn_clean(pfn);
	return NULL;
}

/*
 * Page fault handler.  There are several causes for a page fault:
 *   - there is no shadow pte for the guest pte
 *   - write access through a shadow pte marked read only so that we can set
 *     the dirty bit
 *   - write access to a shadow pte marked read only so we can update the page
 *     dirty bitmap, when userspace requests it
 *   - mmio access; in this case we will never install a present shadow pte
 *   - normal guest page fault due to the guest pte marked not present, not
 *     writable, or not executable
 *
 *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
 *           a negative value on error.
 */
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
			     bool prefault)
{
	int write_fault = error_code & PFERR_WRITE_MASK;
	int user_fault = error_code & PFERR_USER_MASK;
	struct guest_walker walker;
	u64 *sptep;
	int emulate = 0;
	int r;
	pfn_t pfn;
	int level = PT_PAGE_TABLE_LEVEL;
	int force_pt_level;
	unsigned long mmu_seq;
	bool map_writable;

	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);

	if (unlikely(error_code & PFERR_RSVD_MASK))
		return handle_mmio_page_fault(vcpu, addr, error_code,
					      mmu_is_nested(vcpu));

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	/*
	 * Look up the guest pte for the faulting address.
	 */
	r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);

	/*
	 * The page is not mapped by the guest.  Let the guest handle it.
	 */
	if (!r) {
		pgprintk("%s: guest page fault\n", __func__);
		if (!prefault)
			inject_page_fault(vcpu, &walker.fault);

		return 0;
	}

	if (walker.level >= PT_DIRECTORY_LEVEL)
		force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
	else
		force_pt_level = 1;
	if (!force_pt_level) {
		level = min(walker.level, mapping_level(vcpu, walker.gfn));
		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
	}

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();

	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
			 &map_writable))
		return 0;

	if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
				walker.gfn, pfn, walker.pte_access, &r))
		return r;

	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu, mmu_seq))
		goto out_unlock;

	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
	kvm_mmu_free_some_pages(vcpu);
	if (!force_pt_level)
		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
			     level, &emulate, pfn, map_writable, prefault);
	(void)sptep;
	pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
		 sptep, *sptep, emulate);

	++vcpu->stat.pf_fixed;
	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
	spin_unlock(&vcpu->kvm->mmu_lock);

	return emulate;

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return 0;
}

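/*
 * Return the gpa of the first gpte covered by a last-level shadow page.
 * For 32-bit guests the quadrant encoded in sp->role selects which part
 * of the guest page table this shadow page maps.
 */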
static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
{
	int offset = 0;

	WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);

	if (PTTYPE == 32)
		offset = sp->role.quadrant << PT64_LEVEL_BITS;

	return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
}

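/*
 * Emulate a guest invlpg for @gva: for an unsync last-level shadow page,
 * zap the spte and refill it from the current gpte.
 */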
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
	struct kvm_shadow_walk_iterator iterator;
	struct kvm_mmu_page *sp;
	int level;
	u64 *sptep;

	vcpu_clear_mmio_info(vcpu, gva);

	/*
	 * No need to check return value here, rmap_can_add() can
	 * help us to skip pte prefetch later.
	 */
	mmu_topup_memory_caches(vcpu);

	spin_lock(&vcpu->kvm->mmu_lock);
	for_each_shadow_entry(vcpu, gva, iterator) {
		level = iterator.level;
		sptep = iterator.sptep;

		sp = page_header(__pa(sptep));
		if (is_last_spte(*sptep, level)) {
			pt_element_t gpte;
			gpa_t pte_gpa;

			if (!sp->unsync)
				break;

			pte_gpa = FNAME(get_level1_sp_gpa)(sp);
			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);

			if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
				kvm_flush_remote_tlbs(vcpu->kvm);

			if (!rmap_can_add(vcpu))
				break;

			if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
						  sizeof(pt_element_t)))
				break;

			FNAME(update_pte)(vcpu, sp, sptep, &gpte);
		}

		if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
			break;
	}
	spin_unlock(&vcpu->kvm->mmu_lock);
}

static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
			       struct x86_exception *exception)
{
	struct guest_walker walker;
	gpa_t gpa = UNMAPPED_GVA;
	int r;

	r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);

	if (r) {
		gpa = gfn_to_gpa(walker.gfn);
		gpa |= vaddr & ~PAGE_MASK;
	} else if (exception)
		*exception = walker.fault;

	return gpa;
}

static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
				      u32 access,
				      struct x86_exception *exception)
{
	struct guest_walker walker;
	gpa_t gpa = UNMAPPED_GVA;
	int r;

	r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);

	if (r) {
		gpa = gfn_to_gpa(walker.gfn);
		gpa |= vaddr & ~PAGE_MASK;
	} else if (exception)
		*exception = walker.fault;

	return gpa;
}

/*
 * Using the cached information from sp->gfns is safe because:
 * - The spte has a reference to the struct page, so the pfn for a given gfn
 *   can't change unless all sptes pointing to it are nuked first.
 *
 * Note:
 *   We should flush all tlbs if the spte is dropped even though the guest is
 *   responsible for it.  If we don't, kvm_mmu_notifier_invalidate_page and
 *   kvm_mmu_notifier_invalidate_range_start see that the mapped page is no
 *   longer used by the guest and skip the tlb flush, so the guest could still
 *   access the freed pages.
 *   Instead of flushing immediately, we increase kvm->tlbs_dirty to delay the
 *   tlb flush in this case.
 */
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
	int i, nr_present = 0;
	bool host_writable;
	gpa_t first_pte_gpa;

	/* direct kvm_mmu_page can not be unsync. */
	BUG_ON(sp->role.direct);

	first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		unsigned pte_access;
		pt_element_t gpte;
		gpa_t pte_gpa;
		gfn_t gfn;

		if (!sp->spt[i])
			continue;

		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);

		if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
					  sizeof(pt_element_t)))
			return -EINVAL;

		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
			vcpu->kvm->tlbs_dirty++;
			continue;
		}

		gfn = gpte_to_gfn(gpte);
		pte_access = sp->role.access;
		pte_access &= gpte_access(vcpu, gpte);
		protect_clean_gpte(&pte_access, gpte);

		if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
			continue;

		if (gfn != sp->gfns[i]) {
			drop_spte(vcpu->kvm, &sp->spt[i]);
			vcpu->kvm->tlbs_dirty++;
			continue;
		}

		nr_present++;

		host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;

		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
			 PT_PAGE_TABLE_LEVEL, gfn,
			 spte_to_pfn(sp->spt[i]), true, false,
			 host_writable);
	}

	return !nr_present;
}

#undef pt_element_t
#undef guest_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef PT_LVL_ADDR_MASK
#undef PT_LVL_OFFSET_MASK
#undef PT_LEVEL_BITS
#undef PT_MAX_FULL_LEVELS
#undef gpte_to_gfn
#undef gpte_to_gfn_lvl
#undef CMPXCHG