/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

/*
 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
 * so the code in this file is compiled twice, once per pte size.
 */
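/*
 * For illustration only (the actual include site lives in mmu.c, not in
 * this header): the two instantiations are typically produced by defining
 * PTTYPE around an include of this file, roughly:
 *
 *	#define PTTYPE 64
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 *
 *	#define PTTYPE 32
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 */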

#if PTTYPE == 64
	#define pt_element_t u64
	#define guest_walker guest_walker64
	#define FNAME(name) paging##64_##name
	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_BITS PT64_LEVEL_BITS
	#ifdef CONFIG_X86_64
	#define PT_MAX_FULL_LEVELS 4
	#define CMPXCHG cmpxchg
	#else
	#define CMPXCHG cmpxchg64
	#define PT_MAX_FULL_LEVELS 2
	#endif
#elif PTTYPE == 32
	#define pt_element_t u32
	#define guest_walker guest_walker32
	#define FNAME(name) paging##32_##name
	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
	#define PT_LEVEL_BITS PT32_LEVEL_BITS
	#define PT_MAX_FULL_LEVELS 2
	#define CMPXCHG cmpxchg
#else
	#error Invalid PTTYPE value
#endif

#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)

/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
 */
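/*
 * For every level that was traversed the walker records the guest table's
 * gfn, the gpa and userspace address of the pte that was read, and the pte
 * value itself (so accessed/dirty bits can be set later and the walk can be
 * revalidated).  pt_access/pte_access accumulate the table and leaf access
 * rights, gfn is the final translation, and fault describes the page fault
 * to inject if the walk fails.
 */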
struct guest_walker {
	int level;
	unsigned max_level;
	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
	pt_element_t ptes[PT_MAX_FULL_LEVELS];
	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
	pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
	unsigned pt_access;
	unsigned pte_access;
	gfn_t gfn;
	struct x86_exception fault;
};

static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
{
	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}

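/*
 * Atomically update a guest pte in place through the host userspace
 * mapping of the guest page table.  Used when setting accessed/dirty bits
 * so that a racing guest write is detected rather than overwritten.
 * Returns a negative value if the page cannot be pinned, a non-zero value
 * if the pte had already changed (the caller should retry the walk), and
 * 0 on success.
 */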
static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			       pt_element_t __user *ptep_user, unsigned index,
			       pt_element_t orig_pte, pt_element_t new_pte)
{
	int npages;
	pt_element_t ret;
	pt_element_t *table;
	struct page *page;

	npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
	/* Check if the user is doing something meaningless. */
	if (unlikely(npages != 1))
		return -EFAULT;

	table = kmap_atomic(page);
	ret = CMPXCHG(&table[index], orig_pte, new_pte);
	kunmap_atomic(table);

	kvm_release_page_dirty(page);

	return (ret != orig_pte);
}

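/*
 * Write accessed/dirty state back into the guest page tables: the accessed
 * bit is set in every pte used by the walk and, on a write fault, the dirty
 * bit is also set in the leaf pte.  Updates go through cmpxchg_gpte(), so
 * if the guest modified a pte concurrently the non-zero return value makes
 * walk_addr_generic() restart the walk.
 */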
static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
					     struct kvm_mmu *mmu,
					     struct guest_walker *walker,
					     int write_fault)
{
	unsigned level, index;
	pt_element_t pte, orig_pte;
	pt_element_t __user *ptep_user;
	gfn_t table_gfn;
	int ret;

	for (level = walker->max_level; level >= walker->level; --level) {
		pte = orig_pte = walker->ptes[level - 1];
		table_gfn = walker->table_gfn[level - 1];
		ptep_user = walker->ptep_user[level - 1];
		index = offset_in_page(ptep_user) / sizeof(pt_element_t);
		if (!(pte & PT_ACCESSED_MASK)) {
			trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
			pte |= PT_ACCESSED_MASK;
		}
		if (level == walker->level && write_fault && !is_dirty_gpte(pte)) {
			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
			pte |= PT_DIRTY_MASK;
		}
		if (pte == orig_pte)
			continue;

		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
		if (ret)
			return ret;

		mark_page_dirty(vcpu->kvm, table_gfn);
		walker->ptes[level - 1] = pte;
	}
	return 0;
}

/*
 * Fetch a guest pte for a guest virtual address
 */
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
				    gva_t addr, u32 access)
{
	int ret;
	pt_element_t pte;
	pt_element_t __user *uninitialized_var(ptep_user);
	gfn_t table_gfn;
	unsigned index, pt_access, pte_access, accessed_dirty, shift;
	gpa_t pte_gpa;
	int offset;
	const int write_fault = access & PFERR_WRITE_MASK;
	const int user_fault  = access & PFERR_USER_MASK;
	const int fetch_fault = access & PFERR_FETCH_MASK;
	u16 errcode = 0;
	gpa_t real_gpa;
	gfn_t gfn;

	trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
	walker->level = mmu->root_level;
	pte           = mmu->get_cr3(vcpu);

#if PTTYPE == 64
	if (walker->level == PT32E_ROOT_LEVEL) {
		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
		trace_kvm_mmu_paging_element(pte, walker->level);
		if (!is_present_gpte(pte))
			goto error;
		--walker->level;
	}
#endif
	walker->max_level = walker->level;
	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
	       (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);

	accessed_dirty = PT_ACCESSED_MASK;
	pt_access = pte_access = ACC_ALL;
	++walker->level;

	do {
		gfn_t real_gfn;
		unsigned long host_addr;

		pt_access &= pte_access;
		--walker->level;

		index = PT_INDEX(addr, walker->level);

		table_gfn = gpte_to_gfn(pte);
		offset    = index * sizeof(pt_element_t);
		pte_gpa   = gfn_to_gpa(table_gfn) + offset;
		walker->table_gfn[walker->level - 1] = table_gfn;
		walker->pte_gpa[walker->level - 1] = pte_gpa;

		real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
					      PFERR_USER_MASK|PFERR_WRITE_MASK);
		if (unlikely(real_gfn == UNMAPPED_GVA))
			goto error;
		real_gfn = gpa_to_gfn(real_gfn);

		host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
		if (unlikely(kvm_is_error_hva(host_addr)))
			goto error;

		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
		if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
			goto error;
		walker->ptep_user[walker->level - 1] = ptep_user;

		trace_kvm_mmu_paging_element(pte, walker->level);

		if (unlikely(!is_present_gpte(pte)))
			goto error;

		if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
					      walker->level))) {
			errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
			goto error;
		}

		accessed_dirty &= pte;
		pte_access = pt_access & gpte_access(vcpu, pte);

		walker->ptes[walker->level - 1] = pte;
	} while (!is_last_gpte(mmu, walker->level, pte));

	if (unlikely(permission_fault(mmu, pte_access, access))) {
		errcode |= PFERR_PRESENT_MASK;
		goto error;
	}

	gfn = gpte_to_gfn_lvl(pte, walker->level);
	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;

	if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
		gfn += pse36_gfn_delta(pte);

	real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access);
	if (real_gpa == UNMAPPED_GVA)
		return 0;

	walker->gfn = real_gpa >> PAGE_SHIFT;

	if (!write_fault)
		protect_clean_gpte(&pte_access, pte);

	/*
	 * On a write fault, fold the dirty bit into accessed_dirty by shifting it one
	 * place right.
	 *
	 * On a read fault, do nothing.
	 */
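	/*
	 * Concretely: write_fault is either PFERR_WRITE_MASK or 0, so the
	 * shift below is PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT for a write and
	 * 0 otherwise, lining the pte's dirty bit up with the accessed bit
	 * before it is folded into accessed_dirty.
	 */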
	shift = write_fault >> ilog2(PFERR_WRITE_MASK);
	shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT;
	accessed_dirty &= pte >> shift;

	if (unlikely(!accessed_dirty)) {
		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
		if (unlikely(ret < 0))
			goto error;
		else if (ret)
			goto retry_walk;
	}

	walker->pt_access = pt_access;
	walker->pte_access = pte_access;
	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
		 __func__, (u64)pte, pte_access, pt_access);
	return 1;

error:
	errcode |= write_fault | user_fault;
	if (fetch_fault && (mmu->nx ||
			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
		errcode |= PFERR_FETCH_MASK;

	walker->fault.vector = PF_VECTOR;
	walker->fault.error_code_valid = true;
	walker->fault.error_code = errcode;
	walker->fault.address = addr;
	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;

	trace_kvm_mmu_walker_error(walker->fault.error_code);
	return 0;
}

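/*
 * Convenience wrappers around walk_addr_generic() for the vcpu's regular
 * MMU context and, when a nested guest is active, its nested MMU context.
 */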
static int FNAME(walk_addr)(struct guest_walker *walker,
			    struct kvm_vcpu *vcpu, gva_t addr, u32 access)
{
	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
					access);
}

static int FNAME(walk_addr_nested)(struct guest_walker *walker,
				   struct kvm_vcpu *vcpu, gva_t addr,
				   u32 access)
{
	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
					addr, access);
}

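/*
 * A gpte is only worth (pre)fetching if it is present, has no reserved
 * bits set and has already been accessed by the guest; otherwise drop the
 * spte and tell the caller to skip it.
 */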
static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp, u64 *spte,
				    pt_element_t gpte)
{
	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
		goto no_present;

	if (!is_present_gpte(gpte))
		goto no_present;

	if (!(gpte & PT_ACCESSED_MASK))
		goto no_present;

	return false;

no_present:
	drop_spte(vcpu->kvm, spte);
	return true;
}

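/*
 * Refresh a single spte after the guest pte behind it has been modified
 * (pte-write and invlpg paths).  Runs in atomic context, hence the
 * gfn_to_pfn_atomic() below; if the new gpte is unusable the spte is
 * simply dropped.
 */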
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			      u64 *spte, const void *pte)
{
	pt_element_t gpte;
	unsigned pte_access;
	pfn_t pfn;

	gpte = *(const pt_element_t *)pte;
	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
		return;

	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
	pte_access = sp->role.access & gpte_access(vcpu, gpte);
	protect_clean_gpte(&pte_access, gpte);
	pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
	if (is_invalid_pfn(pfn))
		return;

	/*
	 * We call mmu_set_spte() with host_writable = true because the pfn
	 * was fetched from get_user_pages(write = 1).
	 */
	mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
		     NULL, PT_PAGE_TABLE_LEVEL,
		     gpte_to_gfn(gpte), pfn, true, true);
}

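/*
 * Re-read the guest pte used at the given level of the walk and report
 * whether it still matches what the walker recorded.  For the last level
 * the read goes through gw->prefetch_ptes so that pte_prefetch() can reuse
 * the neighbouring entries without extra guest memory accesses.
 */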
static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
				struct guest_walker *gw, int level)
{
	pt_element_t curr_pte;
	gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
	u64 mask;
	int r, index;

	if (level == PT_PAGE_TABLE_LEVEL) {
		mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
		base_gpa = pte_gpa & ~mask;
		index = (pte_gpa - base_gpa) / sizeof(pt_element_t);

		r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
				gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
		curr_pte = gw->prefetch_ptes[index];
	} else
		r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
				  &curr_pte, sizeof(curr_pte));

	return r || curr_pte != gw->ptes[level - 1];
}

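/*
 * Speculatively map the guest ptes that were read into gw->prefetch_ptes
 * next to the faulting one, so that adjacent pages do not each take their
 * own fault.  Only last-level shadow pages are handled here; direct pages
 * are passed to __direct_pte_prefetch(), and sptes that are already
 * present are left alone.
 */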
static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
				u64 *sptep)
{
	struct kvm_mmu_page *sp;
	pt_element_t *gptep = gw->prefetch_ptes;
	u64 *spte;
	int i;

	sp = page_header(__pa(sptep));

	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
		return;

	if (sp->role.direct)
		return __direct_pte_prefetch(vcpu, sp, sptep);

	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
	spte = sp->spt + i;

	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
		pt_element_t gpte;
		unsigned pte_access;
		gfn_t gfn;
		pfn_t pfn;

		if (spte == sptep)
			continue;

		if (is_shadow_present_pte(*spte))
			continue;

		gpte = gptep[i];

		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
			continue;

		pte_access = sp->role.access & gpte_access(vcpu, gpte);
		protect_clean_gpte(&pte_access, gpte);
		gfn = gpte_to_gfn(gpte);
		pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
				      pte_access & ACC_WRITE_MASK);
		if (is_invalid_pfn(pfn))
			break;

		mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
			     NULL, PT_PAGE_TABLE_LEVEL, gfn,
			     pfn, true, true);
	}
}

/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
 */
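/*
 * The first loop below mirrors the remaining guest levels with indirect
 * shadow pages, rechecking each guest pte once its shadow page has been
 * write protected.  The second loop covers the case where the guest
 * mapping is larger than the level we actually map at (e.g. a guest huge
 * page shadowed by 4k sptes) and fills those levels with direct shadow
 * pages that need no rechecking.
 */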
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
			 struct guest_walker *gw,
			 int user_fault, int write_fault, int hlevel,
			 int *emulate, pfn_t pfn, bool map_writable,
			 bool prefault)
{
	unsigned access = gw->pt_access;
	struct kvm_mmu_page *sp = NULL;
	int top_level;
	unsigned direct_access;
	struct kvm_shadow_walk_iterator it;

	if (!is_present_gpte(gw->ptes[gw->level - 1]))
		return NULL;

	direct_access = gw->pte_access;

	top_level = vcpu->arch.mmu.root_level;
	if (top_level == PT32E_ROOT_LEVEL)
		top_level = PT32_ROOT_LEVEL;
	/*
	 * Verify that the top-level gpte is still there.  Since the page
	 * is a root page, it is either write protected (and cannot be
	 * changed from now on) or it is invalid (in which case, we don't
	 * really care if it changes underneath us after this point).
	 */
	if (FNAME(gpte_changed)(vcpu, gw, top_level))
		goto out_gpte_changed;

	for (shadow_walk_init(&it, vcpu, addr);
	     shadow_walk_okay(&it) && it.level > gw->level;
	     shadow_walk_next(&it)) {
		gfn_t table_gfn;

		clear_sp_write_flooding_count(it.sptep);
		drop_large_spte(vcpu, it.sptep);

		sp = NULL;
		if (!is_shadow_present_pte(*it.sptep)) {
			table_gfn = gw->table_gfn[it.level - 2];
			sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
					      false, access, it.sptep);
		}

		/*
		 * Verify that the gpte in the page we've just write
		 * protected is still there.
		 */
		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
			goto out_gpte_changed;

		if (sp)
			link_shadow_page(it.sptep, sp);
	}

	for (;
	     shadow_walk_okay(&it) && it.level > hlevel;
	     shadow_walk_next(&it)) {
		gfn_t direct_gfn;

		clear_sp_write_flooding_count(it.sptep);
		validate_direct_spte(vcpu, it.sptep, direct_access);

		drop_large_spte(vcpu, it.sptep);

		if (is_shadow_present_pte(*it.sptep))
			continue;

		direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);

		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
				      true, direct_access, it.sptep);
		link_shadow_page(it.sptep, sp);
	}

	clear_sp_write_flooding_count(it.sptep);
	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
		     user_fault, write_fault, emulate, it.level,
		     gw->gfn, pfn, prefault, map_writable);
	FNAME(pte_prefetch)(vcpu, gw, it.sptep);

	return it.sptep;

out_gpte_changed:
	if (sp)
		kvm_mmu_put_page(sp, it.sptep);
	kvm_release_pfn_clean(pfn);
	return NULL;
}

/*
 * Page fault handler.  There are several causes for a page fault:
 *   - there is no shadow pte for the guest pte
 *   - write access through a shadow pte marked read only so that we can set
 *     the dirty bit
 *   - write access to a shadow pte marked read only so we can update the page
 *     dirty bitmap, when userspace requests it
 *   - mmio access; in this case we will never install a present shadow pte
 *   - normal guest page fault due to the guest pte marked not present, not
 *     writable, or not executable
 *
 *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
 *           a negative value on error.
 */
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
			     bool prefault)
{
	int write_fault = error_code & PFERR_WRITE_MASK;
	int user_fault = error_code & PFERR_USER_MASK;
	struct guest_walker walker;
	u64 *sptep;
	int emulate = 0;
	int r;
	pfn_t pfn;
	int level = PT_PAGE_TABLE_LEVEL;
	int force_pt_level;
	unsigned long mmu_seq;
	bool map_writable;

	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);

	if (unlikely(error_code & PFERR_RSVD_MASK))
		return handle_mmio_page_fault(vcpu, addr, error_code,
					      mmu_is_nested(vcpu));

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	/*
	 * Look up the guest pte for the faulting address.
	 */
	r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);

	/*
	 * The page is not mapped by the guest.  Let the guest handle it.
	 */
	if (!r) {
		pgprintk("%s: guest page fault\n", __func__);
		if (!prefault)
			inject_page_fault(vcpu, &walker.fault);

		return 0;
	}

	if (walker.level >= PT_DIRECTORY_LEVEL)
		force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
	else
		force_pt_level = 1;
	if (!force_pt_level) {
		level = min(walker.level, mapping_level(vcpu, walker.gfn));
		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
	}

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();

	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
			 &map_writable))
		return 0;

	if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
				walker.gfn, pfn, walker.pte_access, &r))
		return r;

	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu, mmu_seq))
		goto out_unlock;

	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
	kvm_mmu_free_some_pages(vcpu);
	if (!force_pt_level)
		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
			     level, &emulate, pfn, map_writable, prefault);
	(void)sptep;
	pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
		 sptep, *sptep, emulate);

	++vcpu->stat.pf_fixed;
	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
	spin_unlock(&vcpu->kvm->mmu_lock);

	return emulate;

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return 0;
}

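/*
 * Return the guest physical address of the page table shadowed by a
 * last-level shadow page.  With 32-bit guest ptes one shadow page covers
 * only part of the guest page table, and sp->role.quadrant selects which
 * part.
 */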
static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
{
	int offset = 0;

	WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);

	if (PTTYPE == 32)
		offset = sp->role.quadrant << PT64_LEVEL_BITS;

	return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
}

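/*
 * Emulate INVLPG: walk the shadow page table for gva, zap the last-level
 * spte if it belongs to an unsync shadow page, and opportunistically
 * rebuild it from the current guest pte.
 */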
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
	struct kvm_shadow_walk_iterator iterator;
	struct kvm_mmu_page *sp;
	int level;
	u64 *sptep;

	vcpu_clear_mmio_info(vcpu, gva);

	/*
	 * No need to check return value here, rmap_can_add() can
	 * help us to skip pte prefetch later.
	 */
	mmu_topup_memory_caches(vcpu);

	spin_lock(&vcpu->kvm->mmu_lock);
	for_each_shadow_entry(vcpu, gva, iterator) {
		level = iterator.level;
		sptep = iterator.sptep;

		sp = page_header(__pa(sptep));
		if (is_last_spte(*sptep, level)) {
			pt_element_t gpte;
			gpa_t pte_gpa;

			if (!sp->unsync)
				break;

			pte_gpa = FNAME(get_level1_sp_gpa)(sp);
			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);

			if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
				kvm_flush_remote_tlbs(vcpu->kvm);

			if (!rmap_can_add(vcpu))
				break;

			if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
						  sizeof(pt_element_t)))
				break;

			FNAME(update_pte)(vcpu, sp, sptep, &gpte);
		}

		if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
			break;
	}
	spin_unlock(&vcpu->kvm->mmu_lock);
}

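/*
 * Translate a guest virtual address to a guest physical address with a
 * software walk of the guest page tables; on failure the fault is stored
 * in *exception instead of being injected.
 */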
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
			       struct x86_exception *exception)
{
	struct guest_walker walker;
	gpa_t gpa = UNMAPPED_GVA;
	int r;

	r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);

	if (r) {
		gpa = gfn_to_gpa(walker.gfn);
		gpa |= vaddr & ~PAGE_MASK;
	} else if (exception)
		*exception = walker.fault;

	return gpa;
}

static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
				      u32 access,
				      struct x86_exception *exception)
{
	struct guest_walker walker;
	gpa_t gpa = UNMAPPED_GVA;
	int r;

	r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);

	if (r) {
		gpa = gfn_to_gpa(walker.gfn);
		gpa |= vaddr & ~PAGE_MASK;
	} else if (exception)
		*exception = walker.fault;

	return gpa;
}

/*
 * Using the cached information from sp->gfns is safe because:
 * - The spte has a reference to the struct page, so the pfn for a given gfn
 *   can't change unless all sptes pointing to it are nuked first.
 *
 * Note:
 *   We should flush all tlbs if spte is dropped even though guest is
 *   responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page
 *   and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
 *   used by guest then tlbs are not flushed, so guest is allowed to access the
 *   freed pages.
 *   And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
 */
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
	int i, nr_present = 0;
	bool host_writable;
	gpa_t first_pte_gpa;

	/* direct kvm_mmu_page can not be unsync. */
	BUG_ON(sp->role.direct);

	first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		unsigned pte_access;
		pt_element_t gpte;
		gpa_t pte_gpa;
		gfn_t gfn;

		if (!sp->spt[i])
			continue;

		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);

		if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
					  sizeof(pt_element_t)))
			return -EINVAL;

		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
			vcpu->kvm->tlbs_dirty++;
			continue;
		}

		gfn = gpte_to_gfn(gpte);
		pte_access = sp->role.access;
		pte_access &= gpte_access(vcpu, gpte);
		protect_clean_gpte(&pte_access, gpte);

		if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
			continue;

		if (gfn != sp->gfns[i]) {
			drop_spte(vcpu->kvm, &sp->spt[i]);
			vcpu->kvm->tlbs_dirty++;
			continue;
		}

		nr_present++;

		host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;

		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
			 PT_PAGE_TABLE_LEVEL, gfn,
			 spte_to_pfn(sp->spt[i]), true, false,
			 host_writable);
	}

	return !nr_present;
}

#undef pt_element_t
#undef guest_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef PT_LVL_ADDR_MASK
#undef PT_LVL_OFFSET_MASK
#undef PT_LEVEL_BITS
#undef PT_MAX_FULL_LEVELS
#undef gpte_to_gfn
#undef gpte_to_gfn_lvl
#undef CMPXCHG