paging_tmpl.h
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

/*
 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
 * so the code in this file is compiled twice, once per pte size.
 */
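/*
 * Illustrative sketch, not part of this header: the expectation is that
 * mmu.c pulls this template in once per guest pte size, roughly as
 *
 *	#define PTTYPE 64
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 *
 *	#define PTTYPE 32
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 *
 * so that, for example, FNAME(walk_addr) below expands to paging64_walk_addr
 * in the first inclusion and paging32_walk_addr in the second.
 */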

#if PTTYPE == 64
	#define pt_element_t u64
	#define guest_walker guest_walker64
	#define FNAME(name) paging##64_##name
	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
	#define PT_LEVEL_BITS PT64_LEVEL_BITS
	#ifdef CONFIG_X86_64
	#define PT_MAX_FULL_LEVELS 4
	#define CMPXCHG cmpxchg
	#else
	#define CMPXCHG cmpxchg64
	#define PT_MAX_FULL_LEVELS 2
	#endif
#elif PTTYPE == 32
	#define pt_element_t u32
	#define guest_walker guest_walker32
	#define FNAME(name) paging##32_##name
	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
	#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
	#define PT_LEVEL_BITS PT32_LEVEL_BITS
	#define PT_MAX_FULL_LEVELS 2
	#define CMPXCHG cmpxchg
#else
	#error Invalid PTTYPE value
#endif

#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)

/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
 */
struct guest_walker {
	int level;
	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
	pt_element_t ptes[PT_MAX_FULL_LEVELS];
	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
	unsigned pt_access;
	unsigned pte_access;
	gfn_t gfn;
	u32 error_code;
};
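
/*
 * FNAME(walk_addr) fills a guest_walker in while traversing the guest page
 * tables; FNAME(fetch) then consumes table_gfn[]/ptes[] to build the shadow
 * page tables and gfn/pte_access to map the final page.
 */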

static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
{
	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}

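/*
 * The guest walker can race with the guest updating its own page tables.
 * Accessed and dirty bits are therefore set with CMPXCHG here; if the pte
 * changed under us, walk_addr() restarts the walk (the "goto walk" paths).
 */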
static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
			 gfn_t table_gfn, unsigned index,
			 pt_element_t orig_pte, pt_element_t new_pte)
{
	pt_element_t ret;
	pt_element_t *table;
	struct page *page;

	page = gfn_to_page(kvm, table_gfn);

	table = kmap_atomic(page, KM_USER0);
	ret = CMPXCHG(&table[index], orig_pte, new_pte);
	kunmap_atomic(table, KM_USER0);

	kvm_release_page_dirty(page);

	return (ret != orig_pte);
}

static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
{
	unsigned access;

	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
#if PTTYPE == 64
	if (is_nx(vcpu))
		access &= ~(gpte >> PT64_NX_SHIFT);
#endif
	return access;
}

/*
 * Fetch a guest pte for a guest virtual address
 */
static int FNAME(walk_addr)(struct guest_walker *walker,
			    struct kvm_vcpu *vcpu, gva_t addr,
			    int write_fault, int user_fault, int fetch_fault)
{
	pt_element_t pte;
	gfn_t table_gfn;
	unsigned index, pt_access, uninitialized_var(pte_access);
	gpa_t pte_gpa;
	bool eperm, present, rsvd_fault;

	trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
				     fetch_fault);
walk:
	present = true;
	eperm = rsvd_fault = false;
	walker->level = vcpu->arch.mmu.root_level;
	pte = vcpu->arch.cr3;
#if PTTYPE == 64
	if (!is_long_mode(vcpu)) {
		pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
		trace_kvm_mmu_paging_element(pte, walker->level);
		if (!is_present_gpte(pte)) {
			present = false;
			goto error;
		}
		--walker->level;
	}
#endif
	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
	       (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);

	pt_access = ACC_ALL;

	for (;;) {
		index = PT_INDEX(addr, walker->level);

		table_gfn = gpte_to_gfn(pte);
		pte_gpa = gfn_to_gpa(table_gfn);
		pte_gpa += index * sizeof(pt_element_t);
		walker->table_gfn[walker->level - 1] = table_gfn;
		walker->pte_gpa[walker->level - 1] = pte_gpa;

		if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) {
			present = false;
			break;
		}

		trace_kvm_mmu_paging_element(pte, walker->level);

		if (!is_present_gpte(pte)) {
			present = false;
			break;
		}

		if (is_rsvd_bits_set(vcpu, pte, walker->level)) {
			rsvd_fault = true;
			break;
		}

		if (write_fault && !is_writable_pte(pte))
			if (user_fault || is_write_protection(vcpu))
				eperm = true;

		if (user_fault && !(pte & PT_USER_MASK))
			eperm = true;

#if PTTYPE == 64
		if (fetch_fault && (pte & PT64_NX_MASK))
			eperm = true;
#endif

		if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) {
			trace_kvm_mmu_set_accessed_bit(table_gfn, index,
						       sizeof(pte));
			if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
			    index, pte, pte|PT_ACCESSED_MASK))
				goto walk;
			mark_page_dirty(vcpu->kvm, table_gfn);
			pte |= PT_ACCESSED_MASK;
		}

		pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);

		walker->ptes[walker->level - 1] = pte;

		if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
		    ((walker->level == PT_DIRECTORY_LEVEL) &&
				is_large_pte(pte) &&
				(PTTYPE == 64 || is_pse(vcpu))) ||
		    ((walker->level == PT_PDPE_LEVEL) &&
				is_large_pte(pte) &&
				is_long_mode(vcpu))) {
			int lvl = walker->level;

			walker->gfn = gpte_to_gfn_lvl(pte, lvl);
			walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl))
					>> PAGE_SHIFT;

			if (PTTYPE == 32 &&
			    walker->level == PT_DIRECTORY_LEVEL &&
			    is_cpuid_PSE36())
				walker->gfn += pse36_gfn_delta(pte);

			break;
		}

		pt_access = pte_access;
		--walker->level;
	}

	if (!present || eperm || rsvd_fault)
		goto error;

	if (write_fault && !is_dirty_gpte(pte)) {
		bool ret;

		trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
		ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
			    pte|PT_DIRTY_MASK);
		if (ret)
			goto walk;
		mark_page_dirty(vcpu->kvm, table_gfn);
		pte |= PT_DIRTY_MASK;
		walker->ptes[walker->level - 1] = pte;
	}

	walker->pt_access = pt_access;
	walker->pte_access = pte_access;
	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
		 __func__, (u64)pte, pte_access, pt_access);
	return 1;

error:
	walker->error_code = 0;
	if (present)
		walker->error_code |= PFERR_PRESENT_MASK;
	if (write_fault)
		walker->error_code |= PFERR_WRITE_MASK;
	if (user_fault)
		walker->error_code |= PFERR_USER_MASK;
	if (fetch_fault && is_nx(vcpu))
		walker->error_code |= PFERR_FETCH_MASK;
	if (rsvd_fault)
		walker->error_code |= PFERR_RSVD_MASK;
	trace_kvm_mmu_walker_error(walker->error_code);
	return 0;
}

static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			      u64 *spte, const void *pte)
{
	pt_element_t gpte;
	unsigned pte_access;
	pfn_t pfn;
	u64 new_spte;

	gpte = *(const pt_element_t *)pte;
	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
		if (!is_present_gpte(gpte)) {
			if (sp->unsync)
				new_spte = shadow_trap_nonpresent_pte;
			else
				new_spte = shadow_notrap_nonpresent_pte;
			__set_spte(spte, new_spte);
		}
		return;
	}
	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
	if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
		return;
	pfn = vcpu->arch.update_pte.pfn;
	if (is_error_pfn(pfn))
		return;
	if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
		return;
	kvm_get_pfn(pfn);
	/*
	 * We call mmu_set_spte() with reset_host_protection = true because
	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
	 */
	mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
		     is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
		     gpte_to_gfn(gpte), pfn, true, true);
}

static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
				struct guest_walker *gw, int level)
{
	pt_element_t curr_pte;
	gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
	u64 mask;
	int r, index;

	if (level == PT_PAGE_TABLE_LEVEL) {
		mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
		base_gpa = pte_gpa & ~mask;
		index = (pte_gpa - base_gpa) / sizeof(pt_element_t);

		r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
				gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
		curr_pte = gw->prefetch_ptes[index];
	} else
		r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
				  &curr_pte, sizeof(curr_pte));

	return r || curr_pte != gw->ptes[level - 1];
}

static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
				u64 *sptep)
{
	struct kvm_mmu_page *sp;
	pt_element_t *gptep = gw->prefetch_ptes;
	u64 *spte;
	int i;

	sp = page_header(__pa(sptep));

	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
		return;

	if (sp->role.direct)
		return __direct_pte_prefetch(vcpu, sp, sptep);

	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
	spte = sp->spt + i;

	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
		pt_element_t gpte;
		unsigned pte_access;
		gfn_t gfn;
		pfn_t pfn;
		bool dirty;

		if (spte == sptep)
			continue;

		if (*spte != shadow_trap_nonpresent_pte)
			continue;

		gpte = gptep[i];

		if (!is_present_gpte(gpte) ||
		      is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)) {
			if (!sp->unsync)
				__set_spte(spte, shadow_notrap_nonpresent_pte);
			continue;
		}

		if (!(gpte & PT_ACCESSED_MASK))
			continue;

		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
		gfn = gpte_to_gfn(gpte);
		dirty = is_dirty_gpte(gpte);
		pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
				      (pte_access & ACC_WRITE_MASK) && dirty);
		if (is_error_pfn(pfn)) {
			kvm_release_pfn_clean(pfn);
			break;
		}

		mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
			     dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
			     pfn, true, true);
	}
}

/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
 */
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
			 struct guest_walker *gw,
			 int user_fault, int write_fault, int hlevel,
			 int *ptwrite, pfn_t pfn)
{
	unsigned access = gw->pt_access;
	struct kvm_mmu_page *sp = NULL;
	bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
	int top_level;
	unsigned direct_access;
	struct kvm_shadow_walk_iterator it;

	if (!is_present_gpte(gw->ptes[gw->level - 1]))
		return NULL;

	direct_access = gw->pt_access & gw->pte_access;
	if (!dirty)
		direct_access &= ~ACC_WRITE_MASK;

	top_level = vcpu->arch.mmu.root_level;
	if (top_level == PT32E_ROOT_LEVEL)
		top_level = PT32_ROOT_LEVEL;
	/*
	 * Verify that the top-level gpte is still there.  Since the page
	 * is a root page, it is either write protected (and cannot be
	 * changed from now on) or it is invalid (in which case, we don't
	 * really care if it changes underneath us after this point).
	 */
	if (FNAME(gpte_changed)(vcpu, gw, top_level))
		goto out_gpte_changed;

	for (shadow_walk_init(&it, vcpu, addr);
	     shadow_walk_okay(&it) && it.level > gw->level;
	     shadow_walk_next(&it)) {
		gfn_t table_gfn;

		drop_large_spte(vcpu, it.sptep);

		sp = NULL;
		if (!is_shadow_present_pte(*it.sptep)) {
			table_gfn = gw->table_gfn[it.level - 2];
			sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
					      false, access, it.sptep);
		}

		/*
		 * Verify that the gpte in the page we've just write
		 * protected is still there.
		 */
		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
			goto out_gpte_changed;

		if (sp)
			link_shadow_page(it.sptep, sp);
	}

	for (;
	     shadow_walk_okay(&it) && it.level > hlevel;
	     shadow_walk_next(&it)) {
		gfn_t direct_gfn;

		validate_direct_spte(vcpu, it.sptep, direct_access);

		drop_large_spte(vcpu, it.sptep);

		if (is_shadow_present_pte(*it.sptep))
			continue;

		direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);

		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
				      true, direct_access, it.sptep);
		link_shadow_page(it.sptep, sp);
	}

	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
		     user_fault, write_fault, dirty, ptwrite, it.level,
		     gw->gfn, pfn, false, true);
	FNAME(pte_prefetch)(vcpu, gw, it.sptep);

	return it.sptep;

out_gpte_changed:
	if (sp)
		kvm_mmu_put_page(sp, it.sptep);
	kvm_release_pfn_clean(pfn);
	return NULL;
}

/*
 * Page fault handler.  There are several causes for a page fault:
 *   - there is no shadow pte for the guest pte
 *   - write access through a shadow pte marked read only so that we can set
 *     the dirty bit
 *   - write access to a shadow pte marked read only so we can update the page
 *     dirty bitmap, when userspace requests it
 *   - mmio access; in this case we will never install a present shadow pte
 *   - normal guest page fault due to the guest pte marked not present, not
 *     writable, or not executable
 *
 *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
 *           a negative value on error.
 */
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
			       u32 error_code)
{
	int write_fault = error_code & PFERR_WRITE_MASK;
	int user_fault = error_code & PFERR_USER_MASK;
	int fetch_fault = error_code & PFERR_FETCH_MASK;
	struct guest_walker walker;
	u64 *sptep;
	int write_pt = 0;
	int r;
	pfn_t pfn;
	int level = PT_PAGE_TABLE_LEVEL;
	unsigned long mmu_seq;

	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	/*
	 * Look up the guest pte for the faulting address.
	 */
	r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
			     fetch_fault);

	/*
	 * The page is not mapped by the guest.  Let the guest handle it.
	 */
	if (!r) {
		pgprintk("%s: guest page fault\n", __func__);
		inject_page_fault(vcpu, addr, walker.error_code);
		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
		return 0;
	}

	if (walker.level >= PT_DIRECTORY_LEVEL) {
		level = min(walker.level, mapping_level(vcpu, walker.gfn));
		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
	}

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();
	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);

	/* mmio */
	if (is_error_pfn(pfn))
		return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);

	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu, mmu_seq))
		goto out_unlock;

	kvm_mmu_audit(vcpu, "pre page fault");
	kvm_mmu_free_some_pages(vcpu);
	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
			     level, &write_pt, pfn);
	(void)sptep;
	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
		 sptep, *sptep, write_pt);

	if (!write_pt)
		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */

	++vcpu->stat.pf_fixed;
	kvm_mmu_audit(vcpu, "post page fault (fixed)");
	spin_unlock(&vcpu->kvm->mmu_lock);

	return write_pt;

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return 0;
}

static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
	struct kvm_shadow_walk_iterator iterator;
	struct kvm_mmu_page *sp;
	gpa_t pte_gpa = -1;
	int level;
	u64 *sptep;
	int need_flush = 0;

	spin_lock(&vcpu->kvm->mmu_lock);

	for_each_shadow_entry(vcpu, gva, iterator) {
		level = iterator.level;
		sptep = iterator.sptep;

		sp = page_header(__pa(sptep));
		if (is_last_spte(*sptep, level)) {
			int offset, shift;

			if (!sp->unsync)
				break;

			shift = PAGE_SHIFT -
				  (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
			offset = sp->role.quadrant << shift;

			pte_gpa = (sp->gfn << PAGE_SHIFT) + offset;
			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);

			if (is_shadow_present_pte(*sptep)) {
				if (is_large_pte(*sptep))
					--vcpu->kvm->stat.lpages;
				drop_spte(vcpu->kvm, sptep,
					  shadow_trap_nonpresent_pte);
				need_flush = 1;
			} else
				__set_spte(sptep, shadow_trap_nonpresent_pte);
			break;
		}

		if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
			break;
	}

	if (need_flush)
		kvm_flush_remote_tlbs(vcpu->kvm);

	atomic_inc(&vcpu->kvm->arch.invlpg_counter);

	spin_unlock(&vcpu->kvm->mmu_lock);

	if (pte_gpa == -1)
		return;

	if (mmu_topup_memory_caches(vcpu))
		return;
	kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
}

static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
			       u32 *error)
{
	struct guest_walker walker;
	gpa_t gpa = UNMAPPED_GVA;
	int r;

	r = FNAME(walk_addr)(&walker, vcpu, vaddr,
			     !!(access & PFERR_WRITE_MASK),
			     !!(access & PFERR_USER_MASK),
			     !!(access & PFERR_FETCH_MASK));

	if (r) {
		gpa = gfn_to_gpa(walker.gfn);
		gpa |= vaddr & ~PAGE_MASK;
	} else if (error)
		*error = walker.error_code;

	return gpa;
}

static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
				 struct kvm_mmu_page *sp)
{
	int i, j, offset, r;
	pt_element_t pt[256 / sizeof(pt_element_t)];
	gpa_t pte_gpa;

	if (sp->role.direct
	    || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
		nonpaging_prefetch_page(vcpu, sp);
		return;
	}

	pte_gpa = gfn_to_gpa(sp->gfn);
	if (PTTYPE == 32) {
		offset = sp->role.quadrant << PT64_LEVEL_BITS;
		pte_gpa += offset * sizeof(pt_element_t);
	}

	for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
		r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
		pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
		for (j = 0; j < ARRAY_SIZE(pt); ++j)
			if (r || is_present_gpte(pt[j]))
				sp->spt[i+j] = shadow_trap_nonpresent_pte;
			else
				sp->spt[i+j] = shadow_notrap_nonpresent_pte;
	}
}

/*
 * Using the cached information from sp->gfns is safe because:
 * - The spte has a reference to the struct page, so the pfn for a given gfn
 *   can't change unless all sptes pointing to it are nuked first.
 */
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			    bool clear_unsync)
{
	int i, offset, nr_present;
	bool reset_host_protection;
	gpa_t first_pte_gpa;

	offset = nr_present = 0;

	/* direct kvm_mmu_page can not be unsync. */
	BUG_ON(sp->role.direct);

	if (PTTYPE == 32)
		offset = sp->role.quadrant << PT64_LEVEL_BITS;

	first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		unsigned pte_access;
		pt_element_t gpte;
		gpa_t pte_gpa;
		gfn_t gfn;

		if (!is_shadow_present_pte(sp->spt[i]))
			continue;

		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);

		if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
					  sizeof(pt_element_t)))
			return -EINVAL;

		gfn = gpte_to_gfn(gpte);
		if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)
		      || gfn != sp->gfns[i] || !is_present_gpte(gpte)
		      || !(gpte & PT_ACCESSED_MASK)) {
			u64 nonpresent;

			if (is_present_gpte(gpte) || !clear_unsync)
				nonpresent = shadow_trap_nonpresent_pte;
			else
				nonpresent = shadow_notrap_nonpresent_pte;
			drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
			continue;
		}

		nr_present++;
		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
		if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
			pte_access &= ~ACC_WRITE_MASK;
			reset_host_protection = 0;
		} else {
			reset_host_protection = 1;
		}
		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
			 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
			 spte_to_pfn(sp->spt[i]), true, false,
			 reset_host_protection);
	}

	return !nr_present;
}

#undef pt_element_t
#undef guest_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef PT_LEVEL_MASK
#undef PT_LVL_ADDR_MASK
#undef PT_LVL_OFFSET_MASK
#undef PT_LEVEL_BITS
#undef PT_MAX_FULL_LEVELS
#undef gpte_to_gfn
#undef gpte_to_gfn_lvl
#undef CMPXCHG