// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * AMD SVM support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#define pr_fmt(fmt) "SVM: " fmt

#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
#include <linux/kernel.h>

#include <asm/msr-index.h>
#include <asm/debugreg.h>

#include "kvm_emulate.h"
#include "trace.h"
#include "mmu.h"
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "svm.h"

#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK

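/*
 * Reflect a nested page fault that KVM hit while running L2 back into L1 as
 * an SVM_EXIT_NPF vmexit: exit_info_1 carries the fault's error code and
 * exit_info_2 the faulting GPA.
 */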
static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
				       struct x86_exception *fault)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
		/*
		 * TODO: track the cause of the nested page fault, and
		 * correctly fill in the high bits of exit_info_1.
		 */
		svm->vmcb->control.exit_code = SVM_EXIT_NPF;
		svm->vmcb->control.exit_code_hi = 0;
		svm->vmcb->control.exit_info_1 = (1ULL << 32);
		svm->vmcb->control.exit_info_2 = fault->address;
	}

	svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
	svm->vmcb->control.exit_info_1 |= fault->error_code;

	nested_svm_vmexit(svm);
}

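/*
 * #PF injection while L2 is active and NPT is not in use: if L1 intercepts
 * #PF and no nested VMRUN is pending, reflect the fault to L1 as an
 * exception vmexit; otherwise inject the #PF directly into L2.
 */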
static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	WARN_ON(!is_guest_mode(vcpu));

	if (vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
	    !svm->nested.nested_run_pending) {
		svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
		svm->vmcb->control.exit_code_hi = 0;
		svm->vmcb->control.exit_info_1 = fault->error_code;
		svm->vmcb->control.exit_info_2 = fault->address;
		nested_svm_vmexit(svm);
	} else {
		kvm_inject_page_fault(vcpu, fault);
	}
}

static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 cr3 = svm->nested.ctl.nested_cr3;
	u64 pdpte;
	int ret;

	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
				       offset_in_page(cr3) + index * 8, 8);
	if (ret)
		return 0;
	return pdpte;
}

static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	return svm->nested.ctl.nested_cr3;
}

static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;

	/*
	 * The NPT format depends on L1's CR4 and EFER, which is in vmcb01.  Note,
	 * when called via KVM_SET_NESTED_STATE, that state may _not_ match current
	 * vCPU state.  CR0.WP is explicitly ignored, while CR0.PG is required.
	 */
	kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
				svm->vmcb01.ptr->save.efer,
				svm->nested.ctl.nested_cr3);
	vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
	vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
	vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
}

static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

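/*
 * Recompute vmcb02's intercept vectors while L2 is active: start from L1's
 * intercepts (vmcb01), then OR in L2's intercepts from the cached vmcb12
 * controls, with a few host-mandated adjustments (CR8/VINTR, VMMCALL, SMI,
 * VMLOAD/VMSAVE).
 */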
void recalc_intercepts(struct vcpu_svm *svm)
{
	struct vmcb_control_area *c, *h, *g;
	unsigned int i;

	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);

	if (!is_guest_mode(&svm->vcpu))
		return;

	c = &svm->vmcb->control;
	h = &svm->vmcb01.ptr->control;
	g = &svm->nested.ctl;

	for (i = 0; i < MAX_INTERCEPT; i++)
		c->intercepts[i] = h->intercepts[i];

	if (g->int_ctl & V_INTR_MASKING_MASK) {
		/* We only want the cr8 intercept bits of L1 */
		vmcb_clr_intercept(c, INTERCEPT_CR8_READ);
		vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);

		/*
		 * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
		 * affect any interrupt we may want to inject; therefore,
		 * interrupt window vmexits are irrelevant to L0.
		 */
		vmcb_clr_intercept(c, INTERCEPT_VINTR);
	}

	/* We don't want to see VMMCALLs from a nested guest */
	vmcb_clr_intercept(c, INTERCEPT_VMMCALL);

	for (i = 0; i < MAX_INTERCEPT; i++)
		c->intercepts[i] |= g->intercepts[i];

	/* If SMI is not intercepted, ignore guest SMI intercept as well  */
	if (!intercept_smi)
		vmcb_clr_intercept(c, INTERCEPT_SMI);

	vmcb_set_intercept(c, INTERCEPT_VMLOAD);
	vmcb_set_intercept(c, INTERCEPT_VMSAVE);
}

static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
	/*
	 * This function merges the msr permission bitmaps of kvm and the
	 * nested vmcb. It is optimized in that it only merges the parts where
	 * the kvm msr permission bitmap may contain zero bits
	 */
	int i;

	if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
		return true;

	for (i = 0; i < MSRPM_OFFSETS; i++) {
		u32 value, p;
		u64 offset;

		if (msrpm_offsets[i] == 0xffffffff)
			break;

		p      = msrpm_offsets[i];
		offset = svm->nested.ctl.msrpm_base_pa + (p * 4);

		if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
			return false;

		svm->nested.msrpm[p] = svm->msrpm[p] | value;
	}

	svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));

	return true;
}

/*
 * Bits 11:0 of bitmap address are ignored by hardware
 */
static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
{
	u64 addr = PAGE_ALIGN(pa);

	return kvm_vcpu_is_legal_gpa(vcpu, addr) &&
	    kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
}

static bool nested_svm_check_tlb_ctl(struct kvm_vcpu *vcpu, u8 tlb_ctl)
{
	/* Nested FLUSHBYASID is not supported yet.  */
	switch (tlb_ctl) {
	case TLB_CONTROL_DO_NOTHING:
	case TLB_CONTROL_FLUSH_ALL_ASID:
		return true;
	default:
		return false;
	}
}

static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
				       struct vmcb_control_area *control)
{
	if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN)))
		return false;

	if (CC(control->asid == 0))
		return false;

	if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))
		return false;

	if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
					   MSRPM_SIZE)))
		return false;
	if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa,
					   IOPM_SIZE)))
		return false;

	if (CC(!nested_svm_check_tlb_ctl(vcpu, control->tlb_ctl)))
		return false;

	return true;
}

/* Common checks that apply to both L1 and L2 state.  */
static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu,
				     struct vmcb_save_area_cached *save)
{
	if (CC(!(save->efer & EFER_SVME)))
		return false;

	if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
	    CC(save->cr0 & ~0xffffffffULL))
		return false;

	if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
		return false;

	/*
	 * These checks are also performed by KVM_SET_SREGS,
	 * except that EFER.LMA is not checked by SVM against
	 * CR0.PG && EFER.LME.
	 */
	if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
		if (CC(!(save->cr4 & X86_CR4_PAE)) ||
		    CC(!(save->cr0 & X86_CR0_PE)) ||
		    CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3)))
			return false;
	}

	if (CC(!kvm_is_valid_cr4(vcpu, save->cr4)))
		return false;

	if (CC(!kvm_valid_efer(vcpu, save->efer)))
		return false;

	return true;
}

static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_save_area_cached *save = &svm->nested.save;

	return __nested_vmcb_check_save(vcpu, save);
}

static
void __nested_copy_vmcb_control_to_cache(struct vmcb_control_area *to,
					 struct vmcb_control_area *from)
{
	unsigned int i;

	for (i = 0; i < MAX_INTERCEPT; i++)
		to->intercepts[i] = from->intercepts[i];

	to->iopm_base_pa        = from->iopm_base_pa;
	to->msrpm_base_pa       = from->msrpm_base_pa;
	to->tsc_offset          = from->tsc_offset;
	to->tlb_ctl             = from->tlb_ctl;
	to->int_ctl             = from->int_ctl;
	to->int_vector          = from->int_vector;
	to->int_state           = from->int_state;
	to->exit_code           = from->exit_code;
	to->exit_code_hi        = from->exit_code_hi;
	to->exit_info_1         = from->exit_info_1;
	to->exit_info_2         = from->exit_info_2;
	to->exit_int_info       = from->exit_int_info;
	to->exit_int_info_err   = from->exit_int_info_err;
	to->nested_ctl          = from->nested_ctl;
	to->event_inj           = from->event_inj;
	to->event_inj_err       = from->event_inj_err;
	to->nested_cr3          = from->nested_cr3;
	to->virt_ext            = from->virt_ext;
	to->pause_filter_count  = from->pause_filter_count;
	to->pause_filter_thresh = from->pause_filter_thresh;

	/* Copy asid here because nested_vmcb_check_controls will check it.  */
	to->asid           = from->asid;
	to->msrpm_base_pa &= ~0x0fffULL;
	to->iopm_base_pa  &= ~0x0fffULL;
}

void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
				       struct vmcb_control_area *control)
{
	__nested_copy_vmcb_control_to_cache(&svm->nested.ctl, control);
}

static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to,
					     struct vmcb_save_area *from)
{
	/*
	 * Copy only fields that are validated, as we need them
	 * to avoid TOC/TOU races.
	 */
	to->efer = from->efer;
	to->cr0 = from->cr0;
	to->cr3 = from->cr3;
	to->cr4 = from->cr4;

	to->dr6 = from->dr6;
	to->dr7 = from->dr7;
}

void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
				    struct vmcb_save_area *save)
{
	__nested_copy_vmcb_save_to_cache(&svm->nested.save, save);
}

/*
 * Synchronize fields that are written by the processor, so that
 * they can be copied back into the vmcb12.
 */
void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
{
	u32 mask;
	svm->nested.ctl.event_inj      = svm->vmcb->control.event_inj;
	svm->nested.ctl.event_inj_err  = svm->vmcb->control.event_inj_err;

	/* Only a few fields of int_ctl are written by the processor.  */
	mask = V_IRQ_MASK | V_TPR_MASK;
	if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) &&
	    svm_is_intercept(svm, INTERCEPT_VINTR)) {
		/*
		 * In order to request an interrupt window, L0 is usurping
		 * svm->vmcb->control.int_ctl and possibly setting V_IRQ
		 * even if it was clear in L1's VMCB.  Restoring it would be
		 * wrong.  However, in this case V_IRQ will remain true until
		 * interrupt_window_interception calls svm_clear_vintr and
		 * restores int_ctl.  We can just leave it aside.
		 */
		mask &= ~V_IRQ_MASK;
	}
	svm->nested.ctl.int_ctl        &= ~mask;
	svm->nested.ctl.int_ctl        |= svm->vmcb->control.int_ctl & mask;
}

/*
 * Transfer any event that L0 or L1 wanted to inject into L2 to
 * EXIT_INT_INFO.
 */
static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
						struct vmcb *vmcb12)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	u32 exit_int_info = 0;
	unsigned int nr;

	if (vcpu->arch.exception.injected) {
		nr = vcpu->arch.exception.nr;
		exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT;

		if (vcpu->arch.exception.has_error_code) {
			exit_int_info |= SVM_EVTINJ_VALID_ERR;
			vmcb12->control.exit_int_info_err =
				vcpu->arch.exception.error_code;
		}

	} else if (vcpu->arch.nmi_injected) {
		exit_int_info = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;

	} else if (vcpu->arch.interrupt.injected) {
		nr = vcpu->arch.interrupt.nr;
		exit_int_info = nr | SVM_EVTINJ_VALID;

		if (vcpu->arch.interrupt.soft)
			exit_int_info |= SVM_EVTINJ_TYPE_SOFT;
		else
			exit_int_info |= SVM_EVTINJ_TYPE_INTR;
	}

	vmcb12->control.exit_int_info = exit_int_info;
}

static inline bool nested_npt_enabled(struct vcpu_svm *svm)
{
	return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
}

static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
{
	/*
	 * TODO: optimize unconditional TLB flush/MMU sync.  A partial list of
	 * things to fix before this can be conditional:
	 *
	 *  - Flush TLBs for both L1 and L2 remote TLB flush
	 *  - Honor L1's request to flush an ASID on nested VMRUN
	 *  - Sync nested NPT MMU on VMRUN that flushes L2's ASID[*]
	 *  - Don't crush a pending TLB flush in vmcb02 on nested VMRUN
	 *  - Flush L1's ASID on KVM_REQ_TLB_FLUSH_GUEST
	 *
	 * [*] Unlike nested EPT, SVM's ASID management can invalidate nested
	 *     NPT guest-physical mappings on VMRUN.
	 */
	kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
	kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}

/*
 * Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true
 * if we are emulating VM-Entry into a guest with NPT enabled.
 */
static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_npt, bool reload_pdptrs)
{
	if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3)))
		return -EINVAL;

	if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) &&
	    CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)))
		return -EINVAL;

	if (!nested_npt)
		kvm_mmu_new_pgd(vcpu, cr3);

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);

	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
	kvm_init_mmu(vcpu);

	return 0;
}

void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
{
	if (!svm->nested.vmcb02.ptr)
		return;

	/* FIXME: merge g_pat from vmcb01 and vmcb12.  */
	svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
}

static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
{
	bool new_vmcb12 = false;

	nested_vmcb02_compute_g_pat(svm);

	/* Load the nested guest state */
	if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
		new_vmcb12 = true;
		svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
	}

	if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
		svm->vmcb->save.es = vmcb12->save.es;
		svm->vmcb->save.cs = vmcb12->save.cs;
		svm->vmcb->save.ss = vmcb12->save.ss;
		svm->vmcb->save.ds = vmcb12->save.ds;
		svm->vmcb->save.cpl = vmcb12->save.cpl;
		vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
	}

	if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
		svm->vmcb->save.gdtr = vmcb12->save.gdtr;
		svm->vmcb->save.idtr = vmcb12->save.idtr;
		vmcb_mark_dirty(svm->vmcb, VMCB_DT);
	}

	kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);

	svm_set_efer(&svm->vcpu, svm->nested.save.efer);

	svm_set_cr0(&svm->vcpu, svm->nested.save.cr0);
	svm_set_cr4(&svm->vcpu, svm->nested.save.cr4);

	svm->vcpu.arch.cr2 = vmcb12->save.cr2;

	kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
	kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
	kvm_rip_write(&svm->vcpu, vmcb12->save.rip);

	/* In case we don't even reach vcpu_run, the fields are not updated */
	svm->vmcb->save.rax = vmcb12->save.rax;
	svm->vmcb->save.rsp = vmcb12->save.rsp;
	svm->vmcb->save.rip = vmcb12->save.rip;

	/* These bits will be set properly on the first execution when new_vmcb12 is true */
	if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
		svm->vmcb->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
		svm->vcpu.arch.dr6  = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
		vmcb_mark_dirty(svm->vmcb, VMCB_DR);
	}
}

static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
{
	const u32 int_ctl_vmcb01_bits =
		V_INTR_MASKING_MASK | V_GIF_MASK | V_GIF_ENABLE_MASK;

	const u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;

	struct kvm_vcpu *vcpu = &svm->vcpu;

	/*
	 * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
	 * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
	 */

	/*
	 * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id,
	 * avic_physical_id.
	 */
	WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));

	/* Copied from vmcb01.  msrpm_base can be overwritten later.  */
	svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl;
	svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa;
	svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa;

	/* Done at vmrun: asid.  */

	/* Also overwritten later if necessary.  */
	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;

	/* nested_cr3.  */
	if (nested_npt_enabled(svm))
		nested_svm_init_mmu_context(vcpu);

	vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
			vcpu->arch.l1_tsc_offset,
			svm->nested.ctl.tsc_offset,
			svm->tsc_ratio_msr);

	svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;

	if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
		WARN_ON(!svm->tsc_scaling_enabled);
		nested_svm_update_tsc_ratio_msr(vcpu);
	}

	svm->vmcb->control.int_ctl             =
		(svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
		(svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits);

	svm->vmcb->control.int_vector          = svm->nested.ctl.int_vector;
	svm->vmcb->control.int_state           = svm->nested.ctl.int_state;
	svm->vmcb->control.event_inj           = svm->nested.ctl.event_inj;
	svm->vmcb->control.event_inj_err       = svm->nested.ctl.event_inj_err;

	nested_svm_transition_tlb_flush(vcpu);

	/* Enter Guest-Mode */
	enter_guest_mode(vcpu);

	/*
	 * Merge guest and host intercepts - must be called with vcpu in
	 * guest-mode to take effect.
	 */
	recalc_intercepts(svm);
}

static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
{
	/*
	 * Some VMCB state is shared between L1 and L2 and thus has to be
	 * moved at the time of nested vmrun and vmexit.
	 *
	 * VMLOAD/VMSAVE state would also belong in this category, but KVM
	 * always performs VMLOAD and VMSAVE from the VMCB01.
	 */
	to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
}

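/*
 * Switch to vmcb02 and load L2 state: build the vmcb02 controls from the
 * cached vmcb12 controls, load the vmcb12 save state and L2's CR3.  Called
 * from VMRUN emulation (from_vmrun == true) and from non-VMRUN paths such
 * as a return from SMM, in which case the MSR permission bitmap is merged
 * later via KVM_REQ_GET_NESTED_STATE_PAGES.
 */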
int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
			 struct vmcb *vmcb12, bool from_vmrun)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int ret;

	trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
			       vmcb12->save.rip,
			       vmcb12->control.int_ctl,
			       vmcb12->control.event_inj,
			       vmcb12->control.nested_ctl);

	trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
				    vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
				    vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
				    vmcb12->control.intercepts[INTERCEPT_WORD3],
				    vmcb12->control.intercepts[INTERCEPT_WORD4],
				    vmcb12->control.intercepts[INTERCEPT_WORD5]);


	svm->nested.vmcb12_gpa = vmcb12_gpa;

	WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);

	nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);

	svm_switch_vmcb(svm, &svm->nested.vmcb02);
	nested_vmcb02_prepare_control(svm);
	nested_vmcb02_prepare_save(svm, vmcb12);

	ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3,
				  nested_npt_enabled(svm), from_vmrun);
	if (ret)
		return ret;

	if (!npt_enabled)
		vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested;

	if (!from_vmrun)
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	svm_set_gif(svm, true);

	return 0;
}

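/*
 * Emulate VMRUN for L1: map vmcb12 from guest memory, cache and check its
 * control and save areas, stash L1 state into vmcb01 and enter guest mode.
 * A failed consistency check is reported to L1 via exit_code SVM_EXIT_ERR.
 */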
int nested_svm_vmrun(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int ret;
	struct vmcb *vmcb12;
	struct kvm_host_map map;
	u64 vmcb12_gpa;

	if (!svm->nested.hsave_msr) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	if (is_smm(vcpu)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	vmcb12_gpa = svm->vmcb->save.rax;
	ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
	if (ret == -EINVAL) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	} else if (ret) {
		return kvm_skip_emulated_instruction(vcpu);
	}

	ret = kvm_skip_emulated_instruction(vcpu);

	vmcb12 = map.hva;

	if (WARN_ON_ONCE(!svm->nested.initialized))
		return -EINVAL;

	nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
	nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);

	if (!nested_vmcb_check_save(vcpu) ||
	    !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) {
		vmcb12->control.exit_code    = SVM_EXIT_ERR;
		vmcb12->control.exit_code_hi = 0;
		vmcb12->control.exit_info_1  = 0;
		vmcb12->control.exit_info_2  = 0;
		goto out;
	}

	/*
	 * Since vmcb01 is not in use, we can use it to store some of the L1
	 * state.
	 */
	svm->vmcb01.ptr->save.efer   = vcpu->arch.efer;
	svm->vmcb01.ptr->save.cr0    = kvm_read_cr0(vcpu);
	svm->vmcb01.ptr->save.cr4    = vcpu->arch.cr4;
	svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu);
	svm->vmcb01.ptr->save.rip    = kvm_rip_read(vcpu);

	if (!npt_enabled)
		svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu);

	svm->nested.nested_run_pending = 1;

	if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true))
		goto out_exit_err;

	if (nested_svm_vmrun_msrpm(svm))
		goto out;

out_exit_err:
	svm->nested.nested_run_pending = 0;

	svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
	svm->vmcb->control.exit_code_hi = 0;
	svm->vmcb->control.exit_info_1  = 0;
	svm->vmcb->control.exit_info_2  = 0;

	nested_svm_vmexit(svm);

out:
	kvm_vcpu_unmap(vcpu, &map, true);

	return ret;
}

/* Copy state save area fields which are handled by VMRUN */
void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
			  struct vmcb_save_area *from_save)
{
	to_save->es = from_save->es;
	to_save->cs = from_save->cs;
	to_save->ss = from_save->ss;
	to_save->ds = from_save->ds;
	to_save->gdtr = from_save->gdtr;
	to_save->idtr = from_save->idtr;
	to_save->rflags = from_save->rflags | X86_EFLAGS_FIXED;
	to_save->efer = from_save->efer;
	to_save->cr0 = from_save->cr0;
	to_save->cr3 = from_save->cr3;
	to_save->cr4 = from_save->cr4;
	to_save->rax = from_save->rax;
	to_save->rsp = from_save->rsp;
	to_save->rip = from_save->rip;
	to_save->cpl = 0;
}

void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
{
	to_vmcb->save.fs = from_vmcb->save.fs;
	to_vmcb->save.gs = from_vmcb->save.gs;
	to_vmcb->save.tr = from_vmcb->save.tr;
	to_vmcb->save.ldtr = from_vmcb->save.ldtr;
	to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
	to_vmcb->save.star = from_vmcb->save.star;
	to_vmcb->save.lstar = from_vmcb->save.lstar;
	to_vmcb->save.cstar = from_vmcb->save.cstar;
	to_vmcb->save.sfmask = from_vmcb->save.sfmask;
	to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
	to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
	to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
}

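/*
 * Emulate #VMEXIT from L2 to L1: copy the L2 state that the CPU would save
 * into vmcb12, switch back to vmcb01, restore the L1 state it holds and
 * reload L1's CR3.
 */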
int nested_svm_vmexit(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct vmcb *vmcb12;
	struct vmcb *vmcb = svm->vmcb;
	struct kvm_host_map map;
	int rc;

	/* Triple faults in L2 should never escape. */
	WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));

	rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
	if (rc) {
		if (rc == -EINVAL)
			kvm_inject_gp(vcpu, 0);
		return 1;
	}

	vmcb12 = map.hva;

	/* Exit Guest-Mode */
	leave_guest_mode(vcpu);
	svm->nested.vmcb12_gpa = 0;
	WARN_ON_ONCE(svm->nested.nested_run_pending);

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	/* in case we halted in L2 */
	svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;

	/* Give the current vmcb to the guest */

	vmcb12->save.es     = vmcb->save.es;
	vmcb12->save.cs     = vmcb->save.cs;
	vmcb12->save.ss     = vmcb->save.ss;
	vmcb12->save.ds     = vmcb->save.ds;
	vmcb12->save.gdtr   = vmcb->save.gdtr;
	vmcb12->save.idtr   = vmcb->save.idtr;
	vmcb12->save.efer   = svm->vcpu.arch.efer;
	vmcb12->save.cr0    = kvm_read_cr0(vcpu);
	vmcb12->save.cr3    = kvm_read_cr3(vcpu);
	vmcb12->save.cr2    = vmcb->save.cr2;
	vmcb12->save.cr4    = svm->vcpu.arch.cr4;
	vmcb12->save.rflags = kvm_get_rflags(vcpu);
	vmcb12->save.rip    = kvm_rip_read(vcpu);
	vmcb12->save.rsp    = kvm_rsp_read(vcpu);
	vmcb12->save.rax    = kvm_rax_read(vcpu);
	vmcb12->save.dr7    = vmcb->save.dr7;
	vmcb12->save.dr6    = svm->vcpu.arch.dr6;
	vmcb12->save.cpl    = vmcb->save.cpl;

	vmcb12->control.int_state         = vmcb->control.int_state;
	vmcb12->control.exit_code         = vmcb->control.exit_code;
	vmcb12->control.exit_code_hi      = vmcb->control.exit_code_hi;
	vmcb12->control.exit_info_1       = vmcb->control.exit_info_1;
	vmcb12->control.exit_info_2       = vmcb->control.exit_info_2;

	if (vmcb12->control.exit_code != SVM_EXIT_ERR)
		nested_save_pending_event_to_vmcb12(svm, vmcb12);

	if (svm->nrips_enabled)
		vmcb12->control.next_rip  = vmcb->control.next_rip;

	vmcb12->control.int_ctl           = svm->nested.ctl.int_ctl;
	vmcb12->control.tlb_ctl           = svm->nested.ctl.tlb_ctl;
	vmcb12->control.event_inj         = svm->nested.ctl.event_inj;
	vmcb12->control.event_inj_err     = svm->nested.ctl.event_inj_err;

	nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);

	svm_switch_vmcb(svm, &svm->vmcb01);

	/*
	 * On vmexit the  GIF is set to false and
	 * no event can be injected in L1.
	 */
	svm_set_gif(svm, false);
	svm->vmcb->control.exit_int_info = 0;

	svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
	if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
		svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
	}

	if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
		WARN_ON(!svm->tsc_scaling_enabled);
		vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
		svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio);
	}

	svm->nested.ctl.nested_cr3 = 0;

	/*
	 * Restore processor state that had been saved in vmcb01
	 */
	kvm_set_rflags(vcpu, svm->vmcb->save.rflags);
	svm_set_efer(vcpu, svm->vmcb->save.efer);
	svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE);
	svm_set_cr4(vcpu, svm->vmcb->save.cr4);
	kvm_rax_write(vcpu, svm->vmcb->save.rax);
	kvm_rsp_write(vcpu, svm->vmcb->save.rsp);
	kvm_rip_write(vcpu, svm->vmcb->save.rip);

	svm->vcpu.arch.dr7 = DR7_FIXED_1;
	kvm_update_dr7(&svm->vcpu);

	trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
				       vmcb12->control.exit_info_1,
				       vmcb12->control.exit_info_2,
				       vmcb12->control.exit_int_info,
				       vmcb12->control.exit_int_info_err,
				       KVM_ISA_SVM);

	kvm_vcpu_unmap(vcpu, &map, true);

	nested_svm_transition_tlb_flush(vcpu);

	nested_svm_uninit_mmu_context(vcpu);

	rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false, true);
	if (rc)
		return 1;

	/*
	 * Drop what we picked up for L2 via svm_complete_interrupts() so it
	 * doesn't end up in L1.
	 */
	svm->vcpu.arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);

	/*
	 * If we are here following the completion of a VMRUN that
	 * is being single-stepped, queue the pending #DB intercept
	 * right now so that it can be accounted for before we execute
	 * L1's next instruction.
	 */
	if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF))
		kvm_queue_exception(&(svm->vcpu), DB_VECTOR);

	return 0;
}

static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
{
	nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
}

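/*
 * Lazily allocate the per-vCPU nested state: the vmcb02 page and the merged
 * MSR permission bitmap.  Done once, the first time nesting is enabled.
 */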
int svm_allocate_nested(struct vcpu_svm *svm)
{
	struct page *vmcb02_page;

	if (svm->nested.initialized)
		return 0;

	vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!vmcb02_page)
		return -ENOMEM;
	svm->nested.vmcb02.ptr = page_address(vmcb02_page);
	svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);

	svm->nested.msrpm = svm_vcpu_alloc_msrpm();
	if (!svm->nested.msrpm)
		goto err_free_vmcb02;
	svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);

	svm->nested.initialized = true;
	return 0;

err_free_vmcb02:
	__free_page(vmcb02_page);
	return -ENOMEM;
}

void svm_free_nested(struct vcpu_svm *svm)
{
	if (!svm->nested.initialized)
		return;

	svm_vcpu_free_msrpm(svm->nested.msrpm);
	svm->nested.msrpm = NULL;

	__free_page(virt_to_page(svm->nested.vmcb02.ptr));
	svm->nested.vmcb02.ptr = NULL;

	/*
	 * When last_vmcb12_gpa matches the current vmcb12 gpa,
	 * some vmcb12 fields are not loaded if they are marked clean
	 * in the vmcb12, since in this case they are up to date already.
	 *
	 * When the vmcb02 is freed, this optimization becomes invalid.
	 */
	svm->nested.last_vmcb12_gpa = INVALID_GPA;

	svm->nested.initialized = false;
}

/*
 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
 */
void svm_leave_nested(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;

	if (is_guest_mode(vcpu)) {
		svm->nested.nested_run_pending = 0;
		svm->nested.vmcb12_gpa = INVALID_GPA;

		leave_guest_mode(vcpu);

		svm_switch_vmcb(svm, &svm->vmcb01);

		nested_svm_uninit_mmu_context(vcpu);
		vmcb_mark_all_dirty(svm->vmcb);
	}

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
}

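/*
 * Check L1's MSR permission bitmap to decide whether an MSR access that
 * exited from L2 must be forwarded to L1 (NESTED_EXIT_DONE) or can be
 * handled by KVM itself (NESTED_EXIT_HOST).
 */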
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
{
	u32 offset, msr, value;
	int write, mask;

	if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
		return NESTED_EXIT_HOST;

	msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
	offset = svm_msrpm_offset(msr);
	write  = svm->vmcb->control.exit_info_1 & 1;
	mask   = 1 << ((2 * (msr & 0xf)) + write);

	if (offset == MSR_INVALID)
		return NESTED_EXIT_DONE;

	/* Offset is in 32 bit units but need in 8 bit units */
	offset *= 4;

	if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4))
		return NESTED_EXIT_DONE;

	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}

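/*
 * Check L1's IO permission bitmap to decide whether an IOIO exit from L2
 * must be reflected to L1.
 */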
static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
{
	unsigned port, size, iopm_len;
	u16 val, mask;
	u8 start_bit;
	u64 gpa;

	if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
		return NESTED_EXIT_HOST;

	port = svm->vmcb->control.exit_info_1 >> 16;
	size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
		SVM_IOIO_SIZE_SHIFT;
	gpa  = svm->nested.ctl.iopm_base_pa + (port / 8);
	start_bit = port % 8;
	iopm_len = (start_bit + size > 8) ? 2 : 1;
	mask = (0xf >> (4 - size)) << start_bit;
	val = 0;

	if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
		return NESTED_EXIT_DONE;

	return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}

static int nested_svm_intercept(struct vcpu_svm *svm)
{
	u32 exit_code = svm->vmcb->control.exit_code;
	int vmexit = NESTED_EXIT_HOST;

	switch (exit_code) {
	case SVM_EXIT_MSR:
		vmexit = nested_svm_exit_handled_msr(svm);
		break;
	case SVM_EXIT_IOIO:
		vmexit = nested_svm_intercept_ioio(svm);
		break;
	case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
		if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
			vmexit = NESTED_EXIT_DONE;
		break;
	}
	case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
		if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
			vmexit = NESTED_EXIT_DONE;
		break;
	}
	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
		/*
		 * Host-intercepted exceptions have been checked already in
		 * nested_svm_exit_special.  There is nothing to do here,
		 * the vmexit is injected by svm_check_nested_events.
		 */
		vmexit = NESTED_EXIT_DONE;
		break;
	}
	case SVM_EXIT_ERR: {
		vmexit = NESTED_EXIT_DONE;
		break;
	}
	default: {
		if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
			vmexit = NESTED_EXIT_DONE;
	}
	}

	return vmexit;
}

int nested_svm_exit_handled(struct vcpu_svm *svm)
{
	int vmexit;

	vmexit = nested_svm_intercept(svm);

	if (vmexit == NESTED_EXIT_DONE)
		nested_svm_vmexit(svm);

	return vmexit;
}

int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
{
	if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	if (to_svm(vcpu)->vmcb->save.cpl) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	return 0;
}

static bool nested_exit_on_exception(struct vcpu_svm *svm)
{
	unsigned int nr = svm->vcpu.arch.exception.nr;

	return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(nr));
}

static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
{
	unsigned int nr = svm->vcpu.arch.exception.nr;

	svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
	svm->vmcb->control.exit_code_hi = 0;

	if (svm->vcpu.arch.exception.has_error_code)
		svm->vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code;

	/*
	 * EXITINFO2 is undefined for all exception intercepts other
	 * than #PF.
	 */
	if (nr == PF_VECTOR) {
		if (svm->vcpu.arch.exception.nested_apf)
			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
		else if (svm->vcpu.arch.exception.has_payload)
			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
		else
			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
	} else if (nr == DB_VECTOR) {
		/* See inject_pending_event.  */
		kvm_deliver_exception_payload(&svm->vcpu);
		if (svm->vcpu.arch.dr7 & DR7_GD) {
			svm->vcpu.arch.dr7 &= ~DR7_GD;
			kvm_update_dr7(&svm->vcpu);
		}
	} else
		WARN_ON(svm->vcpu.arch.exception.has_payload);

	nested_svm_vmexit(svm);
}

static inline bool nested_exit_on_init(struct vcpu_svm *svm)
{
	return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
}

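/*
 * Decide whether a pending event (INIT, exception, SMI, NMI or external
 * interrupt) should cause a vmexit from L2 to L1.  Returns -EBUSY when the
 * event must wait, e.g. because a nested VMRUN is still pending.
 */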
static int svm_check_nested_events(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	bool block_nested_events =
		kvm_event_needs_reinjection(vcpu) || svm->nested.nested_run_pending;
	struct kvm_lapic *apic = vcpu->arch.apic;

	if (lapic_in_kernel(vcpu) &&
	    test_bit(KVM_APIC_INIT, &apic->pending_events)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_init(svm))
			return 0;
		nested_svm_simple_vmexit(svm, SVM_EXIT_INIT);
		return 0;
	}

	if (vcpu->arch.exception.pending) {
		/*
		 * Only a pending nested run can block a pending exception.
		 * Otherwise an injected NMI/interrupt should either be
		 * lost or delivered to the nested hypervisor in the EXITINTINFO
		 * vmcb field, while delivering the pending exception.
		 */
		if (svm->nested.nested_run_pending)
			return -EBUSY;
		if (!nested_exit_on_exception(svm))
			return 0;
		nested_svm_inject_exception_vmexit(svm);
		return 0;
	}

	if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_smi(svm))
			return 0;
		nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
		return 0;
	}

	if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_nmi(svm))
			return 0;
		nested_svm_simple_vmexit(svm, SVM_EXIT_NMI);
		return 0;
	}

	if (kvm_cpu_has_interrupt(vcpu) && !svm_interrupt_blocked(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_intr(svm))
			return 0;
		trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
		nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
		return 0;
	}

	return 0;
}

int nested_svm_exit_special(struct vcpu_svm *svm)
{
	u32 exit_code = svm->vmcb->control.exit_code;

	switch (exit_code) {
	case SVM_EXIT_INTR:
	case SVM_EXIT_NMI:
	case SVM_EXIT_NPF:
		return NESTED_EXIT_HOST;
	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
		u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);

		if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] &
		    excp_bits)
			return NESTED_EXIT_HOST;
		else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
			 svm->vcpu.arch.apf.host_apf_flags)
			/* Trap async PF even if not shadowing */
			return NESTED_EXIT_HOST;
		break;
	}
	default:
		break;
	}

	return NESTED_EXIT_CONTINUE;
}

void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	vcpu->arch.tsc_scaling_ratio =
		kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio,
					       svm->tsc_ratio_msr);
	svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio);
}

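/*
 * Save the nested state for userspace (KVM_GET_NESTED_STATE): the header,
 * the cached vmcb12 control area and the L1 save state kept in vmcb01.
 */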
static int svm_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_svm *svm;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = KVM_STATE_NESTED_FORMAT_SVM,
		.size = sizeof(kvm_state),
	};
	struct vmcb __user *user_vmcb = (struct vmcb __user *)
		&user_kvm_nested_state->data.svm[0];

	if (!vcpu)
		return kvm_state.size + KVM_STATE_NESTED_SVM_VMCB_SIZE;

	svm = to_svm(vcpu);

	if (user_data_size < kvm_state.size)
		goto out;

	/* First fill in the header and copy it out.  */
	if (is_guest_mode(vcpu)) {
		kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa;
		kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
		kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

		if (svm->nested.nested_run_pending)
			kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
	}

	if (gif_set(svm))
		kvm_state.flags |= KVM_STATE_NESTED_GIF_SET;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!is_guest_mode(vcpu))
		goto out;

	/*
	 * Copy over the full size of the VMCB rather than just the size
	 * of the structs.
	 */
	if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE))
		return -EFAULT;
	if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
			 sizeof(user_vmcb->control)))
		return -EFAULT;
	if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
			 sizeof(user_vmcb->save)))
		return -EFAULT;
out:
	return kvm_state.size;
}

static int svm_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb __user *user_vmcb = (struct vmcb __user *)
		&user_kvm_nested_state->data.svm[0];
	struct vmcb_control_area *ctl;
	struct vmcb_save_area *save;
	struct vmcb_save_area_cached save_cached;
	unsigned long cr0;
	int ret;

	BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) >
		     KVM_STATE_NESTED_SVM_VMCB_SIZE);

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)
		return -EINVAL;

	if (kvm_state->flags & ~(KVM_STATE_NESTED_GUEST_MODE |
				 KVM_STATE_NESTED_RUN_PENDING |
				 KVM_STATE_NESTED_GIF_SET))
		return -EINVAL;

	/*
	 * If in guest mode, vcpu->arch.efer actually refers to the L2 guest's
	 * EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed.
	 */
	if (!(vcpu->arch.efer & EFER_SVME)) {
		/* GIF=1 and no guest mode are required if SVME=0.  */
		if (kvm_state->flags != KVM_STATE_NESTED_GIF_SET)
			return -EINVAL;
	}

	/* SMM temporarily disables SVM, so we cannot be in guest mode.  */
	if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
		svm_leave_nested(svm);
		svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
		return 0;
	}

	if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa))
		return -EINVAL;
	if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)
		return -EINVAL;

	ret  = -ENOMEM;
	ctl  = kzalloc(sizeof(*ctl),  GFP_KERNEL_ACCOUNT);
	save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT);
	if (!ctl || !save)
		goto out_free;

	ret = -EFAULT;
	if (copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl)))
		goto out_free;
	if (copy_from_user(save, &user_vmcb->save, sizeof(*save)))
		goto out_free;

	ret = -EINVAL;
	if (!nested_vmcb_check_controls(vcpu, ctl))
		goto out_free;

	/*
	 * Processor state contains L2 state.  Check that it is
	 * valid for guest mode (see nested_vmcb_check_save).
	 */
	cr0 = kvm_read_cr0(vcpu);
	if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
		goto out_free;

	/*
	 * Validate host state saved from before VMRUN (see
	 * nested_svm_check_permissions).
	 */
	__nested_copy_vmcb_save_to_cache(&save_cached, save);
	if (!(save->cr0 & X86_CR0_PG) ||
	    !(save->cr0 & X86_CR0_PE) ||
	    (save->rflags & X86_EFLAGS_VM) ||
	    !__nested_vmcb_check_save(vcpu, &save_cached))
		goto out_free;

	/*
	 * While the nested guest CR3 is already checked and set by
	 * KVM_SET_SREGS, it was set when the nested state was not yet
	 * loaded, thus the MMU might not be initialized correctly.
	 * Set it again to fix this.
	 */

	ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
				  nested_npt_enabled(svm), false);
	if (WARN_ON_ONCE(ret))
		goto out_free;


	/*
	 * All checks done, we can enter guest mode. Userspace provides
	 * vmcb12.control, which will be combined with L1 and stored into
	 * vmcb02, and the L1 save state which we store in vmcb01.
	 * L2 registers if needed are moved from the current VMCB to VMCB02.
	 */

	if (is_guest_mode(vcpu))
		svm_leave_nested(svm);
	else
		svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;

	svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));

	svm->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;

	svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save);
	nested_copy_vmcb_control_to_cache(svm, ctl);

	svm_switch_vmcb(svm, &svm->nested.vmcb02);
	nested_vmcb02_prepare_control(svm);
	kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
	ret = 0;
out_free:
	kfree(save);
	kfree(ctl);

	return ret;
}

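/*
 * Deferred work for KVM_REQ_GET_NESTED_STATE_PAGES: reload guest pages that
 * could not be touched earlier, i.e. the PDPTRs when shadowing PAE paging
 * and the merged MSR permission bitmap.
 */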
static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (WARN_ON(!is_guest_mode(vcpu)))
		return true;

	if (!vcpu->arch.pdptrs_from_userspace &&
	    !nested_npt_enabled(svm) && is_pae_paging(vcpu))
		/*
		 * Reload the guest's PDPTRs since after a migration
		 * the guest CR3 might be restored prior to setting the nested
		 * state which can lead to a load of wrong PDPTRs.
		 */
		if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)))
			return false;

	if (!nested_svm_vmrun_msrpm(svm)) {
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror =
			KVM_INTERNAL_ERROR_EMULATION;
		vcpu->run->internal.ndata = 0;
		return false;
	}

	return true;
}

struct kvm_x86_nested_ops svm_nested_ops = {
	.check_events = svm_check_nested_events,
	.triple_fault = nested_svm_triple_fault,
	.get_nested_state_pages = svm_get_nested_state_pages,
	.get_state = svm_get_nested_state,
	.set_state = svm_set_nested_state,
};