// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * AMD SVM support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#define pr_fmt(fmt) "SVM: " fmt

#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
#include <linux/kernel.h>

#include <asm/msr-index.h>
#include <asm/debugreg.h>

#include "kvm_emulate.h"
#include "trace.h"
#include "mmu.h"
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "svm.h"

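/*
 * Reflect a nested page fault into L1: force an SVM_EXIT_NPF exit code and
 * report the faulting address and error code in exit_info_2/exit_info_1,
 * then emulate a #VMEXIT to L1.
 */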
static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
				       struct x86_exception *fault)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
		/*
		 * TODO: track the cause of the nested page fault, and
		 * correctly fill in the high bits of exit_info_1.
		 */
		svm->vmcb->control.exit_code = SVM_EXIT_NPF;
		svm->vmcb->control.exit_code_hi = 0;
		svm->vmcb->control.exit_info_1 = (1ULL << 32);
		svm->vmcb->control.exit_info_2 = fault->address;
	}

	svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
	svm->vmcb->control.exit_info_1 |= fault->error_code;

	nested_svm_vmexit(svm);
}

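/*
 * Read the index-th PDPTE from the nested page table root (L1's nested_cr3);
 * returns 0 if the guest page cannot be read.
 */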
static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 cr3 = svm->nested.ctl.nested_cr3;
	u64 pdpte;
	int ret;

	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
				       offset_in_page(cr3) + index * 8, 8);
	if (ret)
		return 0;
	return pdpte;
}

static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	return svm->nested.ctl.nested_cr3;
}

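/*
 * Switch to the shadow-NPT MMU: vcpu->arch.mmu shadows L1's nested page
 * tables (nested_cr3), while walk_mmu points at nested_mmu to translate
 * L2 virtual addresses.
 */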
static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *hsave = svm->nested.hsave;

	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer,
				svm->nested.ctl.nested_cr3);
	vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
	vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
	vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
	reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu);
	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
}

static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

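/*
 * Recompute the intercepts of the active VMCB while in guest mode: start
 * from the intercepts used to run L1 (saved in hsave), drop the bits L0 does
 * not want while running L2 (CR8 and VINTR when L1 uses V_INTR_MASKING, and
 * VMMCALL), then OR in the intercepts L1 requested for L2.
 */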
void recalc_intercepts(struct vcpu_svm *svm)
{
	struct vmcb_control_area *c, *h, *g;
	unsigned int i;

	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);

	if (!is_guest_mode(&svm->vcpu))
		return;

	c = &svm->vmcb->control;
	h = &svm->nested.hsave->control;
	g = &svm->nested.ctl;

	for (i = 0; i < MAX_INTERCEPT; i++)
		c->intercepts[i] = h->intercepts[i];

	if (g->int_ctl & V_INTR_MASKING_MASK) {
		/* We only want the cr8 intercept bits of L1 */
		vmcb_clr_intercept(c, INTERCEPT_CR8_READ);
		vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);

		/*
		 * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
		 * affect any interrupt we may want to inject; therefore,
		 * interrupt window vmexits are irrelevant to L0.
		 */
		vmcb_clr_intercept(c, INTERCEPT_VINTR);
	}

	/* We don't want to see VMMCALLs from a nested guest */
	vmcb_clr_intercept(c, INTERCEPT_VMMCALL);

	for (i = 0; i < MAX_INTERCEPT; i++)
		c->intercepts[i] |= g->intercepts[i];
}

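/* Field-by-field copy of a VMCB control area; the ASID is deliberately skipped. */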
static void copy_vmcb_control_area(struct vmcb_control_area *dst,
				   struct vmcb_control_area *from)
{
	unsigned int i;

	for (i = 0; i < MAX_INTERCEPT; i++)
		dst->intercepts[i] = from->intercepts[i];

	dst->iopm_base_pa         = from->iopm_base_pa;
	dst->msrpm_base_pa        = from->msrpm_base_pa;
	dst->tsc_offset           = from->tsc_offset;
	/* asid not copied, it is handled manually for svm->vmcb.  */
	dst->tlb_ctl              = from->tlb_ctl;
	dst->int_ctl              = from->int_ctl;
	dst->int_vector           = from->int_vector;
	dst->int_state            = from->int_state;
	dst->exit_code            = from->exit_code;
	dst->exit_code_hi         = from->exit_code_hi;
	dst->exit_info_1          = from->exit_info_1;
	dst->exit_info_2          = from->exit_info_2;
	dst->exit_int_info        = from->exit_int_info;
	dst->exit_int_info_err    = from->exit_int_info_err;
	dst->nested_ctl           = from->nested_ctl;
	dst->event_inj            = from->event_inj;
	dst->event_inj_err        = from->event_inj_err;
	dst->nested_cr3           = from->nested_cr3;
	dst->virt_ext              = from->virt_ext;
	dst->pause_filter_count   = from->pause_filter_count;
	dst->pause_filter_thresh  = from->pause_filter_thresh;
}

static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
	/*
	 * This function merges the msr permission bitmaps of kvm and the
	 * nested vmcb. It is optimized in that it only merges the parts where
	 * the kvm msr permission bitmap may contain zero bits
	 */
	int i;

	if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
		return true;

	for (i = 0; i < MSRPM_OFFSETS; i++) {
		u32 value, p;
		u64 offset;

		if (msrpm_offsets[i] == 0xffffffff)
			break;

		p      = msrpm_offsets[i];
		offset = svm->nested.ctl.msrpm_base_pa + (p * 4);

		if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
			return false;

		svm->nested.msrpm[p] = svm->msrpm[p] | value;
	}

	svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));

	return true;
}

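/*
 * Deferred step of entering guest mode, run when KVM_REQ_GET_NESTED_STATE_PAGES
 * is processed: merge L1's MSR permission bitmap into the one used for L2.
 * On failure the run is reported to userspace as an internal emulation error.
 */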
static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	if (!nested_svm_vmrun_msrpm(svm)) {
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror =
			KVM_INTERNAL_ERROR_EMULATION;
		vcpu->run->internal.ndata = 0;
		return false;
	}

	return true;
}

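/*
 * Consistency checks on the vmcb12 control area: the VMRUN intercept must be
 * set, the ASID must be non-zero, and nested paging may only be requested
 * when NPT is enabled on the host.
 */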
static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
{
	if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0)
		return false;

	if (control->asid == 0)
		return false;

	if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
	    !npt_enabled)
		return false;

	return true;
}

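/*
 * Validity checks on the vmcb12 save area, mirroring the consistency checks
 * performed by VMRUN: EFER.SVME must be set, CR0.CD/NW must be consistent,
 * DR6/DR7 must be valid, and CR3/CR4 must have no reserved bits set for the
 * requested paging mode.
 */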
static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
{
	bool vmcb12_lma;

	if ((vmcb12->save.efer & EFER_SVME) == 0)
		return false;

	if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW))
		return false;

	if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7))
		return false;

	vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG);

	if (!vmcb12_lma) {
		if (vmcb12->save.cr4 & X86_CR4_PAE) {
			if (vmcb12->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK)
				return false;
		} else {
			if (vmcb12->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK)
				return false;
		}
	} else {
		if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
		    !(vmcb12->save.cr0 & X86_CR0_PE) ||
		    (vmcb12->save.cr3 & MSR_CR3_LONG_MBZ_MASK))
			return false;
	}
	if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
		return false;

	return nested_vmcb_check_controls(&vmcb12->control);
}

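/*
 * Cache the vmcb12 control area in svm->nested.ctl; the low 12 bits of the
 * MSR and IO permission map addresses are masked off.
 */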
static void load_nested_vmcb_control(struct vcpu_svm *svm,
				     struct vmcb_control_area *control)
{
	copy_vmcb_control_area(&svm->nested.ctl, control);

	/* Copy it here because nested_svm_check_controls will check it.  */
	svm->nested.ctl.asid           = control->asid;
	svm->nested.ctl.msrpm_base_pa &= ~0x0fffULL;
	svm->nested.ctl.iopm_base_pa  &= ~0x0fffULL;
}

/*
 * Synchronize fields that are written by the processor, so that
 * they can be copied back into the nested_vmcb.
 */
void sync_nested_vmcb_control(struct vcpu_svm *svm)
{
	u32 mask;
	svm->nested.ctl.event_inj      = svm->vmcb->control.event_inj;
	svm->nested.ctl.event_inj_err  = svm->vmcb->control.event_inj_err;

	/* Only a few fields of int_ctl are written by the processor.  */
	mask = V_IRQ_MASK | V_TPR_MASK;
	if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) &&
	    svm_is_intercept(svm, INTERCEPT_VINTR)) {
		/*
		 * In order to request an interrupt window, L0 is usurping
		 * svm->vmcb->control.int_ctl and possibly setting V_IRQ
		 * even if it was clear in L1's VMCB.  Restoring it would be
		 * wrong.  However, in this case V_IRQ will remain true until
		 * interrupt_window_interception calls svm_clear_vintr and
		 * restores int_ctl.  We can just leave it aside.
		 */
		mask &= ~V_IRQ_MASK;
	}
	svm->nested.ctl.int_ctl        &= ~mask;
	svm->nested.ctl.int_ctl        |= svm->vmcb->control.int_ctl & mask;
}

/*
 * Transfer any event that L0 or L1 wanted to inject into L2 to
 * EXIT_INT_INFO.
 */
static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
					   struct vmcb *vmcb12)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	u32 exit_int_info = 0;
	unsigned int nr;

	if (vcpu->arch.exception.injected) {
		nr = vcpu->arch.exception.nr;
		exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT;

		if (vcpu->arch.exception.has_error_code) {
			exit_int_info |= SVM_EVTINJ_VALID_ERR;
			vmcb12->control.exit_int_info_err =
				vcpu->arch.exception.error_code;
		}

	} else if (vcpu->arch.nmi_injected) {
		exit_int_info = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;

	} else if (vcpu->arch.interrupt.injected) {
		nr = vcpu->arch.interrupt.nr;
		exit_int_info = nr | SVM_EVTINJ_VALID;

		if (vcpu->arch.interrupt.soft)
			exit_int_info |= SVM_EVTINJ_TYPE_SOFT;
		else
			exit_int_info |= SVM_EVTINJ_TYPE_INTR;
	}

	vmcb12->control.exit_int_info = exit_int_info;
}

static inline bool nested_npt_enabled(struct vcpu_svm *svm)
{
	return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
}

/*
 * Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true
 * if we are emulating VM-Entry into a guest with NPT enabled.
 */
static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_npt)
{
	if (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63))
		return -EINVAL;

	if (!nested_npt && is_pae_paging(vcpu) &&
	    (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
		if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
			return -EINVAL;
	}

	/*
	 * TODO: optimize unconditional TLB flush/MMU sync here and in
	 * kvm_init_shadow_npt_mmu().
	 */
	if (!nested_npt)
		kvm_mmu_new_pgd(vcpu, cr3, false, false);

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);

	kvm_init_mmu(vcpu, false);

	return 0;
}

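/*
 * Copy the L2 guest state from vmcb12 into the current VMCB, going through
 * the usual setters (svm_set_cr0/cr4/efer, kvm_set_rflags, ...) so that the
 * vcpu's cached state stays consistent.
 */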
static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
{
	/* Load the nested guest state */
	svm->vmcb->save.es = vmcb12->save.es;
	svm->vmcb->save.cs = vmcb12->save.cs;
	svm->vmcb->save.ss = vmcb12->save.ss;
	svm->vmcb->save.ds = vmcb12->save.ds;
	svm->vmcb->save.gdtr = vmcb12->save.gdtr;
	svm->vmcb->save.idtr = vmcb12->save.idtr;
	kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
	svm_set_efer(&svm->vcpu, vmcb12->save.efer);
	svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
	svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
	svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;
	kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
	kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
	kvm_rip_write(&svm->vcpu, vmcb12->save.rip);

	/* In case we don't even reach vcpu_run, the fields are not updated */
	svm->vmcb->save.rax = vmcb12->save.rax;
	svm->vmcb->save.rsp = vmcb12->save.rsp;
	svm->vmcb->save.rip = vmcb12->save.rip;
	svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
	svm->vcpu.arch.dr6  = vmcb12->save.dr6 | DR6_FIXED_1 | DR6_RTM;
	svm->vmcb->save.cpl = vmcb12->save.cpl;
}

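/*
 * Set up the control fields for running L2: shadow NPT if L1 enabled nested
 * paging, a TSC offset combining L1's and L2's offsets, an int_ctl merging
 * L1's and L2's bits, and the merged intercepts via recalc_intercepts()
 * once the vcpu is in guest mode.
 */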
static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
{
	const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;

	if (nested_npt_enabled(svm))
		nested_svm_init_mmu_context(&svm->vcpu);

	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
		svm->vcpu.arch.l1_tsc_offset + svm->nested.ctl.tsc_offset;

	svm->vmcb->control.int_ctl             =
		(svm->nested.ctl.int_ctl & ~mask) |
		(svm->nested.hsave->control.int_ctl & mask);

	svm->vmcb->control.virt_ext            = svm->nested.ctl.virt_ext;
	svm->vmcb->control.int_vector          = svm->nested.ctl.int_vector;
	svm->vmcb->control.int_state           = svm->nested.ctl.int_state;
	svm->vmcb->control.event_inj           = svm->nested.ctl.event_inj;
	svm->vmcb->control.event_inj_err       = svm->nested.ctl.event_inj_err;

	svm->vmcb->control.pause_filter_count  = svm->nested.ctl.pause_filter_count;
	svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh;

	/* Enter Guest-Mode */
	enter_guest_mode(&svm->vcpu);

	/*
	 * Merge guest and host intercepts - must be called with vcpu in
	 * guest-mode to take effect here
	 */
	recalc_intercepts(svm);

	vmcb_mark_all_dirty(svm->vmcb);
}

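/*
 * Emulate the world switch of VMRUN: load vmcb12's control and save areas
 * into the current VMCB, switch to L2's CR3, and finally set GIF.
 */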
int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
			 struct vmcb *vmcb12)
{
	int ret;

	svm->nested.vmcb12_gpa = vmcb12_gpa;
	load_nested_vmcb_control(svm, &vmcb12->control);
	nested_prepare_vmcb_save(svm, vmcb12);
	nested_prepare_vmcb_control(svm);

	ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
				  nested_npt_enabled(svm));
	if (ret)
		return ret;

	svm_set_gif(svm, true);

	return 0;
}

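/*
 * Emulate the VMRUN instruction: map vmcb12 (its GPA is in RAX), validate it,
 * save the current L1 state into hsave, and enter guest mode.  A failed
 * consistency check is reported to L1 as a #VMEXIT with SVM_EXIT_ERR.
 */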
int nested_svm_vmrun(struct vcpu_svm *svm)
{
	int ret;
	struct vmcb *vmcb12;
	struct vmcb *hsave = svm->nested.hsave;
	struct vmcb *vmcb = svm->vmcb;
	struct kvm_host_map map;
	u64 vmcb12_gpa;

	if (is_smm(&svm->vcpu)) {
		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
		return 1;
	}

	vmcb12_gpa = svm->vmcb->save.rax;
	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map);
	if (ret == -EINVAL) {
		kvm_inject_gp(&svm->vcpu, 0);
		return 1;
	} else if (ret) {
		return kvm_skip_emulated_instruction(&svm->vcpu);
	}

	ret = kvm_skip_emulated_instruction(&svm->vcpu);

	vmcb12 = map.hva;

	if (WARN_ON_ONCE(!svm->nested.initialized))
		return -EINVAL;

	if (!nested_vmcb_checks(svm, vmcb12)) {
		vmcb12->control.exit_code    = SVM_EXIT_ERR;
		vmcb12->control.exit_code_hi = 0;
		vmcb12->control.exit_info_1  = 0;
		vmcb12->control.exit_info_2  = 0;
		goto out;
	}

	trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
			       vmcb12->save.rip,
			       vmcb12->control.int_ctl,
			       vmcb12->control.event_inj,
			       vmcb12->control.nested_ctl);

	trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
				    vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
				    vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
				    vmcb12->control.intercepts[INTERCEPT_WORD3],
				    vmcb12->control.intercepts[INTERCEPT_WORD4],
				    vmcb12->control.intercepts[INTERCEPT_WORD5]);

	/* Clear internal status */
	kvm_clear_exception_queue(&svm->vcpu);
	kvm_clear_interrupt_queue(&svm->vcpu);

	/*
	 * Save the old vmcb, so we don't need to pick what we save, but can
	 * restore everything when a VMEXIT occurs
	 */
	hsave->save.es     = vmcb->save.es;
	hsave->save.cs     = vmcb->save.cs;
	hsave->save.ss     = vmcb->save.ss;
	hsave->save.ds     = vmcb->save.ds;
	hsave->save.gdtr   = vmcb->save.gdtr;
	hsave->save.idtr   = vmcb->save.idtr;
	hsave->save.efer   = svm->vcpu.arch.efer;
	hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
	hsave->save.cr4    = svm->vcpu.arch.cr4;
	hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
	hsave->save.rip    = kvm_rip_read(&svm->vcpu);
	hsave->save.rsp    = vmcb->save.rsp;
	hsave->save.rax    = vmcb->save.rax;
	if (npt_enabled)
		hsave->save.cr3    = vmcb->save.cr3;
	else
		hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);

	copy_vmcb_control_area(&hsave->control, &vmcb->control);

	svm->nested.nested_run_pending = 1;

	if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12))
		goto out_exit_err;

	if (nested_svm_vmrun_msrpm(svm))
		goto out;

out_exit_err:
	svm->nested.nested_run_pending = 0;

	svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
	svm->vmcb->control.exit_code_hi = 0;
	svm->vmcb->control.exit_info_1  = 0;
	svm->vmcb->control.exit_info_2  = 0;

	nested_svm_vmexit(svm);

out:
	kvm_vcpu_unmap(&svm->vcpu, &map, true);

	return ret;
}

void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
{
	to_vmcb->save.fs = from_vmcb->save.fs;
	to_vmcb->save.gs = from_vmcb->save.gs;
	to_vmcb->save.tr = from_vmcb->save.tr;
	to_vmcb->save.ldtr = from_vmcb->save.ldtr;
	to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
	to_vmcb->save.star = from_vmcb->save.star;
	to_vmcb->save.lstar = from_vmcb->save.lstar;
	to_vmcb->save.cstar = from_vmcb->save.cstar;
	to_vmcb->save.sfmask = from_vmcb->save.sfmask;
	to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
	to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
	to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
}

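/*
 * Emulate #VMEXIT from L2 to L1: copy the L2 state and exit information from
 * the current VMCB back into vmcb12, restore the L1 state saved in hsave at
 * VMRUN time, and clear GIF.
 */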
int nested_svm_vmexit(struct vcpu_svm *svm)
{
	int rc;
	struct vmcb *vmcb12;
	struct vmcb *hsave = svm->nested.hsave;
	struct vmcb *vmcb = svm->vmcb;
	struct kvm_host_map map;

	rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
	if (rc) {
		if (rc == -EINVAL)
			kvm_inject_gp(&svm->vcpu, 0);
		return 1;
	}

	vmcb12 = map.hva;

	/* Exit Guest-Mode */
	leave_guest_mode(&svm->vcpu);
	svm->nested.vmcb12_gpa = 0;
	WARN_ON_ONCE(svm->nested.nested_run_pending);

	/* in case we halted in L2 */
	svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;

	/* Give the current vmcb to the guest */

	vmcb12->save.es     = vmcb->save.es;
	vmcb12->save.cs     = vmcb->save.cs;
	vmcb12->save.ss     = vmcb->save.ss;
	vmcb12->save.ds     = vmcb->save.ds;
	vmcb12->save.gdtr   = vmcb->save.gdtr;
	vmcb12->save.idtr   = vmcb->save.idtr;
	vmcb12->save.efer   = svm->vcpu.arch.efer;
	vmcb12->save.cr0    = kvm_read_cr0(&svm->vcpu);
	vmcb12->save.cr3    = kvm_read_cr3(&svm->vcpu);
	vmcb12->save.cr2    = vmcb->save.cr2;
	vmcb12->save.cr4    = svm->vcpu.arch.cr4;
	vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu);
	vmcb12->save.rip    = kvm_rip_read(&svm->vcpu);
	vmcb12->save.rsp    = kvm_rsp_read(&svm->vcpu);
	vmcb12->save.rax    = kvm_rax_read(&svm->vcpu);
	vmcb12->save.dr7    = vmcb->save.dr7;
	vmcb12->save.dr6    = svm->vcpu.arch.dr6;
	vmcb12->save.cpl    = vmcb->save.cpl;

	vmcb12->control.int_state         = vmcb->control.int_state;
	vmcb12->control.exit_code         = vmcb->control.exit_code;
	vmcb12->control.exit_code_hi      = vmcb->control.exit_code_hi;
	vmcb12->control.exit_info_1       = vmcb->control.exit_info_1;
	vmcb12->control.exit_info_2       = vmcb->control.exit_info_2;

	if (vmcb12->control.exit_code != SVM_EXIT_ERR)
		nested_vmcb_save_pending_event(svm, vmcb12);

	if (svm->nrips_enabled)
		vmcb12->control.next_rip  = vmcb->control.next_rip;

	vmcb12->control.int_ctl           = svm->nested.ctl.int_ctl;
	vmcb12->control.tlb_ctl           = svm->nested.ctl.tlb_ctl;
	vmcb12->control.event_inj         = svm->nested.ctl.event_inj;
	vmcb12->control.event_inj_err     = svm->nested.ctl.event_inj_err;

	vmcb12->control.pause_filter_count =
		svm->vmcb->control.pause_filter_count;
	vmcb12->control.pause_filter_thresh =
		svm->vmcb->control.pause_filter_thresh;

	/* Restore the original control entries */
	copy_vmcb_control_area(&vmcb->control, &hsave->control);

	/* On vmexit the GIF is set to false */
	svm_set_gif(svm, false);

	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
		svm->vcpu.arch.l1_tsc_offset;

	svm->nested.ctl.nested_cr3 = 0;

	/* Restore selected save entries */
	svm->vmcb->save.es = hsave->save.es;
	svm->vmcb->save.cs = hsave->save.cs;
	svm->vmcb->save.ss = hsave->save.ss;
	svm->vmcb->save.ds = hsave->save.ds;
	svm->vmcb->save.gdtr = hsave->save.gdtr;
	svm->vmcb->save.idtr = hsave->save.idtr;
	kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
	kvm_set_rflags(&svm->vcpu, hsave->save.rflags | X86_EFLAGS_FIXED);
	svm_set_efer(&svm->vcpu, hsave->save.efer);
	svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
	svm_set_cr4(&svm->vcpu, hsave->save.cr4);
	kvm_rax_write(&svm->vcpu, hsave->save.rax);
	kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
	kvm_rip_write(&svm->vcpu, hsave->save.rip);
	svm->vmcb->save.dr7 = DR7_FIXED_1;
	svm->vmcb->save.cpl = 0;
	svm->vmcb->control.exit_int_info = 0;

	vmcb_mark_all_dirty(svm->vmcb);

	trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
				       vmcb12->control.exit_info_1,
				       vmcb12->control.exit_info_2,
				       vmcb12->control.exit_int_info,
				       vmcb12->control.exit_int_info_err,
				       KVM_ISA_SVM);

	kvm_vcpu_unmap(&svm->vcpu, &map, true);

	nested_svm_uninit_mmu_context(&svm->vcpu);

	rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
	if (rc)
		return 1;

	if (npt_enabled)
		svm->vmcb->save.cr3 = hsave->save.cr3;

	/*
	 * Drop what we picked up for L2 via svm_complete_interrupts() so it
	 * doesn't end up in L1.
	 */
	svm->vcpu.arch.nmi_injected = false;
	kvm_clear_exception_queue(&svm->vcpu);
	kvm_clear_interrupt_queue(&svm->vcpu);

	return 0;
}

int svm_allocate_nested(struct vcpu_svm *svm)
{
	struct page *hsave_page;

	if (svm->nested.initialized)
		return 0;

	hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!hsave_page)
		return -ENOMEM;
	svm->nested.hsave = page_address(hsave_page);

	svm->nested.msrpm = svm_vcpu_alloc_msrpm();
	if (!svm->nested.msrpm)
		goto err_free_hsave;
	svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);

	svm->nested.initialized = true;
	return 0;

err_free_hsave:
	__free_page(hsave_page);
	return -ENOMEM;
}

void svm_free_nested(struct vcpu_svm *svm)
{
	if (!svm->nested.initialized)
		return;

	svm_vcpu_free_msrpm(svm->nested.msrpm);
	svm->nested.msrpm = NULL;

	__free_page(virt_to_page(svm->nested.hsave));
	svm->nested.hsave = NULL;

	svm->nested.initialized = false;
}

/*
 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
 */
void svm_leave_nested(struct vcpu_svm *svm)
{
	if (is_guest_mode(&svm->vcpu)) {
		struct vmcb *hsave = svm->nested.hsave;
		struct vmcb *vmcb = svm->vmcb;

		svm->nested.nested_run_pending = 0;
		leave_guest_mode(&svm->vcpu);
		copy_vmcb_control_area(&vmcb->control, &hsave->control);
		nested_svm_uninit_mmu_context(&svm->vcpu);
	}

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
}

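/*
 * Consult L1's MSR permission bitmap to decide whether an MSR access
 * intercepted while running L2 must be forwarded to L1.  Each MSR owns two
 * consecutive bits (read, write): the 32-bit word is svm_msrpm_offset(msr)
 * and the bit within it is 2 * (msr & 0xf) + write.
 */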
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
{
	u32 offset, msr, value;
	int write, mask;

	if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
		return NESTED_EXIT_HOST;

	msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
	offset = svm_msrpm_offset(msr);
	write  = svm->vmcb->control.exit_info_1 & 1;
	mask   = 1 << ((2 * (msr & 0xf)) + write);

	if (offset == MSR_INVALID)
		return NESTED_EXIT_DONE;

	/* Offset is in 32-bit units but we need it in 8-bit (byte) units */
	offset *= 4;

	if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4))
		return NESTED_EXIT_DONE;

	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}

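/*
 * Consult L1's IO permission bitmap to decide whether an IN/OUT intercepted
 * while running L2 must be forwarded to L1.  The bitmap holds one bit per
 * port; an access of 'size' bytes tests 'size' consecutive bits starting at
 * 'port' and may straddle a byte boundary, hence iopm_len of 1 or 2.
 */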
static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
{
	unsigned port, size, iopm_len;
	u16 val, mask;
	u8 start_bit;
	u64 gpa;

	if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
		return NESTED_EXIT_HOST;

	port = svm->vmcb->control.exit_info_1 >> 16;
	size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
		SVM_IOIO_SIZE_SHIFT;
	gpa  = svm->nested.ctl.iopm_base_pa + (port / 8);
	start_bit = port % 8;
	iopm_len = (start_bit + size > 8) ? 2 : 1;
	mask = (0xf >> (4 - size)) << start_bit;
	val = 0;

	if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
		return NESTED_EXIT_DONE;

	return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}

static int nested_svm_intercept(struct vcpu_svm *svm)
{
	u32 exit_code = svm->vmcb->control.exit_code;
	int vmexit = NESTED_EXIT_HOST;

	switch (exit_code) {
	case SVM_EXIT_MSR:
		vmexit = nested_svm_exit_handled_msr(svm);
		break;
	case SVM_EXIT_IOIO:
		vmexit = nested_svm_intercept_ioio(svm);
		break;
	case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
		if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
			vmexit = NESTED_EXIT_DONE;
		break;
	}
	case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
		if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
			vmexit = NESTED_EXIT_DONE;
		break;
	}
	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
		/*
		 * Host-intercepted exceptions have been checked already in
		 * nested_svm_exit_special.  There is nothing to do here,
		 * the vmexit is injected by svm_check_nested_events.
		 */
		vmexit = NESTED_EXIT_DONE;
		break;
	}
	case SVM_EXIT_ERR: {
		vmexit = NESTED_EXIT_DONE;
		break;
	}
	default: {
		if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
			vmexit = NESTED_EXIT_DONE;
	}
	}

	return vmexit;
}

int nested_svm_exit_handled(struct vcpu_svm *svm)
{
	int vmexit;

	vmexit = nested_svm_intercept(svm);

	if (vmexit == NESTED_EXIT_DONE)
		nested_svm_vmexit(svm);

	return vmexit;
}

int nested_svm_check_permissions(struct vcpu_svm *svm)
{
	if (!(svm->vcpu.arch.efer & EFER_SVME) ||
	    !is_paging(&svm->vcpu)) {
		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
		return 1;
	}

	if (svm->vmcb->save.cpl) {
		kvm_inject_gp(&svm->vcpu, 0);
		return 1;
	}

	return 0;
}

static bool nested_exit_on_exception(struct vcpu_svm *svm)
{
	unsigned int nr = svm->vcpu.arch.exception.nr;

	return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(nr));
}

static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
{
	unsigned int nr = svm->vcpu.arch.exception.nr;

	svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
	svm->vmcb->control.exit_code_hi = 0;

	if (svm->vcpu.arch.exception.has_error_code)
		svm->vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code;

	/*
	 * EXITINFO2 is undefined for all exception intercepts other
	 * than #PF.
	 */
	if (nr == PF_VECTOR) {
		if (svm->vcpu.arch.exception.nested_apf)
			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
		else if (svm->vcpu.arch.exception.has_payload)
			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
		else
			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
	} else if (nr == DB_VECTOR) {
		/* See inject_pending_event.  */
		kvm_deliver_exception_payload(&svm->vcpu);
		if (svm->vcpu.arch.dr7 & DR7_GD) {
			svm->vcpu.arch.dr7 &= ~DR7_GD;
			kvm_update_dr7(&svm->vcpu);
		}
	} else
		WARN_ON(svm->vcpu.arch.exception.has_payload);

	nested_svm_vmexit(svm);
}

static void nested_svm_smi(struct vcpu_svm *svm)
{
	svm->vmcb->control.exit_code = SVM_EXIT_SMI;
	svm->vmcb->control.exit_info_1 = 0;
	svm->vmcb->control.exit_info_2 = 0;

	nested_svm_vmexit(svm);
}

static void nested_svm_nmi(struct vcpu_svm *svm)
{
	svm->vmcb->control.exit_code = SVM_EXIT_NMI;
	svm->vmcb->control.exit_info_1 = 0;
	svm->vmcb->control.exit_info_2 = 0;

	nested_svm_vmexit(svm);
}

static void nested_svm_intr(struct vcpu_svm *svm)
{
	trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);

	svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
	svm->vmcb->control.exit_info_1 = 0;
	svm->vmcb->control.exit_info_2 = 0;

	nested_svm_vmexit(svm);
}

static inline bool nested_exit_on_init(struct vcpu_svm *svm)
{
	return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
}

static void nested_svm_init(struct vcpu_svm *svm)
{
	svm->vmcb->control.exit_code   = SVM_EXIT_INIT;
	svm->vmcb->control.exit_info_1 = 0;
	svm->vmcb->control.exit_info_2 = 0;

	nested_svm_vmexit(svm);
}


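/*
 * Check for events that L1 may want to intercept while running L2, in
 * priority order: INIT, exceptions, SMI, NMI, external interrupts.  Returns
 * -EBUSY if the exit cannot be taken yet (nested VMRUN pending or an event
 * is being reinjected).
 */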
static int svm_check_nested_events(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	bool block_nested_events =
		kvm_event_needs_reinjection(vcpu) || svm->nested.nested_run_pending;
	struct kvm_lapic *apic = vcpu->arch.apic;

	if (lapic_in_kernel(vcpu) &&
	    test_bit(KVM_APIC_INIT, &apic->pending_events)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_init(svm))
			return 0;
		nested_svm_init(svm);
		return 0;
	}

	if (vcpu->arch.exception.pending) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_exception(svm))
			return 0;
		nested_svm_inject_exception_vmexit(svm);
		return 0;
	}

	if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_smi(svm))
			return 0;
		nested_svm_smi(svm);
		return 0;
	}

	if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_nmi(svm))
			return 0;
		nested_svm_nmi(svm);
		return 0;
	}

	if (kvm_cpu_has_interrupt(vcpu) && !svm_interrupt_blocked(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_intr(svm))
			return 0;
		nested_svm_intr(svm);
		return 0;
	}

	return 0;
}

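/*
 * Exits that L0 must handle itself regardless of L1's intercepts: physical
 * interrupts, NMIs, nested page faults, exceptions intercepted by the host,
 * and async page faults that L0 needs to see.
 */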
int nested_svm_exit_special(struct vcpu_svm *svm)
{
	u32 exit_code = svm->vmcb->control.exit_code;

	switch (exit_code) {
	case SVM_EXIT_INTR:
	case SVM_EXIT_NMI:
	case SVM_EXIT_NPF:
		return NESTED_EXIT_HOST;
	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
		u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);

		if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] &
				excp_bits)
			return NESTED_EXIT_HOST;
		else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
			 svm->vcpu.arch.apf.host_apf_flags)
			/* Trap async PF even if not shadowing */
			return NESTED_EXIT_HOST;
		break;
	}
	default:
		break;
	}

	return NESTED_EXIT_CONTINUE;
}

static int svm_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_svm *svm;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = KVM_STATE_NESTED_FORMAT_SVM,
		.size = sizeof(kvm_state),
	};
	struct vmcb __user *user_vmcb = (struct vmcb __user *)
		&user_kvm_nested_state->data.svm[0];

	if (!vcpu)
		return kvm_state.size + KVM_STATE_NESTED_SVM_VMCB_SIZE;

	svm = to_svm(vcpu);

	if (user_data_size < kvm_state.size)
		goto out;

	/* First fill in the header and copy it out.  */
	if (is_guest_mode(vcpu)) {
		kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa;
		kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
		kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

		if (svm->nested.nested_run_pending)
			kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
	}

	if (gif_set(svm))
		kvm_state.flags |= KVM_STATE_NESTED_GIF_SET;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!is_guest_mode(vcpu))
		goto out;

	/*
	 * Copy over the full size of the VMCB rather than just the size
	 * of the structs.
	 */
	if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE))
		return -EFAULT;
	if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
			 sizeof(user_vmcb->control)))
		return -EFAULT;
	if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
			 sizeof(user_vmcb->save)))
		return -EFAULT;

out:
	return kvm_state.size;
}

static int svm_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *hsave = svm->nested.hsave;
	struct vmcb __user *user_vmcb = (struct vmcb __user *)
		&user_kvm_nested_state->data.svm[0];
	struct vmcb_control_area *ctl;
	struct vmcb_save_area *save;
	int ret;
	u32 cr0;

	BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) >
		     KVM_STATE_NESTED_SVM_VMCB_SIZE);

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)
		return -EINVAL;

	if (kvm_state->flags & ~(KVM_STATE_NESTED_GUEST_MODE |
				 KVM_STATE_NESTED_RUN_PENDING |
				 KVM_STATE_NESTED_GIF_SET))
		return -EINVAL;

	/*
	 * If in guest mode, vcpu->arch.efer actually refers to the L2 guest's
	 * EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed.
	 */
	if (!(vcpu->arch.efer & EFER_SVME)) {
		/* GIF=1 and no guest mode are required if SVME=0.  */
		if (kvm_state->flags != KVM_STATE_NESTED_GIF_SET)
			return -EINVAL;
	}

	/* SMM temporarily disables SVM, so we cannot be in guest mode.  */
	if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
		svm_leave_nested(svm);
		svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
		return 0;
	}

	if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa))
		return -EINVAL;
	if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)
		return -EINVAL;

	ret  = -ENOMEM;
	ctl  = kzalloc(sizeof(*ctl),  GFP_KERNEL);
	save = kzalloc(sizeof(*save), GFP_KERNEL);
	if (!ctl || !save)
		goto out_free;

	ret = -EFAULT;
	if (copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl)))
		goto out_free;
	if (copy_from_user(save, &user_vmcb->save, sizeof(*save)))
		goto out_free;

	ret = -EINVAL;
	if (!nested_vmcb_check_controls(ctl))
		goto out_free;

	/*
	 * Processor state contains L2 state.  Check that it is
	 * valid for guest mode (see nested_vmcb_checks).
	 */
	cr0 = kvm_read_cr0(vcpu);
	if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
		goto out_free;

	/*
	 * Validate host state saved from before VMRUN (see
	 * nested_svm_check_permissions).
	 * TODO: validate reserved bits for all saved state.
	 */
	if (!(save->cr0 & X86_CR0_PG))
		goto out_free;

	/*
	 * All checks done, we can enter guest mode.  L1 control fields
	 * come from the nested save state.  Guest state is already
	 * in the registers, the save area of the nested state instead
	 * contains saved L1 state.
	 */
	copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
	hsave->save = *save;

	svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
	load_nested_vmcb_control(svm, ctl);
	nested_prepare_vmcb_control(svm);

	kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
	ret = 0;
out_free:
	kfree(save);
	kfree(ctl);

	return ret;
}

struct kvm_x86_nested_ops svm_nested_ops = {
	.check_events = svm_check_nested_events,
	.get_nested_state_pages = svm_get_nested_state_pages,
	.get_state = svm_get_nested_state,
	.set_state = svm_set_nested_state,
};