nested.c 32.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * AMD SVM support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#define pr_fmt(fmt) "SVM: " fmt

#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
#include <linux/kernel.h>

#include <asm/msr-index.h>
22
#include <asm/debugreg.h>
23 24 25 26 27

#include "kvm_emulate.h"
#include "trace.h"
#include "mmu.h"
#include "x86.h"
28
#include "cpuid.h"
29
#include "lapic.h"
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
#include "svm.h"

static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
				       struct x86_exception *fault)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
		/*
		 * TODO: track the cause of the nested page fault, and
		 * correctly fill in the high bits of exit_info_1.
		 */
		svm->vmcb->control.exit_code = SVM_EXIT_NPF;
		svm->vmcb->control.exit_code_hi = 0;
		svm->vmcb->control.exit_info_1 = (1ULL << 32);
		svm->vmcb->control.exit_info_2 = fault->address;
	}

	svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
	svm->vmcb->control.exit_info_1 |= fault->error_code;

	nested_svm_vmexit(svm);
}

static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
{
	struct vcpu_svm *svm = to_svm(vcpu);
57
	u64 cr3 = svm->nested.ctl.nested_cr3;
58 59 60 61 62 63 64 65 66 67 68 69 70 71
	u64 pdpte;
	int ret;

	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
				       offset_in_page(cr3) + index * 8, 8);
	if (ret)
		return 0;
	return pdpte;
}

static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

72
	return svm->nested.ctl.nested_cr3;
73 74 75 76
}

static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
{
77 78 79
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *hsave = svm->nested.hsave;

80 81 82
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
83 84
	kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer,
				svm->nested.ctl.nested_cr3);
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
	vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
	vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
	vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
	reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu);
	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
}

static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

void recalc_intercepts(struct vcpu_svm *svm)
{
100
	struct vmcb_control_area *c, *h, *g;
101
	unsigned int i;
102

103
	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
104 105 106 107 108 109

	if (!is_guest_mode(&svm->vcpu))
		return;

	c = &svm->vmcb->control;
	h = &svm->nested.hsave->control;
110
	g = &svm->nested.ctl;
111

112 113 114
	for (i = 0; i < MAX_INTERCEPT; i++)
		c->intercepts[i] = h->intercepts[i];

P
Paolo Bonzini 已提交
115
	if (g->int_ctl & V_INTR_MASKING_MASK) {
116
		/* We only want the cr8 intercept bits of L1 */
117 118
		vmcb_clr_intercept(c, INTERCEPT_CR8_READ);
		vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
119 120 121 122 123 124

		/*
		 * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
		 * affect any interrupt we may want to inject; therefore,
		 * interrupt window vmexits are irrelevant to L0.
		 */
125
		vmcb_clr_intercept(c, INTERCEPT_VINTR);
126 127 128
	}

	/* We don't want to see VMMCALLs from a nested guest */
129
	vmcb_clr_intercept(c, INTERCEPT_VMMCALL);
130

131 132
	for (i = 0; i < MAX_INTERCEPT; i++)
		c->intercepts[i] |= g->intercepts[i];
133 134
}

135 136
static void copy_vmcb_control_area(struct vmcb_control_area *dst,
				   struct vmcb_control_area *from)
137
{
138 139 140 141 142
	unsigned int i;

	for (i = 0; i < MAX_INTERCEPT; i++)
		dst->intercepts[i] = from->intercepts[i];

143 144 145
	dst->iopm_base_pa         = from->iopm_base_pa;
	dst->msrpm_base_pa        = from->msrpm_base_pa;
	dst->tsc_offset           = from->tsc_offset;
146
	/* asid not copied, it is handled manually for svm->vmcb.  */
147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
	dst->tlb_ctl              = from->tlb_ctl;
	dst->int_ctl              = from->int_ctl;
	dst->int_vector           = from->int_vector;
	dst->int_state            = from->int_state;
	dst->exit_code            = from->exit_code;
	dst->exit_code_hi         = from->exit_code_hi;
	dst->exit_info_1          = from->exit_info_1;
	dst->exit_info_2          = from->exit_info_2;
	dst->exit_int_info        = from->exit_int_info;
	dst->exit_int_info_err    = from->exit_int_info_err;
	dst->nested_ctl           = from->nested_ctl;
	dst->event_inj            = from->event_inj;
	dst->event_inj_err        = from->event_inj_err;
	dst->nested_cr3           = from->nested_cr3;
	dst->virt_ext              = from->virt_ext;
	dst->pause_filter_count   = from->pause_filter_count;
	dst->pause_filter_thresh  = from->pause_filter_thresh;
}

static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
	/*
	 * This function merges the msr permission bitmaps of kvm and the
	 * nested vmcb. It is optimized in that it only merges the parts where
	 * the kvm msr permission bitmap may contain zero bits
	 */
	int i;

175
	if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
176 177 178 179 180 181 182 183 184 185
		return true;

	for (i = 0; i < MSRPM_OFFSETS; i++) {
		u32 value, p;
		u64 offset;

		if (msrpm_offsets[i] == 0xffffffff)
			break;

		p      = msrpm_offsets[i];
186
		offset = svm->nested.ctl.msrpm_base_pa + (p * 4);
187 188 189 190 191 192 193 194 195 196 197 198

		if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
			return false;

		svm->nested.msrpm[p] = svm->msrpm[p] | value;
	}

	svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));

	return true;
}

199 200 201
static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
202

203 204 205
	if (WARN_ON(!is_guest_mode(vcpu)))
		return true;

206 207 208 209 210 211 212 213 214 215 216
	if (!nested_svm_vmrun_msrpm(svm)) {
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror =
			KVM_INTERNAL_ERROR_EMULATION;
		vcpu->run->internal.ndata = 0;
		return false;
	}

	return true;
}

217
static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
218
{
219
	if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0)
220 221
		return false;

222
	if (control->asid == 0)
223 224
		return false;

225 226
	if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
	    !npt_enabled)
227 228
		return false;

229 230 231
	return true;
}

232
static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
233
{
234
	struct kvm_vcpu *vcpu = &svm->vcpu;
235 236 237
	bool vmcb12_lma;

	if ((vmcb12->save.efer & EFER_SVME) == 0)
238 239
		return false;

240
	if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW))
241 242
		return false;

243
	if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7))
244 245
		return false;

246
	vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG);
247

248
	if (vmcb12_lma) {
249 250
		if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
		    !(vmcb12->save.cr0 & X86_CR0_PE) ||
251
		    (vmcb12->save.cr3 & vcpu->arch.cr3_lm_rsvd_bits))
252 253
			return false;
	}
254
	if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
255 256
		return false;

257
	return nested_vmcb_check_controls(&vmcb12->control);
258 259
}

260 261 262
static void load_nested_vmcb_control(struct vcpu_svm *svm,
				     struct vmcb_control_area *control)
{
263
	copy_vmcb_control_area(&svm->nested.ctl, control);
264

265 266
	/* Copy it here because nested_svm_check_controls will check it.  */
	svm->nested.ctl.asid           = control->asid;
267 268
	svm->nested.ctl.msrpm_base_pa &= ~0x0fffULL;
	svm->nested.ctl.iopm_base_pa  &= ~0x0fffULL;
269 270
}

271 272 273 274 275 276 277 278 279 280 281 282 283
/*
 * Synchronize fields that are written by the processor, so that
 * they can be copied back into the nested_vmcb.
 */
void sync_nested_vmcb_control(struct vcpu_svm *svm)
{
	u32 mask;
	svm->nested.ctl.event_inj      = svm->vmcb->control.event_inj;
	svm->nested.ctl.event_inj_err  = svm->vmcb->control.event_inj_err;

	/* Only a few fields of int_ctl are written by the processor.  */
	mask = V_IRQ_MASK | V_TPR_MASK;
	if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) &&
284
	    svm_is_intercept(svm, INTERCEPT_VINTR)) {
285 286 287 288 289 290 291 292 293 294 295 296 297 298
		/*
		 * In order to request an interrupt window, L0 is usurping
		 * svm->vmcb->control.int_ctl and possibly setting V_IRQ
		 * even if it was clear in L1's VMCB.  Restoring it would be
		 * wrong.  However, in this case V_IRQ will remain true until
		 * interrupt_window_interception calls svm_clear_vintr and
		 * restores int_ctl.  We can just leave it aside.
		 */
		mask &= ~V_IRQ_MASK;
	}
	svm->nested.ctl.int_ctl        &= ~mask;
	svm->nested.ctl.int_ctl        |= svm->vmcb->control.int_ctl & mask;
}

299 300 301 302 303
/*
 * Transfer any event that L0 or L1 wanted to inject into L2 to
 * EXIT_INT_INFO.
 */
static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
304
					   struct vmcb *vmcb12)
305 306 307 308 309 310 311 312 313 314 315
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	u32 exit_int_info = 0;
	unsigned int nr;

	if (vcpu->arch.exception.injected) {
		nr = vcpu->arch.exception.nr;
		exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT;

		if (vcpu->arch.exception.has_error_code) {
			exit_int_info |= SVM_EVTINJ_VALID_ERR;
316
			vmcb12->control.exit_int_info_err =
317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332
				vcpu->arch.exception.error_code;
		}

	} else if (vcpu->arch.nmi_injected) {
		exit_int_info = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;

	} else if (vcpu->arch.interrupt.injected) {
		nr = vcpu->arch.interrupt.nr;
		exit_int_info = nr | SVM_EVTINJ_VALID;

		if (vcpu->arch.interrupt.soft)
			exit_int_info |= SVM_EVTINJ_TYPE_SOFT;
		else
			exit_int_info |= SVM_EVTINJ_TYPE_INTR;
	}

333
	vmcb12->control.exit_int_info = exit_int_info;
334 335
}

336 337 338 339 340 341
static inline bool nested_npt_enabled(struct vcpu_svm *svm)
{
	return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
}

/*
342 343
 * Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true
 * if we are emulating VM-Entry into a guest with NPT enabled.
344 345 346 347
 */
static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_npt)
{
348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369
	if (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63))
		return -EINVAL;

	if (!nested_npt && is_pae_paging(vcpu) &&
	    (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
		if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
			return -EINVAL;
	}

	/*
	 * TODO: optimize unconditional TLB flush/MMU sync here and in
	 * kvm_init_shadow_npt_mmu().
	 */
	if (!nested_npt)
		kvm_mmu_new_pgd(vcpu, cr3, false, false);

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);

	kvm_init_mmu(vcpu, false);

	return 0;
370 371
}

372
static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
373 374
{
	/* Load the nested guest state */
375 376 377 378 379 380
	svm->vmcb->save.es = vmcb12->save.es;
	svm->vmcb->save.cs = vmcb12->save.cs;
	svm->vmcb->save.ss = vmcb12->save.ss;
	svm->vmcb->save.ds = vmcb12->save.ds;
	svm->vmcb->save.gdtr = vmcb12->save.gdtr;
	svm->vmcb->save.idtr = vmcb12->save.idtr;
P
Paolo Bonzini 已提交
381
	kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
382 383 384 385 386 387 388
	svm_set_efer(&svm->vcpu, vmcb12->save.efer);
	svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
	svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
	svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;
	kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
	kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
	kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
389 390

	/* In case we don't even reach vcpu_run, the fields are not updated */
391 392 393
	svm->vmcb->save.rax = vmcb12->save.rax;
	svm->vmcb->save.rsp = vmcb12->save.rsp;
	svm->vmcb->save.rip = vmcb12->save.rip;
P
Paolo Bonzini 已提交
394
	svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
395
	svm->vcpu.arch.dr6  = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
396
	svm->vmcb->save.cpl = vmcb12->save.cpl;
397
}
398

399
static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
400
{
401
	const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
402 403

	if (nested_npt_enabled(svm))
404 405
		nested_svm_init_mmu_context(&svm->vcpu);

406
	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
407
		svm->vcpu.arch.l1_tsc_offset + svm->nested.ctl.tsc_offset;
408

409 410 411 412
	svm->vmcb->control.int_ctl             =
		(svm->nested.ctl.int_ctl & ~mask) |
		(svm->nested.hsave->control.int_ctl & mask);

413 414 415 416 417
	svm->vmcb->control.virt_ext            = svm->nested.ctl.virt_ext;
	svm->vmcb->control.int_vector          = svm->nested.ctl.int_vector;
	svm->vmcb->control.int_state           = svm->nested.ctl.int_state;
	svm->vmcb->control.event_inj           = svm->nested.ctl.event_inj;
	svm->vmcb->control.event_inj_err       = svm->nested.ctl.event_inj_err;
418

419 420
	svm->vmcb->control.pause_filter_count  = svm->nested.ctl.pause_filter_count;
	svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh;
421 422 423 424 425 426 427 428 429 430

	/* Enter Guest-Mode */
	enter_guest_mode(&svm->vcpu);

	/*
	 * Merge guest and host intercepts - must be called  with vcpu in
	 * guest-mode to take affect here
	 */
	recalc_intercepts(svm);

431
	vmcb_mark_all_dirty(svm->vmcb);
432 433
}

434 435
int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
			 struct vmcb *vmcb12)
436
{
437 438
	int ret;

439 440 441
	svm->nested.vmcb12_gpa = vmcb12_gpa;
	load_nested_vmcb_control(svm, &vmcb12->control);
	nested_prepare_vmcb_save(svm, vmcb12);
442
	nested_prepare_vmcb_control(svm);
443

444
	ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
445 446 447 448
				  nested_npt_enabled(svm));
	if (ret)
		return ret;

P
Paolo Bonzini 已提交
449
	svm_set_gif(svm, true);
450 451

	return 0;
452 453 454 455 456
}

int nested_svm_vmrun(struct vcpu_svm *svm)
{
	int ret;
457
	struct vmcb *vmcb12;
458 459 460
	struct vmcb *hsave = svm->nested.hsave;
	struct vmcb *vmcb = svm->vmcb;
	struct kvm_host_map map;
461
	u64 vmcb12_gpa;
462

463 464 465 466
	if (is_smm(&svm->vcpu)) {
		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
		return 1;
	}
467

468 469
	vmcb12_gpa = svm->vmcb->save.rax;
	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map);
470 471 472 473 474 475 476 477 478
	if (ret == -EINVAL) {
		kvm_inject_gp(&svm->vcpu, 0);
		return 1;
	} else if (ret) {
		return kvm_skip_emulated_instruction(&svm->vcpu);
	}

	ret = kvm_skip_emulated_instruction(&svm->vcpu);

479
	vmcb12 = map.hva;
480

481 482 483
	if (WARN_ON_ONCE(!svm->nested.initialized))
		return -EINVAL;

484 485 486 487 488
	if (!nested_vmcb_checks(svm, vmcb12)) {
		vmcb12->control.exit_code    = SVM_EXIT_ERR;
		vmcb12->control.exit_code_hi = 0;
		vmcb12->control.exit_info_1  = 0;
		vmcb12->control.exit_info_2  = 0;
489
		goto out;
490 491
	}

492 493 494 495 496
	trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
			       vmcb12->save.rip,
			       vmcb12->control.int_ctl,
			       vmcb12->control.event_inj,
			       vmcb12->control.nested_ctl);
497

498 499
	trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
				    vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
500
				    vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
501
				    vmcb12->control.intercepts[INTERCEPT_WORD3],
502 503
				    vmcb12->control.intercepts[INTERCEPT_WORD4],
				    vmcb12->control.intercepts[INTERCEPT_WORD5]);
504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530

	/* Clear internal status */
	kvm_clear_exception_queue(&svm->vcpu);
	kvm_clear_interrupt_queue(&svm->vcpu);

	/*
	 * Save the old vmcb, so we don't need to pick what we save, but can
	 * restore everything when a VMEXIT occurs
	 */
	hsave->save.es     = vmcb->save.es;
	hsave->save.cs     = vmcb->save.cs;
	hsave->save.ss     = vmcb->save.ss;
	hsave->save.ds     = vmcb->save.ds;
	hsave->save.gdtr   = vmcb->save.gdtr;
	hsave->save.idtr   = vmcb->save.idtr;
	hsave->save.efer   = svm->vcpu.arch.efer;
	hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
	hsave->save.cr4    = svm->vcpu.arch.cr4;
	hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
	hsave->save.rip    = kvm_rip_read(&svm->vcpu);
	hsave->save.rsp    = vmcb->save.rsp;
	hsave->save.rax    = vmcb->save.rax;
	if (npt_enabled)
		hsave->save.cr3    = vmcb->save.cr3;
	else
		hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);

531
	copy_vmcb_control_area(&hsave->control, &vmcb->control);
532

533
	svm->nested.nested_run_pending = 1;
534

535
	if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12))
536
		goto out_exit_err;
537

538 539
	if (nested_svm_vmrun_msrpm(svm))
		goto out;
540

541 542 543 544 545 546 547 548 549
out_exit_err:
	svm->nested.nested_run_pending = 0;

	svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
	svm->vmcb->control.exit_code_hi = 0;
	svm->vmcb->control.exit_info_1  = 0;
	svm->vmcb->control.exit_info_2  = 0;

	nested_svm_vmexit(svm);
550

551 552 553
out:
	kvm_vcpu_unmap(&svm->vcpu, &map, true);

554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575
	return ret;
}

void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
{
	to_vmcb->save.fs = from_vmcb->save.fs;
	to_vmcb->save.gs = from_vmcb->save.gs;
	to_vmcb->save.tr = from_vmcb->save.tr;
	to_vmcb->save.ldtr = from_vmcb->save.ldtr;
	to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
	to_vmcb->save.star = from_vmcb->save.star;
	to_vmcb->save.lstar = from_vmcb->save.lstar;
	to_vmcb->save.cstar = from_vmcb->save.cstar;
	to_vmcb->save.sfmask = from_vmcb->save.sfmask;
	to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
	to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
	to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
}

int nested_svm_vmexit(struct vcpu_svm *svm)
{
	int rc;
576
	struct vmcb *vmcb12;
577 578 579 580
	struct vmcb *hsave = svm->nested.hsave;
	struct vmcb *vmcb = svm->vmcb;
	struct kvm_host_map map;

581
	rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
582 583 584 585 586 587
	if (rc) {
		if (rc == -EINVAL)
			kvm_inject_gp(&svm->vcpu, 0);
		return 1;
	}

588
	vmcb12 = map.hva;
589 590 591

	/* Exit Guest-Mode */
	leave_guest_mode(&svm->vcpu);
592
	svm->nested.vmcb12_gpa = 0;
593
	WARN_ON_ONCE(svm->nested.nested_run_pending);
594

595 596
	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);

597 598 599
	/* in case we halted in L2 */
	svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;

600 601
	/* Give the current vmcb to the guest */

602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628
	vmcb12->save.es     = vmcb->save.es;
	vmcb12->save.cs     = vmcb->save.cs;
	vmcb12->save.ss     = vmcb->save.ss;
	vmcb12->save.ds     = vmcb->save.ds;
	vmcb12->save.gdtr   = vmcb->save.gdtr;
	vmcb12->save.idtr   = vmcb->save.idtr;
	vmcb12->save.efer   = svm->vcpu.arch.efer;
	vmcb12->save.cr0    = kvm_read_cr0(&svm->vcpu);
	vmcb12->save.cr3    = kvm_read_cr3(&svm->vcpu);
	vmcb12->save.cr2    = vmcb->save.cr2;
	vmcb12->save.cr4    = svm->vcpu.arch.cr4;
	vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu);
	vmcb12->save.rip    = kvm_rip_read(&svm->vcpu);
	vmcb12->save.rsp    = kvm_rsp_read(&svm->vcpu);
	vmcb12->save.rax    = kvm_rax_read(&svm->vcpu);
	vmcb12->save.dr7    = vmcb->save.dr7;
	vmcb12->save.dr6    = svm->vcpu.arch.dr6;
	vmcb12->save.cpl    = vmcb->save.cpl;

	vmcb12->control.int_state         = vmcb->control.int_state;
	vmcb12->control.exit_code         = vmcb->control.exit_code;
	vmcb12->control.exit_code_hi      = vmcb->control.exit_code_hi;
	vmcb12->control.exit_info_1       = vmcb->control.exit_info_1;
	vmcb12->control.exit_info_2       = vmcb->control.exit_info_2;

	if (vmcb12->control.exit_code != SVM_EXIT_ERR)
		nested_vmcb_save_pending_event(svm, vmcb12);
629 630

	if (svm->nrips_enabled)
631
		vmcb12->control.next_rip  = vmcb->control.next_rip;
632

633 634 635 636
	vmcb12->control.int_ctl           = svm->nested.ctl.int_ctl;
	vmcb12->control.tlb_ctl           = svm->nested.ctl.tlb_ctl;
	vmcb12->control.event_inj         = svm->nested.ctl.event_inj;
	vmcb12->control.event_inj_err     = svm->nested.ctl.event_inj_err;
637

638
	vmcb12->control.pause_filter_count =
639
		svm->vmcb->control.pause_filter_count;
640
	vmcb12->control.pause_filter_thresh =
641 642 643
		svm->vmcb->control.pause_filter_thresh;

	/* Restore the original control entries */
644
	copy_vmcb_control_area(&vmcb->control, &hsave->control);
645

646 647 648
	/* On vmexit the  GIF is set to false */
	svm_set_gif(svm, false);

649 650 651
	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
		svm->vcpu.arch.l1_tsc_offset;

652
	svm->nested.ctl.nested_cr3 = 0;
653 654 655 656 657 658 659 660 661

	/* Restore selected save entries */
	svm->vmcb->save.es = hsave->save.es;
	svm->vmcb->save.cs = hsave->save.cs;
	svm->vmcb->save.ss = hsave->save.ss;
	svm->vmcb->save.ds = hsave->save.ds;
	svm->vmcb->save.gdtr = hsave->save.gdtr;
	svm->vmcb->save.idtr = hsave->save.idtr;
	kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
P
Paolo Bonzini 已提交
662
	kvm_set_rflags(&svm->vcpu, hsave->save.rflags | X86_EFLAGS_FIXED);
663 664 665 666 667 668
	svm_set_efer(&svm->vcpu, hsave->save.efer);
	svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
	svm_set_cr4(&svm->vcpu, hsave->save.cr4);
	kvm_rax_write(&svm->vcpu, hsave->save.rax);
	kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
	kvm_rip_write(&svm->vcpu, hsave->save.rip);
P
Paolo Bonzini 已提交
669
	svm->vmcb->save.dr7 = DR7_FIXED_1;
670 671 672
	svm->vmcb->save.cpl = 0;
	svm->vmcb->control.exit_int_info = 0;

673
	vmcb_mark_all_dirty(svm->vmcb);
674

675 676 677 678 679
	trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
				       vmcb12->control.exit_info_1,
				       vmcb12->control.exit_info_2,
				       vmcb12->control.exit_int_info,
				       vmcb12->control.exit_int_info_err,
680 681
				       KVM_ISA_SVM);

682 683 684
	kvm_vcpu_unmap(&svm->vcpu, &map, true);

	nested_svm_uninit_mmu_context(&svm->vcpu);
685

686 687 688
	rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
	if (rc)
		return 1;
689

690 691
	if (npt_enabled)
		svm->vmcb->save.cr3 = hsave->save.cr3;
692 693 694 695 696 697 698 699 700 701 702 703

	/*
	 * Drop what we picked up for L2 via svm_complete_interrupts() so it
	 * doesn't end up in L1.
	 */
	svm->vcpu.arch.nmi_injected = false;
	kvm_clear_exception_queue(&svm->vcpu);
	kvm_clear_interrupt_queue(&svm->vcpu);

	return 0;
}

704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742
int svm_allocate_nested(struct vcpu_svm *svm)
{
	struct page *hsave_page;

	if (svm->nested.initialized)
		return 0;

	hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!hsave_page)
		return -ENOMEM;
	svm->nested.hsave = page_address(hsave_page);

	svm->nested.msrpm = svm_vcpu_alloc_msrpm();
	if (!svm->nested.msrpm)
		goto err_free_hsave;
	svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);

	svm->nested.initialized = true;
	return 0;

err_free_hsave:
	__free_page(hsave_page);
	return -ENOMEM;
}

void svm_free_nested(struct vcpu_svm *svm)
{
	if (!svm->nested.initialized)
		return;

	svm_vcpu_free_msrpm(svm->nested.msrpm);
	svm->nested.msrpm = NULL;

	__free_page(virt_to_page(svm->nested.hsave));
	svm->nested.hsave = NULL;

	svm->nested.initialized = false;
}

743 744 745 746 747 748 749 750 751 752 753 754 755
/*
 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
 */
void svm_leave_nested(struct vcpu_svm *svm)
{
	if (is_guest_mode(&svm->vcpu)) {
		struct vmcb *hsave = svm->nested.hsave;
		struct vmcb *vmcb = svm->vmcb;

		svm->nested.nested_run_pending = 0;
		leave_guest_mode(&svm->vcpu);
		copy_vmcb_control_area(&vmcb->control, &hsave->control);
		nested_svm_uninit_mmu_context(&svm->vcpu);
756
		vmcb_mark_all_dirty(svm->vmcb);
757
	}
758 759

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
760 761
}

762 763 764 765 766
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
{
	u32 offset, msr, value;
	int write, mask;

767
	if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
768 769 770 771 772 773 774 775 776 777 778 779 780
		return NESTED_EXIT_HOST;

	msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
	offset = svm_msrpm_offset(msr);
	write  = svm->vmcb->control.exit_info_1 & 1;
	mask   = 1 << ((2 * (msr & 0xf)) + write);

	if (offset == MSR_INVALID)
		return NESTED_EXIT_DONE;

	/* Offset is in 32 bit units but need in 8 bit units */
	offset *= 4;

781
	if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4))
782 783 784 785 786 787 788 789 790 791 792 793
		return NESTED_EXIT_DONE;

	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}

static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
{
	unsigned port, size, iopm_len;
	u16 val, mask;
	u8 start_bit;
	u64 gpa;

794
	if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
795 796 797 798 799
		return NESTED_EXIT_HOST;

	port = svm->vmcb->control.exit_info_1 >> 16;
	size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
		SVM_IOIO_SIZE_SHIFT;
800
	gpa  = svm->nested.ctl.iopm_base_pa + (port / 8);
801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824
	start_bit = port % 8;
	iopm_len = (start_bit + size > 8) ? 2 : 1;
	mask = (0xf >> (4 - size)) << start_bit;
	val = 0;

	if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
		return NESTED_EXIT_DONE;

	return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}

static int nested_svm_intercept(struct vcpu_svm *svm)
{
	u32 exit_code = svm->vmcb->control.exit_code;
	int vmexit = NESTED_EXIT_HOST;

	switch (exit_code) {
	case SVM_EXIT_MSR:
		vmexit = nested_svm_exit_handled_msr(svm);
		break;
	case SVM_EXIT_IOIO:
		vmexit = nested_svm_intercept_ioio(svm);
		break;
	case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
825
		if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
826 827 828 829
			vmexit = NESTED_EXIT_DONE;
		break;
	}
	case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
830
		if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
831 832 833 834
			vmexit = NESTED_EXIT_DONE;
		break;
	}
	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
835 836 837 838 839 840
		/*
		 * Host-intercepted exceptions have been checked already in
		 * nested_svm_exit_special.  There is nothing to do here,
		 * the vmexit is injected by svm_check_nested_events.
		 */
		vmexit = NESTED_EXIT_DONE;
841 842 843 844 845 846 847
		break;
	}
	case SVM_EXIT_ERR: {
		vmexit = NESTED_EXIT_DONE;
		break;
	}
	default: {
848
		if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883
			vmexit = NESTED_EXIT_DONE;
	}
	}

	return vmexit;
}

int nested_svm_exit_handled(struct vcpu_svm *svm)
{
	int vmexit;

	vmexit = nested_svm_intercept(svm);

	if (vmexit == NESTED_EXIT_DONE)
		nested_svm_vmexit(svm);

	return vmexit;
}

int nested_svm_check_permissions(struct vcpu_svm *svm)
{
	if (!(svm->vcpu.arch.efer & EFER_SVME) ||
	    !is_paging(&svm->vcpu)) {
		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
		return 1;
	}

	if (svm->vmcb->save.cpl) {
		kvm_inject_gp(&svm->vcpu, 0);
		return 1;
	}

	return 0;
}

884
static bool nested_exit_on_exception(struct vcpu_svm *svm)
885
{
886
	unsigned int nr = svm->vcpu.arch.exception.nr;
887

888
	return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(nr));
889
}
890

891 892 893
static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
{
	unsigned int nr = svm->vcpu.arch.exception.nr;
894 895 896

	svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
	svm->vmcb->control.exit_code_hi = 0;
897 898 899

	if (svm->vcpu.arch.exception.has_error_code)
		svm->vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code;
900 901 902 903 904

	/*
	 * EXITINFO2 is undefined for all exception intercepts other
	 * than #PF.
	 */
905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920
	if (nr == PF_VECTOR) {
		if (svm->vcpu.arch.exception.nested_apf)
			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
		else if (svm->vcpu.arch.exception.has_payload)
			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
		else
			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
	} else if (nr == DB_VECTOR) {
		/* See inject_pending_event.  */
		kvm_deliver_exception_payload(&svm->vcpu);
		if (svm->vcpu.arch.dr7 & DR7_GD) {
			svm->vcpu.arch.dr7 &= ~DR7_GD;
			kvm_update_dr7(&svm->vcpu);
		}
	} else
		WARN_ON(svm->vcpu.arch.exception.has_payload);
921

922
	nested_svm_vmexit(svm);
923 924
}

925 926 927 928 929 930 931 932 933
static void nested_svm_smi(struct vcpu_svm *svm)
{
	svm->vmcb->control.exit_code = SVM_EXIT_SMI;
	svm->vmcb->control.exit_info_1 = 0;
	svm->vmcb->control.exit_info_2 = 0;

	nested_svm_vmexit(svm);
}

934 935 936 937 938 939 940 941 942
static void nested_svm_nmi(struct vcpu_svm *svm)
{
	svm->vmcb->control.exit_code = SVM_EXIT_NMI;
	svm->vmcb->control.exit_info_1 = 0;
	svm->vmcb->control.exit_info_2 = 0;

	nested_svm_vmexit(svm);
}

943 944
static void nested_svm_intr(struct vcpu_svm *svm)
{
945 946
	trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);

947 948 949 950
	svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
	svm->vmcb->control.exit_info_1 = 0;
	svm->vmcb->control.exit_info_2 = 0;

951
	nested_svm_vmexit(svm);
952 953
}

954 955
static inline bool nested_exit_on_init(struct vcpu_svm *svm)
{
956
	return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
957 958 959 960 961 962 963 964 965 966 967 968
}

static void nested_svm_init(struct vcpu_svm *svm)
{
	svm->vmcb->control.exit_code   = SVM_EXIT_INIT;
	svm->vmcb->control.exit_info_1 = 0;
	svm->vmcb->control.exit_info_2 = 0;

	nested_svm_vmexit(svm);
}


969
static int svm_check_nested_events(struct kvm_vcpu *vcpu)
970 971 972
{
	struct vcpu_svm *svm = to_svm(vcpu);
	bool block_nested_events =
P
Paolo Bonzini 已提交
973
		kvm_event_needs_reinjection(vcpu) || svm->nested.nested_run_pending;
974 975 976 977 978 979 980 981 982 983 984
	struct kvm_lapic *apic = vcpu->arch.apic;

	if (lapic_in_kernel(vcpu) &&
	    test_bit(KVM_APIC_INIT, &apic->pending_events)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_init(svm))
			return 0;
		nested_svm_init(svm);
		return 0;
	}
985

986 987 988 989 990 991 992 993 994
	if (vcpu->arch.exception.pending) {
		if (block_nested_events)
                        return -EBUSY;
		if (!nested_exit_on_exception(svm))
			return 0;
		nested_svm_inject_exception_vmexit(svm);
		return 0;
	}

995
	if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
996 997
		if (block_nested_events)
			return -EBUSY;
998 999
		if (!nested_exit_on_smi(svm))
			return 0;
1000 1001 1002 1003
		nested_svm_smi(svm);
		return 0;
	}

1004
	if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) {
1005 1006
		if (block_nested_events)
			return -EBUSY;
1007 1008
		if (!nested_exit_on_nmi(svm))
			return 0;
1009 1010 1011 1012
		nested_svm_nmi(svm);
		return 0;
	}

1013
	if (kvm_cpu_has_interrupt(vcpu) && !svm_interrupt_blocked(vcpu)) {
1014 1015
		if (block_nested_events)
			return -EBUSY;
1016 1017
		if (!nested_exit_on_intr(svm))
			return 0;
1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032
		nested_svm_intr(svm);
		return 0;
	}

	return 0;
}

int nested_svm_exit_special(struct vcpu_svm *svm)
{
	u32 exit_code = svm->vmcb->control.exit_code;

	switch (exit_code) {
	case SVM_EXIT_INTR:
	case SVM_EXIT_NMI:
	case SVM_EXIT_NPF:
1033 1034 1035 1036
		return NESTED_EXIT_HOST;
	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
		u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);

1037 1038
		if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] &
				excp_bits)
1039
			return NESTED_EXIT_HOST;
1040
		else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
1041
			 svm->vcpu.arch.apf.host_apf_flags)
1042
			/* Trap async PF even if not shadowing */
1043 1044
			return NESTED_EXIT_HOST;
		break;
1045
	}
1046 1047 1048 1049 1050 1051
	default:
		break;
	}

	return NESTED_EXIT_CONTINUE;
}
1052

1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075
static int svm_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_svm *svm;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = KVM_STATE_NESTED_FORMAT_SVM,
		.size = sizeof(kvm_state),
	};
	struct vmcb __user *user_vmcb = (struct vmcb __user *)
		&user_kvm_nested_state->data.svm[0];

	if (!vcpu)
		return kvm_state.size + KVM_STATE_NESTED_SVM_VMCB_SIZE;

	svm = to_svm(vcpu);

	if (user_data_size < kvm_state.size)
		goto out;

	/* First fill in the header and copy it out.  */
	if (is_guest_mode(vcpu)) {
1076
		kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa;
1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117
		kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
		kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

		if (svm->nested.nested_run_pending)
			kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
	}

	if (gif_set(svm))
		kvm_state.flags |= KVM_STATE_NESTED_GIF_SET;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!is_guest_mode(vcpu))
		goto out;

	/*
	 * Copy over the full size of the VMCB rather than just the size
	 * of the structs.
	 */
	if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE))
		return -EFAULT;
	if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
			 sizeof(user_vmcb->control)))
		return -EFAULT;
	if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
			 sizeof(user_vmcb->save)))
		return -EFAULT;

out:
	return kvm_state.size;
}

static int svm_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *hsave = svm->nested.hsave;
	struct vmcb __user *user_vmcb = (struct vmcb __user *)
		&user_kvm_nested_state->data.svm[0];
1118 1119 1120
	struct vmcb_control_area *ctl;
	struct vmcb_save_area *save;
	int ret;
1121 1122
	u32 cr0;

1123 1124 1125
	BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) >
		     KVM_STATE_NESTED_SVM_VMCB_SIZE);

1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149
	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)
		return -EINVAL;

	if (kvm_state->flags & ~(KVM_STATE_NESTED_GUEST_MODE |
				 KVM_STATE_NESTED_RUN_PENDING |
				 KVM_STATE_NESTED_GIF_SET))
		return -EINVAL;

	/*
	 * If in guest mode, vcpu->arch.efer actually refers to the L2 guest's
	 * EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed.
	 */
	if (!(vcpu->arch.efer & EFER_SVME)) {
		/* GIF=1 and no guest mode are required if SVME=0.  */
		if (kvm_state->flags != KVM_STATE_NESTED_GIF_SET)
			return -EINVAL;
	}

	/* SMM temporarily disables SVM, so we cannot be in guest mode.  */
	if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
		svm_leave_nested(svm);
1150 1151
		svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
		return 0;
1152 1153 1154 1155 1156 1157 1158
	}

	if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa))
		return -EINVAL;
	if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)
		return -EINVAL;

1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173
	ret  = -ENOMEM;
	ctl  = kzalloc(sizeof(*ctl),  GFP_KERNEL);
	save = kzalloc(sizeof(*save), GFP_KERNEL);
	if (!ctl || !save)
		goto out_free;

	ret = -EFAULT;
	if (copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl)))
		goto out_free;
	if (copy_from_user(save, &user_vmcb->save, sizeof(*save)))
		goto out_free;

	ret = -EINVAL;
	if (!nested_vmcb_check_controls(ctl))
		goto out_free;
1174 1175 1176 1177 1178 1179 1180

	/*
	 * Processor state contains L2 state.  Check that it is
	 * valid for guest mode (see nested_vmcb_checks).
	 */
	cr0 = kvm_read_cr0(vcpu);
        if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
1181
		goto out_free;
1182 1183 1184 1185 1186 1187

	/*
	 * Validate host state saved from before VMRUN (see
	 * nested_svm_check_permissions).
	 * TODO: validate reserved bits for all saved state.
	 */
1188 1189
	if (!(save->cr0 & X86_CR0_PG))
		goto out_free;
1190 1191 1192 1193 1194 1195 1196

	/*
	 * All checks done, we can enter guest mode.  L1 control fields
	 * come from the nested save state.  Guest state is already
	 * in the registers, the save area of the nested state instead
	 * contains saved L1 state.
	 */
1197 1198 1199 1200

	svm->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

1201
	copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
1202
	hsave->save = *save;
1203

1204
	svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
1205
	load_nested_vmcb_control(svm, ctl);
1206 1207
	nested_prepare_vmcb_control(svm);

1208
	kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
1209 1210 1211 1212 1213 1214
	ret = 0;
out_free:
	kfree(save);
	kfree(ctl);

	return ret;
1215 1216
}

1217 1218
struct kvm_x86_nested_ops svm_nested_ops = {
	.check_events = svm_check_nested_events,
1219
	.get_nested_state_pages = svm_get_nested_state_pages,
1220 1221
	.get_state = svm_get_nested_state,
	.set_state = svm_set_nested_state,
1222
};