// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */

#include <linux/frame.h>
#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/smt.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>
#include <linux/entry-kvm.h>

#include <asm/apic.h>
#include <asm/asm.h>
#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/internal.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
#include <asm/kexec.h>
#include <asm/perf_event.h>
#include <asm/mce.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
#include <asm/virtext.h>
#include <asm/vmx.h>

#include "capabilities.h"
#include "cpuid.h"
#include "evmcs.h"
#include "irq.h"
#include "kvm_cache_regs.h"
#include "lapic.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "trace.h"
#include "vmcs.h"
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

#ifdef MODULE
static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
#endif

bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly enable_vnmi = 1;
module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);

bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);

bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);

bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, S_IRUGO);

bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);

bool __read_mostly enable_apicv = 1;
module_param(enable_apicv, bool, S_IRUGO);

/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and be a hypervisor for their own guests. If nested=0, guests may not
 * use VMX instructions.
 */
static bool __read_mostly nested = 1;
module_param(nested, bool, S_IRUGO);

bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);

static bool __read_mostly dump_invalid_vmcs = 0;
module_param(dump_invalid_vmcs, bool, 0644);

#define MSR_BITMAP_MODE_X2APIC		1
#define MSR_BITMAP_MODE_X2APIC_APICV	2

#define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL

/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif

#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON				\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | 	\
	 X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)

#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
	RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
	RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
	RTIT_STATUS_BYTECNT))

/*
 * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicates whether PLE is enabled.
 *             Testing indicates this time is usually smaller than 128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles.
 * Time is measured based on a counter that runs at the same rate as the TSC;
 * see SDM volume 3b sections 21.6.13 & 22.1.3.
 */
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);

static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);

/* Default resets per-vcpu window every exit to ple_window. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);

/* Default is SYSTEM mode, 1 for host-guest mode */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
module_param(pt_mode, int, S_IRUGO);

static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);

/* Storage for pre module init parameter parsing */
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;

static const struct {
	const char *option;
	bool for_parse;
} vmentry_l1d_param[] = {
	[VMENTER_L1D_FLUSH_AUTO]	 = {"auto", true},
	[VMENTER_L1D_FLUSH_NEVER]	 = {"never", true},
	[VMENTER_L1D_FLUSH_COND]	 = {"cond", true},
	[VMENTER_L1D_FLUSH_ALWAYS]	 = {"always", true},
	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};

#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;

static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
	struct page *page;
	unsigned int i;

	if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
		return 0;
	}

	if (!enable_ept) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
		return 0;
	}

	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
		u64 msr;

		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
		if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
			l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
			return 0;
		}
	}

	/* If set to auto use the default l1tf mitigation method */
	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
		switch (l1tf_mitigation) {
		case L1TF_MITIGATION_OFF:
			l1tf = VMENTER_L1D_FLUSH_NEVER;
			break;
		case L1TF_MITIGATION_FLUSH_NOWARN:
		case L1TF_MITIGATION_FLUSH:
		case L1TF_MITIGATION_FLUSH_NOSMT:
			l1tf = VMENTER_L1D_FLUSH_COND;
			break;
		case L1TF_MITIGATION_FULL:
		case L1TF_MITIGATION_FULL_FORCE:
			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
			break;
		}
	} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
		l1tf = VMENTER_L1D_FLUSH_ALWAYS;
	}

	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		/*
		 * This allocation for vmx_l1d_flush_pages is not tied to a VM
		 * lifetime and so should not be charged to a memcg.
		 */
		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
		if (!page)
			return -ENOMEM;
		vmx_l1d_flush_pages = page_address(page);

		/*
		 * Initialize each page with a different pattern in
		 * order to protect against KSM in the nested
		 * virtualization case.
		 */
		for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
			memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
			       PAGE_SIZE);
		}
	}

	l1tf_vmx_mitigation = l1tf;

	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
		static_branch_enable(&vmx_l1d_should_flush);
	else
		static_branch_disable(&vmx_l1d_should_flush);

	if (l1tf == VMENTER_L1D_FLUSH_COND)
		static_branch_enable(&vmx_l1d_flush_cond);
	else
		static_branch_disable(&vmx_l1d_flush_cond);
	return 0;
}

static int vmentry_l1d_flush_parse(const char *s)
{
	unsigned int i;

	if (s) {
		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
			if (vmentry_l1d_param[i].for_parse &&
			    sysfs_streq(s, vmentry_l1d_param[i].option))
				return i;
		}
	}
	return -EINVAL;
}

static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
	int l1tf, ret;

	l1tf = vmentry_l1d_flush_parse(s);
	if (l1tf < 0)
		return l1tf;

	if (!boot_cpu_has(X86_BUG_L1TF))
		return 0;

	/*
	 * Has vmx_init() run already? If not then this is the pre init
	 * parameter parsing. In that case just store the value and let
	 * vmx_init() do the proper setup after enable_ept has been
	 * established.
	 */
	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
		vmentry_l1d_flush_param = l1tf;
		return 0;
	}

	mutex_lock(&vmx_l1d_flush_mutex);
	ret = vmx_setup_l1d_flush(l1tf);
	mutex_unlock(&vmx_l1d_flush_mutex);
	return ret;
}

static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
	if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
		return sprintf(s, "???\n");

	return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}

static const struct kernel_param_ops vmentry_l1d_flush_ops = {
	.set = vmentry_l1d_flush_set,
	.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);

static u32 vmx_segment_access_rights(struct kvm_segment *var);
static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
							  u32 msr, int type);

void vmx_vmexit(void);

#define vmx_insn_failed(fmt...)		\
do {					\
	WARN_ONCE(1, fmt);		\
	pr_warn_ratelimited(fmt);	\
} while (0)

asmlinkage void vmread_error(unsigned long field, bool fault)
{
	if (fault)
		kvm_spurious_fault();
	else
		vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
}

noinline void vmwrite_error(unsigned long field, unsigned long value)
{
	vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
			field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}

noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
}

noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
}

noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
	vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
			ext, vpid, gva);
}

noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
{
	vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
			ext, eptp, gpa);
}

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);

static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

struct vmcs_config vmcs_config;
struct vmx_capability vmx_capability;

#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {                                   \
		.selector = GUEST_##seg##_SELECTOR,		\
		.base = GUEST_##seg##_BASE,		   	\
		.limit = GUEST_##seg##_LIMIT,		   	\
		.ar_bytes = GUEST_##seg##_AR_BYTES,	   	\
	}

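/*
 * For illustration: VMX_SEGMENT_FIELD(CS) expands to the initializer for
 * kvm_vmx_segment_fields[VCPU_SREG_CS] below, wiring that entry to the
 * GUEST_CS_SELECTOR/BASE/LIMIT/AR_BYTES VMCS fields.
 */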
static const struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};

static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
{
	vmx->segment_cache.bitmask = 0;
}

static unsigned long host_idt_base;

/*
 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
 * will emulate SYSCALL in legacy mode if the vendor string in guest
 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
 * support this emulation, IA32_STAR must always be included in
 * vmx_msr_index[], even in i386 builds.
 */
const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
	MSR_EFER, MSR_TSC_AUX, MSR_STAR,
	MSR_IA32_TSX_CTRL,
};
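/*
 * Note: these MSRs back the vmx->guest_msrs[] array; their values are
 * propagated to hardware lazily through the user-return MSR mechanism
 * (kvm_set_user_return_msr()) in vmx_prepare_switch_to_guest() rather than
 * on every VM entry.
 */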

#if IS_ENABLED(CONFIG_HYPERV)
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);

/* check_ept_pointer() should be under protection of ept_pointer_lock. */
static void check_ept_pointer_match(struct kvm *kvm)
{
	struct kvm_vcpu *vcpu;
	u64 tmp_eptp = INVALID_PAGE;
	int i;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (!VALID_PAGE(tmp_eptp)) {
			tmp_eptp = to_vmx(vcpu)->ept_pointer;
		} else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
			to_kvm_vmx(kvm)->ept_pointers_match
				= EPT_POINTERS_MISMATCH;
			return;
		}
	}

	to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
}

static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
		void *data)
{
	struct kvm_tlb_range *range = data;

	return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
			range->pages);
}

static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
		struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
{
	u64 ept_pointer = to_vmx(vcpu)->ept_pointer;

	/*
	 * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
	 * of the base of EPT PML4 table, strip off EPT configuration
	 * information.
	 */
	if (range)
		return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
				kvm_fill_hv_flush_list_func, (void *)range);
	else
		return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
}

static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
		struct kvm_tlb_range *range)
{
	struct kvm_vcpu *vcpu;
	int ret = 0, i;

	spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);

	if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
		check_ept_pointer_match(kvm);

	if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
		kvm_for_each_vcpu(i, vcpu, kvm) {
			/* If ept_pointer is invalid pointer, bypass flush request. */
			if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
				ret |= __hv_remote_flush_tlb_with_range(
					kvm, vcpu, range);
		}
	} else {
		ret = __hv_remote_flush_tlb_with_range(kvm,
				kvm_get_vcpu(kvm, 0), range);
	}

	spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
	return ret;
}
static int hv_remote_flush_tlb(struct kvm *kvm)
{
	return hv_remote_flush_tlb_with_range(kvm, NULL);
}

static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
{
	struct hv_enlightened_vmcs *evmcs;
	struct hv_partition_assist_pg **p_hv_pa_pg =
			&vcpu->kvm->arch.hyperv.hv_pa_pg;
	/*
	 * Synthetic VM-Exit is not enabled in current code and so all
	 * eVMCSs in a single VM share the same assist page.
	 */
	if (!*p_hv_pa_pg)
		*p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);

	if (!*p_hv_pa_pg)
		return -ENOMEM;

	evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;

	evmcs->partition_assist_page =
		__pa(*p_hv_pa_pg);
	evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
	evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;

	return 0;
}

#endif /* IS_ENABLED(CONFIG_HYPERV) */

/*
 * Comment's format: document - errata name - stepping - processor name.
 * Refer from
 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
 */
static u32 vmx_preemption_cpu_tfms[] = {
/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
0x000206E6,
/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020652,
/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020655,
/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
/*
 * 320767.pdf - AAP86  - B1 -
 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
 */
0x000106E5,
/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
0x000106A0,
/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
0x000106A1,
/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
0x000106A4,
 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
0x000106A5,
 /* Xeon E3-1220 V2 */
0x000306A8,
};

static inline bool cpu_has_broken_vmx_preemption_timer(void)
{
	u32 eax = cpuid_eax(0x00000001), i;

	/* Clear the reserved bits */
	eax &= ~(0x3U << 14 | 0xfU << 28);
	for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
		if (eax == vmx_preemption_cpu_tfms[i])
			return true;

	return false;
}

static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
	return flexpriority_enabled && lapic_in_kernel(vcpu);
}

static inline bool report_flexpriority(void)
{
	return flexpriority_enabled;
}

static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	for (i = 0; i < vmx->nmsrs; ++i)
		if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
			return i;
	return -1;
}

struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	i = __find_msr_index(vmx, msr);
	if (i >= 0)
		return &vmx->guest_msrs[i];
	return NULL;
}

static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data)
{
	int ret = 0;

	u64 old_msr_data = msr->data;
	msr->data = data;
	if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
		preempt_disable();
		ret = kvm_set_user_return_msr(msr->index, msr->data, msr->mask);
		preempt_enable();
		if (ret)
			msr->data = old_msr_data;
	}
	return ret;
}

#ifdef CONFIG_KEXEC_CORE
static void crash_vmclear_local_loaded_vmcss(void)
{
	int cpu = raw_smp_processor_id();
	struct loaded_vmcs *v;

	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
			    loaded_vmcss_on_cpu_link)
		vmcs_clear(v->vmcs);
}
#endif /* CONFIG_KEXEC_CORE */

static void __loaded_vmcs_clear(void *arg)
{
	struct loaded_vmcs *loaded_vmcs = arg;
	int cpu = raw_smp_processor_id();

	if (loaded_vmcs->cpu != cpu)
		return; /* vcpu migration can race with cpu offline */
	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
		per_cpu(current_vmcs, cpu) = NULL;

	vmcs_clear(loaded_vmcs->vmcs);
	if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
		vmcs_clear(loaded_vmcs->shadow_vmcs);

	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);

	/*
	 * Ensure all writes to loaded_vmcs, including deleting it from its
	 * current percpu list, complete before setting loaded_vmcs->vcpu to
	 * -1, otherwise a different cpu can see vcpu == -1 first and add
	 * loaded_vmcs to its percpu list before it's deleted from this cpu's
	 * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
	 */
	smp_wmb();

	loaded_vmcs->cpu = -1;
	loaded_vmcs->launched = 0;
}

void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
	int cpu = loaded_vmcs->cpu;

	if (cpu != -1)
		smp_call_function_single(cpu,
			 __loaded_vmcs_clear, loaded_vmcs, 1);
}

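/*
 * Guest segment fields read from the VMCS are cached per vCPU.  Each bit in
 * segment_cache.bitmask records that a given (segment, field) pair is already
 * valid, letting the accessors below skip redundant VMREADs.
 */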
static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
				       unsigned field)
{
	bool ret;
	u32 mask = 1 << (seg * SEG_FIELD_NR + field);

	if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
		kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
		vmx->segment_cache.bitmask = 0;
	}
	ret = vmx->segment_cache.bitmask & mask;
	vmx->segment_cache.bitmask |= mask;
	return ret;
}

static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
{
	u16 *p = &vmx->segment_cache.seg[seg].selector;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
		*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
	return *p;
}

static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
{
	ulong *p = &vmx->segment_cache.seg[seg].base;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
		*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
	return *p;
}

static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].limit;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
	return *p;
}

static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].ar;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
	return *p;
}

void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	u32 eb;

	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
	/*
	 * Guest access to VMware backdoor ports could legitimately
	 * trigger #GP because of TSS I/O permission bitmap.
	 * We intercept those #GP and allow access to them anyway
	 * as VMware does.
	 */
	if (enable_vmware_backdoor)
		eb |= (1u << GP_VECTOR);
	if ((vcpu->guest_debug &
	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
		eb |= 1u << BP_VECTOR;
	if (to_vmx(vcpu)->rmode.vm86_active)
		eb = ~0;
	if (!vmx_need_pf_intercept(vcpu))
		eb &= ~(1u << PF_VECTOR);

	/* When we are running a nested L2 guest and L1 specified for it a
	 * certain exception bitmap, we must trap the same exceptions and pass
	 * them to L1. When running L2, we will only handle the exceptions
	 * specified above if L1 did not want them.
	 */
	if (is_guest_mode(vcpu))
		eb |= get_vmcs12(vcpu)->exception_bitmap;

	vmcs_write32(EXCEPTION_BITMAP, eb);
}

/*
 * Check if MSR is intercepted for currently loaded MSR bitmap.
 */
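/*
 * The offsets below follow the VMX MSR bitmap layout: within the 4K bitmap
 * page, the write bitmaps for MSRs 0x00000000-0x00001fff and
 * 0xc0000000-0xc0001fff start at 0x800 and 0xc00 respectively (the read
 * bitmaps occupy the first half of the page).
 */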
static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
	unsigned long *msr_bitmap;
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return true;

	msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;

	if (msr <= 0x1fff) {
		return !!test_bit(msr, msr_bitmap + 0x800 / f);
	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
	}

	return true;
}

static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit)
{
	vm_entry_controls_clearbit(vmx, entry);
	vm_exit_controls_clearbit(vmx, exit);
}

int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
{
	unsigned int i;

	for (i = 0; i < m->nr; ++i) {
		if (m->val[i].index == msr)
			return i;
	}
	return -ENOENT;
}

static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
	int i;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer()) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl()) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
			return;
		}
		break;
	}
	i = vmx_find_loadstore_msr_slot(&m->guest, msr);
	if (i < 0)
		goto skip_guest;
	--m->guest.nr;
	m->guest.val[i] = m->guest.val[m->guest.nr];
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);

skip_guest:
	i = vmx_find_loadstore_msr_slot(&m->host, msr);
	if (i < 0)
		return;

	--m->host.nr;
	m->host.val[i] = m->host.val[m->host.nr];
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}

static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit,
		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
		u64 guest_val, u64 host_val)
{
	vmcs_write64(guest_val_vmcs, guest_val);
	if (host_val_vmcs != HOST_IA32_EFER)
		vmcs_write64(host_val_vmcs, host_val);
	vm_entry_controls_setbit(vmx, entry);
	vm_exit_controls_setbit(vmx, exit);
}

static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
				  u64 guest_val, u64 host_val, bool entry_only)
{
	int i, j = 0;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER,
					GUEST_IA32_EFER,
					HOST_IA32_EFER,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
					GUEST_IA32_PERF_GLOBAL_CTRL,
					HOST_IA32_PERF_GLOBAL_CTRL,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_IA32_PEBS_ENABLE:
		/* PEBS needs a quiescent period after being disabled (to write
		 * a record).  Disabling PEBS through VMX MSR swapping doesn't
		 * provide that period, so a CPU could write host's record into
		 * guest's memory.
		 */
		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
	}

	i = vmx_find_loadstore_msr_slot(&m->guest, msr);
	if (!entry_only)
		j = vmx_find_loadstore_msr_slot(&m->host, msr);

	if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
	    (j < 0 &&  m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
		printk_once(KERN_WARNING "Not enough msr switch entries. "
				"Can't add msr %x\n", msr);
		return;
	}
	if (i < 0) {
		i = m->guest.nr++;
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
	}
	m->guest.val[i].index = msr;
	m->guest.val[i].value = guest_val;

	if (entry_only)
		return;

	if (j < 0) {
		j = m->host.nr++;
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
	}
	m->host.val[j].index = msr;
	m->host.val[j].value = host_val;
}

static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
{
	u64 guest_efer = vmx->vcpu.arch.efer;
	u64 ignore_bits = 0;

	/* Shadow paging assumes NX to be available.  */
	if (!enable_ept)
		guest_efer |= EFER_NX;

	/*
	 * LMA and LME handled by hardware; SCE meaningless outside long mode.
	 */
	ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
	ignore_bits |= EFER_LMA | EFER_LME;
	/* SCE is meaningful only in long mode on Intel */
	if (guest_efer & EFER_LMA)
		ignore_bits &= ~(u64)EFER_SCE;
#endif

	/*
	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
	 * On CPUs that support "load IA32_EFER", always switch EFER
	 * atomically, since it's faster than switching it manually.
	 */
	if (cpu_has_load_ia32_efer() ||
	    (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
		if (!(guest_efer & EFER_LMA))
			guest_efer &= ~EFER_LME;
		if (guest_efer != host_efer)
			add_atomic_switch_msr(vmx, MSR_EFER,
					      guest_efer, host_efer, false);
		else
			clear_atomic_switch_msr(vmx, MSR_EFER);
		return false;
	} else {
		clear_atomic_switch_msr(vmx, MSR_EFER);

		guest_efer &= ~ignore_bits;
		guest_efer |= host_efer & ignore_bits;

		vmx->guest_msrs[efer_offset].data = guest_efer;
		vmx->guest_msrs[efer_offset].mask = ~ignore_bits;

		return true;
	}
}

#ifdef CONFIG_X86_32
/*
 * On 32-bit kernels, VM exits still load the FS and GS bases from the
 * VMCS rather than the segment table.  KVM uses this helper to figure
 * out the current bases to poke them into the VMCS before entry.
 */
static unsigned long segment_base(u16 selector)
{
	struct desc_struct *table;
	unsigned long v;

	if (!(selector & ~SEGMENT_RPL_MASK))
		return 0;

	table = get_current_gdt_ro();

	if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
		u16 ldt_selector = kvm_read_ldt();

		if (!(ldt_selector & ~SEGMENT_RPL_MASK))
			return 0;

		table = (struct desc_struct *)segment_base(ldt_selector);
	}
	v = get_desc_base(&table[selector >> 3]);
	return v;
}
#endif

static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
{
	return vmx_pt_mode_is_host_guest() &&
	       !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
}

static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
{
	/* The base must be 128-byte aligned and a legal physical address. */
	return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f);
}

static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 i;

	wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
	for (i = 0; i < addr_range; i++) {
		wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
		wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
	}
}

static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 i;

	rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
	rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
	for (i = 0; i < addr_range; i++) {
		rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
		rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
	}
}

static void pt_guest_enter(struct vcpu_vmx *vmx)
{
	if (vmx_pt_mode_is_system())
		return;

	/*
	 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
	 * Save host state before VM entry.
	 */
	rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
		wrmsrl(MSR_IA32_RTIT_CTL, 0);
		pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
		pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
	}
}

static void pt_guest_exit(struct vcpu_vmx *vmx)
{
	if (vmx_pt_mode_is_system())
		return;

	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
		pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
		pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
	}

	/* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
	wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}

void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
			unsigned long fs_base, unsigned long gs_base)
{
	if (unlikely(fs_sel != host->fs_sel)) {
		if (!(fs_sel & 7))
			vmcs_write16(HOST_FS_SELECTOR, fs_sel);
		else
			vmcs_write16(HOST_FS_SELECTOR, 0);
		host->fs_sel = fs_sel;
	}
	if (unlikely(gs_sel != host->gs_sel)) {
		if (!(gs_sel & 7))
			vmcs_write16(HOST_GS_SELECTOR, gs_sel);
		else
			vmcs_write16(HOST_GS_SELECTOR, 0);
		host->gs_sel = gs_sel;
	}
	if (unlikely(fs_base != host->fs_base)) {
		vmcs_writel(HOST_FS_BASE, fs_base);
		host->fs_base = fs_base;
	}
	if (unlikely(gs_base != host->gs_base)) {
		vmcs_writel(HOST_GS_BASE, gs_base);
		host->gs_base = gs_base;
	}
}

void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
	int cpu = raw_smp_processor_id();
#endif
	unsigned long fs_base, gs_base;
	u16 fs_sel, gs_sel;
	int i;

	vmx->req_immediate_exit = false;

	/*
	 * Note that guest MSRs to be saved/restored can also be changed
	 * when guest state is loaded. This happens when guest transitions
	 * to/from long-mode by setting MSR_EFER.LMA.
	 */
	if (!vmx->guest_msrs_ready) {
		vmx->guest_msrs_ready = true;
		for (i = 0; i < vmx->save_nmsrs; ++i)
			kvm_set_user_return_msr(vmx->guest_msrs[i].index,
						vmx->guest_msrs[i].data,
						vmx->guest_msrs[i].mask);

	}

	if (vmx->nested.need_vmcs12_to_shadow_sync)
		nested_sync_vmcs12_to_shadow(vcpu);

	if (vmx->guest_state_loaded)
		return;

	host_state = &vmx->loaded_vmcs->host_state;

	/*
	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
	 * allow segment selectors with cpl > 0 or ti == 1.
	 */
	host_state->ldt_sel = kvm_read_ldt();

#ifdef CONFIG_X86_64
	savesegment(ds, host_state->ds_sel);
	savesegment(es, host_state->es_sel);

	gs_base = cpu_kernelmode_gs_base(cpu);
	if (likely(is_64bit_mm(current->mm))) {
		current_save_fsgs();
		fs_sel = current->thread.fsindex;
		gs_sel = current->thread.gsindex;
		fs_base = current->thread.fsbase;
		vmx->msr_host_kernel_gs_base = current->thread.gsbase;
	} else {
		savesegment(fs, fs_sel);
		savesegment(gs, gs_sel);
		fs_base = read_msr(MSR_FS_BASE);
		vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
	}

	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
	savesegment(fs, fs_sel);
	savesegment(gs, gs_sel);
	fs_base = segment_base(fs_sel);
	gs_base = segment_base(gs_sel);
#endif

	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
	vmx->guest_state_loaded = true;
}

static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
	struct vmcs_host_state *host_state;

	if (!vmx->guest_state_loaded)
		return;

	host_state = &vmx->loaded_vmcs->host_state;

	++vmx->vcpu.stat.host_state_reload;

#ifdef CONFIG_X86_64
	rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
	if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
		kvm_load_ldt(host_state->ldt_sel);
#ifdef CONFIG_X86_64
		load_gs_index(host_state->gs_sel);
#else
		loadsegment(gs, host_state->gs_sel);
#endif
	}
	if (host_state->fs_sel & 7)
		loadsegment(fs, host_state->fs_sel);
#ifdef CONFIG_X86_64
	if (unlikely(host_state->ds_sel | host_state->es_sel)) {
		loadsegment(ds, host_state->ds_sel);
		loadsegment(es, host_state->es_sel);
	}
#endif
	invalidate_tss_limit();
#ifdef CONFIG_X86_64
	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
	load_fixmap_gdt(raw_smp_processor_id());
	vmx->guest_state_loaded = false;
	vmx->guest_msrs_ready = false;
}

#ifdef CONFIG_X86_64
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
	preempt_disable();
	if (vmx->guest_state_loaded)
		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
	preempt_enable();
	return vmx->msr_guest_kernel_gs_base;
}

static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
	preempt_disable();
	if (vmx->guest_state_loaded)
		wrmsrl(MSR_KERNEL_GS_BASE, data);
	preempt_enable();
	vmx->msr_guest_kernel_gs_base = data;
}
#endif

void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
			struct loaded_vmcs *buddy)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
	struct vmcs *prev;

	if (!already_loaded) {
		loaded_vmcs_clear(vmx->loaded_vmcs);
		local_irq_disable();

		/*
		 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
		 * this cpu's percpu list, otherwise it may not yet be deleted
		 * from its previous cpu's percpu list.  Pairs with the
		 * smp_wmb() in __loaded_vmcs_clear().
		 */
		smp_rmb();

		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
			 &per_cpu(loaded_vmcss_on_cpu, cpu));
		local_irq_enable();
	}

	prev = per_cpu(current_vmcs, cpu);
	if (prev != vmx->loaded_vmcs->vmcs) {
		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
		vmcs_load(vmx->loaded_vmcs->vmcs);

		/*
		 * No indirect branch prediction barrier needed when switching
		 * the active VMCS within a guest, e.g. on nested VM-Enter.
		 * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
		 */
		if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
			indirect_branch_prediction_barrier();
	}

	if (!already_loaded) {
		void *gdt = get_current_gdt_ro();
		unsigned long sysenter_esp;

		/*
		 * Flush all EPTP/VPID contexts, the new pCPU may have stale
		 * TLB entries from its previous association with the vCPU.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);

		/*
		 * Linux uses per-cpu TSS and GDT, so set these when switching
		 * processors.  See 22.2.4.
		 */
		vmcs_writel(HOST_TR_BASE,
			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */

		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */

		vmx->loaded_vmcs->cpu = cpu;
	}

	/* Setup TSC multiplier */
	if (kvm_has_tsc_control &&
	    vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
		decache_tsc_multiplier(vmx);
}

/*
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
 */
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmx_vcpu_load_vmcs(vcpu, cpu, NULL);

	vmx_vcpu_pi_load(vcpu, cpu);

	vmx->host_debugctlmsr = get_debugctlmsr();
}

static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
	vmx_vcpu_pi_put(vcpu);

	vmx_prepare_switch_to_host(to_vmx(vcpu));
}

static bool emulation_required(struct kvm_vcpu *vcpu)
{
	return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
}

unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long rflags, save_rflags;

	if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
		rflags = vmcs_readl(GUEST_RFLAGS);
		if (vmx->rmode.vm86_active) {
			rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
			save_rflags = vmx->rmode.save_rflags;
			rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
		}
		vmx->rflags = rflags;
	}
	return vmx->rflags;
}

void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long old_rflags;

	if (is_unrestricted_guest(vcpu)) {
		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
		vmx->rflags = rflags;
		vmcs_writel(GUEST_RFLAGS, rflags);
		return;
	}

	old_rflags = vmx_get_rflags(vcpu);
	vmx->rflags = rflags;
	if (vmx->rmode.vm86_active) {
		vmx->rmode.save_rflags = rflags;
		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
	}
	vmcs_writel(GUEST_RFLAGS, rflags);

	if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
		vmx->emulation_required = emulation_required(vcpu);
}

u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	int ret = 0;

	if (interruptibility & GUEST_INTR_STATE_STI)
		ret |= KVM_X86_SHADOW_INT_STI;
	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
		ret |= KVM_X86_SHADOW_INT_MOV_SS;

	return ret;
}

void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	u32 interruptibility = interruptibility_old;

	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);

	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
		interruptibility |= GUEST_INTR_STATE_MOV_SS;
	else if (mask & KVM_X86_SHADOW_INT_STI)
		interruptibility |= GUEST_INTR_STATE_STI;

	if ((interruptibility != interruptibility_old))
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
}

static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long value;

	/*
	 * Any MSR write that attempts to change bits marked reserved will
	 * cause a #GP fault.
	 */
	if (data & vmx->pt_desc.ctl_bitmask)
		return 1;

	/*
	 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
	 * result in a #GP unless the same write also clears TraceEn.
	 */
	if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
		((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
		return 1;

	/*
	 * A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears ToPA and
	 * FabricEn will cause a #GP if
	 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0.
	 */
	if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
		!(data & RTIT_CTL_FABRIC_EN) &&
		!intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_single_range_output))
		return 1;

	/*
	 * MTCFreq, CycThresh and PSBFreq encoding checks: any MSR write that
	 * utilizes encodings marked reserved will cause a #GP fault.
	 */
	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
			!test_bit((data & RTIT_CTL_MTC_RANGE) >>
			RTIT_CTL_MTC_RANGE_OFFSET, &value))
		return 1;
	value = intel_pt_validate_cap(vmx->pt_desc.caps,
						PT_CAP_cycle_thresholds);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
			!test_bit((data & RTIT_CTL_CYC_THRESH) >>
			RTIT_CTL_CYC_THRESH_OFFSET, &value))
		return 1;
	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
			!test_bit((data & RTIT_CTL_PSB_FREQ) >>
			RTIT_CTL_PSB_FREQ_OFFSET, &value))
		return 1;

	/*
	 * If an ADDRx_CFG field is reserved or its encoding is greater than 2,
	 * the write causes a #GP fault.
	 */
	value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
	if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
	if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
	if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
	if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
		return 1;

	return 0;
}

static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
{
	return true;
}

static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	unsigned long rip, orig_rip;

	/*
	 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
	 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
	 * set when EPT misconfig occurs.  In practice, real hardware updates
	 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
	 * (namely Hyper-V) don't set it due to it being undefined behavior,
	 * i.e. we end up advancing IP with some random value.
	 */
	if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
	    to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
		orig_rip = kvm_rip_read(vcpu);
		rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
#ifdef CONFIG_X86_64
		/*
		 * We need to mask out the high 32 bits of RIP if not in 64-bit
		 * mode, but just finding out that we are in 64-bit mode is
		 * quite expensive.  Only do it if there was a carry.
		 */
		if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
			rip = (u32)rip;
#endif
		kvm_rip_write(vcpu, rip);
	} else {
		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
			return 0;
	}

	/* skipping an emulated instruction also counts */
	vmx_set_interrupt_shadow(vcpu, 0);

	return 1;
}

/*
 * Recognizes a pending MTF VM-exit and records the nested state for later
 * delivery.
 */
static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!is_guest_mode(vcpu))
		return;

	/*
	 * Per the SDM, MTF takes priority over debug-trap exceptions besides
	 * T-bit traps. As instruction emulation is completed (i.e. at the
	 * instruction boundary), any #DB exception pending delivery must be a
	 * debug-trap. Record the pending MTF state to be delivered in
	 * vmx_check_nested_events().
	 */
	if (nested_cpu_has_mtf(vmcs12) &&
	    (!vcpu->arch.exception.pending ||
	     vcpu->arch.exception.nr == DB_VECTOR))
		vmx->nested.mtf_pending = true;
	else
		vmx->nested.mtf_pending = false;
}

static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	vmx_update_emulated_instruction(vcpu);
	return skip_emulated_instruction(vcpu);
}

static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
{
	/*
	 * Ensure that we clear the HLT state in the VMCS.  We don't need to
	 * explicitly skip the instruction because if the HLT state is set,
	 * then the instruction is already executing and RIP has already been
	 * advanced.
	 */
	if (kvm_hlt_in_guest(vcpu->kvm) &&
			vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}

static void vmx_queue_exception(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned nr = vcpu->arch.exception.nr;
	bool has_error_code = vcpu->arch.exception.has_error_code;
	u32 error_code = vcpu->arch.exception.error_code;
	u32 intr_info = nr | INTR_INFO_VALID_MASK;

	kvm_deliver_exception_payload(vcpu);

	if (has_error_code) {
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
	}

	if (vmx->rmode.vm86_active) {
		int inc_eip = 0;
		if (kvm_exception_is_soft(nr))
			inc_eip = vcpu->arch.event_exit_inst_len;
		kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
		return;
	}

	WARN_ON_ONCE(vmx->emulation_required);

	if (kvm_exception_is_soft(nr)) {
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmx->vcpu.arch.event_exit_inst_len);
		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
	} else
		intr_info |= INTR_TYPE_HARD_EXCEPTION;

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);

	vmx_clear_hlt(vcpu);
}

/*
 * Swap MSR entry in host/guest MSR entry array.
 */
static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
{
	struct shared_msr_entry tmp;

	tmp = vmx->guest_msrs[to];
	vmx->guest_msrs[to] = vmx->guest_msrs[from];
	vmx->guest_msrs[from] = tmp;
}

/*
 * Set up the vmcs to automatically save and restore system
 * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
 * mode, as fiddling with msrs is very expensive.
 */
static void setup_msrs(struct vcpu_vmx *vmx)
{
	int save_nmsrs, index;

	save_nmsrs = 0;
#ifdef CONFIG_X86_64
	/*
	 * The SYSCALL MSRs are only needed on long mode guests, and only
	 * when EFER.SCE is set.
	 */
	if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
		index = __find_msr_index(vmx, MSR_STAR);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
		index = __find_msr_index(vmx, MSR_LSTAR);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
		index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
	}
#endif
	index = __find_msr_index(vmx, MSR_EFER);
	if (index >= 0 && update_transition_efer(vmx, index))
		move_msr_up(vmx, index, save_nmsrs++);
	index = __find_msr_index(vmx, MSR_TSC_AUX);
	if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
		move_msr_up(vmx, index, save_nmsrs++);
	index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL);
	if (index >= 0)
		move_msr_up(vmx, index, save_nmsrs++);

	vmx->save_nmsrs = save_nmsrs;
	vmx->guest_msrs_ready = false;

	if (cpu_has_vmx_msr_bitmap())
		vmx_update_msr_bitmap(&vmx->vcpu);
}

static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u64 g_tsc_offset = 0;

	/*
	 * We're here if L1 chose not to trap WRMSR to TSC. According
	 * to the spec, this should set L1's TSC; The offset that L1
	 * set for L2 remains unchanged, and still needs to be added
	 * to the newly set TSC to get L2's TSC.
	 */
	if (is_guest_mode(vcpu) &&
	    (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
		g_tsc_offset = vmcs12->tsc_offset;

	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
				   vcpu->arch.tsc_offset - g_tsc_offset,
				   offset);
	vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
	return offset + g_tsc_offset;
}

/*
 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
 * all guests if the "nested" module option is off, and can also be disabled
 * for a single guest by disabling its VMX cpuid bit.
 */
bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
{
	return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
}

static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
						 uint64_t val)
{
	uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;

	return !(val & ~valid_bits);
}

static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
{
	switch (msr->index) {
	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
		if (!nested)
			return 1;
		return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
	case MSR_IA32_PERF_CAPABILITIES:
		msr->data = vmx_get_perf_capabilities();
		return 0;
	default:
		return KVM_MSR_RET_INVALID;
	}
}

/*
 * Reads an msr value (of 'msr_index') into 'pdata'.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct shared_msr_entry *msr;
	u32 index;

	switch (msr_info->index) {
#ifdef CONFIG_X86_64
	case MSR_FS_BASE:
		msr_info->data = vmcs_readl(GUEST_FS_BASE);
		break;
	case MSR_GS_BASE:
		msr_info->data = vmcs_readl(GUEST_GS_BASE);
		break;
	case MSR_KERNEL_GS_BASE:
		msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
		break;
#endif
	case MSR_EFER:
		return kvm_get_msr_common(vcpu, msr_info);
	case MSR_IA32_TSX_CTRL:
		if (!msr_info->host_initiated &&
		    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
			return 1;
		goto find_shared_msr;
	case MSR_IA32_UMWAIT_CONTROL:
		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
			return 1;

		msr_info->data = vmx->msr_ia32_umwait_control;
		break;
	case MSR_IA32_SPEC_CTRL:
		if (!msr_info->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
			return 1;

		msr_info->data = to_vmx(vcpu)->spec_ctrl;
		break;
	case MSR_IA32_SYSENTER_CS:
		msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
		break;
	case MSR_IA32_SYSENTER_EIP:
		msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
		break;
	case MSR_IA32_SYSENTER_ESP:
		msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
		break;
	case MSR_IA32_BNDCFGS:
		if (!kvm_mpx_supported() ||
		    (!msr_info->host_initiated &&
		     !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
			return 1;
		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
		break;
	case MSR_IA32_MCG_EXT_CTL:
		if (!msr_info->host_initiated &&
		    !(vmx->msr_ia32_feature_control &
		      FEAT_CTL_LMCE_ENABLED))
			return 1;
		msr_info->data = vcpu->arch.mcg_ext_ctl;
		break;
	case MSR_IA32_FEAT_CTL:
		msr_info->data = vmx->msr_ia32_feature_control;
		break;
	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
		if (!nested_vmx_allowed(vcpu))
			return 1;
		if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
				    &msr_info->data))
			return 1;
		/*
		 * Enlightened VMCS v1 doesn't have certain fields, but buggy
		 * Hyper-V versions are still trying to use corresponding
		 * features when they are exposed. Filter out the essential
		 * minimum.
		 */
		if (!msr_info->host_initiated &&
		    vmx->nested.enlightened_vmcs_enabled)
			nested_evmcs_filter_control_msr(msr_info->index,
							&msr_info->data);
		break;
	case MSR_IA32_RTIT_CTL:
		if (!vmx_pt_mode_is_host_guest())
			return 1;
		msr_info->data = vmx->pt_desc.guest.ctl;
		break;
	case MSR_IA32_RTIT_STATUS:
		if (!vmx_pt_mode_is_host_guest())
			return 1;
		msr_info->data = vmx->pt_desc.guest.status;
		break;
	case MSR_IA32_RTIT_CR3_MATCH:
		if (!vmx_pt_mode_is_host_guest() ||
			!intel_pt_validate_cap(vmx->pt_desc.caps,
						PT_CAP_cr3_filtering))
			return 1;
		msr_info->data = vmx->pt_desc.guest.cr3_match;
		break;
	case MSR_IA32_RTIT_OUTPUT_BASE:
		if (!vmx_pt_mode_is_host_guest() ||
			(!intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_topa_output) &&
			 !intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_single_range_output)))
			return 1;
		msr_info->data = vmx->pt_desc.guest.output_base;
		break;
	case MSR_IA32_RTIT_OUTPUT_MASK:
		if (!vmx_pt_mode_is_host_guest() ||
			(!intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_topa_output) &&
			 !intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_single_range_output)))
			return 1;
		msr_info->data = vmx->pt_desc.guest.output_mask;
		break;
	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
		index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
		if (!vmx_pt_mode_is_host_guest() ||
			(index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_num_address_ranges)))
			return 1;
		if (index % 2)
			msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
		else
			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
		break;
	case MSR_TSC_AUX:
		if (!msr_info->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
			return 1;
		goto find_shared_msr;
	default:
	find_shared_msr:
		msr = find_msr_entry(vmx, msr_info->index);
		if (msr) {
			msr_info->data = msr->data;
			break;
		}
		return kvm_get_msr_common(vcpu, msr_info);
	}

	return 0;
}

static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
						    u64 data)
{
#ifdef CONFIG_X86_64
	if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
		return (u32)data;
#endif
	return (unsigned long)data;
}

/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct shared_msr_entry *msr;
	int ret = 0;
	u32 msr_index = msr_info->index;
	u64 data = msr_info->data;
	u32 index;

	switch (msr_index) {
	case MSR_EFER:
		ret = kvm_set_msr_common(vcpu, msr_info);
		break;
#ifdef CONFIG_X86_64
	case MSR_FS_BASE:
		vmx_segment_cache_clear(vmx);
		vmcs_writel(GUEST_FS_BASE, data);
		break;
	case MSR_GS_BASE:
		vmx_segment_cache_clear(vmx);
		vmcs_writel(GUEST_GS_BASE, data);
		break;
	case MSR_KERNEL_GS_BASE:
		vmx_write_guest_kernel_gs_base(vmx, data);
		break;
#endif
	case MSR_IA32_SYSENTER_CS:
		if (is_guest_mode(vcpu))
			get_vmcs12(vcpu)->guest_sysenter_cs = data;
		vmcs_write32(GUEST_SYSENTER_CS, data);
		break;
	case MSR_IA32_SYSENTER_EIP:
		if (is_guest_mode(vcpu)) {
			data = nested_vmx_truncate_sysenter_addr(vcpu, data);
			get_vmcs12(vcpu)->guest_sysenter_eip = data;
		}
		vmcs_writel(GUEST_SYSENTER_EIP, data);
		break;
	case MSR_IA32_SYSENTER_ESP:
		if (is_guest_mode(vcpu)) {
			data = nested_vmx_truncate_sysenter_addr(vcpu, data);
			get_vmcs12(vcpu)->guest_sysenter_esp = data;
		}
		vmcs_writel(GUEST_SYSENTER_ESP, data);
		break;
	case MSR_IA32_DEBUGCTLMSR:
		if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
						VM_EXIT_SAVE_DEBUG_CONTROLS)
			get_vmcs12(vcpu)->guest_ia32_debugctl = data;

		ret = kvm_set_msr_common(vcpu, msr_info);
		break;

	case MSR_IA32_BNDCFGS:
		if (!kvm_mpx_supported() ||
		    (!msr_info->host_initiated &&
		     !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
			return 1;
		if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
		    (data & MSR_IA32_BNDCFGS_RSVD))
			return 1;
		vmcs_write64(GUEST_BNDCFGS, data);
		break;
	case MSR_IA32_UMWAIT_CONTROL:
		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
			return 1;

		/* The reserved bit 1 and non-32 bit [63:32] should be zero */
		if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
			return 1;

		vmx->msr_ia32_umwait_control = data;
		break;
	case MSR_IA32_SPEC_CTRL:
		if (!msr_info->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
			return 1;

		if (kvm_spec_ctrl_test_value(data))
			return 1;

		vmx->spec_ctrl = data;
		if (!data)
			break;

		/*
		 * For non-nested:
		 * When it's written (to non-zero) for the first time, pass
		 * it through.
		 *
		 * For nested:
		 * The handling of the MSR bitmap for L2 guests is done in
		 * nested_vmx_prepare_msr_bitmap. We should not touch the
		 * vmcs02.msr_bitmap here since it gets completely overwritten
		 * in the merging. We update the vmcs01 here for L1 as well
		 * since it will end up touching the MSR anyway now.
		 */
		vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
					      MSR_IA32_SPEC_CTRL,
					      MSR_TYPE_RW);
		break;
	case MSR_IA32_TSX_CTRL:
		if (!msr_info->host_initiated &&
		    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
			return 1;
		if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
			return 1;
		goto find_shared_msr;
	case MSR_IA32_PRED_CMD:
		if (!msr_info->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
			return 1;

		if (data & ~PRED_CMD_IBPB)
			return 1;
		if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL))
			return 1;
		if (!data)
			break;

		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);

		/*
		 * For non-nested:
		 * When it's written (to non-zero) for the first time, pass
		 * it through.
		 *
		 * For nested:
		 * The handling of the MSR bitmap for L2 guests is done in
		 * nested_vmx_prepare_msr_bitmap. We should not touch the
		 * vmcs02.msr_bitmap here since it gets completely overwritten
		 * in the merging.
		 */
		vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
					      MSR_TYPE_W);
		break;
	case MSR_IA32_CR_PAT:
		if (!kvm_pat_valid(data))
			return 1;

		if (is_guest_mode(vcpu) &&
		    get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
			get_vmcs12(vcpu)->guest_ia32_pat = data;

		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
			vmcs_write64(GUEST_IA32_PAT, data);
			vcpu->arch.pat = data;
			break;
		}
		ret = kvm_set_msr_common(vcpu, msr_info);
		break;
	case MSR_IA32_TSC_ADJUST:
		ret = kvm_set_msr_common(vcpu, msr_info);
		break;
	case MSR_IA32_MCG_EXT_CTL:
		if ((!msr_info->host_initiated &&
		     !(to_vmx(vcpu)->msr_ia32_feature_control &
		       FEAT_CTL_LMCE_ENABLED)) ||
		    (data & ~MCG_EXT_CTL_LMCE_EN))
			return 1;
		vcpu->arch.mcg_ext_ctl = data;
		break;
	case MSR_IA32_FEAT_CTL:
		if (!vmx_feature_control_msr_valid(vcpu, data) ||
		    (to_vmx(vcpu)->msr_ia32_feature_control &
		     FEAT_CTL_LOCKED && !msr_info->host_initiated))
			return 1;
		vmx->msr_ia32_feature_control = data;
		if (msr_info->host_initiated && data == 0)
			vmx_leave_nested(vcpu);
		break;
	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
		if (!msr_info->host_initiated)
			return 1; /* they are read-only */
		if (!nested_vmx_allowed(vcpu))
			return 1;
		return vmx_set_vmx_msr(vcpu, msr_index, data);
	case MSR_IA32_RTIT_CTL:
		if (!vmx_pt_mode_is_host_guest() ||
			vmx_rtit_ctl_check(vcpu, data) ||
			vmx->nested.vmxon)
			return 1;
		vmcs_write64(GUEST_IA32_RTIT_CTL, data);
		vmx->pt_desc.guest.ctl = data;
		pt_update_intercept_for_msr(vmx);
		break;
	case MSR_IA32_RTIT_STATUS:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (data & MSR_IA32_RTIT_STATUS_MASK)
			return 1;
		vmx->pt_desc.guest.status = data;
		break;
	case MSR_IA32_RTIT_CR3_MATCH:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_cr3_filtering))
			return 1;
		vmx->pt_desc.guest.cr3_match = data;
		break;
	case MSR_IA32_RTIT_OUTPUT_BASE:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_topa_output) &&
		    !intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_single_range_output))
			return 1;
		if (!pt_output_base_valid(vcpu, data))
			return 1;
		vmx->pt_desc.guest.output_base = data;
		break;
	case MSR_IA32_RTIT_OUTPUT_MASK:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_topa_output) &&
		    !intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_single_range_output))
			return 1;
		vmx->pt_desc.guest.output_mask = data;
		break;
	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
		if (!pt_can_write_msr(vmx))
			return 1;
		index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
		if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
						       PT_CAP_num_address_ranges))
			return 1;
		if (is_noncanonical_address(data, vcpu))
			return 1;
		if (index % 2)
			vmx->pt_desc.guest.addr_b[index / 2] = data;
		else
			vmx->pt_desc.guest.addr_a[index / 2] = data;
		break;
	case MSR_TSC_AUX:
		if (!msr_info->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
			return 1;
		/* Check reserved bit, higher 32 bits should be zero */
		if ((data >> 32) != 0)
			return 1;
		goto find_shared_msr;

	default:
	find_shared_msr:
		msr = find_msr_entry(vmx, msr_index);
		if (msr)
			ret = vmx_set_guest_msr(vmx, msr, data);
		else
			ret = kvm_set_msr_common(vcpu, msr_info);
	}

	return ret;
}

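/*
 * Refresh KVM's cached copy of a guest register or control register from the
 * current VMCS and mark it available.  Only the state that VMX keeps in the
 * VMCS (RSP, RIP, PDPTRs, CR0, CR3, CR4) is handled here; anything else is a
 * caller bug.
 */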
static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
	unsigned long guest_owned_bits;

	kvm_register_mark_available(vcpu, reg);

	switch (reg) {
	case VCPU_REGS_RSP:
		vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
		break;
	case VCPU_REGS_RIP:
		vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
		break;
	case VCPU_EXREG_PDPTR:
		if (enable_ept)
			ept_save_pdptrs(vcpu);
		break;
	case VCPU_EXREG_CR0:
		guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;

		vcpu->arch.cr0 &= ~guest_owned_bits;
		vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
		break;
	case VCPU_EXREG_CR3:
		if (is_unrestricted_guest(vcpu) ||
		    (enable_ept && is_paging(vcpu)))
			vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
		break;
	case VCPU_EXREG_CR4:
		guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;

		vcpu->arch.cr4 &= ~guest_owned_bits;
		vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}
}

static __init int cpu_has_kvm_support(void)
{
	return cpu_has_vmx();
}

static __init int vmx_disabled_by_bios(void)
{
	return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
	       !boot_cpu_has(X86_FEATURE_VMX);
}

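/*
 * Enter VMX operation on this CPU: set CR4.VMXE and execute VMXON with an
 * exception fixup, so a faulting VMXON (e.g. VMX left disabled by firmware in
 * IA32_FEAT_CTL) is reported as -EFAULT instead of crashing the host.
 */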
static int kvm_cpu_vmxon(u64 vmxon_pointer)
{
	u64 msr;

	cr4_set_bits(X86_CR4_VMXE);
	intel_pt_handle_vmx(1);

	asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
			  _ASM_EXTABLE(1b, %l[fault])
			  : : [vmxon_pointer] "m"(vmxon_pointer)
			  : : fault);
	return 0;

fault:
	WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
		  rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
	intel_pt_handle_vmx(0);
	cr4_clear_bits(X86_CR4_VMXE);

	return -EFAULT;
}

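/*
 * Per-CPU hardware-enable callback: refuses to run if VMX is already owned on
 * this CPU (CR4.VMXE set), then executes VMXON on the per-CPU vmxarea and,
 * with EPT enabled, flushes global EPT mappings.
 */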
static int hardware_enable(void)
{
	int cpu = raw_smp_processor_id();
	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
	int r;

	if (cr4_read_shadow() & X86_CR4_VMXE)
		return -EBUSY;

	/*
	 * This can happen if we hot-added a CPU but failed to allocate
	 * VP assist page for it.
	 */
	if (static_branch_unlikely(&enable_evmcs) &&
	    !hv_get_vp_assist_page(cpu))
		return -EFAULT;

	r = kvm_cpu_vmxon(phys_addr);
	if (r)
		return r;

	if (enable_ept)
		ept_sync_global();

	return 0;
}

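/* VMCLEAR every VMCS that is currently loaded on this CPU. */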
static void vmclear_local_loaded_vmcss(void)
{
	int cpu = raw_smp_processor_id();
	struct loaded_vmcs *v, *n;

	list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
				 loaded_vmcss_on_cpu_link)
		__loaded_vmcs_clear(v);
}

/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
 * tricks.
 */
static void kvm_cpu_vmxoff(void)
{
	asm volatile (__ex("vmxoff"));

	intel_pt_handle_vmx(0);
	cr4_clear_bits(X86_CR4_VMXE);
}

static void hardware_disable(void)
{
	vmclear_local_loaded_vmcss();
	kvm_cpu_vmxoff();
}

/*
 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
 * directly instead of going through cpu_has(), to ensure KVM is trapping
 * ENCLS whenever it's supported in hardware.  It does not matter whether
 * the host OS supports or has enabled SGX.
 */
static bool cpu_has_sgx(void)
{
	return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
}

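/*
 * Reconcile required (ctl_min) and optional (ctl_opt) control bits against
 * the capability MSR 'msr': the MSR's low word gives the must-be-one bits and
 * the high word the allowed-one bits.  Fails with -EIO if a required bit is
 * not supported.  Typical use, as in setup_vmcs_config() below:
 *
 *	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
 *				&_cpu_based_exec_control) < 0)
 *		return -EIO;
 */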
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
				      u32 msr, u32 *result)
{
	u32 vmx_msr_low, vmx_msr_high;
	u32 ctl = ctl_min | ctl_opt;

	rdmsr(msr, vmx_msr_low, vmx_msr_high);

	ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
	ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */

	/* Ensure minimum (required) set of control bits are supported. */
	if (ctl_min & ~ctl)
		return -EIO;

	*result = ctl;
	return 0;
}

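/*
 * Probe the VMX capability MSRs once at module load and record the chosen
 * pin-based, processor-based, secondary, VM-exit and VM-entry controls in
 * *vmcs_conf, along with the VMCS size and revision reported by
 * MSR_IA32_VMX_BASIC.
 */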
static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
				    struct vmx_capability *vmx_cap)
{
	u32 vmx_msr_low, vmx_msr_high;
	u32 min, opt, min2, opt2;
	u32 _pin_based_exec_control = 0;
	u32 _cpu_based_exec_control = 0;
	u32 _cpu_based_2nd_exec_control = 0;
	u32 _vmexit_control = 0;
	u32 _vmentry_control = 0;

	memset(vmcs_conf, 0, sizeof(*vmcs_conf));
	min = CPU_BASED_HLT_EXITING |
#ifdef CONFIG_X86_64
	      CPU_BASED_CR8_LOAD_EXITING |
	      CPU_BASED_CR8_STORE_EXITING |
#endif
	      CPU_BASED_CR3_LOAD_EXITING |
	      CPU_BASED_CR3_STORE_EXITING |
	      CPU_BASED_UNCOND_IO_EXITING |
	      CPU_BASED_MOV_DR_EXITING |
	      CPU_BASED_USE_TSC_OFFSETTING |
	      CPU_BASED_MWAIT_EXITING |
	      CPU_BASED_MONITOR_EXITING |
	      CPU_BASED_INVLPG_EXITING |
	      CPU_BASED_RDPMC_EXITING;

	opt = CPU_BASED_TPR_SHADOW |
	      CPU_BASED_USE_MSR_BITMAPS |
	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
				&_cpu_based_exec_control) < 0)
		return -EIO;
#ifdef CONFIG_X86_64
	if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
		_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
					   ~CPU_BASED_CR8_STORE_EXITING;
#endif
	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
		min2 = 0;
		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
			SECONDARY_EXEC_WBINVD_EXITING |
			SECONDARY_EXEC_ENABLE_VPID |
			SECONDARY_EXEC_ENABLE_EPT |
			SECONDARY_EXEC_UNRESTRICTED_GUEST |
			SECONDARY_EXEC_PAUSE_LOOP_EXITING |
			SECONDARY_EXEC_DESC |
			SECONDARY_EXEC_ENABLE_RDTSCP |
			SECONDARY_EXEC_ENABLE_INVPCID |
			SECONDARY_EXEC_APIC_REGISTER_VIRT |
			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
			SECONDARY_EXEC_SHADOW_VMCS |
			SECONDARY_EXEC_XSAVES |
			SECONDARY_EXEC_RDSEED_EXITING |
			SECONDARY_EXEC_RDRAND_EXITING |
			SECONDARY_EXEC_ENABLE_PML |
			SECONDARY_EXEC_TSC_SCALING |
			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
			SECONDARY_EXEC_PT_USE_GPA |
			SECONDARY_EXEC_PT_CONCEAL_VMX |
			SECONDARY_EXEC_ENABLE_VMFUNC;
		if (cpu_has_sgx())
			opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
		if (adjust_vmx_controls(min2, opt2,
					MSR_IA32_VMX_PROCBASED_CTLS2,
					&_cpu_based_2nd_exec_control) < 0)
			return -EIO;
	}
#ifndef CONFIG_X86_64
	if (!(_cpu_based_2nd_exec_control &
				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
		_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif

	if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
		_cpu_based_2nd_exec_control &= ~(
				SECONDARY_EXEC_APIC_REGISTER_VIRT |
				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
				SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);

	rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
		&vmx_cap->ept, &vmx_cap->vpid);

	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
		/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
		   enabled */
		_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
					     CPU_BASED_CR3_STORE_EXITING |
					     CPU_BASED_INVLPG_EXITING);
	} else if (vmx_cap->ept) {
		vmx_cap->ept = 0;
		pr_warn_once("EPT CAP should not exist if not support "
				"1-setting enable EPT VM-execution control\n");
	}
	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
		vmx_cap->vpid) {
		vmx_cap->vpid = 0;
		pr_warn_once("VPID CAP should not exist if not support "
				"1-setting enable VPID VM-execution control\n");
	}

	min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
#ifdef CONFIG_X86_64
	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
#endif
	opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
	      VM_EXIT_LOAD_IA32_PAT |
	      VM_EXIT_LOAD_IA32_EFER |
	      VM_EXIT_CLEAR_BNDCFGS |
	      VM_EXIT_PT_CONCEAL_PIP |
	      VM_EXIT_CLEAR_IA32_RTIT_CTL;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
				&_vmexit_control) < 0)
		return -EIO;

	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
		 PIN_BASED_VMX_PREEMPTION_TIMER;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
				&_pin_based_exec_control) < 0)
		return -EIO;

	if (cpu_has_broken_vmx_preemption_timer())
		_pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
	if (!(_cpu_based_2nd_exec_control &
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;

	min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
	opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
	      VM_ENTRY_LOAD_IA32_PAT |
	      VM_ENTRY_LOAD_IA32_EFER |
	      VM_ENTRY_LOAD_BNDCFGS |
	      VM_ENTRY_PT_CONCEAL_PIP |
	      VM_ENTRY_LOAD_IA32_RTIT_CTL;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
				&_vmentry_control) < 0)
		return -EIO;

	/*
	 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
	 * can't be used due to an errata where VM Exit may incorrectly clear
	 * IA32_PERF_GLOBAL_CTRL[34:32].  Workaround the errata by using the
	 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
	 */
	if (boot_cpu_data.x86 == 0x6) {
		switch (boot_cpu_data.x86_model) {
		case 26: /* AAK155 */
		case 30: /* AAP115 */
		case 37: /* AAT100 */
		case 44: /* BC86,AAY89,BD102 */
		case 46: /* BA97 */
			_vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
			_vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
			pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
					"does not work properly. Using workaround\n");
			break;
		default:
			break;
		}
	}


	rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);

	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
	if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
		return -EIO;

#ifdef CONFIG_X86_64
	/* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
	if (vmx_msr_high & (1u<<16))
		return -EIO;
#endif

	/* Require Write-Back (WB) memory type for VMCS accesses. */
	if (((vmx_msr_high >> 18) & 15) != 6)
		return -EIO;

	vmcs_conf->size = vmx_msr_high & 0x1fff;
	vmcs_conf->order = get_order(vmcs_conf->size);
	vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;

	vmcs_conf->revision_id = vmx_msr_low;

	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
	vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
	vmcs_conf->vmexit_ctrl         = _vmexit_control;
	vmcs_conf->vmentry_ctrl        = _vmentry_control;

	if (static_branch_unlikely(&enable_evmcs))
		evmcs_sanitize_exec_ctrls(vmcs_conf);

	return 0;
}

struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
{
	int node = cpu_to_node(cpu);
	struct page *pages;
	struct vmcs *vmcs;

	pages = __alloc_pages_node(node, flags, vmcs_config.order);
	if (!pages)
		return NULL;
	vmcs = page_address(pages);
	memset(vmcs, 0, vmcs_config.size);

	/* KVM supports Enlightened VMCS v1 only */
	if (static_branch_unlikely(&enable_evmcs))
		vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
	else
		vmcs->hdr.revision_id = vmcs_config.revision_id;

	if (shadow)
		vmcs->hdr.shadow_vmcs = 1;
	return vmcs;
}

void free_vmcs(struct vmcs *vmcs)
{
	free_pages((unsigned long)vmcs, vmcs_config.order);
}

/*
 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
 */
void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
{
	if (!loaded_vmcs->vmcs)
		return;
	loaded_vmcs_clear(loaded_vmcs);
	free_vmcs(loaded_vmcs->vmcs);
	loaded_vmcs->vmcs = NULL;
	if (loaded_vmcs->msr_bitmap)
		free_page((unsigned long)loaded_vmcs->msr_bitmap);
	WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
}

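/*
 * Allocate and initialize a loaded_vmcs: the VMCS itself plus, when MSR
 * bitmaps are supported, a per-VMCS MSR bitmap that starts out intercepting
 * every MSR (all bits set).
 */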
int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
{
	loaded_vmcs->vmcs = alloc_vmcs(false);
	if (!loaded_vmcs->vmcs)
		return -ENOMEM;

	vmcs_clear(loaded_vmcs->vmcs);

	loaded_vmcs->shadow_vmcs = NULL;
	loaded_vmcs->hv_timer_soft_disabled = false;
	loaded_vmcs->cpu = -1;
	loaded_vmcs->launched = 0;

	if (cpu_has_vmx_msr_bitmap()) {
		loaded_vmcs->msr_bitmap = (unsigned long *)
				__get_free_page(GFP_KERNEL_ACCOUNT);
		if (!loaded_vmcs->msr_bitmap)
			goto out_vmcs;
		memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);

		if (IS_ENABLED(CONFIG_HYPERV) &&
		    static_branch_unlikely(&enable_evmcs) &&
		    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
			struct hv_enlightened_vmcs *evmcs =
				(struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;

			evmcs->hv_enlightenments_control.msr_bitmap = 1;
		}
	}

	memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
	memset(&loaded_vmcs->controls_shadow, 0,
		sizeof(struct vmcs_controls_shadow));

	return 0;

out_vmcs:
	free_loaded_vmcs(loaded_vmcs);
	return -ENOMEM;
}

static void free_kvm_area(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		free_vmcs(per_cpu(vmxarea, cpu));
		per_cpu(vmxarea, cpu) = NULL;
	}
}

static __init int alloc_kvm_area(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct vmcs *vmcs;

		vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
		if (!vmcs) {
			free_kvm_area();
			return -ENOMEM;
		}

		/*
		 * When eVMCS is enabled, alloc_vmcs_cpu() sets
		 * vmcs->revision_id to KVM_EVMCS_VERSION instead of
		 * revision_id reported by MSR_IA32_VMX_BASIC.
		 *
		 * However, even though not explicitly documented by
		 * TLFS, VMXArea passed as VMXON argument should
		 * still be marked with revision_id reported by
		 * physical CPU.
		 */
		if (static_branch_unlikely(&enable_evmcs))
			vmcs->hdr.revision_id = vmcs_config.revision_id;

		per_cpu(vmxarea, cpu) = vmcs;
	}
	return 0;
}

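/*
 * Reload a segment register after the guest leaves real mode.  When invalid
 * guest state is not being emulated, force a consistent RPL/DPL so the
 * VM-entry checks for protected mode are satisfied.
 */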
static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
		struct kvm_segment *save)
{
	if (!emulate_invalid_guest_state) {
		/*
		 * CS and SS RPL should be equal during guest entry according
		 * to VMX spec, but in reality it is not always so. Since vcpu
		 * is in the middle of the transition from real mode to
		 * protected mode it is safe to assume that RPL 0 is a good
		 * default value.
		 */
		if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
			save->selector &= ~SEGMENT_RPL_MASK;
		save->dpl = save->selector & SEGMENT_RPL_MASK;
		save->s = 1;
	}
	vmx_set_segment(vcpu, save, seg);
}

static void enter_pmode(struct kvm_vcpu *vcpu)
{
	unsigned long flags;
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Update real mode segment cache. It may not be up-to-date if a
	 * segment register was written while the vcpu was in guest mode.
	 */
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);

	vmx->rmode.vm86_active = 0;

	vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);

	flags = vmcs_readl(GUEST_RFLAGS);
	flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
	flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
	vmcs_writel(GUEST_RFLAGS, flags);

	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
			(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));

	update_exception_bitmap(vcpu);

	fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
	fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
	fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
	fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
	fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
	fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
}

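/*
 * Load a segment register with real-mode compatible values.  Without
 * emulate_invalid_guest_state the segment is rewritten as a classic real-mode
 * segment (selector = base >> 4, 64KiB limit, type 3) before being pushed
 * into the VMCS.
 */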
static void fix_rmode_seg(int seg, struct kvm_segment *save)
{
	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
	struct kvm_segment var = *save;

	var.dpl = 0x3;
	if (seg == VCPU_SREG_CS)
		var.type = 0x3;

	if (!emulate_invalid_guest_state) {
		var.selector = var.base >> 4;
		var.base = var.base & 0xffff0;
		var.limit = 0xffff;
		var.g = 0;
		var.db = 0;
		var.present = 1;
		var.s = 1;
		var.l = 0;
		var.unusable = 0;
		var.type = 0x3;
		var.avl = 0;
		if (save->base & 0xf)
			printk_once(KERN_WARNING "kvm: segment base is not "
					"paragraph aligned when entering "
					"protected mode (seg=%d)", seg);
	}

	vmcs_write16(sf->selector, var.selector);
	vmcs_writel(sf->base, var.base);
	vmcs_write32(sf->limit, var.limit);
	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
}

static void enter_rmode(struct kvm_vcpu *vcpu)
{
	unsigned long flags;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);

	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);

	vmx->rmode.vm86_active = 1;

	/*
	 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
	 * vcpu. Warn the user that an update is overdue.
	 */
	if (!kvm_vmx->tss_addr)
		printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
			     "called before entering vcpu\n");

	vmx_segment_cache_clear(vmx);

	vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

	flags = vmcs_readl(GUEST_RFLAGS);
	vmx->rmode.save_rflags = flags;

	flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;

	vmcs_writel(GUEST_RFLAGS, flags);
	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
	update_exception_bitmap(vcpu);

	fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
	fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
	fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
	fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
	fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
	fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);

	kvm_mmu_reset_context(vcpu);
}

void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);

	if (!msr)
		return;

	vcpu->arch.efer = efer;
	if (efer & EFER_LMA) {
		vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
		msr->data = efer;
	} else {
		vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);

		msr->data = efer & ~EFER_LME;
	}
	setup_msrs(vmx);
}

#ifdef CONFIG_X86_64

static void enter_lmode(struct kvm_vcpu *vcpu)
{
	u32 guest_tr_ar;

	vmx_segment_cache_clear(to_vmx(vcpu));

	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
	if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
		pr_debug_ratelimited("%s: tss fixup for long mode. \n",
				     __func__);
		vmcs_write32(GUEST_TR_AR_BYTES,
			     (guest_tr_ar & ~VMX_AR_TYPE_MASK)
			     | VMX_AR_TYPE_BUSY_64_TSS);
	}
	vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
}

static void exit_lmode(struct kvm_vcpu *vcpu)
{
	vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
	vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
}
#endif

2814
static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
2815 2816 2817 2818
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
2819 2820 2821 2822 2823
	 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
	 * the CPU is not required to invalidate guest-physical mappings on
	 * VM-Entry, even if VPID is disabled.  Guest-physical mappings are
	 * associated with the root EPT structure and not any particular VPID
	 * (INVVPID also isn't required to invalidate guest-physical mappings).
2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836
	 */
	if (enable_ept) {
		ept_sync_global();
	} else if (enable_vpid) {
		if (cpu_has_vmx_invvpid_global()) {
			vpid_sync_vcpu_global();
		} else {
			vpid_sync_vcpu_single(vmx->vpid);
			vpid_sync_vcpu_single(vmx->nested.vpid02);
		}
	}
}


static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	u64 root_hpa = mmu->root_hpa;

	/* No flush required if the current context is invalid. */
	if (!VALID_PAGE(root_hpa))
		return;

	if (enable_ept)
		ept_sync_context(construct_eptp(vcpu, root_hpa,
						mmu->shadow_root_level));
	else if (!is_guest_mode(vcpu))
		vpid_sync_context(to_vmx(vcpu)->vpid);
	else
		vpid_sync_context(nested_get_vpid02(vcpu));
}

static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
	/*
	 * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in
	 * vmx_flush_tlb_guest() for an explanation of why this is ok.
	 */
	vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr);
}

static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
	/*
	 * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0
	 * or a vpid couldn't be allocated for this vCPU.  VM-Enter and VM-Exit
	 * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is
	 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
	 * i.e. no explicit INVVPID is necessary.
	 */
	vpid_sync_context(to_vmx(vcpu)->vpid);
}

void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

	if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
		return;

	if (is_pae_paging(vcpu)) {
		vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
		vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
		vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
		vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
	}
}

void ept_save_pdptrs(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

	if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
		return;

	mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
	mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
	mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
	mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);

	kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
}

static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
					unsigned long cr0,
					struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
		vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
	if (!(cr0 & X86_CR0_PG)) {
		/* From paging/starting to nonpaging */
		exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
					  CPU_BASED_CR3_STORE_EXITING);
		vcpu->arch.cr0 = cr0;
		vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
	} else if (!is_paging(vcpu)) {
		/* From nonpaging to paging */
		exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
					    CPU_BASED_CR3_STORE_EXITING);
		vcpu->arch.cr0 = cr0;
		vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
	}

	if (!(cr0 & X86_CR0_WP))
		*hw_cr0 &= ~X86_CR0_WP;
}

void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long hw_cr0;

	hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
	if (is_unrestricted_guest(vcpu))
		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
	else {
		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;

		if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
			enter_pmode(vcpu);

		if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
			enter_rmode(vcpu);
	}

#ifdef CONFIG_X86_64
	if (vcpu->arch.efer & EFER_LME) {
		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
			enter_lmode(vcpu);
		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
			exit_lmode(vcpu);
	}
#endif

	if (enable_ept && !is_unrestricted_guest(vcpu))
		ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);

	vmcs_writel(CR0_READ_SHADOW, cr0);
	vmcs_writel(GUEST_CR0, hw_cr0);
	vcpu->arch.cr0 = cr0;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);

	/* depends on vcpu->arch.cr0 to be set to a new value */
	vmx->emulation_required = emulation_required(vcpu);
}

static int vmx_get_max_tdp_level(void)
{
	if (cpu_has_vmx_ept_5levels())
		return 5;
	return 4;
}

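/*
 * Build the EPT pointer for a given root: write-back memory type, 4- or
 * 5-level page-walk length, optional accessed/dirty bits, and the root HPA.
 */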
u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
		   int root_level)
{
	u64 eptp = VMX_EPTP_MT_WB;

	eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;

	if (enable_ept_ad_bits &&
	    (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
		eptp |= VMX_EPTP_AD_ENABLE_BIT;
	eptp |= (root_hpa & PAGE_MASK);

	return eptp;
}

static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
			     int pgd_level)
{
	struct kvm *kvm = vcpu->kvm;
	bool update_guest_cr3 = true;
	unsigned long guest_cr3;
	u64 eptp;

	if (enable_ept) {
		eptp = construct_eptp(vcpu, pgd, pgd_level);
		vmcs_write64(EPT_POINTER, eptp);

		if (kvm_x86_ops.tlb_remote_flush) {
			spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
			to_vmx(vcpu)->ept_pointer = eptp;
			to_kvm_vmx(kvm)->ept_pointers_match
				= EPT_POINTERS_CHECK;
			spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
		}

		if (!enable_unrestricted_guest && !is_paging(vcpu))
			guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
		else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
			guest_cr3 = vcpu->arch.cr3;
		else /* vmcs01.GUEST_CR3 is already up-to-date. */
			update_guest_cr3 = false;
		vmx_ept_load_pdptrs(vcpu);
	} else {
		guest_cr3 = pgd;
	}

	if (update_guest_cr3)
		vmcs_writel(GUEST_CR3, guest_cr3);
}

int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	/*
	 * Pass through host's Machine Check Enable value to hw_cr4, which
	 * is in force while we are in guest mode.  Do not let guests control
	 * this bit, even if host CR4.MCE == 0.
	 */
	unsigned long hw_cr4;

	hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
	if (is_unrestricted_guest(vcpu))
		hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
	else if (vmx->rmode.vm86_active)
		hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
	else
		hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;

	if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
		if (cr4 & X86_CR4_UMIP) {
			secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
			hw_cr4 &= ~X86_CR4_UMIP;
		} else if (!is_guest_mode(vcpu) ||
			!nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
			secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
		}
	}

	if (cr4 & X86_CR4_VMXE) {
		/*
		 * To use VMXON (and later other VMX instructions), a guest
		 * must first be able to turn on cr4.VMXE (see handle_vmon()).
		 * So basically the check on whether to allow nested VMX
		 * is here.  We operate under the default treatment of SMM,
		 * so VMX cannot be enabled under SMM.
		 */
		if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
			return 1;
	}

	if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
		return 1;

	vcpu->arch.cr4 = cr4;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);

	if (!is_unrestricted_guest(vcpu)) {
		if (enable_ept) {
			if (!is_paging(vcpu)) {
				hw_cr4 &= ~X86_CR4_PAE;
				hw_cr4 |= X86_CR4_PSE;
			} else if (!(cr4 & X86_CR4_PAE)) {
				hw_cr4 &= ~X86_CR4_PAE;
			}
		}

		/*
		 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
		 * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
		 * to be manually disabled when guest switches to non-paging
		 * mode.
		 *
		 * If !enable_unrestricted_guest, the CPU is always running
		 * with CR0.PG=1 and CR4 needs to be modified.
		 * If enable_unrestricted_guest, the CPU automatically
		 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
		 */
		if (!is_paging(vcpu))
			hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
	}

	vmcs_writel(CR4_READ_SHADOW, cr4);
	vmcs_writel(GUEST_CR4, hw_cr4);
	return 0;
}

void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 ar;

	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
		*var = vmx->rmode.segs[seg];
		if (seg == VCPU_SREG_TR
		    || var->selector == vmx_read_guest_seg_selector(vmx, seg))
			return;
		var->base = vmx_read_guest_seg_base(vmx, seg);
		var->selector = vmx_read_guest_seg_selector(vmx, seg);
		return;
	}
	var->base = vmx_read_guest_seg_base(vmx, seg);
	var->limit = vmx_read_guest_seg_limit(vmx, seg);
	var->selector = vmx_read_guest_seg_selector(vmx, seg);
	ar = vmx_read_guest_seg_ar(vmx, seg);
	var->unusable = (ar >> 16) & 1;
	var->type = ar & 15;
	var->s = (ar >> 4) & 1;
	var->dpl = (ar >> 5) & 3;
	/*
	 * Some userspaces do not preserve unusable property. Since usable
	 * segment has to be present according to VMX spec we can use present
	 * property to amend userspace bug by making unusable segment always
	 * nonpresent. vmx_segment_access_rights() already marks nonpresent
	 * segment as unusable.
	 */
	var->present = !var->unusable;
	var->avl = (ar >> 12) & 1;
	var->l = (ar >> 13) & 1;
	var->db = (ar >> 14) & 1;
	var->g = (ar >> 15) & 1;
}

static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
	struct kvm_segment s;

	if (to_vmx(vcpu)->rmode.vm86_active) {
		vmx_get_segment(vcpu, &s, seg);
		return s.base;
	}
	return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
}

int vmx_get_cpl(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (unlikely(vmx->rmode.vm86_active))
		return 0;
	else {
		int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
		return VMX_AR_DPL(ar);
	}
}

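/*
 * Pack a struct kvm_segment into the VMCS access-rights layout; an unusable
 * or not-present segment is encoded with only the "unusable" bit (bit 16)
 * set.
 */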
static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
	u32 ar;

	if (var->unusable || !var->present)
		ar = 1 << 16;
	else {
		ar = var->type & 15;
		ar |= (var->s & 1) << 4;
		ar |= (var->dpl & 3) << 5;
		ar |= (var->present & 1) << 7;
		ar |= (var->avl & 1) << 12;
		ar |= (var->l & 1) << 13;
		ar |= (var->db & 1) << 14;
		ar |= (var->g & 1) << 15;
	}

	return ar;
}

void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];

	vmx_segment_cache_clear(vmx);

	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
		vmx->rmode.segs[seg] = *var;
		if (seg == VCPU_SREG_TR)
			vmcs_write16(sf->selector, var->selector);
		else if (var->s)
			fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
		goto out;
	}

	vmcs_writel(sf->base, var->base);
	vmcs_write32(sf->limit, var->limit);
	vmcs_write16(sf->selector, var->selector);

	/*
	 *   Fix the "Accessed" bit in AR field of segment registers for older
	 * qemu binaries.
	 *   IA32 arch specifies that at the time of processor reset the
	 * "Accessed" bit in the AR field of segment registers is 1. And qemu
	 * is setting it to 0 in the userland code. This causes invalid guest
	 * state vmexit when "unrestricted guest" mode is turned on.
	 *    Fix for this setup issue in cpu_reset is being pushed in the qemu
	 * tree. Newer qemu binaries with that qemu fix would not need this
	 * kvm hack.
	 */
	if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
		var->type |= 0x1; /* Accessed */

	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));

out:
	vmx->emulation_required = emulation_required(vcpu);
}

static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
	u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);

	*db = (ar >> 14) & 1;
	*l = (ar >> 13) & 1;
}

static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
	dt->address = vmcs_readl(GUEST_IDTR_BASE);
}

static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
	vmcs_writel(GUEST_IDTR_BASE, dt->address);
}

static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
	dt->address = vmcs_readl(GUEST_GDTR_BASE);
}

static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
	vmcs_writel(GUEST_GDTR_BASE, dt->address);
}

static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
{
	struct kvm_segment var;
	u32 ar;

	vmx_get_segment(vcpu, &var, seg);
	var.dpl = 0x3;
	if (seg == VCPU_SREG_CS)
		var.type = 0x3;
	ar = vmx_segment_access_rights(&var);

	if (var.base != (var.selector << 4))
		return false;
	if (var.limit != 0xffff)
		return false;
	if (ar != 0xf3)
		return false;

	return true;
}

static bool code_segment_valid(struct kvm_vcpu *vcpu)
{
	struct kvm_segment cs;
	unsigned int cs_rpl;

	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
	cs_rpl = cs.selector & SEGMENT_RPL_MASK;

	if (cs.unusable)
		return false;
	if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
		return false;
	if (!cs.s)
		return false;
	if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
		if (cs.dpl > cs_rpl)
			return false;
	} else {
		if (cs.dpl != cs_rpl)
			return false;
	}
	if (!cs.present)
		return false;

	/* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
	return true;
}

static bool stack_segment_valid(struct kvm_vcpu *vcpu)
{
	struct kvm_segment ss;
	unsigned int ss_rpl;

	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
	ss_rpl = ss.selector & SEGMENT_RPL_MASK;

	if (ss.unusable)
		return true;
	if (ss.type != 3 && ss.type != 7)
		return false;
	if (!ss.s)
		return false;
	if (ss.dpl != ss_rpl) /* DPL != RPL */
		return false;
	if (!ss.present)
		return false;

	return true;
}

static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
{
	struct kvm_segment var;
	unsigned int rpl;

	vmx_get_segment(vcpu, &var, seg);
	rpl = var.selector & SEGMENT_RPL_MASK;

	if (var.unusable)
		return true;
	if (!var.s)
		return false;
	if (!var.present)
		return false;
	if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
		if (var.dpl < rpl) /* DPL < RPL */
			return false;
	}

	/* TODO: Add other members to kvm_segment_field to allow checking for other access
	 * rights flags
	 */
	return true;
}

static bool tr_valid(struct kvm_vcpu *vcpu)
{
	struct kvm_segment tr;

	vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);

	if (tr.unusable)
		return false;
	if (tr.selector & SEGMENT_TI_MASK)	/* TI = 1 */
		return false;
	if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
		return false;
	if (!tr.present)
		return false;

	return true;
}

static bool ldtr_valid(struct kvm_vcpu *vcpu)
{
	struct kvm_segment ldtr;

	vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);

	if (ldtr.unusable)
		return true;
	if (ldtr.selector & SEGMENT_TI_MASK)	/* TI = 1 */
		return false;
	if (ldtr.type != 2)
		return false;
	if (!ldtr.present)
		return false;

	return true;
}

static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
{
	struct kvm_segment cs, ss;

	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);

	return ((cs.selector & SEGMENT_RPL_MASK) ==
		 (ss.selector & SEGMENT_RPL_MASK));
}

/*
 * Check if guest state is valid. Returns true if valid, false if not.
 * We assume that registers are always usable.
 */
bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
{
	/* real mode guest state checks */
	if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
		if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
			return false;
		if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
			return false;
		if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
			return false;
		if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
			return false;
		if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
			return false;
		if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
			return false;
	} else {
	/* protected mode guest state checks */
		if (!cs_ss_rpl_check(vcpu))
			return false;
		if (!code_segment_valid(vcpu))
			return false;
		if (!stack_segment_valid(vcpu))
			return false;
		if (!data_segment_valid(vcpu, VCPU_SREG_DS))
			return false;
		if (!data_segment_valid(vcpu, VCPU_SREG_ES))
			return false;
		if (!data_segment_valid(vcpu, VCPU_SREG_FS))
			return false;
		if (!data_segment_valid(vcpu, VCPU_SREG_GS))
			return false;
		if (!tr_valid(vcpu))
			return false;
		if (!ldtr_valid(vcpu))
			return false;
	}
	/* TODO:
	 * - Add checks on RIP
	 * - Add checks on RFLAGS
	 */

	return true;
}

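/*
 * Lay out a minimal TSS (plus the I/O bitmap terminator byte) in guest
 * memory at the address userspace registered with KVM_SET_TSS_ADDR.
 * This is only needed when unrestricted guest mode is unavailable and
 * real mode has to be faked with vm86.
 */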
static int init_rmode_tss(struct kvm *kvm)
{
	gfn_t fn;
	u16 data = 0;
	int idx, r;

	idx = srcu_read_lock(&kvm->srcu);
	fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
	if (r < 0)
		goto out;
	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
	r = kvm_write_guest_page(kvm, fn++, &data,
			TSS_IOPB_BASE_OFFSET, sizeof(u16));
	if (r < 0)
		goto out;
	r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
	if (r < 0)
		goto out;
	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
	if (r < 0)
		goto out;
	data = ~0;
	r = kvm_write_guest_page(kvm, fn, &data,
				 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
				 sizeof(u8));
out:
	srcu_read_unlock(&kvm->srcu, idx);
	return r;
}

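/*
 * Build the identity-mapped page table used while the guest runs
 * without paging under EPT: a single page of 1024 4MB (PSE) entries
 * mapping the low 4GB of guest physical memory one to one.
 */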
static int init_rmode_identity_map(struct kvm *kvm)
{
	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
	int i, r = 0;
	kvm_pfn_t identity_map_pfn;
	u32 tmp;

	/* Protect kvm_vmx->ept_identity_pagetable_done. */
	mutex_lock(&kvm->slots_lock);

	if (likely(kvm_vmx->ept_identity_pagetable_done))
		goto out;

	if (!kvm_vmx->ept_identity_map_addr)
		kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
	identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;

	r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
				    kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
	if (r < 0)
		goto out;

	r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
	if (r < 0)
		goto out;
	/* Set up identity-mapping pagetable for EPT in real mode */
	for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
		tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
			_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
		r = kvm_write_guest_page(kvm, identity_map_pfn,
				&tmp, i * sizeof(tmp), sizeof(tmp));
		if (r < 0)
			goto out;
	}
	kvm_vmx->ept_identity_pagetable_done = true;

out:
	mutex_unlock(&kvm->slots_lock);
	return r;
}

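/*
 * Reset a guest segment register to its power-on state: selector and
 * base 0, 64KB limit, access rights 0x93 (0x9b for the code segment).
 */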
static void seg_setup(int seg)
{
	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
	unsigned int ar;

	vmcs_write16(sf->selector, 0);
	vmcs_writel(sf->base, 0);
	vmcs_write32(sf->limit, 0xffff);
	ar = 0x93;
	if (seg == VCPU_SREG_CS)
		ar |= 0x08; /* code segment */

	vmcs_write32(sf->ar_bytes, ar);
}

static int alloc_apic_access_page(struct kvm *kvm)
{
	struct page *page;
	int r = 0;

	mutex_lock(&kvm->slots_lock);
	if (kvm->arch.apic_access_page_done)
		goto out;
	r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
				    APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
	if (r)
		goto out;

	page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
	if (is_error_page(page)) {
		r = -EFAULT;
		goto out;
	}

	/*
	 * Do not pin the page in memory, so that memory hot-unplug
	 * is able to migrate it.
	 */
	put_page(page);
	kvm->arch.apic_access_page_done = true;
out:
	mutex_unlock(&kvm->slots_lock);
	return r;
}

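/*
 * VPIDs tag the guest's TLB entries so that VM transitions do not force
 * a full TLB flush.  VPID 0 is never handed out; callers treat a return
 * value of 0 as "no VPID available".
 */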
int allocate_vpid(void)
{
	int vpid;

	if (!enable_vpid)
		return 0;
	spin_lock(&vmx_vpid_lock);
	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
	if (vpid < VMX_NR_VPIDS)
		__set_bit(vpid, vmx_vpid_bitmap);
	else
		vpid = 0;
	spin_unlock(&vmx_vpid_lock);
	return vpid;
}

void free_vpid(int vpid)
{
	if (!enable_vpid || vpid == 0)
		return;
	spin_lock(&vmx_vpid_lock);
	__clear_bit(vpid, vmx_vpid_bitmap);
	spin_unlock(&vmx_vpid_lock);
}

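/*
 * The 4K MSR bitmap page holds four 1KB bitmaps: read-low (offset 0x000)
 * and write-low (0x800) for MSRs 0x00000000-0x00001fff, read-high (0x400)
 * and write-high (0xc00) for MSRs 0xc0000000-0xc0001fff.  A clear bit
 * lets the guest access the MSR without a VM exit.
 */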
static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
							  u32 msr, int type)
{
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return;

	if (static_branch_unlikely(&enable_evmcs))
		evmcs_touch_msr_bitmap();

	/*
	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
	 * have the write-low and read-high bitmap offsets the wrong way round.
	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
	 */
	if (msr <= 0x1fff) {
		if (type & MSR_TYPE_R)
			/* read-low */
			__clear_bit(msr, msr_bitmap + 0x000 / f);

		if (type & MSR_TYPE_W)
			/* write-low */
			__clear_bit(msr, msr_bitmap + 0x800 / f);

	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		if (type & MSR_TYPE_R)
			/* read-high */
			__clear_bit(msr, msr_bitmap + 0x400 / f);

		if (type & MSR_TYPE_W)
			/* write-high */
			__clear_bit(msr, msr_bitmap + 0xc00 / f);

	}
}

static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
							 u32 msr, int type)
{
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return;

	if (static_branch_unlikely(&enable_evmcs))
		evmcs_touch_msr_bitmap();

	/*
	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
	 * have the write-low and read-high bitmap offsets the wrong way round.
	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
	 */
	if (msr <= 0x1fff) {
		if (type & MSR_TYPE_R)
			/* read-low */
			__set_bit(msr, msr_bitmap + 0x000 / f);

		if (type & MSR_TYPE_W)
			/* write-low */
			__set_bit(msr, msr_bitmap + 0x800 / f);

	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		if (type & MSR_TYPE_R)
			/* read-high */
			__set_bit(msr, msr_bitmap + 0x400 / f);

		if (type & MSR_TYPE_W)
			/* write-high */
			__set_bit(msr, msr_bitmap + 0xc00 / f);

	}
}

static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
						       u32 msr, int type, bool value)
{
	if (value)
		vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
	else
		vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
}

static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
{
	u8 mode = 0;

	if (cpu_has_secondary_exec_ctrls() &&
	    (secondary_exec_controls_get(to_vmx(vcpu)) &
	     SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
		mode |= MSR_BITMAP_MODE_X2APIC;
		if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
			mode |= MSR_BITMAP_MODE_X2APIC_APICV;
	}

	return mode;
}

static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
					 u8 mode)
{
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;
		msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}

	if (mode & MSR_BITMAP_MODE_X2APIC) {
		/*
		 * TPR reads and writes can be virtualized even if virtual interrupt
		 * delivery is not in use.
		 */
		vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
		if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
			vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
			vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
			vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
		}
	}
}

void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
	u8 mode = vmx_msr_bitmap_mode(vcpu);
	u8 changed = mode ^ vmx->msr_bitmap_mode;

	if (!changed)
		return;

	if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
		vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);

	vmx->msr_bitmap_mode = mode;
}

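/*
 * Intel PT: the RTIT configuration MSRs are intercepted while the guest's
 * RTIT_CTL.TraceEn is clear and passed through once the guest has enabled
 * tracing.
 */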
void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
{
	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
	bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
	u32 i;

	vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS,
							MSR_TYPE_RW, flag);
	vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE,
							MSR_TYPE_RW, flag);
	vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK,
							MSR_TYPE_RW, flag);
	vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH,
							MSR_TYPE_RW, flag);
	for (i = 0; i < vmx->pt_desc.addr_range; i++) {
		vmx_set_intercept_for_msr(msr_bitmap,
			MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
		vmx_set_intercept_for_msr(msr_bitmap,
			MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
	}
}

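/*
 * For a vCPU in guest mode with virtual interrupt delivery enabled,
 * report whether the virtual APIC page has a pending interrupt whose
 * priority class (RVI[7:4]) exceeds the current VPPR[7:4].
 */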
static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	void *vapic_page;
	u32 vppr;
	int rvi;

	if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
		!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
		WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
		return false;

	rvi = vmx_get_rvi();

	vapic_page = vmx->nested.virtual_apic_map.hva;
	vppr = *((u32 *)(vapic_page + APIC_PROCPRI));

	return ((rvi & 0xf0) > (vppr & 0xf0));
}

static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
						     bool nested)
{
#ifdef CONFIG_SMP
	int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;

	if (vcpu->mode == IN_GUEST_MODE) {
		/*
		 * The vector of interrupt to be delivered to vcpu had
		 * been set in PIR before this function.
		 *
		 * Following cases will be reached in this block, and
		 * we always send a notification event in all cases as
		 * explained below.
		 *
		 * Case 1: vcpu keeps in non-root mode. Sending a
		 * notification event posts the interrupt to vcpu.
		 *
		 * Case 2: vcpu exits to root mode and is still
		 * runnable. PIR will be synced to vIRR before the
		 * next vcpu entry. Sending a notification event in
		 * this case has no effect, as vcpu is not in root
		 * mode.
		 *
		 * Case 3: vcpu exits to root mode and is blocked.
		 * vcpu_block() has already synced PIR to vIRR and
		 * never blocks vcpu if vIRR is not cleared. Therefore,
		 * a blocked vcpu here does not wait for any requested
		 * interrupts in PIR, and sending a notification event
		 * which has no effect is safe here.
		 */

		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
		return true;
	}
#endif
	return false;
}

static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
						int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (is_guest_mode(vcpu) &&
	    vector == vmx->nested.posted_intr_nv) {
		/*
		 * If a posted intr is not recognized by hardware,
		 * we will accomplish it in the next vmentry.
		 */
		vmx->nested.pi_pending = true;
		kvm_make_request(KVM_REQ_EVENT, vcpu);
		/* the PIR and ON have been set by L1. */
		if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
			kvm_vcpu_kick(vcpu);
		return 0;
	}
	return -1;
}
/*
 * Send interrupt to vcpu via posted interrupt way.
 * 1. If target vcpu is running(non-root mode), send posted interrupt
 * notification to vcpu and hardware will sync PIR to vIRR atomically.
 * 2. If target vcpu isn't running(root mode), kick it to pick up the
 * interrupt from PIR in next vmentry.
 */
static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int r;

	r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
	if (!r)
		return 0;

	if (!vcpu->arch.apicv_active)
		return -1;

	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
		return 0;

	/* If a previous notification has sent the IPI, nothing to do.  */
	if (pi_test_and_set_on(&vmx->pi_desc))
		return 0;

	if (vcpu != kvm_get_running_vcpu() &&
	    !kvm_vcpu_trigger_posted_interrupt(vcpu, false))
		kvm_vcpu_kick(vcpu);

	return 0;
}

/*
 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
 * will not change in the lifetime of the guest.
 * Note that host-state that does change is set elsewhere. E.g., host-state
 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
 */
void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
{
	u32 low32, high32;
	unsigned long tmpl;
	unsigned long cr0, cr3, cr4;

	cr0 = read_cr0();
	WARN_ON(cr0 & X86_CR0_TS);
	vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */

	/*
	 * Save the most likely value for this task's CR3 in the VMCS.
	 * We can't use __get_current_cr3_fast() because we're not atomic.
	 */
	cr3 = __read_cr3();
	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
	vmx->loaded_vmcs->host_state.cr3 = cr3;

	/* Save the most likely value for this task's CR4 in the VMCS. */
	cr4 = cr4_read_shadow();
	vmcs_writel(HOST_CR4, cr4);			/* 22.2.3, 22.2.5 */
	vmx->loaded_vmcs->host_state.cr4 = cr4;

	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
#ifdef CONFIG_X86_64
	/*
	 * Load null selectors, so we can avoid reloading them in
	 * vmx_prepare_switch_to_host(), in case userspace uses
	 * the null selectors too (the expected case).
	 */
	vmcs_write16(HOST_DS_SELECTOR, 0);
	vmcs_write16(HOST_ES_SELECTOR, 0);
#else
	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
#endif
	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */

	vmcs_writel(HOST_IDTR_BASE, host_idt_base);   /* 22.2.4 */

	vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */

	rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
	vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
	rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
	vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */

	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
		rdmsr(MSR_IA32_CR_PAT, low32, high32);
		vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
	}

	if (cpu_has_load_ia32_efer())
		vmcs_write64(HOST_IA32_EFER, host_efer);
}

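/*
 * Bits set in CR4_GUEST_HOST_MASK are host-owned: guest writes to them
 * are shadowed and trap to KVM.  Bits clear in the mask may be changed
 * by the guest directly.
 */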
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
{
	vmx->vcpu.arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS;
	if (!enable_ept)
		vmx->vcpu.arch.cr4_guest_owned_bits &= ~X86_CR4_PGE;
	if (is_guest_mode(&vmx->vcpu))
		vmx->vcpu.arch.cr4_guest_owned_bits &=
			~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
}

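/*
 * Pin-based controls from the capability MSRs, minus the features this
 * vCPU cannot use: posted interrupts without APICv, and virtual NMIs or
 * the VMX preemption timer when those are disabled.
 */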
u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
{
	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;

	if (!kvm_vcpu_apicv_active(&vmx->vcpu))
		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;

	if (!enable_vnmi)
		pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;

	if (!enable_preemption_timer)
		pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;

	return pin_based_exec_ctrl;
}

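/*
 * Bring the APICv-related execution controls back in sync after APICv
 * has been (de)activated for this vCPU: recompute the pin-based
 * controls, toggle APIC-register virtualization and virtual interrupt
 * delivery, and rebuild the MSR bitmap to match.
 */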
static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
	if (cpu_has_secondary_exec_ctrls()) {
		if (kvm_vcpu_apicv_active(vcpu))
			secondary_exec_controls_setbit(vmx,
				      SECONDARY_EXEC_APIC_REGISTER_VIRT |
				      SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
		else
			secondary_exec_controls_clearbit(vmx,
					SECONDARY_EXEC_APIC_REGISTER_VIRT |
					SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
	}

	if (cpu_has_vmx_msr_bitmap())
		vmx_update_msr_bitmap(vcpu);
}

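/*
 * Primary processor-based controls: start from the capability defaults,
 * add CR8/CR3/INVLPG exiting where TPR shadowing or EPT is unavailable,
 * and drop MOV-DR/MWAIT/MONITOR/HLT exiting where the guest is allowed
 * to execute those instructions directly.
 */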
u32 vmx_exec_control(struct vcpu_vmx *vmx)
{
	u32 exec_control = vmcs_config.cpu_based_exec_ctrl;

	if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
		exec_control &= ~CPU_BASED_MOV_DR_EXITING;

	if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
		exec_control &= ~CPU_BASED_TPR_SHADOW;
#ifdef CONFIG_X86_64
		exec_control |= CPU_BASED_CR8_STORE_EXITING |
				CPU_BASED_CR8_LOAD_EXITING;
#endif
	}
	if (!enable_ept)
		exec_control |= CPU_BASED_CR3_STORE_EXITING |
				CPU_BASED_CR3_LOAD_EXITING  |
				CPU_BASED_INVLPG_EXITING;
	if (kvm_mwait_in_guest(vmx->vcpu.kvm))
		exec_control &= ~(CPU_BASED_MWAIT_EXITING |
				CPU_BASED_MONITOR_EXITING);
	if (kvm_hlt_in_guest(vmx->vcpu.kvm))
		exec_control &= ~CPU_BASED_HLT_EXITING;
	return exec_control;
}

/*
 * Adjust a single secondary execution control bit to intercept/allow an
 * instruction in the guest.  This is usually done based on whether or not a
 * feature has been exposed to the guest in order to correctly emulate faults.
 */
static inline void
vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
				  u32 control, bool enabled, bool exiting)
{
	/*
	 * If the control is for an opt-in feature, clear the control if the
	 * feature is not exposed to the guest, i.e. not enabled.  If the
	 * control is opt-out, i.e. an exiting control, clear the control if
	 * the feature _is_ exposed to the guest, i.e. exiting/interception is
	 * disabled for the associated instruction.  Note, the caller is
	 * responsible presetting exec_control to set all supported bits.
	 */
	if (enabled == exiting)
		*exec_control &= ~control;

	/*
	 * Update the nested MSR settings so that a nested VMM can/can't set
	 * controls for features that are/aren't exposed to the guest.
	 */
	if (nested) {
		if (enabled)
			vmx->nested.msrs.secondary_ctls_high |= control;
		else
			vmx->nested.msrs.secondary_ctls_high &= ~control;
	}
}

/*
 * Wrapper macro for the common case of adjusting a secondary execution control
 * based on a single guest CPUID bit, with a dedicated feature bit.  This also
 * verifies that the control is actually supported by KVM and hardware.
 */
#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
({									 \
	bool __enabled;							 \
									 \
	if (cpu_has_vmx_##name()) {					 \
		__enabled = guest_cpuid_has(&(vmx)->vcpu,		 \
					    X86_FEATURE_##feat_name);	 \
		vmx_adjust_secondary_exec_control(vmx, exec_control,	 \
			SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
	}								 \
})

/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
	vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)

#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
	vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
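/*
 * Informal example of how the wrappers above are used (see
 * vmx_compute_secondary_exec_control() below):
 *
 *	vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);
 *
 * roughly expands to
 *
 *	if (cpu_has_vmx_rdtscp()) {
 *		bool __enabled = guest_cpuid_has(&vmx->vcpu,
 *						 X86_FEATURE_RDTSCP);
 *		vmx_adjust_secondary_exec_control(vmx, &exec_control,
 *				SECONDARY_EXEC_ENABLE_RDTSCP, __enabled,
 *				false);
 *	}
 */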

static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
{
	struct kvm_vcpu *vcpu = &vmx->vcpu;

	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;

	if (vmx_pt_mode_is_system())
		exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
	if (!cpu_need_virtualize_apic_accesses(vcpu))
		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
	if (vmx->vpid == 0)
		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
	if (!enable_ept) {
		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
		enable_unrestricted_guest = 0;
	}
	if (!enable_unrestricted_guest)
		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
	if (kvm_pause_in_guest(vmx->vcpu.kvm))
		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
	if (!kvm_vcpu_apicv_active(vcpu))
		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;

	/* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
	 * in vmx_set_cr4.  */
	exec_control &= ~SECONDARY_EXEC_DESC;

	/* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
	   (handle_vmptrld).
	   We can NOT enable shadow_vmcs here because we don't have yet
	   a current VMCS12
	*/
	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;

	if (!enable_pml)
		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

	if (cpu_has_vmx_xsaves()) {
		/* Exposing XSAVES only when XSAVE is exposed */
		bool xsaves_enabled =
			boot_cpu_has(X86_FEATURE_XSAVE) &&
			guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
			guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);

		vcpu->arch.xsaves_enabled = xsaves_enabled;

		vmx_adjust_secondary_exec_control(vmx, &exec_control,
						  SECONDARY_EXEC_XSAVES,
						  xsaves_enabled, false);
	}

	vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);

	/*
	 * Expose INVPCID if and only if PCID is also exposed to the guest.
	 * INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF
	 * if CR4.PCIDE=0.  Enumerating CPUID.INVPCID=1 would lead to incorrect
	 * behavior from the guest perspective (it would expect #GP or #PF).
	 */
	if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
		guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
	vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);

	vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
	vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);

	vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
				    ENABLE_USR_WAIT_PAUSE, false);

	vmx->secondary_exec_control = exec_control;
}

static void ept_set_mmio_spte_mask(void)
{
	/*
	 * EPT Misconfigurations can be generated if the value of bits 2:0
	 * of an EPT paging-structure entry is 110b (write/execute).
	 */
	kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0);
}

4141
#define VMX_XSS_EXIT_BITMAP 0
A
Avi Kivity 已提交
4142

4143
/*
4144 4145
 * Noting that the initialization of Guest-state Area of VMCS is in
 * vmx_vcpu_reset().
4146
 */
4147
static void init_vmcs(struct vcpu_vmx *vmx)
4148 4149
{
	if (nested)
4150
		nested_vmx_set_vmcs_shadowing_bitmap();
4151

S
Sheng Yang 已提交
4152
	if (cpu_has_vmx_msr_bitmap())
4153
		vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
S
Sheng Yang 已提交
4154

A
Avi Kivity 已提交
4155 4156 4157
	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */

	/* Control */
4158
	pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4159

4160
	exec_controls_set(vmx, vmx_exec_control(vmx));
A
Avi Kivity 已提交
4161

4162
	if (cpu_has_secondary_exec_ctrls()) {
4163
		vmx_compute_secondary_exec_control(vmx);
4164
		secondary_exec_controls_set(vmx, vmx->secondary_exec_control);
4165
	}
4166

4167
	if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
4168 4169 4170 4171 4172 4173
		vmcs_write64(EOI_EXIT_BITMAP0, 0);
		vmcs_write64(EOI_EXIT_BITMAP1, 0);
		vmcs_write64(EOI_EXIT_BITMAP2, 0);
		vmcs_write64(EOI_EXIT_BITMAP3, 0);

		vmcs_write16(GUEST_INTR_STATUS, 0);
4174

4175
		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
4176
		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
4177 4178
	}

4179
	if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
4180
		vmcs_write32(PLE_GAP, ple_gap);
4181 4182
		vmx->ple_window = ple_window;
		vmx->ple_window_dirty = true;
4183 4184
	}

4185 4186
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
A
Avi Kivity 已提交
4187 4188
	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */

4189 4190
	vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
	vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
4191
	vmx_set_constant_host_state(vmx);
A
Avi Kivity 已提交
4192 4193 4194
	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */

B
Bandan Das 已提交
4195 4196 4197
	if (cpu_has_vmx_vmfunc())
		vmcs_write64(VM_FUNCTION_CONTROL, 0);

4198 4199
	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
4200
	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
4201
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
4202
	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
A
Avi Kivity 已提交
4203

4204 4205
	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
S
Sheng Yang 已提交
4206

4207
	vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
A
Avi Kivity 已提交
4208 4209

	/* 22.2.1, 20.8.1 */
4210
	vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
4211

4212 4213
	vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
4214

4215
	set_cr4_guest_host_mask(vmx);
4216

4217 4218 4219
	if (vmx->vpid != 0)
		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);

4220
	if (cpu_has_vmx_xsaves())
4221 4222
		vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);

4223 4224 4225 4226
	if (enable_pml) {
		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
	}
4227 4228 4229

	if (cpu_has_vmx_encls_vmexit())
		vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
4230

4231
	if (vmx_pt_mode_is_host_guest()) {
4232 4233 4234 4235 4236
		memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
		/* Bit[6~0] are forced to 1, writes are ignored. */
		vmx->pt_desc.guest.output_mask = 0x7F;
		vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
	}
4237 4238 4239 4240 4241 4242 4243 4244 4245 4246

	/*
	 * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
	 * between guest and host.  In that case we only care about present
	 * faults.
	 */
	if (enable_ept) {
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, PFERR_PRESENT_MASK);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, PFERR_PRESENT_MASK);
	}
4247 4248
}

4249
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
4250 4251
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
4252
	struct msr_data apic_base_msr;
4253
	u64 cr0;
4254

4255
	vmx->rmode.vm86_active = 0;
4256
	vmx->spec_ctrl = 0;
4257

4258 4259
	vmx->msr_ia32_umwait_control = 0;

4260
	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
4261
	vmx->hv_deadline_tsc = -1;
4262 4263 4264 4265 4266 4267 4268 4269 4270 4271
	kvm_set_cr8(vcpu, 0);

	if (!init_event) {
		apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
				     MSR_IA32_APICBASE_ENABLE;
		if (kvm_vcpu_is_reset_bsp(vcpu))
			apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
		apic_base_msr.host_initiated = true;
		kvm_set_apic_base(vcpu, &apic_base_msr);
	}
4272

A
Avi Kivity 已提交
4273 4274
	vmx_segment_cache_clear(vmx);

4275
	seg_setup(VCPU_SREG_CS);
4276
	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
4277
	vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294

	seg_setup(VCPU_SREG_DS);
	seg_setup(VCPU_SREG_ES);
	seg_setup(VCPU_SREG_FS);
	seg_setup(VCPU_SREG_GS);
	seg_setup(VCPU_SREG_SS);

	vmcs_write16(GUEST_TR_SELECTOR, 0);
	vmcs_writel(GUEST_TR_BASE, 0);
	vmcs_write32(GUEST_TR_LIMIT, 0xffff);
	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

	vmcs_write16(GUEST_LDTR_SELECTOR, 0);
	vmcs_writel(GUEST_LDTR_BASE, 0);
	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);

4295 4296 4297 4298 4299 4300
	if (!init_event) {
		vmcs_write32(GUEST_SYSENTER_CS, 0);
		vmcs_writel(GUEST_SYSENTER_ESP, 0);
		vmcs_writel(GUEST_SYSENTER_EIP, 0);
		vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
	}
4301

4302
	kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
4303
	kvm_rip_write(vcpu, 0xfff0);
4304 4305 4306 4307 4308 4309 4310

	vmcs_writel(GUEST_GDTR_BASE, 0);
	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);

	vmcs_writel(GUEST_IDTR_BASE, 0);
	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);

4311
	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
4312
	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
4313
	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
4314 4315
	if (kvm_mpx_supported())
		vmcs_write64(GUEST_BNDCFGS, 0);
4316 4317 4318

	setup_msrs(vmx);

A
Avi Kivity 已提交
4319 4320
	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */

4321
	if (cpu_has_vmx_tpr_shadow() && !init_event) {
4322
		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
4323
		if (cpu_need_tpr_shadow(vcpu))
4324
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
4325
				     __pa(vcpu->arch.apic->regs));
4326 4327 4328
		vmcs_write32(TPR_THRESHOLD, 0);
	}

4329
	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
A
Avi Kivity 已提交
4330

4331 4332
	cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
	vmx->vcpu.arch.cr0 = cr0;
4333
	vmx_set_cr0(vcpu, cr0); /* enter rmode */
4334
	vmx_set_cr4(vcpu, 0);
P
Paolo Bonzini 已提交
4335
	vmx_set_efer(vcpu, 0);
4336

4337
	update_exception_bitmap(vcpu);
A
Avi Kivity 已提交
4338

4339
	vpid_sync_context(vmx->vpid);
4340 4341
	if (init_event)
		vmx_clear_hlt(vcpu);
A
Avi Kivity 已提交
4342 4343
}

4344
static void enable_irq_window(struct kvm_vcpu *vcpu)
4345
{
4346
	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
4347 4348
}

4349
static void enable_nmi_window(struct kvm_vcpu *vcpu)
4350
{
4351
	if (!enable_vnmi ||
4352
	    vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
4353 4354 4355
		enable_irq_window(vcpu);
		return;
	}
4356

4357
	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
4358 4359
}

4360
static void vmx_inject_irq(struct kvm_vcpu *vcpu)
4361
{
4362
	struct vcpu_vmx *vmx = to_vmx(vcpu);
4363 4364
	uint32_t intr;
	int irq = vcpu->arch.interrupt.nr;
4365

4366
	trace_kvm_inj_virq(irq);
F
Feng (Eric) Liu 已提交
4367

4368
	++vcpu->stat.irq_injections;
4369
	if (vmx->rmode.vm86_active) {
4370 4371 4372
		int inc_eip = 0;
		if (vcpu->arch.interrupt.soft)
			inc_eip = vcpu->arch.event_exit_inst_len;
4373
		kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
4374 4375
		return;
	}
4376 4377 4378 4379 4380 4381 4382 4383
	intr = irq | INTR_INFO_VALID_MASK;
	if (vcpu->arch.interrupt.soft) {
		intr |= INTR_TYPE_SOFT_INTR;
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmx->vcpu.arch.event_exit_inst_len);
	} else
		intr |= INTR_TYPE_EXT_INTR;
	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
4384 4385

	vmx_clear_hlt(vcpu);
4386 4387
}

4388 4389
static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
J
Jan Kiszka 已提交
4390 4391
	struct vcpu_vmx *vmx = to_vmx(vcpu);

4392
	if (!enable_vnmi) {
4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404
		/*
		 * Tracking the NMI-blocked state in software is built upon
		 * finding the next open IRQ window. This, in turn, depends on
		 * well-behaving guests: They have to keep IRQs disabled at
		 * least as long as the NMI handler runs. Otherwise we may
		 * cause NMI nesting, maybe breaking the guest. But as this is
		 * highly unlikely, we can live with the residual risk.
		 */
		vmx->loaded_vmcs->soft_vnmi_blocked = 1;
		vmx->loaded_vmcs->vnmi_blocked_time = 0;
	}

4405 4406
	++vcpu->stat.nmi_injections;
	vmx->loaded_vmcs->nmi_known_unmasked = false;
4407

4408
	if (vmx->rmode.vm86_active) {
4409
		kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
J
Jan Kiszka 已提交
4410 4411
		return;
	}
4412

4413 4414
	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
4415 4416

	vmx_clear_hlt(vcpu);
4417 4418
}

4419
bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
J
Jan Kiszka 已提交
4420
{
4421 4422 4423
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool masked;

4424
	if (!enable_vnmi)
4425
		return vmx->loaded_vmcs->soft_vnmi_blocked;
4426
	if (vmx->loaded_vmcs->nmi_known_unmasked)
4427
		return false;
4428 4429 4430
	masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
	vmx->loaded_vmcs->nmi_known_unmasked = !masked;
	return masked;
J
Jan Kiszka 已提交
4431 4432
}

4433
void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
J
Jan Kiszka 已提交
4434 4435 4436
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

4437
	if (!enable_vnmi) {
4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450
		if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
			vmx->loaded_vmcs->soft_vnmi_blocked = masked;
			vmx->loaded_vmcs->vnmi_blocked_time = 0;
		}
	} else {
		vmx->loaded_vmcs->nmi_known_unmasked = !masked;
		if (masked)
			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
				      GUEST_INTR_STATE_NMI);
		else
			vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
					GUEST_INTR_STATE_NMI);
	}
J
Jan Kiszka 已提交
4451 4452
}

4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
		return false;

	if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
		return true;

	return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
		(GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
		 GUEST_INTR_STATE_NMI));
}

4466
static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4467
{
4468
	if (to_vmx(vcpu)->nested.nested_run_pending)
4469
		return -EBUSY;
4470

4471 4472
	/* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
4473
		return -EBUSY;
4474

4475 4476
	return !vmx_nmi_blocked(vcpu);
}
4477

4478 4479 4480
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
4481
		return false;
4482

4483
	return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
4484 4485
	       (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
		(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
4486 4487
}

4488
static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4489
{
4490
	if (to_vmx(vcpu)->nested.nested_run_pending)
4491
		return -EBUSY;
4492

4493 4494 4495 4496 4497
       /*
        * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
        * e.g. if the IRQ arrived asynchronously after checking nested events.
        */
	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
4498
		return -EBUSY;
4499

4500
	return !vmx_interrupt_blocked(vcpu);
4501 4502
}

4503 4504 4505 4506
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
	int ret;

4507 4508 4509
	if (enable_unrestricted_guest)
		return 0;

4510 4511 4512 4513 4514
	mutex_lock(&kvm->slots_lock);
	ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
				      PAGE_SIZE * 3);
	mutex_unlock(&kvm->slots_lock);

4515 4516
	if (ret)
		return ret;
4517
	to_kvm_vmx(kvm)->tss_addr = addr;
4518
	return init_rmode_tss(kvm);
4519 4520
}

4521 4522
static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
{
4523
	to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
4524 4525 4526
	return 0;
}

4527
static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
A
Avi Kivity 已提交
4528
{
4529 4530
	switch (vec) {
	case BP_VECTOR:
4531 4532 4533 4534 4535 4536
		/*
		 * Update instruction length as we may reinject the exception
		 * from user space while in guest debugging mode.
		 */
		to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
J
Jan Kiszka 已提交
4537
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
4538
			return false;
4539
		fallthrough;
4540
	case DB_VECTOR:
4541 4542
		return !(vcpu->guest_debug &
			(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
J
Jan Kiszka 已提交
4543
	case DE_VECTOR:
4544 4545 4546 4547 4548 4549 4550
	case OF_VECTOR:
	case BR_VECTOR:
	case UD_VECTOR:
	case DF_VECTOR:
	case SS_VECTOR:
	case GP_VECTOR:
	case MF_VECTOR:
4551
		return true;
4552
	}
4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563
	return false;
}

static int handle_rmode_exception(struct kvm_vcpu *vcpu,
				  int vec, u32 err_code)
{
	/*
	 * Instruction with address size override prefix opcode 0x67
	 * Cause the #SS fault with 0 error code in VM86 mode.
	 */
	if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
4564
		if (kvm_emulate_instruction(vcpu, 0)) {
4565 4566
			if (vcpu->arch.halt_request) {
				vcpu->arch.halt_request = 0;
4567
				return kvm_vcpu_halt(vcpu);
4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580
			}
			return 1;
		}
		return 0;
	}

	/*
	 * Forward all other exceptions that are valid in real mode.
	 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
	 *        the required debugging infrastructure rework.
	 */
	kvm_queue_exception(vcpu, vec);
	return 1;
A
Avi Kivity 已提交
4581 4582
}

A
Andi Kleen 已提交
4583 4584 4585 4586 4587 4588 4589 4590 4591
/*
 * Trigger machine check on the host. We assume all the MSRs are already set up
 * by the CPU and that we still run on the same CPU as the MCE occurred on.
 * We pass a fake environment to the machine check handler because we want
 * the guest to be always treated like user space, no matter what context
 * it used internally.
 */
static void kvm_machine_check(void)
{
4592
#if defined(CONFIG_X86_MCE)
A
Andi Kleen 已提交
4593 4594 4595 4596 4597
	struct pt_regs regs = {
		.cs = 3, /* Fake ring 3 no matter what the guest ran on */
		.flags = X86_EFLAGS_IF,
	};

4598
	do_machine_check(&regs);
A
Andi Kleen 已提交
4599 4600 4601
#endif
}

A
Avi Kivity 已提交
4602
static int handle_machine_check(struct kvm_vcpu *vcpu)
A
Andi Kleen 已提交
4603
{
4604
	/* handled by vmx_vcpu_run() */
A
Andi Kleen 已提交
4605 4606 4607
	return 1;
}

4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627
/*
 * If the host has split lock detection disabled, then #AC is
 * unconditionally injected into the guest, which is the pre split lock
 * detection behaviour.
 *
 * If the host has split lock detection enabled then #AC is
 * only injected into the guest when:
 *  - Guest CPL == 3 (user mode)
 *  - Guest has #AC detection enabled in CR0
 *  - Guest EFLAGS has AC bit set
 */
static inline bool guest_inject_ac(struct kvm_vcpu *vcpu)
{
	if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
		return true;

	return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
	       (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
}

4628
static int handle_exception_nmi(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
4629
{
4630
	struct vcpu_vmx *vmx = to_vmx(vcpu);
A
Avi Kivity 已提交
4631
	struct kvm_run *kvm_run = vcpu->run;
J
Jan Kiszka 已提交
4632
	u32 intr_info, ex_no, error_code;
4633
	unsigned long cr2, rip, dr6;
A
Avi Kivity 已提交
4634 4635
	u32 vect_info;

4636
	vect_info = vmx->idt_vectoring_info;
4637
	intr_info = vmx_get_intr_info(vcpu);
A
Avi Kivity 已提交
4638

4639
	if (is_machine_check(intr_info) || is_nmi(intr_info))
4640
		return 1; /* handled by handle_exception_nmi_irqoff() */
4641

W
Wanpeng Li 已提交
4642 4643
	if (is_invalid_opcode(intr_info))
		return handle_ud(vcpu);
4644

A
Avi Kivity 已提交
4645
	error_code = 0;
4646
	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
A
Avi Kivity 已提交
4647
		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
4648

4649 4650
	if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
		WARN_ON_ONCE(!enable_vmware_backdoor);
4651 4652 4653 4654 4655 4656 4657 4658 4659 4660

		/*
		 * VMware backdoor emulation on #GP interception only handles
		 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
		 * error code on #GP.
		 */
		if (error_code) {
			kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
			return 1;
		}
4661
		return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
4662 4663
	}

4664 4665 4666 4667 4668 4669 4670 4671 4672
	/*
	 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
	 * MMIO, it is better to report an internal error.
	 * See the comments in vmx_handle_exit.
	 */
	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
	    !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
4673
		vcpu->run->internal.ndata = 4;
4674 4675
		vcpu->run->internal.data[0] = vect_info;
		vcpu->run->internal.data[1] = intr_info;
4676
		vcpu->run->internal.data[2] = error_code;
4677
		vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
4678 4679 4680
		return 0;
	}

A
Avi Kivity 已提交
4681
	if (is_page_fault(intr_info)) {
4682
		cr2 = vmx_get_exit_qual(vcpu);
4683 4684 4685 4686 4687 4688 4689 4690 4691
		if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
			/*
			 * EPT will cause page fault only if we need to
			 * detect illegal GPAs.
			 */
			kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
			return 1;
		} else
			return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
A
Avi Kivity 已提交
4692 4693
	}

J
Jan Kiszka 已提交
4694
	ex_no = intr_info & INTR_INFO_VECTOR_MASK;
4695 4696 4697 4698

	if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
		return handle_rmode_exception(vcpu, ex_no, error_code);

4699 4700
	switch (ex_no) {
	case DB_VECTOR:
4701
		dr6 = vmx_get_exit_qual(vcpu);
4702 4703
		if (!(vcpu->guest_debug &
		      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
4704
			if (is_icebp(intr_info))
4705
				WARN_ON(!skip_emulated_instruction(vcpu));
4706

4707
			kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
4708 4709
			return 1;
		}
4710
		kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
4711
		kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
4712
		fallthrough;
4713
	case BP_VECTOR:
4714 4715 4716 4717 4718 4719 4720
		/*
		 * Update instruction length as we may reinject #BP from
		 * user space while in guest debugging mode. Reading it for
		 * #DB as well causes no harm, it is not used in that case.
		 */
		vmx->vcpu.arch.event_exit_inst_len =
			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
A
Avi Kivity 已提交
4721
		kvm_run->exit_reason = KVM_EXIT_DEBUG;
4722
		rip = kvm_rip_read(vcpu);
J
Jan Kiszka 已提交
4723 4724
		kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
		kvm_run->debug.arch.exception = ex_no;
4725
		break;
4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739
	case AC_VECTOR:
		if (guest_inject_ac(vcpu)) {
			kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
			return 1;
		}

		/*
		 * Handle split lock. Depending on detection mode this will
		 * either warn and disable split lock detection for this
		 * task or force SIGBUS on it.
		 */
		if (handle_guest_split_lock(kvm_rip_read(vcpu)))
			return 1;
		fallthrough;
4740
	default:
J
Jan Kiszka 已提交
4741 4742 4743
		kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
		kvm_run->ex.exception = ex_no;
		kvm_run->ex.error_code = error_code;
4744
		break;
A
Avi Kivity 已提交
4745 4746 4747 4748
	}
	return 0;
}

4749
static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
4750
{
A
Avi Kivity 已提交
4751
	++vcpu->stat.irq_exits;
A
Avi Kivity 已提交
4752 4753 4754
	return 1;
}

A
Avi Kivity 已提交
4755
static int handle_triple_fault(struct kvm_vcpu *vcpu)
4756
{
A
Avi Kivity 已提交
4757
	vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
4758
	vcpu->mmio_needed = 0;
4759 4760
	return 0;
}
A
Avi Kivity 已提交
4761

A
Avi Kivity 已提交
4762
static int handle_io(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
4763
{
4764
	unsigned long exit_qualification;
4765
	int size, in, string;
4766
	unsigned port;
A
Avi Kivity 已提交
4767

4768
	exit_qualification = vmx_get_exit_qual(vcpu);
4769
	string = (exit_qualification & 16) != 0;
4770

4771
	++vcpu->stat.io_exits;
4772

4773
	if (string)
4774
		return kvm_emulate_instruction(vcpu, 0);
4775

4776 4777
	port = exit_qualification >> 16;
	size = (exit_qualification & 7) + 1;
4778
	in = (exit_qualification & 8) != 0;
4779

4780
	return kvm_fast_pio(vcpu, size, port, in);
A
Avi Kivity 已提交
4781 4782
}

I
Ingo Molnar 已提交
4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793
static void
vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	/*
	 * Patch in the VMCALL instruction:
	 */
	hypercall[0] = 0x0f;
	hypercall[1] = 0x01;
	hypercall[2] = 0xc1;
}

G
Guo Chao 已提交
4794
/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
4795 4796 4797
static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
{
	if (is_guest_mode(vcpu)) {
4798 4799 4800
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
		unsigned long orig_val = val;

4801 4802 4803
		/*
		 * We get here when L2 changed cr0 in a way that did not change
		 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
4804 4805 4806 4807
		 * but did change L0 shadowed bits. So we first calculate the
		 * effective cr0 value that L1 would like to write into the
		 * hardware. It consists of the L2-owned bits from the new
		 * value combined with the L1-owned bits from L1's guest_cr0.
4808
		 */
4809 4810 4811
		val = (val & ~vmcs12->cr0_guest_host_mask) |
			(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);

4812
		if (!nested_guest_cr0_valid(vcpu, val))
4813
			return 1;
4814 4815 4816 4817

		if (kvm_set_cr0(vcpu, val))
			return 1;
		vmcs_writel(CR0_READ_SHADOW, orig_val);
4818
		return 0;
4819 4820
	} else {
		if (to_vmx(vcpu)->nested.vmxon &&
4821
		    !nested_host_cr0_valid(vcpu, val))
4822
			return 1;
4823

4824
		return kvm_set_cr0(vcpu, val);
4825
	}
4826 4827 4828 4829 4830
}

static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
{
	if (is_guest_mode(vcpu)) {
4831 4832 4833 4834 4835 4836 4837
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
		unsigned long orig_val = val;

		/* analogously to handle_set_cr0 */
		val = (val & ~vmcs12->cr4_guest_host_mask) |
			(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
		if (kvm_set_cr4(vcpu, val))
4838
			return 1;
4839
		vmcs_writel(CR4_READ_SHADOW, orig_val);
4840 4841 4842 4843 4844
		return 0;
	} else
		return kvm_set_cr4(vcpu, val);
}

4845 4846 4847
static int handle_desc(struct kvm_vcpu *vcpu)
{
	WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
4848
	return kvm_emulate_instruction(vcpu, 0);
4849 4850
}

A
Avi Kivity 已提交
4851
static int handle_cr(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
4852
{
4853
	unsigned long exit_qualification, val;
A
Avi Kivity 已提交
4854 4855
	int cr;
	int reg;
4856
	int err;
4857
	int ret;
A
Avi Kivity 已提交
4858

4859
	exit_qualification = vmx_get_exit_qual(vcpu);
A
Avi Kivity 已提交
4860 4861 4862 4863
	cr = exit_qualification & 15;
	reg = (exit_qualification >> 8) & 15;
	switch ((exit_qualification >> 4) & 3) {
	case 0: /* mov to cr */
4864
		val = kvm_register_readl(vcpu, reg);
4865
		trace_kvm_cr_write(cr, val);
A
Avi Kivity 已提交
4866 4867
		switch (cr) {
		case 0:
4868
			err = handle_set_cr0(vcpu, val);
4869
			return kvm_complete_insn_gp(vcpu, err);
A
Avi Kivity 已提交
4870
		case 3:
4871
			WARN_ON_ONCE(enable_unrestricted_guest);
4872
			err = kvm_set_cr3(vcpu, val);
4873
			return kvm_complete_insn_gp(vcpu, err);
A
Avi Kivity 已提交
4874
		case 4:
4875
			err = handle_set_cr4(vcpu, val);
4876
			return kvm_complete_insn_gp(vcpu, err);
4877 4878
		case 8: {
				u8 cr8_prev = kvm_get_cr8(vcpu);
4879
				u8 cr8 = (u8)val;
A
Andre Przywara 已提交
4880
				err = kvm_set_cr8(vcpu, cr8);
4881
				ret = kvm_complete_insn_gp(vcpu, err);
4882
				if (lapic_in_kernel(vcpu))
4883
					return ret;
4884
				if (cr8_prev <= cr8)
4885 4886 4887 4888 4889 4890
					return ret;
				/*
				 * TODO: we might be squashing a
				 * KVM_GUESTDBG_SINGLESTEP-triggered
				 * KVM_EXIT_DEBUG here.
				 */
A
Avi Kivity 已提交
4891
				vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
4892 4893
				return 0;
			}
4894
		}
A
Avi Kivity 已提交
4895
		break;
4896
	case 2: /* clts */
4897 4898
		WARN_ONCE(1, "Guest should always own CR0.TS");
		vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
4899
		trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
4900
		return kvm_skip_emulated_instruction(vcpu);
A
Avi Kivity 已提交
4901 4902 4903
	case 1: /*mov from cr*/
		switch (cr) {
		case 3:
4904
			WARN_ON_ONCE(enable_unrestricted_guest);
4905 4906 4907
			val = kvm_read_cr3(vcpu);
			kvm_register_write(vcpu, reg, val);
			trace_kvm_cr_read(cr, val);
4908
			return kvm_skip_emulated_instruction(vcpu);
A
Avi Kivity 已提交
4909
		case 8:
4910 4911 4912
			val = kvm_get_cr8(vcpu);
			kvm_register_write(vcpu, reg, val);
			trace_kvm_cr_read(cr, val);
4913
			return kvm_skip_emulated_instruction(vcpu);
A
Avi Kivity 已提交
4914 4915 4916
		}
		break;
	case 3: /* lmsw */
4917
		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
4918
		trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
4919
		kvm_lmsw(vcpu, val);
A
Avi Kivity 已提交
4920

4921
		return kvm_skip_emulated_instruction(vcpu);
A
Avi Kivity 已提交
4922 4923 4924
	default:
		break;
	}
A
Avi Kivity 已提交
4925
	vcpu->run->exit_reason = 0;
4926
	vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
A
Avi Kivity 已提交
4927 4928 4929 4930
	       (int)(exit_qualification >> 4) & 3, cr);
	return 0;
}

A
Avi Kivity 已提交
4931
static int handle_dr(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
4932
{
4933
	unsigned long exit_qualification;
4934 4935
	int dr, dr7, reg;

4936
	exit_qualification = vmx_get_exit_qual(vcpu);
4937 4938 4939 4940 4941
	dr = exit_qualification & DEBUG_REG_ACCESS_NUM;

	/* First, if DR does not exist, trigger UD */
	if (!kvm_require_dr(vcpu, dr))
		return 1;
A
Avi Kivity 已提交
4942

4943
	/* Do not handle if the CPL > 0, will trigger GP on re-entry */
4944 4945
	if (!kvm_require_cpl(vcpu, 0))
		return 1;
4946 4947
	dr7 = vmcs_readl(GUEST_DR7);
	if (dr7 & DR7_GD) {
4948 4949 4950 4951 4952 4953
		/*
		 * As the vm-exit takes precedence over the debug trap, we
		 * need to emulate the latter, either for the host or the
		 * guest debugging itself.
		 */
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
4954
			vcpu->run->debug.arch.dr6 = DR6_BD | DR6_RTM | DR6_FIXED_1;
4955
			vcpu->run->debug.arch.dr7 = dr7;
4956
			vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
A
Avi Kivity 已提交
4957 4958
			vcpu->run->debug.arch.exception = DB_VECTOR;
			vcpu->run->exit_reason = KVM_EXIT_DEBUG;
4959 4960
			return 0;
		} else {
4961
			kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
4962 4963 4964 4965
			return 1;
		}
	}

4966
	if (vcpu->guest_debug == 0) {
4967
		exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
4968 4969 4970 4971 4972 4973 4974 4975 4976 4977

		/*
		 * No more DR vmexits; force a reload of the debug registers
		 * and reenter on this instruction.  The next vmexit will
		 * retrieve the full state of the debug registers.
		 */
		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
		return 1;
	}

4978 4979
	reg = DEBUG_REG_ACCESS_REG(exit_qualification);
	if (exit_qualification & TYPE_MOV_FROM_DR) {
4980
		unsigned long val;
4981 4982 4983 4984

		if (kvm_get_dr(vcpu, dr, &val))
			return 1;
		kvm_register_write(vcpu, reg, val);
4985
	} else
4986
		if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
4987 4988
			return 1;

4989
	return kvm_skip_emulated_instruction(vcpu);
A
Avi Kivity 已提交
4990 4991
}

4992 4993 4994 4995 4996 4997 4998 4999 5000 5001
static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
	get_debugreg(vcpu->arch.db[0], 0);
	get_debugreg(vcpu->arch.db[1], 1);
	get_debugreg(vcpu->arch.db[2], 2);
	get_debugreg(vcpu->arch.db[3], 3);
	get_debugreg(vcpu->arch.dr6, 6);
	vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);

	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
5002
	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5003 5004
}

5005 5006 5007 5008 5009
static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
	vmcs_writel(GUEST_DR7, val);
}

A
Avi Kivity 已提交
5010
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
5011
{
5012
	kvm_apic_update_ppr(vcpu);
5013 5014 5015
	return 1;
}

A
Avi Kivity 已提交
5016
static int handle_interrupt_window(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
5017
{
5018
	exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
F
Feng (Eric) Liu 已提交
5019

5020 5021
	kvm_make_request(KVM_REQ_EVENT, vcpu);

5022
	++vcpu->stat.irq_window_exits;
A
Avi Kivity 已提交
5023 5024 5025
	return 1;
}

A
Avi Kivity 已提交
5026
static int handle_vmcall(struct kvm_vcpu *vcpu)
5027
{
5028
	return kvm_emulate_hypercall(vcpu);
5029 5030
}

5031 5032
static int handle_invd(struct kvm_vcpu *vcpu)
{
5033 5034
	/* Treat an INVD instruction as a NOP and just skip it. */
	return kvm_skip_emulated_instruction(vcpu);
5035 5036
}

A
Avi Kivity 已提交
5037
static int handle_invlpg(struct kvm_vcpu *vcpu)
M
Marcelo Tosatti 已提交
5038
{
5039
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
M
Marcelo Tosatti 已提交
5040 5041

	kvm_mmu_invlpg(vcpu, exit_qualification);
5042
	return kvm_skip_emulated_instruction(vcpu);
M
Marcelo Tosatti 已提交
5043 5044
}

A
Avi Kivity 已提交
5045 5046 5047 5048 5049
static int handle_rdpmc(struct kvm_vcpu *vcpu)
{
	int err;

	err = kvm_rdpmc(vcpu);
5050
	return kvm_complete_insn_gp(vcpu, err);
A
Avi Kivity 已提交
5051 5052
}

A
Avi Kivity 已提交
5053
static int handle_wbinvd(struct kvm_vcpu *vcpu)
E
Eddie Dong 已提交
5054
{
5055
	return kvm_emulate_wbinvd(vcpu);
E
Eddie Dong 已提交
5056 5057
}

5058 5059 5060
static int handle_xsetbv(struct kvm_vcpu *vcpu)
{
	u64 new_bv = kvm_read_edx_eax(vcpu);
5061
	u32 index = kvm_rcx_read(vcpu);
5062 5063

	if (kvm_set_xcr(vcpu, index, new_bv) == 0)
5064
		return kvm_skip_emulated_instruction(vcpu);
5065 5066 5067
	return 1;
}

static int handle_apic_access(struct kvm_vcpu *vcpu)
{
	if (likely(fasteoi)) {
		unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
		int access_type, offset;

		access_type = exit_qualification & APIC_ACCESS_TYPE;
		offset = exit_qualification & APIC_ACCESS_OFFSET;
		/*
		 * A sane guest uses MOV to write EOI, and the written value
		 * is ignored.  Short-circuit that case here to avoid the
		 * heavy instruction emulation path.
		 */
		if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
		    (offset == APIC_EOI)) {
			kvm_lapic_set_eoi(vcpu);
			return kvm_skip_emulated_instruction(vcpu);
		}
	}
	return kvm_emulate_instruction(vcpu, 0);
}

static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
	int vector = exit_qualification & 0xff;

	/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
	kvm_apic_set_eoi_accelerated(vcpu, vector);
	return 1;
}

static int handle_apic_write(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
	u32 offset = exit_qualification & 0xfff;

	/* APIC-write VM exit is trap-like and thus no need to adjust IP */
	kvm_apic_write_nodecode(vcpu, offset);
	return 1;
}

static int handle_task_switch(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long exit_qualification;
	bool has_error_code = false;
	u32 error_code = 0;
	u16 tss_selector;
	int reason, type, idt_v, idt_index;

	idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
	idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
	type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);

	exit_qualification = vmx_get_exit_qual(vcpu);

	reason = (u32)exit_qualification >> 30;
	if (reason == TASK_SWITCH_GATE && idt_v) {
		switch (type) {
		case INTR_TYPE_NMI_INTR:
			vcpu->arch.nmi_injected = false;
			vmx_set_nmi_mask(vcpu, true);
			break;
		case INTR_TYPE_EXT_INTR:
		case INTR_TYPE_SOFT_INTR:
			kvm_clear_interrupt_queue(vcpu);
			break;
		case INTR_TYPE_HARD_EXCEPTION:
			if (vmx->idt_vectoring_info &
			    VECTORING_INFO_DELIVER_CODE_MASK) {
				has_error_code = true;
				error_code =
					vmcs_read32(IDT_VECTORING_ERROR_CODE);
			}
			fallthrough;
		case INTR_TYPE_SOFT_EXCEPTION:
			kvm_clear_exception_queue(vcpu);
			break;
		default:
			break;
		}
	}
	tss_selector = exit_qualification;

	if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
		       type != INTR_TYPE_EXT_INTR &&
		       type != INTR_TYPE_NMI_INTR))
		WARN_ON(!skip_emulated_instruction(vcpu));

	/*
	 * TODO: What about debug traps on tss switch?
	 *       Are we supposed to inject them and update dr6?
	 */
	return kvm_task_switch(vcpu, tss_selector,
			       type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
			       reason, has_error_code, error_code);
}

static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification;
	gpa_t gpa;
	u64 error_code;

	exit_qualification = vmx_get_exit_qual(vcpu);

	/*
	 * EPT violation happened while executing iret from NMI,
	 * "blocked by NMI" bit has to be set before next VM entry.
	 * There are errata that may cause this bit to not be set:
	 * AAK134, BY25.
	 */
	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
			enable_vnmi &&
			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);

	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
	trace_kvm_page_fault(gpa, exit_qualification);

	/* Is it a read fault? */
	error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
		     ? PFERR_USER_MASK : 0;
	/* Is it a write fault? */
	error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
		      ? PFERR_WRITE_MASK : 0;
	/* Is it a fetch fault? */
	error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
		      ? PFERR_FETCH_MASK : 0;
	/* ept page table entry is present? */
	error_code |= (exit_qualification &
		       (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
			EPT_VIOLATION_EXECUTABLE))
		      ? PFERR_PRESENT_MASK : 0;

	error_code |= (exit_qualification & 0x100) != 0 ?
	       PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;

	vcpu->arch.exit_qualification = exit_qualification;

	/*
	 * Check that the GPA doesn't exceed physical memory limits, as that is
	 * a guest page fault.  We have to emulate the instruction here, because
	 * if the illegal address is that of a paging structure, then
	 * EPT_VIOLATION_ACC_WRITE bit is set.  Alternatively, if supported we
	 * would also use advanced VM-exit information for EPT violations to
	 * reconstruct the page fault error code.
	 */
	if (unlikely(kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
		return kvm_emulate_instruction(vcpu, 0);

	return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
}

static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
	gpa_t gpa;

	/*
	 * A nested guest cannot optimize MMIO vmexits, because we have an
	 * nGPA here instead of the required GPA.
	 */
	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
	if (!is_guest_mode(vcpu) &&
	    !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
		trace_kvm_fast_mmio(gpa);
		return kvm_skip_emulated_instruction(vcpu);
	}

	return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
}

static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
	WARN_ON_ONCE(!enable_vnmi);
	exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
	++vcpu->stat.nmi_window_exits;
	kvm_make_request(KVM_REQ_EVENT, vcpu);

	return 1;
}

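/*
 * Emulate the guest while its segment state is invalid (e.g. big real mode).
 * The loop is bounded so that a requested interrupt window, a pending event
 * request, a halt, or other exit work regains control after at most ~130
 * emulated instructions.
 */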
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool intr_window_requested;
	unsigned count = 130;

	intr_window_requested = exec_controls_get(vmx) &
				CPU_BASED_INTR_WINDOW_EXITING;

	while (vmx->emulation_required && count-- != 0) {
		if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
			return handle_interrupt_window(&vmx->vcpu);

		if (kvm_test_request(KVM_REQ_EVENT, vcpu))
			return 1;

		if (!kvm_emulate_instruction(vcpu, 0))
			return 0;

		if (vmx->emulation_required && !vmx->rmode.vm86_active &&
		    vcpu->arch.exception.pending) {
			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
			vcpu->run->internal.suberror =
						KVM_INTERNAL_ERROR_EMULATION;
			vcpu->run->internal.ndata = 0;
			return 0;
		}

		if (vcpu->arch.halt_request) {
			vcpu->arch.halt_request = 0;
			return kvm_vcpu_halt(vcpu);
		}

		/*
		 * Note, return 1 and not 0, vcpu_run() will invoke
		 * xfer_to_guest_mode() which will create a proper return
		 * code.
		 */
		if (__xfer_to_guest_mode_work_pending())
			return 1;
	}

	return 1;
}

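/*
 * Adaptive PLE window sizing.  Both helpers clamp the new value using the
 * ple_window* module parameters and only mark ple_window_dirty (so the VMCS
 * field is rewritten on the next VM-entry) when the window actually changed.
 */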
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned int old = vmx->ple_window;

	vmx->ple_window = __grow_ple_window(old, ple_window,
					    ple_window_grow,
					    ple_window_max);

	if (vmx->ple_window != old) {
		vmx->ple_window_dirty = true;
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    vmx->ple_window, old);
	}
}

static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned int old = vmx->ple_window;

	vmx->ple_window = __shrink_ple_window(old, ple_window,
					      ple_window_shrink,
					      ple_window);

	if (vmx->ple_window != old) {
		vmx->ple_window_dirty = true;
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    vmx->ple_window, old);
	}
}

static void vmx_enable_tdp(void)
{
	kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
		enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
		enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
		0ull, VMX_EPT_EXECUTABLE_MASK,
		cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
		VMX_EPT_RWX_MASK, 0ull);

	ept_set_mmio_spte_mask();
}

/*
 * Indicate a busy-waiting vcpu in a spinlock.  We do not enable PAUSE
 * exiting, so we only get here on a CPU with PAUSE-loop exiting.
 */
static int handle_pause(struct kvm_vcpu *vcpu)
{
	if (!kvm_pause_in_guest(vcpu->kvm))
		grow_ple_window(vcpu);

	/*
	 * Intel SDM vol3 ch-25.1.3 says: the "PAUSE-loop exiting"
	 * VM-execution control is ignored if CPL > 0.  OTOH, KVM
	 * never sets PAUSE_EXITING and only sets PLE if supported,
	 * so the vcpu must be CPL=0 if it gets a PAUSE exit.
	 */
	kvm_vcpu_on_spin(vcpu, true);
	return kvm_skip_emulated_instruction(vcpu);
}

static int handle_nop(struct kvm_vcpu *vcpu)
{
	return kvm_skip_emulated_instruction(vcpu);
}

static int handle_mwait(struct kvm_vcpu *vcpu)
{
	printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
	return handle_nop(vcpu);
}

static int handle_invalid_op(struct kvm_vcpu *vcpu)
{
	kvm_queue_exception(vcpu, UD_VECTOR);
	return 1;
}

static int handle_monitor_trap(struct kvm_vcpu *vcpu)
{
	return 1;
}

static int handle_monitor(struct kvm_vcpu *vcpu)
{
	printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
	return handle_nop(vcpu);
}

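/*
 * The INVPCID type is read from the register identified by bits 31:28 of the
 * VMX instruction info field, the 16-byte (pcid, gla) descriptor is fetched
 * from guest memory, and the actual invalidation is delegated to the common
 * kvm_handle_invpcid() helper.
 */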
static int handle_invpcid(struct kvm_vcpu *vcpu)
{
	u32 vmx_instruction_info;
	unsigned long type;
	gva_t gva;
	struct {
		u64 pcid;
		u64 gla;
	} operand;

	if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);

	if (type > 3) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	/* According to the Intel instruction reference, the memory operand
	 * is read even if it isn't needed (e.g., for type==all)
	 */
	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
				vmx_instruction_info, false,
				sizeof(operand), &gva))
		return 1;

	return kvm_handle_invpcid(vcpu, type, gva);
}

static int handle_pml_full(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification;

	trace_kvm_pml_full(vcpu->vcpu_id);

	exit_qualification = vmx_get_exit_qual(vcpu);

	/*
	 * PML buffer FULL happened while executing iret from NMI,
	 * "blocked by NMI" bit has to be set before next VM entry.
	 */
	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
			enable_vnmi &&
			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
				GUEST_INTR_STATE_NMI);

	/*
	 * PML buffer already flushed at beginning of VMEXIT. Nothing to do
	 * here, and there's no userspace involvement needed for PML.
	 */
	return 1;
}

static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->req_immediate_exit &&
	    !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
		kvm_lapic_expired_hv_timer(vcpu);
		return EXIT_FASTPATH_REENTER_GUEST;
	}

	return EXIT_FASTPATH_NONE;
}

static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
	handle_fastpath_preemption_timer(vcpu);
	return 1;
}

/*
 * When nested=0, all VMX instruction VM Exits filter here.  The handlers
 * are overwritten by nested_vmx_setup() when nested=1.
 */
static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
{
	kvm_queue_exception(vcpu, UD_VECTOR);
	return 1;
}

static int handle_encls(struct kvm_vcpu *vcpu)
{
	/*
	 * SGX virtualization is not yet supported.  There is no software
	 * enable bit for SGX, so we have to trap ENCLS and inject a #UD
	 * to prevent the guest from executing ENCLS.
	 */
	kvm_queue_exception(vcpu, UD_VECTOR);
	return 1;
}

/*
 * The exit handlers return 1 if the exit was handled fully and guest execution
 * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
 * to be done to userspace and return 0.
 */
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception_nmi,
	[EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
	[EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
	[EXIT_REASON_NMI_WINDOW]	      = handle_nmi_window,
	[EXIT_REASON_IO_INSTRUCTION]          = handle_io,
	[EXIT_REASON_CR_ACCESS]               = handle_cr,
	[EXIT_REASON_DR_ACCESS]               = handle_dr,
	[EXIT_REASON_CPUID]                   = kvm_emulate_cpuid,
	[EXIT_REASON_MSR_READ]                = kvm_emulate_rdmsr,
	[EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,
	[EXIT_REASON_INTERRUPT_WINDOW]        = handle_interrupt_window,
	[EXIT_REASON_HLT]                     = kvm_emulate_halt,
	[EXIT_REASON_INVD]		      = handle_invd,
	[EXIT_REASON_INVLPG]		      = handle_invlpg,
	[EXIT_REASON_RDPMC]                   = handle_rdpmc,
	[EXIT_REASON_VMCALL]                  = handle_vmcall,
	[EXIT_REASON_VMCLEAR]		      = handle_vmx_instruction,
	[EXIT_REASON_VMLAUNCH]		      = handle_vmx_instruction,
	[EXIT_REASON_VMPTRLD]		      = handle_vmx_instruction,
	[EXIT_REASON_VMPTRST]		      = handle_vmx_instruction,
	[EXIT_REASON_VMREAD]		      = handle_vmx_instruction,
	[EXIT_REASON_VMRESUME]		      = handle_vmx_instruction,
	[EXIT_REASON_VMWRITE]		      = handle_vmx_instruction,
	[EXIT_REASON_VMOFF]		      = handle_vmx_instruction,
	[EXIT_REASON_VMON]		      = handle_vmx_instruction,
	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
	[EXIT_REASON_APIC_WRITE]              = handle_apic_write,
	[EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
	[EXIT_REASON_XSETBV]                  = handle_xsetbv,
	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
	[EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
	[EXIT_REASON_GDTR_IDTR]		      = handle_desc,
	[EXIT_REASON_LDTR_TR]		      = handle_desc,
	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
	[EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_mwait,
	[EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
	[EXIT_REASON_INVEPT]                  = handle_vmx_instruction,
	[EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
	[EXIT_REASON_RDRAND]                  = handle_invalid_op,
	[EXIT_REASON_RDSEED]                  = handle_invalid_op,
	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
	[EXIT_REASON_INVPCID]                 = handle_invpcid,
	[EXIT_REASON_VMFUNC]		      = handle_vmx_instruction,
	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
	[EXIT_REASON_ENCLS]		      = handle_encls,
};

static const int kvm_vmx_max_exit_handlers =
	ARRAY_SIZE(kvm_vmx_exit_handlers);

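/*
 * Report the exit qualification, IDT-vectoring info, interrupt info and error
 * code for tracing and userspace.  On a failed VM-entry only the qualification
 * is meaningful, so the remaining fields are reported as zero.
 */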
static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
			      u32 *intr_info, u32 *error_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	*info1 = vmx_get_exit_qual(vcpu);
	if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
		*info2 = vmx->idt_vectoring_info;
		*intr_info = vmx_get_intr_info(vcpu);
		if (is_exception_with_error_code(*intr_info))
			*error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
		else
			*error_code = 0;
	} else {
		*info2 = 0;
		*intr_info = 0;
		*error_code = 0;
	}
}

static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
{
	if (vmx->pml_pg) {
		__free_page(vmx->pml_pg);
		vmx->pml_pg = NULL;
	}
}

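/*
 * Drain the Page Modification Log: GUEST_PML_INDEX counts down from
 * PML_ENTITY_NUM - 1, so every GPA the CPU logged since the last flush is
 * marked dirty and the index is reset for the next VM-entry.
 */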
static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 *pml_buf;
	u16 pml_idx;

	pml_idx = vmcs_read16(GUEST_PML_INDEX);

	/* Do nothing if PML buffer is empty */
	if (pml_idx == (PML_ENTITY_NUM - 1))
		return;

	/* PML index always points to next available PML buffer entity */
	if (pml_idx >= PML_ENTITY_NUM)
		pml_idx = 0;
	else
		pml_idx++;

	pml_buf = page_address(vmx->pml_pg);
	for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
		u64 gpa;

		gpa = pml_buf[pml_idx];
		WARN_ON(gpa & (PAGE_SIZE - 1));
		kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
	}

	/* reset PML index */
	vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
}

/*
 * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
 * Called before reporting dirty_bitmap to userspace.
 */
static void kvm_flush_pml_buffers(struct kvm *kvm)
{
	int i;
	struct kvm_vcpu *vcpu;
	/*
	 * We only need to kick vcpus out of guest mode here, as the PML
	 * buffer is flushed at the beginning of every VMEXIT and only vcpus
	 * currently running in guest mode can have unflushed GPAs in their
	 * PML buffer.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_vcpu_kick(vcpu);
}

5624
static void vmx_dump_sel(char *name, uint32_t sel)
5625
{
5626 5627 5628 5629 5630
	pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
	       name, vmcs_read16(sel),
	       vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
	       vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
	       vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
5631 5632
}

5633
static void vmx_dump_dtsel(char *name, uint32_t limit)
5634
{
5635 5636 5637
	pr_err("%s                           limit=0x%08x, base=0x%016lx\n",
	       name, vmcs_read32(limit),
	       vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
5638 5639
}

void dump_vmcs(void)
{
	u32 vmentry_ctl, vmexit_ctl;
	u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
	unsigned long cr4;
	u64 efer;

	if (!dump_invalid_vmcs) {
		pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
		return;
	}

	vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
	vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
	cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
	pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
	cr4 = vmcs_readl(GUEST_CR4);
	efer = vmcs_read64(GUEST_IA32_EFER);
	secondary_exec_control = 0;
	if (cpu_has_secondary_exec_ctrls())
		secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);

	pr_err("*** Guest State ***\n");
	pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
	       vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
	       vmcs_readl(CR0_GUEST_HOST_MASK));
	pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
	       cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
	pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
	    (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
	{
		pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
		       vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
		pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
		       vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
	}
	pr_err("RSP = 0x%016lx  RIP = 0x%016lx\n",
	       vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
	pr_err("RFLAGS=0x%08lx         DR7 = 0x%016lx\n",
	       vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
	       vmcs_readl(GUEST_SYSENTER_ESP),
	       vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
	vmx_dump_sel("CS:  ", GUEST_CS_SELECTOR);
	vmx_dump_sel("DS:  ", GUEST_DS_SELECTOR);
	vmx_dump_sel("SS:  ", GUEST_SS_SELECTOR);
	vmx_dump_sel("ES:  ", GUEST_ES_SELECTOR);
	vmx_dump_sel("FS:  ", GUEST_FS_SELECTOR);
	vmx_dump_sel("GS:  ", GUEST_GS_SELECTOR);
	vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
	vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
	vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
	vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
	if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
	    (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
		pr_err("EFER =     0x%016llx  PAT = 0x%016llx\n",
		       efer, vmcs_read64(GUEST_IA32_PAT));
	pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
	       vmcs_read64(GUEST_IA32_DEBUGCTL),
	       vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
	if (cpu_has_load_perf_global_ctrl() &&
	    vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
		pr_err("PerfGlobCtl = 0x%016llx\n",
		       vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
	if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
		pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
	pr_err("Interruptibility = %08x  ActivityState = %08x\n",
	       vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
	       vmcs_read32(GUEST_ACTIVITY_STATE));
	if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
		pr_err("InterruptStatus = %04x\n",
		       vmcs_read16(GUEST_INTR_STATUS));

	pr_err("*** Host State ***\n");
	pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
	       vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
	pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
	       vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
	       vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
	       vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
	       vmcs_read16(HOST_TR_SELECTOR));
	pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
	       vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
	       vmcs_readl(HOST_TR_BASE));
	pr_err("GDTBase=%016lx IDTBase=%016lx\n",
	       vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
	pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
	       vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
	       vmcs_readl(HOST_CR4));
	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
	       vmcs_readl(HOST_IA32_SYSENTER_ESP),
	       vmcs_read32(HOST_IA32_SYSENTER_CS),
	       vmcs_readl(HOST_IA32_SYSENTER_EIP));
	if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
		pr_err("EFER = 0x%016llx  PAT = 0x%016llx\n",
		       vmcs_read64(HOST_IA32_EFER),
		       vmcs_read64(HOST_IA32_PAT));
	if (cpu_has_load_perf_global_ctrl() &&
	    vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
		pr_err("PerfGlobCtl = 0x%016llx\n",
		       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));

	pr_err("*** Control State ***\n");
	pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
	       pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
	pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
	pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
	       vmcs_read32(EXCEPTION_BITMAP),
	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
	pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
	       vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
	       vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
	       vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
	pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
	       vmcs_read32(VM_EXIT_INTR_INFO),
	       vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
	       vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
	pr_err("        reason=%08x qualification=%016lx\n",
	       vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
	pr_err("IDTVectoring: info=%08x errcode=%08x\n",
	       vmcs_read32(IDT_VECTORING_INFO_FIELD),
	       vmcs_read32(IDT_VECTORING_ERROR_CODE));
	pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
	if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
		pr_err("TSC Multiplier = 0x%016llx\n",
		       vmcs_read64(TSC_MULTIPLIER));
	if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
		if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
			u16 status = vmcs_read16(GUEST_INTR_STATUS);
			pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
		}
		pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
		if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
			pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
		pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
	}
	if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
		pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
		pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
	if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
		pr_err("PLE Gap=%08x Window=%08x\n",
		       vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
		pr_err("Virtual processor ID = 0x%04x\n",
		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
}

/*
 * The guest has exited.  See if we can fix it or if we need userspace
 * assistance.
 */
static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 exit_reason = vmx->exit_reason;
	u32 vectoring_info = vmx->idt_vectoring_info;

	/*
	 * Flush the PML buffer of logged GPAs so that dirty_bitmap is up to
	 * date.  A further benefit: in kvm_vm_ioctl_get_dirty_log(), before
	 * querying dirty_bitmap we only need to kick all vcpus out of guest
	 * mode, because once a vcpu is back in root mode its PML buffer has
	 * already been flushed.
	 */
	if (enable_pml)
		vmx_flush_pml_buffer(vcpu);

	/*
	 * We should never reach this point with a pending nested VM-Enter, and
	 * more specifically emulation of L2 due to invalid guest state (see
	 * below) should never happen as that means we incorrectly allowed a
	 * nested VM-Enter with an invalid vmcs12.
	 */
	WARN_ON_ONCE(vmx->nested.nested_run_pending);

	/* If guest state is invalid, start emulating */
	if (vmx->emulation_required)
		return handle_invalid_guest_state(vcpu);

	if (is_guest_mode(vcpu)) {
		/*
		 * The host physical addresses of some pages of guest memory
		 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
		 * Page). The CPU may write to these pages via their host
		 * physical address while L2 is running, bypassing any
		 * address-translation-based dirty tracking (e.g. EPT write
		 * protection).
		 *
		 * Mark them dirty on every exit from L2 to prevent them from
		 * getting out of sync with dirty tracking.
		 */
		nested_mark_vmcs12_pages_dirty(vcpu);

		if (nested_vmx_reflect_vmexit(vcpu))
			return 1;
	}

	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
		dump_vmcs();
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= exit_reason;
		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	if (unlikely(vmx->fail)) {
		dump_vmcs();
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= vmcs_read32(VM_INSTRUCTION_ERROR);
		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	/*
	 * Note:
	 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by a
	 * delivery event, since that indicates the guest is accessing MMIO.
	 * The vm-exit could otherwise be triggered again after returning to
	 * the guest, causing an infinite loop.
	 */
	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
			(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
			exit_reason != EXIT_REASON_EPT_VIOLATION &&
			exit_reason != EXIT_REASON_PML_FULL &&
			exit_reason != EXIT_REASON_APIC_ACCESS &&
			exit_reason != EXIT_REASON_TASK_SWITCH)) {
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
		vcpu->run->internal.ndata = 3;
		vcpu->run->internal.data[0] = vectoring_info;
		vcpu->run->internal.data[1] = exit_reason;
		vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
		if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
			vcpu->run->internal.ndata++;
			vcpu->run->internal.data[3] =
				vmcs_read64(GUEST_PHYSICAL_ADDRESS);
		}
		vcpu->run->internal.data[vcpu->run->internal.ndata++] =
			vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	if (unlikely(!enable_vnmi &&
		     vmx->loaded_vmcs->soft_vnmi_blocked)) {
		if (!vmx_interrupt_blocked(vcpu)) {
			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
		} else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
			   vcpu->arch.nmi_pending) {
			/*
			 * This CPU doesn't let us detect the end of an
			 * NMI-blocked window if the guest runs with IRQs
			 * disabled, so pull the trigger after 1 s of
			 * futile waiting, but inform the user about it.
			 */
			printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
			       "state on VCPU %d after 1 s timeout\n",
			       __func__, vcpu->vcpu_id);
			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
		}
	}

	if (exit_fastpath != EXIT_FASTPATH_NONE)
		return 1;

	if (exit_reason >= kvm_vmx_max_exit_handlers)
		goto unexpected_vmexit;
#ifdef CONFIG_RETPOLINE
	if (exit_reason == EXIT_REASON_MSR_WRITE)
		return kvm_emulate_wrmsr(vcpu);
	else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER)
		return handle_preemption_timer(vcpu);
	else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW)
		return handle_interrupt_window(vcpu);
	else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
		return handle_external_interrupt(vcpu);
	else if (exit_reason == EXIT_REASON_HLT)
		return kvm_emulate_halt(vcpu);
	else if (exit_reason == EXIT_REASON_EPT_MISCONFIG)
		return handle_ept_misconfig(vcpu);
#endif

	exit_reason = array_index_nospec(exit_reason,
					 kvm_vmx_max_exit_handlers);
	if (!kvm_vmx_exit_handlers[exit_reason])
		goto unexpected_vmexit;

	return kvm_vmx_exit_handlers[exit_reason](vcpu);

unexpected_vmexit:
	vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason);
	dump_vmcs();
	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
	vcpu->run->internal.suberror =
			KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
	vcpu->run->internal.ndata = 2;
	vcpu->run->internal.data[0] = exit_reason;
	vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
	return 0;
}

/*
 * Software based L1D cache flush which is used when microcode providing
 * the cache control MSR is not loaded.
 *
 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
 * flush it, reading in 64 KiB is required because the replacement algorithm
 * is not exactly LRU. This could be sized at runtime via topology
 * information but as all relevant affected CPUs have 32KiB L1D cache size
 * there is no point in doing so.
 */
static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
	int size = PAGE_SIZE << L1D_CACHE_ORDER;

	/*
	 * This code is only executed when the flush mode is 'cond' or
	 * 'always'
	 */
	if (static_branch_likely(&vmx_l1d_flush_cond)) {
		bool flush_l1d;

		/*
		 * Clear the per-vcpu flush bit, it gets set again
		 * either from vcpu_run() or from one of the unsafe
		 * VMEXIT handlers.
		 */
		flush_l1d = vcpu->arch.l1tf_flush_l1d;
		vcpu->arch.l1tf_flush_l1d = false;

		/*
		 * Clear the per-cpu flush bit, it gets set again from
		 * the interrupt handlers.
		 */
		flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
		kvm_clear_cpu_l1tf_flush_l1d();

		if (!flush_l1d)
			return;
	}

	vcpu->stat.l1d_flush++;

	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
		return;
	}

	asm volatile(
		/* First ensure the pages are in the TLB */
		"xorl	%%eax, %%eax\n"
		".Lpopulate_tlb:\n\t"
		"movzbl	(%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
		"addl	$4096, %%eax\n\t"
		"cmpl	%%eax, %[size]\n\t"
		"jne	.Lpopulate_tlb\n\t"
		"xorl	%%eax, %%eax\n\t"
		"cpuid\n\t"
		/* Now fill the cache */
		"xorl	%%eax, %%eax\n"
		".Lfill_cache:\n"
		"movzbl	(%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
		"addl	$64, %%eax\n\t"
		"cmpl	%%eax, %[size]\n\t"
		"jne	.Lfill_cache\n\t"
		"lfence\n"
		:: [flush_pages] "r" (vmx_l1d_flush_pages),
		    [size] "r" (size)
		: "eax", "ebx", "ecx", "edx");
}
6015
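/*
 * Keep the TPR threshold in step with the highest pending interrupt so that
 * a TPR-below-threshold exit fires as soon as a higher-priority interrupt
 * becomes deliverable.  While L2 is running the value is stashed for vmcs01
 * instead of being written to the current VMCS.
 */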
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6016
{
6017
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6018
	int tpr_threshold;
6019

6020 6021 6022
	if (is_guest_mode(vcpu) &&
		nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return;
6023

6024
	tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
6025 6026 6027 6028
	if (is_guest_mode(vcpu))
		to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
	else
		vmcs_write32(TPR_THRESHOLD, tpr_threshold);
6029 6030
}

6031
void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
6032
{
6033
	struct vcpu_vmx *vmx = to_vmx(vcpu);
6034
	u32 sec_exec_control;
6035

6036 6037
	if (!lapic_in_kernel(vcpu))
		return;
6038

6039 6040 6041
	if (!flexpriority_enabled &&
	    !cpu_has_vmx_virtualize_x2apic_mode())
		return;
6042

6043 6044
	/* Postpone execution until vmcs01 is the current VMCS. */
	if (is_guest_mode(vcpu)) {
6045
		vmx->nested.change_vmcs01_virtual_apic_mode = true;
6046
		return;
6047
	}
6048

6049
	sec_exec_control = secondary_exec_controls_get(vmx);
6050 6051
	sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
			      SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
6052

6053 6054 6055 6056 6057 6058 6059 6060 6061
	switch (kvm_get_apic_mode(vcpu)) {
	case LAPIC_MODE_INVALID:
		WARN_ONCE(true, "Invalid local APIC state");
	case LAPIC_MODE_DISABLED:
		break;
	case LAPIC_MODE_XAPIC:
		if (flexpriority_enabled) {
			sec_exec_control |=
				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6062 6063 6064 6065 6066 6067 6068 6069 6070
			kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

			/*
			 * Flush the TLB, reloading the APIC access page will
			 * only do so if its physical address has changed, but
			 * the guest may have inserted a non-APIC mapping into
			 * the TLB while the APIC access page was disabled.
			 */
			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
6071 6072 6073 6074 6075 6076 6077
		}
		break;
	case LAPIC_MODE_X2APIC:
		if (cpu_has_vmx_virtualize_x2apic_mode())
			sec_exec_control |=
				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
		break;
6078
	}
6079
	secondary_exec_controls_set(vmx, sec_exec_control);
6080

6081 6082
	vmx_update_msr_bitmap(vcpu);
}
6083

6084
static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
6085
{
6086 6087
	struct page *page;

6088 6089 6090 6091
	/* Defer reload until vmcs01 is the current VMCS. */
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
		return;
6092
	}
6093

6094 6095 6096 6097
	if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
	    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
		return;

6098 6099 6100 6101 6102
	page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
	if (is_error_page(page))
		return;

	vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
6103
	vmx_flush_tlb_current(vcpu);
6104 6105 6106 6107 6108 6109

	/*
	 * Do not pin apic access page in memory, the MMU notifier
	 * will call us again if it is migrated or swapped out.
	 */
	put_page(page);
6110
}
6111

6112 6113 6114 6115
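/*
 * SVI (bits 15:8 of the guest interrupt status field) tracks the highest
 * in-service vector for virtual interrupt delivery; keep it in sync with the
 * in-kernel APIC's view.  vmx_set_rvi() below does the same for RVI.
 */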
static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
	u16 status;
	u8 old;
6116

6117 6118
	if (max_isr == -1)
		max_isr = 0;
6119

6120 6121 6122 6123 6124 6125 6126 6127
	status = vmcs_read16(GUEST_INTR_STATUS);
	old = status >> 8;
	if (max_isr != old) {
		status &= 0xff;
		status |= max_isr << 8;
		vmcs_write16(GUEST_INTR_STATUS, status);
	}
}
6128

6129 6130 6131 6132
static void vmx_set_rvi(int vector)
{
	u16 status;
	u8 old;
6133

6134 6135
	if (vector == -1)
		vector = 0;
6136

6137 6138 6139 6140 6141 6142
	status = vmcs_read16(GUEST_INTR_STATUS);
	old = (u8)status & 0xff;
	if ((u8)vector != old) {
		status &= ~0xff;
		status |= (u8)vector;
		vmcs_write16(GUEST_INTR_STATUS, status);
6143
	}
6144
}
6145

6146 6147
static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
{
6148
	/*
6149 6150 6151 6152 6153 6154
	 * When running L2, updating RVI is only relevant when
	 * vmcs12 virtual-interrupt-delivery enabled.
	 * However, it can be enabled only when L1 also
	 * intercepts external-interrupts and in that case
	 * we should not update vmcs02 RVI but instead intercept
	 * interrupt. Therefore, do nothing when running L2.
6155
	 */
6156 6157 6158
	if (!is_guest_mode(vcpu))
		vmx_set_rvi(max_irr);
}
6159

6160 6161 6162 6163 6164
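/*
 * Fold posted interrupts from the PIR into the vAPIC IRR.  PID.ON is cleared
 * before the PIR is read so that a concurrent posting sets it again and
 * triggers another sync instead of being lost.
 */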
static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int max_irr;
	bool max_irr_updated;
6165

6166 6167 6168 6169
	WARN_ON(!vcpu->arch.apicv_active);
	if (pi_test_on(&vmx->pi_desc)) {
		pi_clear_on(&vmx->pi_desc);
		/*
6170
		 * IOMMU can write to PID.ON, so the barrier matters even on UP.
6171 6172 6173 6174 6175
		 * But on x86 this is just a compiler barrier anyway.
		 */
		smp_mb__after_atomic();
		max_irr_updated =
			kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
6176 6177

		/*
6178 6179 6180 6181 6182 6183
		 * If we are running L2 and L1 has a new pending interrupt
		 * which can be injected, we should re-evaluate
		 * what should be done with this new L1 interrupt.
		 * If L1 intercepts external-interrupts, we should
		 * exit from L2 to L1. Otherwise, interrupt should be
		 * delivered directly to L2.
6184
		 */
6185 6186 6187 6188 6189
		if (is_guest_mode(vcpu) && max_irr_updated) {
			if (nested_exit_on_intr(vcpu))
				kvm_vcpu_exiting_guest_mode(vcpu);
			else
				kvm_make_request(KVM_REQ_EVENT, vcpu);
6190
		}
6191 6192
	} else {
		max_irr = kvm_lapic_find_highest_irr(vcpu);
6193
	}
6194 6195 6196
	vmx_hwapic_irr_update(vcpu, max_irr);
	return max_irr;
}
6197

6198 6199 6200 6201
static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;
6202

6203 6204 6205 6206
	vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
	vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
	vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
6207 6208
}

6209
static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
6210 6211
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
6212

6213 6214 6215
	pi_clear_on(&vmx->pi_desc);
	memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
}
6216

6217 6218
void vmx_do_interrupt_nmi_irqoff(unsigned long entry);

6219 6220 6221 6222 6223 6224 6225 6226 6227 6228
static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
{
	unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
	gate_desc *desc = (gate_desc *)host_idt_base + vector;

	kvm_before_interrupt(vcpu);
	vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
	kvm_after_interrupt(vcpu);
}

6229
static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
6230
{
6231
	u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
6232

6233
	/* if exit due to PF check for async PF */
6234
	if (is_page_fault(intr_info))
6235
		vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
6236
	/* Handle machine checks before interrupts are enabled */
6237
	else if (is_machine_check(intr_info))
6238 6239
		kvm_machine_check();
	/* We need to handle NMIs before interrupts are enabled */
6240 6241
	else if (is_nmi(intr_info))
		handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
6242
}
6243

6244
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
6245
{
6246
	u32 intr_info = vmx_get_intr_info(vcpu);
6247

6248 6249 6250 6251
	if (WARN_ONCE(!is_external_intr(intr_info),
	    "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
		return;

6252
	handle_interrupt_nmi_irqoff(vcpu, intr_info);
6253
}
6254

6255
static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
6256 6257 6258 6259 6260 6261 6262 6263
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
		handle_external_interrupt_irqoff(vcpu);
	else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
		handle_exception_nmi_irqoff(vmx);
}
6264

static bool vmx_has_emulated_msr(u32 index)
{
	switch (index) {
	case MSR_IA32_SMBASE:
		/*
		 * We cannot do SMM unless we can run the guest in big
		 * real mode.
		 */
		return enable_unrestricted_guest || emulate_invalid_guest_state;
	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
		return nested;
	case MSR_AMD64_VIRT_SPEC_CTRL:
		/* This is AMD only.  */
		return false;
	default:
		return true;
	}
}
6284 6285 6286 6287 6288 6289
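/*
 * Re-derive virtual-NMI blocking after a VM-exit: if NMI blocking was lifted
 * by a fault during IRET, the "blocked by NMI" bit must be set again before
 * the next VM-entry (SDM 27.7.1.2).  Without virtual NMIs, accumulate the
 * time spent soft-blocked so the window can eventually be forced open.
 */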
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
	u32 exit_intr_info;
	bool unblock_nmi;
	u8 vector;
	bool idtv_info_valid;
6290

6291
	idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
6292

6293 6294 6295
	if (enable_vnmi) {
		if (vmx->loaded_vmcs->nmi_known_unmasked)
			return;
6296 6297

		exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321
		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
		/*
		 * SDM 3: 27.7.1.2 (September 2008)
		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
		 * a guest IRET fault.
		 * SDM 3: 23.2.2 (September 2008)
		 * Bit 12 is undefined in any of the following cases:
		 *  If the VM exit sets the valid bit in the IDT-vectoring
		 *   information field.
		 *  If the VM exit is due to a double fault.
		 */
		if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
		    vector != DF_VECTOR && !idtv_info_valid)
			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
				      GUEST_INTR_STATE_NMI);
		else
			vmx->loaded_vmcs->nmi_known_unmasked =
				!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
				  & GUEST_INTR_STATE_NMI);
	} else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
		vmx->loaded_vmcs->vnmi_blocked_time +=
			ktime_to_ns(ktime_sub(ktime_get(),
					      vmx->loaded_vmcs->entry_time));
6322 6323
}

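/*
 * Re-queue an event whose delivery was interrupted by the VM-exit (or cancel
 * it on the cancel-injection path), based on the IDT-vectoring info fields,
 * so that it is re-injected on the next VM-entry.
 */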
static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
				      u32 idt_vectoring_info,
				      int instr_len_field,
				      int error_code_field)
{
	u8 vector;
	int type;
	bool idtv_info_valid;

	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;

	vcpu->arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);

	if (!idtv_info_valid)
		return;

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;

	switch (type) {
	case INTR_TYPE_NMI_INTR:
		vcpu->arch.nmi_injected = true;
		/*
		 * SDM 3: 27.7.1.2 (September 2008)
		 * Clear bit "block by NMI" before VM entry if a NMI
		 * delivery faulted.
		 */
		vmx_set_nmi_mask(vcpu, false);
		break;
	case INTR_TYPE_SOFT_EXCEPTION:
		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
		fallthrough;
	case INTR_TYPE_HARD_EXCEPTION:
		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
			u32 err = vmcs_read32(error_code_field);
			kvm_requeue_exception_e(vcpu, vector, err);
		} else
			kvm_requeue_exception(vcpu, vector);
		break;
	case INTR_TYPE_SOFT_INTR:
		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
		fallthrough;
	case INTR_TYPE_EXT_INTR:
		kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
		break;
	default:
		break;
	}
}

static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
	__vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
				  VM_EXIT_INSTRUCTION_LEN,
				  IDT_VECTORING_ERROR_CODE);
}

static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
{
	__vmx_complete_interrupts(vcpu,
				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
				  VM_ENTRY_INSTRUCTION_LEN,
				  VM_ENTRY_EXCEPTION_ERROR_CODE);

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
}

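/*
 * Program the VM-entry/VM-exit MSR-switch lists for the perf MSRs reported by
 * perf_guest_get_msrs(); entries whose host and guest values already match
 * are removed to keep the atomic-switch lists short.
 */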
static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
{
	int i, nr_msrs;
	struct perf_guest_switch_msr *msrs;

	msrs = perf_guest_get_msrs(&nr_msrs);

	if (!msrs)
		return;

	for (i = 0; i < nr_msrs; i++)
		if (msrs[i].host == msrs[i].guest)
			clear_atomic_switch_msr(vmx, msrs[i].msr);
		else
			add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
					msrs[i].host, false);
}

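/*
 * Arm the VMX preemption timer for the next entry: zero for an immediate
 * exit request, the scaled TSC delta when an APIC timer deadline is armed,
 * and -1 (soft-disabled) otherwise so the VMCS field need not be rewritten
 * on every entry.
 */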
static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 tscl;
	u32 delta_tsc;

	if (vmx->req_immediate_exit) {
		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
	} else if (vmx->hv_deadline_tsc != -1) {
		tscl = rdtsc();
		if (vmx->hv_deadline_tsc > tscl)
			/* set_hv_timer ensures the delta fits in 32-bits */
			delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
				cpu_preemption_timer_multi);
		else
			delta_tsc = 0;

		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
	} else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
		vmx->loaded_vmcs->hv_timer_soft_disabled = true;
	}
}

void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
{
	if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
		vmx->loaded_vmcs->host_state.rsp = host_rsp;
		vmcs_writel(HOST_RSP, host_rsp);
	}
}

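/*
 * Exits that can be handled while still on the IRQs-off VM-exit path; only
 * WRMSR and the preemption timer qualify, everything else goes through the
 * full exit handler table.
 */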
static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
	switch (to_vmx(vcpu)->exit_reason) {
	case EXIT_REASON_MSR_WRITE:
		return handle_fastpath_set_msr_irqoff(vcpu);
	case EXIT_REASON_PREEMPTION_TIMER:
		return handle_fastpath_preemption_timer(vcpu);
	default:
		return EXIT_FASTPATH_NONE;
	}
}

bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);

static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
					struct vcpu_vmx *vmx)
{
	/*
	 * VMENTER enables interrupts (host state), but the kernel state is
	 * interrupts disabled when this is invoked. Also tell RCU about
	 * it. This is the same logic as for exit_to_user_mode().
	 *
	 * This ensures that e.g. latency analysis on the host observes
	 * guest mode as interrupt enabled.
	 *
	 * guest_enter_irqoff() informs context tracking about the
	 * transition to guest mode and if enabled adjusts RCU state
	 * accordingly.
	 */
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	instrumentation_end();

	guest_enter_irqoff();
	lockdep_hardirqs_on(CALLER_ADDR0);

	/* L1D Flush includes CPU buffer clear to mitigate MDS */
	if (static_branch_unlikely(&vmx_l1d_should_flush))
		vmx_l1d_flush(vcpu);
	else if (static_branch_unlikely(&mds_user_clear))
		mds_clear_cpu_buffers();

	if (vcpu->arch.cr2 != native_read_cr2())
		native_write_cr2(vcpu->arch.cr2);

	vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
				   vmx->loaded_vmcs->launched);

	vcpu->arch.cr2 = native_read_cr2();

	/*
	 * VMEXIT disables interrupts (host state), but tracing and lockdep
	 * have them in state 'on' as recorded before entering guest mode.
	 * Same as enter_from_user_mode().
	 *
	 * guest_exit_irqoff() restores host context and reinstates RCU if
	 * enabled and required.
	 *
	 * This needs to be done before the below as native_read_msr()
	 * contains a tracepoint and x86_spec_ctrl_restore_host() calls
	 * into world and some more.
	 */
	lockdep_hardirqs_off(CALLER_ADDR0);
	guest_exit_irqoff();

	instrumentation_begin();
	trace_hardirqs_off_finish();
	instrumentation_end();
}

6518
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
6519
{
6520
	fastpath_t exit_fastpath;
6521 6522 6523
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long cr3, cr4;

6524
reenter_guest:
6525 6526 6527 6528 6529 6530 6531 6532
	/* Record the guest's net vcpu time for enforced NMI injections. */
	if (unlikely(!enable_vnmi &&
		     vmx->loaded_vmcs->soft_vnmi_blocked))
		vmx->loaded_vmcs->entry_time = ktime_get();

	/* Don't enter VMX if guest state is invalid, let the exit handler
	   start emulation until we arrive back to a valid state */
	if (vmx->emulation_required)
6533
		return EXIT_FASTPATH_NONE;
6534 6535 6536 6537 6538 6539

	if (vmx->ple_window_dirty) {
		vmx->ple_window_dirty = false;
		vmcs_write32(PLE_WINDOW, vmx->ple_window);
	}

6540 6541 6542 6543 6544
	/*
	 * We did this in prepare_switch_to_guest, because it needs to
	 * be within srcu_read_lock.
	 */
	WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
6545

6546
	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
6547
		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
6548
	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570
		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);

	cr3 = __get_current_cr3_fast();
	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
		vmcs_writel(HOST_CR3, cr3);
		vmx->loaded_vmcs->host_state.cr3 = cr3;
	}

	cr4 = cr4_read_shadow();
	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
		vmcs_writel(HOST_CR4, cr4);
		vmx->loaded_vmcs->host_state.cr4 = cr4;
	}

	/* When single-stepping over STI and MOV SS, we must clear the
	 * corresponding interruptibility bits in the guest state. Otherwise
	 * vmentry fails as it then expects bit 14 (BS) in pending debug
	 * exceptions being set, but that's not correct for the guest debugging
	 * case. */
	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
		vmx_set_interrupt_shadow(vcpu, 0);

6571
	kvm_load_guest_xsave_state(vcpu);
6572

6573 6574
	pt_guest_enter(vmx);

6575
	atomic_switch_perf_msrs(vmx);
6576

6577 6578
	if (enable_preemption_timer)
		vmx_update_hv_timer(vcpu);
6579

6580
	kvm_wait_lapic_expire(vcpu);
6581

6582 6583 6584 6585 6586 6587 6588 6589
	/*
	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
	 * it's non-zero. Since vmentry is serialising on affected CPUs, there
	 * is no need to worry about the conditional branch over the wrmsr
	 * being speculatively taken.
	 */
	x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);

6590 6591
	/* The actual VMENTER/EXIT is in the .noinstr.text section. */
	vmx_vcpu_enter_exit(vcpu, vmx);
6592

6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609
	/*
	 * We do not use IBRS in the kernel. If this vCPU has used the
	 * SPEC_CTRL MSR it may have left it on; save the value and
	 * turn it off. This is much more efficient than blindly adding
	 * it to the atomic save/restore list. Especially as the former
	 * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
	 *
	 * For non-nested case:
	 * If the L01 MSR bitmap does not intercept the MSR, then we need to
	 * save it.
	 *
	 * For nested case:
	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
	 * save it.
	 */
	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
6610

6611
	x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
6612

6613 6614 6615 6616
	/* All fields are clean at this point */
	if (static_branch_unlikely(&enable_evmcs))
		current_evmcs->hv_clean_fields |=
			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
6617

6618 6619 6620
	if (static_branch_unlikely(&enable_evmcs))
		current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index;

6621 6622 6623
	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
	if (vmx->host_debugctlmsr)
		update_debugctlmsr(vmx->host_debugctlmsr);
6624

6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636
#ifndef CONFIG_X86_64
	/*
	 * The sysexit path does not restore ds/es, so we must set them to
	 * a reasonable value ourselves.
	 *
	 * We can't defer this to vmx_prepare_switch_to_host() since that
	 * function may be executed in interrupt context, which saves and
	 * restore segments around it, nullifying its effect.
	 */
	loadsegment(ds, __USER_DS);
	loadsegment(es, __USER_DS);
#endif
N
Nadav Har'El 已提交
6637

6638
	vmx_register_cache_reset(vcpu);
6639

6640 6641
	pt_guest_exit(vmx);

6642
	kvm_load_host_xsave_state(vcpu);
6643

6644 6645
	vmx->nested.nested_run_pending = 0;
	vmx->idt_vectoring_info = 0;

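	/*
	 * vmx->fail is set when VMLAUNCH/VMRESUME itself failed (VMfailValid/
	 * VMfailInvalid); no VM-exit information is available in that case.
	 */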
	if (unlikely(vmx->fail)) {
		vmx->exit_reason = 0xdead;
		return EXIT_FASTPATH_NONE;
	}

	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
	if (unlikely((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY))
		kvm_machine_check();

	trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);

	if (unlikely(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
		return EXIT_FASTPATH_NONE;

	vmx->loaded_vmcs->launched = 1;
	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

	vmx_recover_nmi_blocking(vmx);
	vmx_complete_interrupts(vmx);

	if (is_guest_mode(vcpu))
		return EXIT_FASTPATH_NONE;

	exit_fastpath = vmx_exit_handlers_fastpath(vcpu);
	if (exit_fastpath == EXIT_FASTPATH_REENTER_GUEST) {
		if (!kvm_vcpu_exit_request(vcpu)) {
			/*
			 * FIXME: this goto should be a loop in vcpu_enter_guest,
			 * but it would incur the cost of a retpoline for now.
			 * Revisit once static calls are available.
			 */
			if (vcpu->arch.apicv_active)
				vmx_sync_pir_to_irr(vcpu);
			goto reenter_guest;
		}
		exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
	}

	return exit_fastpath;
}

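/*
 * Free the per-vCPU resources allocated in vmx_create_vcpu(): the PML page,
 * the VPID, nested state and the loaded VMCS.
 */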
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (enable_pml)
		vmx_destroy_pml_buffer(vmx);
	free_vpid(vmx->vpid);
	nested_vmx_free_vcpu(vcpu);
	free_loaded_vmcs(vmx->loaded_vmcs);
}

static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx;
	unsigned long *msr_bitmap;
	int i, cpu, err;

	BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
	vmx = to_vmx(vcpu);

	err = -ENOMEM;

	vmx->vpid = allocate_vpid();

	/*
	 * If PML is turned on, failure on enabling PML just results in failure
	 * of creating the vcpu, therefore we can simplify PML logic (by
	 * avoiding dealing with cases, such as enabling PML partially on vcpus
	 * for the guest, etc.).
	 */
	if (enable_pml) {
		vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		if (!vmx->pml_pg)
			goto free_vpid;
	}

	BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != MAX_NR_SHARED_MSRS);

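	/*
	 * Populate the guest MSR array: probe each candidate MSR with
	 * rdmsr/wrmsr and skip any that this host CPU does not support.
	 */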
	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
		u32 index = vmx_msr_index[i];
		u32 data_low, data_high;
		int j = vmx->nmsrs;

		if (rdmsr_safe(index, &data_low, &data_high) < 0)
			continue;
		if (wrmsr_safe(index, data_low, data_high) < 0)
			continue;

		vmx->guest_msrs[j].index = i;
		vmx->guest_msrs[j].data = 0;
		switch (index) {
		case MSR_IA32_TSX_CTRL:
			/*
			 * No need to pass TSX_CTRL_CPUID_CLEAR through, so
			 * let's avoid changing CPUID bits under the host
			 * kernel's feet.
			 */
			vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
			break;
		default:
			vmx->guest_msrs[j].mask = -1ull;
			break;
		}
		++vmx->nmsrs;
	}

	err = alloc_loaded_vmcs(&vmx->vmcs01);
	if (err < 0)
		goto free_pml;

	msr_bitmap = vmx->vmcs01.msr_bitmap;
	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R);
	vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
	if (kvm_cstate_in_guest(vcpu->kvm)) {
		vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
	}
	vmx->msr_bitmap_mode = 0;

	vmx->loaded_vmcs = &vmx->vmcs01;
	cpu = get_cpu();
	vmx_vcpu_load(vcpu, cpu);
	vcpu->cpu = cpu;
	init_vmcs(vmx);
	vmx_vcpu_put(vcpu);
	put_cpu();
	if (cpu_need_virtualize_apic_accesses(vcpu)) {
		err = alloc_apic_access_page(vcpu->kvm);
		if (err)
			goto free_vmcs;
	}

	if (enable_ept && !enable_unrestricted_guest) {
		err = init_rmode_identity_map(vcpu->kvm);
		if (err)
			goto free_vmcs;
	}

	if (nested)
		memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
	else
		memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));

	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = -1ull;

	vcpu->arch.microcode_version = 0x100000000ULL;
	vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;

	/*
	 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
	 * or POSTED_INTR_WAKEUP_VECTOR.
	 */
	vmx->pi_desc.nv = POSTED_INTR_VECTOR;
	vmx->pi_desc.sn = 1;

	vmx->ept_pointer = INVALID_PAGE;

	return 0;

free_vmcs:
	free_loaded_vmcs(vmx->loaded_vmcs);
free_pml:
	vmx_destroy_pml_buffer(vmx);
free_vpid:
	free_vpid(vmx->vpid);
	return err;
}

#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"

static int vmx_vm_init(struct kvm *kvm)
{
	spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);

	if (!ple_gap)
		kvm->arch.pause_in_guest = true;

	if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
		switch (l1tf_mitigation) {
		case L1TF_MITIGATION_OFF:
		case L1TF_MITIGATION_FLUSH_NOWARN:
			/* 'I explicitly don't care' is set */
			break;
		case L1TF_MITIGATION_FLUSH:
		case L1TF_MITIGATION_FLUSH_NOSMT:
		case L1TF_MITIGATION_FULL:
			/*
			 * Warn upon starting the first VM in a potentially
			 * insecure environment.
			 */
			if (sched_smt_active())
				pr_warn_once(L1TF_MSG_SMT);
			if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
				pr_warn_once(L1TF_MSG_L1D);
			break;
		case L1TF_MITIGATION_FULL_FORCE:
			/* Flush is enforced */
			break;
		}
	}
	kvm_apicv_init(kvm, enable_apicv);
	return 0;
}

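/*
 * Verify that this CPU supports VMX and that its VMCS configuration matches
 * the global vmcs_config computed during hardware setup; mismatches fail
 * with -EIO.
 */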
static int __init vmx_check_processor_compat(void)
{
	struct vmcs_config vmcs_conf;
	struct vmx_capability vmx_cap;

	if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
	    !this_cpu_has(X86_FEATURE_VMX)) {
		pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id());
		return -EIO;
	}

	if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
		return -EIO;
	if (nested)
		nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept);
	if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
		printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
				smp_processor_id());
		return -EIO;
	}
	return 0;
}

static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
	u8 cache;
	u64 ipat = 0;

	/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
	 * memory aliases with conflicting memory types and sometimes MCEs.
	 * We have to be careful as to what are honored and when.
	 *
	 * For MMIO, guest CD/MTRR are ignored.  The EPT memory type is set to
	 * UC.  The effective memory type is UC or WC depending on guest PAT.
	 * This was historically the source of MCEs and we want to be
	 * conservative.
	 *
	 * When there is no need to deal with noncoherent DMA (e.g., no VT-d
	 * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored.  The
	 * EPT memory type is set to WB.  The effective memory type is forced
	 * WB.
	 *
	 * Otherwise, we trust guest.  Guest CD/MTRR/PAT are all honored.  The
	 * EPT memory type is used to emulate guest CD/MTRR.
	 */

	if (is_mmio) {
		cache = MTRR_TYPE_UNCACHABLE;
		goto exit;
	}

	if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
		ipat = VMX_EPT_IPAT_BIT;
		cache = MTRR_TYPE_WRBACK;
		goto exit;
	}

	if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
		ipat = VMX_EPT_IPAT_BIT;
		if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
			cache = MTRR_TYPE_WRBACK;
		else
			cache = MTRR_TYPE_UNCACHABLE;
		goto exit;
	}

	cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);

exit:
	return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
}

static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
{
	/*
	 * These bits in the secondary execution controls field
	 * are dynamic, the others are mostly based on the hypervisor
	 * architecture and the guest's CPUID.  Do not touch the
	 * dynamic bits.
	 */
	u32 mask =
		SECONDARY_EXEC_SHADOW_VMCS |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
		SECONDARY_EXEC_DESC;

	u32 new_ctl = vmx->secondary_exec_control;
	u32 cur_ctl = secondary_exec_controls_get(vmx);

	secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
}

/*
 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
 * (indicating "allowed-1") if they are supported in the guest's CPUID.
 */
static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_cpuid_entry2 *entry;

	vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
	vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;

#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {		\
	if (entry && (entry->_reg & (_cpuid_mask)))			\
		vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);	\
} while (0)

	entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
	cr4_fixed1_update(X86_CR4_VME,        edx, feature_bit(VME));
	cr4_fixed1_update(X86_CR4_PVI,        edx, feature_bit(VME));
	cr4_fixed1_update(X86_CR4_TSD,        edx, feature_bit(TSC));
	cr4_fixed1_update(X86_CR4_DE,         edx, feature_bit(DE));
	cr4_fixed1_update(X86_CR4_PSE,        edx, feature_bit(PSE));
	cr4_fixed1_update(X86_CR4_PAE,        edx, feature_bit(PAE));
	cr4_fixed1_update(X86_CR4_MCE,        edx, feature_bit(MCE));
	cr4_fixed1_update(X86_CR4_PGE,        edx, feature_bit(PGE));
	cr4_fixed1_update(X86_CR4_OSFXSR,     edx, feature_bit(FXSR));
	cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
	cr4_fixed1_update(X86_CR4_VMXE,       ecx, feature_bit(VMX));
	cr4_fixed1_update(X86_CR4_SMXE,       ecx, feature_bit(SMX));
	cr4_fixed1_update(X86_CR4_PCIDE,      ecx, feature_bit(PCID));
	cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, feature_bit(XSAVE));

	entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
	cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, feature_bit(FSGSBASE));
	cr4_fixed1_update(X86_CR4_SMEP,       ebx, feature_bit(SMEP));
	cr4_fixed1_update(X86_CR4_SMAP,       ebx, feature_bit(SMAP));
	cr4_fixed1_update(X86_CR4_PKE,        ecx, feature_bit(PKU));
	cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
	cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));

#undef cr4_fixed1_update
}

static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (kvm_mpx_supported()) {
		bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);

		if (mpx_enabled) {
			vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
			vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
		} else {
			vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
			vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
		}
	}
}

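/*
 * Cache the guest's CPUID 0x14 (Intel PT) leaves and compute the mask of
 * RTIT_CTL bits the guest may set; any bit left in pt_desc.ctl_bitmask is
 * treated as reserved.
 */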
static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_cpuid_entry2 *best = NULL;
	int i;

	for (i = 0; i < PT_CPUID_LEAVES; i++) {
		best = kvm_find_cpuid_entry(vcpu, 0x14, i);
		if (!best)
			return;
		vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
		vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
		vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
		vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
	}

	/* Get the number of configurable Address Ranges for filtering */
	vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps,
						PT_CAP_num_address_ranges);

	/* Initialize and clear the no dependency bits */
	vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
			RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
	 * setting it will inject a #GP.
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
	 * PSBFreq can be set
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
				RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and
	 * MTCFreq can be set
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
				RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);

	/* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
							RTIT_CTL_PTW_EN);

	/* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;

	/* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;

	/* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;

	/* unmask address range configure area */
	for (i = 0; i < vmx->pt_desc.addr_range; i++)
		vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}

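/* Recompute VMX controls and MSR state that depend on the guest's CPUID. */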
static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
	vcpu->arch.xsaves_enabled = false;

	if (cpu_has_secondary_exec_ctrls()) {
		vmx_compute_secondary_exec_control(vmx);
		vmcs_set_secondary_exec_control(vmx);
	}

	if (nested_vmx_allowed(vcpu))
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
			FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
			FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
	else
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
			~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
			  FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);

	if (nested_vmx_allowed(vcpu)) {
		nested_vmx_cr_fixed1_bits_update(vcpu);
		nested_vmx_entry_exit_ctls_update(vcpu);
	}

	if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
			guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
		update_intel_pt_cfg(vcpu);

	if (boot_cpu_has(X86_FEATURE_RTM)) {
		struct shared_msr_entry *msr;
		msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL);
		if (msr) {
			bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
			vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
		}
	}
}

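/* Adjust KVM's reported CPU capabilities to reflect what VMX can expose. */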
static __init void vmx_set_cpu_caps(void)
{
	kvm_set_cpu_caps();

	/* CPUID 0x1 */
	if (nested)
		kvm_cpu_cap_set(X86_FEATURE_VMX);

	/* CPUID 0x7 */
	if (kvm_mpx_supported())
		kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
	if (cpu_has_vmx_invpcid())
		kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
	if (vmx_pt_mode_is_host_guest())
		kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);

	if (vmx_umip_emulated())
		kvm_cpu_cap_set(X86_FEATURE_UMIP);

	/* CPUID 0xD.1 */
	supported_xss = 0;
	if (!cpu_has_vmx_xsaves())
		kvm_cpu_cap_clear(X86_FEATURE_XSAVES);

	/* CPUID 0x80000001 */
	if (!cpu_has_vmx_rdtscp())
		kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);

	if (cpu_has_vmx_waitpkg())
		kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
}

static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
{
	to_vmx(vcpu)->req_immediate_exit = true;
}

static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
				  struct x86_instruction_info *info)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned short port;
	bool intercept;
	int size;

	if (info->intercept == x86_intercept_in ||
	    info->intercept == x86_intercept_ins) {
		port = info->src_val;
		size = info->dst_bytes;
	} else {
		port = info->dst_val;
		size = info->src_bytes;
	}

	/*
	 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
	 * VM-exits depend on the 'unconditional IO exiting' VM-execution
	 * control.
	 *
	 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
	 */
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		intercept = nested_cpu_has(vmcs12,
					   CPU_BASED_UNCOND_IO_EXITING);
	else
		intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);

	/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
	return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
}

static int vmx_check_intercept(struct kvm_vcpu *vcpu,
			       struct x86_instruction_info *info,
			       enum x86_intercept_stage stage,
			       struct x86_exception *exception)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	switch (info->intercept) {
	/*
	 * RDPID causes #UD if disabled through secondary execution controls.
	 * Because it is marked as EmulateOnUD, we need to intercept it here.
	 */
	case x86_intercept_rdtscp:
		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
			exception->vector = UD_VECTOR;
			exception->error_code_valid = false;
			return X86EMUL_PROPAGATE_FAULT;
		}
		break;

	case x86_intercept_in:
	case x86_intercept_ins:
	case x86_intercept_out:
	case x86_intercept_outs:
		return vmx_check_intercept_io(vcpu, info);

	case x86_intercept_lgdt:
	case x86_intercept_lidt:
	case x86_intercept_lldt:
	case x86_intercept_ltr:
	case x86_intercept_sgdt:
	case x86_intercept_sidt:
	case x86_intercept_sldt:
	case x86_intercept_str:
		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
			return X86EMUL_CONTINUE;

		/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
		break;

	/* TODO: check more intercepts... */
	default:
		break;
	}

	return X86EMUL_UNHANDLEABLE;
}

#ifdef CONFIG_X86_64
/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
static inline int u64_shl_div_u64(u64 a, unsigned int shift,
				  u64 divisor, u64 *result)
{
	u64 low = a << shift, high = a >> (64 - shift);

	/* To avoid the overflow on divq */
	if (high >= divisor)
		return 1;

	/* Low holds the result, high holds the remainder, which is discarded */
	asm("divq %2\n\t" : "=a" (low), "=d" (high) :
	    "rm" (divisor), "0" (low), "1" (high));
	*result = low;

	return 0;
}

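/*
 * Program the VMX preemption timer for the guest's TSC deadline.  Returns
 * -ERANGE when the deadline cannot be expressed in the 32-bit hardware
 * timer, in which case the caller falls back to a software timer.
 */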
static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
			    bool *expired)
{
	struct vcpu_vmx *vmx;
	u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
	struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;

	vmx = to_vmx(vcpu);
	tscl = rdtsc();
	guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
	delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
	lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
						    ktimer->timer_advance_ns);

	if (delta_tsc > lapic_timer_advance_cycles)
		delta_tsc -= lapic_timer_advance_cycles;
	else
		delta_tsc = 0;

	/* Convert to host delta tsc if tsc scaling is enabled */
	if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
	    delta_tsc && u64_shl_div_u64(delta_tsc,
				kvm_tsc_scaling_ratio_frac_bits,
				vcpu->arch.tsc_scaling_ratio, &delta_tsc))
		return -ERANGE;

	/*
	 * If the delta tsc can't fit in the 32 bit after the multi shift,
	 * we can't use the preemption timer.
	 * It's possible that it fits on later vmentries, but checking
	 * on every vmentry is costly so we just use an hrtimer.
	 */
	if (delta_tsc >> (cpu_preemption_timer_multi + 32))
		return -ERANGE;

	vmx->hv_deadline_tsc = tscl + delta_tsc;
	*expired = !delta_tsc;
	return 0;
}

static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
{
	to_vmx(vcpu)->hv_deadline_tsc = -1;
}
#endif

static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
	if (!kvm_pause_in_guest(vcpu->kvm))
		shrink_ple_window(vcpu);
}

static void vmx_slot_enable_log_dirty(struct kvm *kvm,
				     struct kvm_memory_slot *slot)
{
	if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
		kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
	kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
}

static void vmx_slot_disable_log_dirty(struct kvm *kvm,
				       struct kvm_memory_slot *slot)
{
	kvm_mmu_slot_set_dirty(kvm, slot);
}

static void vmx_flush_log_dirty(struct kvm *kvm)
{
	kvm_flush_pml_buffers(kvm);
}

static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
					   struct kvm_memory_slot *memslot,
					   gfn_t offset, unsigned long mask)
{
	kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
}

static int vmx_pre_block(struct kvm_vcpu *vcpu)
{
	if (pi_pre_block(vcpu))
		return 1;

	if (kvm_lapic_hv_timer_in_use(vcpu))
		kvm_lapic_switch_to_sw_timer(vcpu);

	return 0;
}

static void vmx_post_block(struct kvm_vcpu *vcpu)
{
	if (kvm_x86_ops.set_hv_timer)
		kvm_lapic_switch_to_hv_timer(vcpu);

	pi_post_block(vcpu);
}

static void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.mcg_cap & MCG_LMCE_P)
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
			FEAT_CTL_LMCE_ENABLED;
	else
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
			~FEAT_CTL_LMCE_ENABLED;
}

static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	/* we need a nested vmexit to enter SMM, postpone if run is pending */
	if (to_vmx(vcpu)->nested.nested_run_pending)
		return -EBUSY;
	return !is_smm(vcpu);
}

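/*
 * On SMM entry, force a vmexit out of guest mode (if active) and stash the
 * nested state so vmx_pre_leave_smm() can restore it on RSM.
 */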
static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
	if (vmx->nested.smm.guest_mode)
		nested_vmx_vmexit(vcpu, -1, 0, 0);

	vmx->nested.smm.vmxon = vmx->nested.vmxon;
	vmx->nested.vmxon = false;
	vmx_clear_hlt(vcpu);
	return 0;
}

static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int ret;

	if (vmx->nested.smm.vmxon) {
		vmx->nested.vmxon = true;
		vmx->nested.smm.vmxon = false;
	}

	if (vmx->nested.smm.guest_mode) {
		ret = nested_vmx_enter_non_root_mode(vcpu, false);
		if (ret)
			return ret;

		vmx->nested.smm.guest_mode = false;
	}
	return 0;
}

static void enable_smi_window(struct kvm_vcpu *vcpu)
{
	/* RSM will cause a vmexit anyway.  */
}

static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
	return to_vmx(vcpu)->nested.vmxon;
}

static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu)) {
		struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;

		if (hrtimer_try_to_cancel(timer) == 1)
			hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
	}
}

static void hardware_unsetup(void)
{
	if (nested)
		nested_vmx_hardware_unsetup();

	free_kvm_area();
}

static bool vmx_check_apicv_inhibit_reasons(ulong bit)
{
	ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
			  BIT(APICV_INHIBIT_REASON_HYPERV);

	return supported & BIT(bit);
}

static struct kvm_x86_ops vmx_x86_ops __initdata = {
	.hardware_unsetup = hardware_unsetup,

	.hardware_enable = hardware_enable,
	.hardware_disable = hardware_disable,
	.cpu_has_accelerated_tpr = report_flexpriority,
	.has_emulated_msr = vmx_has_emulated_msr,

	.vm_size = sizeof(struct kvm_vmx),
	.vm_init = vmx_vm_init,

	.vcpu_create = vmx_create_vcpu,
	.vcpu_free = vmx_free_vcpu,
	.vcpu_reset = vmx_vcpu_reset,

	.prepare_guest_switch = vmx_prepare_switch_to_guest,
	.vcpu_load = vmx_vcpu_load,
	.vcpu_put = vmx_vcpu_put,

	.update_exception_bitmap = update_exception_bitmap,
	.get_msr_feature = vmx_get_msr_feature,
	.get_msr = vmx_get_msr,
	.set_msr = vmx_set_msr,
	.get_segment_base = vmx_get_segment_base,
	.get_segment = vmx_get_segment,
	.set_segment = vmx_set_segment,
	.get_cpl = vmx_get_cpl,
	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
	.set_cr0 = vmx_set_cr0,
	.set_cr4 = vmx_set_cr4,
	.set_efer = vmx_set_efer,
	.get_idt = vmx_get_idt,
	.set_idt = vmx_set_idt,
	.get_gdt = vmx_get_gdt,
	.set_gdt = vmx_set_gdt,
	.set_dr7 = vmx_set_dr7,
	.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
	.cache_reg = vmx_cache_reg,
	.get_rflags = vmx_get_rflags,
	.set_rflags = vmx_set_rflags,

	.tlb_flush_all = vmx_flush_tlb_all,
	.tlb_flush_current = vmx_flush_tlb_current,
	.tlb_flush_gva = vmx_flush_tlb_gva,
	.tlb_flush_guest = vmx_flush_tlb_guest,

	.run = vmx_vcpu_run,
	.handle_exit = vmx_handle_exit,
	.skip_emulated_instruction = vmx_skip_emulated_instruction,
	.update_emulated_instruction = vmx_update_emulated_instruction,
	.set_interrupt_shadow = vmx_set_interrupt_shadow,
	.get_interrupt_shadow = vmx_get_interrupt_shadow,
	.patch_hypercall = vmx_patch_hypercall,
	.set_irq = vmx_inject_irq,
	.set_nmi = vmx_inject_nmi,
	.queue_exception = vmx_queue_exception,
	.cancel_injection = vmx_cancel_injection,
	.interrupt_allowed = vmx_interrupt_allowed,
	.nmi_allowed = vmx_nmi_allowed,
	.get_nmi_mask = vmx_get_nmi_mask,
	.set_nmi_mask = vmx_set_nmi_mask,
	.enable_nmi_window = enable_nmi_window,
	.enable_irq_window = enable_irq_window,
	.update_cr8_intercept = update_cr8_intercept,
	.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
	.load_eoi_exitmap = vmx_load_eoi_exitmap,
	.apicv_post_state_restore = vmx_apicv_post_state_restore,
	.check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
	.hwapic_irr_update = vmx_hwapic_irr_update,
	.hwapic_isr_update = vmx_hwapic_isr_update,
	.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
	.sync_pir_to_irr = vmx_sync_pir_to_irr,
	.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
	.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,

	.set_tss_addr = vmx_set_tss_addr,
	.set_identity_map_addr = vmx_set_identity_map_addr,
	.get_mt_mask = vmx_get_mt_mask,

	.get_exit_info = vmx_get_exit_info,

	.vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,

	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,

	.write_l1_tsc_offset = vmx_write_l1_tsc_offset,

	.load_mmu_pgd = vmx_load_mmu_pgd,

	.check_intercept = vmx_check_intercept,
	.handle_exit_irqoff = vmx_handle_exit_irqoff,

	.request_immediate_exit = vmx_request_immediate_exit,

	.sched_in = vmx_sched_in,

	.slot_enable_log_dirty = vmx_slot_enable_log_dirty,
	.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
	.flush_log_dirty = vmx_flush_log_dirty,
	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,

	.pre_block = vmx_pre_block,
	.post_block = vmx_post_block,

	.pmu_ops = &intel_pmu_ops,
	.nested_ops = &vmx_nested_ops,

	.update_pi_irte = pi_update_irte,

#ifdef CONFIG_X86_64
	.set_hv_timer = vmx_set_hv_timer,
	.cancel_hv_timer = vmx_cancel_hv_timer,
#endif

	.setup_mce = vmx_setup_mce,

	.smi_allowed = vmx_smi_allowed,
	.pre_enter_smm = vmx_pre_enter_smm,
	.pre_leave_smm = vmx_pre_leave_smm,
	.enable_smi_window = enable_smi_window,

	.can_emulate_instruction = vmx_can_emulate_instruction,
	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
	.migrate_timers = vmx_migrate_timers,
};

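/*
 * Module-load-time setup: probe the VMCS configuration of this CPU,
 * reconcile module parameters with hardware capabilities, and disable
 * features the hardware does not support.
 */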
static __init int hardware_setup(void)
{
	unsigned long host_bndcfgs;
	struct desc_ptr dt;
	int r, i, ept_lpage_level;

	store_idt(&dt);
	host_idt_base = dt.address;

	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
		kvm_define_user_return_msr(i, vmx_msr_index[i]);

	if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
		return -EIO;

	if (boot_cpu_has(X86_FEATURE_NX))
		kvm_enable_efer_bits(EFER_NX);

	if (boot_cpu_has(X86_FEATURE_MPX)) {
		rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
		WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
	}

	if (!cpu_has_vmx_mpx())
		supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
				    XFEATURE_MASK_BNDCSR);

	if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
	    !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
		enable_vpid = 0;

	if (!cpu_has_vmx_ept() ||
	    !cpu_has_vmx_ept_4levels() ||
	    !cpu_has_vmx_ept_mt_wb() ||
	    !cpu_has_vmx_invept_global())
		enable_ept = 0;

	if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
		enable_ept_ad_bits = 0;

	if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
		enable_unrestricted_guest = 0;

	if (!cpu_has_vmx_flexpriority())
		flexpriority_enabled = 0;

	if (!cpu_has_virtual_nmis())
		enable_vnmi = 0;

	/*
	 * set_apic_access_page_addr() is used to reload apic access
	 * page upon invalidation.  No need to do anything if not
	 * using the APIC_ACCESS_ADDR VMCS field.
	 */
	if (!flexpriority_enabled)
		vmx_x86_ops.set_apic_access_page_addr = NULL;

	if (!cpu_has_vmx_tpr_shadow())
		vmx_x86_ops.update_cr8_intercept = NULL;

#if IS_ENABLED(CONFIG_HYPERV)
	if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
	    && enable_ept) {
		vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
		vmx_x86_ops.tlb_remote_flush_with_range =
				hv_remote_flush_tlb_with_range;
	}
#endif

	if (!cpu_has_vmx_ple()) {
		ple_gap = 0;
		ple_window = 0;
		ple_window_grow = 0;
		ple_window_max = 0;
		ple_window_shrink = 0;
	}

	if (!cpu_has_vmx_apicv()) {
		enable_apicv = 0;
		vmx_x86_ops.sync_pir_to_irr = NULL;
	}

	if (cpu_has_vmx_tsc_scaling()) {
		kvm_has_tsc_control = true;
		kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
		kvm_tsc_scaling_ratio_frac_bits = 48;
	}

	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */

	if (enable_ept)
		vmx_enable_tdp();

	if (!enable_ept)
		ept_lpage_level = 0;
	else if (cpu_has_vmx_ept_1g_page())
		ept_lpage_level = PG_LEVEL_1G;
	else if (cpu_has_vmx_ept_2m_page())
		ept_lpage_level = PG_LEVEL_2M;
	else
		ept_lpage_level = PG_LEVEL_4K;
	kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);

	/*
	 * Only enable PML when hardware supports the PML feature, and both EPT
	 * and EPT A/D bit features are enabled -- PML depends on them to work.
	 */
	if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
		enable_pml = 0;

	if (!enable_pml) {
		vmx_x86_ops.slot_enable_log_dirty = NULL;
		vmx_x86_ops.slot_disable_log_dirty = NULL;
		vmx_x86_ops.flush_log_dirty = NULL;
		vmx_x86_ops.enable_log_dirty_pt_masked = NULL;
	}

	if (!cpu_has_vmx_preemption_timer())
		enable_preemption_timer = false;

	if (enable_preemption_timer) {
		u64 use_timer_freq = 5000ULL * 1000 * 1000;
		u64 vmx_msr;

		rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
		cpu_preemption_timer_multi =
			vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;

		if (tsc_khz)
			use_timer_freq = (u64)tsc_khz * 1000;
		use_timer_freq >>= cpu_preemption_timer_multi;

		/*
		 * KVM "disables" the preemption timer by setting it to its max
		 * value.  Don't use the timer if it might cause spurious exits
		 * at a rate faster than 0.1 Hz (of uninterrupted guest time).
		 */
		if (use_timer_freq > 0xffffffffu / 10)
			enable_preemption_timer = false;
	}

	if (!enable_preemption_timer) {
		vmx_x86_ops.set_hv_timer = NULL;
		vmx_x86_ops.cancel_hv_timer = NULL;
		vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
	}

	kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);

	kvm_mce_cap_supported |= MCG_LMCE_P;

	if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
		return -EINVAL;
	if (!enable_ept || !cpu_has_vmx_intel_pt())
		pt_mode = PT_MODE_SYSTEM;

	if (nested) {
		nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
					   vmx_capability.ept);

		r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
		if (r)
			return r;
	}

	vmx_set_cpu_caps();

	r = alloc_kvm_area();
	if (r)
		nested_vmx_hardware_unsetup();
	return r;
}

static struct kvm_x86_init_ops vmx_init_ops __initdata = {
	.cpu_has_kvm_support = cpu_has_kvm_support,
	.disabled_by_bios = vmx_disabled_by_bios,
	.check_processor_compatibility = vmx_check_processor_compat,
	.hardware_setup = hardware_setup,

	.runtime_ops = &vmx_x86_ops,
};

static void vmx_cleanup_l1d_flush(void)
{
	if (vmx_l1d_flush_pages) {
		free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
		vmx_l1d_flush_pages = NULL;
	}
	/* Restore state so sysfs ignores VMX */
	l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}

static void vmx_exit(void)
{
#ifdef CONFIG_KEXEC_CORE
	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
	synchronize_rcu();
#endif

	kvm_exit();

#if IS_ENABLED(CONFIG_HYPERV)
	if (static_branch_unlikely(&enable_evmcs)) {
		int cpu;
		struct hv_vp_assist_page *vp_ap;
		/*
		 * Reset everything to support using non-enlightened VMCS
		 * access later (e.g. when we reload the module with
		 * enlightened_vmcs=0)
		 */
		for_each_online_cpu(cpu) {
			vp_ap =	hv_get_vp_assist_page(cpu);

			if (!vp_ap)
				continue;

			vp_ap->nested_control.features.directhypercall = 0;
			vp_ap->current_nested_vmcs = 0;
			vp_ap->enlighten_vmentry = 0;
		}

		static_branch_disable(&enable_evmcs);
	}
#endif
	vmx_cleanup_l1d_flush();
}
module_exit(vmx_exit);

static int __init vmx_init(void)
{
	int r, cpu;

#if IS_ENABLED(CONFIG_HYPERV)
	/*
	 * Enlightened VMCS usage should be recommended and the host needs
	 * to support eVMCS v1 or above. We can also disable eVMCS support
	 * with the module parameter.
	 */
	if (enlightened_vmcs &&
	    ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
	    (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
	    KVM_EVMCS_VERSION) {
		int cpu;

		/* Check that we have assist pages on all online CPUs */
		for_each_online_cpu(cpu) {
			if (!hv_get_vp_assist_page(cpu)) {
				enlightened_vmcs = false;
				break;
			}
		}

		if (enlightened_vmcs) {
			pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
			static_branch_enable(&enable_evmcs);
		}

		if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
			vmx_x86_ops.enable_direct_tlbflush
				= hv_enable_direct_tlbflush;

	} else {
		enlightened_vmcs = false;
	}
#endif

	r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
		     __alignof__(struct vcpu_vmx), THIS_MODULE);
	if (r)
		return r;

	/*
	 * Must be called after kvm_init() so enable_ept is properly set
	 * up. Hand the parameter mitigation value in which was stored in
	 * the pre module init parser. If no parameter was given, it will
	 * contain 'auto' which will be turned into the default 'cond'
	 * mitigation mode.
	 */
	r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
	if (r) {
		vmx_exit();
		return r;
	}

	for_each_possible_cpu(cpu) {
		INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));

		pi_init(cpu);
	}

#ifdef CONFIG_KEXEC_CORE
	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
			   crash_vmclear_local_loaded_vmcss);
#endif
	vmx_check_vmcs12_offsets();

	/*
	 * Intel processors don't have problems with
	 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable
	 * it for VMX by default
	 */
	allow_smaller_maxphyaddr = true;

	return 0;
}
module_init(vmx_init);