/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "irq.h"
#include "mmu.h"
#include "cpuid.h"
#include "lapic.h"

#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/hrtimer.h>
#include <linux/frame.h>
#include "kvm_cache_regs.h"
#include "x86.h"

#include <asm/cpu.h>
#include <asm/io.h>
#include <asm/desc.h>
#include <asm/vmx.h>
#include <asm/virtext.h>
#include <asm/mce.h>
#include <asm/fpu/internal.h>
#include <asm/perf_event.h>
#include <asm/debugreg.h>
#include <asm/kexec.h>
#include <asm/apic.h>
#include <asm/irq_remapping.h>
#include <asm/mmu_context.h>

#include "trace.h"
#include "pmu.h"

#define __ex(x) __kvm_handle_fault_on_reboot(x)
#define __ex_clear(x, reg) \
	____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_FEATURE_MATCH(X86_FEATURE_VMX),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);

static bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly enable_vnmi = 1;
module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);

static bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);

static bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);

static bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, S_IRUGO);

static bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);

static bool __read_mostly enable_apicv = 1;
module_param(enable_apicv, bool, S_IRUGO);

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
 * use VMX instructions.
 */
static bool __read_mostly nested = 0;
module_param(nested, bool, S_IRUGO);

static u64 __read_mostly host_xss;

static bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);

#define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL

/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif

#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON						\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS				      \
	(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
	 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)

#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

/*
 * These 2 parameters are used to configure the controls for Pause-Loop
 * Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicates whether PLE is
 *             enabled.
 *             According to tests, this time is usually smaller than 128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles
 * Time is measured based on a counter that runs at the same rate as the TSC;
 * refer to SDM volume 3b, sections 21.6.13 and 22.1.3.
 */
#define KVM_VMX_DEFAULT_PLE_GAP           128
#define KVM_VMX_DEFAULT_PLE_WINDOW        4096
#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW   2
#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0
#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX    \
		INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW

static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
module_param(ple_gap, int, S_IRUGO);

static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, int, S_IRUGO);

/* Default doubles per-vcpu window every exit. */
static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, int, S_IRUGO);

/* Default resets per-vcpu window every exit to ple_window. */
static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, int, S_IRUGO);

/* Default is to compute the maximum so we can never overflow. */
static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
static int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, int, S_IRUGO);

extern const ulong vmx_return;

#define NR_AUTOLOAD_MSRS 8
#define VMCS02_POOL_SIZE 1

struct vmcs {
	u32 revision_id;
	u32 abort;
	char data[0];
};

/*
 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
 * loaded on this CPU (so we can clear them if the CPU goes down).
 */
struct loaded_vmcs {
	struct vmcs *vmcs;
	struct vmcs *shadow_vmcs;
	int cpu;
	bool launched;
	bool nmi_known_unmasked;
	unsigned long vmcs_host_cr3;	/* May not match real cr3 */
	unsigned long vmcs_host_cr4;	/* May not match real cr4 */
	/* Support for vnmi-less CPUs */
	int soft_vnmi_blocked;
	ktime_t entry_time;
	s64 vnmi_blocked_time;
	struct list_head loaded_vmcss_on_cpu_link;
};

struct shared_msr_entry {
	unsigned index;
	u64 data;
	u64 mask;
};

/*
 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
 * More than one of these structures may exist, if L1 runs multiple L2 guests.
 * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
 * underlying hardware which will be used to run L2.
 * This structure is packed to ensure that its layout is identical across
 * machines (necessary for live migration).
 * If there are changes in this struct, VMCS12_REVISION must be changed.
 */
typedef u64 natural_width;
struct __packed vmcs12 {
	/* According to the Intel spec, a VMCS region must start with the
	 * following two fields. Then follow implementation-specific data.
	 */
	u32 revision_id;
	u32 abort;

	u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
	u32 padding[7]; /* room for future expansion */

	u64 io_bitmap_a;
	u64 io_bitmap_b;
	u64 msr_bitmap;
	u64 vm_exit_msr_store_addr;
	u64 vm_exit_msr_load_addr;
	u64 vm_entry_msr_load_addr;
	u64 tsc_offset;
	u64 virtual_apic_page_addr;
	u64 apic_access_addr;
	u64 posted_intr_desc_addr;
	u64 vm_function_control;
	u64 ept_pointer;
	u64 eoi_exit_bitmap0;
	u64 eoi_exit_bitmap1;
	u64 eoi_exit_bitmap2;
	u64 eoi_exit_bitmap3;
	u64 eptp_list_address;
	u64 xss_exit_bitmap;
	u64 guest_physical_address;
	u64 vmcs_link_pointer;
	u64 pml_address;
	u64 guest_ia32_debugctl;
	u64 guest_ia32_pat;
	u64 guest_ia32_efer;
	u64 guest_ia32_perf_global_ctrl;
	u64 guest_pdptr0;
	u64 guest_pdptr1;
	u64 guest_pdptr2;
	u64 guest_pdptr3;
	u64 guest_bndcfgs;
	u64 host_ia32_pat;
	u64 host_ia32_efer;
	u64 host_ia32_perf_global_ctrl;
	u64 padding64[8]; /* room for future expansion */
	/*
	 * To allow migration of L1 (complete with its L2 guests) between
	 * machines of different natural widths (32 or 64 bit), we cannot have
	 * unsigned long fields with no explicit size. We use u64 (aliased
	 * natural_width) instead. Luckily, x86 is little-endian.
	 */
	natural_width cr0_guest_host_mask;
	natural_width cr4_guest_host_mask;
	natural_width cr0_read_shadow;
	natural_width cr4_read_shadow;
	natural_width cr3_target_value0;
	natural_width cr3_target_value1;
	natural_width cr3_target_value2;
	natural_width cr3_target_value3;
	natural_width exit_qualification;
	natural_width guest_linear_address;
	natural_width guest_cr0;
	natural_width guest_cr3;
	natural_width guest_cr4;
	natural_width guest_es_base;
	natural_width guest_cs_base;
	natural_width guest_ss_base;
	natural_width guest_ds_base;
	natural_width guest_fs_base;
	natural_width guest_gs_base;
	natural_width guest_ldtr_base;
	natural_width guest_tr_base;
	natural_width guest_gdtr_base;
	natural_width guest_idtr_base;
	natural_width guest_dr7;
	natural_width guest_rsp;
	natural_width guest_rip;
	natural_width guest_rflags;
	natural_width guest_pending_dbg_exceptions;
	natural_width guest_sysenter_esp;
	natural_width guest_sysenter_eip;
	natural_width host_cr0;
	natural_width host_cr3;
	natural_width host_cr4;
	natural_width host_fs_base;
	natural_width host_gs_base;
	natural_width host_tr_base;
	natural_width host_gdtr_base;
	natural_width host_idtr_base;
	natural_width host_ia32_sysenter_esp;
	natural_width host_ia32_sysenter_eip;
	natural_width host_rsp;
	natural_width host_rip;
	natural_width paddingl[8]; /* room for future expansion */
	u32 pin_based_vm_exec_control;
	u32 cpu_based_vm_exec_control;
	u32 exception_bitmap;
	u32 page_fault_error_code_mask;
	u32 page_fault_error_code_match;
	u32 cr3_target_count;
	u32 vm_exit_controls;
	u32 vm_exit_msr_store_count;
	u32 vm_exit_msr_load_count;
	u32 vm_entry_controls;
	u32 vm_entry_msr_load_count;
	u32 vm_entry_intr_info_field;
	u32 vm_entry_exception_error_code;
	u32 vm_entry_instruction_len;
	u32 tpr_threshold;
	u32 secondary_vm_exec_control;
	u32 vm_instruction_error;
	u32 vm_exit_reason;
	u32 vm_exit_intr_info;
	u32 vm_exit_intr_error_code;
	u32 idt_vectoring_info_field;
	u32 idt_vectoring_error_code;
	u32 vm_exit_instruction_len;
	u32 vmx_instruction_info;
	u32 guest_es_limit;
	u32 guest_cs_limit;
	u32 guest_ss_limit;
	u32 guest_ds_limit;
	u32 guest_fs_limit;
	u32 guest_gs_limit;
	u32 guest_ldtr_limit;
	u32 guest_tr_limit;
	u32 guest_gdtr_limit;
	u32 guest_idtr_limit;
	u32 guest_es_ar_bytes;
	u32 guest_cs_ar_bytes;
	u32 guest_ss_ar_bytes;
	u32 guest_ds_ar_bytes;
	u32 guest_fs_ar_bytes;
	u32 guest_gs_ar_bytes;
	u32 guest_ldtr_ar_bytes;
	u32 guest_tr_ar_bytes;
	u32 guest_interruptibility_info;
	u32 guest_activity_state;
	u32 guest_sysenter_cs;
	u32 host_ia32_sysenter_cs;
	u32 vmx_preemption_timer_value;
	u32 padding32[7]; /* room for future expansion */
	u16 virtual_processor_id;
	u16 posted_intr_nv;
	u16 guest_es_selector;
	u16 guest_cs_selector;
	u16 guest_ss_selector;
	u16 guest_ds_selector;
	u16 guest_fs_selector;
	u16 guest_gs_selector;
	u16 guest_ldtr_selector;
	u16 guest_tr_selector;
	u16 guest_intr_status;
	u16 guest_pml_index;
	u16 host_es_selector;
	u16 host_cs_selector;
	u16 host_ss_selector;
	u16 host_ds_selector;
	u16 host_fs_selector;
	u16 host_gs_selector;
	u16 host_tr_selector;
};

/*
 * VMCS12_REVISION is an arbitrary id that should be changed if the content or
 * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
 */
#define VMCS12_REVISION 0x11e57ed0

/*
 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
 * and any VMCS region. Although only sizeof(struct vmcs12) are used by the
 * current implementation, 4K are reserved to avoid future complications.
 */
#define VMCS12_SIZE 0x1000

/* Used to remember the last vmcs02 used for some recently used vmcs12s */
struct vmcs02_list {
	struct list_head list;
	gpa_t vmptr;
	struct loaded_vmcs vmcs02;
};

/*
 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
 */
struct nested_vmx {
	/* Has the level1 guest done vmxon? */
	bool vmxon;
	gpa_t vmxon_ptr;
	bool pml_full;

	/* The guest-physical address of the current VMCS L1 keeps for L2 */
	gpa_t current_vmptr;
	/*
	 * Cache of the guest's VMCS, existing outside of guest memory.
	 * Loaded from guest memory during VMPTRLD. Flushed to guest
	 * memory during VMCLEAR and VMPTRLD.
	 */
	struct vmcs12 *cached_vmcs12;
	/*
	 * Indicates if the shadow vmcs must be updated with the
	 * data held by vmcs12.
	 */
	bool sync_shadow_vmcs;

	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
	struct list_head vmcs02_pool;
	int vmcs02_num;
	bool change_vmcs01_virtual_x2apic_mode;
	/* L2 must run next, and mustn't decide to exit to L1. */
	bool nested_run_pending;
	/*
	 * Guest pages referred to in vmcs02 with host-physical pointers, so
	 * we must keep them pinned while L2 runs.
	 */
	struct page *apic_access_page;
	struct page *virtual_apic_page;
	struct page *pi_desc_page;
	struct pi_desc *pi_desc;
	bool pi_pending;
	u16 posted_intr_nv;

	unsigned long *msr_bitmap;

	struct hrtimer preemption_timer;
	bool preemption_timer_expired;

	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
	u64 vmcs01_debugctl;

	u16 vpid02;
	u16 last_vpid;

	/*
	 * We only store the "true" versions of the VMX capability MSRs. We
	 * generate the "non-true" versions by setting the must-be-1 bits
	 * according to the SDM.
	 */
	u32 nested_vmx_procbased_ctls_low;
	u32 nested_vmx_procbased_ctls_high;
	u32 nested_vmx_secondary_ctls_low;
	u32 nested_vmx_secondary_ctls_high;
	u32 nested_vmx_pinbased_ctls_low;
	u32 nested_vmx_pinbased_ctls_high;
	u32 nested_vmx_exit_ctls_low;
	u32 nested_vmx_exit_ctls_high;
	u32 nested_vmx_entry_ctls_low;
	u32 nested_vmx_entry_ctls_high;
	u32 nested_vmx_misc_low;
	u32 nested_vmx_misc_high;
	u32 nested_vmx_ept_caps;
	u32 nested_vmx_vpid_caps;
	u64 nested_vmx_basic;
	u64 nested_vmx_cr0_fixed0;
	u64 nested_vmx_cr0_fixed1;
	u64 nested_vmx_cr4_fixed0;
	u64 nested_vmx_cr4_fixed1;
	u64 nested_vmx_vmcs_enum;
	u64 nested_vmx_vmfunc_controls;

	/* SMM related state */
	struct {
		/* in VMX operation on SMM entry? */
		bool vmxon;
		/* in guest mode on SMM entry? */
		bool guest_mode;
	} smm;
};

#define POSTED_INTR_ON  0
#define POSTED_INTR_SN  1

/* Posted-Interrupt Descriptor */
struct pi_desc {
	u32 pir[8];     /* Posted interrupt requested */
	union {
		struct {
				/* bit 256 - Outstanding Notification */
			u16	on	: 1,
				/* bit 257 - Suppress Notification */
				sn	: 1,
				/* bit 271:258 - Reserved */
				rsvd_1	: 14;
				/* bit 279:272 - Notification Vector */
			u8	nv;
				/* bit 287:280 - Reserved */
			u8	rsvd_2;
				/* bit 319:288 - Notification Destination */
			u32	ndst;
		};
		u64 control;
	};
	u32 rsvd[6];
} __aligned(64);
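/*
 * The 64-byte alignment appears to match the hardware requirement that
 * bits 5:0 of the posted-interrupt descriptor address programmed into the
 * VMCS be zero, i.e. the descriptor itself must be 64-byte aligned.
 */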

static bool pi_test_and_set_on(struct pi_desc *pi_desc)
{
	return test_and_set_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
{
	return test_and_clear_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}

static inline void pi_clear_sn(struct pi_desc *pi_desc)
{
	return clear_bit(POSTED_INTR_SN,
			(unsigned long *)&pi_desc->control);
}

static inline void pi_set_sn(struct pi_desc *pi_desc)
{
	return set_bit(POSTED_INTR_SN,
			(unsigned long *)&pi_desc->control);
}

static inline void pi_clear_on(struct pi_desc *pi_desc)
{
	clear_bit(POSTED_INTR_ON,
  		  (unsigned long *)&pi_desc->control);
}

static inline int pi_test_on(struct pi_desc *pi_desc)
{
	return test_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static inline int pi_test_sn(struct pi_desc *pi_desc)
{
	return test_bit(POSTED_INTR_SN,
			(unsigned long *)&pi_desc->control);
}
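/*
 * Note: the helpers above use atomic bitops on pi_desc->control and
 * pi_desc->pir rather than plain loads and stores, presumably because the
 * CPU may update the descriptor concurrently while posting interrupts.
 */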

struct vcpu_vmx {
	struct kvm_vcpu       vcpu;
	unsigned long         host_rsp;
	u8                    fail;
	u32                   exit_intr_info;
	u32                   idt_vectoring_info;
	ulong                 rflags;
	struct shared_msr_entry *guest_msrs;
	int                   nmsrs;
	int                   save_nmsrs;
	unsigned long	      host_idt_base;
#ifdef CONFIG_X86_64
	u64 		      msr_host_kernel_gs_base;
	u64 		      msr_guest_kernel_gs_base;
#endif
	u32 vm_entry_controls_shadow;
	u32 vm_exit_controls_shadow;
	u32 secondary_exec_control;

	/*
	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
	 * non-nested (L1) guest, it always points to vmcs01. For a nested
	 * guest (L2), it points to a different VMCS.
	 */
	struct loaded_vmcs    vmcs01;
	struct loaded_vmcs   *loaded_vmcs;
	bool                  __launched; /* temporary, used in vmx_vcpu_run */
	struct msr_autoload {
		unsigned nr;
		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
		struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
	} msr_autoload;
	struct {
		int           loaded;
		u16           fs_sel, gs_sel, ldt_sel;
#ifdef CONFIG_X86_64
		u16           ds_sel, es_sel;
#endif
		int           gs_ldt_reload_needed;
		int           fs_reload_needed;
		u64           msr_host_bndcfgs;
	} host_state;
	struct {
		int vm86_active;
		ulong save_rflags;
		struct kvm_segment segs[8];
	} rmode;
	struct {
		u32 bitmask; /* 4 bits per segment (1 bit per field) */
		struct kvm_save_segment {
			u16 selector;
			unsigned long base;
			u32 limit;
			u32 ar;
		} seg[8];
	} segment_cache;
	int vpid;
	bool emulation_required;

	u32 exit_reason;

	/* Posted interrupt descriptor */
	struct pi_desc pi_desc;

	/* Support for a guest hypervisor (nested VMX) */
	struct nested_vmx nested;

	/* Dynamic PLE window. */
	int ple_window;
	bool ple_window_dirty;

	/* Support for PML */
#define PML_ENTITY_NUM		512
	struct page *pml_pg;

	/* apic deadline value in host tsc */
	u64 hv_deadline_tsc;

	u64 current_tsc_ratio;

	u32 host_pkru;

	/*
	 * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
	 * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
	 * in msr_ia32_feature_control_valid_bits.
	 */
	u64 msr_ia32_feature_control;
	u64 msr_ia32_feature_control_valid_bits;
};

enum segment_cache_field {
	SEG_FIELD_SEL = 0,
	SEG_FIELD_BASE = 1,
	SEG_FIELD_LIMIT = 2,
	SEG_FIELD_AR = 3,

	SEG_FIELD_NR = 4
};

static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
	return container_of(vcpu, struct vcpu_vmx, vcpu);
}

static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{
	return &(to_vmx(vcpu)->pi_desc);
}

#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
#define FIELD(number, name)	[number] = VMCS12_OFFSET(name)
#define FIELD64(number, name)	[number] = VMCS12_OFFSET(name), \
				[number##_HIGH] = VMCS12_OFFSET(name)+4
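/*
 * Illustrative expansion (sketch, assuming the *_HIGH field encodings from
 * asm/vmx.h): FIELD64(TSC_OFFSET, tsc_offset) produces two table entries,
 *
 *	[TSC_OFFSET]      = VMCS12_OFFSET(tsc_offset),
 *	[TSC_OFFSET_HIGH] = VMCS12_OFFSET(tsc_offset) + 4,
 *
 * so either half of the 64-bit field can be located in struct vmcs12.
 */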


static unsigned long shadow_read_only_fields[] = {
	/*
	 * We do NOT shadow fields that are modified when L0
	 * traps and emulates any vmx instruction (e.g. VMPTRLD,
	 * VMXON...) executed by L1.
	 * For example, VM_INSTRUCTION_ERROR is read
	 * by L1 if a vmx instruction fails (part of the error path).
	 * Note the code assumes this logic. If for some reason
	 * we start shadowing these fields then we need to
	 * force a shadow sync when L0 emulates vmx instructions
	 * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
	 * by nested_vmx_failValid)
	 */
	VM_EXIT_REASON,
	VM_EXIT_INTR_INFO,
	VM_EXIT_INSTRUCTION_LEN,
	IDT_VECTORING_INFO_FIELD,
	IDT_VECTORING_ERROR_CODE,
	VM_EXIT_INTR_ERROR_CODE,
	EXIT_QUALIFICATION,
	GUEST_LINEAR_ADDRESS,
	GUEST_PHYSICAL_ADDRESS
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static unsigned long shadow_read_write_fields[] = {
	TPR_THRESHOLD,
	GUEST_RIP,
	GUEST_RSP,
	GUEST_CR0,
	GUEST_CR3,
	GUEST_CR4,
	GUEST_INTERRUPTIBILITY_INFO,
	GUEST_RFLAGS,
	GUEST_CS_SELECTOR,
	GUEST_CS_AR_BYTES,
	GUEST_CS_LIMIT,
	GUEST_CS_BASE,
	GUEST_ES_BASE,
	GUEST_BNDCFGS,
	CR0_GUEST_HOST_MASK,
	CR0_READ_SHADOW,
	CR4_READ_SHADOW,
	TSC_OFFSET,
	EXCEPTION_BITMAP,
	CPU_BASED_VM_EXEC_CONTROL,
	VM_ENTRY_EXCEPTION_ERROR_CODE,
	VM_ENTRY_INTR_INFO_FIELD,
	VM_ENTRY_INSTRUCTION_LEN,
	VM_ENTRY_EXCEPTION_ERROR_CODE,
	HOST_FS_BASE,
	HOST_GS_BASE,
	HOST_FS_SELECTOR,
	HOST_GS_SELECTOR
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static const unsigned short vmcs_field_to_offset_table[] = {
	FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
	FIELD(POSTED_INTR_NV, posted_intr_nv),
	FIELD(GUEST_ES_SELECTOR, guest_es_selector),
	FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
	FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
	FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
	FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
	FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
	FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
	FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
	FIELD(GUEST_INTR_STATUS, guest_intr_status),
	FIELD(GUEST_PML_INDEX, guest_pml_index),
	FIELD(HOST_ES_SELECTOR, host_es_selector),
	FIELD(HOST_CS_SELECTOR, host_cs_selector),
	FIELD(HOST_SS_SELECTOR, host_ss_selector),
	FIELD(HOST_DS_SELECTOR, host_ds_selector),
	FIELD(HOST_FS_SELECTOR, host_fs_selector),
	FIELD(HOST_GS_SELECTOR, host_gs_selector),
	FIELD(HOST_TR_SELECTOR, host_tr_selector),
	FIELD64(IO_BITMAP_A, io_bitmap_a),
	FIELD64(IO_BITMAP_B, io_bitmap_b),
	FIELD64(MSR_BITMAP, msr_bitmap),
	FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
	FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
	FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
	FIELD64(TSC_OFFSET, tsc_offset),
	FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
	FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
	FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
	FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
	FIELD64(EPT_POINTER, ept_pointer),
	FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
	FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
	FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
	FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
	FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
	FIELD64(PML_ADDRESS, pml_address),
	FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
	FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
	FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
	FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
	FIELD64(GUEST_PDPTR0, guest_pdptr0),
	FIELD64(GUEST_PDPTR1, guest_pdptr1),
	FIELD64(GUEST_PDPTR2, guest_pdptr2),
	FIELD64(GUEST_PDPTR3, guest_pdptr3),
	FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
	FIELD64(HOST_IA32_PAT, host_ia32_pat),
	FIELD64(HOST_IA32_EFER, host_ia32_efer),
	FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
	FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
	FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
	FIELD(EXCEPTION_BITMAP, exception_bitmap),
	FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
	FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
	FIELD(CR3_TARGET_COUNT, cr3_target_count),
	FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
	FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
	FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
	FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
	FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
	FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
	FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
	FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
	FIELD(TPR_THRESHOLD, tpr_threshold),
	FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
	FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
	FIELD(VM_EXIT_REASON, vm_exit_reason),
	FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
	FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
	FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
	FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
	FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
	FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
	FIELD(GUEST_ES_LIMIT, guest_es_limit),
	FIELD(GUEST_CS_LIMIT, guest_cs_limit),
	FIELD(GUEST_SS_LIMIT, guest_ss_limit),
	FIELD(GUEST_DS_LIMIT, guest_ds_limit),
	FIELD(GUEST_FS_LIMIT, guest_fs_limit),
	FIELD(GUEST_GS_LIMIT, guest_gs_limit),
	FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
	FIELD(GUEST_TR_LIMIT, guest_tr_limit),
	FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
	FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
	FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
	FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
	FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
	FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
	FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
	FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
	FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
	FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
	FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
	FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
	FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
	FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
	FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
	FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
	FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
	FIELD(CR0_READ_SHADOW, cr0_read_shadow),
	FIELD(CR4_READ_SHADOW, cr4_read_shadow),
	FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
	FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
	FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
	FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
	FIELD(EXIT_QUALIFICATION, exit_qualification),
	FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
	FIELD(GUEST_CR0, guest_cr0),
	FIELD(GUEST_CR3, guest_cr3),
	FIELD(GUEST_CR4, guest_cr4),
	FIELD(GUEST_ES_BASE, guest_es_base),
	FIELD(GUEST_CS_BASE, guest_cs_base),
	FIELD(GUEST_SS_BASE, guest_ss_base),
	FIELD(GUEST_DS_BASE, guest_ds_base),
	FIELD(GUEST_FS_BASE, guest_fs_base),
	FIELD(GUEST_GS_BASE, guest_gs_base),
	FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
	FIELD(GUEST_TR_BASE, guest_tr_base),
	FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
	FIELD(GUEST_IDTR_BASE, guest_idtr_base),
	FIELD(GUEST_DR7, guest_dr7),
	FIELD(GUEST_RSP, guest_rsp),
	FIELD(GUEST_RIP, guest_rip),
	FIELD(GUEST_RFLAGS, guest_rflags),
	FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
	FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
	FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
	FIELD(HOST_CR0, host_cr0),
	FIELD(HOST_CR3, host_cr3),
	FIELD(HOST_CR4, host_cr4),
	FIELD(HOST_FS_BASE, host_fs_base),
	FIELD(HOST_GS_BASE, host_gs_base),
	FIELD(HOST_TR_BASE, host_tr_base),
	FIELD(HOST_GDTR_BASE, host_gdtr_base),
	FIELD(HOST_IDTR_BASE, host_idtr_base),
	FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
	FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
	FIELD(HOST_RSP, host_rsp),
	FIELD(HOST_RIP, host_rip),
};

static inline short vmcs_field_to_offset(unsigned long field)
{
	BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);

	if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
	    vmcs_field_to_offset_table[field] == 0)
		return -ENOENT;

	return vmcs_field_to_offset_table[field];
}
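/*
 * Illustrative example (not part of the original table): with the entries
 * above, vmcs_field_to_offset(GUEST_CS_SELECTOR) is expected to return
 * offsetof(struct vmcs12, guest_cs_selector), while an encoding with no
 * backing vmcs12 member returns -ENOENT.
 */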

static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
	return to_vmx(vcpu)->nested.cached_vmcs12;
}

static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
static bool vmx_xsaves_supported(void);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg);
static void vmx_get_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg);
static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code);

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked list of VMCSs loaded on that CPU. This is needed
 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);

/*
 * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
 * can find which vCPU should be woken up.
 */
static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);

enum {
	VMX_IO_BITMAP_A,
	VMX_IO_BITMAP_B,
	VMX_MSR_BITMAP_LEGACY,
	VMX_MSR_BITMAP_LONGMODE,
	VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
	VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV,
	VMX_MSR_BITMAP_LEGACY_X2APIC,
	VMX_MSR_BITMAP_LONGMODE_X2APIC,
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};

static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_io_bitmap_a                      (vmx_bitmap[VMX_IO_BITMAP_A])
#define vmx_io_bitmap_b                      (vmx_bitmap[VMX_IO_BITMAP_B])
#define vmx_msr_bitmap_legacy                (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
#define vmx_msr_bitmap_longmode              (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
#define vmx_msr_bitmap_legacy_x2apic_apicv   (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV])
#define vmx_msr_bitmap_legacy_x2apic         (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC])
#define vmx_msr_bitmap_longmode_x2apic       (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC])
#define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])

static bool cpu_has_load_ia32_efer;
static bool cpu_has_load_perf_global_ctrl;

static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

static struct vmcs_config {
	int size;
	int order;
	u32 basic_cap;
	u32 revision_id;
	u32 pin_based_exec_ctrl;
	u32 cpu_based_exec_ctrl;
	u32 cpu_based_2nd_exec_ctrl;
	u32 vmexit_ctrl;
	u32 vmentry_ctrl;
} vmcs_config;

static struct vmx_capability {
	u32 ept;
	u32 vpid;
} vmx_capability;

#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {                                   \
		.selector = GUEST_##seg##_SELECTOR,		\
		.base = GUEST_##seg##_BASE,		   	\
		.limit = GUEST_##seg##_LIMIT,		   	\
		.ar_bytes = GUEST_##seg##_AR_BYTES,	   	\
	}

static const struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};

static u64 host_efer;

static void ept_save_pdptrs(struct kvm_vcpu *vcpu);

/*
 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
 * away by decrementing the array size.
 */
static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
	MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};

static inline bool is_exception_n(u32 intr_info, u8 vector)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
}

static inline bool is_debug(u32 intr_info)
{
	return is_exception_n(intr_info, DB_VECTOR);
}

static inline bool is_breakpoint(u32 intr_info)
{
	return is_exception_n(intr_info, BP_VECTOR);
}

static inline bool is_page_fault(u32 intr_info)
{
	return is_exception_n(intr_info, PF_VECTOR);
}

static inline bool is_no_device(u32 intr_info)
{
	return is_exception_n(intr_info, NM_VECTOR);
}

static inline bool is_invalid_opcode(u32 intr_info)
{
	return is_exception_n(intr_info, UD_VECTOR);
}

static inline bool is_external_interrupt(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
		== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
}

static inline bool is_machine_check(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
}

static inline bool cpu_has_vmx_msr_bitmap(void)
{
	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
}

static inline bool cpu_has_vmx_tpr_shadow(void)
{
	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
}

static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
{
	return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
}

static inline bool cpu_has_secondary_exec_ctrls(void)
{
	return vmcs_config.cpu_based_exec_ctrl &
		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
}

static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
}

static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
}

static inline bool cpu_has_vmx_apic_register_virt(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_APIC_REGISTER_VIRT;
}

static inline bool cpu_has_vmx_virtual_intr_delivery(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
}

/*
 * Comment format: document - errata name - stepping - processor name.
 * Taken from
 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
 */
static u32 vmx_preemption_cpu_tfms[] = {
/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
0x000206E6,
/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020652,
/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020655,
/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
/*
 * 320767.pdf - AAP86  - B1 -
 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
 */
0x000106E5,
/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
0x000106A0,
/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
0x000106A1,
/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
0x000106A4,
 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
0x000106A5,
};
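/*
 * The values above are CPUID.01H:EAX family/model/stepping signatures; the
 * helper below masks off what appear to be the reserved bits (15:14 and
 * 31:28) of the local CPU's signature before comparing against this list.
 */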

static inline bool cpu_has_broken_vmx_preemption_timer(void)
{
	u32 eax = cpuid_eax(0x00000001), i;

	/* Clear the reserved bits */
	eax &= ~(0x3U << 14 | 0xfU << 28);
	for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
		if (eax == vmx_preemption_cpu_tfms[i])
			return true;

	return false;
}

static inline bool cpu_has_vmx_preemption_timer(void)
{
	return vmcs_config.pin_based_exec_ctrl &
		PIN_BASED_VMX_PREEMPTION_TIMER;
}

static inline bool cpu_has_vmx_posted_intr(void)
{
	return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
		vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
}

static inline bool cpu_has_vmx_apicv(void)
{
	return cpu_has_vmx_apic_register_virt() &&
		cpu_has_vmx_virtual_intr_delivery() &&
		cpu_has_vmx_posted_intr();
}

static inline bool cpu_has_vmx_flexpriority(void)
{
	return cpu_has_vmx_tpr_shadow() &&
		cpu_has_vmx_virtualize_apic_accesses();
}

static inline bool cpu_has_vmx_ept_execute_only(void)
{
	return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
}

static inline bool cpu_has_vmx_ept_2m_page(void)
{
	return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
}

static inline bool cpu_has_vmx_ept_1g_page(void)
{
	return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
}

static inline bool cpu_has_vmx_ept_4levels(void)
{
	return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
}

static inline bool cpu_has_vmx_ept_mt_wb(void)
{
	return vmx_capability.ept & VMX_EPTP_WB_BIT;
}

static inline bool cpu_has_vmx_ept_5levels(void)
{
	return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
}

static inline bool cpu_has_vmx_ept_ad_bits(void)
{
	return vmx_capability.ept & VMX_EPT_AD_BIT;
}

static inline bool cpu_has_vmx_invept_context(void)
{
	return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
}

static inline bool cpu_has_vmx_invept_global(void)
{
	return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
}

static inline bool cpu_has_vmx_invvpid_single(void)
{
	return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
}

static inline bool cpu_has_vmx_invvpid_global(void)
{
	return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
}

static inline bool cpu_has_vmx_invvpid(void)
{
	return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
}

static inline bool cpu_has_vmx_ept(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_ENABLE_EPT;
}

static inline bool cpu_has_vmx_unrestricted_guest(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_UNRESTRICTED_GUEST;
}

static inline bool cpu_has_vmx_ple(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}

static inline bool cpu_has_vmx_basic_inout(void)
{
	return	(((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
}

static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
	return flexpriority_enabled && lapic_in_kernel(vcpu);
}

static inline bool cpu_has_vmx_vpid(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_ENABLE_VPID;
}

static inline bool cpu_has_vmx_rdtscp(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_RDTSCP;
}

static inline bool cpu_has_vmx_invpcid(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_ENABLE_INVPCID;
}

static inline bool cpu_has_virtual_nmis(void)
{
	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}

static inline bool cpu_has_vmx_wbinvd_exit(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_WBINVD_EXITING;
}

static inline bool cpu_has_vmx_shadow_vmcs(void)
{
	u64 vmx_msr;
	rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
	/* check if the cpu supports writing r/o exit information fields */
	if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
		return false;

	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_SHADOW_VMCS;
}

static inline bool cpu_has_vmx_pml(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
}

static inline bool cpu_has_vmx_tsc_scaling(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_TSC_SCALING;
}

static inline bool cpu_has_vmx_vmfunc(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_ENABLE_VMFUNC;
}

static inline bool report_flexpriority(void)
{
	return flexpriority_enabled;
}

static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
{
	return vmx_misc_cr3_count(to_vmx(vcpu)->nested.nested_vmx_misc_low);
}

static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
{
	return vmcs12->cpu_based_vm_exec_control & bit;
}

static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
{
	return (vmcs12->cpu_based_vm_exec_control &
			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
		(vmcs12->secondary_vm_exec_control & bit);
}

static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
{
	return vmcs12->pin_based_vm_exec_control &
		PIN_BASED_VMX_PREEMPTION_TIMER;
}

static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
{
	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
}

static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
{
	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
}

static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
{
	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
}

static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
{
	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
}

static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
{
	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
}

static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
{
	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
}

static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
{
	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
}

static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
{
	return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
}

static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
{
	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
}

static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
{
	return nested_cpu_has_vmfunc(vmcs12) &&
		(vmcs12->vm_function_control &
		 VMX_VMFUNC_EPTP_SWITCHING);
}

static inline bool is_nmi(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
		== (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
}

static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
			      u32 exit_intr_info,
			      unsigned long exit_qualification);
static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
			struct vmcs12 *vmcs12,
			u32 reason, unsigned long qualification);

static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	for (i = 0; i < vmx->nmsrs; ++i)
		if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
			return i;
	return -1;
}

static inline void __invvpid(int ext, u16 vpid, gva_t gva)
{
    struct {
	u64 vpid : 16;
	u64 rsvd : 48;
	u64 gva;
    } operand = { vpid, 0, gva };

    asm volatile (__ex(ASM_VMX_INVVPID)
		  /* CF==1 or ZF==1 --> rc = -1 */
		  "; ja 1f ; ud2 ; 1:"
		  : : "a"(&operand), "c"(ext) : "cc", "memory");
}

static inline void __invept(int ext, u64 eptp, gpa_t gpa)
{
	struct {
		u64 eptp, gpa;
	} operand = {eptp, gpa};

	asm volatile (__ex(ASM_VMX_INVEPT)
			/* CF==1 or ZF==1 --> rc = -1 */
			"; ja 1f ; ud2 ; 1:\n"
			: : "a" (&operand), "c" (ext) : "cc", "memory");
}

static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	i = __find_msr_index(vmx, msr);
	if (i >= 0)
		return &vmx->guest_msrs[i];
	return NULL;
}

static void vmcs_clear(struct vmcs *vmcs)
{
	u64 phys_addr = __pa(vmcs);
	u8 error;

	asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
		      : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
		      : "cc", "memory");
	if (error)
		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
		       vmcs, phys_addr);
}

static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
{
	vmcs_clear(loaded_vmcs->vmcs);
	if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
		vmcs_clear(loaded_vmcs->shadow_vmcs);
	loaded_vmcs->cpu = -1;
	loaded_vmcs->launched = 0;
}

static void vmcs_load(struct vmcs *vmcs)
{
	u64 phys_addr = __pa(vmcs);
	u8 error;

	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
			: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
			: "cc", "memory");
	if (error)
		printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
		       vmcs, phys_addr);
}

#ifdef CONFIG_KEXEC_CORE
/*
 * This bitmap is used to indicate whether the vmclear
 * operation is enabled on all cpus. All disabled by
 * default.
 */
static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;

static inline void crash_enable_local_vmclear(int cpu)
{
	cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
}

static inline void crash_disable_local_vmclear(int cpu)
{
	cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
}

static inline int crash_local_vmclear_enabled(int cpu)
{
	return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
}

static void crash_vmclear_local_loaded_vmcss(void)
{
	int cpu = raw_smp_processor_id();
	struct loaded_vmcs *v;

	if (!crash_local_vmclear_enabled(cpu))
		return;

	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
			    loaded_vmcss_on_cpu_link)
		vmcs_clear(v->vmcs);
}
#else
static inline void crash_enable_local_vmclear(int cpu) { }
static inline void crash_disable_local_vmclear(int cpu) { }
1553
#endif /* CONFIG_KEXEC_CORE */
1554

1555
static void __loaded_vmcs_clear(void *arg)
A
1557
	struct loaded_vmcs *loaded_vmcs = arg;
1558
	int cpu = raw_smp_processor_id();
A
1560 1561 1562
	if (loaded_vmcs->cpu != cpu)
		return; /* vcpu migration can race with cpu offline */
	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
A
1564
	crash_disable_local_vmclear(cpu);
1565
	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
1566 1567 1568 1569 1570 1571 1572 1573 1574

	/*
	 * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
	 * is before setting loaded_vmcs->vcpu to -1 which is done in
	 * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
	 * then adds the vmcs into percpu list before it is deleted.
	 */
	smp_wmb();

1575
	loaded_vmcs_init(loaded_vmcs);
1576
	crash_enable_local_vmclear(cpu);
A

1579
static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
A
1581 1582 1583 1584 1585
	int cpu = loaded_vmcs->cpu;

	if (cpu != -1)
		smp_call_function_single(cpu,
			 __loaded_vmcs_clear, loaded_vmcs, 1);
A

1588
static inline void vpid_sync_vcpu_single(int vpid)
1589
{
1590
	if (vpid == 0)
1591 1592
		return;

1593
	if (cpu_has_vmx_invvpid_single())
1594
		__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
1595 1596
}

1597 1598 1599 1600 1601 1602
static inline void vpid_sync_vcpu_global(void)
{
	if (cpu_has_vmx_invvpid_global())
		__invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
}

static inline void vpid_sync_context(int vpid)
{
	if (cpu_has_vmx_invvpid_single())
		vpid_sync_vcpu_single(vpid);
	else
		vpid_sync_vcpu_global();
}

static inline void ept_sync_global(void)
{
	__invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
}

static inline void ept_sync_context(u64 eptp)
{
1618 1619 1620 1621
	if (cpu_has_vmx_invept_context())
		__invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
	else
		ept_sync_global();
1622 1623
}

1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668
static __always_inline void vmcs_check16(unsigned long field)
{
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
			 "16-bit accessor invalid for 64-bit field");
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
			 "16-bit accessor invalid for 64-bit high field");
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
			 "16-bit accessor invalid for 32-bit high field");
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
			 "16-bit accessor invalid for natural width field");
}

static __always_inline void vmcs_check32(unsigned long field)
{
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
			 "32-bit accessor invalid for 16-bit field");
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
			 "32-bit accessor invalid for natural width field");
}

static __always_inline void vmcs_check64(unsigned long field)
{
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
			 "64-bit accessor invalid for 16-bit field");
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
			 "64-bit accessor invalid for 64-bit high field");
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
			 "64-bit accessor invalid for 32-bit field");
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
			 "64-bit accessor invalid for natural width field");
}

static __always_inline void vmcs_checkl(unsigned long field)
{
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
			 "Natural width accessor invalid for 16-bit field");
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
			 "Natural width accessor invalid for 64-bit field");
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
			 "Natural width accessor invalid for 64-bit high field");
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
			 "Natural width accessor invalid for 32-bit field");
}

static __always_inline unsigned long __vmcs_readl(unsigned long field)
{
	unsigned long value;

	asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
		      : "=a"(value) : "d"(field) : "cc");
	return value;
}

static __always_inline u16 vmcs_read16(unsigned long field)
{
	vmcs_check16(field);
	return __vmcs_readl(field);
}

static __always_inline u32 vmcs_read32(unsigned long field)
{
	vmcs_check32(field);
	return __vmcs_readl(field);
}

static __always_inline u64 vmcs_read64(unsigned long field)
{
	vmcs_check64(field);
#ifdef CONFIG_X86_64
	return __vmcs_readl(field);
#else
	return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
#endif
}

static __always_inline unsigned long vmcs_readl(unsigned long field)
{
	vmcs_checkl(field);
	return __vmcs_readl(field);
}

static noinline void vmwrite_error(unsigned long field, unsigned long value)
{
	printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
	       field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
	dump_stack();
}

static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
{
	u8 error;

	asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
		       : "=q"(error) : "a"(value), "d"(field) : "cc");
	if (unlikely(error))
		vmwrite_error(field, value);
}

static __always_inline void vmcs_write16(unsigned long field, u16 value)
{
	vmcs_check16(field);
	__vmcs_writel(field, value);
}

static __always_inline void vmcs_write32(unsigned long field, u32 value)
{
	vmcs_check32(field);
	__vmcs_writel(field, value);
}

static __always_inline void vmcs_write64(unsigned long field, u64 value)
{
	vmcs_check64(field);
	__vmcs_writel(field, value);
#ifndef CONFIG_X86_64
	asm volatile ("");
	__vmcs_writel(field+1, value >> 32);
#endif
}

static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
{
	vmcs_checkl(field);
	__vmcs_writel(field, value);
}

static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
{
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
			 "vmcs_clear_bits does not support 64-bit fields");
	__vmcs_writel(field, __vmcs_readl(field) & ~mask);
}

static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
{
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
			 "vmcs_set_bits does not support 64-bit fields");
	__vmcs_writel(field, __vmcs_readl(field) | mask);
}

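/*
 * VM_ENTRY_CONTROLS and VM_EXIT_CONTROLS are shadowed in struct vcpu_vmx so
 * that the common read-modify-write pattern on these controls can skip the
 * VMREAD entirely and skip the VMWRITE when the value does not change.
 */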
static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
{
	vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
}

static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
{
	vmcs_write32(VM_ENTRY_CONTROLS, val);
	vmx->vm_entry_controls_shadow = val;
}

static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
{
	if (vmx->vm_entry_controls_shadow != val)
		vm_entry_controls_init(vmx, val);
}

static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
{
	return vmx->vm_entry_controls_shadow;
}


static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
{
	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
}

static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
{
	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
}

static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
{
	vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
}

static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
{
	vmcs_write32(VM_EXIT_CONTROLS, val);
	vmx->vm_exit_controls_shadow = val;
}

static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
{
	if (vmx->vm_exit_controls_shadow != val)
		vm_exit_controls_init(vmx, val);
}

static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
{
	return vmx->vm_exit_controls_shadow;
}


static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
{
	vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
}

static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
{
	vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
}

static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
{
	vmx->segment_cache.bitmask = 0;
}

static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
				       unsigned field)
{
	bool ret;
	u32 mask = 1 << (seg * SEG_FIELD_NR + field);

	if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
		vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
		vmx->segment_cache.bitmask = 0;
	}
	ret = vmx->segment_cache.bitmask & mask;
	vmx->segment_cache.bitmask |= mask;
	return ret;
}

static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
{
	u16 *p = &vmx->segment_cache.seg[seg].selector;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
		*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
	return *p;
}

static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
{
	ulong *p = &vmx->segment_cache.seg[seg].base;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
		*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
	return *p;
}

static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].limit;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
	return *p;
}

static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].ar;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
	return *p;
}

static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	u32 eb;

	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
	if ((vcpu->guest_debug &
	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
		eb |= 1u << BP_VECTOR;
	if (to_vmx(vcpu)->rmode.vm86_active)
		eb = ~0;
	if (enable_ept)
		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */

	/* When we are running a nested L2 guest and L1 specified for it a
	 * certain exception bitmap, we must trap the same exceptions and pass
	 * them to L1. When running L2, we will only handle the exceptions
	 * specified above if L1 did not want them.
	 */
	if (is_guest_mode(vcpu))
		eb |= get_vmcs12(vcpu)->exception_bitmap;

	vmcs_write32(EXCEPTION_BITMAP, eb);
}

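/*
 * MSRs in vmx->msr_autoload are switched by the CPU on every VM entry and
 * VM exit via the VMCS MSR-load areas; the list is bounded by
 * NR_AUTOLOAD_MSRS.  A few MSRs (EFER, PERF_GLOBAL_CTRL) can instead be
 * switched through dedicated VM-entry/VM-exit controls when the hardware
 * supports it, which the *_atomic_switch_msr_special() helpers handle.
 */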
static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit)
{
	vm_entry_controls_clearbit(vmx, entry);
	vm_exit_controls_clearbit(vmx, exit);
}

static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
	unsigned i;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
			return;
		}
		break;
	}

	for (i = 0; i < m->nr; ++i)
		if (m->guest[i].index == msr)
			break;

	if (i == m->nr)
		return;
	--m->nr;
	m->guest[i] = m->guest[m->nr];
	m->host[i] = m->host[m->nr];
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
}

static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit,
		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
		u64 guest_val, u64 host_val)
{
	vmcs_write64(guest_val_vmcs, guest_val);
	vmcs_write64(host_val_vmcs, host_val);
	vm_entry_controls_setbit(vmx, entry);
	vm_exit_controls_setbit(vmx, exit);
}

static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
				  u64 guest_val, u64 host_val)
{
	unsigned i;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER,
					GUEST_IA32_EFER,
					HOST_IA32_EFER,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
					GUEST_IA32_PERF_GLOBAL_CTRL,
					HOST_IA32_PERF_GLOBAL_CTRL,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_IA32_PEBS_ENABLE:
		/* PEBS needs a quiescent period after being disabled (to write
		 * a record).  Disabling PEBS through VMX MSR swapping doesn't
		 * provide that period, so a CPU could write host's record into
		 * guest's memory.
		 */
		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
	}

	for (i = 0; i < m->nr; ++i)
		if (m->guest[i].index == msr)
			break;

	if (i == NR_AUTOLOAD_MSRS) {
		printk_once(KERN_WARNING "Not enough msr switch entries. "
				"Can't add msr %x\n", msr);
		return;
	} else if (i == m->nr) {
		++m->nr;
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
	}

	m->guest[i].index = msr;
	m->guest[i].value = guest_val;
	m->host[i].index = msr;
	m->host[i].value = host_val;
}

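/*
 * Returns true when EFER should be handled through the shared-MSR
 * (vmx->guest_msrs) mechanism, and false when it has been queued for the
 * atomic VMCS-based switch (or needs no switching at all).
 */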
static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
{
	u64 guest_efer = vmx->vcpu.arch.efer;
	u64 ignore_bits = 0;

	if (!enable_ept) {
		/*
		 * NX is needed to handle CR0.WP=1, CR4.SMEP=1.  Testing
		 * host CPUID is more efficient than testing guest CPUID
		 * or CR4.  Host SMEP is anyway a requirement for guest SMEP.
		 */
		if (boot_cpu_has(X86_FEATURE_SMEP))
			guest_efer |= EFER_NX;
		else if (!(guest_efer & EFER_NX))
			ignore_bits |= EFER_NX;
	}

	/*
	 * LMA and LME handled by hardware; SCE meaningless outside long mode.
	 */
	ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
	ignore_bits |= EFER_LMA | EFER_LME;
	/* SCE is meaningful only in long mode on Intel */
	if (guest_efer & EFER_LMA)
		ignore_bits &= ~(u64)EFER_SCE;
#endif

	clear_atomic_switch_msr(vmx, MSR_EFER);

	/*
	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
	 * On CPUs that support "load IA32_EFER", always switch EFER
	 * atomically, since it's faster than switching it manually.
	 */
	if (cpu_has_load_ia32_efer ||
	    (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
		if (!(guest_efer & EFER_LMA))
			guest_efer &= ~EFER_LME;
		if (guest_efer != host_efer)
			add_atomic_switch_msr(vmx, MSR_EFER,
					      guest_efer, host_efer);
		return false;
	} else {
		guest_efer &= ~ignore_bits;
		guest_efer |= host_efer & ignore_bits;

		vmx->guest_msrs[efer_offset].data = guest_efer;
		vmx->guest_msrs[efer_offset].mask = ~ignore_bits;

		return true;
	}
}

#ifdef CONFIG_X86_32
/*
 * On 32-bit kernels, VM exits still load the FS and GS bases from the
 * VMCS rather than the segment table.  KVM uses this helper to figure
 * out the current bases to poke them into the VMCS before entry.
 */
static unsigned long segment_base(u16 selector)
{
	struct desc_struct *table;
	unsigned long v;

	if (!(selector & ~SEGMENT_RPL_MASK))
		return 0;

	table = get_current_gdt_ro();

	if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
		u16 ldt_selector = kvm_read_ldt();

		if (!(ldt_selector & ~SEGMENT_RPL_MASK))
			return 0;

		table = (struct desc_struct *)segment_base(ldt_selector);
	}
	v = get_desc_base(&table[selector >> 3]);
	return v;
}
#endif

static void vmx_save_host_state(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int i;

	if (vmx->host_state.loaded)
		return;

	vmx->host_state.loaded = 1;
	/*
	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
	 * allow segment selectors with cpl > 0 or ti == 1.
	 */
	vmx->host_state.ldt_sel = kvm_read_ldt();
	vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
	savesegment(fs, vmx->host_state.fs_sel);
	if (!(vmx->host_state.fs_sel & 7)) {
		vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
		vmx->host_state.fs_reload_needed = 0;
	} else {
		vmcs_write16(HOST_FS_SELECTOR, 0);
		vmx->host_state.fs_reload_needed = 1;
	}
	savesegment(gs, vmx->host_state.gs_sel);
	if (!(vmx->host_state.gs_sel & 7))
		vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
	else {
		vmcs_write16(HOST_GS_SELECTOR, 0);
		vmx->host_state.gs_ldt_reload_needed = 1;
	}

#ifdef CONFIG_X86_64
	savesegment(ds, vmx->host_state.ds_sel);
	savesegment(es, vmx->host_state.es_sel);
#endif

#ifdef CONFIG_X86_64
	vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
	vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
#else
	vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
	vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
#endif

#ifdef CONFIG_X86_64
	rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
	if (is_long_mode(&vmx->vcpu))
		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
	if (boot_cpu_has(X86_FEATURE_MPX))
		rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
	for (i = 0; i < vmx->save_nmsrs; ++i)
		kvm_set_shared_msr(vmx->guest_msrs[i].index,
				   vmx->guest_msrs[i].data,
				   vmx->guest_msrs[i].mask);
}

static void __vmx_load_host_state(struct vcpu_vmx *vmx)
{
	if (!vmx->host_state.loaded)
		return;

	++vmx->vcpu.stat.host_state_reload;
	vmx->host_state.loaded = 0;
#ifdef CONFIG_X86_64
	if (is_long_mode(&vmx->vcpu))
		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
	if (vmx->host_state.gs_ldt_reload_needed) {
		kvm_load_ldt(vmx->host_state.ldt_sel);
#ifdef CONFIG_X86_64
		load_gs_index(vmx->host_state.gs_sel);
#else
		loadsegment(gs, vmx->host_state.gs_sel);
#endif
	}
	if (vmx->host_state.fs_reload_needed)
		loadsegment(fs, vmx->host_state.fs_sel);
#ifdef CONFIG_X86_64
	if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
		loadsegment(ds, vmx->host_state.ds_sel);
		loadsegment(es, vmx->host_state.es_sel);
	}
#endif
	invalidate_tss_limit();
#ifdef CONFIG_X86_64
	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
	if (vmx->host_state.msr_host_bndcfgs)
		wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
	load_fixmap_gdt(raw_smp_processor_id());
}

static void vmx_load_host_state(struct vcpu_vmx *vmx)
{
	preempt_disable();
	__vmx_load_host_state(vmx);
	preempt_enable();
}

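/*
 * Posted-interrupt descriptor bookkeeping for vCPU migration: PI.NDST must
 * track the physical CPU the vCPU runs on so posted interrupts reach the
 * right destination, and PI.SN ("suppress notification") is cleared again
 * once the vCPU is loaded.
 */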
static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
	struct pi_desc old, new;
	unsigned int dest;

	/*
	 * In case of hot-plug or hot-unplug, we may have to undo
	 * vmx_vcpu_pi_put even if there is no assigned device.  And we
	 * always keep PI.NDST up to date for simplicity: it makes the
	 * code easier, and CPU migration is not a fast path.
	 */
	if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
		return;

	/*
	 * First handle the simple case where no cmpxchg is necessary; just
	 * allow posting non-urgent interrupts.
	 *
	 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
	 * PI.NDST: pi_post_block will do it for us and the wakeup_handler
	 * expects the VCPU to be on the blocked_vcpu_list that matches
	 * PI.NDST.
	 */
	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
	    vcpu->cpu == cpu) {
		pi_clear_sn(pi_desc);
		return;
	}

	/* The full case.  */
	do {
		old.control = new.control = pi_desc->control;

		dest = cpu_physical_id(cpu);

		if (x2apic_enabled())
			new.ndst = dest;
		else
			new.ndst = (dest << 8) & 0xFF00;

		new.sn = 0;
	} while (cmpxchg64(&pi_desc->control, old.control,
			   new.control) != old.control);
}

static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
{
	vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
	vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
}

/*
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
 */
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;

	if (!already_loaded) {
		loaded_vmcs_clear(vmx->loaded_vmcs);
		local_irq_disable();
		crash_disable_local_vmclear(cpu);

		/*
		 * Read loaded_vmcs->cpu should be before fetching
		 * loaded_vmcs->loaded_vmcss_on_cpu_link.
		 * See the comments in __loaded_vmcs_clear().
		 */
		smp_rmb();

		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
			 &per_cpu(loaded_vmcss_on_cpu, cpu));
		crash_enable_local_vmclear(cpu);
		local_irq_enable();
	}

	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
		vmcs_load(vmx->loaded_vmcs->vmcs);
	}

	if (!already_loaded) {
		void *gdt = get_current_gdt_ro();
		unsigned long sysenter_esp;

		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);

		/*
		 * Linux uses per-cpu TSS and GDT, so set these when switching
		 * processors.  See 22.2.4.
		 */
		vmcs_writel(HOST_TR_BASE,
			    (unsigned long)this_cpu_ptr(&cpu_tss));
		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */

		/*
		 * VM exits change the host TR limit to 0x67 after a VM
		 * exit.  This is okay, since 0x67 covers everything except
		 * the IO bitmap and we have code to handle the IO bitmap
		 * being lost after a VM exit.
		 */
		BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);

		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */

		vmx->loaded_vmcs->cpu = cpu;
	}

	/* Setup TSC multiplier */
	if (kvm_has_tsc_control &&
	    vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
		decache_tsc_multiplier(vmx);

	vmx_vcpu_pi_load(vcpu, cpu);
	vmx->host_pkru = read_pkru();
}

static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
		!irq_remapping_cap(IRQ_POSTING_CAP)  ||
		!kvm_vcpu_apicv_active(vcpu))
		return;

	/* Set SN when the vCPU is preempted */
	if (vcpu->preempted)
		pi_set_sn(pi_desc);
}

static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
	vmx_vcpu_pi_put(vcpu);

	__vmx_load_host_state(to_vmx(vcpu));
}

static bool emulation_required(struct kvm_vcpu *vcpu)
{
	return emulate_invalid_guest_state && !guest_state_valid(vcpu);
}

static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);

/*
 * Return the cr0 value that a nested guest would read. This is a combination
 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
 * its hypervisor (cr0_read_shadow).
 */
static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
{
	return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
		(fields->cr0_read_shadow & fields->cr0_guest_host_mask);
}
static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
{
	return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
		(fields->cr4_read_shadow & fields->cr4_guest_host_mask);
}

static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
	unsigned long rflags, save_rflags;

	if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
		__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
		rflags = vmcs_readl(GUEST_RFLAGS);
		if (to_vmx(vcpu)->rmode.vm86_active) {
			rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
			save_rflags = to_vmx(vcpu)->rmode.save_rflags;
			rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
		}
		to_vmx(vcpu)->rflags = rflags;
	}
	return to_vmx(vcpu)->rflags;
}

static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	unsigned long old_rflags = vmx_get_rflags(vcpu);

	__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
	to_vmx(vcpu)->rflags = rflags;
	if (to_vmx(vcpu)->rmode.vm86_active) {
		to_vmx(vcpu)->rmode.save_rflags = rflags;
		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
	}
	vmcs_writel(GUEST_RFLAGS, rflags);

	if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
		to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
}

static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	int ret = 0;

	if (interruptibility & GUEST_INTR_STATE_STI)
		ret |= KVM_X86_SHADOW_INT_STI;
	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
		ret |= KVM_X86_SHADOW_INT_MOV_SS;

	return ret;
}

static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	u32 interruptibility = interruptibility_old;

	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);

	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
		interruptibility |= GUEST_INTR_STATE_MOV_SS;
	else if (mask & KVM_X86_SHADOW_INT_STI)
		interruptibility |= GUEST_INTR_STATE_STI;

	if (interruptibility != interruptibility_old)
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
}

static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	unsigned long rip;

	rip = kvm_rip_read(vcpu);
	rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
	kvm_rip_write(vcpu, rip);

	/* skipping an emulated instruction also counts */
	vmx_set_interrupt_shadow(vcpu, 0);
}

static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
					       unsigned long exit_qual)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned int nr = vcpu->arch.exception.nr;
	u32 intr_info = nr | INTR_INFO_VALID_MASK;

	if (vcpu->arch.exception.has_error_code) {
		vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
	}

	if (kvm_exception_is_soft(nr))
		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
	else
		intr_info |= INTR_TYPE_HARD_EXCEPTION;

	if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
	    vmx_get_nmi_mask(vcpu))
		intr_info |= INTR_INFO_UNBLOCK_NMI;

	nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
}

/*
 * KVM wants to inject page faults which it got to the guest. This function
 * checks whether, in a nested guest, an exception must be reflected to L1
 * (as a VM exit) or can be delivered to L2.
 */
static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned int nr = vcpu->arch.exception.nr;

	if (nr == PF_VECTOR) {
		if (vcpu->arch.exception.nested_apf) {
			*exit_qual = vcpu->arch.apf.nested_apf_token;
			return 1;
		}
		/*
		 * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
		 * The fix is to add the ancillary datum (CR2 or DR6) to structs
		 * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
		 * can be written only when inject_pending_event runs.  This should be
		 * conditional on a new capability---if the capability is disabled,
		 * kvm_multiple_exception would write the ancillary information to
		 * CR2 or DR6, for backwards ABI-compatibility.
		 */
		if (nested_vmx_is_page_fault_vmexit(vmcs12,
						    vcpu->arch.exception.error_code)) {
			*exit_qual = vcpu->arch.cr2;
			return 1;
		}
	} else {
		if (vmcs12->exception_bitmap & (1u << nr)) {
			if (nr == DB_VECTOR)
				*exit_qual = vcpu->arch.dr6;
			else
				*exit_qual = 0;
			return 1;
		}
	}

	return 0;
}

static void vmx_queue_exception(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned nr = vcpu->arch.exception.nr;
	bool has_error_code = vcpu->arch.exception.has_error_code;
	u32 error_code = vcpu->arch.exception.error_code;
	u32 intr_info = nr | INTR_INFO_VALID_MASK;

	if (has_error_code) {
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
	}

	if (vmx->rmode.vm86_active) {
		int inc_eip = 0;
		if (kvm_exception_is_soft(nr))
			inc_eip = vcpu->arch.event_exit_inst_len;
		if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		return;
	}

	if (kvm_exception_is_soft(nr)) {
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmx->vcpu.arch.event_exit_inst_len);
		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
	} else
		intr_info |= INTR_TYPE_HARD_EXCEPTION;

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
}

static bool vmx_rdtscp_supported(void)
{
	return cpu_has_vmx_rdtscp();
}

static bool vmx_invpcid_supported(void)
{
	return cpu_has_vmx_invpcid() && enable_ept;
}

/*
 * Swap MSR entry in host/guest MSR entry array.
 */
static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
{
	struct shared_msr_entry tmp;

	tmp = vmx->guest_msrs[to];
	vmx->guest_msrs[to] = vmx->guest_msrs[from];
	vmx->guest_msrs[from] = tmp;
}

static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
{
	unsigned long *msr_bitmap;

	if (is_guest_mode(vcpu))
		msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
	else if (cpu_has_secondary_exec_ctrls() &&
		 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
		  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
		if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
			if (is_long_mode(vcpu))
				msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv;
			else
				msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv;
		} else {
			if (is_long_mode(vcpu))
				msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
			else
				msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
		}
	} else {
		if (is_long_mode(vcpu))
			msr_bitmap = vmx_msr_bitmap_longmode;
		else
			msr_bitmap = vmx_msr_bitmap_legacy;
	}

	vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
}

/*
 * Set up the vmcs to automatically save and restore system
 * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
 * mode, as fiddling with msrs is very expensive.
 */
static void setup_msrs(struct vcpu_vmx *vmx)
{
	int save_nmsrs, index;

	save_nmsrs = 0;
#ifdef CONFIG_X86_64
	if (is_long_mode(&vmx->vcpu)) {
		index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
		index = __find_msr_index(vmx, MSR_LSTAR);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
		index = __find_msr_index(vmx, MSR_CSTAR);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
		index = __find_msr_index(vmx, MSR_TSC_AUX);
		if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
			move_msr_up(vmx, index, save_nmsrs++);
		/*
		 * MSR_STAR is only needed on long mode guests, and only
		 * if efer.sce is enabled.
		 */
		index = __find_msr_index(vmx, MSR_STAR);
		if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
			move_msr_up(vmx, index, save_nmsrs++);
	}
#endif
	index = __find_msr_index(vmx, MSR_EFER);
	if (index >= 0 && update_transition_efer(vmx, index))
		move_msr_up(vmx, index, save_nmsrs++);

	vmx->save_nmsrs = save_nmsrs;

	if (cpu_has_vmx_msr_bitmap())
		vmx_set_msr_bitmap(&vmx->vcpu);
}

/*
 * reads and returns guest's timestamp counter "register"
 * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
 * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
 */
static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
{
	u64 host_tsc, tsc_offset;

	host_tsc = rdtsc();
	tsc_offset = vmcs_read64(TSC_OFFSET);
	return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
}

/*
 * writes 'offset' into guest's timestamp counter offset register
 */
static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
	if (is_guest_mode(vcpu)) {
		/*
		 * We're here if L1 chose not to trap WRMSR to TSC. According
		 * to the spec, this should set L1's TSC; The offset that L1
		 * set for L2 remains unchanged, and still needs to be added
		 * to the newly set TSC to get L2's TSC.
		 */
		struct vmcs12 *vmcs12;
		/* recalculate vmcs02.TSC_OFFSET: */
		vmcs12 = get_vmcs12(vcpu);
		vmcs_write64(TSC_OFFSET, offset +
			(nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
			 vmcs12->tsc_offset : 0));
	} else {
		trace_kvm_write_tsc_offset(vcpu->vcpu_id,
					   vmcs_read64(TSC_OFFSET), offset);
		vmcs_write64(TSC_OFFSET, offset);
	}
}
}

2678 2679 2680 2681 2682 2683 2684 2685
/*
 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
 * all guests if the "nested" module option is off, and can also be disabled
 * for a single guest by disabling its VMX cpuid bit.
 */
static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
{
2686
	return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
2687 2688
}

/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields are
 * valid during nested entry from L1 to L2.
 * Each of these control msrs has a low and high 32-bit half: A low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on. See also vmx_control_verify().
 */
static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
{
	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits. The
	 * reason is that if one of these bits is necessary, it will appear
	 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
	 * fields of vmcs01 and vmcs02, will turn these bits off - and
	 * nested_vmx_exit_reflected() will not pass related exits to L1.
	 * These rules have exceptions below.
	 */

	/* pin-based controls */
	rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
		vmx->nested.nested_vmx_pinbased_ctls_low,
		vmx->nested.nested_vmx_pinbased_ctls_high);
	vmx->nested.nested_vmx_pinbased_ctls_low |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
	vmx->nested.nested_vmx_pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS;
	vmx->nested.nested_vmx_pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;
	if (kvm_vcpu_apicv_active(&vmx->vcpu))
		vmx->nested.nested_vmx_pinbased_ctls_high |=
			PIN_BASED_POSTED_INTR;

	/* exit controls */
	rdmsr(MSR_IA32_VMX_EXIT_CTLS,
		vmx->nested.nested_vmx_exit_ctls_low,
		vmx->nested.nested_vmx_exit_ctls_high);
	vmx->nested.nested_vmx_exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	vmx->nested.nested_vmx_exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
	vmx->nested.nested_vmx_exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;

	if (kvm_mpx_supported())
		vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;

	/* We support free control of debug control saving. */
	vmx->nested.nested_vmx_exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;

	/* entry controls */
	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
		vmx->nested.nested_vmx_entry_ctls_low,
		vmx->nested.nested_vmx_entry_ctls_high);
	vmx->nested.nested_vmx_entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
	vmx->nested.nested_vmx_entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT;
	vmx->nested.nested_vmx_entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
	if (kvm_mpx_supported())
		vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;

	/* We support free control of debug control loading. */
	vmx->nested.nested_vmx_entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;

	/* cpu-based controls */
	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
		vmx->nested.nested_vmx_procbased_ctls_low,
		vmx->nested.nested_vmx_procbased_ctls_high);
	vmx->nested.nested_vmx_procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
	vmx->nested.nested_vmx_procbased_ctls_high &=
		CPU_BASED_VIRTUAL_INTR_PENDING |
		CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware. For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	vmx->nested.nested_vmx_procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	vmx->nested.nested_vmx_procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);

	/*
	 * secondary cpu-based controls.  Do not include those that
	 * depend on CPUID bits, they are added later by vmx_cpuid_update.
	 */
	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
		vmx->nested.nested_vmx_secondary_ctls_low,
		vmx->nested.nested_vmx_secondary_ctls_high);
	vmx->nested.nested_vmx_secondary_ctls_low = 0;
	vmx->nested.nested_vmx_secondary_ctls_high &=
		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_WBINVD_EXITING;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		vmx->nested.nested_vmx_secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
			 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
		if (cpu_has_vmx_ept_execute_only())
			vmx->nested.nested_vmx_ept_caps |=
				VMX_EPT_EXECUTE_ONLY_BIT;
		vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
		vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			vmx->nested.nested_vmx_secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
		}
	}

	if (cpu_has_vmx_vmfunc()) {
		vmx->nested.nested_vmx_secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VMFUNC;
		/*
		 * Advertise EPTP switching unconditionally
		 * since we emulate it
		 */
		if (enable_ept)
			vmx->nested.nested_vmx_vmfunc_controls =
				VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context.  The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		vmx->nested.nested_vmx_secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		vmx->nested.nested_vmx_secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	/* miscellaneous data */
	rdmsr(MSR_IA32_VMX_MISC,
		vmx->nested.nested_vmx_misc_low,
		vmx->nested.nested_vmx_misc_high);
	vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
	vmx->nested.nested_vmx_misc_low |=
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT;
	vmx->nested.nested_vmx_misc_high = 0;

	/*
	 * This MSR reports some information about VMX support. We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	vmx->nested.nested_vmx_basic =
		VMCS12_REVISION |
		VMX_BASIC_TRUE_CTLS |
		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);

	if (cpu_has_vmx_basic_inout())
		vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT;

	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
	vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON;
	vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx->nested.nested_vmx_cr0_fixed1);
	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1);

	/* highest index: VMX_PREEMPTION_TIMER_VALUE */
	vmx->nested.nested_vmx_vmcs_enum = 0x2e;
}

/*
 * if fixed0[i] == 1: val[i] must be 1
 * if fixed1[i] == 0: val[i] must be 0
 */
static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
{
	return ((val & fixed1) | fixed0) == val;
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

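/*
 * Helper for the MSR-restore checks below: within @mask, every bit set in
 * @subset must also be set in @superset.
 */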
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved =
		/* feature (except bit 48; see below) */
		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
		/* reserved */
		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
	u64 vmx_basic = vmx->nested.nested_vmx_basic;

	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & BIT_ULL(48))
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.nested_vmx_basic = data;
	return 0;
}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 supported;
	u32 *lowp, *highp;

	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		lowp = &vmx->nested.nested_vmx_pinbased_ctls_low;
		highp = &vmx->nested.nested_vmx_pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		lowp = &vmx->nested.nested_vmx_procbased_ctls_low;
		highp = &vmx->nested.nested_vmx_procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		lowp = &vmx->nested.nested_vmx_exit_ctls_low;
		highp = &vmx->nested.nested_vmx_exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		lowp = &vmx->nested.nested_vmx_entry_ctls_low;
		highp = &vmx->nested.nested_vmx_entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		lowp = &vmx->nested.nested_vmx_secondary_ctls_low;
		highp = &vmx->nested.nested_vmx_secondary_ctls_high;
		break;
	default:
		BUG();
	}

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	*lowp = data;
	*highp = data >> 32;
	return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved_bits =
		/* feature */
		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
		/* reserved */
		GENMASK_ULL(13, 9) | BIT_ULL(31);
	u64 vmx_misc;

	vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low,
				   vmx->nested.nested_vmx_misc_high);

	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
		return -EINVAL;

	if ((vmx->nested.nested_vmx_pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.nested_vmx_misc_low = data;
	vmx->nested.nested_vmx_misc_high = data >> 32;
	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap;

	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps,
					   vmx->nested.nested_vmx_vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.nested_vmx_ept_caps = data;
	vmx->nested.nested_vmx_vpid_caps = data >> 32;
	return 0;
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 *msr;

	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		msr = &vmx->nested.nested_vmx_cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		msr = &vmx->nested.nested_vmx_cr4_fixed0;
		break;
	default:
		BUG();
	}

	/*
	 * 1 bits (which indicates bits which "must-be-1" during VMX operation)
	 * must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*msr = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.nested_vmx_vmcs_enum = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise. */
static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = vmx->nested.nested_vmx_basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			vmx->nested.nested_vmx_pinbased_ctls_low,
			vmx->nested.nested_vmx_pinbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			vmx->nested.nested_vmx_procbased_ctls_low,
			vmx->nested.nested_vmx_procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			vmx->nested.nested_vmx_exit_ctls_low,
			vmx->nested.nested_vmx_exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			vmx->nested.nested_vmx_entry_ctls_low,
			vmx->nested.nested_vmx_entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			vmx->nested.nested_vmx_misc_low,
			vmx->nested.nested_vmx_misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = vmx->nested.nested_vmx_cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = vmx->nested.nested_vmx_cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = vmx->nested.nested_vmx_cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = vmx->nested.nested_vmx_cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = vmx->nested.nested_vmx_vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			vmx->nested.nested_vmx_secondary_ctls_low,
			vmx->nested.nested_vmx_secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = vmx->nested.nested_vmx_ept_caps |
			((u64)vmx->nested.nested_vmx_vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = vmx->nested.nested_vmx_vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

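/*
 * msr_ia32_feature_control_valid_bits tracks which IA32_FEATURE_CONTROL bits
 * the guest is allowed to set; callers use this helper to refuse writes that
 * set any other bit.
 */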
static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
						 uint64_t val)
{
	uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;

	return !(val & ~valid_bits);
}

A
Avi Kivity 已提交
3243 3244 3245 3246 3247
/*
 * Reads an msr value (of 'msr_index') into 'pdata'.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct shared_msr_entry *msr;

	switch (msr_info->index) {
#ifdef CONFIG_X86_64
	case MSR_FS_BASE:
		msr_info->data = vmcs_readl(GUEST_FS_BASE);
		break;
	case MSR_GS_BASE:
		msr_info->data = vmcs_readl(GUEST_GS_BASE);
		break;
	case MSR_KERNEL_GS_BASE:
		vmx_load_host_state(to_vmx(vcpu));
		msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
		break;
#endif
	case MSR_EFER:
		return kvm_get_msr_common(vcpu, msr_info);
	case MSR_IA32_TSC:
		msr_info->data = guest_read_tsc(vcpu);
		break;
	case MSR_IA32_SYSENTER_CS:
		msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
		break;
	case MSR_IA32_SYSENTER_EIP:
		msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
		break;
	case MSR_IA32_SYSENTER_ESP:
		msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
		break;
	case MSR_IA32_BNDCFGS:
		if (!kvm_mpx_supported() ||
		    (!msr_info->host_initiated &&
		     !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
			return 1;
		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
		break;
	case MSR_IA32_MCG_EXT_CTL:
		if (!msr_info->host_initiated &&
		    !(to_vmx(vcpu)->msr_ia32_feature_control &
		      FEATURE_CONTROL_LMCE))
			return 1;
		msr_info->data = vcpu->arch.mcg_ext_ctl;
		break;
	case MSR_IA32_FEATURE_CONTROL:
		msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control;
		break;
	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
		if (!nested_vmx_allowed(vcpu))
			return 1;
		return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data);
	case MSR_IA32_XSS:
		if (!vmx_xsaves_supported())
			return 1;
		msr_info->data = vcpu->arch.ia32_xss;
		break;
	case MSR_TSC_AUX:
		if (!msr_info->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
			return 1;
		/* Otherwise falls through */
	default:
		msr = find_msr_entry(to_vmx(vcpu), msr_info->index);
		if (msr) {
			msr_info->data = msr->data;
			break;
		}
		return kvm_get_msr_common(vcpu, msr_info);
	}

	return 0;
}
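
/*
 * Illustrative sketch (not part of the original source): a host-initiated
 * read through the function above.  The caller fills a struct msr_data
 * and the result comes back in msr.data; a non-zero return means the MSR
 * was rejected (e.g. not exposed to this guest).
 *
 *	struct msr_data msr = {
 *		.index = MSR_IA32_SYSENTER_CS,
 *		.host_initiated = true,
 *	};
 *
 *	if (!vmx_get_msr(vcpu, &msr))
 *		pr_debug("SYSENTER_CS = 0x%llx\n", msr.data);
 */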

static void vmx_leave_nested(struct kvm_vcpu *vcpu);

/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct shared_msr_entry *msr;
	int ret = 0;
	u32 msr_index = msr_info->index;
	u64 data = msr_info->data;

	switch (msr_index) {
	case MSR_EFER:
		ret = kvm_set_msr_common(vcpu, msr_info);
		break;
#ifdef CONFIG_X86_64
	case MSR_FS_BASE:
		vmx_segment_cache_clear(vmx);
		vmcs_writel(GUEST_FS_BASE, data);
		break;
	case MSR_GS_BASE:
		vmx_segment_cache_clear(vmx);
		vmcs_writel(GUEST_GS_BASE, data);
		break;
	case MSR_KERNEL_GS_BASE:
		vmx_load_host_state(vmx);
		vmx->msr_guest_kernel_gs_base = data;
		break;
#endif
	case MSR_IA32_SYSENTER_CS:
		vmcs_write32(GUEST_SYSENTER_CS, data);
		break;
	case MSR_IA32_SYSENTER_EIP:
		vmcs_writel(GUEST_SYSENTER_EIP, data);
		break;
	case MSR_IA32_SYSENTER_ESP:
		vmcs_writel(GUEST_SYSENTER_ESP, data);
		break;
	case MSR_IA32_BNDCFGS:
		if (!kvm_mpx_supported() ||
		    (!msr_info->host_initiated &&
		     !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
			return 1;
		if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
		    (data & MSR_IA32_BNDCFGS_RSVD))
			return 1;
		vmcs_write64(GUEST_BNDCFGS, data);
		break;
	case MSR_IA32_TSC:
		kvm_write_tsc(vcpu, msr_info);
		break;
	case MSR_IA32_CR_PAT:
		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
			if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
				return 1;
			vmcs_write64(GUEST_IA32_PAT, data);
			vcpu->arch.pat = data;
			break;
		}
		ret = kvm_set_msr_common(vcpu, msr_info);
		break;
	case MSR_IA32_TSC_ADJUST:
		ret = kvm_set_msr_common(vcpu, msr_info);
		break;
	case MSR_IA32_MCG_EXT_CTL:
		if ((!msr_info->host_initiated &&
		     !(to_vmx(vcpu)->msr_ia32_feature_control &
		       FEATURE_CONTROL_LMCE)) ||
		    (data & ~MCG_EXT_CTL_LMCE_EN))
			return 1;
		vcpu->arch.mcg_ext_ctl = data;
		break;
	case MSR_IA32_FEATURE_CONTROL:
		if (!vmx_feature_control_msr_valid(vcpu, data) ||
		    (to_vmx(vcpu)->msr_ia32_feature_control &
		     FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
			return 1;
		vmx->msr_ia32_feature_control = data;
		if (msr_info->host_initiated && data == 0)
			vmx_leave_nested(vcpu);
		break;
	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
		if (!msr_info->host_initiated)
			return 1; /* they are read-only */
		if (!nested_vmx_allowed(vcpu))
			return 1;
		return vmx_set_vmx_msr(vcpu, msr_index, data);
	case MSR_IA32_XSS:
		if (!vmx_xsaves_supported())
			return 1;
		/*
		 * The only bit defined as of Skylake is bit 8, and
		 * KVM does not support it.
		 */
		if (data != 0)
			return 1;
		vcpu->arch.ia32_xss = data;
		if (vcpu->arch.ia32_xss != host_xss)
			add_atomic_switch_msr(vmx, MSR_IA32_XSS,
				vcpu->arch.ia32_xss, host_xss);
		else
			clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
		break;
	case MSR_TSC_AUX:
		if (!msr_info->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
			return 1;
		/* Check reserved bit, higher 32 bits should be zero */
		if ((data >> 32) != 0)
			return 1;
		/* Otherwise falls through */
	default:
		msr = find_msr_entry(vmx, msr_index);
		if (msr) {
			u64 old_msr_data = msr->data;
			msr->data = data;
			if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
				preempt_disable();
				ret = kvm_set_shared_msr(msr->index, msr->data,
							 msr->mask);
				preempt_enable();
				if (ret)
					msr->data = old_msr_data;
			}
			break;
		}
		ret = kvm_set_msr_common(vcpu, msr_info);
	}

	return ret;
}
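
/*
 * Illustrative sketch (not part of the original source): the locking rule
 * enforced in the MSR_IA32_FEATURE_CONTROL case above.  Once the guest
 * has set FEATURE_CONTROL_LOCKED, only host-initiated writes may change
 * the value; a guest write is refused (return 1).
 *
 *   guest writes 0x5 (LOCKED | VMXON outside SMX) -> accepted, now locked
 *   guest writes 0x1                              -> rejected (locked)
 *   host  writes 0x0 (host_initiated)             -> accepted, and
 *                                                    vmx_leave_nested() runs
 */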

static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
	switch (reg) {
	case VCPU_REGS_RSP:
		vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
		break;
	case VCPU_REGS_RIP:
		vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
		break;
	case VCPU_EXREG_PDPTR:
		if (enable_ept)
			ept_save_pdptrs(vcpu);
		break;
	default:
		break;
	}
}

static __init int cpu_has_kvm_support(void)
{
	return cpu_has_vmx();
}

static __init int vmx_disabled_by_bios(void)
{
	u64 msr;

	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
	if (msr & FEATURE_CONTROL_LOCKED) {
		/* launched w/ TXT and VMX disabled */
		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
			&& tboot_enabled())
			return 1;
		/* launched w/o TXT and VMX only enabled w/ TXT */
		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
			&& (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
			&& !tboot_enabled()) {
			printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
				"activate TXT before enabling KVM\n");
			return 1;
		}
		/* launched w/o TXT and VMX disabled */
		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
			&& !tboot_enabled())
			return 1;
	}

	return 0;
}

static void kvm_cpu_vmxon(u64 addr)
{
	cr4_set_bits(X86_CR4_VMXE);
	intel_pt_handle_vmx(1);

	asm volatile (ASM_VMX_VMXON_RAX
			: : "a"(&addr), "m"(addr)
			: "memory", "cc");
}

static int hardware_enable(void)
{
	int cpu = raw_smp_processor_id();
	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
	u64 old, test_bits;

	if (cr4_read_shadow() & X86_CR4_VMXE)
		return -EBUSY;

	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
	INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
	spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));

	/*
	 * Now we can enable the vmclear operation in kdump
	 * since the loaded_vmcss_on_cpu list on this cpu
	 * has been initialized.
	 *
	 * Though the cpu is not in VMX operation yet, it is safe to
	 * enable the vmclear operation because the loaded_vmcss_on_cpu
	 * list is still empty.
	 */
	crash_enable_local_vmclear(cpu);

	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);

	test_bits = FEATURE_CONTROL_LOCKED;
	test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
	if (tboot_enabled())
		test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;

	if ((old & test_bits) != test_bits) {
		/* enable and lock */
		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
	}
	kvm_cpu_vmxon(phys_addr);
	if (enable_ept)
		ept_sync_global();

	return 0;
}

static void vmclear_local_loaded_vmcss(void)
{
	int cpu = raw_smp_processor_id();
	struct loaded_vmcs *v, *n;

	list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
				 loaded_vmcss_on_cpu_link)
		__loaded_vmcs_clear(v);
}

/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
 * tricks.
 */
static void kvm_cpu_vmxoff(void)
{
	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");

	intel_pt_handle_vmx(0);
	cr4_clear_bits(X86_CR4_VMXE);
}

static void hardware_disable(void)
{
	vmclear_local_loaded_vmcss();
	kvm_cpu_vmxoff();
}

static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
				      u32 msr, u32 *result)
{
	u32 vmx_msr_low, vmx_msr_high;
	u32 ctl = ctl_min | ctl_opt;

	rdmsr(msr, vmx_msr_low, vmx_msr_high);

	ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
	ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */

	/* Ensure minimum (required) set of control bits are supported. */
	if (ctl_min & ~ctl)
		return -EIO;

	*result = ctl;
	return 0;
}
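
/*
 * Illustrative sketch (not part of the original source): a worked example
 * of the reconciliation done above, with made-up MSR values.
 *
 *   ctl_min = 0x0000000a   (bits the caller requires)
 *   ctl_opt = 0x00000100   (bits the caller would like)
 *   rdmsr   -> low  = 0x00000016  (bits that must be 1 on this CPU)
 *              high = 0x0000011e  (bits that may be 1 on this CPU)
 *
 *   ctl = ((ctl_min | ctl_opt) & high) | low
 *       = (0x0000010a & 0x0000011e) | 0x00000016
 *       = 0x0000010a | 0x00000016 = 0x0000011e
 *
 *   ctl_min & ~ctl == 0, so the required bits survived and 0x0000011e is
 *   written to *result; otherwise the function returns -EIO.
 */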

static __init bool allow_1_setting(u32 msr, u32 ctl)
{
	u32 vmx_msr_low, vmx_msr_high;

	rdmsr(msr, vmx_msr_low, vmx_msr_high);
	return vmx_msr_high & ctl;
}

static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
A
Avi Kivity 已提交
3617 3618
{
	u32 vmx_msr_low, vmx_msr_high;
S
Sheng Yang 已提交
3619
	u32 min, opt, min2, opt2;
3620 3621
	u32 _pin_based_exec_control = 0;
	u32 _cpu_based_exec_control = 0;
3622
	u32 _cpu_based_2nd_exec_control = 0;
3623 3624 3625
	u32 _vmexit_control = 0;
	u32 _vmentry_control = 0;

R
Raghavendra K T 已提交
3626
	min = CPU_BASED_HLT_EXITING |
3627 3628 3629 3630
#ifdef CONFIG_X86_64
	      CPU_BASED_CR8_LOAD_EXITING |
	      CPU_BASED_CR8_STORE_EXITING |
#endif
S
Sheng Yang 已提交
3631 3632
	      CPU_BASED_CR3_LOAD_EXITING |
	      CPU_BASED_CR3_STORE_EXITING |
3633 3634
	      CPU_BASED_USE_IO_BITMAPS |
	      CPU_BASED_MOV_DR_EXITING |
M
Marcelo Tosatti 已提交
3635
	      CPU_BASED_USE_TSC_OFFSETING |
A
Avi Kivity 已提交
3636 3637
	      CPU_BASED_INVLPG_EXITING |
	      CPU_BASED_RDPMC_EXITING;
3638

3639 3640 3641 3642
	if (!kvm_mwait_in_guest())
		min |= CPU_BASED_MWAIT_EXITING |
			CPU_BASED_MONITOR_EXITING;

3643
	opt = CPU_BASED_TPR_SHADOW |
S
Sheng Yang 已提交
3644
	      CPU_BASED_USE_MSR_BITMAPS |
3645
	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
3646 3647
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
				&_cpu_based_exec_control) < 0)
Y
Yang, Sheng 已提交
3648
		return -EIO;
3649 3650 3651 3652 3653
#ifdef CONFIG_X86_64
	if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
		_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
					   ~CPU_BASED_CR8_STORE_EXITING;
#endif
3654
	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
S
Sheng Yang 已提交
3655 3656
		min2 = 0;
		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
3657
			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3658
			SECONDARY_EXEC_WBINVD_EXITING |
S
Sheng Yang 已提交
3659
			SECONDARY_EXEC_ENABLE_VPID |
3660
			SECONDARY_EXEC_ENABLE_EPT |
3661
			SECONDARY_EXEC_UNRESTRICTED_GUEST |
3662
			SECONDARY_EXEC_PAUSE_LOOP_EXITING |
3663
			SECONDARY_EXEC_RDTSCP |
3664
			SECONDARY_EXEC_ENABLE_INVPCID |
3665
			SECONDARY_EXEC_APIC_REGISTER_VIRT |
3666
			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
W
Wanpeng Li 已提交
3667
			SECONDARY_EXEC_SHADOW_VMCS |
K
Kai Huang 已提交
3668
			SECONDARY_EXEC_XSAVES |
3669 3670
			SECONDARY_EXEC_RDSEED_EXITING |
			SECONDARY_EXEC_RDRAND_EXITING |
X
Xiao Guangrong 已提交
3671
			SECONDARY_EXEC_ENABLE_PML |
B
Bandan Das 已提交
3672 3673
			SECONDARY_EXEC_TSC_SCALING |
			SECONDARY_EXEC_ENABLE_VMFUNC;
S
Sheng Yang 已提交
3674 3675
		if (adjust_vmx_controls(min2, opt2,
					MSR_IA32_VMX_PROCBASED_CTLS2,
3676 3677 3678 3679 3680 3681 3682 3683
					&_cpu_based_2nd_exec_control) < 0)
			return -EIO;
	}
#ifndef CONFIG_X86_64
	if (!(_cpu_based_2nd_exec_control &
				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
		_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif
3684 3685 3686

	if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
		_cpu_based_2nd_exec_control &= ~(
3687
				SECONDARY_EXEC_APIC_REGISTER_VIRT |
3688 3689
				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
				SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3690

3691 3692 3693
	rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
		&vmx_capability.ept, &vmx_capability.vpid);

S
Sheng Yang 已提交
3694
	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
M
Marcelo Tosatti 已提交
3695 3696
		/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
		   enabled */
3697 3698 3699
		_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
					     CPU_BASED_CR3_STORE_EXITING |
					     CPU_BASED_INVLPG_EXITING);
3700 3701 3702 3703 3704 3705 3706 3707 3708 3709
	} else if (vmx_capability.ept) {
		vmx_capability.ept = 0;
		pr_warn_once("EPT CAP should not exist if not support "
				"1-setting enable EPT VM-execution control\n");
	}
	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
		vmx_capability.vpid) {
		vmx_capability.vpid = 0;
		pr_warn_once("VPID CAP should not exist if not support "
				"1-setting enable VPID VM-execution control\n");
S
Sheng Yang 已提交
3710
	}
3711

3712
	min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
3713 3714 3715
#ifdef CONFIG_X86_64
	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
#endif
3716
	opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
3717
		VM_EXIT_CLEAR_BNDCFGS;
3718 3719
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
				&_vmexit_control) < 0)
Y
Yang, Sheng 已提交
3720
		return -EIO;
3721

3722 3723 3724
	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
		 PIN_BASED_VMX_PREEMPTION_TIMER;
3725 3726 3727 3728
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
				&_pin_based_exec_control) < 0)
		return -EIO;

3729 3730
	if (cpu_has_broken_vmx_preemption_timer())
		_pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
3731
	if (!(_cpu_based_2nd_exec_control &
3732
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
3733 3734
		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;

3735
	min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
3736
	opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
3737 3738
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
				&_vmentry_control) < 0)
Y
Yang, Sheng 已提交
3739
		return -EIO;
A
Avi Kivity 已提交
3740

N
Nguyen Anh Quynh 已提交
3741
	rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
3742 3743 3744

	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
	if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
Y
Yang, Sheng 已提交
3745
		return -EIO;
3746 3747 3748 3749

#ifdef CONFIG_X86_64
	/* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
	if (vmx_msr_high & (1u<<16))
Y
Yang, Sheng 已提交
3750
		return -EIO;
3751 3752 3753 3754
#endif

	/* Require Write-Back (WB) memory type for VMCS accesses. */
	if (((vmx_msr_high >> 18) & 15) != 6)
Y
Yang, Sheng 已提交
3755
		return -EIO;
3756

Y
Yang, Sheng 已提交
3757
	vmcs_conf->size = vmx_msr_high & 0x1fff;
3758
	vmcs_conf->order = get_order(vmcs_conf->size);
3759
	vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
Y
Yang, Sheng 已提交
3760
	vmcs_conf->revision_id = vmx_msr_low;
3761

Y
Yang, Sheng 已提交
3762 3763
	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
3764
	vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
Y
Yang, Sheng 已提交
3765 3766
	vmcs_conf->vmexit_ctrl         = _vmexit_control;
	vmcs_conf->vmentry_ctrl        = _vmentry_control;
3767

A
Avi Kivity 已提交
3768 3769 3770 3771 3772 3773
	cpu_has_load_ia32_efer =
		allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
				VM_ENTRY_LOAD_IA32_EFER)
		&& allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
				   VM_EXIT_LOAD_IA32_EFER);

3774 3775 3776 3777 3778 3779 3780 3781
	cpu_has_load_perf_global_ctrl =
		allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
				VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
		&& allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
				   VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);

	/*
	 * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
A
Andrea Gelmini 已提交
3782
	 * but due to errata below it can't be used. Workaround is to use
3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809
	 * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
	 *
	 * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
	 *
	 * AAK155             (model 26)
	 * AAP115             (model 30)
	 * AAT100             (model 37)
	 * BC86,AAY89,BD102   (model 44)
	 * BA97               (model 46)
	 *
	 */
	if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
		switch (boot_cpu_data.x86_model) {
		case 26:
		case 30:
		case 37:
		case 44:
		case 46:
			cpu_has_load_perf_global_ctrl = false;
			printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
					"does not work properly. Using workaround\n");
			break;
		default:
			break;
		}
	}

3810
	if (boot_cpu_has(X86_FEATURE_XSAVES))
W
Wanpeng Li 已提交
3811 3812
		rdmsrl(MSR_IA32_XSS, host_xss);

3813
	return 0;
N
Nguyen Anh Quynh 已提交
3814
}

static struct vmcs *alloc_vmcs_cpu(int cpu)
{
	int node = cpu_to_node(cpu);
	struct page *pages;
	struct vmcs *vmcs;

	pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
	if (!pages)
		return NULL;
	vmcs = page_address(pages);
	memset(vmcs, 0, vmcs_config.size);
	vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
	return vmcs;
}

static struct vmcs *alloc_vmcs(void)
{
	return alloc_vmcs_cpu(raw_smp_processor_id());
}

static void free_vmcs(struct vmcs *vmcs)
{
	free_pages((unsigned long)vmcs, vmcs_config.order);
}

/*
 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
 */
static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
{
	if (!loaded_vmcs->vmcs)
		return;
	loaded_vmcs_clear(loaded_vmcs);
	free_vmcs(loaded_vmcs->vmcs);
	loaded_vmcs->vmcs = NULL;
	WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
}

static void free_kvm_area(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		free_vmcs(per_cpu(vmxarea, cpu));
		per_cpu(vmxarea, cpu) = NULL;
	}
}

enum vmcs_field_type {
	VMCS_FIELD_TYPE_U16 = 0,
	VMCS_FIELD_TYPE_U64 = 1,
	VMCS_FIELD_TYPE_U32 = 2,
	VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
};

static inline int vmcs_field_type(unsigned long field)
{
	if (0x1 & field)	/* the *_HIGH fields are all 32 bit */
		return VMCS_FIELD_TYPE_U32;
	return (field >> 13) & 0x3 ;
}

static inline int vmcs_field_readonly(unsigned long field)
{
	return (((field >> 10) & 0x3) == 1);
}
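
/*
 * Illustrative sketch (not part of the original source): how the two
 * helpers above decode a VMCS field encoding.  Width lives in bits 14:13
 * and the field class in bits 11:10; an odd encoding is the *_HIGH alias
 * for the upper half of a 64-bit field.  Example encodings (SDM layout):
 *
 *   GUEST_ES_LIMIT   = 0x4800 -> (0x4800 >> 13) & 3 == 2 -> VMCS_FIELD_TYPE_U32
 *   GUEST_RIP        = 0x681e -> (0x681e >> 13) & 3 == 3 -> NATURAL_WIDTH
 *   VM_EXIT_REASON   = 0x4402 -> ((0x4402 >> 10) & 3) == 1 -> read-only
 *   IO_BITMAP_A_HIGH = 0x2001 -> odd, so treated as a 32-bit access
 */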

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	/* No checks for read only fields yet */

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		switch (shadow_read_write_fields[i]) {
		case GUEST_BNDCFGS:
3892
			if (!kvm_mpx_supported())
3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907
				continue;
			break;
		default:
			break;
		}

		if (j < i)
			shadow_read_write_fields[j] =
				shadow_read_write_fields[i];
		j++;
	}
	max_shadow_read_write_fields = j;

	/* shadowed fields guest access without vmexit */
	for (i = 0; i < max_shadow_read_write_fields; i++) {
3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922
		unsigned long field = shadow_read_write_fields[i];

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) {
			clear_bit(field + 1, vmx_vmwrite_bitmap);
			clear_bit(field + 1, vmx_vmread_bitmap);
		}
	}
	for (i = 0; i < max_shadow_read_only_fields; i++) {
		unsigned long field = shadow_read_only_fields[i];

		clear_bit(field, vmx_vmread_bitmap);
		if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64)
			clear_bit(field + 1, vmx_vmread_bitmap);
3923 3924 3925
	}
}

A
Avi Kivity 已提交
3926 3927 3928 3929
static __init int alloc_kvm_area(void)
{
	int cpu;

Z
Zachary Amsden 已提交
3930
	for_each_possible_cpu(cpu) {
A
Avi Kivity 已提交
3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943
		struct vmcs *vmcs;

		vmcs = alloc_vmcs_cpu(cpu);
		if (!vmcs) {
			free_kvm_area();
			return -ENOMEM;
		}

		per_cpu(vmxarea, cpu) = vmcs;
	}
	return 0;
}

3944
static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
3945
		struct kvm_segment *save)
A
Avi Kivity 已提交
3946
{
3947 3948 3949 3950 3951 3952 3953 3954 3955
	if (!emulate_invalid_guest_state) {
		/*
		 * CS and SS RPL should be equal during guest entry according
		 * to VMX spec, but in reality it is not always so. Since vcpu
		 * is in the middle of the transition from real mode to
		 * protected mode it is safe to assume that RPL 0 is a good
		 * default value.
		 */
		if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
3956 3957
			save->selector &= ~SEGMENT_RPL_MASK;
		save->dpl = save->selector & SEGMENT_RPL_MASK;
3958
		save->s = 1;
A
Avi Kivity 已提交
3959
	}
3960
	vmx_set_segment(vcpu, save, seg);
A
Avi Kivity 已提交
3961 3962 3963 3964 3965
}

static void enter_pmode(struct kvm_vcpu *vcpu)
{
	unsigned long flags;
3966
	struct vcpu_vmx *vmx = to_vmx(vcpu);
A
Avi Kivity 已提交
3967

3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978
	/*
	 * Update real mode segment cache. It may not be up to date if a segment
	 * register was written while the vcpu was in guest mode.
	 */
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);

3979
	vmx->rmode.vm86_active = 0;
A
Avi Kivity 已提交
3980

A
Avi Kivity 已提交
3981 3982
	vmx_segment_cache_clear(vmx);

3983
	vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
A
Avi Kivity 已提交
3984 3985

	flags = vmcs_readl(GUEST_RFLAGS);
3986 3987
	flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
	flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
A
Avi Kivity 已提交
3988 3989
	vmcs_writel(GUEST_RFLAGS, flags);

3990 3991
	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
			(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
A
Avi Kivity 已提交
3992 3993 3994

	update_exception_bitmap(vcpu);

3995 3996 3997 3998 3999 4000
	fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
	fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
	fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
	fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
	fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
	fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
A
Avi Kivity 已提交
4001 4002
}

4003
static void fix_rmode_seg(int seg, struct kvm_segment *save)
A
Avi Kivity 已提交
4004
{
4005
	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028
	struct kvm_segment var = *save;

	var.dpl = 0x3;
	if (seg == VCPU_SREG_CS)
		var.type = 0x3;

	if (!emulate_invalid_guest_state) {
		var.selector = var.base >> 4;
		var.base = var.base & 0xffff0;
		var.limit = 0xffff;
		var.g = 0;
		var.db = 0;
		var.present = 1;
		var.s = 1;
		var.l = 0;
		var.unusable = 0;
		var.type = 0x3;
		var.avl = 0;
		if (save->base & 0xf)
			printk_once(KERN_WARNING "kvm: segment base is not "
					"paragraph aligned when entering "
					"protected mode (seg=%d)", seg);
	}
A
Avi Kivity 已提交
4029

4030
	vmcs_write16(sf->selector, var.selector);
4031
	vmcs_writel(sf->base, var.base);
4032 4033
	vmcs_write32(sf->limit, var.limit);
	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
A
Avi Kivity 已提交
4034 4035 4036 4037 4038
}

static void enter_rmode(struct kvm_vcpu *vcpu)
{
	unsigned long flags;
4039
	struct vcpu_vmx *vmx = to_vmx(vcpu);
A
Avi Kivity 已提交
4040

4041 4042 4043 4044 4045
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
4046 4047
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
4048

4049
	vmx->rmode.vm86_active = 1;
A
Avi Kivity 已提交
4050

4051 4052
	/*
	 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
4053
	 * vcpu. Warn the user that an update is overdue.
4054
	 */
4055
	if (!vcpu->kvm->arch.tss_addr)
4056 4057 4058
		printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
			     "called before entering vcpu\n");

A
Avi Kivity 已提交
4059 4060
	vmx_segment_cache_clear(vmx);

4061
	vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr);
A
Avi Kivity 已提交
4062 4063 4064 4065
	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

	flags = vmcs_readl(GUEST_RFLAGS);
4066
	vmx->rmode.save_rflags = flags;
A
Avi Kivity 已提交
4067

4068
	flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
A
Avi Kivity 已提交
4069 4070

	vmcs_writel(GUEST_RFLAGS, flags);
4071
	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
A
Avi Kivity 已提交
4072 4073
	update_exception_bitmap(vcpu);

4074 4075 4076 4077 4078 4079
	fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
	fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
	fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
	fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
	fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
	fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
4080

4081
	kvm_mmu_reset_context(vcpu);
A
Avi Kivity 已提交
4082 4083
}

4084 4085 4086
static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
4087 4088 4089 4090
	struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);

	if (!msr)
		return;
4091

4092 4093 4094 4095 4096
	/*
	 * Force kernel_gs_base reloading before EFER changes, as control
	 * of this msr depends on is_long_mode().
	 */
	vmx_load_host_state(to_vmx(vcpu));
4097
	vcpu->arch.efer = efer;
4098
	if (efer & EFER_LMA) {
4099
		vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
4100 4101
		msr->data = efer;
	} else {
4102
		vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
4103 4104 4105 4106 4107 4108

		msr->data = efer & ~EFER_LME;
	}
	setup_msrs(vmx);
}

4109
#ifdef CONFIG_X86_64
A
Avi Kivity 已提交
4110 4111 4112 4113 4114

static void enter_lmode(struct kvm_vcpu *vcpu)
{
	u32 guest_tr_ar;

A
Avi Kivity 已提交
4115 4116
	vmx_segment_cache_clear(to_vmx(vcpu));

A
Avi Kivity 已提交
4117
	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
4118
	if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
4119 4120
		pr_debug_ratelimited("%s: tss fixup for long mode. \n",
				     __func__);
A
Avi Kivity 已提交
4121
		vmcs_write32(GUEST_TR_AR_BYTES,
4122 4123
			     (guest_tr_ar & ~VMX_AR_TYPE_MASK)
			     | VMX_AR_TYPE_BUSY_64_TSS);
A
Avi Kivity 已提交
4124
	}
4125
	vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
A
Avi Kivity 已提交
4126 4127 4128 4129
}

static void exit_lmode(struct kvm_vcpu *vcpu)
{
4130
	vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
4131
	vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
A
Avi Kivity 已提交
4132 4133 4134 4135
}

#endif

4136
static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
4137
{
4138 4139 4140
	if (enable_ept) {
		if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
			return;
4141
		ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
4142 4143
	} else {
		vpid_sync_context(vpid);
4144
	}
4145 4146
}

4147 4148 4149 4150 4151
static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
{
	__vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
}

4152 4153 4154 4155 4156 4157
static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu)
{
	if (enable_ept)
		vmx_flush_tlb(vcpu);
}

4158 4159 4160 4161 4162 4163 4164 4165
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
	ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;

	vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
	vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
}

4166 4167 4168 4169 4170 4171 4172
static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
{
	if (enable_ept && is_paging(vcpu))
		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
}

4173
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
4174
{
4175 4176 4177 4178
	ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;

	vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
	vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
4179 4180
}

4181 4182
static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
G
Gleb Natapov 已提交
4183 4184
	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

A
Avi Kivity 已提交
4185 4186 4187 4188
	if (!test_bit(VCPU_EXREG_PDPTR,
		      (unsigned long *)&vcpu->arch.regs_dirty))
		return;

4189
	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
G
Gleb Natapov 已提交
4190 4191 4192 4193
		vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
		vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
		vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
		vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
4194 4195 4196
	}
}

4197 4198
static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
{
G
Gleb Natapov 已提交
4199 4200
	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

4201
	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
G
Gleb Natapov 已提交
4202 4203 4204 4205
		mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
		mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
		mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
		mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
4206
	}
A
Avi Kivity 已提交
4207 4208 4209 4210 4211

	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_avail);
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_dirty);
4212 4213
}

static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
	u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0;
	u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1;
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
		SECONDARY_EXEC_UNRESTRICTED_GUEST &&
	    nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
		fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);

	return fixed_bits_valid(val, fixed0, fixed1);
}

static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
	u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0;
	u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1;

	return fixed_bits_valid(val, fixed0, fixed1);
}

static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
	u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed0;
	u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed1;

	return fixed_bits_valid(val, fixed0, fixed1);
}

/* No difference in the restrictions on guest and host CR4 in VMX operation. */
#define nested_guest_cr4_valid	nested_cr4_valid
#define nested_host_cr4_valid	nested_cr4_valid

static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
4249 4250 4251 4252 4253

static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
					unsigned long cr0,
					struct kvm_vcpu *vcpu)
{
4254 4255
	if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
		vmx_decache_cr3(vcpu);
4256 4257 4258
	if (!(cr0 & X86_CR0_PG)) {
		/* From paging/starting to nonpaging */
		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
4259
			     vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
4260 4261 4262
			     (CPU_BASED_CR3_LOAD_EXITING |
			      CPU_BASED_CR3_STORE_EXITING));
		vcpu->arch.cr0 = cr0;
4263
		vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
4264 4265 4266
	} else if (!is_paging(vcpu)) {
		/* From nonpaging to paging */
		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
4267
			     vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
4268 4269 4270
			     ~(CPU_BASED_CR3_LOAD_EXITING |
			       CPU_BASED_CR3_STORE_EXITING));
		vcpu->arch.cr0 = cr0;
4271
		vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
4272
	}
4273 4274 4275

	if (!(cr0 & X86_CR0_WP))
		*hw_cr0 &= ~X86_CR0_WP;
4276 4277
}

A
Avi Kivity 已提交
4278 4279
static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
4280
	struct vcpu_vmx *vmx = to_vmx(vcpu);
4281 4282
	unsigned long hw_cr0;

G
Gleb Natapov 已提交
4283
	hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
4284
	if (enable_unrestricted_guest)
G
Gleb Natapov 已提交
4285
		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
4286
	else {
G
Gleb Natapov 已提交
4287
		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
4288

4289 4290
		if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
			enter_pmode(vcpu);
A
Avi Kivity 已提交
4291

4292 4293 4294
		if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
			enter_rmode(vcpu);
	}
A
Avi Kivity 已提交
4295

4296
#ifdef CONFIG_X86_64
4297
	if (vcpu->arch.efer & EFER_LME) {
4298
		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
A
Avi Kivity 已提交
4299
			enter_lmode(vcpu);
4300
		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
A
Avi Kivity 已提交
4301 4302 4303 4304
			exit_lmode(vcpu);
	}
#endif

4305
	if (enable_ept)
4306 4307
		ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);

A
Avi Kivity 已提交
4308
	vmcs_writel(CR0_READ_SHADOW, cr0);
4309
	vmcs_writel(GUEST_CR0, hw_cr0);
4310
	vcpu->arch.cr0 = cr0;
4311 4312 4313

	/* depends on vcpu->arch.cr0 to be set to a new value */
	vmx->emulation_required = emulation_required(vcpu);
A
Avi Kivity 已提交
4314 4315
}

static int get_ept_level(struct kvm_vcpu *vcpu)
{
	if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
		return 5;
	return 4;
}

static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
{
	u64 eptp = VMX_EPTP_MT_WB;

	eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;

	if (enable_ept_ad_bits &&
	    (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
		eptp |= VMX_EPTP_AD_ENABLE_BIT;
	eptp |= (root_hpa & PAGE_MASK);

	return eptp;
}
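
/*
 * Illustrative sketch (not part of the original source): the EPTP layout
 * produced above, using a hypothetical root at 0x12345000 with a 4-level
 * walk and A/D bits enabled.
 *
 *   bits  2:0  memory type        = 6   (write-back)
 *   bits  5:3  page-walk length-1 = 3   (4-level)  -> 0x18
 *   bit   6    enable A/D bits               -> 0x40
 *   bits 51:12 physical address of the root table
 *
 *   eptp = 0x12345000 | 0x06 | 0x18 | 0x40 = 0x1234505e
 */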

static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	unsigned long guest_cr3;
	u64 eptp;

	guest_cr3 = cr3;
	if (enable_ept) {
		eptp = construct_eptp(vcpu, cr3);
		vmcs_write64(EPT_POINTER, eptp);
		if (is_paging(vcpu) || is_guest_mode(vcpu))
			guest_cr3 = kvm_read_cr3(vcpu);
		else
			guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr;
		ept_load_pdptrs(vcpu);
	}

	vmx_flush_tlb(vcpu);
	vmcs_writel(GUEST_CR3, guest_cr3);
}

4357
static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
A
Avi Kivity 已提交
4358
{
4359 4360 4361 4362 4363 4364 4365 4366 4367 4368
	/*
	 * Pass through host's Machine Check Enable value to hw_cr4, which
	 * is in force while we are in guest mode.  Do not let guests control
	 * this bit, even if host CR4.MCE == 0.
	 */
	unsigned long hw_cr4 =
		(cr4_read_shadow() & X86_CR4_MCE) |
		(cr4 & ~X86_CR4_MCE) |
		(to_vmx(vcpu)->rmode.vm86_active ?
		 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
4369

4370 4371 4372 4373 4374 4375 4376 4377 4378
	if (cr4 & X86_CR4_VMXE) {
		/*
		 * To use VMXON (and later other VMX instructions), a guest
		 * must first be able to turn on cr4.VMXE (see handle_vmon()).
		 * So basically the check on whether to allow nested VMX
		 * is here.
		 */
		if (!nested_vmx_allowed(vcpu))
			return 1;
4379
	}
4380 4381

	if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
4382 4383
		return 1;

4384
	vcpu->arch.cr4 = cr4;
4385 4386 4387 4388 4389 4390 4391 4392
	if (enable_ept) {
		if (!is_paging(vcpu)) {
			hw_cr4 &= ~X86_CR4_PAE;
			hw_cr4 |= X86_CR4_PSE;
		} else if (!(cr4 & X86_CR4_PAE)) {
			hw_cr4 &= ~X86_CR4_PAE;
		}
	}
4393

4394 4395
	if (!enable_unrestricted_guest && !is_paging(vcpu))
		/*
4396 4397 4398 4399 4400 4401 4402 4403 4404
		 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
		 * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
		 * to be manually disabled when guest switches to non-paging
		 * mode.
		 *
		 * If !enable_unrestricted_guest, the CPU is always running
		 * with CR0.PG=1 and CR4 needs to be modified.
		 * If enable_unrestricted_guest, the CPU automatically
		 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
4405
		 */
4406
		hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
4407

4408 4409
	vmcs_writel(CR4_READ_SHADOW, cr4);
	vmcs_writel(GUEST_CR4, hw_cr4);
4410
	return 0;
A
Avi Kivity 已提交
4411 4412 4413 4414 4415
}

static void vmx_get_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
4416
	struct vcpu_vmx *vmx = to_vmx(vcpu);
A
Avi Kivity 已提交
4417 4418
	u32 ar;

4419
	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
4420
		*var = vmx->rmode.segs[seg];
4421
		if (seg == VCPU_SREG_TR
A
Avi Kivity 已提交
4422
		    || var->selector == vmx_read_guest_seg_selector(vmx, seg))
4423
			return;
4424 4425 4426
		var->base = vmx_read_guest_seg_base(vmx, seg);
		var->selector = vmx_read_guest_seg_selector(vmx, seg);
		return;
4427
	}
A
Avi Kivity 已提交
4428 4429 4430 4431
	var->base = vmx_read_guest_seg_base(vmx, seg);
	var->limit = vmx_read_guest_seg_limit(vmx, seg);
	var->selector = vmx_read_guest_seg_selector(vmx, seg);
	ar = vmx_read_guest_seg_ar(vmx, seg);
4432
	var->unusable = (ar >> 16) & 1;
A
Avi Kivity 已提交
4433 4434 4435
	var->type = ar & 15;
	var->s = (ar >> 4) & 1;
	var->dpl = (ar >> 5) & 3;
4436 4437 4438 4439 4440 4441 4442 4443
	/*
	 * Some userspaces do not preserve unusable property. Since usable
	 * segment has to be present according to VMX spec we can use present
	 * property to amend userspace bug by making unusable segment always
	 * nonpresent. vmx_segment_access_rights() already marks nonpresent
	 * segment as unusable.
	 */
	var->present = !var->unusable;
A
Avi Kivity 已提交
4444 4445 4446 4447 4448 4449
	var->avl = (ar >> 12) & 1;
	var->l = (ar >> 13) & 1;
	var->db = (ar >> 14) & 1;
	var->g = (ar >> 15) & 1;
}

4450 4451 4452 4453 4454 4455 4456 4457
static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
	struct kvm_segment s;

	if (to_vmx(vcpu)->rmode.vm86_active) {
		vmx_get_segment(vcpu, &s, seg);
		return s.base;
	}
A
Avi Kivity 已提交
4458
	return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
4459 4460
}

4461
static int vmx_get_cpl(struct kvm_vcpu *vcpu)
4462
{
4463 4464
	struct vcpu_vmx *vmx = to_vmx(vcpu);

P
Paolo Bonzini 已提交
4465
	if (unlikely(vmx->rmode.vm86_active))
4466
		return 0;
P
Paolo Bonzini 已提交
4467 4468
	else {
		int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
4469
		return VMX_AR_DPL(ar);
A
Avi Kivity 已提交
4470 4471 4472
	}
}

static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
	u32 ar;

	if (var->unusable || !var->present)
		ar = 1 << 16;
	else {
		ar = var->type & 15;
		ar |= (var->s & 1) << 4;
		ar |= (var->dpl & 3) << 5;
		ar |= (var->present & 1) << 7;
		ar |= (var->avl & 1) << 12;
		ar |= (var->l & 1) << 13;
		ar |= (var->db & 1) << 14;
		ar |= (var->g & 1) << 15;
	}

	return ar;
}
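
/*
 * Illustrative sketch (not part of the original source): packing a typical
 * 64-bit code segment with the helper above.
 *
 *   type=0xb, s=1, dpl=0, present=1, avl=0, l=1, db=0, g=1, unusable=0
 *
 *   ar = 0xb | (1 << 4) | (1 << 7) | (1 << 13) | (1 << 15) = 0xa09b
 *
 * An unusable (or not-present) segment collapses to 1 << 16 = 0x10000,
 * which vmx_get_segment() decodes back via bit 16.
 */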

static void vmx_set_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
4496
	struct vcpu_vmx *vmx = to_vmx(vcpu);
4497
	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
4498

A
Avi Kivity 已提交
4499 4500
	vmx_segment_cache_clear(vmx);

4501 4502 4503 4504 4505 4506
	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
		vmx->rmode.segs[seg] = *var;
		if (seg == VCPU_SREG_TR)
			vmcs_write16(sf->selector, var->selector);
		else if (var->s)
			fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
4507
		goto out;
4508
	}
4509

4510 4511 4512
	vmcs_writel(sf->base, var->base);
	vmcs_write32(sf->limit, var->limit);
	vmcs_write16(sf->selector, var->selector);
4513 4514 4515 4516 4517 4518

	/*
	 *   Fix the "Accessed" bit in AR field of segment registers for older
	 * qemu binaries.
	 *   IA32 arch specifies that at the time of processor reset the
	 * "Accessed" bit in the AR field of segment registers is 1. And qemu
G
Guo Chao 已提交
4519
	 * is setting it to 0 in the userland code. This causes invalid guest
4520 4521 4522 4523 4524 4525
	 * state vmexit when "unrestricted guest" mode is turned on.
	 *    Fix for this setup issue in cpu_reset is being pushed in the qemu
	 * tree. Newer qemu binaries with that qemu fix would not need this
	 * kvm hack.
	 */
	if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
4526
		var->type |= 0x1; /* Accessed */
4527

4528
	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
4529 4530

out:
4531
	vmx->emulation_required = emulation_required(vcpu);
A
Avi Kivity 已提交
4532 4533 4534 4535
}

static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
A
Avi Kivity 已提交
4536
	u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
A
Avi Kivity 已提交
4537 4538 4539 4540 4541

	*db = (ar >> 14) & 1;
	*l = (ar >> 13) & 1;
}

4542
static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
A
Avi Kivity 已提交
4543
{
4544 4545
	dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
	dt->address = vmcs_readl(GUEST_IDTR_BASE);
A
Avi Kivity 已提交
4546 4547
}

4548
static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
A
Avi Kivity 已提交
4549
{
4550 4551
	vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
	vmcs_writel(GUEST_IDTR_BASE, dt->address);
A
Avi Kivity 已提交
4552 4553
}

4554
static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
A
Avi Kivity 已提交
4555
{
4556 4557
	dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
	dt->address = vmcs_readl(GUEST_GDTR_BASE);
A
Avi Kivity 已提交
4558 4559
}

4560
static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
A
Avi Kivity 已提交
4561
{
4562 4563
	vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
	vmcs_writel(GUEST_GDTR_BASE, dt->address);
A
Avi Kivity 已提交
4564 4565
}

4566 4567 4568 4569 4570 4571
static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
{
	struct kvm_segment var;
	u32 ar;

	vmx_get_segment(vcpu, &var, seg);
4572
	var.dpl = 0x3;
4573 4574
	if (seg == VCPU_SREG_CS)
		var.type = 0x3;
4575 4576 4577 4578
	ar = vmx_segment_access_rights(&var);

	if (var.base != (var.selector << 4))
		return false;
4579
	if (var.limit != 0xffff)
4580
		return false;
4581
	if (ar != 0xf3)
4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592
		return false;

	return true;
}

static bool code_segment_valid(struct kvm_vcpu *vcpu)
{
	struct kvm_segment cs;
	unsigned int cs_rpl;

	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4593
	cs_rpl = cs.selector & SEGMENT_RPL_MASK;
4594

4595 4596
	if (cs.unusable)
		return false;
4597
	if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
4598 4599 4600
		return false;
	if (!cs.s)
		return false;
4601
	if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
4602 4603
		if (cs.dpl > cs_rpl)
			return false;
4604
	} else {
4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620
		if (cs.dpl != cs_rpl)
			return false;
	}
	if (!cs.present)
		return false;

	/* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
	return true;
}

static bool stack_segment_valid(struct kvm_vcpu *vcpu)
{
	struct kvm_segment ss;
	unsigned int ss_rpl;

	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
4621
	ss_rpl = ss.selector & SEGMENT_RPL_MASK;
4622

4623 4624 4625
	if (ss.unusable)
		return true;
	if (ss.type != 3 && ss.type != 7)
4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642
		return false;
	if (!ss.s)
		return false;
	if (ss.dpl != ss_rpl) /* DPL != RPL */
		return false;
	if (!ss.present)
		return false;

	return true;
}

static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
{
	struct kvm_segment var;
	unsigned int rpl;

	vmx_get_segment(vcpu, &var, seg);
4643
	rpl = var.selector & SEGMENT_RPL_MASK;
4644

4645 4646
	if (var.unusable)
		return true;
4647 4648 4649 4650
	if (!var.s)
		return false;
	if (!var.present)
		return false;
4651
	if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667
		if (var.dpl < rpl) /* DPL < RPL */
			return false;
	}

	/* TODO: Add other members to kvm_segment_field to allow checking for other access
	 * rights flags
	 */
	return true;
}

static bool tr_valid(struct kvm_vcpu *vcpu)
{
	struct kvm_segment tr;

	vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);

4668 4669
	if (tr.unusable)
		return false;
4670
	if (tr.selector & SEGMENT_TI_MASK)	/* TI = 1 */
4671
		return false;
4672
	if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685
		return false;
	if (!tr.present)
		return false;

	return true;
}

static bool ldtr_valid(struct kvm_vcpu *vcpu)
{
	struct kvm_segment ldtr;

	vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);

4686 4687
	if (ldtr.unusable)
		return true;
4688
	if (ldtr.selector & SEGMENT_TI_MASK)	/* TI = 1 */
4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704
		return false;
	if (ldtr.type != 2)
		return false;
	if (!ldtr.present)
		return false;

	return true;
}

static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
{
	struct kvm_segment cs, ss;

	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);

4705 4706
	return ((cs.selector & SEGMENT_RPL_MASK) ==
		 (ss.selector & SEGMENT_RPL_MASK));
4707 4708 4709 4710 4711 4712 4713 4714 4715
}

/*
 * Check if guest state is valid. Returns true if valid, false if
 * not.
 * We assume that registers are always usable
 */
static bool guest_state_valid(struct kvm_vcpu *vcpu)
{
4716 4717 4718
	if (enable_unrestricted_guest)
		return true;

4719
	/* real mode guest state checks */
4720
	if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761
		if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
			return false;
		if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
			return false;
		if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
			return false;
		if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
			return false;
		if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
			return false;
		if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
			return false;
	} else {
	/* protected mode guest state checks */
		if (!cs_ss_rpl_check(vcpu))
			return false;
		if (!code_segment_valid(vcpu))
			return false;
		if (!stack_segment_valid(vcpu))
			return false;
		if (!data_segment_valid(vcpu, VCPU_SREG_DS))
			return false;
		if (!data_segment_valid(vcpu, VCPU_SREG_ES))
			return false;
		if (!data_segment_valid(vcpu, VCPU_SREG_FS))
			return false;
		if (!data_segment_valid(vcpu, VCPU_SREG_GS))
			return false;
		if (!tr_valid(vcpu))
			return false;
		if (!ldtr_valid(vcpu))
			return false;
	}
	/* TODO:
	 * - Add checks on RIP
	 * - Add checks on RFLAGS
	 */

	return true;
}

4762 4763 4764 4765 4766
static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
}

M
Mike Day 已提交
4767
static int init_rmode_tss(struct kvm *kvm)
A
Avi Kivity 已提交
4768
{
4769
	gfn_t fn;
4770
	u16 data = 0;
4771
	int idx, r;
A
Avi Kivity 已提交
4772

4773
	idx = srcu_read_lock(&kvm->srcu);
4774
	fn = kvm->arch.tss_addr >> PAGE_SHIFT;
4775 4776
	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
	if (r < 0)
4777
		goto out;
4778
	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
4779 4780
	r = kvm_write_guest_page(kvm, fn++, &data,
			TSS_IOPB_BASE_OFFSET, sizeof(u16));
4781
	if (r < 0)
4782
		goto out;
4783 4784
	r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
	if (r < 0)
4785
		goto out;
4786 4787
	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
	if (r < 0)
4788
		goto out;
4789
	data = ~0;
4790 4791 4792 4793
	r = kvm_write_guest_page(kvm, fn, &data,
				 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
				 sizeof(u8));
out:
4794
	srcu_read_unlock(&kvm->srcu, idx);
4795
	return r;
A
Avi Kivity 已提交
4796 4797
}

4798 4799
static int init_rmode_identity_map(struct kvm *kvm)
{
4800
	int i, idx, r = 0;
D
Dan Williams 已提交
4801
	kvm_pfn_t identity_map_pfn;
4802 4803
	u32 tmp;

4804 4805 4806
	/* Protect kvm->arch.ept_identity_pagetable_done. */
	mutex_lock(&kvm->slots_lock);

4807
	if (likely(kvm->arch.ept_identity_pagetable_done))
4808 4809
		goto out2;

4810 4811
	if (!kvm->arch.ept_identity_map_addr)
		kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
4812
	identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
4813

4814 4815
	r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
				    kvm->arch.ept_identity_map_addr, PAGE_SIZE);
4816
	if (r < 0)
4817 4818
		goto out2;

4819
	idx = srcu_read_lock(&kvm->srcu);
4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832
	r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
	if (r < 0)
		goto out;
	/* Set up identity-mapping pagetable for EPT in real mode */
	for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
		tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
			_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
		r = kvm_write_guest_page(kvm, identity_map_pfn,
				&tmp, i * sizeof(tmp), sizeof(tmp));
		if (r < 0)
			goto out;
	}
	kvm->arch.ept_identity_pagetable_done = true;
4833

4834
out:
4835
	srcu_read_unlock(&kvm->srcu, idx);
4836 4837 4838

out2:
	mutex_unlock(&kvm->slots_lock);
4839
	return r;
4840 4841
}

A
Avi Kivity 已提交
4842 4843
static void seg_setup(int seg)
{
4844
	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
4845
	unsigned int ar;
A
Avi Kivity 已提交
4846 4847 4848 4849

	vmcs_write16(sf->selector, 0);
	vmcs_writel(sf->base, 0);
	vmcs_write32(sf->limit, 0xffff);
4850 4851 4852
	ar = 0x93;
	if (seg == VCPU_SREG_CS)
		ar |= 0x08; /* code segment */
4853 4854

	vmcs_write32(sf->ar_bytes, ar);
A
Avi Kivity 已提交
4855 4856
}

4857 4858
static int alloc_apic_access_page(struct kvm *kvm)
{
4859
	struct page *page;
4860 4861
	int r = 0;

4862
	mutex_lock(&kvm->slots_lock);
4863
	if (kvm->arch.apic_access_page_done)
4864
		goto out;
4865 4866
	r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
				    APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
4867 4868
	if (r)
		goto out;
4869

4870
	page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
4871 4872 4873 4874 4875
	if (is_error_page(page)) {
		r = -EFAULT;
		goto out;
	}

4876 4877 4878 4879 4880 4881
	/*
	 * Do not pin the page in memory, so that memory hot-unplug
	 * is able to migrate it.
	 */
	put_page(page);
	kvm->arch.apic_access_page_done = true;
4882
out:
4883
	mutex_unlock(&kvm->slots_lock);
4884 4885 4886
	return r;
}

4887
static int allocate_vpid(void)
4888 4889 4890
{
	int vpid;

4891
	if (!enable_vpid)
4892
		return 0;
4893 4894
	spin_lock(&vmx_vpid_lock);
	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
4895
	if (vpid < VMX_NR_VPIDS)
4896
		__set_bit(vpid, vmx_vpid_bitmap);
4897 4898
	else
		vpid = 0;
4899
	spin_unlock(&vmx_vpid_lock);
4900
	return vpid;
4901 4902
}

4903
static void free_vpid(int vpid)
4904
{
4905
	if (!enable_vpid || vpid == 0)
4906 4907
		return;
	spin_lock(&vmx_vpid_lock);
4908
	__clear_bit(vpid, vmx_vpid_bitmap);
4909 4910 4911
	spin_unlock(&vmx_vpid_lock);
}

4912 4913 4914 4915
#define MSR_TYPE_R	1
#define MSR_TYPE_W	2
static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
						u32 msr, int type)
{
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return;

	/*
	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
	 * have the write-low and read-high bitmap offsets the wrong way round.
	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
	 */
	if (msr <= 0x1fff) {
		if (type & MSR_TYPE_R)
			/* read-low */
			__clear_bit(msr, msr_bitmap + 0x000 / f);

		if (type & MSR_TYPE_W)
			/* write-low */
			__clear_bit(msr, msr_bitmap + 0x800 / f);

	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		if (type & MSR_TYPE_R)
			/* read-high */
			__clear_bit(msr, msr_bitmap + 0x400 / f);

		if (type & MSR_TYPE_W)
			/* write-high */
			__clear_bit(msr, msr_bitmap + 0xc00 / f);

	}
}
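
/*
 * Illustrative sketch (not part of the original source): where a given MSR
 * lands in the 4K bitmap cleared above.  Each quarter of the page covers
 * 0x2000 MSRs, one bit per MSR:
 *
 *   0x000  read  bitmap for MSRs 0x00000000 - 0x00001fff
 *   0x400  read  bitmap for MSRs 0xc0000000 - 0xc0001fff
 *   0x800  write bitmap for MSRs 0x00000000 - 0x00001fff
 *   0xc00  write bitmap for MSRs 0xc0000000 - 0xc0001fff
 *
 * e.g. MSR_STAR (0xc0000081): index 0x81 in the high range, so its read
 * intercept is bit 1 of byte 0x400 + 0x81 / 8 = 0x410.
 */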

/*
 * If a msr is allowed by L0, we should check whether it is allowed by L1.
 * The corresponding bit will be cleared unless both of L0 and L1 allow it.
 */
static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
					       unsigned long *msr_bitmap_nested,
					       u32 msr, int type)
{
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap()) {
		WARN_ON(1);
		return;
	}

	/*
	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
	 * have the write-low and read-high bitmap offsets the wrong way round.
	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
	 */
	if (msr <= 0x1fff) {
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
			/* read-low */
			__clear_bit(msr, msr_bitmap_nested + 0x000 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
			/* write-low */
			__clear_bit(msr, msr_bitmap_nested + 0x800 / f);

	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
			/* read-high */
			__clear_bit(msr, msr_bitmap_nested + 0x400 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
			/* write-high */
			__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);

	}
}

static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
{
	if (!longmode_only)
		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
						msr, MSR_TYPE_R | MSR_TYPE_W);
	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
						msr, MSR_TYPE_R | MSR_TYPE_W);
}

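/*
 * x2APIC MSR intercepts are tracked in separate bitmap pairs (legacy and
 * long mode) for the APICv-active and APICv-inactive cases; clear the
 * intercept in whichever pair matches the requested state.
 */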
static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active)
{
	if (apicv_active) {
		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
				msr, type);
		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
				msr, type);
	} else {
		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
				msr, type);
		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
				msr, type);
	}
}

static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
{
	return enable_apicv;
}

static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	gfn_t gfn;

	/*
	 * Don't need to mark the APIC access page dirty; it is never
	 * written to by the CPU during APIC virtualization.
	 */

	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
		gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
		kvm_vcpu_mark_page_dirty(vcpu, gfn);
	}

	if (nested_cpu_has_posted_intr(vmcs12)) {
		gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
		kvm_vcpu_mark_page_dirty(vcpu, gfn);
	}
}


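/*
 * Complete a posted interrupt that L1 targeted at L2 but that the CPU did
 * not deliver in hardware: consume pi_pending and the ON bit, merge the PIR
 * into the vIRR of L1's virtual-APIC page and raise RVI so the interrupt is
 * recognized at the next VM entry.
 */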
static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int max_irr;
	void *vapic_page;
	u16 status;

	if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
		return;

	vmx->nested.pi_pending = false;
	if (!pi_test_and_clear_on(vmx->nested.pi_desc))
		return;

	max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
	if (max_irr != 256) {
		vapic_page = kmap(vmx->nested.virtual_apic_page);
		__kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
		kunmap(vmx->nested.virtual_apic_page);

		status = vmcs_read16(GUEST_INTR_STATUS);
		if ((u8)max_irr > ((u8)status & 0xff)) {
			status &= ~0xff;
			status |= (u8)max_irr;
			vmcs_write16(GUEST_INTR_STATUS, status);
		}
	}

	nested_mark_vmcs12_pages_dirty(vcpu);
}

static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
						     bool nested)
{
#ifdef CONFIG_SMP
	int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;

	if (vcpu->mode == IN_GUEST_MODE) {
		/*
		 * The vector of interrupt to be delivered to vcpu had
		 * been set in PIR before this function.
		 *
		 * Following cases will be reached in this block, and
		 * we always send a notification event in all cases as
		 * explained below.
		 *
		 * Case 1: vcpu keeps in non-root mode. Sending a
		 * notification event posts the interrupt to vcpu.
		 *
		 * Case 2: vcpu exits to root mode and is still
		 * runnable. PIR will be synced to vIRR before the
		 * next vcpu entry. Sending a notification event in
		 * this case has no effect, as vcpu is not in root
		 * mode.
		 *
		 * Case 3: vcpu exits to root mode and is blocked.
		 * vcpu_block() has already synced PIR to vIRR and
		 * never blocks vcpu if vIRR is not cleared. Therefore,
		 * a blocked vcpu here does not wait for any requested
		 * interrupts in PIR, and sending a notification event
		 * which has no effect is safe here.
		 */

		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
		return true;
	}
#endif
	return false;
}

static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
						int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (is_guest_mode(vcpu) &&
	    vector == vmx->nested.posted_intr_nv) {
		/* the PIR and ON have been set by L1. */
		kvm_vcpu_trigger_posted_interrupt(vcpu, true);
		/*
		 * If a posted intr is not recognized by hardware,
		 * we will accomplish it in the next vmentry.
		 */
		vmx->nested.pi_pending = true;
		kvm_make_request(KVM_REQ_EVENT, vcpu);
		return 0;
	}
	return -1;
}
/*
 * Send interrupt to vcpu via posted interrupt way.
 * 1. If target vcpu is running(non-root mode), send posted interrupt
 * notification to vcpu and hardware will sync PIR to vIRR atomically.
 * 2. If target vcpu isn't running(root mode), kick it to pick up the
 * interrupt from PIR in next vmentry.
 */
static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int r;

	r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
	if (!r)
		return;

	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
		return;

	/* If a previous notification has sent the IPI, nothing to do.  */
	if (pi_test_and_set_on(&vmx->pi_desc))
		return;

	if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
		kvm_vcpu_kick(vcpu);
}

/*
 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
 * will not change in the lifetime of the guest.
 * Note that host-state that does change is set elsewhere. E.g., host-state
 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
 */
static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
{
	u32 low32, high32;
	unsigned long tmpl;
	struct desc_ptr dt;
	unsigned long cr0, cr3, cr4;

	cr0 = read_cr0();
	WARN_ON(cr0 & X86_CR0_TS);
	vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */

	/*
	 * Save the most likely value for this task's CR3 in the VMCS.
	 * We can't use __get_current_cr3_fast() because we're not atomic.
	 */
	cr3 = __read_cr3();
	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
	vmx->loaded_vmcs->vmcs_host_cr3 = cr3;

	/* Save the most likely value for this task's CR4 in the VMCS. */
	cr4 = cr4_read_shadow();
	vmcs_writel(HOST_CR4, cr4);			/* 22.2.3, 22.2.5 */
	vmx->loaded_vmcs->vmcs_host_cr4 = cr4;

	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
#ifdef CONFIG_X86_64
	/*
	 * Load null selectors, so we can avoid reloading them in
	 * __vmx_load_host_state(), in case userspace uses the null selectors
	 * too (the expected case).
	 */
	vmcs_write16(HOST_DS_SELECTOR, 0);
	vmcs_write16(HOST_ES_SELECTOR, 0);
#else
	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
#endif
	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */

	store_idt(&dt);
	vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
	vmx->host_idt_base = dt.address;

	vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */

	rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
	vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
	rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
	vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */

	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
		rdmsr(MSR_IA32_CR_PAT, low32, high32);
		vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
	}
}

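/*
 * CR4 bits in cr4_guest_owned_bits are owned by the guest and never cause
 * VM exits; all other bits are shadowed.  When running a nested guest, any
 * bit that L1 wants to intercept is removed from the guest-owned set so the
 * corresponding guest writes still trap.
 */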
static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
{
	vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
	if (enable_ept)
		vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
	if (is_guest_mode(&vmx->vcpu))
		vmx->vcpu.arch.cr4_guest_owned_bits &=
			~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
}

static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
{
	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;

	if (!kvm_vcpu_apicv_active(&vmx->vcpu))
		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;

	if (!enable_vnmi)
		pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;

	/* Enable the preemption timer dynamically */
	pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
	return pin_based_exec_ctrl;
}

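/*
 * Re-sync the APICv-related execution controls and the MSR bitmap after the
 * vCPU's APICv activation state has changed.
 */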
static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
	if (cpu_has_secondary_exec_ctrls()) {
		if (kvm_vcpu_apicv_active(vcpu))
			vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
				      SECONDARY_EXEC_APIC_REGISTER_VIRT |
				      SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
		else
			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
					SECONDARY_EXEC_APIC_REGISTER_VIRT |
					SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
	}

	if (cpu_has_vmx_msr_bitmap())
		vmx_set_msr_bitmap(vcpu);
}

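/*
 * Primary processor-based controls: drop MOV-DR exiting while the guest owns
 * the debug registers, fall back to CR8 load/store exiting (on 64-bit) when
 * no TPR shadow is in use, and add CR3/INVLPG exiting when EPT is disabled.
 */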
static u32 vmx_exec_control(struct vcpu_vmx *vmx)
{
	u32 exec_control = vmcs_config.cpu_based_exec_ctrl;

	if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
		exec_control &= ~CPU_BASED_MOV_DR_EXITING;

	if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
		exec_control &= ~CPU_BASED_TPR_SHADOW;
#ifdef CONFIG_X86_64
		exec_control |= CPU_BASED_CR8_STORE_EXITING |
				CPU_BASED_CR8_LOAD_EXITING;
#endif
	}
	if (!enable_ept)
		exec_control |= CPU_BASED_CR3_STORE_EXITING |
				CPU_BASED_CR3_LOAD_EXITING  |
				CPU_BASED_INVLPG_EXITING;
	return exec_control;
}

static bool vmx_rdrand_supported(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_RDRAND_EXITING;
}

static bool vmx_rdseed_supported(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_RDSEED_EXITING;
}

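/*
 * Compute this vCPU's secondary execution controls from the global
 * capabilities, dropping features the configuration or guest CPUID does not
 * allow, and mirror the XSAVES/RDTSCP/INVPCID/RDRAND/RDSEED decisions into
 * the nested VMX capability MSRs.
 */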
static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
{
	struct kvm_vcpu *vcpu = &vmx->vcpu;

	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
	if (!cpu_need_virtualize_apic_accesses(vcpu))
		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
	if (vmx->vpid == 0)
		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
	if (!enable_ept) {
		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
		enable_unrestricted_guest = 0;
		/* Enable INVPCID for non-ept guests may cause performance regression. */
		exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
	}
	if (!enable_unrestricted_guest)
		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
	if (!ple_gap)
		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
	if (!kvm_vcpu_apicv_active(vcpu))
		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
	/* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
	   (handle_vmptrld).
	   We can NOT enable shadow_vmcs here because we don't have yet
	   a current VMCS12
	*/
	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;

	if (!enable_pml)
		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

	if (vmx_xsaves_supported()) {
		/* Exposing XSAVES only when XSAVE is exposed */
		bool xsaves_enabled =
			guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
			guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);

		if (!xsaves_enabled)
			exec_control &= ~SECONDARY_EXEC_XSAVES;

		if (nested) {
			if (xsaves_enabled)
				vmx->nested.nested_vmx_secondary_ctls_high |=
					SECONDARY_EXEC_XSAVES;
			else
				vmx->nested.nested_vmx_secondary_ctls_high &=
					~SECONDARY_EXEC_XSAVES;
		}
	}

	if (vmx_rdtscp_supported()) {
		bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
		if (!rdtscp_enabled)
			exec_control &= ~SECONDARY_EXEC_RDTSCP;

		if (nested) {
			if (rdtscp_enabled)
				vmx->nested.nested_vmx_secondary_ctls_high |=
					SECONDARY_EXEC_RDTSCP;
			else
				vmx->nested.nested_vmx_secondary_ctls_high &=
					~SECONDARY_EXEC_RDTSCP;
		}
	}

	if (vmx_invpcid_supported()) {
		/* Exposing INVPCID only when PCID is exposed */
		bool invpcid_enabled =
			guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
			guest_cpuid_has(vcpu, X86_FEATURE_PCID);

		if (!invpcid_enabled) {
			exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
			guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
		}

		if (nested) {
			if (invpcid_enabled)
				vmx->nested.nested_vmx_secondary_ctls_high |=
					SECONDARY_EXEC_ENABLE_INVPCID;
			else
				vmx->nested.nested_vmx_secondary_ctls_high &=
					~SECONDARY_EXEC_ENABLE_INVPCID;
		}
	}

	if (vmx_rdrand_supported()) {
		bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
		if (rdrand_enabled)
			exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;

		if (nested) {
			if (rdrand_enabled)
				vmx->nested.nested_vmx_secondary_ctls_high |=
					SECONDARY_EXEC_RDRAND_EXITING;
			else
				vmx->nested.nested_vmx_secondary_ctls_high &=
					~SECONDARY_EXEC_RDRAND_EXITING;
		}
	}

	if (vmx_rdseed_supported()) {
		bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
		if (rdseed_enabled)
			exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;

		if (nested) {
			if (rdseed_enabled)
				vmx->nested.nested_vmx_secondary_ctls_high |=
					SECONDARY_EXEC_RDSEED_EXITING;
			else
				vmx->nested.nested_vmx_secondary_ctls_high &=
					~SECONDARY_EXEC_RDSEED_EXITING;
		}
	}

	vmx->secondary_exec_control = exec_control;
}

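/*
 * MMIO sptes are tagged with this misconfigured write/execute pattern so
 * that guest accesses to emulated MMIO exit with EXIT_REASON_EPT_MISCONFIG,
 * which is used as a fast path for MMIO emulation.
 */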
static void ept_set_mmio_spte_mask(void)
{
	/*
	 * EPT Misconfigurations can be generated if the value of bits 2:0
	 * of an EPT paging-structure entry is 110b (write/execute).
	 */
5431 5432
	kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
				   VMX_EPT_MISCONFIG_WX_VALUE);
5433 5434
}

5435
#define VMX_XSS_EXIT_BITMAP 0
A
Avi Kivity 已提交
5436 5437 5438
/*
 * Sets up the vmcs for emulated real mode.
 */
5439
static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
A
Avi Kivity 已提交
5440
{
5441
#ifdef CONFIG_X86_64
A
Avi Kivity 已提交
5442
	unsigned long a;
5443
#endif
A
Avi Kivity 已提交
5444 5445 5446
	int i;

	/* I/O */
5447 5448
	vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
	vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
A
Avi Kivity 已提交
5449

5450 5451 5452 5453
	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
S
Sheng Yang 已提交
5454
	if (cpu_has_vmx_msr_bitmap())
5455
		vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
S
Sheng Yang 已提交
5456

A
Avi Kivity 已提交
5457 5458 5459
	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */

	/* Control */
5460
	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
5461
	vmx->hv_deadline_tsc = -1;
5462

5463
	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
A
Avi Kivity 已提交
5464

5465
	if (cpu_has_secondary_exec_ctrls()) {
5466
		vmx_compute_secondary_exec_control(vmx);
5467
		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
5468
			     vmx->secondary_exec_control);
5469
	}
5470

5471
	if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
5472 5473 5474 5475 5476 5477
		vmcs_write64(EOI_EXIT_BITMAP0, 0);
		vmcs_write64(EOI_EXIT_BITMAP1, 0);
		vmcs_write64(EOI_EXIT_BITMAP2, 0);
		vmcs_write64(EOI_EXIT_BITMAP3, 0);

		vmcs_write16(GUEST_INTR_STATUS, 0);
5478

5479
		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
5480
		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
5481 5482
	}

5483 5484
	if (ple_gap) {
		vmcs_write32(PLE_GAP, ple_gap);
5485 5486
		vmx->ple_window = ple_window;
		vmx->ple_window_dirty = true;
5487 5488
	}

5489 5490
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
A
Avi Kivity 已提交
5491 5492
	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */

5493 5494
	vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
	vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
5495
	vmx_set_constant_host_state(vmx);
5496
#ifdef CONFIG_X86_64
A
Avi Kivity 已提交
5497 5498 5499 5500 5501 5502 5503 5504 5505
	rdmsrl(MSR_FS_BASE, a);
	vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
	rdmsrl(MSR_GS_BASE, a);
	vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
#else
	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
#endif

B
Bandan Das 已提交
5506 5507 5508
	if (cpu_has_vmx_vmfunc())
		vmcs_write64(VM_FUNCTION_CONTROL, 0);

5509 5510
	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
5511
	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
5512
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
5513
	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
A
Avi Kivity 已提交
5514

5515 5516
	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
S
Sheng Yang 已提交
5517

5518
	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
A
Avi Kivity 已提交
5519 5520
		u32 index = vmx_msr_index[i];
		u32 data_low, data_high;
5521
		int j = vmx->nmsrs;
A
Avi Kivity 已提交
5522 5523 5524

		if (rdmsr_safe(index, &data_low, &data_high) < 0)
			continue;
5525 5526
		if (wrmsr_safe(index, data_low, data_high) < 0)
			continue;
5527 5528
		vmx->guest_msrs[j].index = i;
		vmx->guest_msrs[j].data = 0;
5529
		vmx->guest_msrs[j].mask = -1ull;
5530
		++vmx->nmsrs;
A
Avi Kivity 已提交
5531 5532
	}

5533 5534

	vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
A
Avi Kivity 已提交
5535 5536

	/* 22.2.1, 20.8.1 */
5537
	vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
5538

5539 5540 5541
	vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);

5542
	set_cr4_guest_host_mask(vmx);
5543

5544 5545 5546
	if (vmx_xsaves_supported())
		vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);

5547 5548 5549 5550 5551
	if (enable_pml) {
		ASSERT(vmx->pml_pg);
		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
	}
5552 5553
}

5554
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
5555 5556
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
5557
	struct msr_data apic_base_msr;
5558
	u64 cr0;
5559

5560
	vmx->rmode.vm86_active = 0;
5561

5562
	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
5563 5564 5565 5566 5567 5568 5569 5570 5571 5572
	kvm_set_cr8(vcpu, 0);

	if (!init_event) {
		apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
				     MSR_IA32_APICBASE_ENABLE;
		if (kvm_vcpu_is_reset_bsp(vcpu))
			apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
		apic_base_msr.host_initiated = true;
		kvm_set_apic_base(vcpu, &apic_base_msr);
	}
5573

A
Avi Kivity 已提交
5574 5575
	vmx_segment_cache_clear(vmx);

5576
	seg_setup(VCPU_SREG_CS);
5577
	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
5578
	vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595

	seg_setup(VCPU_SREG_DS);
	seg_setup(VCPU_SREG_ES);
	seg_setup(VCPU_SREG_FS);
	seg_setup(VCPU_SREG_GS);
	seg_setup(VCPU_SREG_SS);

	vmcs_write16(GUEST_TR_SELECTOR, 0);
	vmcs_writel(GUEST_TR_BASE, 0);
	vmcs_write32(GUEST_TR_LIMIT, 0xffff);
	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

	vmcs_write16(GUEST_LDTR_SELECTOR, 0);
	vmcs_writel(GUEST_LDTR_BASE, 0);
	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);

5596 5597 5598 5599 5600 5601
	if (!init_event) {
		vmcs_write32(GUEST_SYSENTER_CS, 0);
		vmcs_writel(GUEST_SYSENTER_ESP, 0);
		vmcs_writel(GUEST_SYSENTER_EIP, 0);
		vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
	}
5602 5603

	vmcs_writel(GUEST_RFLAGS, 0x02);
5604
	kvm_rip_write(vcpu, 0xfff0);
5605 5606 5607 5608 5609 5610 5611

	vmcs_writel(GUEST_GDTR_BASE, 0);
	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);

	vmcs_writel(GUEST_IDTR_BASE, 0);
	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);

5612
	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
5613
	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
5614
	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
5615 5616
	if (kvm_mpx_supported())
		vmcs_write64(GUEST_BNDCFGS, 0);
5617 5618 5619

	setup_msrs(vmx);

A
Avi Kivity 已提交
5620 5621
	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */

5622
	if (cpu_has_vmx_tpr_shadow() && !init_event) {
5623
		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
5624
		if (cpu_need_tpr_shadow(vcpu))
5625
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
5626
				     __pa(vcpu->arch.apic->regs));
5627 5628 5629
		vmcs_write32(TPR_THRESHOLD, 0);
	}

5630
	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
A
Avi Kivity 已提交
5631

5632 5633 5634
	if (vmx->vpid != 0)
		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);

5635 5636
	cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
	vmx->vcpu.arch.cr0 = cr0;
5637
	vmx_set_cr0(vcpu, cr0); /* enter rmode */
5638
	vmx_set_cr4(vcpu, 0);
P
Paolo Bonzini 已提交
5639
	vmx_set_efer(vcpu, 0);
5640

5641
	update_exception_bitmap(vcpu);
A
Avi Kivity 已提交
5642

5643
	vpid_sync_context(vmx->vpid);
A
Avi Kivity 已提交
5644 5645
}

5646 5647 5648 5649 5650 5651 5652 5653 5654 5655
/*
 * In nested virtualization, check if L1 asked to exit on external interrupts.
 * For most existing hypervisors, this will always return true.
 */
static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->pin_based_vm_exec_control &
		PIN_BASED_EXT_INTR_MASK;
}

5656 5657 5658 5659 5660 5661 5662 5663 5664 5665
/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

5666 5667 5668 5669 5670 5671
static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->pin_based_vm_exec_control &
		PIN_BASED_NMI_EXITING;
}

5672
static void enable_irq_window(struct kvm_vcpu *vcpu)
5673
{
5674 5675
	vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
		      CPU_BASED_VIRTUAL_INTR_PENDING);
5676 5677
}

5678
static void enable_nmi_window(struct kvm_vcpu *vcpu)
5679
{
5680
	if (!enable_vnmi ||
5681
	    vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
5682 5683 5684
		enable_irq_window(vcpu);
		return;
	}
5685

5686 5687
	vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
		      CPU_BASED_VIRTUAL_NMI_PENDING);
5688 5689
}

5690
static void vmx_inject_irq(struct kvm_vcpu *vcpu)
5691
{
5692
	struct vcpu_vmx *vmx = to_vmx(vcpu);
5693 5694
	uint32_t intr;
	int irq = vcpu->arch.interrupt.nr;
5695

5696
	trace_kvm_inj_virq(irq);
F
Feng (Eric) Liu 已提交
5697

5698
	++vcpu->stat.irq_injections;
5699
	if (vmx->rmode.vm86_active) {
5700 5701 5702 5703
		int inc_eip = 0;
		if (vcpu->arch.interrupt.soft)
			inc_eip = vcpu->arch.event_exit_inst_len;
		if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
5704
			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5705 5706
		return;
	}
5707 5708 5709 5710 5711 5712 5713 5714
	intr = irq | INTR_INFO_VALID_MASK;
	if (vcpu->arch.interrupt.soft) {
		intr |= INTR_TYPE_SOFT_INTR;
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmx->vcpu.arch.event_exit_inst_len);
	} else
		intr |= INTR_TYPE_EXT_INTR;
	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
5715 5716
}

5717 5718
static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
J
Jan Kiszka 已提交
5719 5720
	struct vcpu_vmx *vmx = to_vmx(vcpu);

5721
	if (!enable_vnmi) {
5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733
		/*
		 * Tracking the NMI-blocked state in software is built upon
		 * finding the next open IRQ window. This, in turn, depends on
		 * well-behaving guests: They have to keep IRQs disabled at
		 * least as long as the NMI handler runs. Otherwise we may
		 * cause NMI nesting, maybe breaking the guest. But as this is
		 * highly unlikely, we can live with the residual risk.
		 */
		vmx->loaded_vmcs->soft_vnmi_blocked = 1;
		vmx->loaded_vmcs->vnmi_blocked_time = 0;
	}

5734 5735
	++vcpu->stat.nmi_injections;
	vmx->loaded_vmcs->nmi_known_unmasked = false;
5736

5737
	if (vmx->rmode.vm86_active) {
5738
		if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
5739
			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
J
Jan Kiszka 已提交
5740 5741
		return;
	}
5742

5743 5744 5745 5746
	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
}

J
Jan Kiszka 已提交
5747 5748
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
{
5749 5750 5751
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool masked;

5752
	if (!enable_vnmi)
5753
		return vmx->loaded_vmcs->soft_vnmi_blocked;
5754
	if (vmx->loaded_vmcs->nmi_known_unmasked)
5755
		return false;
5756 5757 5758
	masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
	vmx->loaded_vmcs->nmi_known_unmasked = !masked;
	return masked;
J
Jan Kiszka 已提交
5759 5760 5761 5762 5763 5764
}

static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

5765
	if (!enable_vnmi) {
5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778
		if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
			vmx->loaded_vmcs->soft_vnmi_blocked = masked;
			vmx->loaded_vmcs->vnmi_blocked_time = 0;
		}
	} else {
		vmx->loaded_vmcs->nmi_known_unmasked = !masked;
		if (masked)
			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
				      GUEST_INTR_STATE_NMI);
		else
			vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
					GUEST_INTR_STATE_NMI);
	}
J
Jan Kiszka 已提交
5779 5780
}

5781 5782
static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
{
5783 5784
	if (to_vmx(vcpu)->nested.nested_run_pending)
		return 0;
5785

5786
	if (!enable_vnmi &&
5787 5788 5789
	    to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
		return 0;

5790 5791 5792 5793 5794
	return	!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
		  (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
		   | GUEST_INTR_STATE_NMI));
}

5795 5796
static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
5797 5798
	return (!to_vmx(vcpu)->nested.nested_run_pending &&
		vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
5799 5800
		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
5801 5802
}

5803 5804 5805 5806
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
	int ret;

5807 5808
	ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
				    PAGE_SIZE * 3);
5809 5810
	if (ret)
		return ret;
5811
	kvm->arch.tss_addr = addr;
5812
	return init_rmode_tss(kvm);
5813 5814
}

5815
static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
A
Avi Kivity 已提交
5816
{
5817 5818
	switch (vec) {
	case BP_VECTOR:
5819 5820 5821 5822 5823 5824
		/*
		 * Update instruction length as we may reinject the exception
		 * from user space while in guest debugging mode.
		 */
		to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
J
Jan Kiszka 已提交
5825
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5826 5827 5828 5829 5830 5831
			return false;
		/* fall through */
	case DB_VECTOR:
		if (vcpu->guest_debug &
			(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
			return false;
J
Jan Kiszka 已提交
5832 5833
		/* fall through */
	case DE_VECTOR:
5834 5835 5836 5837 5838 5839 5840
	case OF_VECTOR:
	case BR_VECTOR:
	case UD_VECTOR:
	case DF_VECTOR:
	case SS_VECTOR:
	case GP_VECTOR:
	case MF_VECTOR:
5841 5842
		return true;
	break;
5843
	}
5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857
	return false;
}

static int handle_rmode_exception(struct kvm_vcpu *vcpu,
				  int vec, u32 err_code)
{
	/*
	 * Instruction with address size override prefix opcode 0x67
	 * Cause the #SS fault with 0 error code in VM86 mode.
	 */
	if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
		if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
			if (vcpu->arch.halt_request) {
				vcpu->arch.halt_request = 0;
5858
				return kvm_vcpu_halt(vcpu);
5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871
			}
			return 1;
		}
		return 0;
	}

	/*
	 * Forward all other exceptions that are valid in real mode.
	 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
	 *        the required debugging infrastructure rework.
	 */
	kvm_queue_exception(vcpu, vec);
	return 1;
A
Avi Kivity 已提交
5872 5873
}

A
Andi Kleen 已提交
5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892
/*
 * Trigger machine check on the host. We assume all the MSRs are already set up
 * by the CPU and that we still run on the same CPU as the MCE occurred on.
 * We pass a fake environment to the machine check handler because we want
 * the guest to be always treated like user space, no matter what context
 * it used internally.
 */
static void kvm_machine_check(void)
{
#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
	struct pt_regs regs = {
		.cs = 3, /* Fake ring 3 no matter what the guest ran on */
		.flags = X86_EFLAGS_IF,
	};

	do_machine_check(&regs, 0);
#endif
}

A
Avi Kivity 已提交
5893
static int handle_machine_check(struct kvm_vcpu *vcpu)
A
Andi Kleen 已提交
5894 5895 5896 5897 5898
{
	/* already handled by vcpu_run */
	return 1;
}

A
Avi Kivity 已提交
5899
static int handle_exception(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
5900
{
5901
	struct vcpu_vmx *vmx = to_vmx(vcpu);
A
Avi Kivity 已提交
5902
	struct kvm_run *kvm_run = vcpu->run;
J
Jan Kiszka 已提交
5903
	u32 intr_info, ex_no, error_code;
5904
	unsigned long cr2, rip, dr6;
A
Avi Kivity 已提交
5905 5906 5907
	u32 vect_info;
	enum emulation_result er;

5908
	vect_info = vmx->idt_vectoring_info;
5909
	intr_info = vmx->exit_intr_info;
A
Avi Kivity 已提交
5910

A
Andi Kleen 已提交
5911
	if (is_machine_check(intr_info))
A
Avi Kivity 已提交
5912
		return handle_machine_check(vcpu);
A
Andi Kleen 已提交
5913

5914
	if (is_nmi(intr_info))
5915
		return 1;  /* already handled by vmx_vcpu_run() */
5916

5917
	if (is_invalid_opcode(intr_info)) {
5918 5919 5920 5921
		if (is_guest_mode(vcpu)) {
			kvm_queue_exception(vcpu, UD_VECTOR);
			return 1;
		}
5922
		er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
5923
		if (er != EMULATE_DONE)
5924
			kvm_queue_exception(vcpu, UD_VECTOR);
5925 5926 5927
		return 1;
	}

A
Avi Kivity 已提交
5928
	error_code = 0;
5929
	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
A
Avi Kivity 已提交
5930
		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
5931 5932 5933 5934 5935 5936 5937 5938 5939 5940

	/*
	 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
	 * MMIO, it is better to report an internal error.
	 * See the comments in vmx_handle_exit.
	 */
	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
	    !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
5941
		vcpu->run->internal.ndata = 3;
5942 5943
		vcpu->run->internal.data[0] = vect_info;
		vcpu->run->internal.data[1] = intr_info;
5944
		vcpu->run->internal.data[2] = error_code;
5945 5946 5947
		return 0;
	}

A
Avi Kivity 已提交
5948 5949
	if (is_page_fault(intr_info)) {
		cr2 = vmcs_readl(EXIT_QUALIFICATION);
5950 5951
		/* EPT won't cause page fault directly */
		WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
5952
		return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
A
Avi Kivity 已提交
5953 5954
	}

J
Jan Kiszka 已提交
5955
	ex_no = intr_info & INTR_INFO_VECTOR_MASK;
5956 5957 5958 5959

	if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
		return handle_rmode_exception(vcpu, ex_no, error_code);

5960
	switch (ex_no) {
5961 5962 5963
	case AC_VECTOR:
		kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
		return 1;
5964 5965 5966 5967
	case DB_VECTOR:
		dr6 = vmcs_readl(EXIT_QUALIFICATION);
		if (!(vcpu->guest_debug &
		      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
5968
			vcpu->arch.dr6 &= ~15;
5969
			vcpu->arch.dr6 |= dr6 | DR6_RTM;
5970 5971 5972
			if (!(dr6 & ~DR6_RESERVED)) /* icebp */
				skip_emulated_instruction(vcpu);

5973 5974 5975 5976 5977 5978 5979
			kvm_queue_exception(vcpu, DB_VECTOR);
			return 1;
		}
		kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
		kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
		/* fall through */
	case BP_VECTOR:
5980 5981 5982 5983 5984 5985 5986
		/*
		 * Update instruction length as we may reinject #BP from
		 * user space while in guest debugging mode. Reading it for
		 * #DB as well causes no harm, it is not used in that case.
		 */
		vmx->vcpu.arch.event_exit_inst_len =
			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
A
Avi Kivity 已提交
5987
		kvm_run->exit_reason = KVM_EXIT_DEBUG;
5988
		rip = kvm_rip_read(vcpu);
J
Jan Kiszka 已提交
5989 5990
		kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
		kvm_run->debug.arch.exception = ex_no;
5991 5992
		break;
	default:
J
Jan Kiszka 已提交
5993 5994 5995
		kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
		kvm_run->ex.exception = ex_no;
		kvm_run->ex.error_code = error_code;
5996
		break;
A
Avi Kivity 已提交
5997 5998 5999 6000
	}
	return 0;
}

A
Avi Kivity 已提交
6001
static int handle_external_interrupt(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
6002
{
A
Avi Kivity 已提交
6003
	++vcpu->stat.irq_exits;
A
Avi Kivity 已提交
6004 6005 6006
	return 1;
}

A
Avi Kivity 已提交
6007
static int handle_triple_fault(struct kvm_vcpu *vcpu)
6008
{
A
Avi Kivity 已提交
6009
	vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
6010
	vcpu->mmio_needed = 0;
6011 6012
	return 0;
}
A
Avi Kivity 已提交
6013

A
Avi Kivity 已提交
6014
static int handle_io(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
6015
{
6016
	unsigned long exit_qualification;
6017
	int size, in, string, ret;
6018
	unsigned port;
A
Avi Kivity 已提交
6019

6020
	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6021
	string = (exit_qualification & 16) != 0;
6022
	in = (exit_qualification & 8) != 0;
6023

6024
	++vcpu->stat.io_exits;
6025

6026
	if (string || in)
6027
		return emulate_instruction(vcpu, 0) == EMULATE_DONE;
6028

6029 6030 6031
	port = exit_qualification >> 16;
	size = (exit_qualification & 7) + 1;

6032 6033 6034 6035 6036 6037 6038
	ret = kvm_skip_emulated_instruction(vcpu);

	/*
	 * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
	 * KVM_EXIT_DEBUG here.
	 */
	return kvm_fast_pio_out(vcpu, size, port) && ret;
A
Avi Kivity 已提交
6039 6040
}

I
Ingo Molnar 已提交
6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051
static void
vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	/*
	 * Patch in the VMCALL instruction:
	 */
	hypercall[0] = 0x0f;
	hypercall[1] = 0x01;
	hypercall[2] = 0xc1;
}

G
Guo Chao 已提交
6052
/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
6053 6054 6055
static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
{
	if (is_guest_mode(vcpu)) {
6056 6057 6058
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
		unsigned long orig_val = val;

6059 6060 6061
		/*
		 * We get here when L2 changed cr0 in a way that did not change
		 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
6062 6063 6064 6065
		 * but did change L0 shadowed bits. So we first calculate the
		 * effective cr0 value that L1 would like to write into the
		 * hardware. It consists of the L2-owned bits from the new
		 * value combined with the L1-owned bits from L1's guest_cr0.
6066
		 */
6067 6068 6069
		val = (val & ~vmcs12->cr0_guest_host_mask) |
			(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);

6070
		if (!nested_guest_cr0_valid(vcpu, val))
6071
			return 1;
6072 6073 6074 6075

		if (kvm_set_cr0(vcpu, val))
			return 1;
		vmcs_writel(CR0_READ_SHADOW, orig_val);
6076
		return 0;
6077 6078
	} else {
		if (to_vmx(vcpu)->nested.vmxon &&
6079
		    !nested_host_cr0_valid(vcpu, val))
6080
			return 1;
6081

6082
		return kvm_set_cr0(vcpu, val);
6083
	}
6084 6085 6086 6087 6088
}

static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
{
	if (is_guest_mode(vcpu)) {
6089 6090 6091 6092 6093 6094 6095
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
		unsigned long orig_val = val;

		/* analogously to handle_set_cr0 */
		val = (val & ~vmcs12->cr4_guest_host_mask) |
			(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
		if (kvm_set_cr4(vcpu, val))
6096
			return 1;
6097
		vmcs_writel(CR4_READ_SHADOW, orig_val);
6098 6099 6100 6101 6102
		return 0;
	} else
		return kvm_set_cr4(vcpu, val);
}

A
Avi Kivity 已提交
6103
static int handle_cr(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
6104
{
6105
	unsigned long exit_qualification, val;
A
Avi Kivity 已提交
6106 6107
	int cr;
	int reg;
6108
	int err;
6109
	int ret;
A
Avi Kivity 已提交
6110

6111
	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
A
Avi Kivity 已提交
6112 6113 6114 6115
	cr = exit_qualification & 15;
	reg = (exit_qualification >> 8) & 15;
	switch ((exit_qualification >> 4) & 3) {
	case 0: /* mov to cr */
6116
		val = kvm_register_readl(vcpu, reg);
6117
		trace_kvm_cr_write(cr, val);
A
Avi Kivity 已提交
6118 6119
		switch (cr) {
		case 0:
6120
			err = handle_set_cr0(vcpu, val);
6121
			return kvm_complete_insn_gp(vcpu, err);
A
Avi Kivity 已提交
6122
		case 3:
6123
			err = kvm_set_cr3(vcpu, val);
6124
			return kvm_complete_insn_gp(vcpu, err);
A
Avi Kivity 已提交
6125
		case 4:
6126
			err = handle_set_cr4(vcpu, val);
6127
			return kvm_complete_insn_gp(vcpu, err);
6128 6129
		case 8: {
				u8 cr8_prev = kvm_get_cr8(vcpu);
6130
				u8 cr8 = (u8)val;
A
Andre Przywara 已提交
6131
				err = kvm_set_cr8(vcpu, cr8);
6132
				ret = kvm_complete_insn_gp(vcpu, err);
6133
				if (lapic_in_kernel(vcpu))
6134
					return ret;
6135
				if (cr8_prev <= cr8)
6136 6137 6138 6139 6140 6141
					return ret;
				/*
				 * TODO: we might be squashing a
				 * KVM_GUESTDBG_SINGLESTEP-triggered
				 * KVM_EXIT_DEBUG here.
				 */
A
Avi Kivity 已提交
6142
				vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
6143 6144
				return 0;
			}
6145
		}
A
Avi Kivity 已提交
6146
		break;
6147
	case 2: /* clts */
6148 6149
		WARN_ONCE(1, "Guest should always own CR0.TS");
		vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
6150
		trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
6151
		return kvm_skip_emulated_instruction(vcpu);
A
Avi Kivity 已提交
6152 6153 6154
	case 1: /*mov from cr*/
		switch (cr) {
		case 3:
6155 6156 6157
			val = kvm_read_cr3(vcpu);
			kvm_register_write(vcpu, reg, val);
			trace_kvm_cr_read(cr, val);
6158
			return kvm_skip_emulated_instruction(vcpu);
A
Avi Kivity 已提交
6159
		case 8:
6160 6161 6162
			val = kvm_get_cr8(vcpu);
			kvm_register_write(vcpu, reg, val);
			trace_kvm_cr_read(cr, val);
6163
			return kvm_skip_emulated_instruction(vcpu);
A
Avi Kivity 已提交
6164 6165 6166
		}
		break;
	case 3: /* lmsw */
6167
		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
6168
		trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
6169
		kvm_lmsw(vcpu, val);
A
Avi Kivity 已提交
6170

6171
		return kvm_skip_emulated_instruction(vcpu);
A
Avi Kivity 已提交
6172 6173 6174
	default:
		break;
	}
A
Avi Kivity 已提交
6175
	vcpu->run->exit_reason = 0;
6176
	vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
A
Avi Kivity 已提交
6177 6178 6179 6180
	       (int)(exit_qualification >> 4) & 3, cr);
	return 0;
}

A
Avi Kivity 已提交
6181
static int handle_dr(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
6182
{
6183
	unsigned long exit_qualification;
6184 6185 6186 6187 6188 6189 6190 6191
	int dr, dr7, reg;

	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	dr = exit_qualification & DEBUG_REG_ACCESS_NUM;

	/* First, if DR does not exist, trigger UD */
	if (!kvm_require_dr(vcpu, dr))
		return 1;
A
Avi Kivity 已提交
6192

6193
	/* Do not handle if the CPL > 0, will trigger GP on re-entry */
6194 6195
	if (!kvm_require_cpl(vcpu, 0))
		return 1;
6196 6197
	dr7 = vmcs_readl(GUEST_DR7);
	if (dr7 & DR7_GD) {
6198 6199 6200 6201 6202 6203
		/*
		 * As the vm-exit takes precedence over the debug trap, we
		 * need to emulate the latter, either for the host or the
		 * guest debugging itself.
		 */
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
A
Avi Kivity 已提交
6204
			vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
6205
			vcpu->run->debug.arch.dr7 = dr7;
6206
			vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
A
Avi Kivity 已提交
6207 6208
			vcpu->run->debug.arch.exception = DB_VECTOR;
			vcpu->run->exit_reason = KVM_EXIT_DEBUG;
6209 6210
			return 0;
		} else {
6211
			vcpu->arch.dr6 &= ~15;
6212
			vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
6213 6214 6215 6216 6217
			kvm_queue_exception(vcpu, DB_VECTOR);
			return 1;
		}
	}

6218
	if (vcpu->guest_debug == 0) {
6219 6220
		vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
				CPU_BASED_MOV_DR_EXITING);
6221 6222 6223 6224 6225 6226 6227 6228 6229 6230

		/*
		 * No more DR vmexits; force a reload of the debug registers
		 * and reenter on this instruction.  The next vmexit will
		 * retrieve the full state of the debug registers.
		 */
		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
		return 1;
	}

6231 6232
	reg = DEBUG_REG_ACCESS_REG(exit_qualification);
	if (exit_qualification & TYPE_MOV_FROM_DR) {
6233
		unsigned long val;
6234 6235 6236 6237

		if (kvm_get_dr(vcpu, dr, &val))
			return 1;
		kvm_register_write(vcpu, reg, val);
6238
	} else
6239
		if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
6240 6241
			return 1;

6242
	return kvm_skip_emulated_instruction(vcpu);
A
Avi Kivity 已提交
6243 6244
}

J
Jan Kiszka 已提交
6245 6246 6247 6248 6249 6250 6251 6252 6253
static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.dr6;
}

static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
{
}

6254 6255 6256 6257 6258 6259 6260 6261 6262 6263
static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
	get_debugreg(vcpu->arch.db[0], 0);
	get_debugreg(vcpu->arch.db[1], 1);
	get_debugreg(vcpu->arch.db[2], 2);
	get_debugreg(vcpu->arch.db[3], 3);
	get_debugreg(vcpu->arch.dr6, 6);
	vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);

	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
6264
	vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
6265 6266
}

6267 6268 6269 6270 6271
static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
	vmcs_writel(GUEST_DR7, val);
}

A
Avi Kivity 已提交
6272
static int handle_cpuid(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
6273
{
6274
	return kvm_emulate_cpuid(vcpu);
A
Avi Kivity 已提交
6275 6276
}

A
Avi Kivity 已提交
6277
static int handle_rdmsr(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
6278
{
6279
	u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
6280
	struct msr_data msr_info;
A
Avi Kivity 已提交
6281

6282 6283 6284
	msr_info.index = ecx;
	msr_info.host_initiated = false;
	if (vmx_get_msr(vcpu, &msr_info)) {
6285
		trace_kvm_msr_read_ex(ecx);
6286
		kvm_inject_gp(vcpu, 0);
A
Avi Kivity 已提交
6287 6288 6289
		return 1;
	}

6290
	trace_kvm_msr_read(ecx, msr_info.data);
F
Feng (Eric) Liu 已提交
6291

A
Avi Kivity 已提交
6292
	/* FIXME: handling of bits 32:63 of rax, rdx */
6293 6294
	vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
	vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
6295
	return kvm_skip_emulated_instruction(vcpu);
A
Avi Kivity 已提交
6296 6297
}

A
Avi Kivity 已提交
6298
static int handle_wrmsr(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
6299
{
6300
	struct msr_data msr;
6301 6302 6303
	u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
	u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
		| ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
A
Avi Kivity 已提交
6304

6305 6306 6307
	msr.data = data;
	msr.index = ecx;
	msr.host_initiated = false;
6308
	if (kvm_set_msr(vcpu, &msr) != 0) {
6309
		trace_kvm_msr_write_ex(ecx, data);
6310
		kvm_inject_gp(vcpu, 0);
A
Avi Kivity 已提交
6311 6312 6313
		return 1;
	}

6314
	trace_kvm_msr_write(ecx, data);
6315
	return kvm_skip_emulated_instruction(vcpu);
A
Avi Kivity 已提交
6316 6317
}

A
Avi Kivity 已提交
6318
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
6319
{
6320
	kvm_apic_update_ppr(vcpu);
6321 6322 6323
	return 1;
}

A
Avi Kivity 已提交
6324
static int handle_interrupt_window(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
6325
{
6326 6327
	vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
			CPU_BASED_VIRTUAL_INTR_PENDING);
F
Feng (Eric) Liu 已提交
6328

6329 6330
	kvm_make_request(KVM_REQ_EVENT, vcpu);

6331
	++vcpu->stat.irq_window_exits;
A
Avi Kivity 已提交
6332 6333 6334
	return 1;
}

A
Avi Kivity 已提交
6335
static int handle_halt(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
6336
{
6337
	return kvm_emulate_halt(vcpu);
A
Avi Kivity 已提交
6338 6339
}

A
Avi Kivity 已提交
6340
static int handle_vmcall(struct kvm_vcpu *vcpu)
6341
{
6342
	return kvm_emulate_hypercall(vcpu);
6343 6344
}

6345 6346
static int handle_invd(struct kvm_vcpu *vcpu)
{
6347
	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
6348 6349
}

A
Avi Kivity 已提交
6350
static int handle_invlpg(struct kvm_vcpu *vcpu)
M
Marcelo Tosatti 已提交
6351
{
6352
	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
M
Marcelo Tosatti 已提交
6353 6354

	kvm_mmu_invlpg(vcpu, exit_qualification);
6355
	return kvm_skip_emulated_instruction(vcpu);
M
Marcelo Tosatti 已提交
6356 6357
}

A
Avi Kivity 已提交
6358 6359 6360 6361 6362
static int handle_rdpmc(struct kvm_vcpu *vcpu)
{
	int err;

	err = kvm_rdpmc(vcpu);
6363
	return kvm_complete_insn_gp(vcpu, err);
A
Avi Kivity 已提交
6364 6365
}

A
Avi Kivity 已提交
6366
static int handle_wbinvd(struct kvm_vcpu *vcpu)
E
Eddie Dong 已提交
6367
{
6368
	return kvm_emulate_wbinvd(vcpu);
E
Eddie Dong 已提交
6369 6370
}

6371 6372 6373 6374 6375 6376
static int handle_xsetbv(struct kvm_vcpu *vcpu)
{
	u64 new_bv = kvm_read_edx_eax(vcpu);
	u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);

	if (kvm_set_xcr(vcpu, index, new_bv) == 0)
6377
		return kvm_skip_emulated_instruction(vcpu);
6378 6379 6380
	return 1;
}

6381 6382
static int handle_xsaves(struct kvm_vcpu *vcpu)
{
6383
	kvm_skip_emulated_instruction(vcpu);
6384 6385 6386 6387 6388 6389
	WARN(1, "this should never happen\n");
	return 1;
}

static int handle_xrstors(struct kvm_vcpu *vcpu)
{
6390
	kvm_skip_emulated_instruction(vcpu);
6391 6392 6393 6394
	WARN(1, "this should never happen\n");
	return 1;
}

A
Avi Kivity 已提交
6395
static int handle_apic_access(struct kvm_vcpu *vcpu)
6396
{
6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410
	if (likely(fasteoi)) {
		unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
		int access_type, offset;

		access_type = exit_qualification & APIC_ACCESS_TYPE;
		offset = exit_qualification & APIC_ACCESS_OFFSET;
		/*
		 * Sane guest uses MOV to write EOI, with written value
		 * not cared. So make a short-circuit here by avoiding
		 * heavy instruction emulation.
		 */
		if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
		    (offset == APIC_EOI)) {
			kvm_lapic_set_eoi(vcpu);
6411
			return kvm_skip_emulated_instruction(vcpu);
6412 6413
		}
	}
6414
	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
6415 6416
}

6417 6418 6419 6420 6421 6422 6423 6424 6425 6426
static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	int vector = exit_qualification & 0xff;

	/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
	kvm_apic_set_eoi_accelerated(vcpu, vector);
	return 1;
}

6427 6428 6429 6430 6431 6432 6433 6434 6435 6436
static int handle_apic_write(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	u32 offset = exit_qualification & 0xfff;

	/* APIC-write VM exit is trap-like and thus no need to adjust IP */
	kvm_apic_write_nodecode(vcpu, offset);
	return 1;
}

A
Avi Kivity 已提交
6437
static int handle_task_switch(struct kvm_vcpu *vcpu)
6438
{
J
Jan Kiszka 已提交
6439
	struct vcpu_vmx *vmx = to_vmx(vcpu);
6440
	unsigned long exit_qualification;
6441 6442
	bool has_error_code = false;
	u32 error_code = 0;
6443
	u16 tss_selector;
6444
	int reason, type, idt_v, idt_index;
6445 6446

	idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
6447
	idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
6448
	type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
6449 6450 6451 6452

	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);

	reason = (u32)exit_qualification >> 30;
6453 6454 6455 6456
	if (reason == TASK_SWITCH_GATE && idt_v) {
		switch (type) {
		case INTR_TYPE_NMI_INTR:
			vcpu->arch.nmi_injected = false;
6457
			vmx_set_nmi_mask(vcpu, true);
6458 6459
			break;
		case INTR_TYPE_EXT_INTR:
6460
		case INTR_TYPE_SOFT_INTR:
6461 6462 6463
			kvm_clear_interrupt_queue(vcpu);
			break;
		case INTR_TYPE_HARD_EXCEPTION:
6464 6465 6466 6467 6468 6469 6470
			if (vmx->idt_vectoring_info &
			    VECTORING_INFO_DELIVER_CODE_MASK) {
				has_error_code = true;
				error_code =
					vmcs_read32(IDT_VECTORING_ERROR_CODE);
			}
			/* fall through */
6471 6472 6473 6474 6475 6476
		case INTR_TYPE_SOFT_EXCEPTION:
			kvm_clear_exception_queue(vcpu);
			break;
		default:
			break;
		}
J
Jan Kiszka 已提交
6477
	}
6478 6479
	tss_selector = exit_qualification;

6480 6481 6482 6483 6484
	if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
		       type != INTR_TYPE_EXT_INTR &&
		       type != INTR_TYPE_NMI_INTR))
		skip_emulated_instruction(vcpu);

6485 6486 6487
	if (kvm_task_switch(vcpu, tss_selector,
			    type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
			    has_error_code, error_code) == EMULATE_FAIL) {
6488 6489 6490
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
		vcpu->run->internal.ndata = 0;
6491
		return 0;
6492
	}
6493 6494 6495 6496 6497 6498 6499

	/*
	 * TODO: What about debug traps on tss switch?
	 *       Are we supposed to inject them and update dr6?
	 */

	return 1;
6500 6501
}

A
Avi Kivity 已提交
6502
static int handle_ept_violation(struct kvm_vcpu *vcpu)
6503
{
6504
	unsigned long exit_qualification;
6505
	gpa_t gpa;
6506
	u64 error_code;
6507

6508
	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6509

6510 6511 6512 6513 6514 6515
	/*
	 * EPT violation happened while executing iret from NMI,
	 * "blocked by NMI" bit has to be set before next VM entry.
	 * There are errata that may cause this bit to not be set:
	 * AAK134, BY25.
	 */
6516
	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
6517
			enable_vnmi &&
6518
			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
6519 6520
		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);

6521
	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6522
	trace_kvm_page_fault(gpa, exit_qualification);
6523

6524
	/* Is it a read fault? */
6525
	error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
6526 6527
		     ? PFERR_USER_MASK : 0;
	/* Is it a write fault? */
6528
	error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
6529 6530
		      ? PFERR_WRITE_MASK : 0;
	/* Is it a fetch fault? */
6531
	error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
6532 6533 6534 6535 6536 6537
		      ? PFERR_FETCH_MASK : 0;
	/* ept page table entry is present? */
	error_code |= (exit_qualification &
		       (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
			EPT_VIOLATION_EXECUTABLE))
		      ? PFERR_PRESENT_MASK : 0;
6538

6539 6540
	error_code |= (exit_qualification & 0x100) != 0 ?
	       PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
6541 6542

	vcpu->arch.exit_qualification = exit_qualification;
6543
	return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
6544 6545
}

A
Avi Kivity 已提交
6546
static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
6547
{
6548
	int ret;
6549 6550
	gpa_t gpa;

6551 6552 6553 6554
	/*
	 * A nested guest cannot optimize MMIO vmexits, because we have an
	 * nGPA here instead of the required GPA.
	 */
6555
	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6556 6557
	if (!is_guest_mode(vcpu) &&
	    !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
J
Jason Wang 已提交
6558
		trace_kvm_fast_mmio(gpa);
6559
		return kvm_skip_emulated_instruction(vcpu);
6560
	}
6561

P
Paolo Bonzini 已提交
6562 6563 6564
	ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
	if (ret >= 0)
		return ret;
6565 6566

	/* It is the real ept misconfig */
6567
	WARN_ON(1);
6568

A
Avi Kivity 已提交
6569 6570
	vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
	vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
6571 6572 6573 6574

	return 0;
}

A
Avi Kivity 已提交
6575
static int handle_nmi_window(struct kvm_vcpu *vcpu)
6576
{
6577
	WARN_ON_ONCE(!enable_vnmi);
6578 6579
	vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
			CPU_BASED_VIRTUAL_NMI_PENDING);
6580
	++vcpu->stat.nmi_window_exits;
6581
	kvm_make_request(KVM_REQ_EVENT, vcpu);
6582 6583 6584 6585

	return 1;
}

6586
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
6587
{
6588 6589
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	enum emulation_result err = EMULATE_DONE;
6590
	int ret = 1;
6591 6592
	u32 cpu_exec_ctrl;
	bool intr_window_requested;
6593
	unsigned count = 130;
6594 6595 6596

	cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
	intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
6597

6598
	while (vmx->emulation_required && count-- != 0) {
6599
		if (intr_window_requested && vmx_interrupt_allowed(vcpu))
6600 6601
			return handle_interrupt_window(&vmx->vcpu);

6602
		if (kvm_test_request(KVM_REQ_EVENT, vcpu))
6603 6604
			return 1;

6605
		err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
6606

P
Paolo Bonzini 已提交
6607
		if (err == EMULATE_USER_EXIT) {
6608
			++vcpu->stat.mmio_exits;
6609 6610 6611
			ret = 0;
			goto out;
		}
6612

6613 6614 6615 6616
		if (err != EMULATE_DONE) {
			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
			vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
			vcpu->run->internal.ndata = 0;
6617
			return 0;
6618
		}
6619

6620 6621
		if (vcpu->arch.halt_request) {
			vcpu->arch.halt_request = 0;
6622
			ret = kvm_vcpu_halt(vcpu);
6623 6624 6625
			goto out;
		}

6626
		if (signal_pending(current))
6627
			goto out;
6628 6629 6630 6631
		if (need_resched())
			schedule();
	}

6632 6633
out:
	return ret;
6634 6635
}

static int __grow_ple_window(int val)
{
	if (ple_window_grow < 1)
		return ple_window;

	val = min(val, ple_window_actual_max);

	if (ple_window_grow < ple_window)
		val *= ple_window_grow;
	else
		val += ple_window_grow;

	return val;
}

static int __shrink_ple_window(int val, int modifier, int minimum)
{
	if (modifier < 1)
		return ple_window;

	if (modifier < ple_window)
		val /= modifier;
	else
		val -= modifier;

	return max(val, minimum);
}

static void grow_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int old = vmx->ple_window;

	vmx->ple_window = __grow_ple_window(old);

	if (vmx->ple_window != old)
		vmx->ple_window_dirty = true;

	trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
}

static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int old = vmx->ple_window;

	vmx->ple_window = __shrink_ple_window(old,
	                                      ple_window_shrink, ple_window);

	if (vmx->ple_window != old)
		vmx->ple_window_dirty = true;

	trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
}

/*
 * ple_window_actual_max is computed to be one grow_ple_window() below
 * ple_window_max. (See __grow_ple_window for the reason.)
 * This prevents overflows, because ple_window_max is int.
 * ple_window_max effectively rounded down to a multiple of ple_window_grow in
 * this process.
 * ple_window_max is also prevented from setting vmx->ple_window < ple_window.
 */
static void update_ple_window_actual_max(void)
{
	ple_window_actual_max =
			__shrink_ple_window(max(ple_window_max, ple_window),
			                    ple_window_grow, INT_MIN);
}

/*
 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
 */
static void wakeup_handler(void)
{
	struct kvm_vcpu *vcpu;
	int cpu = smp_processor_id();

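	/*
	 * vCPUs that block with posted interrupts enabled park themselves on
	 * this CPU's blocked_vcpu_on_cpu list; kick any of them whose
	 * descriptor has the ON bit set, since an interrupt was posted while
	 * the vCPU was sleeping.
	 */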
	spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
	list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
			blocked_vcpu_list) {
		struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

		if (pi_test_on(pi_desc) == 1)
			kvm_vcpu_kick(vcpu);
	}
	spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
}

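/*
 * Program the common MMU code with the EPT encodings of the readable,
 * writable, executable and (when supported) accessed/dirty bits, then switch
 * the MMU over to two-dimensional paging.
 */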
void vmx_enable_tdp(void)
{
	kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
		enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
		enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
		0ull, VMX_EPT_EXECUTABLE_MASK,
		cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
		VMX_EPT_RWX_MASK, 0ull);

	ept_set_mmio_spte_mask();
	kvm_enable_tdp();
}

static __init int hardware_setup(void)
{
	int r = -ENOMEM, i, msr;

	rdmsrl_safe(MSR_EFER, &host_efer);

	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
		kvm_define_shared_msr(i, vmx_msr_index[i]);

	for (i = 0; i < VMX_BITMAP_NR; i++) {
		vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL);
		if (!vmx_bitmap[i])
			goto out;
	}

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	/*
	 * Allow direct access to the PC debug port (it is often used for I/O
	 * delays, but the vmexits simply slow things down).
	 */
	memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
	clear_bit(0x80, vmx_io_bitmap_a);

	memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);

	memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
	memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);

	if (setup_vmcs_config(&vmcs_config) < 0) {
		r = -EIO;
		goto out;
	}

	if (boot_cpu_has(X86_FEATURE_NX))
		kvm_enable_efer_bits(EFER_NX);

	if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
		!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
		enable_vpid = 0;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs)
		init_vmcs_shadow_fields();

	if (!cpu_has_vmx_ept() ||
	    !cpu_has_vmx_ept_4levels() ||
	    !cpu_has_vmx_ept_mt_wb() ||
	    !cpu_has_vmx_invept_global())
		enable_ept = 0;

	if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
		enable_ept_ad_bits = 0;

	if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
		enable_unrestricted_guest = 0;

	if (!cpu_has_vmx_flexpriority())
		flexpriority_enabled = 0;

	if (!cpu_has_virtual_nmis())
		enable_vnmi = 0;

	/*
	 * set_apic_access_page_addr() is used to reload apic access
	 * page upon invalidation.  No need to do anything if not
	 * using the APIC_ACCESS_ADDR VMCS field.
	 */
	if (!flexpriority_enabled)
		kvm_x86_ops->set_apic_access_page_addr = NULL;

	if (!cpu_has_vmx_tpr_shadow())
		kvm_x86_ops->update_cr8_intercept = NULL;

	if (enable_ept && !cpu_has_vmx_ept_2m_page())
		kvm_disable_largepages();

	if (!cpu_has_vmx_ple()) {
		ple_gap = 0;
		ple_window = 0;
		ple_window_grow = 0;
		ple_window_max = 0;
		ple_window_shrink = 0;
	}

	if (!cpu_has_vmx_apicv()) {
		enable_apicv = 0;
		kvm_x86_ops->sync_pir_to_irr = NULL;
	}

	if (cpu_has_vmx_tsc_scaling()) {
		kvm_has_tsc_control = true;
		kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
		kvm_tsc_scaling_ratio_frac_bits = 48;
	}

	vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
	vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
	vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);

	memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
			vmx_msr_bitmap_legacy, PAGE_SIZE);
	memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
			vmx_msr_bitmap_longmode, PAGE_SIZE);
	memcpy(vmx_msr_bitmap_legacy_x2apic,
			vmx_msr_bitmap_legacy, PAGE_SIZE);
	memcpy(vmx_msr_bitmap_longmode_x2apic,
			vmx_msr_bitmap_longmode, PAGE_SIZE);

	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */

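	/*
	 * Drop read intercepts for the x2APIC MSR range (0x800 - 0x8ff);
	 * TMCCT (the timer current count) is skipped so that reads of it
	 * still trap and can be emulated.
	 */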
	for (msr = 0x800; msr <= 0x8ff; msr++) {
		if (msr == 0x839 /* TMCCT */)
			continue;
		vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
	}

	/*
	 * TPR reads and writes can be virtualized even if virtual interrupt
	 * delivery is not in use.
	 */
	vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true);
	vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false);

	/* EOI */
	vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true);
	/* SELF-IPI */
	vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);

	if (enable_ept)
		vmx_enable_tdp();
	else
		kvm_disable_tdp();

	update_ple_window_actual_max();

	/*
	 * Only enable PML when hardware supports PML feature, and both EPT
	 * and EPT A/D bit features are enabled -- PML depends on them to work.
	 */
	if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
		enable_pml = 0;

	if (!enable_pml) {
		kvm_x86_ops->slot_enable_log_dirty = NULL;
		kvm_x86_ops->slot_disable_log_dirty = NULL;
		kvm_x86_ops->flush_log_dirty = NULL;
		kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
	}

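	/*
	 * The VMX preemption timer counts down at a rate derived from the
	 * TSC; MSR_IA32_VMX_MISC[4:0] encodes that ratio and is cached in
	 * cpu_preemption_timer_multi.  Without the feature, the
	 * set_hv_timer/cancel_hv_timer callbacks are disabled.
	 */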
	if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
		u64 vmx_msr;

		rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
		cpu_preemption_timer_multi =
			 vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
	} else {
		kvm_x86_ops->set_hv_timer = NULL;
		kvm_x86_ops->cancel_hv_timer = NULL;
	}

	kvm_set_posted_intr_wakeup_handler(wakeup_handler);

	kvm_mce_cap_supported |= MCG_LMCE_P;

	return alloc_kvm_area();

out:
	for (i = 0; i < VMX_BITMAP_NR; i++)
		free_page((unsigned long)vmx_bitmap[i]);

	return r;
}

static __exit void hardware_unsetup(void)
{
	int i;

	for (i = 0; i < VMX_BITMAP_NR; i++)
		free_page((unsigned long)vmx_bitmap[i]);

	free_kvm_area();
}

/*
 * Indicate a busy-waiting vcpu in a spinlock. We do not enable PAUSE
 * exiting, so we only get here on CPUs with PAUSE-loop exiting.
 */
static int handle_pause(struct kvm_vcpu *vcpu)
{
	if (ple_gap)
		grow_ple_window(vcpu);

	/*
	 * The Intel SDM, Vol. 3, ch. 25.1.3, says: the "PAUSE-loop exiting"
	 * VM-execution control is ignored if CPL > 0. OTOH, KVM
	 * never sets PAUSE_EXITING and only sets PLE if supported,
	 * so the vcpu must be at CPL 0 if it gets a PAUSE exit.
	 */
	kvm_vcpu_on_spin(vcpu, true);
	return kvm_skip_emulated_instruction(vcpu);
}

static int handle_nop(struct kvm_vcpu *vcpu)
{
	return kvm_skip_emulated_instruction(vcpu);
}

static int handle_mwait(struct kvm_vcpu *vcpu)
{
	printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
	return handle_nop(vcpu);
}

static int handle_invalid_op(struct kvm_vcpu *vcpu)
{
	kvm_queue_exception(vcpu, UD_VECTOR);
	return 1;
}

static int handle_monitor_trap(struct kvm_vcpu *vcpu)
{
	return 1;
}

static int handle_monitor(struct kvm_vcpu *vcpu)
{
	printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
	return handle_nop(vcpu);
}

/*
 * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
 * We could reuse a single VMCS for all the L2 guests, but we also want the
 * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
 * allows keeping them loaded on the processor, and in the future will allow
 * optimizations where prepare_vmcs02 doesn't need to set all the fields on
 * every entry if they never change.
 * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
 * (>=0) with a vmcs02 for each recently loaded vmcs12, most recent first.
 *
 * The following functions allocate and free a vmcs02 in this pool.
 */

/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
{
	struct vmcs02_list *item;
	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
		if (item->vmptr == vmx->nested.current_vmptr) {
			list_move(&item->list, &vmx->nested.vmcs02_pool);
			return &item->vmcs02;
		}

	if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
		/* Recycle the least recently used VMCS. */
		item = list_last_entry(&vmx->nested.vmcs02_pool,
				       struct vmcs02_list, list);
		item->vmptr = vmx->nested.current_vmptr;
		list_move(&item->list, &vmx->nested.vmcs02_pool);
		return &item->vmcs02;
	}

	/* Create a new VMCS */
	item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
	if (!item)
		return NULL;
	item->vmcs02.vmcs = alloc_vmcs();
	item->vmcs02.shadow_vmcs = NULL;
	if (!item->vmcs02.vmcs) {
		kfree(item);
		return NULL;
	}
	loaded_vmcs_init(&item->vmcs02);
	item->vmptr = vmx->nested.current_vmptr;
	list_add(&(item->list), &(vmx->nested.vmcs02_pool));
	vmx->nested.vmcs02_num++;
	return &item->vmcs02;
}

/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
{
	struct vmcs02_list *item;
	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
		if (item->vmptr == vmptr) {
			free_loaded_vmcs(&item->vmcs02);
			list_del(&item->list);
			kfree(item);
			vmx->nested.vmcs02_num--;
			return;
		}
}

/*
 * Free all VMCSs saved for this vcpu, except the one pointed to by
 * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
 * must be &vmx->vmcs01.
 */
static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
{
	struct vmcs02_list *item, *n;

	WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
	list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
		/*
		 * Something will leak if the above WARN triggers.  Better than
		 * a use-after-free.
		 */
		if (vmx->loaded_vmcs == &item->vmcs02)
			continue;

		free_loaded_vmcs(&item->vmcs02);
		list_del(&item->list);
		kfree(item);
		vmx->nested.vmcs02_num--;
	}
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction, as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions".
 */
static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
}

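/* VMfailInvalid is signalled by CF=1 with the other arithmetic flags clear. */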
static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
}

static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
					u32 vm_instruction_error)
{
	if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
		/*
		 * failValid writes the error number to the current VMCS, which
		 * can't be done if there isn't a current VMCS.
		 */
		nested_vmx_failInvalid(vcpu);
		return;
	}
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force a shadow sync because
	 * VM_INSTRUCTION_ERROR is not shadowed
	 */
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: not to reset guest simply here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}

static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
{
	struct vcpu_vmx *vmx =
		container_of(timer, struct vcpu_vmx, nested.preemption_timer);

	vmx->nested.preemption_timer_expired = true;
	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
	kvm_vcpu_kick(&vmx->vcpu);

	return HRTIMER_NORESTART;
}

/*
 * Decode the memory-address operand of a vmx instruction, as recorded on an
 * exit caused by such an instruction (run by a guest hypervisor).
 * On success, returns 0. When the operand is invalid, returns 1 and throws
 * #UD or #GP.
 */
static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
				 unsigned long exit_qualification,
				 u32 vmx_instruction_info, bool wr, gva_t *ret)
{
	gva_t off;
	bool exn;
	struct kvm_segment s;

	/*
	 * According to Vol. 3B, "Information for VM Exits Due to Instruction
	 * Execution", on an exit, vmx_instruction_info holds most of the
	 * addressing components of the operand. Only the displacement part
	 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
	 * For how an actual address is calculated from all these components,
	 * refer to Vol. 1, "Operand Addressing".
	 */
	int  scaling = vmx_instruction_info & 3;
	int  addr_size = (vmx_instruction_info >> 7) & 7;
	bool is_reg = vmx_instruction_info & (1u << 10);
	int  seg_reg = (vmx_instruction_info >> 15) & 7;
	int  index_reg = (vmx_instruction_info >> 18) & 0xf;
	bool index_is_valid = !(vmx_instruction_info & (1u << 22));
	int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
	bool base_is_valid  = !(vmx_instruction_info & (1u << 27));

	if (is_reg) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	/* Addr = segment_base + offset */
	/* offset = base + [index * scale] + displacement */
	off = exit_qualification; /* holds the displacement */
	if (base_is_valid)
		off += kvm_register_read(vcpu, base_reg);
	if (index_is_valid)
		off += kvm_register_read(vcpu, index_reg) << scaling;
	vmx_get_segment(vcpu, &s, seg_reg);
	*ret = s.base + off;

	if (addr_size == 1) /* 32 bit */
		*ret &= 0xffffffff;

	/* Checks for #GP/#SS exceptions. */
	exn = false;
	if (is_long_mode(vcpu)) {
		/* Long mode: #GP(0)/#SS(0) if the memory address is in a
		 * non-canonical form. This is the only check on the memory
		 * destination for long mode!
		 */
		exn = is_noncanonical_address(*ret, vcpu);
	} else if (is_protmode(vcpu)) {
		/* Protected mode: apply checks for segment validity in the
		 * following order:
		 * - segment type check (#GP(0) may be thrown)
		 * - usability check (#GP(0)/#SS(0))
		 * - limit check (#GP(0)/#SS(0))
		 */
		if (wr)
			/* #GP(0) if the destination operand is located in a
			 * read-only data segment or any code segment.
			 */
			exn = ((s.type & 0xa) == 0 || (s.type & 8));
		else
			/* #GP(0) if the source operand is located in an
			 * execute-only code segment
			 */
			exn = ((s.type & 0xa) == 8);
		if (exn) {
			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
			return 1;
		}
		/* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
		 */
		exn = (s.unusable != 0);
		/* Protected mode: #GP(0)/#SS(0) if the memory
		 * operand is outside the segment limit.
		 */
		exn = exn || (off + sizeof(u64) > s.limit);
	}
	if (exn) {
		kvm_queue_exception_e(vcpu,
				      seg_reg == VCPU_SREG_SS ?
						SS_VECTOR : GP_VECTOR,
				      0);
		return 1;
	}

	return 0;
}

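/*
 * Decode the memory operand of VMXON/VMCLEAR/VMPTRLD and read the 64-bit
 * VMCS pointer it refers to, injecting a page fault on access failure.
 */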
static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
{
	gva_t gva;
	struct x86_exception e;

	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
			vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
		return 1;

	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, vmpointer,
				sizeof(*vmpointer), &e)) {
		kvm_inject_page_fault(vcpu, &e);
		return 1;
	}

	return 0;
}

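/*
 * Set up the state needed while L1 is in VMX operation: an MSR bitmap for
 * the nested guest, a cache for the current vmcs12, an optional shadow VMCS,
 * and the hrtimer that emulates the VMX preemption timer.
 */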
static int enter_vmx_operation(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs *shadow_vmcs;

	if (cpu_has_vmx_msr_bitmap()) {
		vmx->nested.msr_bitmap =
				(unsigned long *)__get_free_page(GFP_KERNEL);
		if (!vmx->nested.msr_bitmap)
			goto out_msr_bitmap;
	}

	vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
	if (!vmx->nested.cached_vmcs12)
		goto out_cached_vmcs12;

	if (enable_shadow_vmcs) {
		shadow_vmcs = alloc_vmcs();
		if (!shadow_vmcs)
			goto out_shadow_vmcs;
		/* mark vmcs as shadow */
		shadow_vmcs->revision_id |= (1u << 31);
		/* init shadow vmcs */
		vmcs_clear(shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = shadow_vmcs;
	}

	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
	vmx->nested.vmcs02_num = 0;

	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_REL_PINNED);
	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;

	vmx->nested.vmxon = true;
	return 0;

out_shadow_vmcs:
	kfree(vmx->nested.cached_vmcs12);

out_cached_vmcs12:
	free_page((unsigned long)vmx->nested.msr_bitmap);

out_msr_bitmap:
	return -ENOMEM;
}

/*
 * Emulate the VMXON instruction.
 * Currently, we just remember that VMX is active, and do not save or even
 * inspect the argument to VMXON (the so-called "VMXON pointer") because we
 * do not currently need to store anything in that guest-allocated memory
 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
 * argument is different from the VMXON pointer (which the spec says they do).
 */
static int handle_vmon(struct kvm_vcpu *vcpu)
{
	int ret;
	gpa_t vmptr;
	struct page *page;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
		| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;

	/*
	 * The Intel VMX Instruction Reference lists a bunch of bits that are
	 * prerequisite to running VMXON, most notably cr4.VMXE must be set to
	 * 1 (see vmx_set_cr4() for when we allow the guest to set this).
	 * Otherwise, we should fail with #UD.  But most faulting conditions
	 * have already been checked by hardware, prior to the VM-exit for
	 * VMXON.  We do test guest cr4.VMXE because processor CR4 always has
	 * that bit set to 1 in non-root mode.
7311
	 */
7312
	if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
7313 7314 7315 7316
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	if (vmx->nested.vmxon) {
		nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
		return kvm_skip_emulated_instruction(vcpu);
	}

	if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
			!= VMXON_NEEDED_FEATURES) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	if (nested_vmx_get_vmptr(vcpu, &vmptr))
		return 1;

	/*
	 * SDM 3: 24.11.5
	 * The first 4 bytes of VMXON region contain the supported
	 * VMCS revision identifier
	 *
	 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
	 * which replaces physical address width with 32
	 */
	if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
		nested_vmx_failInvalid(vcpu);
		return kvm_skip_emulated_instruction(vcpu);
	}

	page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
	if (is_error_page(page)) {
		nested_vmx_failInvalid(vcpu);
		return kvm_skip_emulated_instruction(vcpu);
	}
	if (*(u32 *)kmap(page) != VMCS12_REVISION) {
		kunmap(page);
		kvm_release_page_clean(page);
		nested_vmx_failInvalid(vcpu);
		return kvm_skip_emulated_instruction(vcpu);
	}
	kunmap(page);
	kvm_release_page_clean(page);

	vmx->nested.vmxon_ptr = vmptr;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	nested_vmx_succeed(vcpu);
	return kvm_skip_emulated_instruction(vcpu);
}

/*
 * Intel's VMX Instruction Reference specifies a common set of prerequisites
 * for running VMX instructions (except VMXON, whose prerequisites are
 * slightly different). It also specifies what exception to inject otherwise.
 * Note that many of these exceptions have priority over VM exits, so they
 * don't have to be checked again here.
 */
static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
{
	if (!to_vmx(vcpu)->nested.vmxon) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 0;
	}
	return 1;
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, -1ull);
}

static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
{
	if (vmx->nested.current_vmptr == -1ull)
		return;

	if (enable_shadow_vmcs) {
		/* copy to memory all shadowed fields in case
		   they were modified */
		copy_shadow_to_vmcs12(vmx);
		vmx->nested.sync_shadow_vmcs = false;
		vmx_disable_shadow_vmcs(vmx);
	}
	vmx->nested.posted_intr_nv = -1;

	/* Flush VMCS12 to guest memory */
	kvm_vcpu_write_guest_page(&vmx->vcpu,
				  vmx->nested.current_vmptr >> PAGE_SHIFT,
				  vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);

	vmx->nested.current_vmptr = -1ull;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct vcpu_vmx *vmx)
{
	if (!vmx->nested.vmxon)
		return;

	vmx->nested.vmxon = false;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = -1ull;
	if (vmx->nested.msr_bitmap) {
		free_page((unsigned long)vmx->nested.msr_bitmap);
		vmx->nested.msr_bitmap = NULL;
	}
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	/* Unpin physical memory we referred to in current vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_dirty(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	if (vmx->nested.virtual_apic_page) {
		kvm_release_page_dirty(vmx->nested.virtual_apic_page);
		vmx->nested.virtual_apic_page = NULL;
	}
	if (vmx->nested.pi_desc_page) {
		kunmap(vmx->nested.pi_desc_page);
		kvm_release_page_dirty(vmx->nested.pi_desc_page);
		vmx->nested.pi_desc_page = NULL;
		vmx->nested.pi_desc = NULL;
	}

	nested_free_all_saved_vmcss(vmx);
}

/* Emulate the VMXOFF instruction */
static int handle_vmoff(struct kvm_vcpu *vcpu)
{
	if (!nested_vmx_check_permission(vcpu))
		return 1;
	free_nested(to_vmx(vcpu));
	nested_vmx_succeed(vcpu);
	return kvm_skip_emulated_instruction(vcpu);
}

/* Emulate the VMCLEAR instruction */
static int handle_vmclear(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 zero = 0;
	gpa_t vmptr;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	if (nested_vmx_get_vmptr(vcpu, &vmptr))
		return 1;

	if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
		nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
		return kvm_skip_emulated_instruction(vcpu);
	}

	if (vmptr == vmx->nested.vmxon_ptr) {
		nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
		return kvm_skip_emulated_instruction(vcpu);
	}

	if (vmptr == vmx->nested.current_vmptr)
		nested_release_vmcs12(vmx);

	kvm_vcpu_write_guest(vcpu,
			vmptr + offsetof(struct vmcs12, launch_state),
			&zero, sizeof(zero));

	nested_free_vmcs02(vmx, vmptr);

	nested_vmx_succeed(vcpu);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);

/* Emulate the VMLAUNCH instruction */
static int handle_vmlaunch(struct kvm_vcpu *vcpu)
{
	return nested_vmx_run(vcpu, true);
}

/* Emulate the VMRESUME instruction */
static int handle_vmresume(struct kvm_vcpu *vcpu)
{

	return nested_vmx_run(vcpu, false);
}

/*
 * Read a vmcs12 field. Since these can have varying lengths and we return
 * one type, we chose the biggest type (u64) and zero-extend the return value
 * to that size. Note that the caller, handle_vmread, might need to use only
 * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
 * 64-bit fields are to be returned).
 */
static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
				  unsigned long field, u64 *ret)
{
	short offset = vmcs_field_to_offset(field);
	char *p;

	if (offset < 0)
		return offset;

	p = ((char *)(get_vmcs12(vcpu))) + offset;

	switch (vmcs_field_type(field)) {
	case VMCS_FIELD_TYPE_NATURAL_WIDTH:
		*ret = *((natural_width *)p);
		return 0;
	case VMCS_FIELD_TYPE_U16:
		*ret = *((u16 *)p);
		return 0;
	case VMCS_FIELD_TYPE_U32:
		*ret = *((u32 *)p);
		return 0;
	case VMCS_FIELD_TYPE_U64:
		*ret = *((u64 *)p);
		return 0;
	default:
		WARN_ON(1);
		return -ENOENT;
	}
}


static inline int vmcs12_write_any(struct kvm_vcpu *vcpu,
				   unsigned long field, u64 field_value)
{
	short offset = vmcs_field_to_offset(field);
	char *p = ((char *) get_vmcs12(vcpu)) + offset;
	if (offset < 0)
		return offset;

	switch (vmcs_field_type(field)) {
	case VMCS_FIELD_TYPE_U16:
		*(u16 *)p = field_value;
		return 0;
	case VMCS_FIELD_TYPE_U32:
		*(u32 *)p = field_value;
		return 0;
	case VMCS_FIELD_TYPE_U64:
		*(u64 *)p = field_value;
		return 0;
	case VMCS_FIELD_TYPE_NATURAL_WIDTH:
		*(natural_width *)p = field_value;
		return 0;
	default:
		WARN_ON(1);
		return -ENOENT;
	}

}

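/*
 * Copy the shadow-VMCS fields that L1 may have changed with VMWRITE back
 * into the software vmcs12, so that the two views stay consistent.
 */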
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
	int i;
	unsigned long field;
	u64 field_value;
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	const unsigned long *fields = shadow_read_write_fields;
	const int num_fields = max_shadow_read_write_fields;

	preempt_disable();

	vmcs_load(shadow_vmcs);

	for (i = 0; i < num_fields; i++) {
		field = fields[i];
		switch (vmcs_field_type(field)) {
		case VMCS_FIELD_TYPE_U16:
			field_value = vmcs_read16(field);
			break;
		case VMCS_FIELD_TYPE_U32:
			field_value = vmcs_read32(field);
			break;
		case VMCS_FIELD_TYPE_U64:
			field_value = vmcs_read64(field);
			break;
		case VMCS_FIELD_TYPE_NATURAL_WIDTH:
			field_value = vmcs_readl(field);
			break;
		default:
			WARN_ON(1);
			continue;
		}
		vmcs12_write_any(&vmx->vcpu, field, field_value);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);

	preempt_enable();
}

static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const unsigned long *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	int i, q;
	unsigned long field;
	u64 field_value = 0;
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			vmcs12_read_any(&vmx->vcpu, field, &field_value);

			switch (vmcs_field_type(field)) {
			case VMCS_FIELD_TYPE_U16:
				vmcs_write16(field, (u16)field_value);
				break;
			case VMCS_FIELD_TYPE_U32:
				vmcs_write32(field, (u32)field_value);
				break;
			case VMCS_FIELD_TYPE_U64:
				vmcs_write64(field, (u64)field_value);
				break;
			case VMCS_FIELD_TYPE_NATURAL_WIDTH:
				vmcs_writel(field, (long)field_value);
				break;
			default:
				WARN_ON(1);
				break;
			}
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}

/*
 * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
 * used before) all generate the same failure when it is missing.
 */
static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	if (vmx->nested.current_vmptr == -1ull) {
		nested_vmx_failInvalid(vcpu);
		return 0;
	}
	return 1;
}

static int handle_vmread(struct kvm_vcpu *vcpu)
{
	unsigned long field;
	u64 field_value;
	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	gva_t gva = 0;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	if (!nested_vmx_check_vmcs12(vcpu))
		return kvm_skip_emulated_instruction(vcpu);

	/* Decode instruction info and find the field to read */
	field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
	/* Read the field, zero-extended to a u64 field_value */
	if (vmcs12_read_any(vcpu, field, &field_value) < 0) {
		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
		return kvm_skip_emulated_instruction(vcpu);
	}
	/*
	 * Now copy part of this value to register or memory, as requested.
	 * Note that the number of bits actually copied is 32 or 64 depending
	 * on the guest's mode (32 or 64 bit), not on the given field's length.
	 */
	if (vmx_instruction_info & (1u << 10)) {
		kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
			field_value);
	} else {
		if (get_vmx_mem_address(vcpu, exit_qualification,
				vmx_instruction_info, true, &gva))
			return 1;
		/* _system ok, as hardware has verified cpl=0 */
		kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
			     &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
	}

	nested_vmx_succeed(vcpu);
	return kvm_skip_emulated_instruction(vcpu);
}


static int handle_vmwrite(struct kvm_vcpu *vcpu)
{
	unsigned long field;
	gva_t gva;
	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	/* The value to write might be 32 or 64 bits, depending on L1's long
	 * mode, and eventually we need to write that into a field of several
	 * possible lengths. The code below first zero-extends the value to 64
	 * bit (field_value), and then copies only the appropriate number of
	 * bits into the vmcs12 field.
	 */
	u64 field_value = 0;
	struct x86_exception e;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	if (!nested_vmx_check_vmcs12(vcpu))
		return kvm_skip_emulated_instruction(vcpu);

	if (vmx_instruction_info & (1u << 10))
		field_value = kvm_register_readl(vcpu,
			(((vmx_instruction_info) >> 3) & 0xf));
	else {
		if (get_vmx_mem_address(vcpu, exit_qualification,
				vmx_instruction_info, false, &gva))
			return 1;
		if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
			   &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
			kvm_inject_page_fault(vcpu, &e);
			return 1;
		}
	}


	field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
	if (vmcs_field_readonly(field)) {
		nested_vmx_failValid(vcpu,
			VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
		return kvm_skip_emulated_instruction(vcpu);
	}

	if (vmcs12_write_any(vcpu, field, field_value) < 0) {
		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
		return kvm_skip_emulated_instruction(vcpu);
	}

	nested_vmx_succeed(vcpu);
	return kvm_skip_emulated_instruction(vcpu);
}

static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
{
	vmx->nested.current_vmptr = vmptr;
	if (enable_shadow_vmcs) {
		vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
			      SECONDARY_EXEC_SHADOW_VMCS);
		vmcs_write64(VMCS_LINK_POINTER,
			     __pa(vmx->vmcs01.shadow_vmcs));
		vmx->nested.sync_shadow_vmcs = true;
	}
}

/* Emulate the VMPTRLD instruction */
static int handle_vmptrld(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	gpa_t vmptr;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	if (nested_vmx_get_vmptr(vcpu, &vmptr))
		return 1;

	if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
		nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
		return kvm_skip_emulated_instruction(vcpu);
	}

	if (vmptr == vmx->nested.vmxon_ptr) {
		nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
		return kvm_skip_emulated_instruction(vcpu);
	}

	if (vmx->nested.current_vmptr != vmptr) {
		struct vmcs12 *new_vmcs12;
		struct page *page;
		page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
		if (is_error_page(page)) {
			nested_vmx_failInvalid(vcpu);
			return kvm_skip_emulated_instruction(vcpu);
		}
		new_vmcs12 = kmap(page);
		if (new_vmcs12->revision_id != VMCS12_REVISION) {
			kunmap(page);
			kvm_release_page_clean(page);
			nested_vmx_failValid(vcpu,
				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
			return kvm_skip_emulated_instruction(vcpu);
		}

		nested_release_vmcs12(vmx);
		/*
		 * Load VMCS12 from guest memory since it is not already
		 * cached.
		 */
		memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
		kunmap(page);
		kvm_release_page_clean(page);

		set_current_vmptr(vmx, vmptr);
	}

	nested_vmx_succeed(vcpu);
	return kvm_skip_emulated_instruction(vcpu);
}

/* Emulate the VMPTRST instruction */
static int handle_vmptrst(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	gva_t vmcs_gva;
	struct x86_exception e;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	if (get_vmx_mem_address(vcpu, exit_qualification,
			vmx_instruction_info, true, &vmcs_gva))
		return 1;
	/* ok to use *_system, as hardware has verified cpl=0 */
	if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
				 (void *)&to_vmx(vcpu)->nested.current_vmptr,
				 sizeof(u64), &e)) {
		kvm_inject_page_fault(vcpu, &e);
		return 1;
	}
	nested_vmx_succeed(vcpu);
	return kvm_skip_emulated_instruction(vcpu);
}

/* Emulate the INVEPT instruction */
static int handle_invept(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vmx_instruction_info, types;
	unsigned long type;
	gva_t gva;
	struct x86_exception e;
	struct {
		u64 eptp, gpa;
	} operand;

	if (!(vmx->nested.nested_vmx_secondary_ctls_high &
	      SECONDARY_EXEC_ENABLE_EPT) ||
	    !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);

	types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;

	if (type >= 32 || !(types & (1 << type))) {
		nested_vmx_failValid(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
		return kvm_skip_emulated_instruction(vcpu);
	}

	/* According to the Intel VMX instruction reference, the memory
	 * operand is read even if it isn't needed (e.g., for type==global)
	 */
	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
			vmx_instruction_info, false, &gva))
		return 1;
	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
				sizeof(operand), &e)) {
		kvm_inject_page_fault(vcpu, &e);
		return 1;
	}

	switch (type) {
	case VMX_EPT_EXTENT_GLOBAL:
	/*
	 * TODO: track mappings and invalidate
	 * single context requests appropriately
	 */
	case VMX_EPT_EXTENT_CONTEXT:
		kvm_mmu_sync_roots(vcpu);
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
		nested_vmx_succeed(vcpu);
		break;
	default:
		BUG_ON(1);
		break;
	}

	return kvm_skip_emulated_instruction(vcpu);
}

static int handle_invvpid(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vmx_instruction_info;
	unsigned long type, types;
	gva_t gva;
	struct x86_exception e;
	struct {
		u64 vpid;
		u64 gla;
	} operand;

	if (!(vmx->nested.nested_vmx_secondary_ctls_high &
	      SECONDARY_EXEC_ENABLE_VPID) ||
			!(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);

	types = (vmx->nested.nested_vmx_vpid_caps &
			VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;

	if (type >= 32 || !(types & (1 << type))) {
		nested_vmx_failValid(vcpu,
			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
		return kvm_skip_emulated_instruction(vcpu);
	}

	/* According to the Intel VMX instruction reference, the memory
	 * operand is read even if it isn't needed (e.g., for type==global)
	 */
	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
			vmx_instruction_info, false, &gva))
		return 1;
	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
				sizeof(operand), &e)) {
		kvm_inject_page_fault(vcpu, &e);
		return 1;
	}
	if (operand.vpid >> 16) {
		nested_vmx_failValid(vcpu,
			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
		return kvm_skip_emulated_instruction(vcpu);
	}

	switch (type) {
	case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
		if (is_noncanonical_address(operand.gla, vcpu)) {
			nested_vmx_failValid(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
			return kvm_skip_emulated_instruction(vcpu);
		}
		/* fall through */
	case VMX_VPID_EXTENT_SINGLE_CONTEXT:
	case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
		if (!operand.vpid) {
			nested_vmx_failValid(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
			return kvm_skip_emulated_instruction(vcpu);
		}
		break;
	case VMX_VPID_EXTENT_ALL_CONTEXT:
		break;
	default:
		WARN_ON_ONCE(1);
		return kvm_skip_emulated_instruction(vcpu);
	}

	__vmx_flush_tlb(vcpu, vmx->nested.vpid02);
	nested_vmx_succeed(vcpu);

	return kvm_skip_emulated_instruction(vcpu);
}

static int handle_pml_full(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification;

	trace_kvm_pml_full(vcpu->vcpu_id);

	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);

	/*
	 * PML buffer FULL happened while executing iret from NMI,
	 * "blocked by NMI" bit has to be set before next VM entry.
	 */
	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
			enable_vnmi &&
			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
				GUEST_INTR_STATE_NMI);

	/*
	 * PML buffer already flushed at beginning of VMEXIT. Nothing to do
	 * here, and there's no userspace involvement needed for PML.
	 */
	return 1;
}

static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
	kvm_lapic_expired_hv_timer(vcpu);
	return 1;
}

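/*
 * Validate an EPT pointer supplied by L1: the memory type must be one the
 * hardware advertises (UC or WB), the page-walk length must be 4, reserved
 * bits must be clear, and the A/D-enable bit is accepted only when EPT A/D
 * bits are supported.
 */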
static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int maxphyaddr = cpuid_maxphyaddr(vcpu);

	/* Check for memory type validity */
	switch (address & VMX_EPTP_MT_MASK) {
	case VMX_EPTP_MT_UC:
		if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_UC_BIT))
			return false;
		break;
	case VMX_EPTP_MT_WB:
		if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_WB_BIT))
			return false;
		break;
	default:
		return false;
	}

	/* only a 4-level page-walk length is valid */
	if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)
		return false;

	/* Reserved bits should not be set */
	if (address >> maxphyaddr || ((address >> 7) & 0x1f))
		return false;

	/* AD, if set, should be supported */
	if (address & VMX_EPTP_AD_ENABLE_BIT) {
		if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPT_AD_BIT))
			return false;
	}

	return true;
}

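/*
 * VMFUNC leaf 0 (EPTP switching) on behalf of L2: fetch entry "index" (from
 * RCX) out of the EPTP list L1 provided and, if the new EPTP is valid and
 * different from the current one, reload the nested MMU with it.
 */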
static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
				     struct vmcs12 *vmcs12)
{
	u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
	u64 address;
	bool accessed_dirty;
	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

	if (!nested_cpu_has_eptp_switching(vmcs12) ||
	    !nested_cpu_has_ept(vmcs12))
		return 1;

	if (index >= VMFUNC_EPTP_ENTRIES)
		return 1;


	if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
				     &address, index * 8, 8))
		return 1;

	accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);

	/*
	 * If the (L2) guest does a vmfunc to the currently
	 * active ept pointer, we don't have to do anything else
	 */
	if (vmcs12->ept_pointer != address) {
		if (!valid_ept_address(vcpu, address))
			return 1;

		kvm_mmu_unload(vcpu);
		mmu->ept_ad = accessed_dirty;
		mmu->base_role.ad_disabled = !accessed_dirty;
		vmcs12->ept_pointer = address;
		/*
		 * TODO: Check what's the correct approach in case
		 * mmu reload fails. Currently, we just let the next
		 * reload potentially fail
		 */
		kvm_mmu_reload(vcpu);
	}

	return 0;
}

static int handle_vmfunc(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	u32 function = vcpu->arch.regs[VCPU_REGS_RAX];

	/*
	 * VMFUNC is only supported for nested guests, but we always enable the
	 * secondary control for simplicity; for non-nested mode, fake that we
	 * didn't by injecting #UD.
	 */
	if (!is_guest_mode(vcpu)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	vmcs12 = get_vmcs12(vcpu);
	if ((vmcs12->vm_function_control & (1 << function)) == 0)
		goto fail;

	switch (function) {
	case 0:
		if (nested_vmx_eptp_switching(vcpu, vmcs12))
			goto fail;
		break;
	default:
		goto fail;
	}
	return kvm_skip_emulated_instruction(vcpu);

fail:
	nested_vmx_vmexit(vcpu, vmx->exit_reason,
			  vmcs_read32(VM_EXIT_INTR_INFO),
			  vmcs_readl(EXIT_QUALIFICATION));
	return 1;
}

/*
 * The exit handlers return 1 if the exit was handled fully and guest execution
 * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
 * to be done to userspace and return 0.
 */
static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
	[EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
	[EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
	[EXIT_REASON_NMI_WINDOW]	      = handle_nmi_window,
	[EXIT_REASON_IO_INSTRUCTION]          = handle_io,
	[EXIT_REASON_CR_ACCESS]               = handle_cr,
	[EXIT_REASON_DR_ACCESS]               = handle_dr,
	[EXIT_REASON_CPUID]                   = handle_cpuid,
	[EXIT_REASON_MSR_READ]                = handle_rdmsr,
	[EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
	[EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
	[EXIT_REASON_HLT]                     = handle_halt,
8179
	[EXIT_REASON_INVD]		      = handle_invd,
M
A
8182
	[EXIT_REASON_VMCALL]                  = handle_vmcall,
N
8184
	[EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
N
N
8187
	[EXIT_REASON_VMREAD]                  = handle_vmread,
8188
	[EXIT_REASON_VMRESUME]                = handle_vmresume,
8189
	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
8190 8191
	[EXIT_REASON_VMOFF]                   = handle_vmoff,
	[EXIT_REASON_VMON]                    = handle_vmon,
8192 8193
	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
8194
	[EXIT_REASON_APIC_WRITE]              = handle_apic_write,
8195
	[EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
E
8197
	[EXIT_REASON_XSETBV]                  = handle_xsetbv,
8198
	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
A
8200 8201
	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
	[EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
8202
	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
8203
	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_mwait,
8204
	[EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
8205
	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
N
8207
	[EXIT_REASON_INVVPID]                 = handle_invvpid,
8208
	[EXIT_REASON_RDRAND]                  = handle_invalid_op,
8209
	[EXIT_REASON_RDSEED]                  = handle_invalid_op,
8210 8211
	[EXIT_REASON_XSAVES]                  = handle_xsaves,
	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
K
B
8214
	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
A

static const int kvm_vmx_max_exit_handlers =
8218
	ARRAY_SIZE(kvm_vmx_exit_handlers);
A
Avi Kivity 已提交
8219

8220 8221 8222 8223 8224 8225 8226 8227 8228 8229
static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
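/*
 * Consult L1's I/O bitmaps to decide whether an I/O instruction executed by
 * L2 must be reflected to L1.  Ports 0x0000-0x7fff are covered by bitmap A
 * and 0x8000-0xffff by bitmap B; a multi-byte access is checked one port at
 * a time.
 */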
				       struct vmcs12 *vmcs12)
{
	unsigned long exit_qualification;
	gpa_t bitmap, last_bitmap;
	unsigned int port;
	int size;
	u8 b;

	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
8230
		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245

	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);

	port = exit_qualification >> 16;
	size = (exit_qualification & 7) + 1;

	last_bitmap = (gpa_t)-1;
	b = -1;

	while (size > 0) {
		if (port < 0x8000)
			bitmap = vmcs12->io_bitmap_a;
		else if (port < 0x10000)
			bitmap = vmcs12->io_bitmap_b;
		else
8246
			return true;
8247 8248 8249
		bitmap += (port & 0x7fff) / 8;

		if (last_bitmap != bitmap)
8250
			if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
8251
				return true;
8252
		if (b & (1 << (port & 7)))
8253
			return true;
8254 8255 8256 8257 8258 8259

		port++;
		size--;
		last_bitmap = bitmap;
	}

8260
	return false;
8261 8262
}

8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274
/*
 * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
 * disinterest in the current event (read or write a specific MSR) by using an
 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
 */
static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
	struct vmcs12 *vmcs12, u32 exit_reason)
{
	u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
	gpa_t bitmap;

8275
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
8276
		return true;
8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293

	/*
	 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
	 * for the four combinations of read/write and low/high MSR numbers.
	 * First we need to figure out which of the four to use:
	 */
	bitmap = vmcs12->msr_bitmap;
	if (exit_reason == EXIT_REASON_MSR_WRITE)
		bitmap += 2048;
	if (msr_index >= 0xc0000000) {
		msr_index -= 0xc0000000;
		bitmap += 1024;
	}

	/* Then read the msr_index'th bit from this bitmap: */
	if (msr_index < 1024*8) {
		unsigned char b;
8294
		if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
8295
			return true;
8296 8297
		return 1 & (b >> (msr_index & 7));
	} else
8298
		return true; /* let L1 handle the wrong parameter */
8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310
}

/*
 * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
 * intercept (via guest_host_mask etc.) the current event.
 */
static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
	struct vmcs12 *vmcs12)
{
	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	int cr = exit_qualification & 15;
8311 8312
	int reg;
	unsigned long val;
8313 8314 8315

	switch ((exit_qualification >> 4) & 3) {
	case 0: /* mov to cr */
8316 8317
		reg = (exit_qualification >> 8) & 15;
		val = kvm_register_readl(vcpu, reg);
8318 8319 8320 8321
		switch (cr) {
		case 0:
			if (vmcs12->cr0_guest_host_mask &
			    (val ^ vmcs12->cr0_read_shadow))
8322
				return true;
8323 8324 8325 8326 8327 8328 8329 8330 8331 8332
			break;
		case 3:
			if ((vmcs12->cr3_target_count >= 1 &&
					vmcs12->cr3_target_value0 == val) ||
				(vmcs12->cr3_target_count >= 2 &&
					vmcs12->cr3_target_value1 == val) ||
				(vmcs12->cr3_target_count >= 3 &&
					vmcs12->cr3_target_value2 == val) ||
				(vmcs12->cr3_target_count >= 4 &&
					vmcs12->cr3_target_value3 == val))
8333
				return false;
8334
			if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
8335
				return true;
8336 8337 8338 8339
			break;
		case 4:
			if (vmcs12->cr4_guest_host_mask &
			    (vmcs12->cr4_read_shadow ^ val))
8340
				return true;
8341 8342 8343
			break;
		case 8:
			if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
8344
				return true;
8345 8346 8347 8348 8349 8350
			break;
		}
		break;
	case 2: /* clts */
		if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
		    (vmcs12->cr0_read_shadow & X86_CR0_TS))
8351
			return true;
8352 8353 8354 8355 8356 8357
		break;
	case 1: /* mov from cr */
		switch (cr) {
		case 3:
			if (vmcs12->cpu_based_vm_exec_control &
			    CPU_BASED_CR3_STORE_EXITING)
8358
				return true;
8359 8360 8361 8362
			break;
		case 8:
			if (vmcs12->cpu_based_vm_exec_control &
			    CPU_BASED_CR8_STORE_EXITING)
8363
				return true;
8364 8365 8366 8367 8368 8369 8370 8371
			break;
		}
		break;
	case 3: /* lmsw */
		/*
		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
		 * cr0. Other attempted changes are ignored, with no exit.
		 */
8372
		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
8373 8374
		if (vmcs12->cr0_guest_host_mask & 0xe &
		    (val ^ vmcs12->cr0_read_shadow))
8375
			return true;
8376 8377 8378
		if ((vmcs12->cr0_guest_host_mask & 0x1) &&
		    !(vmcs12->cr0_read_shadow & 0x1) &&
		    (val & 0x1))
8379
			return true;
8380 8381
		break;
	}
8382
	return false;
8383 8384 8385 8386 8387 8388 8389
}

/*
 * Return true if we should exit from L2 to L1 to handle an exit, or false
 * if we should handle it ourselves in L0 (and then continue L2). Only call
 * this when in is_guest_mode (L2).
 */
static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
{
	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	if (vmx->nested.nested_run_pending)
		return false;

	if (unlikely(vmx->fail)) {
		pr_info_ratelimited("%s failed vm entry %x\n", __func__,
				    vmcs_read32(VM_INSTRUCTION_ERROR));
		return true;
	}

	/*
	 * The host physical addresses of some pages of guest memory
	 * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
	 * may write to these pages via their host physical address while
	 * L2 is running, bypassing any address-translation-based dirty
	 * tracking (e.g. EPT write protection).
	 *
	 * Mark them dirty on every exit from L2 to prevent them from
	 * getting out of sync with dirty tracking.
	 */
	nested_mark_vmcs12_pages_dirty(vcpu);

	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
				vmcs_readl(EXIT_QUALIFICATION),
				vmx->idt_vectoring_info,
				intr_info,
				vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
				KVM_ISA_VMX);

	switch (exit_reason) {
	case EXIT_REASON_EXCEPTION_NMI:
		if (is_nmi(intr_info))
			return false;
		else if (is_page_fault(intr_info))
			return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
		else if (is_no_device(intr_info) &&
			 !(vmcs12->guest_cr0 & X86_CR0_TS))
			return false;
		else if (is_debug(intr_info) &&
			 vcpu->guest_debug &
			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
			return false;
		else if (is_breakpoint(intr_info) &&
			 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
			return false;
		return vmcs12->exception_bitmap &
				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		return false;
	case EXIT_REASON_TRIPLE_FAULT:
		return true;
	case EXIT_REASON_PENDING_INTERRUPT:
		return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
	case EXIT_REASON_NMI_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
	case EXIT_REASON_TASK_SWITCH:
		return true;
	case EXIT_REASON_CPUID:
		return true;
	case EXIT_REASON_HLT:
		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
	case EXIT_REASON_INVD:
		return true;
	case EXIT_REASON_INVLPG:
		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_RDPMC:
		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
	case EXIT_REASON_RDRAND:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
	case EXIT_REASON_RDSEED:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
	case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
	case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
		/*
		 * VMX instructions trap unconditionally. This allows L1 to
		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
		 */
		return true;
	case EXIT_REASON_CR_ACCESS:
		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
	case EXIT_REASON_DR_ACCESS:
		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
	case EXIT_REASON_IO_INSTRUCTION:
		return nested_vmx_exit_handled_io(vcpu, vmcs12);
	case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
	case EXIT_REASON_INVALID_STATE:
		return true;
	case EXIT_REASON_MWAIT_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
	case EXIT_REASON_MONITOR_TRAP_FLAG:
		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
	case EXIT_REASON_MONITOR_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
	case EXIT_REASON_PAUSE_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
			nested_cpu_has2(vmcs12,
				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return false;
	case EXIT_REASON_TPR_BELOW_THRESHOLD:
		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
	case EXIT_REASON_APIC_ACCESS:
		return nested_cpu_has2(vmcs12,
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
	case EXIT_REASON_APIC_WRITE:
	case EXIT_REASON_EOI_INDUCED:
		/* apic_write and eoi_induced should exit unconditionally. */
		return true;
	case EXIT_REASON_EPT_VIOLATION:
		/*
		 * L0 always deals with the EPT violation. If nested EPT is
		 * used, and the nested mmu code discovers that the address is
		 * missing in the guest EPT table (EPT12), the EPT violation
		 * will be injected with nested_ept_inject_page_fault()
		 */
		return false;
	case EXIT_REASON_EPT_MISCONFIG:
		/*
		 * L2 never uses L1's EPT directly, but rather L0's own EPT
		 * table (shadow on EPT) or a merged EPT table that L0 built
		 * (EPT on EPT). So any problems with the structure of the
		 * table are L0's fault.
		 */
		return false;
	case EXIT_REASON_INVPCID:
		return
			nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
			nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_WBINVD:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
	case EXIT_REASON_XSETBV:
		return true;
	case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
		/*
		 * This should never happen, since it is not possible to
		 * set XSS to a non-zero value---neither in L1 nor in L2.
		 * If it were, XSS would have to be checked against
		 * the XSS exit bitmap in vmcs12.
		 */
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
	case EXIT_REASON_PREEMPTION_TIMER:
		return false;
	case EXIT_REASON_PML_FULL:
		/* We emulate PML support to L1. */
		return false;
	case EXIT_REASON_VMFUNC:
		/* VM functions are emulated through L2->L0 vmexits. */
		return false;
	default:
		return true;
	}
}

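/*
 * Reflect the current VM exit of L2 into L1: stash the exit interruption
 * error code in vmcs12 when it is valid and emulate the exit via
 * nested_vmx_vmexit().
 */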
static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
{
	u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);

	/*
	 * At this point, the exit interruption info in exit_intr_info
	 * is only valid for EXCEPTION_NMI exits.  For EXTERNAL_INTERRUPT
	 * we need to query the in-kernel LAPIC.
	 */
	WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
	if ((exit_intr_info &
	     (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
	    (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
		vmcs12->vm_exit_intr_error_code =
			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
	}

	nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
			  vmcs_readl(EXIT_QUALIFICATION));
	return 1;
}

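/* Report the exit qualification and interruption info of the last VM exit. */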
static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
{
	*info1 = vmcs_readl(EXIT_QUALIFICATION);
	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
}

static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
{
	if (vmx->pml_pg) {
		__free_page(vmx->pml_pg);
		vmx->pml_pg = NULL;
	}
}

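/*
 * Drain the PML buffer filled by hardware: mark every logged GPA dirty and
 * reset the PML index so the whole buffer can be reused.
 */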
static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 *pml_buf;
	u16 pml_idx;

	pml_idx = vmcs_read16(GUEST_PML_INDEX);

	/* Do nothing if PML buffer is empty */
	if (pml_idx == (PML_ENTITY_NUM - 1))
		return;

	/* PML index always points to next available PML buffer entity */
	if (pml_idx >= PML_ENTITY_NUM)
		pml_idx = 0;
	else
		pml_idx++;

	pml_buf = page_address(vmx->pml_pg);
	for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
		u64 gpa;

		gpa = pml_buf[pml_idx];
		WARN_ON(gpa & (PAGE_SIZE - 1));
		kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
	}

	/* reset PML index */
	vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
}

/*
 * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
 * Called before reporting dirty_bitmap to userspace.
 */
static void kvm_flush_pml_buffers(struct kvm *kvm)
{
	int i;
	struct kvm_vcpu *vcpu;
	/*
	 * We only need to kick each vcpu out of guest mode here: the PML
	 * buffer is flushed at the beginning of every VMEXIT, so only vcpus
	 * currently running in guest mode can have unflushed GPAs in their
	 * PML buffers.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_vcpu_kick(vcpu);
}

static void vmx_dump_sel(char *name, uint32_t sel)
{
	pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
	       name, vmcs_read16(sel),
	       vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
	       vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
	       vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
}

static void vmx_dump_dtsel(char *name, uint32_t limit)
{
	pr_err("%s                           limit=0x%08x, base=0x%016lx\n",
	       name, vmcs_read32(limit),
	       vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
}

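/* Dump the guest, host and control fields of the current VMCS for debugging. */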
static void dump_vmcs(void)
{
	u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
	u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
	u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
	u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
	u32 secondary_exec_control = 0;
	unsigned long cr4 = vmcs_readl(GUEST_CR4);
	u64 efer = vmcs_read64(GUEST_IA32_EFER);
	int i, n;

	if (cpu_has_secondary_exec_ctrls())
		secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);

	pr_err("*** Guest State ***\n");
	pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
	       vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
	       vmcs_readl(CR0_GUEST_HOST_MASK));
	pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
	       cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
	pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
	    (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
	{
		pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
		       vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
		pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
		       vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
	}
	pr_err("RSP = 0x%016lx  RIP = 0x%016lx\n",
	       vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
	pr_err("RFLAGS=0x%08lx         DR7 = 0x%016lx\n",
	       vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
	       vmcs_readl(GUEST_SYSENTER_ESP),
	       vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
	vmx_dump_sel("CS:  ", GUEST_CS_SELECTOR);
	vmx_dump_sel("DS:  ", GUEST_DS_SELECTOR);
	vmx_dump_sel("SS:  ", GUEST_SS_SELECTOR);
	vmx_dump_sel("ES:  ", GUEST_ES_SELECTOR);
	vmx_dump_sel("FS:  ", GUEST_FS_SELECTOR);
	vmx_dump_sel("GS:  ", GUEST_GS_SELECTOR);
	vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
	vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
	vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
	vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
	if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
	    (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
		pr_err("EFER =     0x%016llx  PAT = 0x%016llx\n",
		       efer, vmcs_read64(GUEST_IA32_PAT));
	pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
	       vmcs_read64(GUEST_IA32_DEBUGCTL),
	       vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
		pr_err("PerfGlobCtl = 0x%016llx\n",
		       vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
	if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
		pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
	pr_err("Interruptibility = %08x  ActivityState = %08x\n",
	       vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
	       vmcs_read32(GUEST_ACTIVITY_STATE));
	if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
		pr_err("InterruptStatus = %04x\n",
		       vmcs_read16(GUEST_INTR_STATUS));

	pr_err("*** Host State ***\n");
	pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
	       vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
	pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
	       vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
	       vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
	       vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
	       vmcs_read16(HOST_TR_SELECTOR));
	pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
	       vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
	       vmcs_readl(HOST_TR_BASE));
	pr_err("GDTBase=%016lx IDTBase=%016lx\n",
	       vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
	pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
	       vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
	       vmcs_readl(HOST_CR4));
	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
	       vmcs_readl(HOST_IA32_SYSENTER_ESP),
	       vmcs_read32(HOST_IA32_SYSENTER_CS),
	       vmcs_readl(HOST_IA32_SYSENTER_EIP));
	if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
		pr_err("EFER = 0x%016llx  PAT = 0x%016llx\n",
		       vmcs_read64(HOST_IA32_EFER),
		       vmcs_read64(HOST_IA32_PAT));
	if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
		pr_err("PerfGlobCtl = 0x%016llx\n",
		       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));

	pr_err("*** Control State ***\n");
	pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
	       pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
	pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
	pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
	       vmcs_read32(EXCEPTION_BITMAP),
	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
	pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
	       vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
	       vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
	       vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
	pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
	       vmcs_read32(VM_EXIT_INTR_INFO),
	       vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
	       vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
	pr_err("        reason=%08x qualification=%016lx\n",
	       vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
	pr_err("IDTVectoring: info=%08x errcode=%08x\n",
	       vmcs_read32(IDT_VECTORING_INFO_FIELD),
	       vmcs_read32(IDT_VECTORING_ERROR_CODE));
	pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
	if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
		pr_err("TSC Multiplier = 0x%016llx\n",
		       vmcs_read64(TSC_MULTIPLIER));
	if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
		pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
	if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
		pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
		pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
	n = vmcs_read32(CR3_TARGET_COUNT);
	for (i = 0; i + 1 < n; i += 4)
		pr_err("CR3 target%u=%016lx target%u=%016lx\n",
		       i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
		       i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
	if (i < n)
		pr_err("CR3 target%u=%016lx\n",
		       i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
	if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
		pr_err("PLE Gap=%08x Window=%08x\n",
		       vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
		pr_err("Virtual processor ID = 0x%04x\n",
		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
}

/*
 * The guest has exited.  See if we can fix it or if we need userspace
 * assistance.
 */
static int vmx_handle_exit(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 exit_reason = vmx->exit_reason;
	u32 vectoring_info = vmx->idt_vectoring_info;

	trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);

	/*
	 * Flush the logged GPAs from the PML buffer so that dirty_bitmap is
	 * up to date. A nice side effect: in kvm_vm_ioctl_get_dirty_log,
	 * before querying dirty_bitmap we only need to kick all vcpus out
	 * of guest mode, since once a vcpu is in root mode its PML buffer
	 * must already have been flushed.
	 */
	if (enable_pml)
		vmx_flush_pml_buffer(vcpu);

	/* If guest state is invalid, start emulating */
	if (vmx->emulation_required)
		return handle_invalid_guest_state(vcpu);

	if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
		return nested_vmx_reflect_vmexit(vcpu, exit_reason);

	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
		dump_vmcs();
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= exit_reason;
		return 0;
	}

	if (unlikely(vmx->fail)) {
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= vmcs_read32(VM_INSTRUCTION_ERROR);
		return 0;
	}

	/*
	 * Note:
	 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by a
	 * delivery event, since that indicates the guest is accessing MMIO.
	 * The vm-exit can be triggered again after returning to the guest,
	 * which would cause an infinite loop.
	 */
	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
			(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
			exit_reason != EXIT_REASON_EPT_VIOLATION &&
			exit_reason != EXIT_REASON_PML_FULL &&
			exit_reason != EXIT_REASON_TASK_SWITCH)) {
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
		vcpu->run->internal.ndata = 3;
		vcpu->run->internal.data[0] = vectoring_info;
		vcpu->run->internal.data[1] = exit_reason;
		vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
		if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
			vcpu->run->internal.ndata++;
			vcpu->run->internal.data[3] =
				vmcs_read64(GUEST_PHYSICAL_ADDRESS);
		}
		return 0;
	}

	if (unlikely(!enable_vnmi &&
		     vmx->loaded_vmcs->soft_vnmi_blocked)) {
		if (vmx_interrupt_allowed(vcpu)) {
			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
		} else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
			   vcpu->arch.nmi_pending) {
			/*
			 * This CPU doesn't support us in finding the end of an
			 * NMI-blocked window if the guest runs with IRQs
			 * disabled. So we pull the trigger after 1 s of
			 * futile waiting, but inform the user about this.
			 */
			printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
			       "state on VCPU %d after 1 s timeout\n",
			       __func__, vcpu->vcpu_id);
			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
		}
	}

	if (exit_reason < kvm_vmx_max_exit_handlers
	    && kvm_vmx_exit_handlers[exit_reason])
		return kvm_vmx_exit_handlers[exit_reason](vcpu);
	else {
		vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
				exit_reason);
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}
}

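/*
 * Program the TPR threshold used for CR8/TPR-below-threshold intercepts;
 * left untouched while L2 runs with its own TPR shadow from vmcs12.
 */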
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	if (is_guest_mode(vcpu) &&
		nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return;

	if (irr == -1 || tpr < irr) {
		vmcs_write32(TPR_THRESHOLD, 0);
		return;
	}

	vmcs_write32(TPR_THRESHOLD, irr);
}

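/*
 * Flip the secondary execution controls between "virtualize APIC accesses"
 * and "virtualize x2APIC mode" when the guest switches its APIC mode.
 */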
static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
{
	u32 sec_exec_control;

	/* Postpone execution until vmcs01 is the current VMCS. */
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true;
		return;
	}

	if (!cpu_has_vmx_virtualize_x2apic_mode())
		return;

	if (!cpu_need_tpr_shadow(vcpu))
		return;

	sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);

	if (set) {
		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
	} else {
		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
		vmx_flush_tlb_ept_only(vcpu);
	}
	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);

	vmx_set_msr_bitmap(vcpu);
}

static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Currently we do not handle the nested case where L2 has an
	 * APIC access page of its own; that page is still pinned.
	 * Hence, we skip the case where the VCPU is in guest mode _and_
	 * L1 prepared an APIC access page for L2.
	 *
	 * For the case where L1 and L2 share the same APIC access page
	 * (flexpriority=Y but SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES clear
	 * in the vmcs12), this function will only update either the vmcs01
	 * or the vmcs02.  If the former, the vmcs02 will be updated by
	 * prepare_vmcs02.  If the latter, the vmcs01 will be updated in
	 * the next L2->L1 exit.
	 */
	if (!is_guest_mode(vcpu) ||
	    !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
			     SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
		vmcs_write64(APIC_ACCESS_ADDR, hpa);
		vmx_flush_tlb_ept_only(vcpu);
	}
}

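/* Propagate the highest in-service vector (SVI) into GUEST_INTR_STATUS. */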
static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
	u16 status;
	u8 old;

	if (max_isr == -1)
		max_isr = 0;

	status = vmcs_read16(GUEST_INTR_STATUS);
	old = status >> 8;
	if (max_isr != old) {
		status &= 0xff;
		status |= max_isr << 8;
		vmcs_write16(GUEST_INTR_STATUS, status);
	}
}

static void vmx_set_rvi(int vector)
{
	u16 status;
	u8 old;

	if (vector == -1)
		vector = 0;

	status = vmcs_read16(GUEST_INTR_STATUS);
	old = (u8)status & 0xff;
	if ((u8)vector != old) {
		status &= ~0xff;
		status |= (u8)vector;
		vmcs_write16(GUEST_INTR_STATUS, status);
	}
}

static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
{
	if (!is_guest_mode(vcpu)) {
		vmx_set_rvi(max_irr);
		return;
	}

	if (max_irr == -1)
		return;

	/*
	 * In guest mode.  If a vmexit is needed, vmx_check_nested_events
	 * handles it.
	 */
	if (nested_exit_on_intr(vcpu))
		return;

	/*
	 * Else, fall back to pre-APICv interrupt injection since L2
	 * is run without virtual interrupt delivery.
	 */
	if (!kvm_event_needs_reinjection(vcpu) &&
	    vmx_interrupt_allowed(vcpu)) {
		kvm_queue_interrupt(vcpu, max_irr, false);
		vmx_inject_irq(vcpu);
	}
}

static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int max_irr;

	WARN_ON(!vcpu->arch.apicv_active);
	if (pi_test_on(&vmx->pi_desc)) {
		pi_clear_on(&vmx->pi_desc);
		/*
		 * IOMMU can write to PIR.ON, so the barrier matters even on UP.
		 * But on x86 this is just a compiler barrier anyway.
		 */
		smp_mb__after_atomic();
		max_irr = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
	} else {
		max_irr = kvm_lapic_find_highest_irr(vcpu);
	}
	vmx_hwapic_irr_update(vcpu, max_irr);
	return max_irr;
}

static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
	vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
	vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}

static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	pi_clear_on(&vmx->pi_desc);
	memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
}

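/*
 * Handle the exit events that must be processed right after VM exit and
 * before interrupts are re-enabled: async page faults, machine checks
 * and NMIs.
 */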
static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
{
	u32 exit_intr_info = 0;
	u16 basic_exit_reason = (u16)vmx->exit_reason;

	if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
	      || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI))
		return;

	if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
		exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
	vmx->exit_intr_info = exit_intr_info;

	/* if exit due to PF check for async PF */
	if (is_page_fault(exit_intr_info))
		vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();

	/* Handle machine checks before interrupts are enabled */
	if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY ||
	    is_machine_check(exit_intr_info))
		kvm_machine_check();

	/* We need to handle NMIs before interrupts are enabled */
	if (is_nmi(exit_intr_info)) {
		kvm_before_handle_nmi(&vmx->vcpu);
		asm("int $2");
		kvm_after_handle_nmi(&vmx->vcpu);
	}
}

static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
{
	u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);

	if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
			== (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
		unsigned int vector;
		unsigned long entry;
		gate_desc *desc;
		struct vcpu_vmx *vmx = to_vmx(vcpu);
#ifdef CONFIG_X86_64
		unsigned long tmp;
#endif

		vector =  exit_intr_info & INTR_INFO_VECTOR_MASK;
		desc = (gate_desc *)vmx->host_idt_base + vector;
		entry = gate_offset(desc);
		asm volatile(
#ifdef CONFIG_X86_64
			"mov %%" _ASM_SP ", %[sp]\n\t"
			"and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
			"push $%c[ss]\n\t"
			"push %[sp]\n\t"
#endif
			"pushf\n\t"
			__ASM_SIZE(push) " $%c[cs]\n\t"
			"call *%[entry]\n\t"
			:
#ifdef CONFIG_X86_64
			[sp]"=&r"(tmp),
#endif
			ASM_CALL_CONSTRAINT
			:
			[entry]"r"(entry),
			[ss]"i"(__KERNEL_DS),
			[cs]"i"(__KERNEL_CS)
			);
	}
}
STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);

static bool vmx_has_high_real_mode_segbase(void)
{
	return enable_unrestricted_guest || emulate_invalid_guest_state;
}

static bool vmx_mpx_supported(void)
{
	return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
		(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
}

static bool vmx_xsaves_supported(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_XSAVES;
}

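/*
 * Recover the virtual-NMI blocking state after a VM exit; the SDM notes in
 * the body explain when "blocking by NMI" must be re-set before VM entry.
 */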
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
	u32 exit_intr_info;
	bool unblock_nmi;
	u8 vector;
	bool idtv_info_valid;

	idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;

	if (enable_vnmi) {
		if (vmx->loaded_vmcs->nmi_known_unmasked)
			return;
		/*
		 * Can't use vmx->exit_intr_info since we're not sure what
		 * the exit reason is.
		 */
		exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
		/*
		 * SDM 3: 27.7.1.2 (September 2008)
		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
		 * a guest IRET fault.
		 * SDM 3: 23.2.2 (September 2008)
		 * Bit 12 is undefined in any of the following cases:
		 *  If the VM exit sets the valid bit in the IDT-vectoring
		 *   information field.
		 *  If the VM exit is due to a double fault.
		 */
		if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
		    vector != DF_VECTOR && !idtv_info_valid)
			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
				      GUEST_INTR_STATE_NMI);
		else
			vmx->loaded_vmcs->nmi_known_unmasked =
				!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
				  & GUEST_INTR_STATE_NMI);
	} else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
		vmx->loaded_vmcs->vnmi_blocked_time +=
			ktime_to_ns(ktime_sub(ktime_get(),
					      vmx->loaded_vmcs->entry_time));
}

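/*
 * Requeue the event described by the IDT-vectoring information so that it
 * is re-injected on the next VM entry.
 */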
static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
				      u32 idt_vectoring_info,
				      int instr_len_field,
				      int error_code_field)
{
	u8 vector;
	int type;
	bool idtv_info_valid;

	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;

	vcpu->arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);

	if (!idtv_info_valid)
		return;

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;

	switch (type) {
	case INTR_TYPE_NMI_INTR:
		vcpu->arch.nmi_injected = true;
		/*
		 * SDM 3: 27.7.1.2 (September 2008)
		 * Clear bit "block by NMI" before VM entry if a NMI
		 * delivery faulted.
		 */
		vmx_set_nmi_mask(vcpu, false);
		break;
	case INTR_TYPE_SOFT_EXCEPTION:
		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
		/* fall through */
	case INTR_TYPE_HARD_EXCEPTION:
		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
			u32 err = vmcs_read32(error_code_field);

			kvm_requeue_exception_e(vcpu, vector, err);
		} else
			kvm_requeue_exception(vcpu, vector);
		break;
	case INTR_TYPE_SOFT_INTR:
		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
		/* fall through */
	case INTR_TYPE_EXT_INTR:
		kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
		break;
	default:
		break;
	}
}

static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
	__vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
				  VM_EXIT_INSTRUCTION_LEN,
				  IDT_VECTORING_ERROR_CODE);
}

static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
{
	__vmx_complete_interrupts(vcpu,
				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
				  VM_ENTRY_INSTRUCTION_LEN,
				  VM_ENTRY_EXCEPTION_ERROR_CODE);

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
}

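/*
 * Arrange for the perf MSRs whose guest and host values differ to be
 * switched atomically around VM entry and VM exit.
 */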
static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
{
	int i, nr_msrs;
	struct perf_guest_switch_msr *msrs;

	msrs = perf_guest_get_msrs(&nr_msrs);

	if (!msrs)
		return;

	for (i = 0; i < nr_msrs; i++)
		if (msrs[i].host == msrs[i].guest)
			clear_atomic_switch_msr(vmx, msrs[i].msr);
		else
			add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
					msrs[i].host);
}

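/*
 * Program the VMX preemption timer with the number of TSC ticks left
 * until the deadline in vmx->hv_deadline_tsc.
 */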
static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 tscl;
	u32 delta_tsc;

	if (vmx->hv_deadline_tsc == -1)
		return;

	tscl = rdtsc();
	if (vmx->hv_deadline_tsc > tscl)
		/* sure to be 32 bit only because checked on set_hv_timer */
		delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
			cpu_preemption_timer_multi);
	else
		delta_tsc = 0;

	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
}

static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long debugctlmsr, cr3, cr4;

	/* Record the guest's net vcpu time for enforced NMI injections. */
	if (unlikely(!enable_vnmi &&
		     vmx->loaded_vmcs->soft_vnmi_blocked))
		vmx->loaded_vmcs->entry_time = ktime_get();

	/* Don't enter VMX if guest state is invalid, let the exit handler
	   start emulation until we arrive back to a valid state */
	if (vmx->emulation_required)
		return;

	if (vmx->ple_window_dirty) {
		vmx->ple_window_dirty = false;
		vmcs_write32(PLE_WINDOW, vmx->ple_window);
	}

	if (vmx->nested.sync_shadow_vmcs) {
		copy_vmcs12_to_shadow(vmx);
		vmx->nested.sync_shadow_vmcs = false;
	}

	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);

	cr3 = __get_current_cr3_fast();
	if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) {
		vmcs_writel(HOST_CR3, cr3);
		vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
	}

	cr4 = cr4_read_shadow();
	if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) {
		vmcs_writel(HOST_CR4, cr4);
		vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
	}

	/* When single-stepping over STI and MOV SS, we must clear the
	 * corresponding interruptibility bits in the guest state. Otherwise
	 * vmentry fails as it then expects bit 14 (BS) in pending debug
	 * exceptions being set, but that's not correct for the guest debugging
	 * case. */
	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
		vmx_set_interrupt_shadow(vcpu, 0);

	if (static_cpu_has(X86_FEATURE_PKU) &&
	    kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
	    vcpu->arch.pkru != vmx->host_pkru)
		__write_pkru(vcpu->arch.pkru);

	atomic_switch_perf_msrs(vmx);
	debugctlmsr = get_debugctlmsr();

	vmx_arm_hv_timer(vcpu);

	vmx->__launched = vmx->loaded_vmcs->launched;
	asm(
		/* Store host registers */
		"push %%" _ASM_DX "; push %%" _ASM_BP ";"
		"push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
		"push %%" _ASM_CX " \n\t"
		"cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
		"je 1f \n\t"
		"mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
		__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
		"1: \n\t"
		/* Reload cr2 if changed */
		"mov %c[cr2](%0), %%" _ASM_AX " \n\t"
		"mov %%cr2, %%" _ASM_DX " \n\t"
		"cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
		"je 2f \n\t"
		"mov %%" _ASM_AX", %%cr2 \n\t"
		"2: \n\t"
		/* Check if vmlaunch or vmresume is needed */
		"cmpl $0, %c[launched](%0) \n\t"
		/* Load guest registers.  Don't clobber flags. */
		"mov %c[rax](%0), %%" _ASM_AX " \n\t"
		"mov %c[rbx](%0), %%" _ASM_BX " \n\t"
		"mov %c[rdx](%0), %%" _ASM_DX " \n\t"
		"mov %c[rsi](%0), %%" _ASM_SI " \n\t"
		"mov %c[rdi](%0), %%" _ASM_DI " \n\t"
		"mov %c[rbp](%0), %%" _ASM_BP " \n\t"
#ifdef CONFIG_X86_64
		"mov %c[r8](%0),  %%r8  \n\t"
		"mov %c[r9](%0),  %%r9  \n\t"
		"mov %c[r10](%0), %%r10 \n\t"
		"mov %c[r11](%0), %%r11 \n\t"
		"mov %c[r12](%0), %%r12 \n\t"
		"mov %c[r13](%0), %%r13 \n\t"
		"mov %c[r14](%0), %%r14 \n\t"
		"mov %c[r15](%0), %%r15 \n\t"
#endif
		"mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */

		/* Enter guest mode */
		"jne 1f \n\t"
		__ex(ASM_VMX_VMLAUNCH) "\n\t"
		"jmp 2f \n\t"
		"1: " __ex(ASM_VMX_VMRESUME) "\n\t"
		"2: "
		/* Save guest registers, load host registers, keep flags */
		"mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
		"pop %0 \n\t"
		"mov %%" _ASM_AX ", %c[rax](%0) \n\t"
		"mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
		__ASM_SIZE(pop) " %c[rcx](%0) \n\t"
		"mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
		"mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
		"mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
		"mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64
		"mov %%r8,  %c[r8](%0) \n\t"
		"mov %%r9,  %c[r9](%0) \n\t"
		"mov %%r10, %c[r10](%0) \n\t"
		"mov %%r11, %c[r11](%0) \n\t"
		"mov %%r12, %c[r12](%0) \n\t"
		"mov %%r13, %c[r13](%0) \n\t"
		"mov %%r14, %c[r14](%0) \n\t"
		"mov %%r15, %c[r15](%0) \n\t"
#endif
		"mov %%cr2, %%" _ASM_AX "   \n\t"
		"mov %%" _ASM_AX ", %c[cr2](%0) \n\t"

		"pop  %%" _ASM_BP "; pop  %%" _ASM_DX " \n\t"
		"setbe %c[fail](%0) \n\t"
		".pushsection .rodata \n\t"
		".global vmx_return \n\t"
		"vmx_return: " _ASM_PTR " 2b \n\t"
		".popsection"
	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
		[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
		[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
		[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
		[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
		[rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
		[rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
		[rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
		[rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
#ifdef CONFIG_X86_64
		[r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
		[r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
		[r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
		[r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
		[r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
		[r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
		[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
		[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif
		[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
		[wordsize]"i"(sizeof(ulong))
	      : "cc", "memory"
#ifdef CONFIG_X86_64
		, "rax", "rbx", "rdi", "rsi"
		, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#else
		, "eax", "ebx", "edi", "esi"
#endif
	      );

	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
	if (debugctlmsr)
		update_debugctlmsr(debugctlmsr);

#ifndef CONFIG_X86_64
	/*
	 * The sysexit path does not restore ds/es, so we must set them to
	 * a reasonable value ourselves.
	 *
	 * We can't defer this to vmx_load_host_state() since that function
	 * may be executed in interrupt context, which saves and restores
	 * segments around it, nullifying its effect.
	 */
	loadsegment(ds, __USER_DS);
	loadsegment(es, __USER_DS);
#endif

	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
				  | (1 << VCPU_EXREG_RFLAGS)
				  | (1 << VCPU_EXREG_PDPTR)
				  | (1 << VCPU_EXREG_SEGMENTS)
				  | (1 << VCPU_EXREG_CR3));
	vcpu->arch.regs_dirty = 0;

	/*
	 * eager fpu is enabled if PKEY is supported and CR4 is switched
	 * back on host, so it is safe to read guest PKRU from current
	 * XSAVE.
	 */
	if (static_cpu_has(X86_FEATURE_PKU) &&
	    kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
		vcpu->arch.pkru = __read_pkru();
		if (vcpu->arch.pkru != vmx->host_pkru)
			__write_pkru(vmx->host_pkru);
	}

	/*
	 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
	 * we did not inject a still-pending event to L1 now because of
	 * nested_run_pending, we need to re-enable this bit.
	 */
	if (vmx->nested.nested_run_pending)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	vmx->nested.nested_run_pending = 0;
	vmx->idt_vectoring_info = 0;

	vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
	if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
		return;

	vmx->loaded_vmcs->launched = 1;
	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

	vmx_complete_atomic_exit(vmx);
	vmx_recover_nmi_blocking(vmx);
	vmx_complete_interrupts(vmx);
}
STACK_FRAME_NON_STANDARD(vmx_vcpu_run);

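/* Switch the vcpu to a different loaded VMCS, reloading it on this CPU. */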
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int cpu;

	if (vmx->loaded_vmcs == vmcs)
		return;

	cpu = get_cpu();
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_put(vcpu);
	vmx_vcpu_load(vcpu, cpu);
	put_cpu();
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
{
       struct vcpu_vmx *vmx = to_vmx(vcpu);
       int r;

       r = vcpu_load(vcpu);
       BUG_ON(r);
       vmx_switch_vmcs(vcpu, &vmx->vmcs01);
       free_nested(vmx);
       vcpu_put(vcpu);
}

static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (enable_pml)
		vmx_destroy_pml_buffer(vmx);
	free_vpid(vmx->vpid);
	leave_guest_mode(vcpu);
	vmx_free_vcpu_nested(vcpu);
	free_loaded_vmcs(vmx->loaded_vmcs);
	kfree(vmx->guest_msrs);
	kvm_vcpu_uninit(vcpu);
	kmem_cache_free(kvm_vcpu_cache, vmx);
}

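/*
 * Allocate and set up a new VMX vcpu: the vmcs01, the guest MSR save area
 * and, when PML is enabled, the PML page.
 */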
static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
	int err;
	struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
	int cpu;

	if (!vmx)
		return ERR_PTR(-ENOMEM);

	vmx->vpid = allocate_vpid();

	err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
	if (err)
		goto free_vcpu;

	err = -ENOMEM;

	/*
	 * If PML is turned on, failure on enabling PML just results in failure
	 * of creating the vcpu, therefore we can simplify PML logic (by
	 * avoiding dealing with cases, such as enabling PML partially on vcpus
	 * for the guest, etc.).
	 */
	if (enable_pml) {
		vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!vmx->pml_pg)
			goto uninit_vcpu;
	}

	vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
	BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
		     > PAGE_SIZE);

	if (!vmx->guest_msrs)
		goto free_pml;

	vmx->loaded_vmcs = &vmx->vmcs01;
	vmx->loaded_vmcs->vmcs = alloc_vmcs();
	vmx->loaded_vmcs->shadow_vmcs = NULL;
	if (!vmx->loaded_vmcs->vmcs)
		goto free_msrs;
	loaded_vmcs_init(vmx->loaded_vmcs);

	cpu = get_cpu();
	vmx_vcpu_load(&vmx->vcpu, cpu);
	vmx->vcpu.cpu = cpu;
	vmx_vcpu_setup(vmx);
	vmx_vcpu_put(&vmx->vcpu);
	put_cpu();
	if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
		err = alloc_apic_access_page(kvm);
		if (err)
			goto free_vmcs;
	}

	if (enable_ept) {
		err = init_rmode_identity_map(kvm);
		if (err)
			goto free_vmcs;
	}

	if (nested) {
		nested_vmx_setup_ctls_msrs(vmx);
		vmx->nested.vpid02 = allocate_vpid();
	}

	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = -1ull;

	vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;

	/*
	 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
	 * or POSTED_INTR_WAKEUP_VECTOR.
	 */
	vmx->pi_desc.nv = POSTED_INTR_VECTOR;
	vmx->pi_desc.sn = 1;

	return &vmx->vcpu;

free_vmcs:
	free_vpid(vmx->nested.vpid02);
	free_loaded_vmcs(vmx->loaded_vmcs);
free_msrs:
	kfree(vmx->guest_msrs);
free_pml:
	vmx_destroy_pml_buffer(vmx);
uninit_vcpu:
	kvm_vcpu_uninit(&vmx->vcpu);
free_vcpu:
	free_vpid(vmx->vpid);
	kmem_cache_free(kvm_vcpu_cache, vmx);
	return ERR_PTR(err);
}

static void __init vmx_check_processor_compat(void *rtn)
{
	struct vmcs_config vmcs_conf;

	*(int *)rtn = 0;
	if (setup_vmcs_config(&vmcs_conf) < 0)
		*(int *)rtn = -EIO;
	if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
		printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
				smp_processor_id());
		*(int *)rtn = -EIO;
	}
}

static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
	u8 cache;
	u64 ipat = 0;

	/* For VT-d and EPT combination
	 * 1. MMIO: always map as UC
	 * 2. EPT with VT-d:
	 *   a. VT-d without snooping control feature: can't guarantee the
	 *	result, try to trust guest.
	 *   b. VT-d with snooping control feature: snooping control feature of
	 *	VT-d engine can guarantee the cache correctness. Just set it
	 *	to WB to keep consistent with host. So the same as item 3.
	 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
	 *    consistent with host MTRR
	 */
	if (is_mmio) {
		cache = MTRR_TYPE_UNCACHABLE;
		goto exit;
	}

	if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
		ipat = VMX_EPT_IPAT_BIT;
		cache = MTRR_TYPE_WRBACK;
		goto exit;
	}

	if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
		ipat = VMX_EPT_IPAT_BIT;
		if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
			cache = MTRR_TYPE_WRBACK;
		else
			cache = MTRR_TYPE_UNCACHABLE;
		goto exit;
	}

	cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);

exit:
	return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
}

static int vmx_get_lpage_level(void)
{
	if (enable_ept && !cpu_has_vmx_ept_1g_page())
		return PT_DIRECTORY_LEVEL;
	else
		/* For shadow and EPT supported 1GB page */
		return PT_PDPE_LEVEL;
}

static void vmcs_set_secondary_exec_control(u32 new_ctl)
{
	/*
	 * These bits in the secondary execution controls field
	 * are dynamic, the others are mostly based on the hypervisor
	 * architecture and the guest's CPUID.  Do not touch the
	 * dynamic bits.
	 */
	u32 mask =
		SECONDARY_EXEC_SHADOW_VMCS |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);

	vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
		     (new_ctl & ~mask) | (cur_ctl & mask));
}

/*
 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
 * (indicating "allowed-1") if they are supported in the guest's CPUID.
 */
static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_cpuid_entry2 *entry;

	vmx->nested.nested_vmx_cr0_fixed1 = 0xffffffff;
	vmx->nested.nested_vmx_cr4_fixed1 = X86_CR4_PCE;

#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {		\
	if (entry && (entry->_reg & (_cpuid_mask)))			\
		vmx->nested.nested_vmx_cr4_fixed1 |= (_cr4_mask);	\
} while (0)

	entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
	cr4_fixed1_update(X86_CR4_VME,        edx, bit(X86_FEATURE_VME));
	cr4_fixed1_update(X86_CR4_PVI,        edx, bit(X86_FEATURE_VME));
	cr4_fixed1_update(X86_CR4_TSD,        edx, bit(X86_FEATURE_TSC));
	cr4_fixed1_update(X86_CR4_DE,         edx, bit(X86_FEATURE_DE));
	cr4_fixed1_update(X86_CR4_PSE,        edx, bit(X86_FEATURE_PSE));
	cr4_fixed1_update(X86_CR4_PAE,        edx, bit(X86_FEATURE_PAE));
	cr4_fixed1_update(X86_CR4_MCE,        edx, bit(X86_FEATURE_MCE));
	cr4_fixed1_update(X86_CR4_PGE,        edx, bit(X86_FEATURE_PGE));
	cr4_fixed1_update(X86_CR4_OSFXSR,     edx, bit(X86_FEATURE_FXSR));
	cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
	cr4_fixed1_update(X86_CR4_VMXE,       ecx, bit(X86_FEATURE_VMX));
	cr4_fixed1_update(X86_CR4_SMXE,       ecx, bit(X86_FEATURE_SMX));
	cr4_fixed1_update(X86_CR4_PCIDE,      ecx, bit(X86_FEATURE_PCID));
	cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, bit(X86_FEATURE_XSAVE));

	entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
	cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, bit(X86_FEATURE_FSGSBASE));
	cr4_fixed1_update(X86_CR4_SMEP,       ebx, bit(X86_FEATURE_SMEP));
	cr4_fixed1_update(X86_CR4_SMAP,       ebx, bit(X86_FEATURE_SMAP));
	cr4_fixed1_update(X86_CR4_PKE,        ecx, bit(X86_FEATURE_PKU));
	/* TODO: Use X86_CR4_UMIP and X86_FEATURE_UMIP macros */
	cr4_fixed1_update(bit(11),            ecx, bit(2));

#undef cr4_fixed1_update
}

static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (cpu_has_secondary_exec_ctrls()) {
		vmx_compute_secondary_exec_control(vmx);
		vmcs_set_secondary_exec_control(vmx->secondary_exec_control);
	}

	if (nested_vmx_allowed(vcpu))
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
			FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
	else
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
			~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;

	if (nested_vmx_allowed(vcpu))
		nested_vmx_cr_fixed1_bits_update(vcpu);
}

static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
{
	if (func == 1 && nested)
		entry->ecx |= bit(X86_FEATURE_VMX);
}

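/*
 * Convert an EPT fault encountered while running L2 into the appropriate
 * nested VM exit for L1: EPT violation, EPT misconfig or PML full.
 */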
static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
		struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 exit_reason;
	unsigned long exit_qualification = vcpu->arch.exit_qualification;

	if (vmx->nested.pml_full) {
		exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;
		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
	} else if (fault->error_code & PFERR_RSVD_MASK)
		exit_reason = EXIT_REASON_EPT_MISCONFIG;
	else
		exit_reason = EXIT_REASON_EPT_VIOLATION;

	nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
{
	return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
}

/* Callbacks for nested_ept_init_mmu_context: */

static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
{
	/* return the page table to be shadowed - in our case, EPT12 */
	return get_vmcs12(vcpu)->ept_pointer;
}

static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));
	if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu)))
		return 1;

	kvm_mmu_unload(vcpu);
	kvm_init_shadow_ept_mmu(vcpu,
			to_vmx(vcpu)->nested.nested_vmx_ept_caps &
			VMX_EPT_EXECUTE_ONLY_BIT,
			nested_ept_ad_enabled(vcpu));
	vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
	vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
	vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;

	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
	return 0;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}

static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
		struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	WARN_ON(!is_guest_mode(vcpu));

	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
		!to_vmx(vcpu)->nested.nested_run_pending) {
		vmcs12->vm_exit_intr_error_code = fault->error_code;
		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
				  fault->address);
	} else {
		kvm_inject_page_fault(vcpu, fault);
	}
}

static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12);

static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
					struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct page *page;
	u64 hpa;

	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
		/*
		 * Translate L1 physical address to host physical
		 * address for vmcs02. Keep the page pinned, so this
		 * physical address remains valid. We keep a reference
		 * to it so we can release it later.
		 */
		if (vmx->nested.apic_access_page) { /* shouldn't happen */
			kvm_release_page_dirty(vmx->nested.apic_access_page);
			vmx->nested.apic_access_page = NULL;
		}
		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
		/*
		 * If translation failed, no matter: This feature asks
		 * to exit when accessing the given address, and if it
		 * can never be accessed, this feature won't do
		 * anything anyway.
		 */
		if (!is_error_page(page)) {
			vmx->nested.apic_access_page = page;
			hpa = page_to_phys(vmx->nested.apic_access_page);
			vmcs_write64(APIC_ACCESS_ADDR, hpa);
		} else {
			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
					SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
		}
	} else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
		   cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
		vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
			      SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
		kvm_vcpu_reload_apic_access_page(vcpu);
	}

	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
		if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
			kvm_release_page_dirty(vmx->nested.virtual_apic_page);
			vmx->nested.virtual_apic_page = NULL;
		}
		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr);

		/*
		 * If translation failed, VM entry will fail because
		 * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
		 * Failing the vm entry is _not_ what the processor
		 * does but it's basically the only possibility we
		 * have.  We could still enter the guest if CR8 load
		 * exits are enabled, CR8 store exits are enabled, and
		 * virtualize APIC access is disabled; in this case
		 * the processor would never use the TPR shadow and we
		 * could simply clear the bit from the execution
		 * control.  But such a configuration is useless, so
		 * let's keep the code simple.
		 */
		if (!is_error_page(page)) {
			vmx->nested.virtual_apic_page = page;
			hpa = page_to_phys(vmx->nested.virtual_apic_page);
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
		}
	}

	if (nested_cpu_has_posted_intr(vmcs12)) {
		if (vmx->nested.pi_desc_page) { /* shouldn't happen */
			kunmap(vmx->nested.pi_desc_page);
			kvm_release_page_dirty(vmx->nested.pi_desc_page);
			vmx->nested.pi_desc_page = NULL;
		}
		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
		if (is_error_page(page))
			return;
		vmx->nested.pi_desc_page = page;
		vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
		vmx->nested.pi_desc =
			(struct pi_desc *)((void *)vmx->nested.pi_desc +
			(unsigned long)(vmcs12->posted_intr_desc_addr &
			(PAGE_SIZE - 1)));
		vmcs_write64(POSTED_INTR_DESC_ADDR,
			page_to_phys(vmx->nested.pi_desc_page) +
			(unsigned long)(vmcs12->posted_intr_desc_addr &
			(PAGE_SIZE - 1)));
	}
	if (cpu_has_vmx_msr_bitmap() &&
	    nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) &&
	    nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
		;
	else
		vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
				CPU_BASED_USE_MSR_BITMAPS);
}
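/*
 * Note: this is called after prepare_vmcs02, which deliberately writes the
 * illegal value -1ull into APIC_ACCESS_ADDR and VIRTUAL_APIC_PAGE_ADDR.
 * The code above then either installs the real host-physical addresses or
 * (for APIC accesses) drops the execution control, as the comments in
 * prepare_vmcs02 describe.
 */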

static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
{
	u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (vcpu->arch.virtual_tsc_khz == 0)
		return;

	/* Make sure short timeouts reliably trigger an immediate vmexit.
	 * hrtimer_start does not guarantee this. */
	if (preemption_timeout <= 1) {
		vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
		return;
	}

	preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
	preemption_timeout *= 1000000;
	do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
	hrtimer_start(&vmx->nested.preemption_timer,
		      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
}
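/*
 * Worked example of the conversion above, assuming the emulated rate
 * VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE = 5 (one tick per 32 TSC cycles):
 * a vmcs12 timer value of 1000 with virtual_tsc_khz = 1000000 (a 1 GHz
 * guest TSC) becomes (1000 << 5) * 1000000 / 1000000 = 32000 ns, so the
 * hrtimer fires after 32 us.
 */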

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) ||
	    !page_address_valid(vcpu, vmcs12->io_bitmap_b))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (!page_address_valid(vcpu, vmcs12->msr_bitmap))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))
		return -EINVAL;

	return 0;
}

/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	int msr;
	struct page *page;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;

	/* This shortcut is ok because we support only x2APIC MSRs so far. */
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
		return false;

	page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
	if (is_error_page(page))
		return false;
	msr_bitmap_l1 = (unsigned long *)kmap(page);

	memset(msr_bitmap_l0, 0xff, PAGE_SIZE);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12))
			for (msr = 0x800; msr <= 0x8ff; msr++)
				nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					msr, MSR_TYPE_R);

		nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				APIC_BASE_MSR + (APIC_TASKPRI >> 4),
				MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				APIC_BASE_MSR + (APIC_EOI >> 4),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
				MSR_TYPE_W);
		}
	}
	kunmap(page);
	kvm_release_page_clean(page);

	return true;
}
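/*
 * Note on the x2APIC indices used above: MSRs 0x800-0x8ff map the x2APIC
 * register space, and APIC_BASE_MSR + (reg >> 4) converts a xAPIC MMIO
 * offset to its MSR number; e.g. APIC_TASKPRI (0x80) becomes MSR 0x808,
 * the x2APIC TPR.
 */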

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (nested_cpu_has_vid(vmcs12) &&
	   !nested_exit_on_intr(vcpu))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has been already checked
	 * in nested_get_vmcs12_pages.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	   (!nested_cpu_has_vid(vmcs12) ||
	    !nested_exit_intr_ack_set(vcpu) ||
	    vmcs12->posted_intr_nv & 0xff00))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       unsigned long count_field,
				       unsigned long addr_field)
{
	int maxphyaddr;
	u64 count, addr;

	if (vmcs12_read_any(vcpu, count_field, &count) ||
	    vmcs12_read_any(vcpu, addr_field, &addr)) {
		WARN_ON(1);
		return -EINVAL;
	}
	if (count == 0)
		return 0;
	maxphyaddr = cpuid_maxphyaddr(vcpu);
	if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
	    (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
		pr_debug_ratelimited(
			"nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
			addr_field, maxphyaddr, count, addr);
		return -EINVAL;
	}
	return 0;
}
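/*
 * Example: each struct vmx_msr_entry is 16 bytes, so for a count of 256
 * the list must be 16-byte aligned and the whole 4096-byte range, not just
 * its base, must fit below the vCPU's reported MAXPHYADDR.
 */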

static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (vmcs12->vm_exit_msr_load_count == 0 &&
	    vmcs12->vm_exit_msr_store_count == 0 &&
	    vmcs12->vm_entry_msr_load_count == 0)
		return 0; /* Fast path */
	if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
					VM_EXIT_MSR_LOAD_ADDR) ||
	    nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
					VM_EXIT_MSR_STORE_ADDR) ||
	    nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
					VM_ENTRY_MSR_LOAD_ADDR))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	u64 address = vmcs12->pml_address;
	int maxphyaddr = cpuid_maxphyaddr(vcpu);

	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) {
		if (!nested_cpu_has_ept(vmcs12) ||
		    !IS_ALIGNED(address, 4096)  ||
		    address >> maxphyaddr)
			return -EINVAL;
	}

	return 0;
}
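/*
 * Note: the 4096-byte alignment required above matches the size of the PML
 * log, which holds PML_ENTITY_NUM (512) 8-byte GPA entries, i.e. exactly
 * one page.
 */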

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
		return -EINVAL;
	if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
	    e->index == MSR_IA32_UCODE_REV)
		return -EINVAL;
	if (e->reserved != 0)
		return -EINVAL;
	return 0;
}
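/*
 * The "e->index >> 8 == 0x8" test above is a compact check for the x2APIC
 * MSR range 0x800-0x8ff; those registers must not be accessed through the
 * MSR-load/store lists.
 */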

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (e->index == MSR_FS_BASE ||
	    e->index == MSR_GS_BASE ||
	    e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

/*
 * Load guest's/host's msr at nested entry/exit.
 * return 0 for success, entry index for failure.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	struct msr_data msr;

	msr.host_initiated = false;
	for (i = 0; i < count; i++) {
		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		msr.index = e.index;
		msr.data = e.value;
		if (kvm_set_msr(vcpu, &msr)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	return i + 1;
}
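/*
 * Note the return convention above: 0 on success, otherwise the 1-based
 * index of the failing entry.  enter_vmx_non_root_mode() reports that
 * index to L1 as the exit qualification of an EXIT_REASON_MSR_LOAD_FAIL
 * entry failure.
 */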

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;

	for (i = 0; i < count; i++) {
		struct msr_data msr_info;
		if (kvm_vcpu_read_guest(vcpu,
					gpa + i * sizeof(e),
					&e, 2 * sizeof(u32))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			return -EINVAL;
		}
		if (nested_vmx_store_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			return -EINVAL;
		}
		msr_info.host_initiated = false;
		msr_info.index = e.index;
		if (kvm_get_msr(vcpu, &msr_info)) {
			pr_debug_ratelimited(
				"%s cannot read MSR (%u, 0x%x)\n",
				__func__, i, e.index);
			return -EINVAL;
		}
		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
					     offsetof(struct vmx_msr_entry, value),
					 &msr_info.data, sizeof(msr_info.data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, msr_info.data);
			return -EINVAL;
		}
	}
	return 0;
}

static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
	unsigned long invalid_mask;

	invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
	return (val & invalid_mask) == 0;
}
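/*
 * Example: with cpuid_maxphyaddr() == 36, invalid_mask is ~0ULL << 36, so
 * any CR3 value with bits set at or above bit 36 is rejected here.
 */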

/*
 * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
 * emulating VM entry into a guest with EPT enabled.
 * Returns 0 on success, 1 on failure. Invalid state exit qualification code
 * is assigned to entry_failure_code on failure.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
			       u32 *entry_failure_code)
{
	if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
		if (!nested_cr3_valid(vcpu, cr3)) {
			*entry_failure_code = ENTRY_FAIL_DEFAULT;
			return 1;
		}

		/*
		 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
		 * must not be dereferenced.
		 */
		if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) &&
		    !nested_ept) {
			if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
				*entry_failure_code = ENTRY_FAIL_PDPTE;
				return 1;
			}
		}

		vcpu->arch.cr3 = cr3;
		__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
	}

	kvm_mmu_reset_context(vcpu);
	return 0;
}

/*
 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
 * guest in a way that will both be appropriate to L1's requests, and our
 * needs. In addition to modifying the active vmcs (which is vmcs02), this
 * function also has additional necessary side-effects, like setting various
 * vcpu->arch fields.
 * Returns 0 on success, 1 on failure. Invalid state exit qualification code
 * is assigned to entry_failure_code on failure.
 */
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			  bool from_vmentry, u32 *entry_failure_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 exec_control, vmcs12_exec_ctrl;

	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
	vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
	vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
	vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
	vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
	vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
	vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
	vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
	vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
	vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
	vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
	vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
	vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
	vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
	vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
	vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
	vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
	vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
	vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
	vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
	vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
	vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
	vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
	vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
	vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
	vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
	vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
	vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
	vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
	vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
	vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
	vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
	vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
	vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
	vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);

	if (from_vmentry &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
	} else {
		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
	}
	if (from_vmentry) {
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			     vmcs12->vm_entry_intr_info_field);
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
			     vmcs12->vm_entry_exception_error_code);
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmcs12->vm_entry_instruction_len);
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
			     vmcs12->guest_interruptibility_info);
		vmx->loaded_vmcs->nmi_known_unmasked =
			!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
	} else {
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
	}
	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
		vmcs12->guest_pending_dbg_exceptions);
	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);

	if (nested_cpu_has_xsaves(vmcs12))
		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
	vmcs_write64(VMCS_LINK_POINTER, -1ull);

	exec_control = vmcs12->pin_based_vm_exec_control;

	/* Preemption timer setting is only taken from vmcs01.  */
	exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
	exec_control |= vmcs_config.pin_based_exec_ctrl;
	if (vmx->hv_deadline_tsc == -1)
		exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;

	/* Posted interrupts setting is only taken from vmcs12.  */
	if (nested_cpu_has_posted_intr(vmcs12)) {
		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
		vmx->nested.pi_pending = false;
		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
	} else {
		exec_control &= ~PIN_BASED_POSTED_INTR;
	}

	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);

	vmx->nested.preemption_timer_expired = false;
	if (nested_cpu_has_preemption_timer(vmcs12))
		vmx_start_preemption_timer(vcpu);

	/*
	 * Whether page-faults are trapped is determined by a combination of
	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
	 * If enable_ept, L0 doesn't care about page faults and we should
	 * set all of these to L1's desires. However, if !enable_ept, L0 does
	 * care about (at least some) page faults, and because it is not easy
	 * (if at all possible?) to merge L0 and L1's desires, we simply ask
	 * to exit on each and every L2 page fault. This is done by setting
	 * MASK=MATCH=0 and (see below) EB.PF=1.
	 * Note that below we don't need special code to set EB.PF beyond the
	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
	 */
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
		enable_ept ? vmcs12->page_fault_error_code_mask : 0);
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
		enable_ept ? vmcs12->page_fault_error_code_match : 0);

	if (cpu_has_secondary_exec_ctrls()) {
		exec_control = vmx->secondary_exec_control;

		/* Take the following fields only from vmcs12 */
		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
				  SECONDARY_EXEC_ENABLE_INVPCID |
				  SECONDARY_EXEC_RDTSCP |
				  SECONDARY_EXEC_XSAVES |
				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
				  SECONDARY_EXEC_ENABLE_VMFUNC);
		if (nested_cpu_has(vmcs12,
				   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
			vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
				~SECONDARY_EXEC_ENABLE_PML;
			exec_control |= vmcs12_exec_ctrl;
		}

		/* All VMFUNCs are currently emulated through L0 vmexits.  */
		if (exec_control & SECONDARY_EXEC_ENABLE_VMFUNC)
			vmcs_write64(VM_FUNCTION_CONTROL, 0);

		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
			vmcs_write64(EOI_EXIT_BITMAP0,
				vmcs12->eoi_exit_bitmap0);
			vmcs_write64(EOI_EXIT_BITMAP1,
				vmcs12->eoi_exit_bitmap1);
			vmcs_write64(EOI_EXIT_BITMAP2,
				vmcs12->eoi_exit_bitmap2);
			vmcs_write64(EOI_EXIT_BITMAP3,
				vmcs12->eoi_exit_bitmap3);
			vmcs_write16(GUEST_INTR_STATUS,
				vmcs12->guest_intr_status);
		}

		/*
		 * Write an illegal value to APIC_ACCESS_ADDR. Later,
		 * nested_get_vmcs12_pages will either fix it up or
		 * remove the VM execution control.
		 */
		if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
			vmcs_write64(APIC_ACCESS_ADDR, -1ull);

		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
	}


	/*
	 * Set host-state according to L0's settings (vmcs12 is irrelevant here)
	 * Some constant fields are set here by vmx_set_constant_host_state().
	 * Other fields are different per CPU, and will be set later when
	 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
	 */
	vmx_set_constant_host_state(vmx);

	/*
	 * Set the MSR load/store lists to match L0's settings.
	 */
	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));

	/*
	 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
	 * entry, but only if the current (host) sp changed from the value
	 * we wrote last (vmx->host_rsp). This cache is no longer relevant
	 * if we switch vmcs, and rather than hold a separate cache per vmcs,
	 * here we just force the write to happen on entry.
	 */
	vmx->host_rsp = 0;

	exec_control = vmx_exec_control(vmx); /* L0's desires */
	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
	exec_control &= ~CPU_BASED_TPR_SHADOW;
	exec_control |= vmcs12->cpu_based_vm_exec_control;

	/*
	 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
	 * nested_get_vmcs12_pages can't fix it up, the illegal value
	 * will result in a VM entry failure.
	 */
	if (exec_control & CPU_BASED_TPR_SHADOW) {
		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
	} else {
#ifdef CONFIG_X86_64
		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
				CPU_BASED_CR8_STORE_EXITING;
#endif
	}

	/*
	 * Merging of the I/O bitmaps is not currently supported.
	 * Rather, exit every time.
	 */
	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
	exec_control |= CPU_BASED_UNCOND_IO_EXITING;

	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);

	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
	 * bitwise-or of what L1 wants to trap for L2, and what we want to
	 * trap. Note that CR0.TS also needs updating - we do this later.
	 */
	update_exception_bitmap(vcpu);
	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

	/* L2->L1 exit controls are emulated - the hardware exit is to L0 so
	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
	 * bits are further modified by vmx_set_efer() below.
	 */
	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);

	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
	 * emulated by vmx_set_efer(), below.
	 */
	vm_entry_controls_init(vmx,
		(vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
			~VM_ENTRY_IA32E_MODE) |
		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));

	if (from_vmentry &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
		vcpu->arch.pat = vmcs12->guest_ia32_pat;
	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
	}

	set_cr4_guest_host_mask(vmx);

	if (from_vmentry &&
	    vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
		vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);

	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
		vmcs_write64(TSC_OFFSET,
			vcpu->arch.tsc_offset + vmcs12->tsc_offset);
	else
		vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
	if (kvm_has_tsc_control)
		decache_tsc_multiplier(vmx);

	if (enable_vpid) {
		/*
		 * There is no direct mapping between vpid02 and vpid12; the
		 * vpid02 is per-vCPU for L0 and reused, while the value of
		 * vpid12 is changed with one invvpid during nested vmentry.
		 * The vpid12 is allocated by L1 for L2, so it will not
		 * influence the global bitmap (for vpid01 and vpid02
		 * allocation) even if we spawn a lot of nested vCPUs.
		 */
		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
			if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
				vmx->nested.last_vpid = vmcs12->virtual_processor_id;
				__vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
			}
		} else {
			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
			vmx_flush_tlb(vcpu);
		}

	}

	if (enable_pml) {
		/*
		 * Conceptually we want to copy the PML address and index from
		 * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
		 * since we always flush the log on each vmexit, this happens
		 * to be equivalent to simply resetting the fields in vmcs02.
		 */
		ASSERT(vmx->pml_pg);
		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
	}

	if (nested_cpu_has_ept(vmcs12)) {
		if (nested_ept_init_mmu_context(vcpu)) {
			*entry_failure_code = ENTRY_FAIL_DEFAULT;
			return 1;
		}
	} else if (nested_cpu_has2(vmcs12,
				   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
		vmx_flush_tlb_ept_only(vcpu);
	}

	/*
	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
	 * bits that we consider to be mandatorily enabled.
	 * The CR0_READ_SHADOW is what L2 should have expected to read given
	 * the specifications by L1; It's not enough to take
	 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
	 * have more bits than L1 expected.
	 */
	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));

	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));

	if (from_vmentry &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
		vcpu->arch.efer = vmcs12->guest_ia32_efer;
	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
	else
		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
	/* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
	vmx_set_efer(vcpu, vcpu->arch.efer);

	/* Load the guest's cr3, shadowed by either EPT or shadow page tables. */
	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
				entry_failure_code))
		return 1;

	if (!enable_ept)
		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;

	/*
	 * L1 may access L2's PDPTRs, so save them to construct vmcs12
	 */
	if (enable_ept) {
		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
	}

	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
	return 0;
}

static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
	    vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;

	if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12))
		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;

	if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;

	if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;

	if (nested_vmx_check_apicv_controls(vcpu, vmcs12))
		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;

	if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;

	if (nested_vmx_check_pml_controls(vcpu, vmcs12))
		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;

	if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
				vmx->nested.nested_vmx_procbased_ctls_low,
				vmx->nested.nested_vmx_procbased_ctls_high) ||
	    (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
	     !vmx_control_verify(vmcs12->secondary_vm_exec_control,
				 vmx->nested.nested_vmx_secondary_ctls_low,
				 vmx->nested.nested_vmx_secondary_ctls_high)) ||
	    !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
				vmx->nested.nested_vmx_pinbased_ctls_low,
				vmx->nested.nested_vmx_pinbased_ctls_high) ||
	    !vmx_control_verify(vmcs12->vm_exit_controls,
				vmx->nested.nested_vmx_exit_ctls_low,
				vmx->nested.nested_vmx_exit_ctls_high) ||
	    !vmx_control_verify(vmcs12->vm_entry_controls,
				vmx->nested.nested_vmx_entry_ctls_low,
				vmx->nested.nested_vmx_entry_ctls_high))
		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;

	if (nested_cpu_has_vmfunc(vmcs12)) {
		if (vmcs12->vm_function_control &
		    ~vmx->nested.nested_vmx_vmfunc_controls)
			return VMXERR_ENTRY_INVALID_CONTROL_FIELD;

		if (nested_cpu_has_eptp_switching(vmcs12)) {
			if (!nested_cpu_has_ept(vmcs12) ||
			    !page_address_valid(vcpu, vmcs12->eptp_list_address))
				return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
		}
	}

	if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;

	if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
	    !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
	    !nested_cr3_valid(vcpu, vmcs12->host_cr3))
		return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;

	return 0;
}

static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
				  u32 *exit_qual)
{
	bool ia32e;

	*exit_qual = ENTRY_FAIL_DEFAULT;

	if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
	    !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
		return 1;

	if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) &&
	    vmcs12->vmcs_link_pointer != -1ull) {
		*exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
		return 1;
	}

	/*
	 * If the load IA32_EFER VM-entry control is 1, the following checks
	 * are performed on the field for the IA32_EFER MSR:
	 * - Bits reserved in the IA32_EFER MSR must be 0.
	 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
	 *   the IA-32e mode guest VM-exit control. It must also be identical
	 *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
	 *   CR0.PG) is 1.
	 */
	if (to_vmx(vcpu)->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
		ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
		if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
		    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
		    ((vmcs12->guest_cr0 & X86_CR0_PG) &&
		     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
			return 1;
	}

	/*
	 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
	 * IA32_EFER MSR must be 0 in the field for that register. In addition,
	 * the values of the LMA and LME bits in the field must each be that of
	 * the host address-space size VM-exit control.
	 */
	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
		ia32e = (vmcs12->vm_exit_controls &
			 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
		if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
			return 1;
	}

	return 0;
}

static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct loaded_vmcs *vmcs02;
	u32 msr_entry_idx;
	u32 exit_qual;

	vmcs02 = nested_get_current_vmcs02(vmx);
	if (!vmcs02)
		return -ENOMEM;

	enter_guest_mode(vcpu);

	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);

	vmx_switch_vmcs(vcpu, vmcs02);
	vmx_segment_cache_clear(vmx);

	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
		leave_guest_mode(vcpu);
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);
		nested_vmx_entry_failure(vcpu, vmcs12,
					 EXIT_REASON_INVALID_STATE, exit_qual);
		return 1;
	}

	nested_get_vmcs12_pages(vcpu, vmcs12);

	msr_entry_idx = nested_vmx_load_msr(vcpu,
					    vmcs12->vm_entry_msr_load_addr,
					    vmcs12->vm_entry_msr_load_count);
	if (msr_entry_idx) {
		leave_guest_mode(vcpu);
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);
		nested_vmx_entry_failure(vcpu, vmcs12,
				EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
		return 1;
	}

	/*
	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
	 * returned as far as L1 is concerned. It will only return (and set
	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
	 */
	return 0;
}

/*
 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
 * for running an L2 nested guest.
 */
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
{
	struct vmcs12 *vmcs12;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
	u32 exit_qual;
	int ret;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	if (!nested_vmx_check_vmcs12(vcpu))
		goto out;

	vmcs12 = get_vmcs12(vcpu);

	if (enable_shadow_vmcs)
		copy_shadow_to_vmcs12(vmx);

	/*
	 * The nested entry process starts with enforcing various prerequisites
	 * on vmcs12 as required by the Intel SDM, and act appropriately when
	 * they fail: As the SDM explains, some conditions should cause the
	 * instruction to fail, while others will cause the instruction to seem
	 * to succeed, but return an EXIT_REASON_INVALID_STATE.
	 * To speed up the normal (success) code path, we should avoid checking
	 * for misconfigurations which will anyway be caught by the processor
	 * when using the merged vmcs02.
	 */
	if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) {
		nested_vmx_failValid(vcpu,
				     VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
		goto out;
	}

	if (vmcs12->launch_state == launch) {
		nested_vmx_failValid(vcpu,
			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
		goto out;
	}

	ret = check_vmentry_prereqs(vcpu, vmcs12);
	if (ret) {
		nested_vmx_failValid(vcpu, ret);
		goto out;
	}

	/*
	 * After this point, the trap flag no longer triggers a singlestep trap
	 * on the vm entry instructions; don't call kvm_skip_emulated_instruction.
	 * This is not 100% correct; for performance reasons, we delegate most
	 * of the checks on host state to the processor.  If those fail,
	 * the singlestep trap is missed.
	 */
	skip_emulated_instruction(vcpu);

	ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual);
	if (ret) {
		nested_vmx_entry_failure(vcpu, vmcs12,
					 EXIT_REASON_INVALID_STATE, exit_qual);
		return 1;
	}

	/*
	 * We're finally done with prerequisite checking, and can start with
	 * the nested entry.
	 */

	ret = enter_vmx_non_root_mode(vcpu, true);
	if (ret)
		return ret;

	if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
		return kvm_vcpu_halt(vcpu);

	vmx->nested.nested_run_pending = 1;

	return 1;

out:
	return kvm_skip_emulated_instruction(vcpu);
}

/*
 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
 * This function returns the new value we should put in vmcs12.guest_cr0.
 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
 *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
 *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
 *     didn't trap the bit, because if L1 did, so would L0).
 *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
 *     been modified by L2, and L1 knows it. So just leave the old value of
 *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
 *     isn't relevant, because if L0 traps this bit it can set it to anything.
 *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
 *     changed these bits, and therefore they need to be updated, but L0
 *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
 *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
 */
static inline unsigned long
vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	return
	/*1*/	(vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
	/*2*/	(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
	/*3*/	(vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
			vcpu->arch.cr0_guest_owned_bits));
}

static inline unsigned long
vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	return
	/*1*/	(vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
	/*2*/	(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
	/*3*/	(vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
			vcpu->arch.cr4_guest_owned_bits));
}

static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	u32 idt_vectoring;
	unsigned int nr;

	if (vcpu->arch.exception.injected) {
		nr = vcpu->arch.exception.nr;
		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;

		if (kvm_exception_is_soft(nr)) {
			vmcs12->vm_exit_instruction_len =
				vcpu->arch.event_exit_inst_len;
			idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
		} else
			idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;

		if (vcpu->arch.exception.has_error_code) {
			idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
			vmcs12->idt_vectoring_error_code =
				vcpu->arch.exception.error_code;
		}

		vmcs12->idt_vectoring_info_field = idt_vectoring;
	} else if (vcpu->arch.nmi_injected) {
		vmcs12->idt_vectoring_info_field =
			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
	} else if (vcpu->arch.interrupt.pending) {
		nr = vcpu->arch.interrupt.nr;
		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;

		if (vcpu->arch.interrupt.soft) {
			idt_vectoring |= INTR_TYPE_SOFT_INTR;
			vmcs12->vm_entry_instruction_len =
				vcpu->arch.event_exit_inst_len;
		} else
			idt_vectoring |= INTR_TYPE_EXT_INTR;

		vmcs12->idt_vectoring_info_field = idt_vectoring;
	}
}

static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long exit_qual;

	if (kvm_event_needs_reinjection(vcpu))
		return -EBUSY;

	if (vcpu->arch.exception.pending &&
		nested_vmx_check_exception(vcpu, &exit_qual)) {
		if (vmx->nested.nested_run_pending)
			return -EBUSY;
		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
		vcpu->arch.exception.pending = false;
		return 0;
	}

	if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
	    vmx->nested.preemption_timer_expired) {
		if (vmx->nested.nested_run_pending)
			return -EBUSY;
		nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
		return 0;
	}

	if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
		if (vmx->nested.nested_run_pending)
			return -EBUSY;
		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  NMI_VECTOR | INTR_TYPE_NMI_INTR |
				  INTR_INFO_VALID_MASK, 0);
		/*
		 * The NMI-triggered VM exit counts as injection:
		 * clear this one and block further NMIs.
		 */
		vcpu->arch.nmi_pending = 0;
		vmx_set_nmi_mask(vcpu, true);
		return 0;
	}

	if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
	    nested_exit_on_intr(vcpu)) {
		if (vmx->nested.nested_run_pending)
			return -EBUSY;
		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
		return 0;
	}

	vmx_complete_nested_posted_interrupt(vcpu);
	return 0;
}

static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
{
	ktime_t remaining =
		hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
	u64 value;

	if (ktime_to_ns(remaining) <= 0)
		return 0;

	value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
	do_div(value, 1000000);
	return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
}
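/*
 * This is the inverse of the scaling done in vmx_start_preemption_timer():
 * the remaining nanoseconds are converted back to TSC cycles via
 * virtual_tsc_khz and then shifted right by the emulated timer rate.
 */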

/*
 * Update the guest state fields of vmcs12 to reflect changes that
 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
 * VM-entry controls is also updated, since this is really a guest
 * state bit.)
 */
static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
	vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);

	vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
	vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
	vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);

	vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
	vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
	vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
	vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
	vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
	vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
	vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
	vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
	vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
	vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
	vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
	vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
	vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
	vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
	vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
	vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
	vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
	vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
	vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
	vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
	vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
	vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
	vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
	vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
	vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
	vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
	vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
	vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
	vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
	vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
	vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
	vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
	vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
	vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);

	vmcs12->guest_interruptibility_info =
		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	vmcs12->guest_pending_dbg_exceptions =
		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
		vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
	else
		vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;

	if (nested_cpu_has_preemption_timer(vmcs12)) {
		if (vmcs12->vm_exit_controls &
		    VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
			vmcs12->vmx_preemption_timer_value =
				vmx_get_preemption_timer_value(vcpu);
		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
	}

	/*
	 * In some cases (usually, nested EPT), L2 is allowed to change its
	 * own CR3 without exiting. If it has changed it, we must keep it.
	 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
	 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
	 *
	 * Additionally, restore L2's PDPTR to vmcs12.
	 */
	if (enable_ept) {
		vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
		vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
		vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
		vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
		vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
	}

	vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);

	if (nested_cpu_has_vid(vmcs12))
		vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);

	vmcs12->vm_entry_controls =
		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
		(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);

	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
		kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
		vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
	}

	/* TODO: These cannot have changed unless we have MSR bitmaps and
	 * the relevant bit asks not to trap the change */
	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
		vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
		vmcs12->guest_ia32_efer = vcpu->arch.efer;
	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
	if (kvm_mpx_supported())
		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
}

/*
 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
 * and this function updates it to reflect the changes to the guest state while
 * L2 was running (and perhaps made some exits which were handled directly by L0
 * without going back to L1), and to reflect the exit reason.
 * Note that we do not have to copy here all VMCS fields, just those that
 * could have changed by the L2 guest or the exit - i.e., the guest-state and
 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
 * which already writes to vmcs12 directly.
 */
static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			   u32 exit_reason, u32 exit_intr_info,
			   unsigned long exit_qualification)
{
	/* update guest state fields: */
	sync_vmcs12(vcpu, vmcs12);

	/* update exit information fields: */

	vmcs12->vm_exit_reason = exit_reason;
	vmcs12->exit_qualification = exit_qualification;
	vmcs12->vm_exit_intr_info = exit_intr_info;

	vmcs12->idt_vectoring_info_field = 0;
	vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
	vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);

	if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
		vmcs12->launch_state = 1;

		/* vm_entry_intr_info_field is cleared on exit. Emulate this
		 * instead of reading the real value. */
		vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;

		/*
		 * Transfer the event that L0 or L1 may have wanted to inject into
		 * L2 to IDT_VECTORING_INFO_FIELD.
		 */
		vmcs12_save_pending_event(vcpu, vmcs12);
	}

	/*
	 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
	 * preserved above and would only end up incorrectly in L1.
	 */
	vcpu->arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);
}

/*
 * A part of what we need to do when the nested L2 guest exits and we want to
 * run its L1 parent, is to reset L1's guest state to the host state specified
 * in vmcs12.
 * This function is to be called not only on normal nested exit, but also on
 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
 * Failures During or After Loading Guest State").
 * This function should be called when the active VMCS is L1's (vmcs01).
 */
11336 11337
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
				   struct vmcs12 *vmcs12)
{
	struct kvm_segment seg;
	u32 entry_failure_code;

	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
		vcpu->arch.efer = vmcs12->host_ia32_efer;
	else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
	else
		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
	vmx_set_efer(vcpu, vcpu->arch.efer);

	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
	vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
	/*
	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
	 * actually changed, because vmx_set_cr0 refers to efer set above.
	 *
	 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
	 * (KVM doesn't change it) - no reason to call set_cr0_guest_host_mask();
	 */
	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
	vmx_set_cr0(vcpu, vmcs12->host_cr0);

	/* Same as above - no reason to call set_cr4_guest_host_mask().  */
	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
	vmx_set_cr4(vcpu, vmcs12->host_cr4);

	nested_ept_uninit_mmu_context(vcpu);

	/*
	 * Only PDPTE load can fail as the value of cr3 was checked on entry and
	 * couldn't have changed.
	 */
	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);

	if (!enable_ept)
		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;

	if (enable_vpid) {
		/*
		 * Trivially support vpid by letting L2s share their parent
		 * L1's vpid. TODO: move to a more elaborate solution, giving
		 * each L2 its own vpid and exposing the vpid feature to L1.
		 */
		vmx_flush_tlb(vcpu);
	}
	/* Restore posted intr vector. */
	if (nested_cpu_has_posted_intr(vmcs12))
		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);

	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
	vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
	vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
	vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
	vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);

	/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
	if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
		vmcs_write64(GUEST_BNDCFGS, 0);

	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
		vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
		vcpu->arch.pat = vmcs12->host_ia32_pat;
	}
	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
		vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
			vmcs12->host_ia32_perf_global_ctrl);

	/* Set L1 segment info according to Intel SDM
	    27.5.2 Loading Host Segment and Descriptor-Table Registers */
	seg = (struct kvm_segment) {
		.base = 0,
		.limit = 0xFFFFFFFF,
		.selector = vmcs12->host_cs_selector,
		.type = 11,
		.present = 1,
		.s = 1,
		.g = 1
	};
	if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
		seg.l = 1;
	else
		seg.db = 1;
	vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
	seg = (struct kvm_segment) {
		.base = 0,
		.limit = 0xFFFFFFFF,
		.type = 3,
		.present = 1,
		.s = 1,
		.db = 1,
		.g = 1
	};
	seg.selector = vmcs12->host_ds_selector;
	vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
	seg.selector = vmcs12->host_es_selector;
	vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
	seg.selector = vmcs12->host_ss_selector;
	vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
	seg.selector = vmcs12->host_fs_selector;
	seg.base = vmcs12->host_fs_base;
	vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
	seg.selector = vmcs12->host_gs_selector;
	seg.base = vmcs12->host_gs_base;
	vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
	seg = (struct kvm_segment) {
		.base = vmcs12->host_tr_base,
		.limit = 0x67,
		.selector = vmcs12->host_tr_selector,
		.type = 11,
		.present = 1
	};
	vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);

	kvm_set_dr(vcpu, 7, 0x400);
	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

	if (cpu_has_vmx_msr_bitmap())
		vmx_set_msr_bitmap(vcpu);

	if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
				vmcs12->vm_exit_msr_load_count))
		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
}
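
/*
 * Illustrative note: per the SDM rules emulated above, the "host" segments
 * are reloaded with fixed flat attributes rather than values fetched from a
 * descriptor table: CS becomes a flat 4GiB code segment (type 11, present,
 * L or D/B depending on VM_EXIT_HOST_ADDR_SPACE_SIZE), the data segments
 * become flat writable segments, and TR becomes a 0x67-byte busy TSS. Only
 * the selectors and the FS/GS/TR bases are taken from vmcs12.
 */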

/*
 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
 * and modify vmcs12 to make it see what it would expect to see there if
 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
 */
static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
			      u32 exit_intr_info,
			      unsigned long exit_qualification)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	/* trying to cancel vmlaunch/vmresume is a bug */
	WARN_ON_ONCE(vmx->nested.nested_run_pending);

	/*
	 * The only expected VM-instruction error is "VM entry with
	 * invalid control field(s)." Anything else indicates a
	 * problem with L0.
	 */
	WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) !=
				   VMXERR_ENTRY_INVALID_CONTROL_FIELD));

	leave_guest_mode(vcpu);

	if (likely(!vmx->fail)) {
		if (exit_reason == -1)
			sync_vmcs12(vcpu, vmcs12);
		else
			prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
				       exit_qualification);

		if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
					 vmcs12->vm_exit_msr_store_count))
			nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
	}

	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
	vm_entry_controls_reset_shadow(vmx);
	vm_exit_controls_reset_shadow(vmx);
	vmx_segment_cache_clear(vmx);

	/* if no vmcs02 cache requested, remove the one we used */
	if (VMCS02_POOL_SIZE == 0)
		nested_free_vmcs02(vmx, vmx->nested.current_vmptr);

	/* Update any VMCS fields that might have changed while L2 ran */
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
	if (vmx->hv_deadline_tsc == -1)
		vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
				PIN_BASED_VMX_PREEMPTION_TIMER);
	else
		vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
			      PIN_BASED_VMX_PREEMPTION_TIMER);
	if (kvm_has_tsc_control)
		decache_tsc_multiplier(vmx);

	if (vmx->nested.change_vmcs01_virtual_x2apic_mode) {
		vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
		vmx_set_virtual_x2apic_mode(vcpu,
				vcpu->arch.apic_base & X2APIC_ENABLE);
	} else if (!nested_cpu_has_ept(vmcs12) &&
		   nested_cpu_has2(vmcs12,
				   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
		vmx_flush_tlb_ept_only(vcpu);
	}

	/* This is needed for same reason as it was needed in prepare_vmcs02 */
	vmx->host_rsp = 0;

	/* Unpin physical memory we referred to in vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_dirty(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	if (vmx->nested.virtual_apic_page) {
		kvm_release_page_dirty(vmx->nested.virtual_apic_page);
		vmx->nested.virtual_apic_page = NULL;
	}
	if (vmx->nested.pi_desc_page) {
		kunmap(vmx->nested.pi_desc_page);
		kvm_release_page_dirty(vmx->nested.pi_desc_page);
		vmx->nested.pi_desc_page = NULL;
		vmx->nested.pi_desc = NULL;
	}

	/*
	 * We are now running in L2, mmu_notifier will force a reload of the
	 * page's hpa for the L2 vmcs. We need to reload it for L1 before
	 * entering L1.
	 */
	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

	if (enable_shadow_vmcs && exit_reason != -1)
		vmx->nested.sync_shadow_vmcs = true;

	/* in case we halted in L2 */
	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;

	if (likely(!vmx->fail)) {
		/*
		 * TODO: SDM says that with acknowledge interrupt on
		 * exit, bit 31 of the VM-exit interrupt information
		 * (valid interrupt) is always set to 1 on
		 * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
		 * need kvm_cpu_has_interrupt().  See the commit
		 * message for details.
		 */
		if (nested_exit_intr_ack_set(vcpu) &&
		    exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
		    kvm_cpu_has_interrupt(vcpu)) {
			int irq = kvm_cpu_get_interrupt(vcpu);
			WARN_ON(irq < 0);
			vmcs12->vm_exit_intr_info = irq |
				INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
		}

		if (exit_reason != -1)
			trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
						       vmcs12->exit_qualification,
						       vmcs12->idt_vectoring_info_field,
						       vmcs12->vm_exit_intr_info,
						       vmcs12->vm_exit_intr_error_code,
						       KVM_ISA_VMX);

		load_vmcs12_host_state(vcpu, vmcs12);

		return;
	}

	/*
	 * After an early L2 VM-entry failure, we're now back
	 * in L1 which thinks it just finished a VMLAUNCH or
	 * VMRESUME instruction, so we need to set the failure
	 * flag and the VM-instruction error field of the VMCS
	 * accordingly.
	 */
	nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
	/*
	 * The emulated instruction was already skipped in
	 * nested_vmx_run, but the updated RIP was never
	 * written back to the vmcs01.
	 */
	skip_emulated_instruction(vcpu);
	vmx->fail = 0;
}
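
/*
 * Illustrative summary of the emulated VM-exit above (annotation, not part
 * of the original code):
 *
 *	leave_guest_mode()             - vcpu is no longer in L2
 *	prepare_vmcs12()/sync_vmcs12() - record L2 state and exit info in vmcs12
 *	vmx_switch_vmcs(&vmx->vmcs01)  - hardware now uses L1's VMCS
 *	load_vmcs12_host_state()       - load L1's "host" state from vmcs12
 *
 * The tail of the function instead handles an early VM-entry failure by
 * setting the VM-instruction error that L1's VMLAUNCH/VMRESUME will see.
 */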

/*
 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
 */
static void vmx_leave_nested(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.nested_run_pending = 0;
		nested_vmx_vmexit(vcpu, -1, 0, 0);
	}
	free_nested(to_vmx(vcpu));
}

/*
 * L1's failure to enter L2 is a subset of a normal exit, as explained in
 * 23.7 "VM-entry failures during or after loading guest state" (this also
 * lists the acceptable exit-reason and exit-qualification parameters).
 * It should only be called before L2 has actually succeeded in running, and
 * when vmcs01 is current (it doesn't leave_guest_mode() or switch VMCSs).
 */
static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
			struct vmcs12 *vmcs12,
			u32 reason, unsigned long qualification)
{
	load_vmcs12_host_state(vcpu, vmcs12);
	vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
	vmcs12->exit_qualification = qualification;
	nested_vmx_succeed(vcpu);
	if (enable_shadow_vmcs)
		to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
}
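
/*
 * Illustrative note: a typical caller rejects a bad nested entry with
 * something like
 *
 *	nested_vmx_entry_failure(vcpu, vmcs12,
 *				 EXIT_REASON_INVALID_STATE, exit_qual);
 *
 * so that L1 observes a VM-entry failure (exit reason with bit 31 set)
 * rather than a normal VM-exit.
 */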

static int vmx_check_intercept(struct kvm_vcpu *vcpu,
			       struct x86_instruction_info *info,
			       enum x86_intercept_stage stage)
{
	return X86EMUL_CONTINUE;
}

#ifdef CONFIG_X86_64
/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
static inline int u64_shl_div_u64(u64 a, unsigned int shift,
				  u64 divisor, u64 *result)
{
	u64 low = a << shift, high = a >> (64 - shift);

	/* To avoid the overflow on divq */
	if (high >= divisor)
		return 1;

	/* Low holds the result, high holds the remainder, which is discarded */
	asm("divq %2\n\t" : "=a" (low), "=d" (high) :
	    "rm" (divisor), "0" (low), "1" (high));
	*result = low;

	return 0;
}
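
/*
 * Illustrative example: with a = 5, shift = 48 and divisor = 3, low holds
 * 5 << 48 and high is 0, so divq computes the full 128-bit quantity
 * (5 << 48) / 3 without losing the shifted-out bits. If high >= divisor,
 * the 64-bit quotient would overflow (and divq would fault), which is why
 * the function bails out with 1 in that case.
 */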

static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 tscl = rdtsc();
	u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
	u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;

	/* Convert to host delta tsc if tsc scaling is enabled */
	if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
			u64_shl_div_u64(delta_tsc,
				kvm_tsc_scaling_ratio_frac_bits,
				vcpu->arch.tsc_scaling_ratio,
				&delta_tsc))
		return -ERANGE;

	/*
	 * If the delta tsc can't fit in 32 bits after shifting by the
	 * preemption timer ratio (cpu_preemption_timer_multi), we can't use
	 * the preemption timer.
	 * It's possible that it fits on later vmentries, but checking
	 * on every vmentry is costly so we just use an hrtimer.
	 */
	if (delta_tsc >> (cpu_preemption_timer_multi + 32))
		return -ERANGE;

	vmx->hv_deadline_tsc = tscl + delta_tsc;
	vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
			PIN_BASED_VMX_PREEMPTION_TIMER);

	return delta_tsc == 0;
}
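
/*
 * Illustrative note: the value the caller ultimately programs into the
 * VMX-preemption timer counts at a power-of-two fraction of the TSC
 * (delta_tsc >> cpu_preemption_timer_multi), which is why the overflow
 * check above shifts by cpu_preemption_timer_multi + 32. Returning 1
 * (delta_tsc == 0) tells the caller that the deadline has already passed.
 */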

static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	vmx->hv_deadline_tsc = -1;
	vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
			PIN_BASED_VMX_PREEMPTION_TIMER);
}
#endif

static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
	if (ple_gap)
		shrink_ple_window(vcpu);
}

static void vmx_slot_enable_log_dirty(struct kvm *kvm,
				     struct kvm_memory_slot *slot)
{
	kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
	kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
}

static void vmx_slot_disable_log_dirty(struct kvm *kvm,
				       struct kvm_memory_slot *slot)
{
	kvm_mmu_slot_set_dirty(kvm, slot);
}

static void vmx_flush_log_dirty(struct kvm *kvm)
{
	kvm_flush_pml_buffers(kvm);
}

static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	gpa_t gpa;
	struct page *page = NULL;
	u64 *pml_address;

	if (is_guest_mode(vcpu)) {
		WARN_ON_ONCE(vmx->nested.pml_full);

		/*
		 * Check if PML is enabled for the nested guest.
		 * Whether eptp bit 6 is set is already checked
		 * as part of A/D emulation.
		 */
		vmcs12 = get_vmcs12(vcpu);
		if (!nested_cpu_has_pml(vmcs12))
			return 0;

		if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
			vmx->nested.pml_full = true;
			return 1;
		}

		gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;

		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
		if (is_error_page(page))
			return 0;

		pml_address = kmap(page);
		pml_address[vmcs12->guest_pml_index--] = gpa;
		kunmap(page);
		kvm_release_page_clean(page);
	}

	return 0;
}
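
/*
 * Illustrative note: when L1 enables PML for L2, L0 emulates the
 * page-modification log in software: the dirty GPA is written into L1's
 * own PML buffer (vmcs12->pml_address) and guest_pml_index is decremented;
 * once the index wraps (>= PML_ENTITY_NUM) a "PML full" condition is
 * reflected to L1, mirroring what the hardware would have done for a
 * non-nested guest.
 */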

static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
					   struct kvm_memory_slot *memslot,
					   gfn_t offset, unsigned long mask)
{
	kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
}

static void __pi_post_block(struct kvm_vcpu *vcpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
	struct pi_desc old, new;
	unsigned int dest;

	do {
		old.control = new.control = pi_desc->control;
		WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
		     "Wakeup handler not enabled while the VCPU is blocked\n");

		dest = cpu_physical_id(vcpu->cpu);

		if (x2apic_enabled())
			new.ndst = dest;
		else
			new.ndst = (dest << 8) & 0xFF00;

		/* set 'NV' to 'notification vector' */
		new.nv = POSTED_INTR_VECTOR;
	} while (cmpxchg64(&pi_desc->control, old.control,
			   new.control) != old.control);

	if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
		list_del(&vcpu->blocked_vcpu_list);
		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
		vcpu->pre_pcpu = -1;
	}
}

/*
 * This routine does the following things for a vCPU which is going
 * to be blocked if VT-d PI is enabled.
 * - Store the vCPU to the wakeup list, so when interrupts happen
 *   we can find the right vCPU to wake up.
 * - Change the Posted-interrupt descriptor as below:
 *      'NDST' <-- vcpu->pre_pcpu
 *      'NV' <-- POSTED_INTR_WAKEUP_VECTOR
 * - If 'ON' is set during this process, which means at least one
 *   interrupt is posted for this vCPU, we cannot block it; in
 *   this case, return 1, otherwise, return 0.
 *
 */
static int pi_pre_block(struct kvm_vcpu *vcpu)
{
	unsigned int dest;
	struct pi_desc old, new;
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
		!irq_remapping_cap(IRQ_POSTING_CAP)  ||
		!kvm_vcpu_apicv_active(vcpu))
		return 0;

	WARN_ON(irqs_disabled());
	local_irq_disable();
	if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
		vcpu->pre_pcpu = vcpu->cpu;
		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
		list_add_tail(&vcpu->blocked_vcpu_list,
			      &per_cpu(blocked_vcpu_on_cpu,
				       vcpu->pre_pcpu));
		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
	}

	do {
		old.control = new.control = pi_desc->control;

		WARN((pi_desc->sn == 1),
		     "Warning: SN field of posted-interrupts "
		     "is set before blocking\n");

		/*
		 * Since the vCPU can be preempted during this process,
		 * vcpu->cpu could differ from pre_pcpu, so we need to set
		 * pre_pcpu as the destination of the wakeup notification
		 * event; then we can find the right vCPU to wake up in the
		 * wakeup handler if interrupts happen while the vCPU is in
		 * the blocked state.
		 */
		dest = cpu_physical_id(vcpu->pre_pcpu);

		if (x2apic_enabled())
			new.ndst = dest;
		else
			new.ndst = (dest << 8) & 0xFF00;

		/* set 'NV' to 'wakeup vector' */
		new.nv = POSTED_INTR_WAKEUP_VECTOR;
	} while (cmpxchg64(&pi_desc->control, old.control,
			   new.control) != old.control);

	/* We should not block the vCPU if an interrupt is posted for it.  */
	if (pi_test_on(pi_desc) == 1)
		__pi_post_block(vcpu);

	local_irq_enable();
	return (vcpu->pre_pcpu == -1);
}
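
/*
 * Illustrative note: the NDST encoding above follows the posted-interrupt
 * descriptor layout - in xAPIC mode the destination APIC ID occupies bits
 * 15:8 of NDST (hence (dest << 8) & 0xFF00), while in x2APIC mode NDST
 * holds the full 32-bit APIC ID. The cmpxchg64() loop keeps the NV/NDST
 * update atomic with respect to an interrupt being posted (ON set) by the
 * IOMMU at the same time.
 */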

static int vmx_pre_block(struct kvm_vcpu *vcpu)
{
	if (pi_pre_block(vcpu))
		return 1;

	if (kvm_lapic_hv_timer_in_use(vcpu))
		kvm_lapic_switch_to_sw_timer(vcpu);

	return 0;
}

static void pi_post_block(struct kvm_vcpu *vcpu)
{
	if (vcpu->pre_pcpu == -1)
		return;

	WARN_ON(irqs_disabled());
	local_irq_disable();
	__pi_post_block(vcpu);
	local_irq_enable();
}

static void vmx_post_block(struct kvm_vcpu *vcpu)
{
	if (kvm_x86_ops->set_hv_timer)
		kvm_lapic_switch_to_hv_timer(vcpu);

	pi_post_block(vcpu);
}

/*
 * vmx_update_pi_irte - set IRTE for Posted-Interrupts
 *
 * @kvm: kvm
 * @host_irq: host irq of the interrupt
 * @guest_irq: gsi of the interrupt
 * @set: set or unset PI
 * returns 0 on success, < 0 on failure
 */
static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
			      uint32_t guest_irq, bool set)
{
	struct kvm_kernel_irq_routing_entry *e;
	struct kvm_irq_routing_table *irq_rt;
	struct kvm_lapic_irq irq;
	struct kvm_vcpu *vcpu;
	struct vcpu_data vcpu_info;
	int idx, ret = 0;

	if (!kvm_arch_has_assigned_device(kvm) ||
		!irq_remapping_cap(IRQ_POSTING_CAP) ||
		!kvm_vcpu_apicv_active(kvm->vcpus[0]))
		return 0;

	idx = srcu_read_lock(&kvm->irq_srcu);
	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
	if (guest_irq >= irq_rt->nr_rt_entries ||
	    hlist_empty(&irq_rt->map[guest_irq])) {
		pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
			     guest_irq, irq_rt->nr_rt_entries);
		goto out;
	}

	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
		if (e->type != KVM_IRQ_ROUTING_MSI)
			continue;
		/*
		 * VT-d PI cannot support posting multicast/broadcast
		 * interrupts to a vCPU, we still use interrupt remapping
		 * for these kinds of interrupts.
		 *
		 * For lowest-priority interrupts, we only support
		 * those with a single CPU as the destination, e.g. user
		 * configures the interrupts via /proc/irq or uses
		 * irqbalance to make the interrupts single-CPU.
		 *
		 * We will support full lowest-priority interrupt later.
		 */

		kvm_set_msi_irq(kvm, e, &irq);
		if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
			/*
			 * Make sure the IRTE is in remapped mode if
			 * we don't handle it in posted mode.
			 */
			ret = irq_set_vcpu_affinity(host_irq, NULL);
			if (ret < 0) {
				printk(KERN_INFO
				   "failed to back to remapped mode, irq: %u\n",
				   host_irq);
				goto out;
			}

			continue;
		}

		vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
		vcpu_info.vector = irq.vector;

		trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
				vcpu_info.vector, vcpu_info.pi_desc_addr, set);

		if (set)
			ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
		else
			ret = irq_set_vcpu_affinity(host_irq, NULL);

		if (ret < 0) {
			printk(KERN_INFO "%s: failed to update PI IRTE\n",
					__func__);
			goto out;
		}
	}

	ret = 0;
out:
	srcu_read_unlock(&kvm->irq_srcu, idx);
	return ret;
}
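
/*
 * Illustrative note: irq_set_vcpu_affinity() with a non-NULL vcpu_info asks
 * the IOMMU driver to switch the IRTE for host_irq into posted mode,
 * pointing it at the target vCPU's posted-interrupt descriptor and vector;
 * passing NULL reverts the entry to ordinary remapped delivery, which is
 * why non-MSI routes and multi-vCPU destinations above fall back to
 * remapping.
 */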

static void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.mcg_cap & MCG_LMCE_P)
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
			FEATURE_CONTROL_LMCE;
	else
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
			~FEATURE_CONTROL_LMCE;
}

static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
{
	/* we need a nested vmexit to enter SMM, postpone if run is pending */
	if (to_vmx(vcpu)->nested.nested_run_pending)
		return 0;
	return 1;
}

static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
	if (vmx->nested.smm.guest_mode)
		nested_vmx_vmexit(vcpu, -1, 0, 0);

	vmx->nested.smm.vmxon = vmx->nested.vmxon;
	vmx->nested.vmxon = false;
	return 0;
}

static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int ret;

	if (vmx->nested.smm.vmxon) {
		vmx->nested.vmxon = true;
		vmx->nested.smm.vmxon = false;
	}

	if (vmx->nested.smm.guest_mode) {
		vcpu->arch.hflags &= ~HF_SMM_MASK;
		ret = enter_vmx_non_root_mode(vcpu, false);
		vcpu->arch.hflags |= HF_SMM_MASK;
		if (ret)
			return ret;

		vmx->nested.smm.guest_mode = false;
	}
	return 0;
}
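
/*
 * Illustrative note: KVM emulates the default treatment of SMIs, under
 * which VMX state is not visible while in SMM. Entering SMM above
 * therefore forces a nested VM-exit and stashes the vmxon/guest_mode flags
 * in vmx->nested.smm; leaving SMM restores them and, if L2 was running,
 * re-enters VMX non-root mode via enter_vmx_non_root_mode().
 */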

static int enable_smi_window(struct kvm_vcpu *vcpu)
{
	return 0;
}

static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
	.cpu_has_kvm_support = cpu_has_kvm_support,
	.disabled_by_bios = vmx_disabled_by_bios,
	.hardware_setup = hardware_setup,
	.hardware_unsetup = hardware_unsetup,
	.check_processor_compatibility = vmx_check_processor_compat,
	.hardware_enable = hardware_enable,
	.hardware_disable = hardware_disable,
	.cpu_has_accelerated_tpr = report_flexpriority,
	.cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase,

	.vcpu_create = vmx_create_vcpu,
	.vcpu_free = vmx_free_vcpu,
	.vcpu_reset = vmx_vcpu_reset,

	.prepare_guest_switch = vmx_save_host_state,
	.vcpu_load = vmx_vcpu_load,
	.vcpu_put = vmx_vcpu_put,

	.update_bp_intercept = update_exception_bitmap,
	.get_msr = vmx_get_msr,
	.set_msr = vmx_set_msr,
	.get_segment_base = vmx_get_segment_base,
	.get_segment = vmx_get_segment,
	.set_segment = vmx_set_segment,
	.get_cpl = vmx_get_cpl,
	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
	.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
	.decache_cr3 = vmx_decache_cr3,
	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
	.set_cr0 = vmx_set_cr0,
	.set_cr3 = vmx_set_cr3,
	.set_cr4 = vmx_set_cr4,
	.set_efer = vmx_set_efer,
	.get_idt = vmx_get_idt,
	.set_idt = vmx_set_idt,
	.get_gdt = vmx_get_gdt,
	.set_gdt = vmx_set_gdt,
	.get_dr6 = vmx_get_dr6,
	.set_dr6 = vmx_set_dr6,
	.set_dr7 = vmx_set_dr7,
	.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
	.cache_reg = vmx_cache_reg,
	.get_rflags = vmx_get_rflags,
	.set_rflags = vmx_set_rflags,

	.tlb_flush = vmx_flush_tlb,

	.run = vmx_vcpu_run,
	.handle_exit = vmx_handle_exit,
	.skip_emulated_instruction = skip_emulated_instruction,
	.set_interrupt_shadow = vmx_set_interrupt_shadow,
	.get_interrupt_shadow = vmx_get_interrupt_shadow,
	.patch_hypercall = vmx_patch_hypercall,
	.set_irq = vmx_inject_irq,
	.set_nmi = vmx_inject_nmi,
	.queue_exception = vmx_queue_exception,
	.cancel_injection = vmx_cancel_injection,
	.interrupt_allowed = vmx_interrupt_allowed,
	.nmi_allowed = vmx_nmi_allowed,
	.get_nmi_mask = vmx_get_nmi_mask,
	.set_nmi_mask = vmx_set_nmi_mask,
	.enable_nmi_window = enable_nmi_window,
	.enable_irq_window = enable_irq_window,
	.update_cr8_intercept = update_cr8_intercept,
	.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
	.get_enable_apicv = vmx_get_enable_apicv,
	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
	.load_eoi_exitmap = vmx_load_eoi_exitmap,
	.apicv_post_state_restore = vmx_apicv_post_state_restore,
	.hwapic_irr_update = vmx_hwapic_irr_update,
	.hwapic_isr_update = vmx_hwapic_isr_update,
	.sync_pir_to_irr = vmx_sync_pir_to_irr,
	.deliver_posted_interrupt = vmx_deliver_posted_interrupt,

	.set_tss_addr = vmx_set_tss_addr,
	.get_tdp_level = get_ept_level,
	.get_mt_mask = vmx_get_mt_mask,

	.get_exit_info = vmx_get_exit_info,

	.get_lpage_level = vmx_get_lpage_level,

	.cpuid_update = vmx_cpuid_update,

	.rdtscp_supported = vmx_rdtscp_supported,
	.invpcid_supported = vmx_invpcid_supported,

	.set_supported_cpuid = vmx_set_supported_cpuid,

	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,

	.write_tsc_offset = vmx_write_tsc_offset,

	.set_tdp_cr3 = vmx_set_cr3,

	.check_intercept = vmx_check_intercept,
	.handle_external_intr = vmx_handle_external_intr,
	.mpx_supported = vmx_mpx_supported,
	.xsaves_supported = vmx_xsaves_supported,

	.check_nested_events = vmx_check_nested_events,

	.sched_in = vmx_sched_in,

	.slot_enable_log_dirty = vmx_slot_enable_log_dirty,
	.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
	.flush_log_dirty = vmx_flush_log_dirty,
	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
	.write_log_dirty = vmx_write_pml_buffer,

	.pre_block = vmx_pre_block,
	.post_block = vmx_post_block,

	.pmu_ops = &intel_pmu_ops,

	.update_pi_irte = vmx_update_pi_irte,

#ifdef CONFIG_X86_64
	.set_hv_timer = vmx_set_hv_timer,
	.cancel_hv_timer = vmx_cancel_hv_timer,
#endif

	.setup_mce = vmx_setup_mce,

	.smi_allowed = vmx_smi_allowed,
	.pre_enter_smm = vmx_pre_enter_smm,
	.pre_leave_smm = vmx_pre_leave_smm,
	.enable_smi_window = enable_smi_window,
};

static int __init vmx_init(void)
{
	int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
                     __alignof__(struct vcpu_vmx), THIS_MODULE);
	if (r)
		return r;

#ifdef CONFIG_KEXEC_CORE
	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
			   crash_vmclear_local_loaded_vmcss);
#endif

	return 0;
}

static void __exit vmx_exit(void)
{
#ifdef CONFIG_KEXEC_CORE
	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
	synchronize_rcu();
#endif

	kvm_exit();
}

module_init(vmx_init)
module_exit(vmx_exit)