svm.c 106.1 KB
Newer Older
1 2
#define pr_fmt(fmt) "SVM: " fmt

3 4
#include <linux/kvm_host.h>

5
#include "irq.h"
6
#include "mmu.h"
7
#include "kvm_cache_regs.h"
8
#include "x86.h"
9
#include "cpuid.h"
10
#include "pmu.h"
A
Avi Kivity 已提交
11

A
Avi Kivity 已提交
12
#include <linux/module.h>
13
#include <linux/mod_devicetable.h>
14
#include <linux/kernel.h>
A
Avi Kivity 已提交
15 16
#include <linux/vmalloc.h>
#include <linux/highmem.h>
17
#include <linux/amd-iommu.h>
A
Alexey Dobriyan 已提交
18
#include <linux/sched.h>
19
#include <linux/trace_events.h>
20
#include <linux/slab.h>
21
#include <linux/hashtable.h>
22
#include <linux/frame.h>
B
Brijesh Singh 已提交
23
#include <linux/psp-sev.h>
B
Brijesh Singh 已提交
24
#include <linux/file.h>
25 26
#include <linux/pagemap.h>
#include <linux/swap.h>
27
#include <linux/rwsem.h>
A
Avi Kivity 已提交
28

29
#include <asm/apic.h>
30
#include <asm/perf_event.h>
31
#include <asm/tlbflush.h>
A
Avi Kivity 已提交
32
#include <asm/desc.h>
33
#include <asm/debugreg.h>
G
Gleb Natapov 已提交
34
#include <asm/kvm_para.h>
35
#include <asm/irq_remapping.h>
36
#include <asm/spec-ctrl.h>
37
#include <asm/cpu_device_id.h>
A
Avi Kivity 已提交
38

39
#include <asm/virtext.h>
40
#include "trace.h"
41

42 43
#include "svm.h"

44 45
#define __ex(x) __kvm_handle_fault_on_reboot(x)

A
Avi Kivity 已提交
46 47 48
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

49
#ifdef MODULE
50
static const struct x86_cpu_id svm_cpu_id[] = {
51
	X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
52 53 54
	{}
};
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
55
#endif
56

A
Avi Kivity 已提交
57 58 59 60 61 62
#define IOPM_ALLOC_ORDER 2
#define MSRPM_ALLOC_ORDER 1

#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3

63 64
#define SVM_FEATURE_LBRV           (1 <<  1)
#define SVM_FEATURE_SVML           (1 <<  2)
65 66 67 68
#define SVM_FEATURE_TSC_RATE       (1 <<  4)
#define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
#define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
#define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
69
#define SVM_FEATURE_PAUSE_FILTER   (1 << 10)
70

71 72
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))

73
#define TSC_RATIO_RSVD          0xffffff0000000000ULL
74 75
#define TSC_RATIO_MIN		0x0000000000000001ULL
#define TSC_RATIO_MAX		0x000000ffffffffffULL
76

77 78
static bool erratum_383_found __read_mostly;

79
u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
80

81 82 83 84 85 86
/*
 * Set osvw_len to higher value when updated Revision Guides
 * are published and we know what the new status bits are
 */
static uint64_t osvw_len = 4, osvw_status;

87 88 89
static DEFINE_PER_CPU(u64, current_tsc_ratio);
#define TSC_RATIO_DEFAULT	0x0100000000ULL

90
static const struct svm_direct_access_msrs {
91 92 93
	u32 index;   /* Index of the MSR */
	bool always; /* True if intercept is always on */
} direct_access_msrs[] = {
B
Brian Gerst 已提交
94
	{ .index = MSR_STAR,				.always = true  },
95 96 97 98 99 100 101 102 103
	{ .index = MSR_IA32_SYSENTER_CS,		.always = true  },
#ifdef CONFIG_X86_64
	{ .index = MSR_GS_BASE,				.always = true  },
	{ .index = MSR_FS_BASE,				.always = true  },
	{ .index = MSR_KERNEL_GS_BASE,			.always = true  },
	{ .index = MSR_LSTAR,				.always = true  },
	{ .index = MSR_CSTAR,				.always = true  },
	{ .index = MSR_SYSCALL_MASK,			.always = true  },
#endif
104
	{ .index = MSR_IA32_SPEC_CTRL,			.always = false },
A
Ashok Raj 已提交
105
	{ .index = MSR_IA32_PRED_CMD,			.always = false },
106 107 108 109 110
	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
	{ .index = MSR_IA32_LASTINTTOIP,		.always = false },
	{ .index = MSR_INVALID,				.always = false },
A
Avi Kivity 已提交
111 112
};

113 114
/* enable NPT for AMD64 and X86 with PAE */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
115
bool npt_enabled = true;
116
#else
117
bool npt_enabled;
118
#endif
119

120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167
/*
 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 * pause_filter_count: On processors that support Pause filtering(indicated
 *	by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
 *	count value. On VMRUN this value is loaded into an internal counter.
 *	Each time a pause instruction is executed, this counter is decremented
 *	until it reaches zero at which time a #VMEXIT is generated if pause
 *	intercept is enabled. Refer to  AMD APM Vol 2 Section 15.14.4 Pause
 *	Intercept Filtering for more details.
 *	This also indicate if ple logic enabled.
 *
 * pause_filter_thresh: In addition, some processor families support advanced
 *	pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
 *	the amount of time a guest is allowed to execute in a pause loop.
 *	In this mode, a 16-bit pause filter threshold field is added in the
 *	VMCB. The threshold value is a cycle count that is used to reset the
 *	pause counter. As with simple pause filtering, VMRUN loads the pause
 *	count value from VMCB into an internal counter. Then, on each pause
 *	instruction the hardware checks the elapsed number of cycles since
 *	the most recent pause instruction against the pause filter threshold.
 *	If the elapsed cycle count is greater than the pause filter threshold,
 *	then the internal pause count is reloaded from the VMCB and execution
 *	continues. If the elapsed cycle count is less than the pause filter
 *	threshold, then the internal pause count is decremented. If the count
 *	value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
 *	triggered. If advanced pause filtering is supported and pause filter
 *	threshold field is set to zero, the filter will operate in the simpler,
 *	count only mode.
 */

static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
module_param(pause_filter_thresh, ushort, 0444);

static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
module_param(pause_filter_count, ushort, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(pause_filter_count_grow, ushort, 0444);

/* Default resets per-vcpu window every exit to pause_filter_count. */
static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(pause_filter_count_shrink, ushort, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
module_param(pause_filter_count_max, ushort, 0444);

168 169
/* allow nested paging (virtualized MMU) for all guests */
static int npt = true;
170
module_param(npt, int, S_IRUGO);
171

172 173
/* allow nested virtualization in KVM/SVM */
static int nested = true;
174 175
module_param(nested, int, S_IRUGO);

176 177 178 179
/* enable/disable Next RIP Save */
static int nrips = true;
module_param(nrips, int, 0444);

180 181 182 183
/* enable/disable Virtual VMLOAD VMSAVE */
static int vls = true;
module_param(vls, int, 0444);

184 185 186
/* enable/disable Virtual GIF */
static int vgif = true;
module_param(vgif, int, 0444);
187

B
Brijesh Singh 已提交
188 189 190 191
/* enable/disable SEV support */
static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
module_param(sev, int, 0444);

192 193 194
static bool __read_mostly dump_invalid_vmcb = 0;
module_param(dump_invalid_vmcb, bool, 0644);

B
Brijesh Singh 已提交
195 196
static u8 rsm_ins_bytes[] = "\x0f\xaa";

197
static void svm_complete_interrupts(struct vcpu_svm *svm);
198

199
static unsigned long iopm_base;
A
Avi Kivity 已提交
200 201 202 203

struct kvm_ldttss_desc {
	u16 limit0;
	u16 base0;
J
Joerg Roedel 已提交
204 205
	unsigned base1:8, type:5, dpl:2, p:1;
	unsigned limit1:4, zero0:3, g:1, base2:8;
A
Avi Kivity 已提交
206 207 208 209
	u32 base3;
	u32 zero1;
} __attribute__((packed));

210
DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
A
Avi Kivity 已提交
211

212
static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
A
Avi Kivity 已提交
213

214
#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
A
Avi Kivity 已提交
215 216 217
#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)

218
u32 svm_msrpm_offset(u32 msr)
219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238
{
	u32 offset;
	int i;

	for (i = 0; i < NUM_MSR_MAPS; i++) {
		if (msr < msrpm_ranges[i] ||
		    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
			continue;

		offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
		offset += (i * MSRS_RANGE_SIZE);       /* add range offset */

		/* Now we have the u8 offset - but need the u32 offset */
		return offset / 4;
	}

	/* MSR not in any range */
	return MSR_INVALID;
}

A
Avi Kivity 已提交
239 240 241 242
#define MAX_INST_SIZE 15

static inline void clgi(void)
{
243
	asm volatile (__ex("clgi"));
A
Avi Kivity 已提交
244 245 246 247
}

static inline void stgi(void)
{
248
	asm volatile (__ex("stgi"));
A
Avi Kivity 已提交
249 250 251 252
}

static inline void invlpga(unsigned long addr, u32 asid)
{
253
	asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr));
A
Avi Kivity 已提交
254 255
}

256
static int get_npt_level(struct kvm_vcpu *vcpu)
257 258
{
#ifdef CONFIG_X86_64
259
	return PT64_ROOT_4LEVEL;
260 261 262 263 264
#else
	return PT32E_ROOT_LEVEL;
#endif
}

265
void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
A
Avi Kivity 已提交
266
{
267
	vcpu->arch.efer = efer;
268 269 270 271 272 273 274 275

	if (!npt_enabled) {
		/* Shadow paging assumes NX to be available.  */
		efer |= EFER_NX;

		if (!(efer & EFER_LMA))
			efer &= ~EFER_LME;
	}
A
Avi Kivity 已提交
276

277
	to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
278
	mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
A
Avi Kivity 已提交
279 280 281 282 283 284 285 286
}

static int is_external_interrupt(u32 info)
{
	info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
	return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
}

287
static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
288 289 290 291 292
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ret = 0;

	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
293 294
		ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
	return ret;
295 296 297 298 299 300 301 302 303 304 305 306 307
}

static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (mask == 0)
		svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
	else
		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;

}

308
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
309
{
310 311
	struct vcpu_svm *svm = to_svm(vcpu);

312
	if (nrips && svm->vmcb->control.next_rip != 0) {
313
		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
314
		svm->next_rip = svm->vmcb->control.next_rip;
315
	}
316

317 318 319 320 321 322 323 324 325
	if (!svm->next_rip) {
		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
			return 0;
	} else {
		if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
			pr_err("%s: ip 0x%lx next 0x%llx\n",
			       __func__, kvm_rip_read(vcpu), svm->next_rip);
		kvm_rip_write(vcpu, svm->next_rip);
	}
326
	svm_set_interrupt_shadow(vcpu, 0);
327

328
	return 1;
A
Avi Kivity 已提交
329 330
}

331
static void svm_queue_exception(struct kvm_vcpu *vcpu)
J
Jan Kiszka 已提交
332 333
{
	struct vcpu_svm *svm = to_svm(vcpu);
334 335
	unsigned nr = vcpu->arch.exception.nr;
	bool has_error_code = vcpu->arch.exception.has_error_code;
336
	bool reinject = vcpu->arch.exception.injected;
337
	u32 error_code = vcpu->arch.exception.error_code;
J
Jan Kiszka 已提交
338

J
Joerg Roedel 已提交
339 340 341 342
	/*
	 * If we are within a nested VM we'd better #VMEXIT and let the guest
	 * handle the exception
	 */
343 344
	if (!reinject &&
	    nested_svm_check_exception(svm, nr, has_error_code, error_code))
J
Jan Kiszka 已提交
345 346
		return;

347 348
	kvm_deliver_exception_payload(&svm->vcpu);

349
	if (nr == BP_VECTOR && !nrips) {
350 351 352 353 354 355 356 357 358
		unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);

		/*
		 * For guest debugging where we have to reinject #BP if some
		 * INT3 is guest-owned:
		 * Emulate nRIP by moving RIP forward. Will fail if injection
		 * raises a fault that is not intercepted. Still better than
		 * failing in all cases.
		 */
359
		(void)skip_emulated_instruction(&svm->vcpu);
360 361 362 363 364
		rip = kvm_rip_read(&svm->vcpu);
		svm->int3_rip = rip + svm->vmcb->save.cs.base;
		svm->int3_injected = rip - old_rip;
	}

J
Jan Kiszka 已提交
365 366 367 368 369 370 371
	svm->vmcb->control.event_inj = nr
		| SVM_EVTINJ_VALID
		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
		| SVM_EVTINJ_TYPE_EXEPT;
	svm->vmcb->control.event_inj_err = error_code;
}

372 373 374 375 376 377
static void svm_init_erratum_383(void)
{
	u32 low, high;
	int err;
	u64 val;

378
	if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395
		return;

	/* Use _safe variants to not break nested virtualization */
	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
	if (err)
		return;

	val |= (1ULL << 47);

	low  = lower_32_bits(val);
	high = upper_32_bits(val);

	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);

	erratum_383_found = true;
}

396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416
static void svm_init_osvw(struct kvm_vcpu *vcpu)
{
	/*
	 * Guests should see errata 400 and 415 as fixed (assuming that
	 * HLT and IO instructions are intercepted).
	 */
	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
	vcpu->arch.osvw.status = osvw_status & ~(6ULL);

	/*
	 * By increasing VCPU's osvw.length to 3 we are telling the guest that
	 * all osvw.status bits inside that length, including bit 0 (which is
	 * reserved for erratum 298), are valid. However, if host processor's
	 * osvw_len is 0 then osvw_status[0] carries no information. We need to
	 * be conservative here and therefore we tell the guest that erratum 298
	 * is present (because we really don't know).
	 */
	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
		vcpu->arch.osvw.status |= 1;
}

A
Avi Kivity 已提交
417 418
static int has_svm(void)
{
419
	const char *msg;
A
Avi Kivity 已提交
420

421
	if (!cpu_has_svm(&msg)) {
J
Joe Perches 已提交
422
		printk(KERN_INFO "has_svm: %s\n", msg);
A
Avi Kivity 已提交
423 424 425 426 427 428
		return 0;
	}

	return 1;
}

429
static void svm_hardware_disable(void)
A
Avi Kivity 已提交
430
{
431 432 433 434
	/* Make sure we clean up behind us */
	if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
		wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);

435
	cpu_svm_disable();
436 437

	amd_pmu_disable_virt();
A
Avi Kivity 已提交
438 439
}

440
static int svm_hardware_enable(void)
A
Avi Kivity 已提交
441 442
{

443
	struct svm_cpu_data *sd;
A
Avi Kivity 已提交
444 445 446 447
	uint64_t efer;
	struct desc_struct *gdt;
	int me = raw_smp_processor_id();

448 449 450 451
	rdmsrl(MSR_EFER, efer);
	if (efer & EFER_SVME)
		return -EBUSY;

A
Avi Kivity 已提交
452
	if (!has_svm()) {
453
		pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
454
		return -EINVAL;
A
Avi Kivity 已提交
455
	}
456 457
	sd = per_cpu(svm_data, me);
	if (!sd) {
458
		pr_err("%s: svm_data is NULL on %d\n", __func__, me);
459
		return -EINVAL;
A
Avi Kivity 已提交
460 461
	}

462 463 464
	sd->asid_generation = 1;
	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
	sd->next_asid = sd->max_asid + 1;
465
	sd->min_asid = max_sev_asid + 1;
A
Avi Kivity 已提交
466

467
	gdt = get_current_gdt_rw();
468
	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
A
Avi Kivity 已提交
469

470
	wrmsrl(MSR_EFER, efer | EFER_SVME);
A
Avi Kivity 已提交
471

472
	wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
473

474 475
	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
		wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
476
		__this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
477 478
	}

479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508

	/*
	 * Get OSVW bits.
	 *
	 * Note that it is possible to have a system with mixed processor
	 * revisions and therefore different OSVW bits. If bits are not the same
	 * on different processors then choose the worst case (i.e. if erratum
	 * is present on one processor and not on another then assume that the
	 * erratum is present everywhere).
	 */
	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
		uint64_t len, status = 0;
		int err;

		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
		if (!err)
			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
						      &err);

		if (err)
			osvw_status = osvw_len = 0;
		else {
			if (len < osvw_len)
				osvw_len = len;
			osvw_status |= status;
			osvw_status &= (1ULL << osvw_len) - 1;
		}
	} else
		osvw_status = osvw_len = 0;

509 510
	svm_init_erratum_383();

511 512
	amd_pmu_enable_virt();

513
	return 0;
A
Avi Kivity 已提交
514 515
}

516 517
static void svm_cpu_uninit(int cpu)
{
518
	struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
519

520
	if (!sd)
521 522 523
		return;

	per_cpu(svm_data, raw_smp_processor_id()) = NULL;
524
	kfree(sd->sev_vmcbs);
525 526
	__free_page(sd->save_area);
	kfree(sd);
527 528
}

A
Avi Kivity 已提交
529 530
static int svm_cpu_init(int cpu)
{
531
	struct svm_cpu_data *sd;
A
Avi Kivity 已提交
532

533 534
	sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
	if (!sd)
A
Avi Kivity 已提交
535
		return -ENOMEM;
536
	sd->cpu = cpu;
537
	sd->save_area = alloc_page(GFP_KERNEL);
538
	if (!sd->save_area)
539
		goto free_cpu_data;
A
Avi Kivity 已提交
540

541
	if (svm_sev_enabled()) {
542 543 544
		sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1,
					      sizeof(void *),
					      GFP_KERNEL);
545
		if (!sd->sev_vmcbs)
546
			goto free_save_area;
547 548
	}

549
	per_cpu(svm_data, cpu) = sd;
A
Avi Kivity 已提交
550 551 552

	return 0;

553 554 555
free_save_area:
	__free_page(sd->save_area);
free_cpu_data:
556
	kfree(sd);
557
	return -ENOMEM;
A
Avi Kivity 已提交
558 559 560

}

561 562 563 564 565 566 567 568 569 570 571
static bool valid_msr_intercept(u32 index)
{
	int i;

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
		if (direct_access_msrs[i].index == index)
			return true;

	return false;
}

572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590
static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
{
	u8 bit_write;
	unsigned long tmp;
	u32 offset;
	u32 *msrpm;

	msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
				      to_svm(vcpu)->msrpm;

	offset    = svm_msrpm_offset(msr);
	bit_write = 2 * (msr & 0x0f) + 1;
	tmp       = msrpm[offset];

	BUG_ON(offset == MSR_INVALID);

	return !!test_bit(bit_write,  &tmp);
}

591 592
static void set_msr_interception(u32 *msrpm, unsigned msr,
				 int read, int write)
A
Avi Kivity 已提交
593
{
594 595 596
	u8 bit_read, bit_write;
	unsigned long tmp;
	u32 offset;
A
Avi Kivity 已提交
597

598 599 600 601 602 603
	/*
	 * If this warning triggers extend the direct_access_msrs list at the
	 * beginning of the file
	 */
	WARN_ON(!valid_msr_intercept(msr));

604 605 606 607 608 609 610 611 612 613 614
	offset    = svm_msrpm_offset(msr);
	bit_read  = 2 * (msr & 0x0f);
	bit_write = 2 * (msr & 0x0f) + 1;
	tmp       = msrpm[offset];

	BUG_ON(offset == MSR_INVALID);

	read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
	write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);

	msrpm[offset] = tmp;
A
Avi Kivity 已提交
615 616
}

617
static void svm_vcpu_init_msrpm(u32 *msrpm)
A
Avi Kivity 已提交
618 619 620
{
	int i;

621 622
	memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));

623 624 625 626 627 628
	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		if (!direct_access_msrs[i].always)
			continue;

		set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
	}
629 630
}

631 632 633 634 635 636 637 638
static void add_msr_offset(u32 offset)
{
	int i;

	for (i = 0; i < MSRPM_OFFSETS; ++i) {

		/* Offset already in list? */
		if (msrpm_offsets[i] == offset)
639
			return;
640 641 642 643 644 645 646 647 648

		/* Slot used by another offset? */
		if (msrpm_offsets[i] != MSR_INVALID)
			continue;

		/* Add offset to list */
		msrpm_offsets[i] = offset;

		return;
A
Avi Kivity 已提交
649
	}
650 651 652 653 654

	/*
	 * If this BUG triggers the msrpm_offsets table has an overflow. Just
	 * increase MSRPM_OFFSETS in this case.
	 */
655
	BUG();
A
Avi Kivity 已提交
656 657
}

658
static void init_msrpm_offsets(void)
659
{
660
	int i;
661

662 663 664 665 666 667 668 669 670 671
	memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		u32 offset;

		offset = svm_msrpm_offset(direct_access_msrs[i].index);
		BUG_ON(offset == MSR_INVALID);

		add_msr_offset(offset);
	}
672 673
}

674 675 676 677
static void svm_enable_lbrv(struct vcpu_svm *svm)
{
	u32 *msrpm = svm->msrpm;

678
	svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
679 680 681 682 683 684 685 686 687 688
	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
	set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
}

static void svm_disable_lbrv(struct vcpu_svm *svm)
{
	u32 *msrpm = svm->msrpm;

689
	svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
690 691 692 693 694 695
	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
	set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}

696
void disable_nmi_singlestep(struct vcpu_svm *svm)
697 698
{
	svm->nmi_singlestep = false;
699

700 701 702 703 704 705 706
	if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
		/* Clear our flags if they were not set by the guest */
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
			svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
			svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
	}
707 708
}

709 710 711 712 713 714 715 716 717 718 719
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	int old = control->pause_filter_count;

	control->pause_filter_count = __grow_ple_window(old,
							pause_filter_count,
							pause_filter_count_grow,
							pause_filter_count_max);

P
Peter Xu 已提交
720
	if (control->pause_filter_count != old) {
721
		mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
P
Peter Xu 已提交
722 723 724
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    control->pause_filter_count, old);
	}
725 726 727 728 729 730 731 732 733 734 735 736 737
}

static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	int old = control->pause_filter_count;

	control->pause_filter_count =
				__shrink_ple_window(old,
						    pause_filter_count,
						    pause_filter_count_shrink,
						    pause_filter_count);
P
Peter Xu 已提交
738
	if (control->pause_filter_count != old) {
739
		mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
P
Peter Xu 已提交
740 741 742
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    control->pause_filter_count, old);
	}
743 744
}

745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785
/*
 * The default MMIO mask is a single bit (excluding the present bit),
 * which could conflict with the memory encryption bit. Check for
 * memory encryption support and override the default MMIO mask if
 * memory encryption is enabled.
 */
static __init void svm_adjust_mmio_mask(void)
{
	unsigned int enc_bit, mask_bit;
	u64 msr, mask;

	/* If there is no memory encryption support, use existing mask */
	if (cpuid_eax(0x80000000) < 0x8000001f)
		return;

	/* If memory encryption is not enabled, use existing mask */
	rdmsrl(MSR_K8_SYSCFG, msr);
	if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
		return;

	enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
	mask_bit = boot_cpu_data.x86_phys_bits;

	/* Increment the mask bit if it is the same as the encryption bit */
	if (enc_bit == mask_bit)
		mask_bit++;

	/*
	 * If the mask bit location is below 52, then some bits above the
	 * physical addressing limit will always be reserved, so use the
	 * rsvd_bits() function to generate the mask. This mask, along with
	 * the present bit, will be used to generate a page fault with
	 * PFER.RSV = 1.
	 *
	 * If the mask bit location is 52 (or above), then clear the mask.
	 */
	mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;

	kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
}

786 787 788 789
static void svm_hardware_teardown(void)
{
	int cpu;

790 791
	if (svm_sev_enabled())
		sev_hardware_teardown();
792 793 794 795 796 797 798 799

	for_each_possible_cpu(cpu)
		svm_cpu_uninit(cpu);

	__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
	iopm_base = 0;
}

800 801 802 803
static __init void svm_set_cpu_caps(void)
{
	kvm_set_cpu_caps();

804 805
	supported_xss = 0;

806 807
	/* CPUID 0x80000001 and 0x8000000A (SVM features) */
	if (nested) {
808 809
		kvm_cpu_cap_set(X86_FEATURE_SVM);

810
		if (nrips)
811 812 813 814 815 816
			kvm_cpu_cap_set(X86_FEATURE_NRIPS);

		if (npt_enabled)
			kvm_cpu_cap_set(X86_FEATURE_NPT);
	}

817 818 819 820
	/* CPUID 0x80000008 */
	if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
	    boot_cpu_has(X86_FEATURE_AMD_SSBD))
		kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
821 822
}

A
Avi Kivity 已提交
823 824 825 826
static __init int svm_hardware_setup(void)
{
	int cpu;
	struct page *iopm_pages;
827
	void *iopm_va;
A
Avi Kivity 已提交
828 829 830 831 832 833
	int r;

	iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);

	if (!iopm_pages)
		return -ENOMEM;
834 835 836

	iopm_va = page_address(iopm_pages);
	memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
A
Avi Kivity 已提交
837 838
	iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;

839 840
	init_msrpm_offsets();

841 842
	supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);

843 844 845
	if (boot_cpu_has(X86_FEATURE_NX))
		kvm_enable_efer_bits(EFER_NX);

A
Alexander Graf 已提交
846 847 848
	if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
		kvm_enable_efer_bits(EFER_FFXSR);

849 850
	if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
		kvm_has_tsc_control = true;
851 852
		kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
		kvm_tsc_scaling_ratio_frac_bits = 32;
853 854
	}

855 856 857 858 859 860 861 862
	/* Check for pause filtering support */
	if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
		pause_filter_count = 0;
		pause_filter_thresh = 0;
	} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
		pause_filter_thresh = 0;
	}

863 864
	if (nested) {
		printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
865
		kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
866 867
	}

B
Brijesh Singh 已提交
868 869 870 871 872 873 874 875 876 877 878
	if (sev) {
		if (boot_cpu_has(X86_FEATURE_SEV) &&
		    IS_ENABLED(CONFIG_KVM_AMD_SEV)) {
			r = sev_hardware_setup();
			if (r)
				sev = false;
		} else {
			sev = false;
		}
	}

879 880
	svm_adjust_mmio_mask();

Z
Zachary Amsden 已提交
881
	for_each_possible_cpu(cpu) {
A
Avi Kivity 已提交
882 883
		r = svm_cpu_init(cpu);
		if (r)
884
			goto err;
A
Avi Kivity 已提交
885
	}
886

887
	if (!boot_cpu_has(X86_FEATURE_NPT))
888 889
		npt_enabled = false;

890
	if (npt_enabled && !npt)
891 892
		npt_enabled = false;

893
	kvm_configure_mmu(npt_enabled, PT_PDPE_LEVEL);
894
	pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
895

896 897 898 899 900
	if (nrips) {
		if (!boot_cpu_has(X86_FEATURE_NRIPS))
			nrips = false;
	}

901 902 903
	if (avic) {
		if (!npt_enabled ||
		    !boot_cpu_has(X86_FEATURE_AVIC) ||
904
		    !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
905
			avic = false;
906
		} else {
907
			pr_info("AVIC enabled\n");
908 909 910

			amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
		}
911
	}
912

913 914
	if (vls) {
		if (!npt_enabled ||
915
		    !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
916 917 918 919 920 921 922
		    !IS_ENABLED(CONFIG_X86_64)) {
			vls = false;
		} else {
			pr_info("Virtual VMLOAD VMSAVE supported\n");
		}
	}

923 924 925 926 927 928 929
	if (vgif) {
		if (!boot_cpu_has(X86_FEATURE_VGIF))
			vgif = false;
		else
			pr_info("Virtual GIF supported\n");
	}

930
	svm_set_cpu_caps();
931

A
Avi Kivity 已提交
932 933
	return 0;

934
err:
935
	svm_hardware_teardown();
A
Avi Kivity 已提交
936 937 938 939 940 941 942
	return r;
}

static void init_seg(struct vmcb_seg *seg)
{
	seg->selector = 0;
	seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
J
Joerg Roedel 已提交
943
		      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
A
Avi Kivity 已提交
944 945 946 947 948 949 950 951 952 953 954 955
	seg->limit = 0xffff;
	seg->base = 0;
}

static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
{
	seg->selector = 0;
	seg->attrib = SVM_SELECTOR_P_MASK | type;
	seg->limit = 0xffff;
	seg->base = 0;
}

956 957 958 959 960 961 962 963 964 965
static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (is_guest_mode(vcpu))
		return svm->nested.hsave->control.tsc_offset;

	return vcpu->arch.tsc_offset;
}

966
static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
967 968 969 970
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 g_tsc_offset = 0;

971
	if (is_guest_mode(vcpu)) {
972
		/* Write L1's TSC offset.  */
973 974 975
		g_tsc_offset = svm->vmcb->control.tsc_offset -
			       svm->nested.hsave->control.tsc_offset;
		svm->nested.hsave->control.tsc_offset = offset;
976 977 978 979 980
	}

	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
				   svm->vmcb->control.tsc_offset - g_tsc_offset,
				   offset);
981 982

	svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
983 984

	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
985
	return svm->vmcb->control.tsc_offset;
986 987
}

P
Paolo Bonzini 已提交
988
static void init_vmcb(struct vcpu_svm *svm)
A
Avi Kivity 已提交
989
{
990 991
	struct vmcb_control_area *control = &svm->vmcb->control;
	struct vmcb_save_area *save = &svm->vmcb->save;
A
Avi Kivity 已提交
992

993
	svm->vcpu.arch.hflags = 0;
994

995 996 997 998 999 1000
	set_cr_intercept(svm, INTERCEPT_CR0_READ);
	set_cr_intercept(svm, INTERCEPT_CR3_READ);
	set_cr_intercept(svm, INTERCEPT_CR4_READ);
	set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
	set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
	set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
1001 1002
	if (!kvm_vcpu_apicv_active(&svm->vcpu))
		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
A
Avi Kivity 已提交
1003

1004
	set_dr_intercepts(svm);
A
Avi Kivity 已提交
1005

1006 1007 1008
	set_exception_intercept(svm, PF_VECTOR);
	set_exception_intercept(svm, UD_VECTOR);
	set_exception_intercept(svm, MC_VECTOR);
1009
	set_exception_intercept(svm, AC_VECTOR);
1010
	set_exception_intercept(svm, DB_VECTOR);
1011 1012 1013 1014 1015 1016 1017 1018
	/*
	 * Guest access to VMware backdoor ports could legitimately
	 * trigger #GP because of TSS I/O permission bitmap.
	 * We intercept those #GP and allow access to them anyway
	 * as VMware does.
	 */
	if (enable_vmware_backdoor)
		set_exception_intercept(svm, GP_VECTOR);
A
Avi Kivity 已提交
1019

1020 1021 1022 1023
	set_intercept(svm, INTERCEPT_INTR);
	set_intercept(svm, INTERCEPT_NMI);
	set_intercept(svm, INTERCEPT_SMI);
	set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
A
Avi Kivity 已提交
1024
	set_intercept(svm, INTERCEPT_RDPMC);
1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040
	set_intercept(svm, INTERCEPT_CPUID);
	set_intercept(svm, INTERCEPT_INVD);
	set_intercept(svm, INTERCEPT_INVLPG);
	set_intercept(svm, INTERCEPT_INVLPGA);
	set_intercept(svm, INTERCEPT_IOIO_PROT);
	set_intercept(svm, INTERCEPT_MSR_PROT);
	set_intercept(svm, INTERCEPT_TASK_SWITCH);
	set_intercept(svm, INTERCEPT_SHUTDOWN);
	set_intercept(svm, INTERCEPT_VMRUN);
	set_intercept(svm, INTERCEPT_VMMCALL);
	set_intercept(svm, INTERCEPT_VMLOAD);
	set_intercept(svm, INTERCEPT_VMSAVE);
	set_intercept(svm, INTERCEPT_STGI);
	set_intercept(svm, INTERCEPT_CLGI);
	set_intercept(svm, INTERCEPT_SKINIT);
	set_intercept(svm, INTERCEPT_WBINVD);
J
Joerg Roedel 已提交
1041
	set_intercept(svm, INTERCEPT_XSETBV);
J
Jim Mattson 已提交
1042
	set_intercept(svm, INTERCEPT_RDPRU);
B
Brijesh Singh 已提交
1043
	set_intercept(svm, INTERCEPT_RSM);
A
Avi Kivity 已提交
1044

1045
	if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
1046 1047 1048 1049
		set_intercept(svm, INTERCEPT_MONITOR);
		set_intercept(svm, INTERCEPT_MWAIT);
	}

1050 1051 1052
	if (!kvm_hlt_in_guest(svm->vcpu.kvm))
		set_intercept(svm, INTERCEPT_HLT);

1053 1054
	control->iopm_base_pa = __sme_set(iopm_base);
	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
A
Avi Kivity 已提交
1055 1056 1057 1058 1059 1060 1061 1062 1063
	control->int_ctl = V_INTR_MASKING_MASK;

	init_seg(&save->es);
	init_seg(&save->ss);
	init_seg(&save->ds);
	init_seg(&save->fs);
	init_seg(&save->gs);

	save->cs.selector = 0xf000;
1064
	save->cs.base = 0xffff0000;
A
Avi Kivity 已提交
1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075
	/* Executable/Readable Code Segment */
	save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
		SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
	save->cs.limit = 0xffff;

	save->gdtr.limit = 0xffff;
	save->idtr.limit = 0xffff;

	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);

P
Paolo Bonzini 已提交
1076
	svm_set_efer(&svm->vcpu, 0);
M
Mike Day 已提交
1077
	save->dr6 = 0xffff0ff0;
1078
	kvm_set_rflags(&svm->vcpu, 2);
A
Avi Kivity 已提交
1079
	save->rip = 0x0000fff0;
1080
	svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
A
Avi Kivity 已提交
1081

J
Joerg Roedel 已提交
1082
	/*
1083
	 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
1084
	 * It also updates the guest-visible cr0 value.
A
Avi Kivity 已提交
1085
	 */
1086
	svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
1087
	kvm_mmu_reset_context(&svm->vcpu);
1088

1089
	save->cr4 = X86_CR4_PAE;
A
Avi Kivity 已提交
1090
	/* rdx = ?? */
1091 1092 1093

	if (npt_enabled) {
		/* Setup VMCB for Nested Paging */
1094
		control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
1095
		clr_intercept(svm, INTERCEPT_INVLPG);
1096
		clr_exception_intercept(svm, PF_VECTOR);
1097 1098
		clr_cr_intercept(svm, INTERCEPT_CR3_READ);
		clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1099
		save->g_pat = svm->vcpu.arch.pat;
1100 1101 1102
		save->cr3 = 0;
		save->cr4 = 0;
	}
1103
	svm->asid_generation = 0;
1104

1105
	svm->nested.vmcb = 0;
1106 1107
	svm->vcpu.arch.hflags = 0;

1108 1109 1110 1111
	if (pause_filter_count) {
		control->pause_filter_count = pause_filter_count;
		if (pause_filter_thresh)
			control->pause_filter_thresh = pause_filter_thresh;
1112
		set_intercept(svm, INTERCEPT_PAUSE);
1113 1114
	} else {
		clr_intercept(svm, INTERCEPT_PAUSE);
1115 1116
	}

1117
	if (kvm_vcpu_apicv_active(&svm->vcpu))
1118 1119
		avic_init_vmcb(svm);

1120 1121 1122 1123 1124 1125 1126 1127 1128 1129
	/*
	 * If hardware supports Virtual VMLOAD VMSAVE then enable it
	 * in VMCB and clear intercepts to avoid #VMEXIT.
	 */
	if (vls) {
		clr_intercept(svm, INTERCEPT_VMLOAD);
		clr_intercept(svm, INTERCEPT_VMSAVE);
		svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
	}

1130 1131 1132 1133 1134 1135
	if (vgif) {
		clr_intercept(svm, INTERCEPT_STGI);
		clr_intercept(svm, INTERCEPT_CLGI);
		svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
	}

1136
	if (sev_guest(svm->vcpu.kvm)) {
B
Brijesh Singh 已提交
1137
		svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
1138 1139
		clr_exception_intercept(svm, UD_VECTOR);
	}
B
Brijesh Singh 已提交
1140

1141 1142
	mark_all_dirty(svm->vmcb);

1143
	enable_gif(svm);
1144 1145 1146

}

1147
static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1148 1149
{
	struct vcpu_svm *svm = to_svm(vcpu);
1150 1151
	u32 dummy;
	u32 eax = 1;
1152

1153
	svm->spec_ctrl = 0;
1154
	svm->virt_spec_ctrl = 0;
1155

1156 1157 1158 1159 1160 1161
	if (!init_event) {
		svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
					   MSR_IA32_APICBASE_ENABLE;
		if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
			svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
	}
P
Paolo Bonzini 已提交
1162
	init_vmcb(svm);
A
Avi Kivity 已提交
1163

1164
	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
1165
	kvm_rdx_write(vcpu, eax);
1166 1167 1168

	if (kvm_vcpu_apicv_active(vcpu) && !init_event)
		avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
1169 1170
}

1171
static int svm_create_vcpu(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
1172
{
1173
	struct vcpu_svm *svm;
A
Avi Kivity 已提交
1174
	struct page *page;
1175
	struct page *msrpm_pages;
A
Alexander Graf 已提交
1176
	struct page *hsave_page;
A
Alexander Graf 已提交
1177
	struct page *nested_msrpm_pages;
R
Rusty Russell 已提交
1178
	int err;
A
Avi Kivity 已提交
1179

1180 1181
	BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
	svm = to_svm(vcpu);
R
Rusty Russell 已提交
1182

1183
	err = -ENOMEM;
1184
	page = alloc_page(GFP_KERNEL_ACCOUNT);
1185
	if (!page)
1186
		goto out;
A
Avi Kivity 已提交
1187

1188
	msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
1189
	if (!msrpm_pages)
1190
		goto free_page1;
A
Alexander Graf 已提交
1191

1192
	nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
A
Alexander Graf 已提交
1193
	if (!nested_msrpm_pages)
1194
		goto free_page2;
1195

1196
	hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
A
Alexander Graf 已提交
1197
	if (!hsave_page)
1198 1199
		goto free_page3;

1200 1201 1202
	err = avic_init_vcpu(svm);
	if (err)
		goto free_page4;
1203

1204 1205 1206
	/* We initialize this flag to true to make sure that the is_running
	 * bit would be set the first time the vcpu is loaded.
	 */
1207 1208
	if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
		svm->avic_is_running = true;
1209

1210
	svm->nested.hsave = page_address(hsave_page);
A
Alexander Graf 已提交
1211

1212 1213 1214
	svm->msrpm = page_address(msrpm_pages);
	svm_vcpu_init_msrpm(svm->msrpm);

1215
	svm->nested.msrpm = page_address(nested_msrpm_pages);
1216
	svm_vcpu_init_msrpm(svm->nested.msrpm);
A
Alexander Graf 已提交
1217

1218 1219
	svm->vmcb = page_address(page);
	clear_page(svm->vmcb);
1220
	svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
1221
	svm->asid_generation = 0;
P
Paolo Bonzini 已提交
1222
	init_vmcb(svm);
A
Avi Kivity 已提交
1223

1224
	svm_init_osvw(vcpu);
1225
	vcpu->arch.microcode_version = 0x01000065;
1226

1227
	return 0;
1228

1229 1230
free_page4:
	__free_page(hsave_page);
1231 1232 1233 1234 1235 1236
free_page3:
	__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
free_page2:
	__free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
free_page1:
	__free_page(page);
1237
out:
1238
	return err;
A
Avi Kivity 已提交
1239 1240
}

1241 1242 1243 1244 1245 1246 1247 1248
static void svm_clear_current_vmcb(struct vmcb *vmcb)
{
	int i;

	for_each_online_cpu(i)
		cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
}

A
Avi Kivity 已提交
1249 1250
static void svm_free_vcpu(struct kvm_vcpu *vcpu)
{
1251 1252
	struct vcpu_svm *svm = to_svm(vcpu);

1253 1254 1255 1256 1257 1258 1259
	/*
	 * The vmcb page can be recycled, causing a false negative in
	 * svm_vcpu_load(). So, ensure that no logical CPU has this
	 * vmcb page recorded as its current vmcb.
	 */
	svm_clear_current_vmcb(svm->vmcb);

1260
	__free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
1261
	__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
1262 1263
	__free_page(virt_to_page(svm->nested.hsave));
	__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
A
Avi Kivity 已提交
1264 1265
}

1266
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
A
Avi Kivity 已提交
1267
{
1268
	struct vcpu_svm *svm = to_svm(vcpu);
A
Ashok Raj 已提交
1269
	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
1270
	int i;
1271 1272

	if (unlikely(cpu != vcpu->cpu)) {
1273
		svm->asid_generation = 0;
1274
		mark_all_dirty(svm->vmcb);
1275
	}
1276

1277 1278 1279
#ifdef CONFIG_X86_64
	rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
#endif
1280 1281 1282 1283
	savesegment(fs, svm->host.fs);
	savesegment(gs, svm->host.gs);
	svm->host.ldt = kvm_read_ldt();

1284
	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1285
		rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1286

1287 1288 1289 1290 1291 1292
	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
		u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
		if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
			__this_cpu_write(current_tsc_ratio, tsc_ratio);
			wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
		}
1293
	}
P
Paolo Bonzini 已提交
1294 1295 1296
	/* This assumes that the kernel never uses MSR_TSC_AUX */
	if (static_cpu_has(X86_FEATURE_RDTSCP))
		wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
1297

A
Ashok Raj 已提交
1298 1299 1300 1301
	if (sd->current_vmcb != svm->vmcb) {
		sd->current_vmcb = svm->vmcb;
		indirect_branch_prediction_barrier();
	}
1302
	avic_vcpu_load(vcpu, cpu);
A
Avi Kivity 已提交
1303 1304 1305 1306
}

static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
1307
	struct vcpu_svm *svm = to_svm(vcpu);
1308 1309
	int i;

1310 1311
	avic_vcpu_put(vcpu);

1312
	++vcpu->stat.host_state_reload;
1313 1314 1315
	kvm_load_ldt(svm->host.ldt);
#ifdef CONFIG_X86_64
	loadsegment(fs, svm->host.fs);
1316
	wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
1317
	load_gs_index(svm->host.gs);
1318
#else
1319
#ifdef CONFIG_X86_32_LAZY_GS
1320
	loadsegment(gs, svm->host.gs);
1321
#endif
1322
#endif
1323
	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1324
		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
A
Avi Kivity 已提交
1325 1326 1327 1328
}

static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long rflags = svm->vmcb->save.rflags;

	if (svm->nmi_singlestep) {
		/* Hide our flags if they were not set by the guest */
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
			rflags &= ~X86_EFLAGS_TF;
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
			rflags &= ~X86_EFLAGS_RF;
	}
	return rflags;
A
Avi Kivity 已提交
1340 1341 1342 1343
}

static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
1344 1345 1346
	if (to_svm(vcpu)->nmi_singlestep)
		rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);

P
Paolo Bonzini 已提交
1347
       /*
A
Andrea Gelmini 已提交
1348
        * Any change of EFLAGS.VM is accompanied by a reload of SS
P
Paolo Bonzini 已提交
1349 1350 1351
        * (caused by either a task switch or an inter-privilege IRET),
        * so we do not need to update the CPL here.
        */
1352
	to_svm(vcpu)->vmcb->save.rflags = rflags;
A
Avi Kivity 已提交
1353 1354
}

A
Avi Kivity 已提交
1355 1356 1357 1358 1359
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
	switch (reg) {
	case VCPU_EXREG_PDPTR:
		BUG_ON(!npt_enabled);
1360
		load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
A
Avi Kivity 已提交
1361 1362
		break;
	default:
1363
		WARN_ON_ONCE(1);
A
Avi Kivity 已提交
1364 1365 1366
	}
}

1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385
static inline void svm_enable_vintr(struct vcpu_svm *svm)
{
	struct vmcb_control_area *control;

	/* The following fields are ignored when AVIC is enabled */
	WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu));

	/*
	 * This is just a dummy VINTR to actually cause a vmexit to happen.
	 * Actual injection of virtual interrupts happens through EVENTINJ.
	 */
	control = &svm->vmcb->control;
	control->int_vector = 0x0;
	control->int_ctl &= ~V_INTR_PRIO_MASK;
	control->int_ctl |= V_IRQ_MASK |
		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
	mark_dirty(svm->vmcb, VMCB_INTR);
}

1386 1387
static void svm_set_vintr(struct vcpu_svm *svm)
{
1388
	set_intercept(svm, INTERCEPT_VINTR);
1389 1390
	if (is_intercept(svm, INTERCEPT_VINTR))
		svm_enable_vintr(svm);
1391 1392 1393 1394
}

static void svm_clear_vintr(struct vcpu_svm *svm)
{
1395
	clr_intercept(svm, INTERCEPT_VINTR);
1396 1397 1398

	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
	mark_dirty(svm->vmcb, VMCB_INTR);
1399 1400
}

A
Avi Kivity 已提交
1401 1402
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
{
1403
	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
A
Avi Kivity 已提交
1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415

	switch (seg) {
	case VCPU_SREG_CS: return &save->cs;
	case VCPU_SREG_DS: return &save->ds;
	case VCPU_SREG_ES: return &save->es;
	case VCPU_SREG_FS: return &save->fs;
	case VCPU_SREG_GS: return &save->gs;
	case VCPU_SREG_SS: return &save->ss;
	case VCPU_SREG_TR: return &save->tr;
	case VCPU_SREG_LDTR: return &save->ldtr;
	}
	BUG();
A
Al Viro 已提交
1416
	return NULL;
A
Avi Kivity 已提交
1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440
}

static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	return s->base;
}

static void svm_get_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	var->base = s->base;
	var->limit = s->limit;
	var->selector = s->selector;
	var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
	var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
	var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
	var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
	var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
	var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1441 1442 1443 1444 1445 1446 1447 1448 1449 1450

	/*
	 * AMD CPUs circa 2014 track the G bit for all segments except CS.
	 * However, the SVM spec states that the G bit is not observed by the
	 * CPU, and some VMware virtual CPUs drop the G bit for all segments.
	 * So let's synthesize a legal G bit for all segments, this helps
	 * running KVM nested. It also helps cross-vendor migration, because
	 * Intel's vmentry has a check on the 'G' bit.
	 */
	var->g = s->limit > 0xfffff;
1451

J
Joerg Roedel 已提交
1452 1453
	/*
	 * AMD's VMCB does not have an explicit unusable field, so emulate it
1454 1455
	 * for cross vendor migration purposes by "not present"
	 */
1456
	var->unusable = !var->present;
1457

1458 1459 1460 1461 1462 1463
	switch (seg) {
	case VCPU_SREG_TR:
		/*
		 * Work around a bug where the busy flag in the tr selector
		 * isn't exposed
		 */
1464
		var->type |= 0x2;
1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479
		break;
	case VCPU_SREG_DS:
	case VCPU_SREG_ES:
	case VCPU_SREG_FS:
	case VCPU_SREG_GS:
		/*
		 * The accessed bit must always be set in the segment
		 * descriptor cache, although it can be cleared in the
		 * descriptor, the cached bit always remains at 1. Since
		 * Intel has a check on this, set it here to support
		 * cross-vendor migration.
		 */
		if (!var->unusable)
			var->type |= 0x1;
		break;
1480
	case VCPU_SREG_SS:
J
Joerg Roedel 已提交
1481 1482
		/*
		 * On AMD CPUs sometimes the DB bit in the segment
1483 1484 1485 1486 1487 1488
		 * descriptor is left as 1, although the whole segment has
		 * been made unusable. Clear it here to pass an Intel VMX
		 * entry check when cross vendor migrating.
		 */
		if (var->unusable)
			var->db = 0;
1489
		/* This is symmetric with svm_set_segment() */
J
Jan Kiszka 已提交
1490
		var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1491
		break;
1492
	}
A
Avi Kivity 已提交
1493 1494
}

1495 1496 1497 1498 1499 1500 1501
static int svm_get_cpl(struct kvm_vcpu *vcpu)
{
	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;

	return save->cpl;
}

1502
static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
A
Avi Kivity 已提交
1503
{
1504 1505
	struct vcpu_svm *svm = to_svm(vcpu);

1506 1507
	dt->size = svm->vmcb->save.idtr.limit;
	dt->address = svm->vmcb->save.idtr.base;
A
Avi Kivity 已提交
1508 1509
}

1510
static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
A
Avi Kivity 已提交
1511
{
1512 1513
	struct vcpu_svm *svm = to_svm(vcpu);

1514 1515
	svm->vmcb->save.idtr.limit = dt->size;
	svm->vmcb->save.idtr.base = dt->address ;
1516
	mark_dirty(svm->vmcb, VMCB_DT);
A
Avi Kivity 已提交
1517 1518
}

1519
static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
A
Avi Kivity 已提交
1520
{
1521 1522
	struct vcpu_svm *svm = to_svm(vcpu);

1523 1524
	dt->size = svm->vmcb->save.gdtr.limit;
	dt->address = svm->vmcb->save.gdtr.base;
A
Avi Kivity 已提交
1525 1526
}

1527
static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
A
Avi Kivity 已提交
1528
{
1529 1530
	struct vcpu_svm *svm = to_svm(vcpu);

1531 1532
	svm->vmcb->save.gdtr.limit = dt->size;
	svm->vmcb->save.gdtr.base = dt->address ;
1533
	mark_dirty(svm->vmcb, VMCB_DT);
A
Avi Kivity 已提交
1534 1535
}

1536 1537 1538 1539
static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
}

1540
static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1541 1542 1543
{
}

A
Avi Kivity 已提交
1544 1545 1546 1547 1548
static void update_cr0_intercept(struct vcpu_svm *svm)
{
	ulong gcr0 = svm->vcpu.arch.cr0;
	u64 *hcr0 = &svm->vmcb->save.cr0;

1549 1550
	*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
		| (gcr0 & SVM_CR0_SELECTIVE_MASK);
A
Avi Kivity 已提交
1551

1552
	mark_dirty(svm->vmcb, VMCB_CR);
A
Avi Kivity 已提交
1553

1554
	if (gcr0 == *hcr0) {
1555 1556
		clr_cr_intercept(svm, INTERCEPT_CR0_READ);
		clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
A
Avi Kivity 已提交
1557
	} else {
1558 1559
		set_cr_intercept(svm, INTERCEPT_CR0_READ);
		set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
A
Avi Kivity 已提交
1560 1561 1562
	}
}

1563
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
A
Avi Kivity 已提交
1564
{
1565 1566
	struct vcpu_svm *svm = to_svm(vcpu);

1567
#ifdef CONFIG_X86_64
1568
	if (vcpu->arch.efer & EFER_LME) {
1569
		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1570
			vcpu->arch.efer |= EFER_LMA;
1571
			svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
A
Avi Kivity 已提交
1572 1573
		}

M
Mike Day 已提交
1574
		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1575
			vcpu->arch.efer &= ~EFER_LMA;
1576
			svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
A
Avi Kivity 已提交
1577 1578 1579
		}
	}
#endif
1580
	vcpu->arch.cr0 = cr0;
1581 1582 1583

	if (!npt_enabled)
		cr0 |= X86_CR0_PG | X86_CR0_WP;
1584

1585 1586 1587 1588 1589 1590 1591
	/*
	 * re-enable caching here because the QEMU bios
	 * does not do it - this results in some delay at
	 * reboot
	 */
	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
		cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1592
	svm->vmcb->save.cr0 = cr0;
1593
	mark_dirty(svm->vmcb, VMCB_CR);
A
Avi Kivity 已提交
1594
	update_cr0_intercept(svm);
A
Avi Kivity 已提交
1595 1596
}

1597
int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
A
Avi Kivity 已提交
1598
{
1599
	unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1600 1601
	unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;

1602 1603 1604
	if (cr4 & X86_CR4_VMXE)
		return 1;

1605
	if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1606
		svm_flush_tlb(vcpu, true);
1607

1608 1609 1610
	vcpu->arch.cr4 = cr4;
	if (!npt_enabled)
		cr4 |= X86_CR4_PAE;
1611
	cr4 |= host_cr4_mce;
1612
	to_svm(vcpu)->vmcb->save.cr4 = cr4;
1613
	mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1614
	return 0;
A
Avi Kivity 已提交
1615 1616 1617 1618 1619
}

static void svm_set_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
1620
	struct vcpu_svm *svm = to_svm(vcpu);
A
Avi Kivity 已提交
1621 1622 1623 1624 1625
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	s->base = var->base;
	s->limit = var->limit;
	s->selector = var->selector;
1626 1627 1628 1629 1630 1631 1632 1633
	s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
	s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
	s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
	s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
	s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
	s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
	s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
	s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
P
Paolo Bonzini 已提交
1634 1635 1636 1637 1638 1639 1640 1641

	/*
	 * This is always accurate, except if SYSRET returned to a segment
	 * with SS.DPL != 3.  Intel does not have this quirk, and always
	 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
	 * would entail passing the CPL to userspace and back.
	 */
	if (seg == VCPU_SREG_SS)
1642 1643
		/* This is symmetric with svm_get_segment() */
		svm->vmcb->save.cpl = (var->dpl & 3);
A
Avi Kivity 已提交
1644

1645
	mark_dirty(svm->vmcb, VMCB_SEG);
A
Avi Kivity 已提交
1646 1647
}

1648
static void update_bp_intercept(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
1649
{
J
Jan Kiszka 已提交
1650 1651
	struct vcpu_svm *svm = to_svm(vcpu);

1652
	clr_exception_intercept(svm, BP_VECTOR);
1653

J
Jan Kiszka 已提交
1654 1655
	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1656
			set_exception_intercept(svm, BP_VECTOR);
J
Jan Kiszka 已提交
1657 1658
	} else
		vcpu->guest_debug = 0;
1659 1660
}

1661
static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
A
Avi Kivity 已提交
1662
{
1663 1664
	if (sd->next_asid > sd->max_asid) {
		++sd->asid_generation;
1665
		sd->next_asid = sd->min_asid;
1666
		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
A
Avi Kivity 已提交
1667 1668
	}

1669 1670
	svm->asid_generation = sd->asid_generation;
	svm->vmcb->control.asid = sd->next_asid++;
1671 1672

	mark_dirty(svm->vmcb, VMCB_ASID);
A
Avi Kivity 已提交
1673 1674
}

1675
static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
J
Jan Kiszka 已提交
1676
{
1677
	struct vmcb *vmcb = svm->vmcb;
J
Jan Kiszka 已提交
1678

1679 1680 1681 1682
	if (unlikely(value != vmcb->save.dr6)) {
		vmcb->save.dr6 = value;
		mark_dirty(vmcb, VMCB_DR);
	}
J
Jan Kiszka 已提交
1683 1684
}

1685 1686 1687 1688 1689 1690 1691 1692
static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	get_debugreg(vcpu->arch.db[0], 0);
	get_debugreg(vcpu->arch.db[1], 1);
	get_debugreg(vcpu->arch.db[2], 2);
	get_debugreg(vcpu->arch.db[3], 3);
1693 1694 1695 1696
	/*
	 * We cannot reset svm->vmcb->save.dr6 to DR6_FIXED_1|DR6_RTM here,
	 * because db_interception might need it.  We can do it before vmentry.
	 */
1697
	vcpu->arch.dr6 = svm->vmcb->save.dr6;
1698 1699 1700 1701 1702
	vcpu->arch.dr7 = svm->vmcb->save.dr7;
	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
	set_dr_intercepts(svm);
}

1703
static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
A
Avi Kivity 已提交
1704
{
1705 1706
	struct vcpu_svm *svm = to_svm(vcpu);

1707
	svm->vmcb->save.dr7 = value;
1708
	mark_dirty(svm->vmcb, VMCB_DR);
A
Avi Kivity 已提交
1709 1710
}

A
Avi Kivity 已提交
1711
static int pf_interception(struct vcpu_svm *svm)
A
Avi Kivity 已提交
1712
{
1713
	u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
1714
	u64 error_code = svm->vmcb->control.exit_info_1;
A
Avi Kivity 已提交
1715

1716
	return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
1717 1718
			static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
			svm->vmcb->control.insn_bytes : NULL,
1719 1720 1721 1722 1723
			svm->vmcb->control.insn_len);
}

static int npf_interception(struct vcpu_svm *svm)
{
1724
	u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
1725 1726 1727 1728
	u64 error_code = svm->vmcb->control.exit_info_1;

	trace_kvm_page_fault(fault_address, error_code);
	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
1729 1730
			static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
			svm->vmcb->control.insn_bytes : NULL,
1731
			svm->vmcb->control.insn_len);
A
Avi Kivity 已提交
1732 1733
}

static int db_interception(struct vcpu_svm *svm)
{
	struct kvm_run *kvm_run = svm->vcpu.run;
	struct kvm_vcpu *vcpu = &svm->vcpu;

	if (!(svm->vcpu.guest_debug &
	      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
		!svm->nmi_singlestep) {
		u32 payload = (svm->vmcb->save.dr6 ^ DR6_RTM) & ~DR6_FIXED_1;
		kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload);
		return 1;
	}

	if (svm->nmi_singlestep) {
		disable_nmi_singlestep(svm);
		/* Make sure we check for pending NMIs upon entry */
		kvm_make_request(KVM_REQ_EVENT, vcpu);
	}

	if (svm->vcpu.guest_debug &
	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
		kvm_run->exit_reason = KVM_EXIT_DEBUG;
		kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
		kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
		kvm_run->debug.arch.pc =
			svm->vmcb->save.cs.base + svm->vmcb->save.rip;
		kvm_run->debug.arch.exception = DB_VECTOR;
		return 0;
	}

	return 1;
}

static int bp_interception(struct vcpu_svm *svm)
{
	struct kvm_run *kvm_run = svm->vcpu.run;

	kvm_run->exit_reason = KVM_EXIT_DEBUG;
	kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
	kvm_run->debug.arch.exception = BP_VECTOR;
	return 0;
}

static int ud_interception(struct vcpu_svm *svm)
{
	return handle_ud(&svm->vcpu);
}

static int ac_interception(struct vcpu_svm *svm)
{
	kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
	return 1;
}

static int gp_interception(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	u32 error_code = svm->vmcb->control.exit_info_1;

	WARN_ON_ONCE(!enable_vmware_backdoor);

	/*
	 * VMware backdoor emulation on #GP interception only handles IN{S},
	 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
	 */
	if (error_code) {
		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
		return 1;
	}
	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
}

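/*
 * Check whether the pending machine check matches the MC0 status
 * signature of AMD erratum 383; used by svm_handle_mce() below.
 */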
static bool is_erratum_383(void)
{
	int err, i;
	u64 value;

	if (!erratum_383_found)
		return false;

	value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
	if (err)
		return false;

	/* Bit 62 may or may not be set for this mce */
	value &= ~(1ULL << 62);

	if (value != 0xb600000000010015ULL)
		return false;

	/* Clear MCi_STATUS registers */
	for (i = 0; i < 6; ++i)
		native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);

	value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
	if (!err) {
		u32 low, high;

		value &= ~(1ULL << 2);
		low    = lower_32_bits(value);
		high   = upper_32_bits(value);

		native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
	}

	/* Flush tlb to evict multi-match entries */
	__flush_tlb_all();

	return true;
}

static void svm_handle_mce(struct vcpu_svm *svm)
{
	if (is_erratum_383()) {
		/*
		 * Erratum 383 triggered. Guest state is corrupt so kill the
		 * guest.
		 */
		pr_err("KVM: Guest triggered AMD Erratum 383\n");

		kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);

		return;
	}

	/*
	 * On an #MC intercept the MCE handler is not called automatically in
	 * the host. So do it by hand here.
	 */
	asm volatile (
		"int $0x12\n");
	/* not sure if we ever come back to this point */

	return;
}

static int mc_interception(struct vcpu_svm *svm)
{
	return 1;
}

static int shutdown_interception(struct vcpu_svm *svm)
{
	struct kvm_run *kvm_run = svm->vcpu.run;

	/*
	 * VMCB is undefined after a SHUTDOWN intercept
	 * so reinitialize it.
	 */
	clear_page(svm->vmcb);
	init_vmcb(svm);

	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
	return 0;
}

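/*
 * IN/OUT intercepts: string I/O goes through the instruction emulator,
 * simple port I/O takes the kvm_fast_pio() path.
 */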
static int io_interception(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
	int size, in, string;
	unsigned port;

	++svm->vcpu.stat.io_exits;
	string = (io_info & SVM_IOIO_STR_MASK) != 0;
	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
	if (string)
		return kvm_emulate_instruction(vcpu, 0);

	port = io_info >> 16;
	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
	svm->next_rip = svm->vmcb->control.exit_info_2;

	return kvm_fast_pio(&svm->vcpu, size, port, in);
}

static int nmi_interception(struct vcpu_svm *svm)
{
	return 1;
}

static int intr_interception(struct vcpu_svm *svm)
{
	++svm->vcpu.stat.irq_exits;
	return 1;
}

static int nop_on_interception(struct vcpu_svm *svm)
{
	return 1;
}

static int halt_interception(struct vcpu_svm *svm)
{
	return kvm_emulate_halt(&svm->vcpu);
}

static int vmmcall_interception(struct vcpu_svm *svm)
{
	return kvm_emulate_hypercall(&svm->vcpu);
}

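/*
 * VMLOAD/VMSAVE emulation: map the guest-provided VMCB page and copy the
 * extra segment and MSR state in the direction the instruction requests.
 */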
static int vmload_interception(struct vcpu_svm *svm)
{
	struct vmcb *nested_vmcb;
	struct kvm_host_map map;
	int ret;

	if (nested_svm_check_permissions(svm))
		return 1;

	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
	if (ret) {
		if (ret == -EINVAL)
			kvm_inject_gp(&svm->vcpu, 0);
		return 1;
	}

	nested_vmcb = map.hva;

	ret = kvm_skip_emulated_instruction(&svm->vcpu);

	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
	kvm_vcpu_unmap(&svm->vcpu, &map, true);

	return ret;
}

static int vmsave_interception(struct vcpu_svm *svm)
{
	struct vmcb *nested_vmcb;
	struct kvm_host_map map;
	int ret;

	if (nested_svm_check_permissions(svm))
		return 1;

	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
	if (ret) {
		if (ret == -EINVAL)
			kvm_inject_gp(&svm->vcpu, 0);
		return 1;
	}

	nested_vmcb = map.hva;

	ret = kvm_skip_emulated_instruction(&svm->vcpu);

	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
	kvm_vcpu_unmap(&svm->vcpu, &map, true);

	return ret;
}

static int vmrun_interception(struct vcpu_svm *svm)
{
	if (nested_svm_check_permissions(svm))
		return 1;

	return nested_svm_vmrun(svm);
}

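/* STGI and CLGI toggle the guest's global interrupt flag (GIF). */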
static int stgi_interception(struct vcpu_svm *svm)
{
	int ret;

	if (nested_svm_check_permissions(svm))
		return 1;

	/*
	 * If VGIF is enabled, the STGI intercept is only added to
	 * detect the opening of the SMI/NMI window; remove it now.
	 */
	if (vgif_enabled(svm))
		clr_intercept(svm, INTERCEPT_STGI);

	ret = kvm_skip_emulated_instruction(&svm->vcpu);
	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);

	enable_gif(svm);

	return ret;
}

static int clgi_interception(struct vcpu_svm *svm)
{
	int ret;

	if (nested_svm_check_permissions(svm))
		return 1;

	ret = kvm_skip_emulated_instruction(&svm->vcpu);

	disable_gif(svm);

	/* After a CLGI no interrupts should come */
	if (!kvm_vcpu_apicv_active(&svm->vcpu))
		svm_clear_vintr(svm);

	return ret;
}

static int invlpga_interception(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;

	trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu),
			  kvm_rax_read(&svm->vcpu));

	/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
	kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu));

	return kvm_skip_emulated_instruction(&svm->vcpu);
}

static int skinit_interception(struct vcpu_svm *svm)
{
	trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu));

	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
	return 1;
}

static int wbinvd_interception(struct vcpu_svm *svm)
{
	return kvm_emulate_wbinvd(&svm->vcpu);
}

static int xsetbv_interception(struct vcpu_svm *svm)
{
	u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
	u32 index = kvm_rcx_read(&svm->vcpu);

	if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
		return kvm_skip_emulated_instruction(&svm->vcpu);
	}

	return 1;
}

static int rdpru_interception(struct vcpu_svm *svm)
{
	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
	return 1;
}

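/*
 * Task switch intercept: recover the cause (gate, IRET, jump or call) and
 * any pending error code, then hand off to the common task-switch emulation.
 */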
static int task_switch_interception(struct vcpu_svm *svm)
{
	u16 tss_selector;
	int reason;
	int int_type = svm->vmcb->control.exit_int_info &
		SVM_EXITINTINFO_TYPE_MASK;
	int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
	uint32_t type =
		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
	uint32_t idt_v =
		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
	bool has_error_code = false;
	u32 error_code = 0;

	tss_selector = (u16)svm->vmcb->control.exit_info_1;

	if (svm->vmcb->control.exit_info_2 &
	    (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
		reason = TASK_SWITCH_IRET;
	else if (svm->vmcb->control.exit_info_2 &
		 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
		reason = TASK_SWITCH_JMP;
	else if (idt_v)
		reason = TASK_SWITCH_GATE;
	else
		reason = TASK_SWITCH_CALL;

	if (reason == TASK_SWITCH_GATE) {
		switch (type) {
		case SVM_EXITINTINFO_TYPE_NMI:
			svm->vcpu.arch.nmi_injected = false;
			break;
		case SVM_EXITINTINFO_TYPE_EXEPT:
			if (svm->vmcb->control.exit_info_2 &
			    (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
				has_error_code = true;
				error_code =
					(u32)svm->vmcb->control.exit_info_2;
			}
			kvm_clear_exception_queue(&svm->vcpu);
			break;
		case SVM_EXITINTINFO_TYPE_INTR:
			kvm_clear_interrupt_queue(&svm->vcpu);
			break;
		default:
			break;
		}
	}

	if (reason != TASK_SWITCH_GATE ||
	    int_type == SVM_EXITINTINFO_TYPE_SOFT ||
	    (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
		if (!skip_emulated_instruction(&svm->vcpu))
			return 0;
	}

	if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
		int_vec = -1;

	return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
			       has_error_code, error_code);
}

static int cpuid_interception(struct vcpu_svm *svm)
{
	return kvm_emulate_cpuid(&svm->vcpu);
}

static int iret_interception(struct vcpu_svm *svm)
{
	++svm->vcpu.stat.nmi_window_exits;
	clr_intercept(svm, INTERCEPT_IRET);
	svm->vcpu.arch.hflags |= HF_IRET_MASK;
	svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
	return 1;
}

static int invlpg_interception(struct vcpu_svm *svm)
{
	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
		return kvm_emulate_instruction(&svm->vcpu, 0);

	kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
	return kvm_skip_emulated_instruction(&svm->vcpu);
}

static int emulate_on_interception(struct vcpu_svm *svm)
{
	return kvm_emulate_instruction(&svm->vcpu, 0);
}

static int rsm_interception(struct vcpu_svm *svm)
{
	return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2);
}

static int rdpmc_interception(struct vcpu_svm *svm)
{
	int err;

	if (!nrips)
		return emulate_on_interception(svm);

	err = kvm_rdpmc(&svm->vcpu);
	return kvm_complete_insn_gp(&svm->vcpu, err);
}

static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
					    unsigned long val)
{
	unsigned long cr0 = svm->vcpu.arch.cr0;
	bool ret = false;
	u64 intercept;

	intercept = svm->nested.intercept;

	if (!is_guest_mode(&svm->vcpu) ||
	    (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
		return false;

	cr0 &= ~SVM_CR0_SELECTIVE_MASK;
	val &= ~SVM_CR0_SELECTIVE_MASK;

	if (cr0 ^ val) {
		svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
		ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
	}

	return ret;
}

#define CR_VALID (1ULL << 63)

static int cr_interception(struct vcpu_svm *svm)
{
	int reg, cr;
	unsigned long val;
	int err;

	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
		return emulate_on_interception(svm);

	if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
		return emulate_on_interception(svm);

	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
	if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
		cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
	else
		cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;

	err = 0;
	if (cr >= 16) { /* mov to cr */
		cr -= 16;
		val = kvm_register_read(&svm->vcpu, reg);
		switch (cr) {
		case 0:
			if (!check_selective_cr0_intercepted(svm, val))
				err = kvm_set_cr0(&svm->vcpu, val);
			else
				return 1;

			break;
		case 3:
			err = kvm_set_cr3(&svm->vcpu, val);
			break;
		case 4:
			err = kvm_set_cr4(&svm->vcpu, val);
			break;
		case 8:
			err = kvm_set_cr8(&svm->vcpu, val);
			break;
		default:
			WARN(1, "unhandled write to CR%d", cr);
			kvm_queue_exception(&svm->vcpu, UD_VECTOR);
			return 1;
		}
	} else { /* mov from cr */
		switch (cr) {
		case 0:
			val = kvm_read_cr0(&svm->vcpu);
			break;
		case 2:
			val = svm->vcpu.arch.cr2;
			break;
		case 3:
			val = kvm_read_cr3(&svm->vcpu);
			break;
		case 4:
			val = kvm_read_cr4(&svm->vcpu);
			break;
		case 8:
			val = kvm_get_cr8(&svm->vcpu);
			break;
		default:
			WARN(1, "unhandled read from CR%d", cr);
			kvm_queue_exception(&svm->vcpu, UD_VECTOR);
			return 1;
		}
		kvm_register_write(&svm->vcpu, reg, val);
	}
	return kvm_complete_insn_gp(&svm->vcpu, err);
}

static int dr_interception(struct vcpu_svm *svm)
{
	int reg, dr;
	unsigned long val;

	if (svm->vcpu.guest_debug == 0) {
		/*
		 * No more DR vmexits; force a reload of the debug registers
		 * and reenter on this instruction.  The next vmexit will
		 * retrieve the full state of the debug registers.
		 */
		clr_dr_intercepts(svm);
		svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
		return 1;
	}

	if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
		return emulate_on_interception(svm);

	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
	dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;

	if (dr >= 16) { /* mov to DRn */
		if (!kvm_require_dr(&svm->vcpu, dr - 16))
			return 1;
		val = kvm_register_read(&svm->vcpu, reg);
		kvm_set_dr(&svm->vcpu, dr - 16, val);
	} else {
		if (!kvm_require_dr(&svm->vcpu, dr))
			return 1;
		kvm_get_dr(&svm->vcpu, dr, &val);
		kvm_register_write(&svm->vcpu, reg, val);
	}

	return kvm_skip_emulated_instruction(&svm->vcpu);
}

static int cr8_write_interception(struct vcpu_svm *svm)
{
	struct kvm_run *kvm_run = svm->vcpu.run;
	int r;

	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
	/* instruction emulation calls kvm_set_cr8() */
	r = cr_interception(svm);
	if (lapic_in_kernel(&svm->vcpu))
		return r;
	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
		return r;
	kvm_run->exit_reason = KVM_EXIT_SET_TPR;
	return 0;
}

static int svm_get_msr_feature(struct kvm_msr_entry *msr)
{
	msr->data = 0;

	switch (msr->index) {
	case MSR_F10H_DECFG:
		if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
			msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
		break;
	default:
		return 1;
	}

	return 0;
}

2355
static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
A
Avi Kivity 已提交
2356
{
2357 2358
	struct vcpu_svm *svm = to_svm(vcpu);

2359
	switch (msr_info->index) {
B
Brian Gerst 已提交
2360
	case MSR_STAR:
2361
		msr_info->data = svm->vmcb->save.star;
A
Avi Kivity 已提交
2362
		break;
2363
#ifdef CONFIG_X86_64
A
Avi Kivity 已提交
2364
	case MSR_LSTAR:
2365
		msr_info->data = svm->vmcb->save.lstar;
A
Avi Kivity 已提交
2366 2367
		break;
	case MSR_CSTAR:
2368
		msr_info->data = svm->vmcb->save.cstar;
A
Avi Kivity 已提交
2369 2370
		break;
	case MSR_KERNEL_GS_BASE:
2371
		msr_info->data = svm->vmcb->save.kernel_gs_base;
A
Avi Kivity 已提交
2372 2373
		break;
	case MSR_SYSCALL_MASK:
2374
		msr_info->data = svm->vmcb->save.sfmask;
A
Avi Kivity 已提交
2375 2376 2377
		break;
#endif
	case MSR_IA32_SYSENTER_CS:
2378
		msr_info->data = svm->vmcb->save.sysenter_cs;
A
Avi Kivity 已提交
2379 2380
		break;
	case MSR_IA32_SYSENTER_EIP:
2381
		msr_info->data = svm->sysenter_eip;
A
Avi Kivity 已提交
2382 2383
		break;
	case MSR_IA32_SYSENTER_ESP:
2384
		msr_info->data = svm->sysenter_esp;
A
Avi Kivity 已提交
2385
		break;
P
Paolo Bonzini 已提交
2386 2387 2388 2389 2390
	case MSR_TSC_AUX:
		if (!boot_cpu_has(X86_FEATURE_RDTSCP))
			return 1;
		msr_info->data = svm->tsc_aux;
		break;
J
Joerg Roedel 已提交
2391 2392 2393 2394 2395
	/*
	 * Nobody will change the following 5 values in the VMCB so we can
	 * safely return them on rdmsr. They will always be 0 until LBRV is
	 * implemented.
	 */
2396
	case MSR_IA32_DEBUGCTLMSR:
2397
		msr_info->data = svm->vmcb->save.dbgctl;
2398 2399
		break;
	case MSR_IA32_LASTBRANCHFROMIP:
2400
		msr_info->data = svm->vmcb->save.br_from;
2401 2402
		break;
	case MSR_IA32_LASTBRANCHTOIP:
2403
		msr_info->data = svm->vmcb->save.br_to;
2404 2405
		break;
	case MSR_IA32_LASTINTFROMIP:
2406
		msr_info->data = svm->vmcb->save.last_excp_from;
2407 2408
		break;
	case MSR_IA32_LASTINTTOIP:
2409
		msr_info->data = svm->vmcb->save.last_excp_to;
2410
		break;
A
Alexander Graf 已提交
2411
	case MSR_VM_HSAVE_PA:
2412
		msr_info->data = svm->nested.hsave_msr;
A
Alexander Graf 已提交
2413
		break;
2414
	case MSR_VM_CR:
2415
		msr_info->data = svm->nested.vm_cr_msr;
2416
		break;
2417 2418
	case MSR_IA32_SPEC_CTRL:
		if (!msr_info->host_initiated &&
2419 2420
		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
2421 2422
		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
2423 2424 2425 2426
			return 1;

		msr_info->data = svm->spec_ctrl;
		break;
2427 2428 2429 2430 2431 2432 2433
	case MSR_AMD64_VIRT_SPEC_CTRL:
		if (!msr_info->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
			return 1;

		msr_info->data = svm->virt_spec_ctrl;
		break;
	case MSR_F15H_IC_CFG: {

		int family, model;

		family = guest_cpuid_family(vcpu);
		model  = guest_cpuid_model(vcpu);

		if (family < 0 || model < 0)
			return kvm_get_msr_common(vcpu, msr_info);

		msr_info->data = 0;

		if (family == 0x15 &&
		    (model >= 0x2 && model < 0x20))
			msr_info->data = 0x1E;
		}
		break;
2451 2452 2453
	case MSR_F10H_DECFG:
		msr_info->data = svm->msr_decfg;
		break;
A
Avi Kivity 已提交
2454
	default:
2455
		return kvm_get_msr_common(vcpu, msr_info);
A
Avi Kivity 已提交
2456 2457 2458 2459
	}
	return 0;
}

A
Avi Kivity 已提交
2460
static int rdmsr_interception(struct vcpu_svm *svm)
A
Avi Kivity 已提交
2461
{
2462
	return kvm_emulate_rdmsr(&svm->vcpu);
A
Avi Kivity 已提交
2463 2464
}

static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int svm_dis, chg_mask;

	if (data & ~SVM_VM_CR_VALID_MASK)
		return 1;

	chg_mask = SVM_VM_CR_VALID_MASK;

	if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
		chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);

	svm->nested.vm_cr_msr &= ~chg_mask;
	svm->nested.vm_cr_msr |= (data & chg_mask);

	svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;

	/* check for svm_disable while efer.svme is set */
	if (svm_dis && (vcpu->arch.efer & EFER_SVME))
		return 1;

	return 0;
}

2490
static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
A
Avi Kivity 已提交
2491
{
2492 2493
	struct vcpu_svm *svm = to_svm(vcpu);

2494 2495
	u32 ecx = msr->index;
	u64 data = msr->data;
A
Avi Kivity 已提交
2496
	switch (ecx) {
	case MSR_IA32_CR_PAT:
		if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
			return 1;
		vcpu->arch.pat = data;
		svm->vmcb->save.g_pat = data;
		mark_dirty(svm->vmcb, VMCB_NPT);
		break;
2504 2505
	case MSR_IA32_SPEC_CTRL:
		if (!msr->host_initiated &&
2506 2507
		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
2508 2509
		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
2510 2511
			return 1;

2512
		if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
			return 1;

		svm->spec_ctrl = data;
		if (!data)
			break;

		/*
		 * For non-nested:
		 * When it's written (to non-zero) for the first time, pass
		 * it through.
		 *
		 * For nested:
		 * The handling of the MSR bitmap for L2 guests is done in
		 * nested_svm_vmrun_msrpm.
		 * We update the L1 MSR bit as well since it will end up
		 * touching the MSR anyway now.
		 */
		set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
		break;
A
Ashok Raj 已提交
2532 2533
	case MSR_IA32_PRED_CMD:
		if (!msr->host_initiated &&
2534
		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
A
Ashok Raj 已提交
2535 2536 2537 2538
			return 1;

		if (data & ~PRED_CMD_IBPB)
			return 1;
2539 2540
		if (!boot_cpu_has(X86_FEATURE_AMD_IBPB))
			return 1;
A
Ashok Raj 已提交
2541 2542 2543 2544 2545 2546
		if (!data)
			break;

		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
		set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
		break;
2547 2548 2549 2550 2551 2552 2553 2554 2555 2556
	case MSR_AMD64_VIRT_SPEC_CTRL:
		if (!msr->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
			return 1;

		if (data & ~SPEC_CTRL_SSBD)
			return 1;

		svm->virt_spec_ctrl = data;
		break;
B
Brian Gerst 已提交
2557
	case MSR_STAR:
2558
		svm->vmcb->save.star = data;
A
Avi Kivity 已提交
2559
		break;
2560
#ifdef CONFIG_X86_64
A
Avi Kivity 已提交
2561
	case MSR_LSTAR:
2562
		svm->vmcb->save.lstar = data;
A
Avi Kivity 已提交
2563 2564
		break;
	case MSR_CSTAR:
2565
		svm->vmcb->save.cstar = data;
A
Avi Kivity 已提交
2566 2567
		break;
	case MSR_KERNEL_GS_BASE:
2568
		svm->vmcb->save.kernel_gs_base = data;
A
Avi Kivity 已提交
2569 2570
		break;
	case MSR_SYSCALL_MASK:
2571
		svm->vmcb->save.sfmask = data;
A
Avi Kivity 已提交
2572 2573 2574
		break;
#endif
	case MSR_IA32_SYSENTER_CS:
2575
		svm->vmcb->save.sysenter_cs = data;
A
Avi Kivity 已提交
2576 2577
		break;
	case MSR_IA32_SYSENTER_EIP:
2578
		svm->sysenter_eip = data;
2579
		svm->vmcb->save.sysenter_eip = data;
A
Avi Kivity 已提交
2580 2581
		break;
	case MSR_IA32_SYSENTER_ESP:
2582
		svm->sysenter_esp = data;
2583
		svm->vmcb->save.sysenter_esp = data;
A
Avi Kivity 已提交
2584
		break;
P
Paolo Bonzini 已提交
2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596
	case MSR_TSC_AUX:
		if (!boot_cpu_has(X86_FEATURE_RDTSCP))
			return 1;

		/*
		 * This is rare, so we update the MSR here instead of using
		 * direct_access_msrs.  Doing that would require a rdmsr in
		 * svm_vcpu_put.
		 */
		svm->tsc_aux = data;
		wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
		break;
2597
	case MSR_IA32_DEBUGCTLMSR:
2598
		if (!boot_cpu_has(X86_FEATURE_LBRV)) {
2599 2600
			vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
				    __func__, data);
2601 2602 2603 2604 2605 2606
			break;
		}
		if (data & DEBUGCTL_RESERVED_BITS)
			return 1;

		svm->vmcb->save.dbgctl = data;
2607
		mark_dirty(svm->vmcb, VMCB_LBR);
2608 2609 2610 2611
		if (data & (1ULL<<0))
			svm_enable_lbrv(svm);
		else
			svm_disable_lbrv(svm);
2612
		break;
A
Alexander Graf 已提交
2613
	case MSR_VM_HSAVE_PA:
2614
		svm->nested.hsave_msr = data;
2615
		break;
2616
	case MSR_VM_CR:
2617
		return svm_set_vm_cr(vcpu, data);
2618
	case MSR_VM_IGNNE:
2619
		vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
2620
		break;
	case MSR_F10H_DECFG: {
		struct kvm_msr_entry msr_entry;

		msr_entry.index = msr->index;
		if (svm_get_msr_feature(&msr_entry))
			return 1;

		/* Check the supported bits */
		if (data & ~msr_entry.data)
			return 1;

		/* Don't allow the guest to change a bit, #GP */
		if (!msr->host_initiated && (data ^ msr_entry.data))
			return 1;

		svm->msr_decfg = data;
		break;
	}
2639 2640 2641
	case MSR_IA32_APICBASE:
		if (kvm_vcpu_apicv_active(vcpu))
			avic_update_vapic_bar(to_svm(vcpu), data);
2642
		/* Fall through */
A
Avi Kivity 已提交
2643
	default:
2644
		return kvm_set_msr_common(vcpu, msr);
A
Avi Kivity 已提交
2645 2646 2647 2648
	}
	return 0;
}

A
Avi Kivity 已提交
2649
static int wrmsr_interception(struct vcpu_svm *svm)
A
Avi Kivity 已提交
2650
{
2651
	return kvm_emulate_wrmsr(&svm->vcpu);
A
Avi Kivity 已提交
2652 2653
}

A
Avi Kivity 已提交
2654
static int msr_interception(struct vcpu_svm *svm)
A
Avi Kivity 已提交
2655
{
R
Rusty Russell 已提交
2656
	if (svm->vmcb->control.exit_info_1)
A
Avi Kivity 已提交
2657
		return wrmsr_interception(svm);
A
Avi Kivity 已提交
2658
	else
A
Avi Kivity 已提交
2659
		return rdmsr_interception(svm);
A
Avi Kivity 已提交
2660 2661
}

A
Avi Kivity 已提交
2662
static int interrupt_window_interception(struct vcpu_svm *svm)
2663
{
2664
	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2665
	svm_clear_vintr(svm);
2666 2667 2668 2669 2670 2671 2672 2673

	/*
	 * For AVIC, the only reason to end up here is ExtINTs.
	 * In this case AVIC was temporarily disabled for
	 * requesting the IRQ window and we have to re-enable it.
	 */
	svm_toggle_avic_for_irq_window(&svm->vcpu, true);

2674
	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2675
	mark_dirty(svm->vmcb, VMCB_INTR);
2676
	++svm->vcpu.stat.irq_window_exits;
2677 2678 2679
	return 1;
}

2680 2681
static int pause_interception(struct vcpu_svm *svm)
{
2682 2683 2684
	struct kvm_vcpu *vcpu = &svm->vcpu;
	bool in_kernel = (svm_get_cpl(vcpu) == 0);

2685 2686 2687
	if (pause_filter_thresh)
		grow_ple_window(vcpu);

2688
	kvm_vcpu_on_spin(vcpu, in_kernel);
2689 2690 2691
	return 1;
}

2692 2693
static int nop_interception(struct vcpu_svm *svm)
{
2694
	return kvm_skip_emulated_instruction(&(svm->vcpu));
}

static int monitor_interception(struct vcpu_svm *svm)
{
	printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
	return nop_interception(svm);
}

static int mwait_interception(struct vcpu_svm *svm)
{
	printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
	return nop_interception(svm);
}

2709
static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
2710 2711 2712 2713
	[SVM_EXIT_READ_CR0]			= cr_interception,
	[SVM_EXIT_READ_CR3]			= cr_interception,
	[SVM_EXIT_READ_CR4]			= cr_interception,
	[SVM_EXIT_READ_CR8]			= cr_interception,
2714
	[SVM_EXIT_CR0_SEL_WRITE]		= cr_interception,
2715
	[SVM_EXIT_WRITE_CR0]			= cr_interception,
2716 2717
	[SVM_EXIT_WRITE_CR3]			= cr_interception,
	[SVM_EXIT_WRITE_CR4]			= cr_interception,
J
Joerg Roedel 已提交
2718
	[SVM_EXIT_WRITE_CR8]			= cr8_write_interception,
	[SVM_EXIT_READ_DR0]			= dr_interception,
	[SVM_EXIT_READ_DR1]			= dr_interception,
	[SVM_EXIT_READ_DR2]			= dr_interception,
	[SVM_EXIT_READ_DR3]			= dr_interception,
	[SVM_EXIT_READ_DR4]			= dr_interception,
	[SVM_EXIT_READ_DR5]			= dr_interception,
	[SVM_EXIT_READ_DR6]			= dr_interception,
	[SVM_EXIT_READ_DR7]			= dr_interception,
	[SVM_EXIT_WRITE_DR0]			= dr_interception,
	[SVM_EXIT_WRITE_DR1]			= dr_interception,
	[SVM_EXIT_WRITE_DR2]			= dr_interception,
	[SVM_EXIT_WRITE_DR3]			= dr_interception,
	[SVM_EXIT_WRITE_DR4]			= dr_interception,
	[SVM_EXIT_WRITE_DR5]			= dr_interception,
	[SVM_EXIT_WRITE_DR6]			= dr_interception,
	[SVM_EXIT_WRITE_DR7]			= dr_interception,
J
Jan Kiszka 已提交
2735 2736
	[SVM_EXIT_EXCP_BASE + DB_VECTOR]	= db_interception,
	[SVM_EXIT_EXCP_BASE + BP_VECTOR]	= bp_interception,
2737
	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
J
Joerg Roedel 已提交
2738 2739
	[SVM_EXIT_EXCP_BASE + PF_VECTOR]	= pf_interception,
	[SVM_EXIT_EXCP_BASE + MC_VECTOR]	= mc_interception,
2740
	[SVM_EXIT_EXCP_BASE + AC_VECTOR]	= ac_interception,
2741
	[SVM_EXIT_EXCP_BASE + GP_VECTOR]	= gp_interception,
J
Joerg Roedel 已提交
2742
	[SVM_EXIT_INTR]				= intr_interception,
2743
	[SVM_EXIT_NMI]				= nmi_interception,
A
Avi Kivity 已提交
2744 2745
	[SVM_EXIT_SMI]				= nop_on_interception,
	[SVM_EXIT_INIT]				= nop_on_interception,
2746
	[SVM_EXIT_VINTR]			= interrupt_window_interception,
A
Avi Kivity 已提交
2747
	[SVM_EXIT_RDPMC]			= rdpmc_interception,
A
Avi Kivity 已提交
2748
	[SVM_EXIT_CPUID]			= cpuid_interception,
2749
	[SVM_EXIT_IRET]                         = iret_interception,
2750
	[SVM_EXIT_INVD]                         = emulate_on_interception,
2751
	[SVM_EXIT_PAUSE]			= pause_interception,
A
Avi Kivity 已提交
2752
	[SVM_EXIT_HLT]				= halt_interception,
M
Marcelo Tosatti 已提交
2753
	[SVM_EXIT_INVLPG]			= invlpg_interception,
A
Alexander Graf 已提交
2754
	[SVM_EXIT_INVLPGA]			= invlpga_interception,
J
Joerg Roedel 已提交
2755
	[SVM_EXIT_IOIO]				= io_interception,
A
Avi Kivity 已提交
2756 2757
	[SVM_EXIT_MSR]				= msr_interception,
	[SVM_EXIT_TASK_SWITCH]			= task_switch_interception,
2758
	[SVM_EXIT_SHUTDOWN]			= shutdown_interception,
A
Alexander Graf 已提交
2759
	[SVM_EXIT_VMRUN]			= vmrun_interception,
2760
	[SVM_EXIT_VMMCALL]			= vmmcall_interception,
2761 2762
	[SVM_EXIT_VMLOAD]			= vmload_interception,
	[SVM_EXIT_VMSAVE]			= vmsave_interception,
2763 2764
	[SVM_EXIT_STGI]				= stgi_interception,
	[SVM_EXIT_CLGI]				= clgi_interception,
2765
	[SVM_EXIT_SKINIT]			= skinit_interception,
D
David Kaplan 已提交
2766
	[SVM_EXIT_WBINVD]                       = wbinvd_interception,
2767 2768
	[SVM_EXIT_MONITOR]			= monitor_interception,
	[SVM_EXIT_MWAIT]			= mwait_interception,
J
Joerg Roedel 已提交
2769
	[SVM_EXIT_XSETBV]			= xsetbv_interception,
J
Jim Mattson 已提交
2770
	[SVM_EXIT_RDPRU]			= rdpru_interception,
2771
	[SVM_EXIT_NPF]				= npf_interception,
B
Brijesh Singh 已提交
2772
	[SVM_EXIT_RSM]                          = rsm_interception,
2773 2774
	[SVM_EXIT_AVIC_INCOMPLETE_IPI]		= avic_incomplete_ipi_interception,
	[SVM_EXIT_AVIC_UNACCELERATED_ACCESS]	= avic_unaccelerated_access_interception,
A
Avi Kivity 已提交
2775 2776
};

2777
static void dump_vmcb(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	struct vmcb_save_area *save = &svm->vmcb->save;

	if (!dump_invalid_vmcb) {
		pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
		return;
	}

2788
	pr_err("VMCB Control Area:\n");
	pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
	pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
	pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
	pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
	pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
	pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
	pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
2796 2797
	pr_err("%-20s%d\n", "pause filter threshold:",
	       control->pause_filter_thresh);
	pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
	pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
	pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
	pr_err("%-20s%d\n", "asid:", control->asid);
	pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
	pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
	pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
	pr_err("%-20s%08x\n", "int_state:", control->int_state);
	pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
	pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
	pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
	pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
	pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
	pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
	pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
2813
	pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
2814 2815
	pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
	pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
2816
	pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
2817
	pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
2818 2819 2820
	pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
	pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
	pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
2821
	pr_err("VMCB State Save Area:\n");
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "es:",
	       save->es.selector, save->es.attrib,
	       save->es.limit, save->es.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "cs:",
	       save->cs.selector, save->cs.attrib,
	       save->cs.limit, save->cs.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "ss:",
	       save->ss.selector, save->ss.attrib,
	       save->ss.limit, save->ss.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "ds:",
	       save->ds.selector, save->ds.attrib,
	       save->ds.limit, save->ds.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "fs:",
	       save->fs.selector, save->fs.attrib,
	       save->fs.limit, save->fs.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "gs:",
	       save->gs.selector, save->gs.attrib,
	       save->gs.limit, save->gs.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "gdtr:",
	       save->gdtr.selector, save->gdtr.attrib,
	       save->gdtr.limit, save->gdtr.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "ldtr:",
	       save->ldtr.selector, save->ldtr.attrib,
	       save->ldtr.limit, save->ldtr.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "idtr:",
	       save->idtr.selector, save->idtr.attrib,
	       save->idtr.limit, save->idtr.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "tr:",
	       save->tr.selector, save->tr.attrib,
	       save->tr.limit, save->tr.base);
2862 2863
	pr_err("cpl:            %d                efer:         %016llx\n",
		save->cpl, save->efer);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "cr0:", save->cr0, "cr2:", save->cr2);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "cr3:", save->cr3, "cr4:", save->cr4);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "dr6:", save->dr6, "dr7:", save->dr7);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "rip:", save->rip, "rflags:", save->rflags);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "rsp:", save->rsp, "rax:", save->rax);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "star:", save->star, "lstar:", save->lstar);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "cstar:", save->cstar, "sfmask:", save->sfmask);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "kernel_gs_base:", save->kernel_gs_base,
	       "sysenter_cs:", save->sysenter_cs);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "sysenter_esp:", save->sysenter_esp,
	       "sysenter_eip:", save->sysenter_eip);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "br_from:", save->br_from, "br_to:", save->br_to);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "excp_from:", save->last_excp_from,
	       "excp_to:", save->last_excp_to);
2891 2892
}

static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
{
	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;

	*info1 = control->exit_info_1;
	*info2 = control->exit_info_2;
}

2901 2902
static int handle_exit(struct kvm_vcpu *vcpu,
	enum exit_fastpath_completion exit_fastpath)
A
Avi Kivity 已提交
2903
{
2904
	struct vcpu_svm *svm = to_svm(vcpu);
A
Avi Kivity 已提交
2905
	struct kvm_run *kvm_run = vcpu->run;
2906
	u32 exit_code = svm->vmcb->control.exit_code;
A
Avi Kivity 已提交
2907

2908 2909
	trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);

2910
	if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
2911 2912 2913
		vcpu->arch.cr0 = svm->vmcb->save.cr0;
	if (npt_enabled)
		vcpu->arch.cr3 = svm->vmcb->save.cr3;
2914

2915 2916 2917 2918 2919 2920 2921
	if (unlikely(svm->nested.exit_required)) {
		nested_svm_vmexit(svm);
		svm->nested.exit_required = false;

		return 1;
	}

2922
	if (is_guest_mode(vcpu)) {
2923 2924
		int vmexit;

2925 2926 2927 2928
		trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
					svm->vmcb->control.exit_info_1,
					svm->vmcb->control.exit_info_2,
					svm->vmcb->control.exit_int_info,
2929 2930
					svm->vmcb->control.exit_int_info_err,
					KVM_ISA_SVM);
2931

2932 2933 2934 2935 2936 2937
		vmexit = nested_svm_exit_special(svm);

		if (vmexit == NESTED_EXIT_CONTINUE)
			vmexit = nested_svm_exit_handled(svm);

		if (vmexit == NESTED_EXIT_DONE)
2938 2939 2940
			return 1;
	}

2941 2942
	svm_complete_interrupts(svm);

2943 2944 2945 2946
	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		kvm_run->fail_entry.hardware_entry_failure_reason
			= svm->vmcb->control.exit_code;
2947
		dump_vmcb(vcpu);
2948 2949 2950
		return 0;
	}

2951
	if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
2952
	    exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
2953 2954
	    exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
	    exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
2955
		printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
A
Avi Kivity 已提交
2956
		       "exit_code 0x%x\n",
2957
		       __func__, svm->vmcb->control.exit_int_info,
A
Avi Kivity 已提交
2958 2959
		       exit_code);

2960 2961 2962 2963
	if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) {
		kvm_skip_emulated_instruction(vcpu);
		return 1;
	} else if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
J
Joe Perches 已提交
2964
	    || !svm_exit_handlers[exit_code]) {
2965 2966 2967 2968 2969 2970 2971 2972
		vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code);
		dump_vmcb(vcpu);
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror =
			KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
		vcpu->run->internal.ndata = 1;
		vcpu->run->internal.data[0] = exit_code;
		return 0;
A
Avi Kivity 已提交
2973 2974
	}

#ifdef CONFIG_RETPOLINE
	if (exit_code == SVM_EXIT_MSR)
		return msr_interception(svm);
	else if (exit_code == SVM_EXIT_VINTR)
		return interrupt_window_interception(svm);
	else if (exit_code == SVM_EXIT_INTR)
		return intr_interception(svm);
	else if (exit_code == SVM_EXIT_HLT)
		return halt_interception(svm);
	else if (exit_code == SVM_EXIT_NPF)
		return npf_interception(svm);
#endif
A
Avi Kivity 已提交
2987
	return svm_exit_handlers[exit_code](svm);
A
Avi Kivity 已提交
2988 2989 2990 2991 2992 2993
}

static void reload_tss(struct kvm_vcpu *vcpu)
{
	int cpu = raw_smp_processor_id();

2994 2995
	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
	sd->tss_desc->type = 9; /* available 32/64-bit TSS */
A
Avi Kivity 已提交
2996 2997 2998
	load_TR_desc();
}

R
Rusty Russell 已提交
2999
static void pre_svm_run(struct vcpu_svm *svm)
A
Avi Kivity 已提交
3000 3001 3002
{
	int cpu = raw_smp_processor_id();

3003
	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
A
Avi Kivity 已提交
3004

3005 3006 3007
	if (sev_guest(svm->vcpu.kvm))
		return pre_sev_run(svm, cpu);

3008
	/* FIXME: handle wraparound of asid_generation */
3009 3010
	if (svm->asid_generation != sd->asid_generation)
		new_asid(svm, sd);
A
Avi Kivity 已提交
3011 3012
}

3013 3014 3015 3016 3017 3018
static void svm_inject_nmi(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
	vcpu->arch.hflags |= HF_NMI_MASK;
3019
	set_intercept(svm, INTERCEPT_IRET);
3020 3021
	++vcpu->stat.nmi_injections;
}
A
Avi Kivity 已提交
3022

3023
static void svm_set_irq(struct kvm_vcpu *vcpu)
E
Eddie Dong 已提交
3024 3025 3026
{
	struct vcpu_svm *svm = to_svm(vcpu);

3027
	BUG_ON(!(gif_set(svm)));
3028

3029 3030 3031
	trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
	++vcpu->stat.irq_injections;

3032 3033
	svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
		SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
E
Eddie Dong 已提交
3034 3035
}

3036
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3037 3038 3039
{
	struct vcpu_svm *svm = to_svm(vcpu);

3040
	if (svm_nested_virtualize_tpr(vcpu))
3041 3042
		return;

3043 3044
	clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);

3045
	if (irr == -1)
3046 3047
		return;

3048
	if (tpr >= irr)
3049
		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
3050
}
3051

3052 3053 3054 3055
static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb;
J
Joerg Roedel 已提交
3056 3057 3058 3059 3060 3061
	int ret;
	ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
	      !(svm->vcpu.arch.hflags & HF_NMI_MASK);
	ret = ret && gif_set(svm) && nested_svm_nmi(svm);

	return ret;
3062 3063
}

J
Jan Kiszka 已提交
static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
}

static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (masked) {
		svm->vcpu.arch.hflags |= HF_NMI_MASK;
3077
		set_intercept(svm, INTERCEPT_IRET);
J
Jan Kiszka 已提交
3078 3079
	} else {
		svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
3080
		clr_intercept(svm, INTERCEPT_IRET);
J
Jan Kiszka 已提交
3081 3082 3083
	}
}

3084 3085 3086 3087
static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb;
3088 3089 3090 3091 3092

	if (!gif_set(svm) ||
	     (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
		return 0;

3093 3094 3095 3096
	if (is_guest_mode(vcpu) && (svm->vcpu.arch.hflags & HF_VINTR_MASK))
		return !!(svm->vcpu.arch.hflags & HF_HIF_MASK);
	else
		return !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
3097 3098
}

3099
static void enable_irq_window(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
3100
{
3101 3102
	struct vcpu_svm *svm = to_svm(vcpu);

J
Joerg Roedel 已提交
3103 3104 3105 3106
	/*
	 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
	 * 1, because that's a separate STGI/VMRUN intercept.  The next time we
	 * get that intercept, this function will be called again though and
3107 3108 3109
	 * we'll get the vintr intercept. However, if the vGIF feature is
	 * enabled, the STGI interception will not occur. Enable the irq
	 * window under the assumption that the hardware will set the GIF.
J
Joerg Roedel 已提交
3110
	 */
3111
	if (vgif_enabled(svm) || gif_set(svm)) {
3112 3113 3114 3115 3116 3117 3118
		/*
		 * IRQ window is not needed when AVIC is enabled,
		 * unless we have pending ExtINT since it cannot be injected
		 * via AVIC. In such case, we need to temporarily disable AVIC,
		 * and fallback to injecting IRQ via V_IRQ.
		 */
		svm_toggle_avic_for_irq_window(vcpu, false);
3119 3120
		svm_set_vintr(svm);
	}
3121 3122
}

3123
static void enable_nmi_window(struct kvm_vcpu *vcpu)
3124
{
3125
	struct vcpu_svm *svm = to_svm(vcpu);
3126

3127 3128
	if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
	    == HF_NMI_MASK)
3129
		return; /* IRET will cause a vm exit */
3130

3131 3132 3133
	if (!gif_set(svm)) {
		if (vgif_enabled(svm))
			set_intercept(svm, INTERCEPT_STGI);
3134
		return; /* STGI will cause a vm exit */
3135
	}
3136 3137 3138 3139

	if (svm->nested.exit_required)
		return; /* we're not going to run the guest yet */

J
Joerg Roedel 已提交
3140 3141 3142 3143
	/*
	 * Something prevents NMI from been injected. Single step over possible
	 * problem (IRET or exception injection or interrupt shadow)
	 */
3144
	svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
J
Jan Kiszka 已提交
3145
	svm->nmi_singlestep = true;
3146
	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3147 3148
}

3149 3150 3151 3152 3153
static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
	return 0;
}

3154 3155 3156 3157 3158
static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
{
	return 0;
}

3159
void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
3160
{
3161 3162 3163 3164 3165 3166
	struct vcpu_svm *svm = to_svm(vcpu);

	if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
	else
		svm->asid_generation--;
3167 3168
}

3169 3170 3171 3172 3173 3174 3175
static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	invlpga(gva, svm->vmcb->control.asid);
}

3176 3177 3178 3179
static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
{
}

3180 3181 3182 3183
static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

3184
	if (svm_nested_virtualize_tpr(vcpu))
3185 3186
		return;

3187
	if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
3188
		int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3189
		kvm_set_cr8(vcpu, cr8);
3190 3191 3192
	}
}

3193 3194 3195 3196 3197
static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 cr8;

3198 3199
	if (svm_nested_virtualize_tpr(vcpu) ||
	    kvm_vcpu_apicv_active(vcpu))
3200 3201
		return;

3202 3203 3204 3205 3206
	cr8 = kvm_get_cr8(vcpu);
	svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
	svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
}

3207 3208 3209 3210 3211
static void svm_complete_interrupts(struct vcpu_svm *svm)
{
	u8 vector;
	int type;
	u32 exitintinfo = svm->vmcb->control.exit_int_info;
3212 3213 3214
	unsigned int3_injected = svm->int3_injected;

	svm->int3_injected = 0;
3215

3216 3217 3218 3219 3220 3221
	/*
	 * If we've made progress since setting HF_IRET_MASK, we've
	 * executed an IRET and can allow NMI injection.
	 */
	if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
	    && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
3222
		svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3223 3224
		kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
	}
3225

3226 3227 3228 3229 3230 3231 3232
	svm->vcpu.arch.nmi_injected = false;
	kvm_clear_exception_queue(&svm->vcpu);
	kvm_clear_interrupt_queue(&svm->vcpu);

	if (!(exitintinfo & SVM_EXITINTINFO_VALID))
		return;

3233 3234
	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);

3235 3236 3237 3238 3239 3240 3241 3242
	vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
	type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;

	switch (type) {
	case SVM_EXITINTINFO_TYPE_NMI:
		svm->vcpu.arch.nmi_injected = true;
		break;
	case SVM_EXITINTINFO_TYPE_EXEPT:
3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253
		/*
		 * In case of software exceptions, do not reinject the vector,
		 * but re-execute the instruction instead. Rewind RIP first
		 * if we emulated INT3 before.
		 */
		if (kvm_exception_is_soft(vector)) {
			if (vector == BP_VECTOR && int3_injected &&
			    kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
				kvm_rip_write(&svm->vcpu,
					      kvm_rip_read(&svm->vcpu) -
					      int3_injected);
3254
			break;
3255
		}
3256 3257
		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
			u32 err = svm->vmcb->control.exit_int_info_err;
3258
			kvm_requeue_exception_e(&svm->vcpu, vector, err);
3259 3260

		} else
3261
			kvm_requeue_exception(&svm->vcpu, vector);
3262 3263
		break;
	case SVM_EXITINTINFO_TYPE_INTR:
3264
		kvm_queue_interrupt(&svm->vcpu, vector, false);
3265 3266 3267 3268 3269 3270
		break;
	default:
		break;
	}
}

A
Avi Kivity 已提交
3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281
static void svm_cancel_injection(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;

	control->exit_int_info = control->event_inj;
	control->exit_int_info_err = control->event_inj_err;
	control->event_inj = 0;
	svm_complete_interrupts(svm);
}

3282
void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
3283

A
Avi Kivity 已提交
3284
static void svm_vcpu_run(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
3285
{
3286
	struct vcpu_svm *svm = to_svm(vcpu);
3287

3288 3289 3290 3291
	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];

3292 3293 3294 3295 3296 3297 3298
	/*
	 * A vmexit emulation is required before the vcpu can be executed
	 * again.
	 */
	if (unlikely(svm->nested.exit_required))
		return;

	/*
	 * Disable singlestep if we're injecting an interrupt/exception.
	 * We don't want our modified rflags to be pushed on the stack where
	 * we might not be able to easily reset them if we disabled NMI
	 * singlestep later.
	 */
	if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
		/*
		 * Event injection happens before external interrupts cause a
		 * vmexit and interrupts are disabled here, so smp_send_reschedule
		 * is enough to force an immediate vmexit.
		 */
		disable_nmi_singlestep(svm);
		smp_send_reschedule(vcpu->cpu);
	}

R
Rusty Russell 已提交
3315
	pre_svm_run(svm);
A
Avi Kivity 已提交
3316

3317 3318
	sync_lapic_to_cr8(vcpu);

3319
	svm->vmcb->save.cr2 = vcpu->arch.cr2;
A
Avi Kivity 已提交
3320

3321 3322 3323 3324 3325 3326 3327 3328 3329
	/*
	 * Run with all-zero DR6 unless needed, so that we can get the exact cause
	 * of a #DB.
	 */
	if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
		svm_set_dr6(svm, vcpu->arch.dr6);
	else
		svm_set_dr6(svm, DR6_FIXED_1 | DR6_RTM);

3330
	clgi();
3331
	kvm_load_guest_xsave_state(vcpu);
3332

3333 3334 3335 3336
	if (lapic_in_kernel(vcpu) &&
		vcpu->arch.apic->lapic_timer.timer_advance_ns)
		kvm_wait_lapic_expire(vcpu);

3337 3338 3339 3340 3341 3342
	/*
	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
	 * it's non-zero. Since vmentry is serialising on affected CPUs, there
	 * is no need to worry about the conditional branch over the wrmsr
	 * being speculatively taken.
	 */
3343
	x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
3344

3345
	__svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
A
Avi Kivity 已提交
3346

3347 3348 3349 3350 3351 3352 3353 3354 3355
#ifdef CONFIG_X86_64
	wrmsrl(MSR_GS_BASE, svm->host.gs_base);
#else
	loadsegment(fs, svm->host.fs);
#ifndef CONFIG_X86_32_LAZY_GS
	loadsegment(gs, svm->host.gs);
#endif
#endif

	/*
	 * We do not use IBRS in the kernel. If this vCPU has used the
	 * SPEC_CTRL MSR it may have left it on; save the value and
	 * turn it off. This is much more efficient than blindly adding
	 * it to the atomic save/restore list. Especially as the former
	 * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
	 *
	 * For non-nested case:
	 * If the L01 MSR bitmap does not intercept the MSR, then we need to
	 * save it.
	 *
	 * For nested case:
	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
	 * save it.
	 */
3371
	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
3372
		svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
3373

A
Avi Kivity 已提交
3374 3375
	reload_tss(vcpu);

3376 3377
	x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);

3378 3379 3380 3381 3382
	vcpu->arch.cr2 = svm->vmcb->save.cr2;
	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
	vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;

3383
	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3384
		kvm_before_interrupt(&svm->vcpu);
3385

3386
	kvm_load_host_xsave_state(vcpu);
3387 3388 3389 3390 3391
	stgi();

	/* Any pending NMI will happen here */

	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3392
		kvm_after_interrupt(&svm->vcpu);
3393

3394 3395
	sync_cr8_to_lapic(vcpu);

3396
	svm->next_rip = 0;
3397

3398 3399
	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;

G
Gleb Natapov 已提交
3400 3401
	/* if exit due to PF check for async PF */
	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
3402
		svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
G
Gleb Natapov 已提交
3403

A
Avi Kivity 已提交
3404 3405 3406 3407
	if (npt_enabled) {
		vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
		vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
	}
3408 3409 3410 3411 3412 3413 3414 3415

	/*
	 * We need to handle MC intercepts here before the vcpu has a chance to
	 * change the physical cpu
	 */
	if (unlikely(svm->vmcb->control.exit_code ==
		     SVM_EXIT_EXCP_BASE + MC_VECTOR))
		svm_handle_mce(svm);
3416 3417

	mark_all_clean(svm->vmcb);
A
Avi Kivity 已提交
3418 3419
}

3420
static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
A
Avi Kivity 已提交
3421
{
3422
	struct vcpu_svm *svm = to_svm(vcpu);
3423 3424
	bool update_guest_cr3 = true;
	unsigned long cr3;
3425

3426 3427 3428 3429
	cr3 = __sme_set(root);
	if (npt_enabled) {
		svm->vmcb->control.nested_cr3 = cr3;
		mark_dirty(svm->vmcb, VMCB_NPT);
3430

3431 3432 3433 3434 3435 3436 3437 3438
		/* Loading L2's CR3 is handled by enter_svm_guest_mode.  */
		if (is_guest_mode(vcpu))
			update_guest_cr3 = false;
		else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
			cr3 = vcpu->arch.cr3;
		else /* CR3 is already up-to-date.  */
			update_guest_cr3 = false;
	}
3439

3440 3441 3442 3443
	if (update_guest_cr3) {
		svm->vmcb->save.cr3 = cr3;
		mark_dirty(svm->vmcb, VMCB_CR);
	}
3444 3445
}

static int is_disabled(void)
{
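	/* SVM can be disabled by firmware via the SVMDIS bit in the VM_CR MSR. */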
	u64 vm_cr;

	rdmsrl(MSR_VM_CR, vm_cr);
	if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
		return 1;

	return 0;
}

static void
svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	/*
	 * Patch in the VMMCALL instruction (opcode 0f 01 d9):
	 */
	hypercall[0] = 0x0f;
	hypercall[1] = 0x01;
	hypercall[2] = 0xd9;
}

static int __init svm_check_processor_compat(void)
{
	return 0;
}

static bool svm_cpu_has_accelerated_tpr(void)
{
	return false;
}

static bool svm_has_emulated_msr(int index)
{
	switch (index) {
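	/* Neither LMCE (MCG_EXT_CTL) nor the VMX capability MSRs are emulated for SVM guests. */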
	case MSR_IA32_MCG_EXT_CTL:
	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
		return false;
	default:
		break;
	}

	return true;
}

static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
	return 0;
}

static void svm_cpuid_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
				    boot_cpu_has(X86_FEATURE_XSAVE) &&
				    boot_cpu_has(X86_FEATURE_XSAVES);

	/* Update nrips enabled cache */
	svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
			     guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);

	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	/*
	 * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
	 * is exposed to the guest, disable AVIC.
	 */
	if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
		kvm_request_apicv_update(vcpu->kvm, false,
					 APICV_INHIBIT_REASON_X2APIC);

	/*
	 * Currently, AVIC does not work with nested virtualization.
	 * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
	 */
	if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
		kvm_request_apicv_update(vcpu->kvm, false,
					 APICV_INHIBIT_REASON_NESTED);
}

static bool svm_has_wbinvd_exit(void)
{
	return true;
}

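/*
 * Helpers for x86_intercept_map[] below: each entry pairs an SVM exit code
 * with the x86 emulation stage at which the intercept check is performed.
 */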
#define PRE_EX(exit)  { .exit_code = (exit), \
			.stage = X86_ICPT_PRE_EXCEPT, }
#define POST_EX(exit) { .exit_code = (exit), \
			.stage = X86_ICPT_POST_EXCEPT, }
#define POST_MEM(exit) { .exit_code = (exit), \
			.stage = X86_ICPT_POST_MEMACCESS, }

static const struct __x86_intercept {
	u32 exit_code;
	enum x86_intercept_stage stage;
} x86_intercept_map[] = {
	[x86_intercept_cr_read]		= POST_EX(SVM_EXIT_READ_CR0),
	[x86_intercept_cr_write]	= POST_EX(SVM_EXIT_WRITE_CR0),
	[x86_intercept_clts]		= POST_EX(SVM_EXIT_WRITE_CR0),
	[x86_intercept_lmsw]		= POST_EX(SVM_EXIT_WRITE_CR0),
	[x86_intercept_smsw]		= POST_EX(SVM_EXIT_READ_CR0),
	[x86_intercept_dr_read]		= POST_EX(SVM_EXIT_READ_DR0),
	[x86_intercept_dr_write]	= POST_EX(SVM_EXIT_WRITE_DR0),
	[x86_intercept_sldt]		= POST_EX(SVM_EXIT_LDTR_READ),
	[x86_intercept_str]		= POST_EX(SVM_EXIT_TR_READ),
	[x86_intercept_lldt]		= POST_EX(SVM_EXIT_LDTR_WRITE),
	[x86_intercept_ltr]		= POST_EX(SVM_EXIT_TR_WRITE),
	[x86_intercept_sgdt]		= POST_EX(SVM_EXIT_GDTR_READ),
	[x86_intercept_sidt]		= POST_EX(SVM_EXIT_IDTR_READ),
	[x86_intercept_lgdt]		= POST_EX(SVM_EXIT_GDTR_WRITE),
	[x86_intercept_lidt]		= POST_EX(SVM_EXIT_IDTR_WRITE),
	[x86_intercept_vmrun]		= POST_EX(SVM_EXIT_VMRUN),
	[x86_intercept_vmmcall]		= POST_EX(SVM_EXIT_VMMCALL),
	[x86_intercept_vmload]		= POST_EX(SVM_EXIT_VMLOAD),
	[x86_intercept_vmsave]		= POST_EX(SVM_EXIT_VMSAVE),
	[x86_intercept_stgi]		= POST_EX(SVM_EXIT_STGI),
	[x86_intercept_clgi]		= POST_EX(SVM_EXIT_CLGI),
	[x86_intercept_skinit]		= POST_EX(SVM_EXIT_SKINIT),
	[x86_intercept_invlpga]		= POST_EX(SVM_EXIT_INVLPGA),
	[x86_intercept_rdtscp]		= POST_EX(SVM_EXIT_RDTSCP),
	[x86_intercept_monitor]		= POST_MEM(SVM_EXIT_MONITOR),
	[x86_intercept_mwait]		= POST_EX(SVM_EXIT_MWAIT),
	[x86_intercept_invlpg]		= POST_EX(SVM_EXIT_INVLPG),
	[x86_intercept_invd]		= POST_EX(SVM_EXIT_INVD),
	[x86_intercept_wbinvd]		= POST_EX(SVM_EXIT_WBINVD),
	[x86_intercept_wrmsr]		= POST_EX(SVM_EXIT_MSR),
	[x86_intercept_rdtsc]		= POST_EX(SVM_EXIT_RDTSC),
	[x86_intercept_rdmsr]		= POST_EX(SVM_EXIT_MSR),
	[x86_intercept_rdpmc]		= POST_EX(SVM_EXIT_RDPMC),
	[x86_intercept_cpuid]		= PRE_EX(SVM_EXIT_CPUID),
	[x86_intercept_rsm]		= PRE_EX(SVM_EXIT_RSM),
	[x86_intercept_pause]		= PRE_EX(SVM_EXIT_PAUSE),
	[x86_intercept_pushf]		= PRE_EX(SVM_EXIT_PUSHF),
	[x86_intercept_popf]		= PRE_EX(SVM_EXIT_POPF),
	[x86_intercept_intn]		= PRE_EX(SVM_EXIT_SWINT),
	[x86_intercept_iret]		= PRE_EX(SVM_EXIT_IRET),
	[x86_intercept_icebp]		= PRE_EX(SVM_EXIT_ICEBP),
	[x86_intercept_hlt]		= POST_EX(SVM_EXIT_HLT),
	[x86_intercept_in]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_ins]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_out]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_outs]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_xsetbv]		= PRE_EX(SVM_EXIT_XSETBV),
};

#undef PRE_EX
#undef POST_EX
#undef POST_MEM

static int svm_check_intercept(struct kvm_vcpu *vcpu,
			       struct x86_instruction_info *info,
			       enum x86_intercept_stage stage,
			       struct x86_exception *exception)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int vmexit, ret = X86EMUL_CONTINUE;
	struct __x86_intercept icpt_info;
	struct vmcb *vmcb = svm->vmcb;

	if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
		goto out;

	icpt_info = x86_intercept_map[info->intercept];

	if (stage != icpt_info.stage)
		goto out;

	switch (icpt_info.exit_code) {
	case SVM_EXIT_READ_CR0:
		if (info->intercept == x86_intercept_cr_read)
			icpt_info.exit_code += info->modrm_reg;
		break;
	case SVM_EXIT_WRITE_CR0: {
		unsigned long cr0, val;
		u64 intercept;

		if (info->intercept == x86_intercept_cr_write)
			icpt_info.exit_code += info->modrm_reg;

		if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
		    info->intercept == x86_intercept_clts)
			break;

		intercept = svm->nested.intercept;

		if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
			break;

		cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
		val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;

		if (info->intercept == x86_intercept_lmsw) {
			cr0 &= 0xfUL;
			val &= 0xfUL;
			/* lmsw can't clear PE - catch this here */
			if (cr0 & X86_CR0_PE)
				val |= X86_CR0_PE;
		}

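		/* Any change outside SVM_CR0_SELECTIVE_MASK is a selective CR0 write. */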
		if (cr0 ^ val)
			icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;

		break;
	}
	case SVM_EXIT_READ_DR0:
	case SVM_EXIT_WRITE_DR0:
		icpt_info.exit_code += info->modrm_reg;
		break;
	case SVM_EXIT_MSR:
		if (info->intercept == x86_intercept_wrmsr)
			vmcb->control.exit_info_1 = 1;
		else
			vmcb->control.exit_info_1 = 0;
		break;
	case SVM_EXIT_PAUSE:
		/*
		 * We get this for plain NOP as well, but PAUSE is REP NOP,
		 * so check for the REP prefix here.
		 */
		if (info->rep_prefix != REPE_PREFIX)
			goto out;
		break;
	case SVM_EXIT_IOIO: {
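		/*
		 * Rebuild exit_info_1 the way hardware encodes an IOIO exit:
		 * port, direction, string/rep flags and operand size.
		 */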
		u64 exit_info;
		u32 bytes;

		if (info->intercept == x86_intercept_in ||
		    info->intercept == x86_intercept_ins) {
			exit_info = ((info->src_val & 0xffff) << 16) |
				SVM_IOIO_TYPE_MASK;
			bytes = info->dst_bytes;
		} else {
			exit_info = (info->dst_val & 0xffff) << 16;
			bytes = info->src_bytes;
		}

		if (info->intercept == x86_intercept_outs ||
		    info->intercept == x86_intercept_ins)
			exit_info |= SVM_IOIO_STR_MASK;

		if (info->rep_prefix)
			exit_info |= SVM_IOIO_REP_MASK;

		bytes = min(bytes, 4u);

		exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;

		exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);

		vmcb->control.exit_info_1 = exit_info;
		vmcb->control.exit_info_2 = info->next_rip;

		break;
	}
	default:
		break;
	}

	/* TODO: Advertise NRIPS to guest hypervisor unconditionally */
	if (static_cpu_has(X86_FEATURE_NRIPS))
		vmcb->control.next_rip  = info->next_rip;
	vmcb->control.exit_code = icpt_info.exit_code;
	vmexit = nested_svm_exit_handled(svm);

	ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
					   : X86EMUL_CONTINUE;

out:
	return ret;
}

static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu,
	enum exit_fastpath_completion *exit_fastpath)
{
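	/* Fast-path WRMSR exits (exit_info_1 != 0) while interrupts are still off. */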
	if (!is_guest_mode(vcpu) &&
	    to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
	    to_svm(vcpu)->vmcb->control.exit_info_1)
		*exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
}

static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
	if (pause_filter_thresh)
		shrink_ple_window(vcpu);
}

static void svm_setup_mce(struct kvm_vcpu *vcpu)
{
	/* [63:9] are reserved. */
	vcpu->arch.mcg_cap &= 0x1ff;
}

static int svm_smi_allowed(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/* Per APM Vol.2 15.22.2 "Response to SMI" */
	if (!gif_set(svm))
		return 0;

	if (is_guest_mode(&svm->vcpu) &&
	    svm->nested.intercept & (1ULL << INTERCEPT_SMI)) {
		/* TODO: Might need to set exit_info_1 and exit_info_2 here */
		svm->vmcb->control.exit_code = SVM_EXIT_SMI;
		svm->nested.exit_required = true;
		return 0;
	}

	return 1;
}

static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int ret;

	if (is_guest_mode(vcpu)) {
		/* FED8h - SVM Guest */
		put_smstate(u64, smstate, 0x7ed8, 1);
		/* FEE0h - SVM Guest VMCB Physical Address */
		put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);

		svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
		svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
		svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];

		ret = nested_svm_vmexit(svm);
		if (ret)
			return ret;
	}
	return 0;
}

static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *nested_vmcb;
	struct kvm_host_map map;
	u64 guest;
	u64 vmcb;

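	/* Recover the guest-mode flag and VMCB address that svm_pre_enter_smm() stashed in SMRAM. */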
	guest = GET_SMSTATE(u64, smstate, 0x7ed8);
	vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);

	if (guest) {
		if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL)
			return 1;
		nested_vmcb = map.hva;
		enter_svm_guest_mode(svm, vmcb, nested_vmcb, &map);
	}
	return 0;
}

static int enable_smi_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!gif_set(svm)) {
		if (vgif_enabled(svm))
			set_intercept(svm, INTERCEPT_STGI);
		/* STGI will cause a vm exit */
		return 1;
	}
	return 0;
}

static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
{
	unsigned long cr4 = kvm_read_cr4(vcpu);
	bool smep = cr4 & X86_CR4_SMEP;
	bool smap = cr4 & X86_CR4_SMAP;
	bool is_user = svm_get_cpl(vcpu) == 3;

	/*
	 * Detect and work around Errata 1096 Fam_17h_00_0Fh.
	 *
	 * Errata:
	 * When the CPU raises #NPF on a guest data access and vCPU CR4.SMAP=1,
	 * it is possible that CPU microcode implementing DecodeAssist will
	 * fail to read the bytes of the instruction which caused the #NPF.
	 * In this case, the GuestIntrBytes field of the VMCB on a VMEXIT will
	 * incorrectly return 0 instead of the correct guest instruction bytes.
	 *
	 * This happens because CPU microcode reading the instruction bytes
	 * uses a special opcode which attempts to read data using CPL=0
	 * privileges. The microcode reads CS:RIP and if it hits an SMAP
	 * fault, it gives up and returns no instruction bytes.
	 *
	 * Detection:
	 * We reach here when the CPU supports DecodeAssist, raised #NPF and
	 * returned 0 in the GuestIntrBytes field of the VMCB.
	 * First, the erratum can only be triggered when vCPU CR4.SMAP=1.
	 * Second, if vCPU CR4.SMEP=1, the erratum can only be triggered when
	 * vCPU CPL==3 (because otherwise the guest would have triggered an
	 * SMEP fault instead of #NPF).
	 * Otherwise, with vCPU CR4.SMEP=0, the erratum can be triggered at
	 * any vCPU CPL.
	 * As most guests enable SMAP if they have also enabled SMEP, use the
	 * above logic to minimize false positives of detecting the erratum
	 * while still preserving correctness in all cases.
	 *
	 * Workaround:
	 * To determine what instruction the guest was executing, the
	 * hypervisor will have to decode the instruction at the instruction
	 * pointer.
	 *
	 * In a non-SEV guest, the hypervisor will be able to read the guest
	 * memory to decode the instruction pointer when insn_len is zero, so
	 * we return true to indicate that decoding is possible.
	 *
	 * But in an SEV guest, the guest memory is encrypted with a
	 * guest-specific key and the hypervisor will not be able to decode
	 * the instruction pointer, so we will not be able to work around the
	 * erratum. Let's print an error and request that the guest be killed.
	 */
	if (smap && (!smep || is_user)) {
		if (!sev_guest(vcpu->kvm))
			return true;

		pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	}

	return false;
}

static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * TODO: The last condition latches INIT signals on the vCPU when the
	 * vCPU is in guest mode and vmcb12 defines an intercept on INIT.
	 * To properly emulate the INIT intercept, SVM should implement
	 * kvm_x86_ops.check_nested_events() and call nested_svm_vmexit()
	 * there if an INIT signal is pending.
	 */
	return !gif_set(svm) ||
		   (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
}

static void svm_vm_destroy(struct kvm *kvm)
{
	avic_vm_destroy(kvm);
	sev_vm_destroy(kvm);
}

static int svm_vm_init(struct kvm *kvm)
{
	if (avic) {
		int ret = avic_vm_init(kvm);
		if (ret)
			return ret;
	}

	kvm_apicv_init(kvm, avic);
	return 0;
}

static struct kvm_x86_ops svm_x86_ops __initdata = {
	.hardware_unsetup = svm_hardware_teardown,
	.hardware_enable = svm_hardware_enable,
	.hardware_disable = svm_hardware_disable,
	.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
	.has_emulated_msr = svm_has_emulated_msr,

	.vcpu_create = svm_create_vcpu,
	.vcpu_free = svm_free_vcpu,
	.vcpu_reset = svm_vcpu_reset,

	.vm_size = sizeof(struct kvm_svm),
	.vm_init = svm_vm_init,
	.vm_destroy = svm_vm_destroy,

	.prepare_guest_switch = svm_prepare_guest_switch,
	.vcpu_load = svm_vcpu_load,
	.vcpu_put = svm_vcpu_put,
	.vcpu_blocking = svm_vcpu_blocking,
	.vcpu_unblocking = svm_vcpu_unblocking,

	.update_bp_intercept = update_bp_intercept,
	.get_msr_feature = svm_get_msr_feature,
	.get_msr = svm_get_msr,
	.set_msr = svm_set_msr,
	.get_segment_base = svm_get_segment_base,
	.get_segment = svm_get_segment,
	.set_segment = svm_set_segment,
	.get_cpl = svm_get_cpl,
	.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
	.decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
	.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
	.set_cr0 = svm_set_cr0,
	.set_cr4 = svm_set_cr4,
	.set_efer = svm_set_efer,
	.get_idt = svm_get_idt,
	.set_idt = svm_set_idt,
	.get_gdt = svm_get_gdt,
	.set_gdt = svm_set_gdt,
	.set_dr7 = svm_set_dr7,
	.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
	.cache_reg = svm_cache_reg,
	.get_rflags = svm_get_rflags,
	.set_rflags = svm_set_rflags,

	.tlb_flush = svm_flush_tlb,
	.tlb_flush_gva = svm_flush_tlb_gva,

	.run = svm_vcpu_run,
	.handle_exit = handle_exit,
	.skip_emulated_instruction = skip_emulated_instruction,
	.update_emulated_instruction = NULL,
	.set_interrupt_shadow = svm_set_interrupt_shadow,
	.get_interrupt_shadow = svm_get_interrupt_shadow,
	.patch_hypercall = svm_patch_hypercall,
	.set_irq = svm_set_irq,
	.set_nmi = svm_inject_nmi,
	.queue_exception = svm_queue_exception,
	.cancel_injection = svm_cancel_injection,
	.interrupt_allowed = svm_interrupt_allowed,
	.nmi_allowed = svm_nmi_allowed,
	.get_nmi_mask = svm_get_nmi_mask,
	.set_nmi_mask = svm_set_nmi_mask,
	.enable_nmi_window = enable_nmi_window,
	.enable_irq_window = enable_irq_window,
	.update_cr8_intercept = update_cr8_intercept,
	.set_virtual_apic_mode = svm_set_virtual_apic_mode,
	.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
	.check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
	.pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl,
	.load_eoi_exitmap = svm_load_eoi_exitmap,
	.hwapic_irr_update = svm_hwapic_irr_update,
	.hwapic_isr_update = svm_hwapic_isr_update,
	.sync_pir_to_irr = kvm_lapic_find_highest_irr,
	.apicv_post_state_restore = avic_post_state_restore,

	.set_tss_addr = svm_set_tss_addr,
	.set_identity_map_addr = svm_set_identity_map_addr,
	.get_tdp_level = get_npt_level,
	.get_mt_mask = svm_get_mt_mask,

	.get_exit_info = svm_get_exit_info,

	.cpuid_update = svm_cpuid_update,

	.has_wbinvd_exit = svm_has_wbinvd_exit,

	.read_l1_tsc_offset = svm_read_l1_tsc_offset,
	.write_l1_tsc_offset = svm_write_l1_tsc_offset,

	.load_mmu_pgd = svm_load_mmu_pgd,

	.check_intercept = svm_check_intercept,
	.handle_exit_irqoff = svm_handle_exit_irqoff,

	.request_immediate_exit = __kvm_request_immediate_exit,

	.sched_in = svm_sched_in,

	.pmu_ops = &amd_pmu_ops,
	.deliver_posted_interrupt = svm_deliver_avic_intr,
	.dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
	.update_pi_irte = svm_update_pi_irte,
	.setup_mce = svm_setup_mce,

	.smi_allowed = svm_smi_allowed,
	.pre_enter_smm = svm_pre_enter_smm,
	.pre_leave_smm = svm_pre_leave_smm,
	.enable_smi_window = enable_smi_window,

	.mem_enc_op = svm_mem_enc_op,
	.mem_enc_reg_region = svm_register_enc_region,
	.mem_enc_unreg_region = svm_unregister_enc_region,

	.nested_enable_evmcs = NULL,
	.nested_get_evmcs_version = NULL,

	.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,

	.apic_init_signal_blocked = svm_apic_init_signal_blocked,

	.check_nested_events = svm_check_nested_events,
};

static struct kvm_x86_init_ops svm_init_ops __initdata = {
	.cpu_has_kvm_support = has_svm,
	.disabled_by_bios = is_disabled,
	.hardware_setup = svm_hardware_setup,
	.check_processor_compatibility = svm_check_processor_compat,

	.runtime_ops = &svm_x86_ops,
};

static int __init svm_init(void)
{
	return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
			__alignof__(struct vcpu_svm), THIS_MODULE);
}

static void __exit svm_exit(void)
{
	kvm_exit();
}

module_init(svm_init)
module_exit(svm_exit)