#define pr_fmt(fmt) "SVM: " fmt

#include <linux/kvm_host.h>

#include "irq.h"
#include "mmu.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"

#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/amd-iommu.h>
#include <linux/sched.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
#include <linux/frame.h>
#include <linux/psp-sev.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/rwsem.h>

#include <asm/apic.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>
#include <asm/mce.h>
#include <asm/spec-ctrl.h>
#include <asm/cpu_device_id.h>

#include <asm/virtext.h>
#include "trace.h"

#include "svm.h"

#define __ex(x) __kvm_handle_fault_on_reboot(x)

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

#ifdef MODULE
static const struct x86_cpu_id svm_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#endif

#define IOPM_ALLOC_ORDER 2
#define MSRPM_ALLOC_ORDER 1

#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3

#define SVM_FEATURE_LBRV           (1 <<  1)
#define SVM_FEATURE_SVML           (1 <<  2)
#define SVM_FEATURE_TSC_RATE       (1 <<  4)
#define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
#define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
#define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
#define SVM_FEATURE_PAUSE_FILTER   (1 << 10)

#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))

#define TSC_RATIO_RSVD          0xffffff0000000000ULL
#define TSC_RATIO_MIN		0x0000000000000001ULL
#define TSC_RATIO_MAX		0x000000ffffffffffULL

static bool erratum_383_found __read_mostly;

u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;

/*
 * Set osvw_len to higher value when updated Revision Guides
 * are published and we know what the new status bits are
 */
static uint64_t osvw_len = 4, osvw_status;

static DEFINE_PER_CPU(u64, current_tsc_ratio);
#define TSC_RATIO_DEFAULT	0x0100000000ULL

static const struct svm_direct_access_msrs {
	u32 index;   /* Index of the MSR */
	bool always; /* True if intercept is always on */
} direct_access_msrs[] = {
	{ .index = MSR_STAR,				.always = true  },
	{ .index = MSR_IA32_SYSENTER_CS,		.always = true  },
#ifdef CONFIG_X86_64
	{ .index = MSR_GS_BASE,				.always = true  },
	{ .index = MSR_FS_BASE,				.always = true  },
	{ .index = MSR_KERNEL_GS_BASE,			.always = true  },
	{ .index = MSR_LSTAR,				.always = true  },
	{ .index = MSR_CSTAR,				.always = true  },
	{ .index = MSR_SYSCALL_MASK,			.always = true  },
#endif
	{ .index = MSR_IA32_SPEC_CTRL,			.always = false },
	{ .index = MSR_IA32_PRED_CMD,			.always = false },
	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
	{ .index = MSR_IA32_LASTINTTOIP,		.always = false },
	{ .index = MSR_INVALID,				.always = false },
};

/* enable NPT for AMD64 and X86 with PAE */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
bool npt_enabled = true;
#else
bool npt_enabled;
#endif

/*
 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 * pause_filter_count: On processors that support Pause filtering(indicated
 *	by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
 *	count value. On VMRUN this value is loaded into an internal counter.
 *	Each time a pause instruction is executed, this counter is decremented
 *	until it reaches zero at which time a #VMEXIT is generated if pause
 *	intercept is enabled. Refer to  AMD APM Vol 2 Section 15.14.4 Pause
 *	Intercept Filtering for more details.
 *	This also indicate if ple logic enabled.
 *
 * pause_filter_thresh: In addition, some processor families support advanced
 *	pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
 *	the amount of time a guest is allowed to execute in a pause loop.
 *	In this mode, a 16-bit pause filter threshold field is added in the
 *	VMCB. The threshold value is a cycle count that is used to reset the
 *	pause counter. As with simple pause filtering, VMRUN loads the pause
 *	count value from VMCB into an internal counter. Then, on each pause
 *	instruction the hardware checks the elapsed number of cycles since
 *	the most recent pause instruction against the pause filter threshold.
 *	If the elapsed cycle count is greater than the pause filter threshold,
 *	then the internal pause count is reloaded from the VMCB and execution
 *	continues. If the elapsed cycle count is less than the pause filter
 *	threshold, then the internal pause count is decremented. If the count
 *	value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
 *	triggered. If advanced pause filtering is supported and pause filter
 *	threshold field is set to zero, the filter will operate in the simpler,
 *	count only mode.
 */

static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
module_param(pause_filter_thresh, ushort, 0444);

static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
module_param(pause_filter_count, ushort, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(pause_filter_count_grow, ushort, 0444);

/* Default resets per-vcpu window every exit to pause_filter_count. */
static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(pause_filter_count_shrink, ushort, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
module_param(pause_filter_count_max, ushort, 0444);

/* allow nested paging (virtualized MMU) for all guests */
static int npt = true;
module_param(npt, int, S_IRUGO);

/* allow nested virtualization in KVM/SVM */
static int nested = true;
module_param(nested, int, S_IRUGO);

/* enable/disable Next RIP Save */
static int nrips = true;
module_param(nrips, int, 0444);

/* enable/disable Virtual VMLOAD VMSAVE */
static int vls = true;
module_param(vls, int, 0444);

/* enable/disable Virtual GIF */
static int vgif = true;
module_param(vgif, int, 0444);

/* enable/disable SEV support */
static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
module_param(sev, int, 0444);

static bool __read_mostly dump_invalid_vmcb = 0;
module_param(dump_invalid_vmcb, bool, 0644);

static u8 rsm_ins_bytes[] = "\x0f\xaa";

static void svm_complete_interrupts(struct vcpu_svm *svm);

static unsigned long iopm_base;

struct kvm_ldttss_desc {
	u16 limit0;
	u16 base0;
	unsigned base1:8, type:5, dpl:2, p:1;
	unsigned limit1:4, zero0:3, g:1, base2:8;
	u32 base3;
	u32 zero1;
} __attribute__((packed));

DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);

static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};

#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)

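/*
 * Convert an MSR number into an offset (in u32 units) into the MSR
 * permission map.  Each MSR occupies two bits (read/write), so one u32
 * covers 16 MSRs and each 2K byte range of the bitmap covers 8192 MSRs.
 * For example, MSR_STAR (0xc0000081) lies in the second range and maps
 * to u32 offset 512 + 0x81 / 16 = 520.  MSR_INVALID is returned for
 * MSRs outside the three architectural ranges.
 */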
u32 svm_msrpm_offset(u32 msr)
{
	u32 offset;
	int i;

	for (i = 0; i < NUM_MSR_MAPS; i++) {
		if (msr < msrpm_ranges[i] ||
		    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
			continue;

		offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
		offset += (i * MSRS_RANGE_SIZE);       /* add range offset */

		/* Now we have the u8 offset - but need the u32 offset */
		return offset / 4;
	}

	/* MSR not in any range */
	return MSR_INVALID;
}

#define MAX_INST_SIZE 15

static inline void clgi(void)
{
	asm volatile (__ex("clgi"));
}

static inline void stgi(void)
{
	asm volatile (__ex("stgi"));
}

static inline void invlpga(unsigned long addr, u32 asid)
{
	asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr));
}

static int get_max_npt_level(void)
{
#ifdef CONFIG_X86_64
	return PT64_ROOT_4LEVEL;
#else
	return PT32E_ROOT_LEVEL;
#endif
}

void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	vcpu->arch.efer = efer;

	if (!npt_enabled) {
		/* Shadow paging assumes NX to be available.  */
		efer |= EFER_NX;

		if (!(efer & EFER_LMA))
			efer &= ~EFER_LME;
	}

	if (!(efer & EFER_SVME)) {
		svm_leave_nested(svm);
		svm_set_gif(svm, true);
	}

	svm->vmcb->save.efer = efer | EFER_SVME;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
}

static int is_external_interrupt(u32 info)
{
	info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
	return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
}

static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ret = 0;

	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
		ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
	return ret;
}

static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (mask == 0)
		svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
	else
		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;

}

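/*
 * Advance the guest RIP past the instruction that triggered the exit.
 * Prefer the next_rip value provided by hardware when NRIPS is
 * available; otherwise fall back to the instruction emulator with
 * EMULTYPE_SKIP.  The interrupt shadow is cleared in either case.
 */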
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (nrips && svm->vmcb->control.next_rip != 0) {
		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
		svm->next_rip = svm->vmcb->control.next_rip;
	}

	if (!svm->next_rip) {
		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
			return 0;
	} else {
		kvm_rip_write(vcpu, svm->next_rip);
	}
	svm_set_interrupt_shadow(vcpu, 0);

	return 1;
}

static void svm_queue_exception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned nr = vcpu->arch.exception.nr;
	bool has_error_code = vcpu->arch.exception.has_error_code;
	u32 error_code = vcpu->arch.exception.error_code;

	kvm_deliver_exception_payload(&svm->vcpu);

	if (nr == BP_VECTOR && !nrips) {
		unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);

		/*
		 * For guest debugging where we have to reinject #BP if some
		 * INT3 is guest-owned:
		 * Emulate nRIP by moving RIP forward. Will fail if injection
		 * raises a fault that is not intercepted. Still better than
		 * failing in all cases.
		 */
		(void)skip_emulated_instruction(&svm->vcpu);
		rip = kvm_rip_read(&svm->vcpu);
		svm->int3_rip = rip + svm->vmcb->save.cs.base;
		svm->int3_injected = rip - old_rip;
	}

	svm->vmcb->control.event_inj = nr
		| SVM_EVTINJ_VALID
		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
		| SVM_EVTINJ_TYPE_EXEPT;
	svm->vmcb->control.event_inj_err = error_code;
}

static void svm_init_erratum_383(void)
{
	u32 low, high;
	int err;
	u64 val;

	if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
		return;

	/* Use _safe variants to not break nested virtualization */
	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
	if (err)
		return;

	val |= (1ULL << 47);

	low  = lower_32_bits(val);
	high = upper_32_bits(val);

	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);

	erratum_383_found = true;
}

static void svm_init_osvw(struct kvm_vcpu *vcpu)
{
	/*
	 * Guests should see errata 400 and 415 as fixed (assuming that
	 * HLT and IO instructions are intercepted).
	 */
	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
	vcpu->arch.osvw.status = osvw_status & ~(6ULL);

	/*
	 * By increasing VCPU's osvw.length to 3 we are telling the guest that
	 * all osvw.status bits inside that length, including bit 0 (which is
	 * reserved for erratum 298), are valid. However, if host processor's
	 * osvw_len is 0 then osvw_status[0] carries no information. We need to
	 * be conservative here and therefore we tell the guest that erratum 298
	 * is present (because we really don't know).
	 */
	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
		vcpu->arch.osvw.status |= 1;
}

static int has_svm(void)
{
	const char *msg;

	if (!cpu_has_svm(&msg)) {
		printk(KERN_INFO "has_svm: %s\n", msg);
		return 0;
	}

	return 1;
}

static void svm_hardware_disable(void)
{
	/* Make sure we clean up behind us */
	if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
		wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);

	cpu_svm_disable();

	amd_pmu_disable_virt();
}

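/*
 * Enable SVM on the current CPU: set EFER.SVME, point MSR_VM_HSAVE_PA
 * at this CPU's host save area and (re)initialize the per-CPU ASID
 * bookkeeping.  OSVW state is merged across CPUs so that mixed-revision
 * systems report the worst case to the guest.
 */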
static int svm_hardware_enable(void)
{

	struct svm_cpu_data *sd;
	uint64_t efer;
	struct desc_struct *gdt;
	int me = raw_smp_processor_id();

	rdmsrl(MSR_EFER, efer);
	if (efer & EFER_SVME)
		return -EBUSY;

	if (!has_svm()) {
		pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
		return -EINVAL;
	}
	sd = per_cpu(svm_data, me);
	if (!sd) {
		pr_err("%s: svm_data is NULL on %d\n", __func__, me);
		return -EINVAL;
	}

	sd->asid_generation = 1;
	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
	sd->next_asid = sd->max_asid + 1;
	sd->min_asid = max_sev_asid + 1;

	gdt = get_current_gdt_rw();
	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);

	wrmsrl(MSR_EFER, efer | EFER_SVME);

	wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);

	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
		wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
		__this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
	}


	/*
	 * Get OSVW bits.
	 *
	 * Note that it is possible to have a system with mixed processor
	 * revisions and therefore different OSVW bits. If bits are not the same
	 * on different processors then choose the worst case (i.e. if erratum
	 * is present on one processor and not on another then assume that the
	 * erratum is present everywhere).
	 */
	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
		uint64_t len, status = 0;
		int err;

		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
		if (!err)
			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
						      &err);

		if (err)
			osvw_status = osvw_len = 0;
		else {
			if (len < osvw_len)
				osvw_len = len;
			osvw_status |= status;
			osvw_status &= (1ULL << osvw_len) - 1;
		}
	} else
		osvw_status = osvw_len = 0;

	svm_init_erratum_383();

	amd_pmu_enable_virt();

	return 0;
}

511 512
static void svm_cpu_uninit(int cpu)
{
513
	struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
514

515
	if (!sd)
516 517 518
		return;

	per_cpu(svm_data, raw_smp_processor_id()) = NULL;
519
	kfree(sd->sev_vmcbs);
520 521
	__free_page(sd->save_area);
	kfree(sd);
522 523
}

A
Avi Kivity 已提交
524 525
static int svm_cpu_init(int cpu)
{
526
	struct svm_cpu_data *sd;
A
Avi Kivity 已提交
527

528 529
	sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
	if (!sd)
A
Avi Kivity 已提交
530
		return -ENOMEM;
531
	sd->cpu = cpu;
532
	sd->save_area = alloc_page(GFP_KERNEL);
533
	if (!sd->save_area)
534
		goto free_cpu_data;
A
Avi Kivity 已提交
535

536
	if (svm_sev_enabled()) {
537 538 539
		sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1,
					      sizeof(void *),
					      GFP_KERNEL);
540
		if (!sd->sev_vmcbs)
541
			goto free_save_area;
542 543
	}

544
	per_cpu(svm_data, cpu) = sd;
A
Avi Kivity 已提交
545 546 547

	return 0;

548 549 550
free_save_area:
	__free_page(sd->save_area);
free_cpu_data:
551
	kfree(sd);
552
	return -ENOMEM;
A
Avi Kivity 已提交
553 554 555

}

556 557 558 559 560 561 562 563 564 565 566
static bool valid_msr_intercept(u32 index)
{
	int i;

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
		if (direct_access_msrs[i].index == index)
			return true;

	return false;
}

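/*
 * Report whether writes to @msr are currently intercepted.  The MSR
 * permission map stores two bits per MSR: the even bit controls reads
 * and the odd bit controls writes, with a set bit meaning "intercept".
 */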
static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
{
	u8 bit_write;
	unsigned long tmp;
	u32 offset;
	u32 *msrpm;

	msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
				      to_svm(vcpu)->msrpm;

	offset    = svm_msrpm_offset(msr);
	bit_write = 2 * (msr & 0x0f) + 1;
	tmp       = msrpm[offset];

	BUG_ON(offset == MSR_INVALID);

	return !!test_bit(bit_write,  &tmp);
}

586 587
static void set_msr_interception(u32 *msrpm, unsigned msr,
				 int read, int write)
A
Avi Kivity 已提交
588
{
589 590 591
	u8 bit_read, bit_write;
	unsigned long tmp;
	u32 offset;
A
Avi Kivity 已提交
592

593 594 595 596 597 598
	/*
	 * If this warning triggers extend the direct_access_msrs list at the
	 * beginning of the file
	 */
	WARN_ON(!valid_msr_intercept(msr));

599 600 601 602 603 604 605 606 607 608 609
	offset    = svm_msrpm_offset(msr);
	bit_read  = 2 * (msr & 0x0f);
	bit_write = 2 * (msr & 0x0f) + 1;
	tmp       = msrpm[offset];

	BUG_ON(offset == MSR_INVALID);

	read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
	write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);

	msrpm[offset] = tmp;
A
Avi Kivity 已提交
610 611
}

612
static void svm_vcpu_init_msrpm(u32 *msrpm)
A
Avi Kivity 已提交
613 614 615
{
	int i;

616 617
	memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));

618 619 620 621 622 623
	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		if (!direct_access_msrs[i].always)
			continue;

		set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
	}
624 625
}

626 627 628 629 630 631 632 633
static void add_msr_offset(u32 offset)
{
	int i;

	for (i = 0; i < MSRPM_OFFSETS; ++i) {

		/* Offset already in list? */
		if (msrpm_offsets[i] == offset)
634
			return;
635 636 637 638 639 640 641 642 643

		/* Slot used by another offset? */
		if (msrpm_offsets[i] != MSR_INVALID)
			continue;

		/* Add offset to list */
		msrpm_offsets[i] = offset;

		return;
A
Avi Kivity 已提交
644
	}
645 646 647 648 649

	/*
	 * If this BUG triggers the msrpm_offsets table has an overflow. Just
	 * increase MSRPM_OFFSETS in this case.
	 */
650
	BUG();
A
Avi Kivity 已提交
651 652
}

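/*
 * Pre-compute the MSR permission map offsets for every MSR listed in
 * direct_access_msrs.  The resulting msrpm_offsets[] table is the set
 * of offsets that later needs to be consulted when merging a nested
 * guest's MSR permission map.
 */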
static void init_msrpm_offsets(void)
{
	int i;

	memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		u32 offset;

		offset = svm_msrpm_offset(direct_access_msrs[i].index);
		BUG_ON(offset == MSR_INVALID);

		add_msr_offset(offset);
	}
}

669 670 671 672
static void svm_enable_lbrv(struct vcpu_svm *svm)
{
	u32 *msrpm = svm->msrpm;

673
	svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
674 675 676 677 678 679 680 681 682 683
	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
	set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
}

static void svm_disable_lbrv(struct vcpu_svm *svm)
{
	u32 *msrpm = svm->msrpm;

684
	svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
685 686 687 688 689 690
	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
	set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}

691
void disable_nmi_singlestep(struct vcpu_svm *svm)
692 693
{
	svm->nmi_singlestep = false;
694

695 696 697 698 699 700 701
	if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
		/* Clear our flags if they were not set by the guest */
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
			svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
			svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
	}
702 703
}

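/*
 * Dynamic pause-filter window scaling.  The pause filter count loaded
 * into the VMCB is grown or shrunk within the limits configured by the
 * pause_filter_count_* module parameters; any change is traced and the
 * intercept state is marked dirty so the new value takes effect.
 */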
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	int old = control->pause_filter_count;

	control->pause_filter_count = __grow_ple_window(old,
							pause_filter_count,
							pause_filter_count_grow,
							pause_filter_count_max);

P
716
		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
P
					    control->pause_filter_count, old);
	}
}

static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	int old = control->pause_filter_count;

	control->pause_filter_count =
				__shrink_ple_window(old,
						    pause_filter_count,
						    pause_filter_count_shrink,
						    pause_filter_count);
P
Peter Xu 已提交
733
	if (control->pause_filter_count != old) {
734
		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
P
Peter Xu 已提交
735 736 737
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    control->pause_filter_count, old);
	}
738 739
}

740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777
/*
 * The default MMIO mask is a single bit (excluding the present bit),
 * which could conflict with the memory encryption bit. Check for
 * memory encryption support and override the default MMIO mask if
 * memory encryption is enabled.
 */
static __init void svm_adjust_mmio_mask(void)
{
	unsigned int enc_bit, mask_bit;
	u64 msr, mask;

	/* If there is no memory encryption support, use existing mask */
	if (cpuid_eax(0x80000000) < 0x8000001f)
		return;

	/* If memory encryption is not enabled, use existing mask */
	rdmsrl(MSR_K8_SYSCFG, msr);
	if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
		return;

	enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
	mask_bit = boot_cpu_data.x86_phys_bits;

	/* Increment the mask bit if it is the same as the encryption bit */
	if (enc_bit == mask_bit)
		mask_bit++;

	/*
	 * If the mask bit location is below 52, then some bits above the
	 * physical addressing limit will always be reserved, so use the
	 * rsvd_bits() function to generate the mask. This mask, along with
	 * the present bit, will be used to generate a page fault with
	 * PFER.RSV = 1.
	 *
	 * If the mask bit location is 52 (or above), then clear the mask.
	 */
	mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;

P
Paolo Bonzini 已提交
778
	kvm_mmu_set_mmio_spte_mask(mask, PT_WRITABLE_MASK | PT_USER_MASK);
779 780
}

781 782 783 784
static void svm_hardware_teardown(void)
{
	int cpu;

785 786
	if (svm_sev_enabled())
		sev_hardware_teardown();
787 788 789 790 791 792 793 794

	for_each_possible_cpu(cpu)
		svm_cpu_uninit(cpu);

	__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
	iopm_base = 0;
}

795 796 797 798
static __init void svm_set_cpu_caps(void)
{
	kvm_set_cpu_caps();

799 800
	supported_xss = 0;

801 802
	/* CPUID 0x80000001 and 0x8000000A (SVM features) */
	if (nested) {
803 804
		kvm_cpu_cap_set(X86_FEATURE_SVM);

805
		if (nrips)
806 807 808 809 810 811
			kvm_cpu_cap_set(X86_FEATURE_NRIPS);

		if (npt_enabled)
			kvm_cpu_cap_set(X86_FEATURE_NPT);
	}

812 813 814 815
	/* CPUID 0x80000008 */
	if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
	    boot_cpu_has(X86_FEATURE_AMD_SSBD))
		kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
816 817
}

A
Avi Kivity 已提交
818 819 820 821
static __init int svm_hardware_setup(void)
{
	int cpu;
	struct page *iopm_pages;
822
	void *iopm_va;
A
Avi Kivity 已提交
823 824 825 826 827 828
	int r;

	iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);

	if (!iopm_pages)
		return -ENOMEM;
829 830 831

	iopm_va = page_address(iopm_pages);
	memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
A
Avi Kivity 已提交
832 833
	iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;

834 835
	init_msrpm_offsets();

836 837
	supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);

838 839 840
	if (boot_cpu_has(X86_FEATURE_NX))
		kvm_enable_efer_bits(EFER_NX);

A
Alexander Graf 已提交
841 842 843
	if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
		kvm_enable_efer_bits(EFER_FFXSR);

844 845
	if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
		kvm_has_tsc_control = true;
846 847
		kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
		kvm_tsc_scaling_ratio_frac_bits = 32;
848 849
	}

850 851 852 853 854 855 856 857
	/* Check for pause filtering support */
	if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
		pause_filter_count = 0;
		pause_filter_thresh = 0;
	} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
		pause_filter_thresh = 0;
	}

858 859
	if (nested) {
		printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
860
		kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
861 862
	}

B
Brijesh Singh 已提交
863 864 865 866 867 868 869 870 871 872 873
	if (sev) {
		if (boot_cpu_has(X86_FEATURE_SEV) &&
		    IS_ENABLED(CONFIG_KVM_AMD_SEV)) {
			r = sev_hardware_setup();
			if (r)
				sev = false;
		} else {
			sev = false;
		}
	}

874 875
	svm_adjust_mmio_mask();

Z
Zachary Amsden 已提交
876
	for_each_possible_cpu(cpu) {
A
Avi Kivity 已提交
877 878
		r = svm_cpu_init(cpu);
		if (r)
879
			goto err;
A
Avi Kivity 已提交
880
	}
881

882
	if (!boot_cpu_has(X86_FEATURE_NPT))
883 884
		npt_enabled = false;

885
	if (npt_enabled && !npt)
886 887
		npt_enabled = false;

888
	kvm_configure_mmu(npt_enabled, PG_LEVEL_1G);
889
	pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
890

891 892 893 894 895
	if (nrips) {
		if (!boot_cpu_has(X86_FEATURE_NRIPS))
			nrips = false;
	}

896 897 898
	if (avic) {
		if (!npt_enabled ||
		    !boot_cpu_has(X86_FEATURE_AVIC) ||
899
		    !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
900
			avic = false;
901
		} else {
902
			pr_info("AVIC enabled\n");
903 904 905

			amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
		}
906
	}
907

908 909
	if (vls) {
		if (!npt_enabled ||
910
		    !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
911 912 913 914 915 916 917
		    !IS_ENABLED(CONFIG_X86_64)) {
			vls = false;
		} else {
			pr_info("Virtual VMLOAD VMSAVE supported\n");
		}
	}

918 919 920 921 922 923 924
	if (vgif) {
		if (!boot_cpu_has(X86_FEATURE_VGIF))
			vgif = false;
		else
			pr_info("Virtual GIF supported\n");
	}

925
	svm_set_cpu_caps();
926

927 928 929 930 931 932 933 934 935 936 937 938 939 940 941
	/*
	 * It seems that on AMD processors PTE's accessed bit is
	 * being set by the CPU hardware before the NPF vmexit.
	 * This is not expected behaviour and our tests fail because
	 * of it.
	 * A workaround here is to disable support for
	 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
	 * In this case userspace can know if there is support using
	 * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
	 * it
	 * If future AMD CPU models change the behaviour described above,
	 * this variable can be changed accordingly
	 */
	allow_smaller_maxphyaddr = !npt_enabled;

A
Avi Kivity 已提交
942 943
	return 0;

944
err:
945
	svm_hardware_teardown();
A
Avi Kivity 已提交
946 947 948 949 950 951 952
	return r;
}

static void init_seg(struct vmcb_seg *seg)
{
	seg->selector = 0;
	seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
J
Joerg Roedel 已提交
953
		      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
A
Avi Kivity 已提交
954 955 956 957 958 959 960 961 962 963 964 965
	seg->limit = 0xffff;
	seg->base = 0;
}

static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
{
	seg->selector = 0;
	seg->attrib = SVM_SELECTOR_P_MASK | type;
	seg->limit = 0xffff;
	seg->base = 0;
}

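/*
 * Write L1's TSC offset.  While L2 is running the VMCB holds L2's
 * offset, so the existing L1<->L2 delta is preserved on top of the new
 * value and the new L1 offset is stored in the host save area instead.
 */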
static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 g_tsc_offset = 0;

	if (is_guest_mode(vcpu)) {
		/* Write L1's TSC offset.  */
		g_tsc_offset = svm->vmcb->control.tsc_offset -
			       svm->nested.hsave->control.tsc_offset;
		svm->nested.hsave->control.tsc_offset = offset;
	}

	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
				   svm->vmcb->control.tsc_offset - g_tsc_offset,
				   offset);

	svm->vmcb->control.tsc_offset = offset + g_tsc_offset;

	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
	return svm->vmcb->control.tsc_offset;
}

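/*
 * Establish the initial VMCB state for a vCPU: the CR/DR/exception and
 * instruction intercepts, the I/O and MSR permission map addresses, and
 * the architectural register values defined for reset.
 */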
static void init_vmcb(struct vcpu_svm *svm)
{
	struct vmcb_control_area *control = &svm->vmcb->control;
	struct vmcb_save_area *save = &svm->vmcb->save;
A
Avi Kivity 已提交
992

993
	svm->vcpu.arch.hflags = 0;
994

995 996 997 998 999 1000
	set_cr_intercept(svm, INTERCEPT_CR0_READ);
	set_cr_intercept(svm, INTERCEPT_CR3_READ);
	set_cr_intercept(svm, INTERCEPT_CR4_READ);
	set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
	set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
	set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
1001 1002
	if (!kvm_vcpu_apicv_active(&svm->vcpu))
		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
A
Avi Kivity 已提交
1003

1004
	set_dr_intercepts(svm);
A
Avi Kivity 已提交
1005

1006 1007 1008
	set_exception_intercept(svm, PF_VECTOR);
	set_exception_intercept(svm, UD_VECTOR);
	set_exception_intercept(svm, MC_VECTOR);
1009
	set_exception_intercept(svm, AC_VECTOR);
1010
	set_exception_intercept(svm, DB_VECTOR);
1011 1012 1013 1014 1015 1016 1017 1018
	/*
	 * Guest access to VMware backdoor ports could legitimately
	 * trigger #GP because of TSS I/O permission bitmap.
	 * We intercept those #GP and allow access to them anyway
	 * as VMware does.
	 */
	if (enable_vmware_backdoor)
		set_exception_intercept(svm, GP_VECTOR);
A
Avi Kivity 已提交
1019

1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043
	svm_set_intercept(svm, INTERCEPT_INTR);
	svm_set_intercept(svm, INTERCEPT_NMI);
	svm_set_intercept(svm, INTERCEPT_SMI);
	svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
	svm_set_intercept(svm, INTERCEPT_RDPMC);
	svm_set_intercept(svm, INTERCEPT_CPUID);
	svm_set_intercept(svm, INTERCEPT_INVD);
	svm_set_intercept(svm, INTERCEPT_INVLPG);
	svm_set_intercept(svm, INTERCEPT_INVLPGA);
	svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
	svm_set_intercept(svm, INTERCEPT_MSR_PROT);
	svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
	svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
	svm_set_intercept(svm, INTERCEPT_VMRUN);
	svm_set_intercept(svm, INTERCEPT_VMMCALL);
	svm_set_intercept(svm, INTERCEPT_VMLOAD);
	svm_set_intercept(svm, INTERCEPT_VMSAVE);
	svm_set_intercept(svm, INTERCEPT_STGI);
	svm_set_intercept(svm, INTERCEPT_CLGI);
	svm_set_intercept(svm, INTERCEPT_SKINIT);
	svm_set_intercept(svm, INTERCEPT_WBINVD);
	svm_set_intercept(svm, INTERCEPT_XSETBV);
	svm_set_intercept(svm, INTERCEPT_RDPRU);
	svm_set_intercept(svm, INTERCEPT_RSM);
A
Avi Kivity 已提交
1044

1045
	if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
1046 1047
		svm_set_intercept(svm, INTERCEPT_MONITOR);
		svm_set_intercept(svm, INTERCEPT_MWAIT);
1048 1049
	}

1050
	if (!kvm_hlt_in_guest(svm->vcpu.kvm))
1051
		svm_set_intercept(svm, INTERCEPT_HLT);
1052

1053 1054
	control->iopm_base_pa = __sme_set(iopm_base);
	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
A
Avi Kivity 已提交
1055 1056 1057 1058 1059 1060 1061 1062 1063
	control->int_ctl = V_INTR_MASKING_MASK;

	init_seg(&save->es);
	init_seg(&save->ss);
	init_seg(&save->ds);
	init_seg(&save->fs);
	init_seg(&save->gs);

	save->cs.selector = 0xf000;
1064
	save->cs.base = 0xffff0000;
A
Avi Kivity 已提交
1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075
	/* Executable/Readable Code Segment */
	save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
		SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
	save->cs.limit = 0xffff;

	save->gdtr.limit = 0xffff;
	save->idtr.limit = 0xffff;

	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);

P
Paolo Bonzini 已提交
1076
	svm_set_efer(&svm->vcpu, 0);
M
Mike Day 已提交
1077
	save->dr6 = 0xffff0ff0;
1078
	kvm_set_rflags(&svm->vcpu, 2);
A
Avi Kivity 已提交
1079
	save->rip = 0x0000fff0;
1080
	svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
A
Avi Kivity 已提交
1081

J
Joerg Roedel 已提交
1082
	/*
1083
	 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
1084
	 * It also updates the guest-visible cr0 value.
A
Avi Kivity 已提交
1085
	 */
1086
	svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
1087
	kvm_mmu_reset_context(&svm->vcpu);
1088

1089
	save->cr4 = X86_CR4_PAE;
A
Avi Kivity 已提交
1090
	/* rdx = ?? */
1091 1092 1093

	if (npt_enabled) {
		/* Setup VMCB for Nested Paging */
1094
		control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
1095
		svm_clr_intercept(svm, INTERCEPT_INVLPG);
1096
		clr_exception_intercept(svm, PF_VECTOR);
1097 1098
		clr_cr_intercept(svm, INTERCEPT_CR3_READ);
		clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1099
		save->g_pat = svm->vcpu.arch.pat;
1100 1101 1102
		save->cr3 = 0;
		save->cr4 = 0;
	}
1103
	svm->asid_generation = 0;
1104

1105
	svm->nested.vmcb = 0;
1106 1107
	svm->vcpu.arch.hflags = 0;

1108 1109 1110 1111
	if (pause_filter_count) {
		control->pause_filter_count = pause_filter_count;
		if (pause_filter_thresh)
			control->pause_filter_thresh = pause_filter_thresh;
1112
		svm_set_intercept(svm, INTERCEPT_PAUSE);
1113
	} else {
1114
		svm_clr_intercept(svm, INTERCEPT_PAUSE);
1115 1116
	}

1117
	if (kvm_vcpu_apicv_active(&svm->vcpu))
1118 1119
		avic_init_vmcb(svm);

1120 1121 1122 1123 1124
	/*
	 * If hardware supports Virtual VMLOAD VMSAVE then enable it
	 * in VMCB and clear intercepts to avoid #VMEXIT.
	 */
	if (vls) {
1125 1126
		svm_clr_intercept(svm, INTERCEPT_VMLOAD);
		svm_clr_intercept(svm, INTERCEPT_VMSAVE);
1127 1128 1129
		svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
	}

1130
	if (vgif) {
1131 1132
		svm_clr_intercept(svm, INTERCEPT_STGI);
		svm_clr_intercept(svm, INTERCEPT_CLGI);
1133 1134 1135
		svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
	}

1136
	if (sev_guest(svm->vcpu.kvm)) {
B
Brijesh Singh 已提交
1137
		svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
1138 1139
		clr_exception_intercept(svm, UD_VECTOR);
	}
B
Brijesh Singh 已提交
1140

1141
	vmcb_mark_all_dirty(svm->vmcb);
1142

1143
	enable_gif(svm);
1144 1145 1146

}

1147
static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1148 1149
{
	struct vcpu_svm *svm = to_svm(vcpu);
1150 1151
	u32 dummy;
	u32 eax = 1;
1152

1153
	svm->spec_ctrl = 0;
1154
	svm->virt_spec_ctrl = 0;
1155

1156 1157 1158 1159 1160 1161
	if (!init_event) {
		svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
					   MSR_IA32_APICBASE_ENABLE;
		if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
			svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
	}
P
Paolo Bonzini 已提交
1162
	init_vmcb(svm);
A
Avi Kivity 已提交
1163

1164
	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
1165
	kvm_rdx_write(vcpu, eax);
1166 1167 1168

	if (kvm_vcpu_apicv_active(vcpu) && !init_event)
		avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
1169 1170
}

1171
static int svm_create_vcpu(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
1172
{
1173
	struct vcpu_svm *svm;
A
Avi Kivity 已提交
1174
	struct page *page;
1175
	struct page *msrpm_pages;
A
Alexander Graf 已提交
1176
	struct page *hsave_page;
A
Alexander Graf 已提交
1177
	struct page *nested_msrpm_pages;
R
Rusty Russell 已提交
1178
	int err;
A
Avi Kivity 已提交
1179

1180 1181
	BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
	svm = to_svm(vcpu);
R
Rusty Russell 已提交
1182

1183
	err = -ENOMEM;
1184
	page = alloc_page(GFP_KERNEL_ACCOUNT);
1185
	if (!page)
1186
		goto out;
A
Avi Kivity 已提交
1187

1188
	msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
1189
	if (!msrpm_pages)
1190
		goto free_page1;
A
Alexander Graf 已提交
1191

1192
	nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
A
Alexander Graf 已提交
1193
	if (!nested_msrpm_pages)
1194
		goto free_page2;
1195

1196
	hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
A
Alexander Graf 已提交
1197
	if (!hsave_page)
1198 1199
		goto free_page3;

1200 1201 1202
	err = avic_init_vcpu(svm);
	if (err)
		goto free_page4;
1203

1204 1205 1206
	/* We initialize this flag to true to make sure that the is_running
	 * bit would be set the first time the vcpu is loaded.
	 */
1207 1208
	if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
		svm->avic_is_running = true;
1209

1210
	svm->nested.hsave = page_address(hsave_page);
1211
	clear_page(svm->nested.hsave);
A
Alexander Graf 已提交
1212

1213 1214 1215
	svm->msrpm = page_address(msrpm_pages);
	svm_vcpu_init_msrpm(svm->msrpm);

1216
	svm->nested.msrpm = page_address(nested_msrpm_pages);
1217
	svm_vcpu_init_msrpm(svm->nested.msrpm);
A
Alexander Graf 已提交
1218

1219 1220
	svm->vmcb = page_address(page);
	clear_page(svm->vmcb);
1221
	svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
1222
	svm->asid_generation = 0;
P
Paolo Bonzini 已提交
1223
	init_vmcb(svm);
A
Avi Kivity 已提交
1224

1225
	svm_init_osvw(vcpu);
1226
	vcpu->arch.microcode_version = 0x01000065;
1227

1228
	return 0;
1229

1230 1231
free_page4:
	__free_page(hsave_page);
1232 1233 1234 1235 1236 1237
free_page3:
	__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
free_page2:
	__free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
free_page1:
	__free_page(page);
1238
out:
1239
	return err;
A
Avi Kivity 已提交
1240 1241
}

1242 1243 1244 1245 1246 1247 1248 1249
static void svm_clear_current_vmcb(struct vmcb *vmcb)
{
	int i;

	for_each_online_cpu(i)
		cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
}

A
Avi Kivity 已提交
1250 1251
static void svm_free_vcpu(struct kvm_vcpu *vcpu)
{
1252 1253
	struct vcpu_svm *svm = to_svm(vcpu);

1254 1255 1256 1257 1258 1259 1260
	/*
	 * The vmcb page can be recycled, causing a false negative in
	 * svm_vcpu_load(). So, ensure that no logical CPU has this
	 * vmcb page recorded as its current vmcb.
	 */
	svm_clear_current_vmcb(svm->vmcb);

1261
	__free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
1262
	__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
1263 1264
	__free_page(virt_to_page(svm->nested.hsave));
	__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
A
Avi Kivity 已提交
1265 1266
}

1267
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
A
Avi Kivity 已提交
1268
{
1269
	struct vcpu_svm *svm = to_svm(vcpu);
A
Ashok Raj 已提交
1270
	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
1271
	int i;
1272 1273

	if (unlikely(cpu != vcpu->cpu)) {
1274
		svm->asid_generation = 0;
1275
		vmcb_mark_all_dirty(svm->vmcb);
1276
	}
1277

1278 1279 1280
#ifdef CONFIG_X86_64
	rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
#endif
1281 1282 1283 1284
	savesegment(fs, svm->host.fs);
	savesegment(gs, svm->host.gs);
	svm->host.ldt = kvm_read_ldt();

1285
	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1286
		rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1287

1288 1289 1290 1291 1292 1293
	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
		u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
		if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
			__this_cpu_write(current_tsc_ratio, tsc_ratio);
			wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
		}
1294
	}
P
Paolo Bonzini 已提交
1295 1296 1297
	/* This assumes that the kernel never uses MSR_TSC_AUX */
	if (static_cpu_has(X86_FEATURE_RDTSCP))
		wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
1298

A
Ashok Raj 已提交
1299 1300 1301 1302
	if (sd->current_vmcb != svm->vmcb) {
		sd->current_vmcb = svm->vmcb;
		indirect_branch_prediction_barrier();
	}
1303
	avic_vcpu_load(vcpu, cpu);
A
Avi Kivity 已提交
1304 1305 1306 1307
}

static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
1308
	struct vcpu_svm *svm = to_svm(vcpu);
1309 1310
	int i;

1311 1312
	avic_vcpu_put(vcpu);

1313
	++vcpu->stat.host_state_reload;
1314 1315 1316
	kvm_load_ldt(svm->host.ldt);
#ifdef CONFIG_X86_64
	loadsegment(fs, svm->host.fs);
1317
	wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
1318
	load_gs_index(svm->host.gs);
1319
#else
1320
#ifdef CONFIG_X86_32_LAZY_GS
1321
	loadsegment(gs, svm->host.gs);
1322
#endif
1323
#endif
1324
	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1325
		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
A
Avi Kivity 已提交
1326 1327 1328 1329
}

static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long rflags = svm->vmcb->save.rflags;

	if (svm->nmi_singlestep) {
		/* Hide our flags if they were not set by the guest */
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
			rflags &= ~X86_EFLAGS_TF;
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
			rflags &= ~X86_EFLAGS_RF;
	}
	return rflags;
A
Avi Kivity 已提交
1341 1342 1343 1344
}

static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
1345 1346 1347
	if (to_svm(vcpu)->nmi_singlestep)
		rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);

P
Paolo Bonzini 已提交
1348
       /*
A
Andrea Gelmini 已提交
1349
        * Any change of EFLAGS.VM is accompanied by a reload of SS
P
Paolo Bonzini 已提交
1350 1351 1352
        * (caused by either a task switch or an inter-privilege IRET),
        * so we do not need to update the CPL here.
        */
1353
	to_svm(vcpu)->vmcb->save.rflags = rflags;
A
Avi Kivity 已提交
1354 1355
}

A
Avi Kivity 已提交
1356 1357 1358 1359 1360
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
	switch (reg) {
	case VCPU_EXREG_PDPTR:
		BUG_ON(!npt_enabled);
1361
		load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
A
Avi Kivity 已提交
1362 1363
		break;
	default:
1364
		WARN_ON_ONCE(1);
A
Avi Kivity 已提交
1365 1366 1367
	}
}

1368
static void svm_set_vintr(struct vcpu_svm *svm)
1369 1370 1371 1372 1373
{
	struct vmcb_control_area *control;

	/* The following fields are ignored when AVIC is enabled */
	WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu));
1374
	svm_set_intercept(svm, INTERCEPT_VINTR);
1375 1376 1377 1378 1379 1380 1381 1382 1383 1384

	/*
	 * This is just a dummy VINTR to actually cause a vmexit to happen.
	 * Actual injection of virtual interrupts happens through EVENTINJ.
	 */
	control = &svm->vmcb->control;
	control->int_vector = 0x0;
	control->int_ctl &= ~V_INTR_PRIO_MASK;
	control->int_ctl |= V_IRQ_MASK |
		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1385
	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1386 1387
}

1388 1389
static void svm_clear_vintr(struct vcpu_svm *svm)
{
1390
	const u32 mask = V_TPR_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK | V_INTR_MASKING_MASK;
1391
	svm_clr_intercept(svm, INTERCEPT_VINTR);
1392

1393 1394 1395
	/* Drop int_ctl fields related to VINTR injection.  */
	svm->vmcb->control.int_ctl &= mask;
	if (is_guest_mode(&svm->vcpu)) {
1396 1397
		svm->nested.hsave->control.int_ctl &= mask;

1398 1399 1400 1401 1402
		WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
			(svm->nested.ctl.int_ctl & V_TPR_MASK));
		svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask;
	}

1403
	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1404 1405
}

A
Avi Kivity 已提交
1406 1407
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
{
1408
	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
A
Avi Kivity 已提交
1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420

	switch (seg) {
	case VCPU_SREG_CS: return &save->cs;
	case VCPU_SREG_DS: return &save->ds;
	case VCPU_SREG_ES: return &save->es;
	case VCPU_SREG_FS: return &save->fs;
	case VCPU_SREG_GS: return &save->gs;
	case VCPU_SREG_SS: return &save->ss;
	case VCPU_SREG_TR: return &save->tr;
	case VCPU_SREG_LDTR: return &save->ldtr;
	}
	BUG();
A
Al Viro 已提交
1421
	return NULL;
A
Avi Kivity 已提交
1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445
}

static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	return s->base;
}

static void svm_get_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	var->base = s->base;
	var->limit = s->limit;
	var->selector = s->selector;
	var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
	var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
	var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
	var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
	var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
	var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1446 1447 1448 1449 1450 1451 1452 1453 1454 1455

	/*
	 * AMD CPUs circa 2014 track the G bit for all segments except CS.
	 * However, the SVM spec states that the G bit is not observed by the
	 * CPU, and some VMware virtual CPUs drop the G bit for all segments.
	 * So let's synthesize a legal G bit for all segments, this helps
	 * running KVM nested. It also helps cross-vendor migration, because
	 * Intel's vmentry has a check on the 'G' bit.
	 */
	var->g = s->limit > 0xfffff;
1456

J
Joerg Roedel 已提交
1457 1458
	/*
	 * AMD's VMCB does not have an explicit unusable field, so emulate it
1459 1460
	 * for cross vendor migration purposes by "not present"
	 */
1461
	var->unusable = !var->present;
1462

1463 1464 1465 1466 1467 1468
	switch (seg) {
	case VCPU_SREG_TR:
		/*
		 * Work around a bug where the busy flag in the tr selector
		 * isn't exposed
		 */
1469
		var->type |= 0x2;
1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484
		break;
	case VCPU_SREG_DS:
	case VCPU_SREG_ES:
	case VCPU_SREG_FS:
	case VCPU_SREG_GS:
		/*
		 * The accessed bit must always be set in the segment
		 * descriptor cache, although it can be cleared in the
		 * descriptor, the cached bit always remains at 1. Since
		 * Intel has a check on this, set it here to support
		 * cross-vendor migration.
		 */
		if (!var->unusable)
			var->type |= 0x1;
		break;
1485
	case VCPU_SREG_SS:
J
Joerg Roedel 已提交
1486 1487
		/*
		 * On AMD CPUs sometimes the DB bit in the segment
1488 1489 1490 1491 1492 1493
		 * descriptor is left as 1, although the whole segment has
		 * been made unusable. Clear it here to pass an Intel VMX
		 * entry check when cross vendor migrating.
		 */
		if (var->unusable)
			var->db = 0;
1494
		/* This is symmetric with svm_set_segment() */
J
Jan Kiszka 已提交
1495
		var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1496
		break;
1497
	}
A
Avi Kivity 已提交
1498 1499
}

1500 1501 1502 1503 1504 1505 1506
static int svm_get_cpl(struct kvm_vcpu *vcpu)
{
	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;

	return save->cpl;
}

1507
static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
A
Avi Kivity 已提交
1508
{
1509 1510
	struct vcpu_svm *svm = to_svm(vcpu);

1511 1512
	dt->size = svm->vmcb->save.idtr.limit;
	dt->address = svm->vmcb->save.idtr.base;
A
Avi Kivity 已提交
1513 1514
}

1515
static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
A
Avi Kivity 已提交
1516
{
1517 1518
	struct vcpu_svm *svm = to_svm(vcpu);

1519 1520
	svm->vmcb->save.idtr.limit = dt->size;
	svm->vmcb->save.idtr.base = dt->address ;
1521
	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
A
Avi Kivity 已提交
1522 1523
}

1524
static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
A
Avi Kivity 已提交
1525
{
1526 1527
	struct vcpu_svm *svm = to_svm(vcpu);

1528 1529
	dt->size = svm->vmcb->save.gdtr.limit;
	dt->address = svm->vmcb->save.gdtr.base;
A
Avi Kivity 已提交
1530 1531
}

1532
static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
A
Avi Kivity 已提交
1533
{
1534 1535
	struct vcpu_svm *svm = to_svm(vcpu);

1536 1537
	svm->vmcb->save.gdtr.limit = dt->size;
	svm->vmcb->save.gdtr.base = dt->address ;
1538
	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
A
Avi Kivity 已提交
1539 1540
}

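/*
 * Intercept CR0 reads/writes only while the guest-visible CR0 differs
 * from the value held in the VMCB (i.e. while KVM is shadowing the
 * selective CR0 bits); otherwise the CR0 intercepts are dropped.
 */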
static void update_cr0_intercept(struct vcpu_svm *svm)
{
	ulong gcr0 = svm->vcpu.arch.cr0;
	u64 *hcr0 = &svm->vmcb->save.cr0;

1546 1547
	*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
		| (gcr0 & SVM_CR0_SELECTIVE_MASK);
A
Avi Kivity 已提交
1548

1549
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
A
Avi Kivity 已提交
1550

1551
	if (gcr0 == *hcr0) {
1552 1553
		clr_cr_intercept(svm, INTERCEPT_CR0_READ);
		clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
A
Avi Kivity 已提交
1554
	} else {
1555 1556
		set_cr_intercept(svm, INTERCEPT_CR0_READ);
		set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
A
Avi Kivity 已提交
1557 1558 1559
	}
}

1560
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
A
Avi Kivity 已提交
1561
{
1562 1563
	struct vcpu_svm *svm = to_svm(vcpu);

1564
#ifdef CONFIG_X86_64
1565
	if (vcpu->arch.efer & EFER_LME) {
1566
		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1567
			vcpu->arch.efer |= EFER_LMA;
1568
			svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
A
Avi Kivity 已提交
1569 1570
		}

M
Mike Day 已提交
1571
		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1572
			vcpu->arch.efer &= ~EFER_LMA;
1573
			svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
A
Avi Kivity 已提交
1574 1575 1576
		}
	}
#endif
1577
	vcpu->arch.cr0 = cr0;
1578 1579 1580

	if (!npt_enabled)
		cr0 |= X86_CR0_PG | X86_CR0_WP;
1581

1582 1583 1584 1585 1586 1587 1588
	/*
	 * re-enable caching here because the QEMU bios
	 * does not do it - this results in some delay at
	 * reboot
	 */
	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
		cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1589
	svm->vmcb->save.cr0 = cr0;
1590
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
A
Avi Kivity 已提交
1591
	update_cr0_intercept(svm);
A
Avi Kivity 已提交
1592 1593
}

1594
int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
A
Avi Kivity 已提交
1595
{
1596
	unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1597 1598
	unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;

1599 1600 1601
	if (cr4 & X86_CR4_VMXE)
		return 1;

1602
	if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1603
		svm_flush_tlb(vcpu);
1604

1605 1606 1607
	vcpu->arch.cr4 = cr4;
	if (!npt_enabled)
		cr4 |= X86_CR4_PAE;
1608
	cr4 |= host_cr4_mce;
1609
	to_svm(vcpu)->vmcb->save.cr4 = cr4;
1610
	vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1611
	return 0;
A
Avi Kivity 已提交
1612 1613 1614 1615 1616
}

static void svm_set_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
1617
	struct vcpu_svm *svm = to_svm(vcpu);
A
Avi Kivity 已提交
1618 1619 1620 1621 1622
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	s->base = var->base;
	s->limit = var->limit;
	s->selector = var->selector;
1623 1624 1625 1626 1627 1628 1629 1630
	s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
	s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
	s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
	s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
	s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
	s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
	s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
	s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
P
Paolo Bonzini 已提交
1631 1632 1633 1634 1635 1636 1637 1638

	/*
	 * This is always accurate, except if SYSRET returned to a segment
	 * with SS.DPL != 3.  Intel does not have this quirk, and always
	 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
	 * would entail passing the CPL to userspace and back.
	 */
	if (seg == VCPU_SREG_SS)
1639 1640
		/* This is symmetric with svm_get_segment() */
		svm->vmcb->save.cpl = (var->dpl & 3);
A
Avi Kivity 已提交
1641

1642
	vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
A
Avi Kivity 已提交
1643 1644
}

1645
static void update_exception_bitmap(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
1646
{
J
Jan Kiszka 已提交
1647 1648
	struct vcpu_svm *svm = to_svm(vcpu);

1649
	clr_exception_intercept(svm, BP_VECTOR);
1650

J
Jan Kiszka 已提交
1651 1652
	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1653
			set_exception_intercept(svm, BP_VECTOR);
1654
	}
1655 1656
}

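/*
 * Hand out the next ASID for this vCPU.  When the per-CPU ASID space is
 * exhausted, start a new generation, wrap back to min_asid and request
 * a full TLB flush on the next VMRUN.
 */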
static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
{
	if (sd->next_asid > sd->max_asid) {
		++sd->asid_generation;
		sd->next_asid = sd->min_asid;
		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
	}

1665 1666
	svm->asid_generation = sd->asid_generation;
	svm->vmcb->control.asid = sd->next_asid++;
1667

1668
	vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
A
Avi Kivity 已提交
1669 1670
}

1671
static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
J
Jan Kiszka 已提交
1672
{
1673
	struct vmcb *vmcb = svm->vmcb;
J
Jan Kiszka 已提交
1674

1675 1676
	if (unlikely(value != vmcb->save.dr6)) {
		vmcb->save.dr6 = value;
1677
		vmcb_mark_dirty(vmcb, VMCB_DR);
1678
	}
J
Jan Kiszka 已提交
1679 1680
}

1681 1682 1683 1684 1685 1686 1687 1688
static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	get_debugreg(vcpu->arch.db[0], 0);
	get_debugreg(vcpu->arch.db[1], 1);
	get_debugreg(vcpu->arch.db[2], 2);
	get_debugreg(vcpu->arch.db[3], 3);
1689 1690 1691 1692
	/*
	 * We cannot reset svm->vmcb->save.dr6 to DR6_FIXED_1|DR6_RTM here,
	 * because db_interception might need it.  We can do it before vmentry.
	 */
1693
	vcpu->arch.dr6 = svm->vmcb->save.dr6;
1694 1695 1696 1697 1698
	vcpu->arch.dr7 = svm->vmcb->save.dr7;
	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
	set_dr_intercepts(svm);
}

1699
static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
A
Avi Kivity 已提交
1700
{
1701 1702
	struct vcpu_svm *svm = to_svm(vcpu);

1703
	svm->vmcb->save.dr7 = value;
1704
	vmcb_mark_dirty(svm->vmcb, VMCB_DR);
A
Avi Kivity 已提交
1705 1706
}

A
Avi Kivity 已提交
1707
static int pf_interception(struct vcpu_svm *svm)
A
Avi Kivity 已提交
1708
{
1709
	u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
1710
	u64 error_code = svm->vmcb->control.exit_info_1;
A
Avi Kivity 已提交
1711

1712
	return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
1713 1714
			static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
			svm->vmcb->control.insn_bytes : NULL,
1715 1716 1717 1718 1719
			svm->vmcb->control.insn_len);
}

static int npf_interception(struct vcpu_svm *svm)
{
1720
	u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
1721 1722 1723 1724
	u64 error_code = svm->vmcb->control.exit_info_1;

	trace_kvm_page_fault(fault_address, error_code);
	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
1725 1726
			static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
			svm->vmcb->control.insn_bytes : NULL,
1727
			svm->vmcb->control.insn_len);
A
Avi Kivity 已提交
1728 1729
}

static int db_interception(struct vcpu_svm *svm)
{
	struct kvm_run *kvm_run = svm->vcpu.run;
	struct kvm_vcpu *vcpu = &svm->vcpu;

	if (!(svm->vcpu.guest_debug &
	      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
		!svm->nmi_singlestep) {
		u32 payload = (svm->vmcb->save.dr6 ^ DR6_RTM) & ~DR6_FIXED_1;
		kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload);
		return 1;
	}

	if (svm->nmi_singlestep) {
		disable_nmi_singlestep(svm);
		/* Make sure we check for pending NMIs upon entry */
		kvm_make_request(KVM_REQ_EVENT, vcpu);
	}

	if (svm->vcpu.guest_debug &
	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
		kvm_run->exit_reason = KVM_EXIT_DEBUG;
		kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
		kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
		kvm_run->debug.arch.pc =
			svm->vmcb->save.cs.base + svm->vmcb->save.rip;
		kvm_run->debug.arch.exception = DB_VECTOR;
		return 0;
	}

	return 1;
}

static int bp_interception(struct vcpu_svm *svm)
{
	struct kvm_run *kvm_run = svm->vcpu.run;

	kvm_run->exit_reason = KVM_EXIT_DEBUG;
	kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
	kvm_run->debug.arch.exception = BP_VECTOR;
	return 0;
}

static int ud_interception(struct vcpu_svm *svm)
{
	return handle_ud(&svm->vcpu);
}

static int ac_interception(struct vcpu_svm *svm)
{
	kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
	return 1;
}

static int gp_interception(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	u32 error_code = svm->vmcb->control.exit_info_1;

	WARN_ON_ONCE(!enable_vmware_backdoor);

	/*
	 * VMware backdoor emulation on #GP interception only handles IN{S},
	 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
	 */
	if (error_code) {
		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
		return 1;
	}
	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
}

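/*
 * Detect AMD erratum 383 by checking MC0_STATUS for its known signature.
 * If it matches, clear the MCi_STATUS/MCG_STATUS state and flush the TLB
 * to evict any multi-match entries left behind by the erratum.
 */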
static bool is_erratum_383(void)
{
	int err, i;
	u64 value;

	if (!erratum_383_found)
		return false;

	value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
	if (err)
		return false;

	/* Bit 62 may or may not be set for this mce */
	value &= ~(1ULL << 62);

	if (value != 0xb600000000010015ULL)
		return false;

	/* Clear MCi_STATUS registers */
	for (i = 0; i < 6; ++i)
		native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);

	value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
	if (!err) {
		u32 low, high;

		value &= ~(1ULL << 2);
		low    = lower_32_bits(value);
		high   = upper_32_bits(value);

		native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
	}

	/* Flush tlb to evict multi-match entries */
	__flush_tlb_all();

	return true;
}

/*
 * Trigger machine check on the host. We assume all the MSRs are already set up
 * by the CPU and that we still run on the same CPU as the MCE occurred on.
 * We pass a fake environment to the machine check handler because we want
 * the guest to be always treated like user space, no matter what context
 * it used internally.
 */
static void kvm_machine_check(void)
{
#if defined(CONFIG_X86_MCE)
	struct pt_regs regs = {
		.cs = 3, /* Fake ring 3 no matter what the guest ran on */
		.flags = X86_EFLAGS_IF,
	};

	do_machine_check(&regs);
#endif
}

static void svm_handle_mce(struct vcpu_svm *svm)
{
	if (is_erratum_383()) {
		/*
		 * Erratum 383 triggered. Guest state is corrupt so kill the
		 * guest.
		 */
		pr_err("KVM: Guest triggered AMD Erratum 383\n");

		kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);

		return;
	}

	/*
	 * On an #MC intercept the MCE handler is not called automatically in
	 * the host. So do it by hand here.
	 */
	kvm_machine_check();
}

static int mc_interception(struct vcpu_svm *svm)
{
	return 1;
}

static int shutdown_interception(struct vcpu_svm *svm)
{
	struct kvm_run *kvm_run = svm->vcpu.run;

	/*
	 * VMCB is undefined after a SHUTDOWN intercept
	 * so reinitialize it.
	 */
	clear_page(svm->vmcb);
	init_vmcb(svm);

	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
	return 0;
}

static int io_interception(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
	int size, in, string;
	unsigned port;

	++svm->vcpu.stat.io_exits;
	string = (io_info & SVM_IOIO_STR_MASK) != 0;
	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
	if (string)
		return kvm_emulate_instruction(vcpu, 0);

	port = io_info >> 16;
	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
	svm->next_rip = svm->vmcb->control.exit_info_2;

	return kvm_fast_pio(&svm->vcpu, size, port, in);
}

static int nmi_interception(struct vcpu_svm *svm)
{
	return 1;
}

static int intr_interception(struct vcpu_svm *svm)
{
	++svm->vcpu.stat.irq_exits;
	return 1;
}

static int nop_on_interception(struct vcpu_svm *svm)
{
	return 1;
}

static int halt_interception(struct vcpu_svm *svm)
{
	return kvm_emulate_halt(&svm->vcpu);
}

static int vmmcall_interception(struct vcpu_svm *svm)
{
	return kvm_emulate_hypercall(&svm->vcpu);
}

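/*
 * VMLOAD/VMSAVE intercepts: map the guest VMCB page (its GPA is in RAX) and
 * let nested_svm_vmloadsave() copy the affected register state between that
 * VMCB and the VMCB currently in use.
 */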
static int vmload_interception(struct vcpu_svm *svm)
{
	struct vmcb *nested_vmcb;
	struct kvm_host_map map;
	int ret;

	if (nested_svm_check_permissions(svm))
		return 1;

	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
	if (ret) {
		if (ret == -EINVAL)
			kvm_inject_gp(&svm->vcpu, 0);
		return 1;
	}

	nested_vmcb = map.hva;

	ret = kvm_skip_emulated_instruction(&svm->vcpu);

	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
	kvm_vcpu_unmap(&svm->vcpu, &map, true);

	return ret;
}

static int vmsave_interception(struct vcpu_svm *svm)
{
	struct vmcb *nested_vmcb;
	struct kvm_host_map map;
	int ret;

	if (nested_svm_check_permissions(svm))
		return 1;

	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
	if (ret) {
		if (ret == -EINVAL)
			kvm_inject_gp(&svm->vcpu, 0);
		return 1;
	}

	nested_vmcb = map.hva;

	ret = kvm_skip_emulated_instruction(&svm->vcpu);

	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
	kvm_vcpu_unmap(&svm->vcpu, &map, true);

	return ret;
}

static int vmrun_interception(struct vcpu_svm *svm)
{
	if (nested_svm_check_permissions(svm))
		return 1;

	return nested_svm_vmrun(svm);
}

void svm_set_gif(struct vcpu_svm *svm, bool value)
{
	if (value) {
		/*
		 * If VGIF is enabled, the STGI intercept is only added to
		 * detect the opening of the SMI/NMI window; remove it now.
		 * Likewise, clear the VINTR intercept, we will set it
		 * again while processing KVM_REQ_EVENT if needed.
		 */
		if (vgif_enabled(svm))
			svm_clr_intercept(svm, INTERCEPT_STGI);
		if (svm_is_intercept(svm, INTERCEPT_VINTR))
			svm_clear_vintr(svm);

		enable_gif(svm);
		if (svm->vcpu.arch.smi_pending ||
		    svm->vcpu.arch.nmi_pending ||
		    kvm_cpu_has_injectable_intr(&svm->vcpu))
			kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
	} else {
		disable_gif(svm);

		/*
		 * After a CLGI no interrupts should come.  But if vGIF is
		 * in use, we still rely on the VINTR intercept (rather than
		 * STGI) to detect an open interrupt window.
		 */
		if (!vgif_enabled(svm))
			svm_clear_vintr(svm);
	}
}

static int stgi_interception(struct vcpu_svm *svm)
{
	int ret;

	if (nested_svm_check_permissions(svm))
		return 1;

	ret = kvm_skip_emulated_instruction(&svm->vcpu);
	svm_set_gif(svm, true);
	return ret;
}

static int clgi_interception(struct vcpu_svm *svm)
{
	int ret;

	if (nested_svm_check_permissions(svm))
		return 1;

	ret = kvm_skip_emulated_instruction(&svm->vcpu);
	svm_set_gif(svm, false);
	return ret;
}

static int invlpga_interception(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;

	trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu),
			  kvm_rax_read(&svm->vcpu));

	/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
	kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu));

	return kvm_skip_emulated_instruction(&svm->vcpu);
}

static int skinit_interception(struct vcpu_svm *svm)
{
	trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu));

	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
	return 1;
}

static int wbinvd_interception(struct vcpu_svm *svm)
{
	return kvm_emulate_wbinvd(&svm->vcpu);
}

static int xsetbv_interception(struct vcpu_svm *svm)
{
	u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
	u32 index = kvm_rcx_read(&svm->vcpu);

	if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
		return kvm_skip_emulated_instruction(&svm->vcpu);
	}

	return 1;
}

static int rdpru_interception(struct vcpu_svm *svm)
{
	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
	return 1;
}

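/*
 * Task-switch intercept: recover the TSS selector and the reason for the
 * switch (IRET, far JMP, IDT gate or CALL) from exit_info and the injected
 * event, pick up any pending error code, then defer to kvm_task_switch().
 */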
static int task_switch_interception(struct vcpu_svm *svm)
{
	u16 tss_selector;
	int reason;
	int int_type = svm->vmcb->control.exit_int_info &
		SVM_EXITINTINFO_TYPE_MASK;
	int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
	uint32_t type =
		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
	uint32_t idt_v =
		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
	bool has_error_code = false;
	u32 error_code = 0;

	tss_selector = (u16)svm->vmcb->control.exit_info_1;

	if (svm->vmcb->control.exit_info_2 &
	    (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
		reason = TASK_SWITCH_IRET;
	else if (svm->vmcb->control.exit_info_2 &
		 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
		reason = TASK_SWITCH_JMP;
	else if (idt_v)
		reason = TASK_SWITCH_GATE;
	else
		reason = TASK_SWITCH_CALL;

	if (reason == TASK_SWITCH_GATE) {
		switch (type) {
		case SVM_EXITINTINFO_TYPE_NMI:
			svm->vcpu.arch.nmi_injected = false;
			break;
		case SVM_EXITINTINFO_TYPE_EXEPT:
			if (svm->vmcb->control.exit_info_2 &
			    (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
				has_error_code = true;
				error_code =
					(u32)svm->vmcb->control.exit_info_2;
			}
			kvm_clear_exception_queue(&svm->vcpu);
			break;
		case SVM_EXITINTINFO_TYPE_INTR:
			kvm_clear_interrupt_queue(&svm->vcpu);
			break;
		default:
			break;
		}
	}

	if (reason != TASK_SWITCH_GATE ||
	    int_type == SVM_EXITINTINFO_TYPE_SOFT ||
	    (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
		if (!skip_emulated_instruction(&svm->vcpu))
			return 0;
	}

	if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
		int_vec = -1;

	return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
			       has_error_code, error_code);
}

static int cpuid_interception(struct vcpu_svm *svm)
{
	return kvm_emulate_cpuid(&svm->vcpu);
}

static int iret_interception(struct vcpu_svm *svm)
{
	++svm->vcpu.stat.nmi_window_exits;
	svm_clr_intercept(svm, INTERCEPT_IRET);
	svm->vcpu.arch.hflags |= HF_IRET_MASK;
	svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
	return 1;
}

static int invlpg_interception(struct vcpu_svm *svm)
{
	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
		return kvm_emulate_instruction(&svm->vcpu, 0);

	kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
	return kvm_skip_emulated_instruction(&svm->vcpu);
}

static int emulate_on_interception(struct vcpu_svm *svm)
{
	return kvm_emulate_instruction(&svm->vcpu, 0);
}

static int rsm_interception(struct vcpu_svm *svm)
{
	return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2);
}

static int rdpmc_interception(struct vcpu_svm *svm)
{
	int err;

	if (!nrips)
		return emulate_on_interception(svm);

	err = kvm_rdpmc(&svm->vcpu);
	return kvm_complete_insn_gp(&svm->vcpu, err);
}

static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
					    unsigned long val)
{
	unsigned long cr0 = svm->vcpu.arch.cr0;
	bool ret = false;
	u64 intercept;

	intercept = svm->nested.ctl.intercept;

	if (!is_guest_mode(&svm->vcpu) ||
	    (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
		return false;

	cr0 &= ~SVM_CR0_SELECTIVE_MASK;
	val &= ~SVM_CR0_SELECTIVE_MASK;

	if (cr0 ^ val) {
		svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
		ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
	}

	return ret;
}

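/*
 * CR access intercept using decode assists: exit_info_1 names the GPR and
 * the exit code identifies the CR and the direction.  Without decode
 * assists, or when the valid bit is clear (LMSW/CLTS style writes), fall
 * back to full instruction emulation.
 */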
#define CR_VALID (1ULL << 63)

static int cr_interception(struct vcpu_svm *svm)
{
	int reg, cr;
	unsigned long val;
	int err;

	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
		return emulate_on_interception(svm);

	if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
		return emulate_on_interception(svm);

	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
	if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
		cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
	else
		cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;

	err = 0;
	if (cr >= 16) { /* mov to cr */
		cr -= 16;
		val = kvm_register_read(&svm->vcpu, reg);
		switch (cr) {
		case 0:
			if (!check_selective_cr0_intercepted(svm, val))
				err = kvm_set_cr0(&svm->vcpu, val);
			else
				return 1;

			break;
		case 3:
			err = kvm_set_cr3(&svm->vcpu, val);
			break;
		case 4:
			err = kvm_set_cr4(&svm->vcpu, val);
			break;
		case 8:
			err = kvm_set_cr8(&svm->vcpu, val);
			break;
		default:
			WARN(1, "unhandled write to CR%d", cr);
			kvm_queue_exception(&svm->vcpu, UD_VECTOR);
			return 1;
		}
	} else { /* mov from cr */
		switch (cr) {
		case 0:
			val = kvm_read_cr0(&svm->vcpu);
			break;
		case 2:
			val = svm->vcpu.arch.cr2;
			break;
		case 3:
			val = kvm_read_cr3(&svm->vcpu);
			break;
		case 4:
			val = kvm_read_cr4(&svm->vcpu);
			break;
		case 8:
			val = kvm_get_cr8(&svm->vcpu);
			break;
		default:
			WARN(1, "unhandled read from CR%d", cr);
			kvm_queue_exception(&svm->vcpu, UD_VECTOR);
			return 1;
		}
		kvm_register_write(&svm->vcpu, reg, val);
	}
	return kvm_complete_insn_gp(&svm->vcpu, err);
}

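/*
 * DR access intercept.  Once userspace stops debugging the guest, drop the
 * DR intercepts and mark the debug registers for lazy reload so the guest
 * can touch them directly until the next exit.
 */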
static int dr_interception(struct vcpu_svm *svm)
{
	int reg, dr;
	unsigned long val;

	if (svm->vcpu.guest_debug == 0) {
		/*
		 * No more DR vmexits; force a reload of the debug registers
		 * and reenter on this instruction.  The next vmexit will
		 * retrieve the full state of the debug registers.
		 */
		clr_dr_intercepts(svm);
		svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
		return 1;
	}

	if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
		return emulate_on_interception(svm);

	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
	dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;

	if (dr >= 16) { /* mov to DRn */
		if (!kvm_require_dr(&svm->vcpu, dr - 16))
			return 1;
		val = kvm_register_read(&svm->vcpu, reg);
		kvm_set_dr(&svm->vcpu, dr - 16, val);
	} else {
		if (!kvm_require_dr(&svm->vcpu, dr))
			return 1;
		kvm_get_dr(&svm->vcpu, dr, &val);
		kvm_register_write(&svm->vcpu, reg, val);
	}

	return kvm_skip_emulated_instruction(&svm->vcpu);
}

static int cr8_write_interception(struct vcpu_svm *svm)
{
	struct kvm_run *kvm_run = svm->vcpu.run;
	int r;

	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
	/* instruction emulation calls kvm_set_cr8() */
	r = cr_interception(svm);
	if (lapic_in_kernel(&svm->vcpu))
		return r;
	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
		return r;
	kvm_run->exit_reason = KVM_EXIT_SET_TPR;
	return 0;
}

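/*
 * Report the host-side value of "feature" MSRs (MSR_F10H_DECFG and the perf
 * capabilities MSR) for the feature-MSR query path; anything else is
 * rejected as invalid.
 */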
static int svm_get_msr_feature(struct kvm_msr_entry *msr)
{
	msr->data = 0;

	switch (msr->index) {
	case MSR_F10H_DECFG:
		if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
			msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
		break;
	case MSR_IA32_PERF_CAPABILITIES:
		return 0;
	default:
		return KVM_MSR_RET_INVALID;
	}

	return 0;
}

2384
static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
A
Avi Kivity 已提交
2385
{
2386 2387
	struct vcpu_svm *svm = to_svm(vcpu);

2388
	switch (msr_info->index) {
B
Brian Gerst 已提交
2389
	case MSR_STAR:
2390
		msr_info->data = svm->vmcb->save.star;
A
Avi Kivity 已提交
2391
		break;
2392
#ifdef CONFIG_X86_64
A
Avi Kivity 已提交
2393
	case MSR_LSTAR:
2394
		msr_info->data = svm->vmcb->save.lstar;
A
Avi Kivity 已提交
2395 2396
		break;
	case MSR_CSTAR:
2397
		msr_info->data = svm->vmcb->save.cstar;
A
Avi Kivity 已提交
2398 2399
		break;
	case MSR_KERNEL_GS_BASE:
2400
		msr_info->data = svm->vmcb->save.kernel_gs_base;
A
Avi Kivity 已提交
2401 2402
		break;
	case MSR_SYSCALL_MASK:
2403
		msr_info->data = svm->vmcb->save.sfmask;
A
Avi Kivity 已提交
2404 2405 2406
		break;
#endif
	case MSR_IA32_SYSENTER_CS:
2407
		msr_info->data = svm->vmcb->save.sysenter_cs;
A
Avi Kivity 已提交
2408 2409
		break;
	case MSR_IA32_SYSENTER_EIP:
2410
		msr_info->data = svm->sysenter_eip;
A
Avi Kivity 已提交
2411 2412
		break;
	case MSR_IA32_SYSENTER_ESP:
2413
		msr_info->data = svm->sysenter_esp;
A
Avi Kivity 已提交
2414
		break;
P
Paolo Bonzini 已提交
2415 2416 2417 2418 2419
	case MSR_TSC_AUX:
		if (!boot_cpu_has(X86_FEATURE_RDTSCP))
			return 1;
		msr_info->data = svm->tsc_aux;
		break;
J
Joerg Roedel 已提交
2420 2421 2422 2423 2424
	/*
	 * Nobody will change the following 5 values in the VMCB so we can
	 * safely return them on rdmsr. They will always be 0 until LBRV is
	 * implemented.
	 */
2425
	case MSR_IA32_DEBUGCTLMSR:
2426
		msr_info->data = svm->vmcb->save.dbgctl;
2427 2428
		break;
	case MSR_IA32_LASTBRANCHFROMIP:
2429
		msr_info->data = svm->vmcb->save.br_from;
2430 2431
		break;
	case MSR_IA32_LASTBRANCHTOIP:
2432
		msr_info->data = svm->vmcb->save.br_to;
2433 2434
		break;
	case MSR_IA32_LASTINTFROMIP:
2435
		msr_info->data = svm->vmcb->save.last_excp_from;
2436 2437
		break;
	case MSR_IA32_LASTINTTOIP:
2438
		msr_info->data = svm->vmcb->save.last_excp_to;
2439
		break;
A
Alexander Graf 已提交
2440
	case MSR_VM_HSAVE_PA:
2441
		msr_info->data = svm->nested.hsave_msr;
A
Alexander Graf 已提交
2442
		break;
2443
	case MSR_VM_CR:
2444
		msr_info->data = svm->nested.vm_cr_msr;
2445
		break;
2446 2447
	case MSR_IA32_SPEC_CTRL:
		if (!msr_info->host_initiated &&
2448 2449
		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
2450 2451
		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
2452 2453 2454 2455
			return 1;

		msr_info->data = svm->spec_ctrl;
		break;
2456 2457 2458 2459 2460 2461 2462
	case MSR_AMD64_VIRT_SPEC_CTRL:
		if (!msr_info->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
			return 1;

		msr_info->data = svm->virt_spec_ctrl;
		break;
2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479
	case MSR_F15H_IC_CFG: {

		int family, model;

		family = guest_cpuid_family(vcpu);
		model  = guest_cpuid_model(vcpu);

		if (family < 0 || model < 0)
			return kvm_get_msr_common(vcpu, msr_info);

		msr_info->data = 0;

		if (family == 0x15 &&
		    (model >= 0x2 && model < 0x20))
			msr_info->data = 0x1E;
		}
		break;
2480 2481 2482
	case MSR_F10H_DECFG:
		msr_info->data = svm->msr_decfg;
		break;
A
Avi Kivity 已提交
2483
	default:
2484
		return kvm_get_msr_common(vcpu, msr_info);
A
Avi Kivity 已提交
2485 2486 2487 2488
	}
	return 0;
}

static int rdmsr_interception(struct vcpu_svm *svm)
{
	return kvm_emulate_rdmsr(&svm->vcpu);
}

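/*
 * Writes to MSR_VM_CR: only the architecturally valid bits may change, and
 * once SVM_DIS is set neither it nor the lock bit can be changed.  Setting
 * SVM_DIS is refused while the guest still has EFER.SVME enabled.
 */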
static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int svm_dis, chg_mask;

	if (data & ~SVM_VM_CR_VALID_MASK)
		return 1;

	chg_mask = SVM_VM_CR_VALID_MASK;

	if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
		chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);

	svm->nested.vm_cr_msr &= ~chg_mask;
	svm->nested.vm_cr_msr |= (data & chg_mask);

	svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;

	/* check for svm_disable while efer.svme is set */
	if (svm_dis && (vcpu->arch.efer & EFER_SVME))
		return 1;

	return 0;
}

2519
static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
A
Avi Kivity 已提交
2520
{
2521 2522
	struct vcpu_svm *svm = to_svm(vcpu);

2523 2524
	u32 ecx = msr->index;
	u64 data = msr->data;
A
Avi Kivity 已提交
2525
	switch (ecx) {
P
Paolo Bonzini 已提交
2526 2527 2528 2529 2530
	case MSR_IA32_CR_PAT:
		if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
			return 1;
		vcpu->arch.pat = data;
		svm->vmcb->save.g_pat = data;
2531
		vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
P
Paolo Bonzini 已提交
2532
		break;
2533 2534
	case MSR_IA32_SPEC_CTRL:
		if (!msr->host_initiated &&
2535 2536
		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
2537 2538
		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
2539 2540
			return 1;

2541
		if (kvm_spec_ctrl_test_value(data))
2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560
			return 1;

		svm->spec_ctrl = data;
		if (!data)
			break;

		/*
		 * For non-nested:
		 * When it's written (to non-zero) for the first time, pass
		 * it through.
		 *
		 * For nested:
		 * The handling of the MSR bitmap for L2 guests is done in
		 * nested_svm_vmrun_msrpm.
		 * We update the L1 MSR bit as well since it will end up
		 * touching the MSR anyway now.
		 */
		set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
		break;
A
Ashok Raj 已提交
2561 2562
	case MSR_IA32_PRED_CMD:
		if (!msr->host_initiated &&
2563
		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
A
Ashok Raj 已提交
2564 2565 2566 2567
			return 1;

		if (data & ~PRED_CMD_IBPB)
			return 1;
2568 2569
		if (!boot_cpu_has(X86_FEATURE_AMD_IBPB))
			return 1;
A
Ashok Raj 已提交
2570 2571 2572 2573 2574 2575
		if (!data)
			break;

		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
		set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
		break;
2576 2577 2578 2579 2580 2581 2582 2583 2584 2585
	case MSR_AMD64_VIRT_SPEC_CTRL:
		if (!msr->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
			return 1;

		if (data & ~SPEC_CTRL_SSBD)
			return 1;

		svm->virt_spec_ctrl = data;
		break;
B
Brian Gerst 已提交
2586
	case MSR_STAR:
2587
		svm->vmcb->save.star = data;
A
Avi Kivity 已提交
2588
		break;
2589
#ifdef CONFIG_X86_64
A
Avi Kivity 已提交
2590
	case MSR_LSTAR:
2591
		svm->vmcb->save.lstar = data;
A
Avi Kivity 已提交
2592 2593
		break;
	case MSR_CSTAR:
2594
		svm->vmcb->save.cstar = data;
A
Avi Kivity 已提交
2595 2596
		break;
	case MSR_KERNEL_GS_BASE:
2597
		svm->vmcb->save.kernel_gs_base = data;
A
Avi Kivity 已提交
2598 2599
		break;
	case MSR_SYSCALL_MASK:
2600
		svm->vmcb->save.sfmask = data;
A
Avi Kivity 已提交
2601 2602 2603
		break;
#endif
	case MSR_IA32_SYSENTER_CS:
2604
		svm->vmcb->save.sysenter_cs = data;
A
Avi Kivity 已提交
2605 2606
		break;
	case MSR_IA32_SYSENTER_EIP:
2607
		svm->sysenter_eip = data;
2608
		svm->vmcb->save.sysenter_eip = data;
A
Avi Kivity 已提交
2609 2610
		break;
	case MSR_IA32_SYSENTER_ESP:
2611
		svm->sysenter_esp = data;
2612
		svm->vmcb->save.sysenter_esp = data;
A
Avi Kivity 已提交
2613
		break;
P
Paolo Bonzini 已提交
2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625
	case MSR_TSC_AUX:
		if (!boot_cpu_has(X86_FEATURE_RDTSCP))
			return 1;

		/*
		 * This is rare, so we update the MSR here instead of using
		 * direct_access_msrs.  Doing that would require a rdmsr in
		 * svm_vcpu_put.
		 */
		svm->tsc_aux = data;
		wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
		break;
2626
	case MSR_IA32_DEBUGCTLMSR:
2627
		if (!boot_cpu_has(X86_FEATURE_LBRV)) {
2628 2629
			vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
				    __func__, data);
2630 2631 2632 2633 2634 2635
			break;
		}
		if (data & DEBUGCTL_RESERVED_BITS)
			return 1;

		svm->vmcb->save.dbgctl = data;
2636
		vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
2637 2638 2639 2640
		if (data & (1ULL<<0))
			svm_enable_lbrv(svm);
		else
			svm_disable_lbrv(svm);
2641
		break;
A
Alexander Graf 已提交
2642
	case MSR_VM_HSAVE_PA:
2643
		svm->nested.hsave_msr = data;
2644
		break;
2645
	case MSR_VM_CR:
2646
		return svm_set_vm_cr(vcpu, data);
2647
	case MSR_VM_IGNNE:
2648
		vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
2649
		break;
2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667
	case MSR_F10H_DECFG: {
		struct kvm_msr_entry msr_entry;

		msr_entry.index = msr->index;
		if (svm_get_msr_feature(&msr_entry))
			return 1;

		/* Check the supported bits */
		if (data & ~msr_entry.data)
			return 1;

		/* Don't allow the guest to change a bit, #GP */
		if (!msr->host_initiated && (data ^ msr_entry.data))
			return 1;

		svm->msr_decfg = data;
		break;
	}
2668 2669 2670
	case MSR_IA32_APICBASE:
		if (kvm_vcpu_apicv_active(vcpu))
			avic_update_vapic_bar(to_svm(vcpu), data);
2671
		/* Fall through */
A
Avi Kivity 已提交
2672
	default:
2673
		return kvm_set_msr_common(vcpu, msr);
A
Avi Kivity 已提交
2674 2675 2676 2677
	}
	return 0;
}

static int wrmsr_interception(struct vcpu_svm *svm)
{
	return kvm_emulate_wrmsr(&svm->vcpu);
}

static int msr_interception(struct vcpu_svm *svm)
{
	if (svm->vmcb->control.exit_info_1)
		return wrmsr_interception(svm);
	else
		return rdmsr_interception(svm);
}

static int interrupt_window_interception(struct vcpu_svm *svm)
{
	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
	svm_clear_vintr(svm);

	/*
	 * For AVIC, the only reason to end up here is ExtINTs.
	 * In this case AVIC was temporarily disabled for
	 * requesting the IRQ window and we have to re-enable it.
	 */
	svm_toggle_avic_for_irq_window(&svm->vcpu, true);

	++svm->vcpu.stat.irq_window_exits;
	return 1;
}

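/*
 * PAUSE intercept: grow the pause-loop-exiting window (when PLE filtering
 * is enabled) and let kvm_vcpu_on_spin() yield to another runnable vCPU.
 */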
static int pause_interception(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	bool in_kernel = (svm_get_cpl(vcpu) == 0);

	if (pause_filter_thresh)
		grow_ple_window(vcpu);

	kvm_vcpu_on_spin(vcpu, in_kernel);
	return 1;
}

static int nop_interception(struct vcpu_svm *svm)
{
	return kvm_skip_emulated_instruction(&(svm->vcpu));
}

static int monitor_interception(struct vcpu_svm *svm)
{
	printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
	return nop_interception(svm);
}

static int mwait_interception(struct vcpu_svm *svm)
{
	printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
	return nop_interception(svm);
}

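/*
 * Dispatch table mapping SVM exit codes to their handlers; handle_exit()
 * indexes it after nested-exit processing, except for the hottest exits
 * which take direct calls when CONFIG_RETPOLINE is enabled.
 */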
static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
2737 2738 2739 2740
	[SVM_EXIT_READ_CR0]			= cr_interception,
	[SVM_EXIT_READ_CR3]			= cr_interception,
	[SVM_EXIT_READ_CR4]			= cr_interception,
	[SVM_EXIT_READ_CR8]			= cr_interception,
2741
	[SVM_EXIT_CR0_SEL_WRITE]		= cr_interception,
2742
	[SVM_EXIT_WRITE_CR0]			= cr_interception,
2743 2744
	[SVM_EXIT_WRITE_CR3]			= cr_interception,
	[SVM_EXIT_WRITE_CR4]			= cr_interception,
J
Joerg Roedel 已提交
2745
	[SVM_EXIT_WRITE_CR8]			= cr8_write_interception,
2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761
	[SVM_EXIT_READ_DR0]			= dr_interception,
	[SVM_EXIT_READ_DR1]			= dr_interception,
	[SVM_EXIT_READ_DR2]			= dr_interception,
	[SVM_EXIT_READ_DR3]			= dr_interception,
	[SVM_EXIT_READ_DR4]			= dr_interception,
	[SVM_EXIT_READ_DR5]			= dr_interception,
	[SVM_EXIT_READ_DR6]			= dr_interception,
	[SVM_EXIT_READ_DR7]			= dr_interception,
	[SVM_EXIT_WRITE_DR0]			= dr_interception,
	[SVM_EXIT_WRITE_DR1]			= dr_interception,
	[SVM_EXIT_WRITE_DR2]			= dr_interception,
	[SVM_EXIT_WRITE_DR3]			= dr_interception,
	[SVM_EXIT_WRITE_DR4]			= dr_interception,
	[SVM_EXIT_WRITE_DR5]			= dr_interception,
	[SVM_EXIT_WRITE_DR6]			= dr_interception,
	[SVM_EXIT_WRITE_DR7]			= dr_interception,
J
Jan Kiszka 已提交
2762 2763
	[SVM_EXIT_EXCP_BASE + DB_VECTOR]	= db_interception,
	[SVM_EXIT_EXCP_BASE + BP_VECTOR]	= bp_interception,
2764
	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
J
Joerg Roedel 已提交
2765 2766
	[SVM_EXIT_EXCP_BASE + PF_VECTOR]	= pf_interception,
	[SVM_EXIT_EXCP_BASE + MC_VECTOR]	= mc_interception,
2767
	[SVM_EXIT_EXCP_BASE + AC_VECTOR]	= ac_interception,
2768
	[SVM_EXIT_EXCP_BASE + GP_VECTOR]	= gp_interception,
J
Joerg Roedel 已提交
2769
	[SVM_EXIT_INTR]				= intr_interception,
2770
	[SVM_EXIT_NMI]				= nmi_interception,
A
Avi Kivity 已提交
2771 2772
	[SVM_EXIT_SMI]				= nop_on_interception,
	[SVM_EXIT_INIT]				= nop_on_interception,
2773
	[SVM_EXIT_VINTR]			= interrupt_window_interception,
A
Avi Kivity 已提交
2774
	[SVM_EXIT_RDPMC]			= rdpmc_interception,
A
Avi Kivity 已提交
2775
	[SVM_EXIT_CPUID]			= cpuid_interception,
2776
	[SVM_EXIT_IRET]                         = iret_interception,
2777
	[SVM_EXIT_INVD]                         = emulate_on_interception,
2778
	[SVM_EXIT_PAUSE]			= pause_interception,
A
Avi Kivity 已提交
2779
	[SVM_EXIT_HLT]				= halt_interception,
M
Marcelo Tosatti 已提交
2780
	[SVM_EXIT_INVLPG]			= invlpg_interception,
A
Alexander Graf 已提交
2781
	[SVM_EXIT_INVLPGA]			= invlpga_interception,
J
Joerg Roedel 已提交
2782
	[SVM_EXIT_IOIO]				= io_interception,
A
Avi Kivity 已提交
2783 2784
	[SVM_EXIT_MSR]				= msr_interception,
	[SVM_EXIT_TASK_SWITCH]			= task_switch_interception,
2785
	[SVM_EXIT_SHUTDOWN]			= shutdown_interception,
A
Alexander Graf 已提交
2786
	[SVM_EXIT_VMRUN]			= vmrun_interception,
2787
	[SVM_EXIT_VMMCALL]			= vmmcall_interception,
2788 2789
	[SVM_EXIT_VMLOAD]			= vmload_interception,
	[SVM_EXIT_VMSAVE]			= vmsave_interception,
2790 2791
	[SVM_EXIT_STGI]				= stgi_interception,
	[SVM_EXIT_CLGI]				= clgi_interception,
2792
	[SVM_EXIT_SKINIT]			= skinit_interception,
D
David Kaplan 已提交
2793
	[SVM_EXIT_WBINVD]                       = wbinvd_interception,
2794 2795
	[SVM_EXIT_MONITOR]			= monitor_interception,
	[SVM_EXIT_MWAIT]			= mwait_interception,
J
Joerg Roedel 已提交
2796
	[SVM_EXIT_XSETBV]			= xsetbv_interception,
J
Jim Mattson 已提交
2797
	[SVM_EXIT_RDPRU]			= rdpru_interception,
2798
	[SVM_EXIT_NPF]				= npf_interception,
B
Brijesh Singh 已提交
2799
	[SVM_EXIT_RSM]                          = rsm_interception,
2800 2801
	[SVM_EXIT_AVIC_INCOMPLETE_IPI]		= avic_incomplete_ipi_interception,
	[SVM_EXIT_AVIC_UNACCELERATED_ACCESS]	= avic_unaccelerated_access_interception,
A
Avi Kivity 已提交
2802 2803
};

static void dump_vmcb(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	struct vmcb_save_area *save = &svm->vmcb->save;

	if (!dump_invalid_vmcb) {
		pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
		return;
	}

	pr_err("VMCB Control Area:\n");
	pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
	pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
	pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
	pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
	pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
	pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
	pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
	pr_err("%-20s%d\n", "pause filter threshold:",
	       control->pause_filter_thresh);
	pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
	pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
	pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
	pr_err("%-20s%d\n", "asid:", control->asid);
	pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
	pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
	pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
	pr_err("%-20s%08x\n", "int_state:", control->int_state);
	pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
	pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
	pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
	pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
	pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
	pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
	pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
	pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
	pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
	pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
	pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
	pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
	pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
	pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
	pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
	pr_err("VMCB State Save Area:\n");
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "es:",
	       save->es.selector, save->es.attrib,
	       save->es.limit, save->es.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "cs:",
	       save->cs.selector, save->cs.attrib,
	       save->cs.limit, save->cs.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "ss:",
	       save->ss.selector, save->ss.attrib,
	       save->ss.limit, save->ss.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "ds:",
	       save->ds.selector, save->ds.attrib,
	       save->ds.limit, save->ds.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "fs:",
	       save->fs.selector, save->fs.attrib,
	       save->fs.limit, save->fs.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "gs:",
	       save->gs.selector, save->gs.attrib,
	       save->gs.limit, save->gs.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "gdtr:",
	       save->gdtr.selector, save->gdtr.attrib,
	       save->gdtr.limit, save->gdtr.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "ldtr:",
	       save->ldtr.selector, save->ldtr.attrib,
	       save->ldtr.limit, save->ldtr.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "idtr:",
	       save->idtr.selector, save->idtr.attrib,
	       save->idtr.limit, save->idtr.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "tr:",
	       save->tr.selector, save->tr.attrib,
	       save->tr.limit, save->tr.base);
	pr_err("cpl:            %d                efer:         %016llx\n",
		save->cpl, save->efer);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "cr0:", save->cr0, "cr2:", save->cr2);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "cr3:", save->cr3, "cr4:", save->cr4);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "dr6:", save->dr6, "dr7:", save->dr7);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "rip:", save->rip, "rflags:", save->rflags);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "rsp:", save->rsp, "rax:", save->rax);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "star:", save->star, "lstar:", save->lstar);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "cstar:", save->cstar, "sfmask:", save->sfmask);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "kernel_gs_base:", save->kernel_gs_base,
	       "sysenter_cs:", save->sysenter_cs);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "sysenter_esp:", save->sysenter_esp,
	       "sysenter_eip:", save->sysenter_eip);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "br_from:", save->br_from, "br_to:", save->br_to);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "excp_from:", save->last_excp_from,
	       "excp_to:", save->last_excp_to);
}

static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
{
	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;

	*info1 = control->exit_info_1;
	*info2 = control->exit_info_2;
}

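/*
 * Top-level VM-exit handler: sync CR0/CR3 back from the VMCB, complete any
 * pending event injection, give a nested hypervisor (L1) the chance to
 * claim the exit, then dispatch to svm_exit_handlers[].
 */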
static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_run *kvm_run = vcpu->run;
	u32 exit_code = svm->vmcb->control.exit_code;

	trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);

	if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
		vcpu->arch.cr0 = svm->vmcb->save.cr0;
	if (npt_enabled)
		vcpu->arch.cr3 = svm->vmcb->save.cr3;

	svm_complete_interrupts(svm);

	if (is_guest_mode(vcpu)) {
		int vmexit;

		trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
					svm->vmcb->control.exit_info_1,
					svm->vmcb->control.exit_info_2,
					svm->vmcb->control.exit_int_info,
					svm->vmcb->control.exit_int_info_err,
					KVM_ISA_SVM);

		vmexit = nested_svm_exit_special(svm);

		if (vmexit == NESTED_EXIT_CONTINUE)
			vmexit = nested_svm_exit_handled(svm);

		if (vmexit == NESTED_EXIT_DONE)
			return 1;
	}

	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		kvm_run->fail_entry.hardware_entry_failure_reason
			= svm->vmcb->control.exit_code;
		kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		dump_vmcb(vcpu);
		return 0;
	}

	if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
	    exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
	    exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
	    exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
		printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
		       "exit_code 0x%x\n",
		       __func__, svm->vmcb->control.exit_int_info,
		       exit_code);

	if (exit_fastpath != EXIT_FASTPATH_NONE)
		return 1;

	if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
	    || !svm_exit_handlers[exit_code]) {
		vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code);
		dump_vmcb(vcpu);
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror =
			KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
		vcpu->run->internal.ndata = 2;
		vcpu->run->internal.data[0] = exit_code;
		vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

#ifdef CONFIG_RETPOLINE
	if (exit_code == SVM_EXIT_MSR)
		return msr_interception(svm);
	else if (exit_code == SVM_EXIT_VINTR)
		return interrupt_window_interception(svm);
	else if (exit_code == SVM_EXIT_INTR)
		return intr_interception(svm);
	else if (exit_code == SVM_EXIT_HLT)
		return halt_interception(svm);
	else if (exit_code == SVM_EXIT_NPF)
		return npf_interception(svm);
#endif
	return svm_exit_handlers[exit_code](svm);
}

static void reload_tss(struct kvm_vcpu *vcpu)
{
	struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);

	sd->tss_desc->type = 9; /* available 32/64-bit TSS */
	load_TR_desc();
}

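/*
 * Per-VMRUN bookkeeping on the current physical CPU: SEV guests take the
 * pre_sev_run() path, everyone else gets a fresh ASID whenever the per-CPU
 * ASID generation has changed.
 */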
static void pre_svm_run(struct vcpu_svm *svm)
{
	struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu);

	if (sev_guest(svm->vcpu.kvm))
		return pre_sev_run(svm, svm->vcpu.cpu);

	/* FIXME: handle wraparound of asid_generation */
	if (svm->asid_generation != sd->asid_generation)
		new_asid(svm, sd);
}

static void svm_inject_nmi(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
	vcpu->arch.hflags |= HF_NMI_MASK;
	svm_set_intercept(svm, INTERCEPT_IRET);
	++vcpu->stat.nmi_injections;
}

static void svm_set_irq(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	BUG_ON(!(gif_set(svm)));

	trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
	++vcpu->stat.irq_injections;

	svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
		SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}

static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (nested_svm_virtualize_tpr(vcpu))
		return;

	clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);

	if (irr == -1)
		return;

	if (tpr >= irr)
		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
}

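/*
 * NMI injection is blocked while GIF is clear, while an interrupt shadow is
 * active, or while a previous NMI is still in service (HF_NMI_MASK).  For a
 * nested guest, an NMI that L1 intercepts is never considered blocked here.
 */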
bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb;
	bool ret;

	if (!gif_set(svm))
		return true;

	if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
		return false;

	ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
	      (svm->vcpu.arch.hflags & HF_NMI_MASK);

	return ret;
}

static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	if (svm->nested.nested_run_pending)
		return -EBUSY;

	/* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
		return -EBUSY;

	return !svm_nmi_blocked(vcpu);
}

J
Jan Kiszka 已提交
3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113
static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
}

static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (masked) {
		svm->vcpu.arch.hflags |= HF_NMI_MASK;
3114
		svm_set_intercept(svm, INTERCEPT_IRET);
J
Jan Kiszka 已提交
3115 3116
	} else {
		svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
3117
		svm_clr_intercept(svm, INTERCEPT_IRET);
J
Jan Kiszka 已提交
3118 3119 3120
	}
}

3121
bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
3122 3123 3124
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb;
3125

3126
	if (!gif_set(svm))
3127
		return true;
3128

3129 3130
	if (is_guest_mode(vcpu)) {
		/* As long as interrupts are being delivered...  */
P
Paolo Bonzini 已提交
3131
		if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
P
Paolo Bonzini 已提交
3132
		    ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF)
3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144
		    : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
			return true;

		/* ... vmexits aren't blocked by the interrupt shadow  */
		if (nested_exit_on_intr(svm))
			return false;
	} else {
		if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
			return true;
	}

	return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
3145 3146
}

3147
static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3148 3149 3150
{
	struct vcpu_svm *svm = to_svm(vcpu);
	if (svm->nested.nested_run_pending)
3151
		return -EBUSY;
3152

3153 3154 3155 3156 3157
	/*
	 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
	 * e.g. if the IRQ arrived asynchronously after checking nested events.
	 */
	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
3158
		return -EBUSY;
3159 3160

	return !svm_interrupt_blocked(vcpu);
3161 3162
}

3163
static void enable_irq_window(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
3164
{
3165 3166
	struct vcpu_svm *svm = to_svm(vcpu);

J
Joerg Roedel 已提交
3167 3168 3169 3170
	/*
	 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
	 * 1, because that's a separate STGI/VMRUN intercept.  The next time we
	 * get that intercept, this function will be called again though and
3171 3172 3173
	 * we'll get the vintr intercept. However, if the vGIF feature is
	 * enabled, the STGI interception will not occur. Enable the irq
	 * window under the assumption that the hardware will set the GIF.
J
Joerg Roedel 已提交
3174
	 */
3175
	if (vgif_enabled(svm) || gif_set(svm)) {
3176 3177 3178 3179 3180 3181 3182
		/*
		 * IRQ window is not needed when AVIC is enabled,
		 * unless we have pending ExtINT since it cannot be injected
		 * via AVIC. In such case, we need to temporarily disable AVIC,
		 * and fallback to injecting IRQ via V_IRQ.
		 */
		svm_toggle_avic_for_irq_window(vcpu, false);
3183 3184
		svm_set_vintr(svm);
	}
3185 3186
}

3187
static void enable_nmi_window(struct kvm_vcpu *vcpu)
3188
{
3189
	struct vcpu_svm *svm = to_svm(vcpu);
3190

3191 3192
	if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
	    == HF_NMI_MASK)
3193
		return; /* IRET will cause a vm exit */
3194

3195 3196
	if (!gif_set(svm)) {
		if (vgif_enabled(svm))
3197
			svm_set_intercept(svm, INTERCEPT_STGI);
3198
		return; /* STGI will cause a vm exit */
3199
	}
3200

J
Joerg Roedel 已提交
3201 3202 3203 3204
	/*
	 * Something prevents NMI from been injected. Single step over possible
	 * problem (IRET or exception injection or interrupt shadow)
	 */
3205
	svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
J
Jan Kiszka 已提交
3206
	svm->nmi_singlestep = true;
3207
	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3208 3209
}

static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
	return 0;
}

static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
{
	return 0;
}

void svm_flush_tlb(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * Flush only the current ASID even if the TLB flush was invoked via
	 * kvm_flush_remote_tlbs().  Although flushing remote TLBs requires all
	 * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
	 * unconditionally does a TLB flush on both nested VM-Enter and nested
	 * VM-Exit (via kvm_mmu_reset_context()).
	 */
	if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
	else
		svm->asid_generation--;
}

static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	invlpga(gva, svm->vmcb->control.asid);
}

static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
{
}

3248 3249 3250 3251
static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

3252
	if (nested_svm_virtualize_tpr(vcpu))
3253 3254
		return;

3255
	if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
3256
		int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3257
		kvm_set_cr8(vcpu, cr8);
3258 3259 3260
	}
}

3261 3262 3263 3264 3265
static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 cr8;

3266
	if (nested_svm_virtualize_tpr(vcpu) ||
3267
	    kvm_vcpu_apicv_active(vcpu))
3268 3269
		return;

3270 3271 3272 3273 3274
	cr8 = kvm_get_cr8(vcpu);
	svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
	svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
}

3275 3276 3277 3278 3279
static void svm_complete_interrupts(struct vcpu_svm *svm)
{
	u8 vector;
	int type;
	u32 exitintinfo = svm->vmcb->control.exit_int_info;
3280 3281 3282
	unsigned int3_injected = svm->int3_injected;

	svm->int3_injected = 0;
3283

3284 3285 3286 3287 3288 3289
	/*
	 * If we've made progress since setting HF_IRET_MASK, we've
	 * executed an IRET and can allow NMI injection.
	 */
	if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
	    && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
3290
		svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3291 3292
		kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
	}
3293

3294 3295 3296 3297 3298 3299 3300
	svm->vcpu.arch.nmi_injected = false;
	kvm_clear_exception_queue(&svm->vcpu);
	kvm_clear_interrupt_queue(&svm->vcpu);

	if (!(exitintinfo & SVM_EXITINTINFO_VALID))
		return;

3301 3302
	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);

3303 3304 3305 3306 3307 3308 3309 3310
	vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
	type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;

	switch (type) {
	case SVM_EXITINTINFO_TYPE_NMI:
		svm->vcpu.arch.nmi_injected = true;
		break;
	case SVM_EXITINTINFO_TYPE_EXEPT:
3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321
		/*
		 * In case of software exceptions, do not reinject the vector,
		 * but re-execute the instruction instead. Rewind RIP first
		 * if we emulated INT3 before.
		 */
		if (kvm_exception_is_soft(vector)) {
			if (vector == BP_VECTOR && int3_injected &&
			    kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
				kvm_rip_write(&svm->vcpu,
					      kvm_rip_read(&svm->vcpu) -
					      int3_injected);
3322
			break;
3323
		}
3324 3325
		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
			u32 err = svm->vmcb->control.exit_int_info_err;
3326
			kvm_requeue_exception_e(&svm->vcpu, vector, err);
3327 3328

		} else
3329
			kvm_requeue_exception(&svm->vcpu, vector);
3330 3331
		break;
	case SVM_EXITINTINFO_TYPE_INTR:
3332
		kvm_queue_interrupt(&svm->vcpu, vector, false);
3333 3334 3335 3336 3337 3338
		break;
	default:
		break;
	}
}

static void svm_cancel_injection(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;

	control->exit_int_info = control->event_inj;
	control->exit_int_info_err = control->event_inj_err;
	control->event_inj = 0;
	svm_complete_interrupts(svm);
}

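/*
 * Fastpath run with interrupts still disabled right after #VMEXIT: only
 * WRMSR exits from a non-nested guest are considered here, everything else
 * goes through the full handle_exit() path.
 */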
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
	if (!is_guest_mode(vcpu) &&
	    to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
	    to_svm(vcpu)->vmcb->control.exit_info_1)
		return handle_fastpath_set_msr_irqoff(vcpu);

	return EXIT_FASTPATH_NONE;
}

void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);

static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
					struct vcpu_svm *svm)
{
	/*
	 * VMENTER enables interrupts (host state), but the kernel state is
	 * interrupts disabled when this is invoked. Also tell RCU about
	 * it. This is the same logic as for exit_to_user_mode().
	 *
	 * This ensures that e.g. latency analysis on the host observes
	 * guest mode as interrupt enabled.
	 *
	 * guest_enter_irqoff() informs context tracking about the
	 * transition to guest mode and if enabled adjusts RCU state
	 * accordingly.
	 */
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	instrumentation_end();

	guest_enter_irqoff();
	lockdep_hardirqs_on(CALLER_ADDR0);

	__svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);

#ifdef CONFIG_X86_64
	native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
#else
	loadsegment(fs, svm->host.fs);
#ifndef CONFIG_X86_32_LAZY_GS
	loadsegment(gs, svm->host.gs);
#endif
#endif

	/*
	 * VMEXIT disables interrupts (host state), but tracing and lockdep
	 * have them in state 'on' as recorded before entering guest mode.
	 * Same as enter_from_user_mode().
	 *
	 * guest_exit_irqoff() restores host context and reinstates RCU if
	 * enabled and required.
	 *
	 * This needs to be done before the below as native_read_msr()
	 * contains a tracepoint and x86_spec_ctrl_restore_host() calls
	 * into world and some more.
	 */
	lockdep_hardirqs_off(CALLER_ADDR0);
	guest_exit_irqoff();

	instrumentation_begin();
	trace_hardirqs_off_finish();
	instrumentation_end();
}

3416
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
A
Avi Kivity 已提交
3417
{
3418
	fastpath_t exit_fastpath;
3419
	struct vcpu_svm *svm = to_svm(vcpu);
3420

	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];

3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440
	/*
	 * Disable singlestep if we're injecting an interrupt/exception.
	 * We don't want our modified rflags to be pushed on the stack where
	 * we might not be able to easily reset them if we disabled NMI
	 * singlestep later.
	 */
	if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
		/*
		 * Event injection happens before external interrupts cause a
		 * vmexit and interrupts are disabled here, so smp_send_reschedule
		 * is enough to force an immediate vmexit.
		 */
		disable_nmi_singlestep(svm);
		smp_send_reschedule(vcpu->cpu);
	}

	pre_svm_run(svm);

	sync_lapic_to_cr8(vcpu);

	svm->vmcb->save.cr2 = vcpu->arch.cr2;

	/*
	 * Run with all-zero DR6 unless needed, so that we can get the exact cause
	 * of a #DB.
	 */
	if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
		svm_set_dr6(svm, vcpu->arch.dr6);
	else
		svm_set_dr6(svm, DR6_FIXED_1 | DR6_RTM);

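	/*
	 * Clear GIF so that host interrupts and NMIs cannot be taken between
	 * here and VMRUN, nor between #VMEXIT and the stgi() below.
	 */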
	clgi();
	kvm_load_guest_xsave_state(vcpu);

	if (lapic_in_kernel(vcpu) &&
		vcpu->arch.apic->lapic_timer.timer_advance_ns)
		kvm_wait_lapic_expire(vcpu);

	/*
	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
	 * it's non-zero. Since vmentry is serialising on affected CPUs, there
	 * is no need to worry about the conditional branch over the wrmsr
	 * being speculatively taken.
	 */
	x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);

	svm_vcpu_enter_exit(vcpu, svm);

	/*
	 * We do not use IBRS in the kernel. If this vCPU has used the
	 * SPEC_CTRL MSR it may have left it on; save the value and
	 * turn it off. This is much more efficient than blindly adding
	 * it to the atomic save/restore list. Especially as the former
	 * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
	 *
	 * For non-nested case:
	 * If the L01 MSR bitmap does not intercept the MSR, then we need to
	 * save it.
	 *
	 * For nested case:
	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
	 * save it.
	 */
	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
		svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);

	reload_tss(vcpu);

	x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);

	vcpu->arch.cr2 = svm->vmcb->save.cr2;
	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
	vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;

	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
		kvm_before_interrupt(&svm->vcpu);

	kvm_load_host_xsave_state(vcpu);
	stgi();

	/* Any pending NMI will happen here */
	exit_fastpath = svm_exit_handlers_fastpath(vcpu);

	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
		kvm_after_interrupt(&svm->vcpu);

	sync_cr8_to_lapic(vcpu);

	svm->next_rip = 0;
	if (is_guest_mode(&svm->vcpu)) {
		sync_nested_vmcb_control(svm);
		svm->nested.nested_run_pending = 0;
	}

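	/*
	 * Any TLB flush requested for this run was consumed by VMRUN; reset
	 * tlb_ctl so that later entries do not keep flushing the ASID.
	 */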
	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;

	/* if exit due to PF check for async PF */
	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
		svm->vcpu.arch.apf.host_apf_flags =
			kvm_read_and_reset_apf_flags();

	if (npt_enabled) {
		vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
		vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
	}

	/*
	 * We need to handle MC intercepts here before the vcpu has a chance to
	 * change the physical cpu
	 */
	if (unlikely(svm->vmcb->control.exit_code ==
		     SVM_EXIT_EXCP_BASE + MC_VECTOR))
		svm_handle_mce(svm);

	vmcb_mark_all_clean(svm->vmcb);
	return exit_fastpath;
}

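/*
 * Install a new MMU root. With NPT the root is the nested page table and
 * goes into nested_cr3, while the guest's own CR3 is copied to save.cr3;
 * without NPT the (shadow) root is loaded into save.cr3 directly.
 */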
static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root,
			     int root_level)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long cr3;

	cr3 = __sme_set(root);
	if (npt_enabled) {
		svm->vmcb->control.nested_cr3 = cr3;
		vmcb_mark_dirty(svm->vmcb, VMCB_NPT);

		/* Loading L2's CR3 is handled by enter_svm_guest_mode.  */
		if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
			return;
		cr3 = vcpu->arch.cr3;
	}

	svm->vmcb->save.cr3 = cr3;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
}

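/* SVM is unusable if the BIOS has set SVMDIS (SVM disable) in MSR_VM_CR. */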
static int is_disabled(void)
{
	u64 vm_cr;

	rdmsrl(MSR_VM_CR, vm_cr);
	if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
		return 1;

	return 0;
}

static void
svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	/*
	 * Patch in the VMMCALL instruction:
	 */
	hypercall[0] = 0x0f;
	hypercall[1] = 0x01;
	hypercall[2] = 0xd9;
}

static int __init svm_check_processor_compat(void)
{
	return 0;
}

static bool svm_cpu_has_accelerated_tpr(void)
{
	return false;
}

static bool svm_has_emulated_msr(u32 index)
{
	switch (index) {
	case MSR_IA32_MCG_EXT_CTL:
	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
		return false;
	default:
		break;
	}

	return true;
}

static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
	return 0;
}

static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
				    boot_cpu_has(X86_FEATURE_XSAVE) &&
				    boot_cpu_has(X86_FEATURE_XSAVES);

	/* Update nrips enabled cache */
	svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
			     guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);

	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	/*
	 * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
	 * is exposed to the guest, disable AVIC.
	 */
	if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
		kvm_request_apicv_update(vcpu->kvm, false,
					 APICV_INHIBIT_REASON_X2APIC);

	/*
	 * Currently, AVIC does not work with nested virtualization.
	 * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
	 */
	if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
		kvm_request_apicv_update(vcpu->kvm, false,
					 APICV_INHIBIT_REASON_NESTED);
}

static bool svm_has_wbinvd_exit(void)
{
	return true;
}

#define PRE_EX(exit)  { .exit_code = (exit), \
			.stage = X86_ICPT_PRE_EXCEPT, }
#define POST_EX(exit) { .exit_code = (exit), \
			.stage = X86_ICPT_POST_EXCEPT, }
#define POST_MEM(exit) { .exit_code = (exit), \
			.stage = X86_ICPT_POST_MEMACCESS, }

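/*
 * Map of emulator intercept ids to the SVM exit code and the emulation stage
 * at which the intercept check has to be performed. Used by
 * svm_check_intercept() when instructions are emulated on behalf of L2.
 */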
static const struct __x86_intercept {
	u32 exit_code;
	enum x86_intercept_stage stage;
} x86_intercept_map[] = {
	[x86_intercept_cr_read]		= POST_EX(SVM_EXIT_READ_CR0),
	[x86_intercept_cr_write]	= POST_EX(SVM_EXIT_WRITE_CR0),
	[x86_intercept_clts]		= POST_EX(SVM_EXIT_WRITE_CR0),
	[x86_intercept_lmsw]		= POST_EX(SVM_EXIT_WRITE_CR0),
	[x86_intercept_smsw]		= POST_EX(SVM_EXIT_READ_CR0),
	[x86_intercept_dr_read]		= POST_EX(SVM_EXIT_READ_DR0),
	[x86_intercept_dr_write]	= POST_EX(SVM_EXIT_WRITE_DR0),
	[x86_intercept_sldt]		= POST_EX(SVM_EXIT_LDTR_READ),
	[x86_intercept_str]		= POST_EX(SVM_EXIT_TR_READ),
	[x86_intercept_lldt]		= POST_EX(SVM_EXIT_LDTR_WRITE),
	[x86_intercept_ltr]		= POST_EX(SVM_EXIT_TR_WRITE),
	[x86_intercept_sgdt]		= POST_EX(SVM_EXIT_GDTR_READ),
	[x86_intercept_sidt]		= POST_EX(SVM_EXIT_IDTR_READ),
	[x86_intercept_lgdt]		= POST_EX(SVM_EXIT_GDTR_WRITE),
	[x86_intercept_lidt]		= POST_EX(SVM_EXIT_IDTR_WRITE),
	[x86_intercept_vmrun]		= POST_EX(SVM_EXIT_VMRUN),
	[x86_intercept_vmmcall]		= POST_EX(SVM_EXIT_VMMCALL),
	[x86_intercept_vmload]		= POST_EX(SVM_EXIT_VMLOAD),
	[x86_intercept_vmsave]		= POST_EX(SVM_EXIT_VMSAVE),
	[x86_intercept_stgi]		= POST_EX(SVM_EXIT_STGI),
	[x86_intercept_clgi]		= POST_EX(SVM_EXIT_CLGI),
	[x86_intercept_skinit]		= POST_EX(SVM_EXIT_SKINIT),
	[x86_intercept_invlpga]		= POST_EX(SVM_EXIT_INVLPGA),
	[x86_intercept_rdtscp]		= POST_EX(SVM_EXIT_RDTSCP),
	[x86_intercept_monitor]		= POST_MEM(SVM_EXIT_MONITOR),
	[x86_intercept_mwait]		= POST_EX(SVM_EXIT_MWAIT),
	[x86_intercept_invlpg]		= POST_EX(SVM_EXIT_INVLPG),
	[x86_intercept_invd]		= POST_EX(SVM_EXIT_INVD),
	[x86_intercept_wbinvd]		= POST_EX(SVM_EXIT_WBINVD),
	[x86_intercept_wrmsr]		= POST_EX(SVM_EXIT_MSR),
	[x86_intercept_rdtsc]		= POST_EX(SVM_EXIT_RDTSC),
	[x86_intercept_rdmsr]		= POST_EX(SVM_EXIT_MSR),
	[x86_intercept_rdpmc]		= POST_EX(SVM_EXIT_RDPMC),
	[x86_intercept_cpuid]		= PRE_EX(SVM_EXIT_CPUID),
	[x86_intercept_rsm]		= PRE_EX(SVM_EXIT_RSM),
	[x86_intercept_pause]		= PRE_EX(SVM_EXIT_PAUSE),
	[x86_intercept_pushf]		= PRE_EX(SVM_EXIT_PUSHF),
	[x86_intercept_popf]		= PRE_EX(SVM_EXIT_POPF),
	[x86_intercept_intn]		= PRE_EX(SVM_EXIT_SWINT),
	[x86_intercept_iret]		= PRE_EX(SVM_EXIT_IRET),
	[x86_intercept_icebp]		= PRE_EX(SVM_EXIT_ICEBP),
	[x86_intercept_hlt]		= POST_EX(SVM_EXIT_HLT),
	[x86_intercept_in]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_ins]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_out]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_outs]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_xsetbv]		= PRE_EX(SVM_EXIT_XSETBV),
};

#undef PRE_EX
#undef POST_EX
#undef POST_MEM

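/*
 * Called by the instruction emulator while emulating for a nested guest:
 * translate the emulator intercept into the equivalent SVM exit code and ask
 * nested_svm_exit_handled() whether L1 wants to intercept it.
 */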
static int svm_check_intercept(struct kvm_vcpu *vcpu,
			       struct x86_instruction_info *info,
			       enum x86_intercept_stage stage,
			       struct x86_exception *exception)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int vmexit, ret = X86EMUL_CONTINUE;
	struct __x86_intercept icpt_info;
	struct vmcb *vmcb = svm->vmcb;

	if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
		goto out;

	icpt_info = x86_intercept_map[info->intercept];

	if (stage != icpt_info.stage)
		goto out;

	switch (icpt_info.exit_code) {
	case SVM_EXIT_READ_CR0:
		if (info->intercept == x86_intercept_cr_read)
			icpt_info.exit_code += info->modrm_reg;
		break;
	case SVM_EXIT_WRITE_CR0: {
		unsigned long cr0, val;
		u64 intercept;

		if (info->intercept == x86_intercept_cr_write)
			icpt_info.exit_code += info->modrm_reg;

		if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
		    info->intercept == x86_intercept_clts)
			break;

		intercept = svm->nested.ctl.intercept;

		if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
			break;

		cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
		val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;

		if (info->intercept == x86_intercept_lmsw) {
			cr0 &= 0xfUL;
			val &= 0xfUL;
			/* lmsw can't clear PE - catch this here */
			if (cr0 & X86_CR0_PE)
				val |= X86_CR0_PE;
		}

		if (cr0 ^ val)
			icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;

		break;
	}
	case SVM_EXIT_READ_DR0:
	case SVM_EXIT_WRITE_DR0:
		icpt_info.exit_code += info->modrm_reg;
		break;
	case SVM_EXIT_MSR:
		if (info->intercept == x86_intercept_wrmsr)
			vmcb->control.exit_info_1 = 1;
		else
			vmcb->control.exit_info_1 = 0;
		break;
	case SVM_EXIT_PAUSE:
		/*
		 * We only get this intercept for NOP, but PAUSE is
		 * REP NOP, so check for the REP prefix here.
		 */
		if (info->rep_prefix != REPE_PREFIX)
			goto out;
		break;
	case SVM_EXIT_IOIO: {
		u64 exit_info;
		u32 bytes;

		if (info->intercept == x86_intercept_in ||
		    info->intercept == x86_intercept_ins) {
			exit_info = ((info->src_val & 0xffff) << 16) |
				SVM_IOIO_TYPE_MASK;
			bytes = info->dst_bytes;
		} else {
			exit_info = (info->dst_val & 0xffff) << 16;
			bytes = info->src_bytes;
		}

		if (info->intercept == x86_intercept_outs ||
		    info->intercept == x86_intercept_ins)
			exit_info |= SVM_IOIO_STR_MASK;

		if (info->rep_prefix)
			exit_info |= SVM_IOIO_REP_MASK;

		bytes = min(bytes, 4u);

		exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;

		exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);

		vmcb->control.exit_info_1 = exit_info;
		vmcb->control.exit_info_2 = info->next_rip;

		break;
	}
	default:
		break;
	}

	/* TODO: Advertise NRIPS to guest hypervisor unconditionally */
	if (static_cpu_has(X86_FEATURE_NRIPS))
		vmcb->control.next_rip  = info->next_rip;
	vmcb->control.exit_code = icpt_info.exit_code;
	vmexit = nested_svm_exit_handled(svm);

	ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
					   : X86EMUL_CONTINUE;

out:
	return ret;
}

static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
}

static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
	if (pause_filter_thresh)
		shrink_ple_window(vcpu);
}

static void svm_setup_mce(struct kvm_vcpu *vcpu)
{
	/* [63:9] are reserved. */
	vcpu->arch.mcg_cap &= 0x1ff;
}

bool svm_smi_blocked(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/* Per APM Vol.2 15.22.2 "Response to SMI" */
	if (!gif_set(svm))
		return true;

	return is_smm(vcpu);
}

static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	if (svm->nested.nested_run_pending)
		return -EBUSY;

	/* An SMI must not be injected into L2 if it's supposed to VM-Exit.  */
	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
		return -EBUSY;

	return !svm_smi_blocked(vcpu);
}

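/*
 * On SMM entry with the vCPU in guest mode, record the nested state in the
 * SMM state-save area (offsets 0x7ed8/0x7ee0) and force a #VMEXIT so that
 * the SMM handler runs in the L1 context.
 */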
static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int ret;

	if (is_guest_mode(vcpu)) {
		/* FED8h - SVM Guest */
		put_smstate(u64, smstate, 0x7ed8, 1);
		/* FEE0h - SVM Guest VMCB Physical Address */
		put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);

		svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
		svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
		svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];

		ret = nested_svm_vmexit(svm);
		if (ret)
			return ret;
	}
	return 0;
}

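/*
 * On RSM, restore the nested state saved by svm_pre_enter_smm(): if the
 * state-save area says we were in guest mode, map vmcb12 again and re-enter
 * the nested guest.
 */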
static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *nested_vmcb;
	struct kvm_host_map map;
	u64 guest;
	u64 vmcb;
	int ret = 0;

	guest = GET_SMSTATE(u64, smstate, 0x7ed8);
	vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);

	if (guest) {
		if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL)
			return 1;
		nested_vmcb = map.hva;
		ret = enter_svm_guest_mode(svm, vmcb, nested_vmcb);
		kvm_vcpu_unmap(&svm->vcpu, &map, true);
	}

	return ret;
}

static void enable_smi_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!gif_set(svm)) {
		if (vgif_enabled(svm))
			svm_set_intercept(svm, INTERCEPT_STGI);
		/* STGI will cause a vm exit */
	} else {
		/* We must be in SMM; RSM will cause a vmexit anyway.  */
	}
}

static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
{
	unsigned long cr4 = kvm_read_cr4(vcpu);
	bool smep = cr4 & X86_CR4_SMEP;
	bool smap = cr4 & X86_CR4_SMAP;
	bool is_user = svm_get_cpl(vcpu) == 3;

	/*
	 * If RIP is invalid, go ahead with emulation which will cause an
	 * internal error exit.
	 */
	if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
		return true;

	/*
	 * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
	 *
	 * Errata:
	 * When CPU raise #NPF on guest data access and vCPU CR4.SMAP=1, it is
	 * possible that CPU microcode implementing DecodeAssist will fail
	 * to read bytes of instruction which caused #NPF. In this case,
	 * GuestIntrBytes field of the VMCB on a VMEXIT will incorrectly
	 * return 0 instead of the correct guest instruction bytes.
	 *
	 * This happens because CPU microcode reading instruction bytes
	 * uses a special opcode which attempts to read data using CPL=0
	 * priviledges. The microcode reads CS:RIP and if it hits a SMAP
	 * fault, it gives up and returns no instruction bytes.
	 *
	 * Detection:
	 * We reach here in case CPU supports DecodeAssist, raised #NPF and
	 * returned 0 in GuestIntrBytes field of the VMCB.
	 * First, errata can only be triggered in case vCPU CR4.SMAP=1.
	 * Second, if vCPU CR4.SMEP=1, errata could only be triggered
	 * in case vCPU CPL==3 (Because otherwise guest would have triggered
	 * a SMEP fault instead of #NPF).
	 * Otherwise, vCPU CR4.SMEP=0, errata could be triggered by any vCPU CPL.
	 * As most guests enable SMAP if they have also enabled SMEP, use the
	 * above logic in order to attempt to minimize false positives when
	 * detecting the errata, while still preserving semantic correctness
	 * in all cases.
	 *
	 * Workaround:
	 * To determine what instruction the guest was executing, the hypervisor
	 * will have to decode the instruction at the instruction pointer.
	 *
	 * In a non-SEV guest, the hypervisor is able to read guest memory
	 * and decode the instruction when insn_len is zero, so we return
	 * true to indicate that decoding is possible.
	 *
	 * But in an SEV guest, the guest memory is encrypted with the
	 * guest-specific key and the hypervisor cannot decode the
	 * instruction, so the erratum cannot be worked around. Print an
	 * error and request that the guest be killed.
	 */
	if (smap && (!smep || is_user)) {
		if (!sev_guest(vcpu->kvm))
			return true;

		pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	}

	return false;
}

static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * TODO: The last condition latches INIT signals on the vCPU when the
	 * vCPU is in guest mode and vmcb12 defines an intercept on INIT.
	 * To properly emulate the INIT intercept,
	 * svm_check_nested_events() should call nested_svm_vmexit()
	 * if an INIT signal is pending.
	 */
	return !gif_set(svm) ||
		   (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
}

static void svm_vm_destroy(struct kvm *kvm)
{
	avic_vm_destroy(kvm);
	sev_vm_destroy(kvm);
}

static int svm_vm_init(struct kvm *kvm)
{
	if (avic) {
		int ret = avic_vm_init(kvm);
		if (ret)
			return ret;
	}

	kvm_apicv_init(kvm, avic);
	return 0;
}

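/*
 * Vendor callback table: this is what the generic KVM x86 code calls into
 * for all SVM-specific operations. It is handed to kvm_init() below through
 * svm_init_ops.runtime_ops.
 */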
static struct kvm_x86_ops svm_x86_ops __initdata = {
	.hardware_unsetup = svm_hardware_teardown,
	.hardware_enable = svm_hardware_enable,
	.hardware_disable = svm_hardware_disable,
	.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
	.has_emulated_msr = svm_has_emulated_msr,

	.vcpu_create = svm_create_vcpu,
	.vcpu_free = svm_free_vcpu,
	.vcpu_reset = svm_vcpu_reset,

	.vm_size = sizeof(struct kvm_svm),
	.vm_init = svm_vm_init,
	.vm_destroy = svm_vm_destroy,

	.prepare_guest_switch = svm_prepare_guest_switch,
	.vcpu_load = svm_vcpu_load,
	.vcpu_put = svm_vcpu_put,
	.vcpu_blocking = svm_vcpu_blocking,
	.vcpu_unblocking = svm_vcpu_unblocking,

	.update_exception_bitmap = update_exception_bitmap,
	.get_msr_feature = svm_get_msr_feature,
	.get_msr = svm_get_msr,
	.set_msr = svm_set_msr,
	.get_segment_base = svm_get_segment_base,
	.get_segment = svm_get_segment,
	.set_segment = svm_set_segment,
	.get_cpl = svm_get_cpl,
	.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
	.set_cr0 = svm_set_cr0,
	.set_cr4 = svm_set_cr4,
	.set_efer = svm_set_efer,
	.get_idt = svm_get_idt,
	.set_idt = svm_set_idt,
	.get_gdt = svm_get_gdt,
	.set_gdt = svm_set_gdt,
	.set_dr7 = svm_set_dr7,
	.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
	.cache_reg = svm_cache_reg,
	.get_rflags = svm_get_rflags,
	.set_rflags = svm_set_rflags,

	.tlb_flush_all = svm_flush_tlb,
	.tlb_flush_current = svm_flush_tlb,
	.tlb_flush_gva = svm_flush_tlb_gva,
	.tlb_flush_guest = svm_flush_tlb,

	.run = svm_vcpu_run,
	.handle_exit = handle_exit,
	.skip_emulated_instruction = skip_emulated_instruction,
	.update_emulated_instruction = NULL,
	.set_interrupt_shadow = svm_set_interrupt_shadow,
	.get_interrupt_shadow = svm_get_interrupt_shadow,
	.patch_hypercall = svm_patch_hypercall,
	.set_irq = svm_set_irq,
	.set_nmi = svm_inject_nmi,
	.queue_exception = svm_queue_exception,
	.cancel_injection = svm_cancel_injection,
	.interrupt_allowed = svm_interrupt_allowed,
	.nmi_allowed = svm_nmi_allowed,
	.get_nmi_mask = svm_get_nmi_mask,
	.set_nmi_mask = svm_set_nmi_mask,
	.enable_nmi_window = enable_nmi_window,
	.enable_irq_window = enable_irq_window,
	.update_cr8_intercept = update_cr8_intercept,
	.set_virtual_apic_mode = svm_set_virtual_apic_mode,
	.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
	.check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
	.pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl,
	.load_eoi_exitmap = svm_load_eoi_exitmap,
	.hwapic_irr_update = svm_hwapic_irr_update,
	.hwapic_isr_update = svm_hwapic_isr_update,
	.sync_pir_to_irr = kvm_lapic_find_highest_irr,
	.apicv_post_state_restore = avic_post_state_restore,

	.set_tss_addr = svm_set_tss_addr,
	.set_identity_map_addr = svm_set_identity_map_addr,
	.get_max_tdp_level = get_max_npt_level,
	.get_mt_mask = svm_get_mt_mask,

	.get_exit_info = svm_get_exit_info,

	.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,

	.has_wbinvd_exit = svm_has_wbinvd_exit,

	.write_l1_tsc_offset = svm_write_l1_tsc_offset,

	.load_mmu_pgd = svm_load_mmu_pgd,

	.check_intercept = svm_check_intercept,
	.handle_exit_irqoff = svm_handle_exit_irqoff,

	.request_immediate_exit = __kvm_request_immediate_exit,

	.sched_in = svm_sched_in,

	.pmu_ops = &amd_pmu_ops,
	.nested_ops = &svm_nested_ops,

	.deliver_posted_interrupt = svm_deliver_avic_intr,
	.dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
	.update_pi_irte = svm_update_pi_irte,
	.setup_mce = svm_setup_mce,

	.smi_allowed = svm_smi_allowed,
	.pre_enter_smm = svm_pre_enter_smm,
	.pre_leave_smm = svm_pre_leave_smm,
	.enable_smi_window = enable_smi_window,

	.mem_enc_op = svm_mem_enc_op,
	.mem_enc_reg_region = svm_register_enc_region,
	.mem_enc_unreg_region = svm_unregister_enc_region,

	.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,

	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
};

static struct kvm_x86_init_ops svm_init_ops __initdata = {
	.cpu_has_kvm_support = has_svm,
	.disabled_by_bios = is_disabled,
	.hardware_setup = svm_hardware_setup,
	.check_processor_compatibility = svm_check_processor_compat,

	.runtime_ops = &svm_x86_ops,
};

static int __init svm_init(void)
{
	return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
			__alignof__(struct vcpu_svm), THIS_MODULE);
}

static void __exit svm_exit(void)
{
	kvm_exit();
}

module_init(svm_init)
module_exit(svm_exit)