xstate.c 26.4 KB
Newer Older
1 2 3 4 5 6
/*
 * xsave/xrstor support.
 *
 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
 */
#include <linux/compat.h>
F
Fenghua Yu 已提交
7
#include <linux/cpu.h>
8
#include <linux/pkeys.h>
9

10
#include <asm/fpu/api.h>
11
#include <asm/fpu/internal.h>
12
#include <asm/fpu/signal.h>
13
#include <asm/fpu/regset.h>
I
Ingo Molnar 已提交
14

A
Andy Lutomirski 已提交
15
#include <asm/tlbflush.h>
16

17 18 19 20 21
/*
 * Although we spell it out in here, the Processor Trace
 * xfeature is completely unused.  We use other mechanisms
 * to save/restore PT state in Linux.
 */
22 23 24 25 26 27 28 29 30 31
/*
 * Human-readable names for each xstate component, indexed by the
 * XFEATURE_* number.  Used only for boot-time messages.  The final
 * "unknown" entry is the fallback for out-of-range indices (see the
 * clamping in cpu_has_xfeatures()).
 */
static const char *xfeature_names[] =
{
	"x87 floating point registers"	,
	"SSE registers"			,
	"AVX registers"			,
	"MPX bounds registers"		,
	"MPX CSR"			,
	"AVX-512 opmask"		,
	"AVX-512 Hi256"			,
	"AVX-512 ZMM_Hi256"		,
	"Processor Trace (unused)"	,
	"Protection Keys User registers",
	"unknown xstate feature"	,
};

37
/*
 * Mask of xstate features supported by the CPU and the kernel:
 * set once at boot in fpu__init_system_xstate() from CPUID leaf 0xD
 * and then narrowed by fpu__get_supported_xfeatures_mask().
 */
u64 xfeatures_mask __read_mostly;
41

D
Dave Hansen 已提交
42 43
/*
 * Per-xfeature offsets/sizes in the standard (non-compacted) layout,
 * as enumerated by CPUID; -1 means "not enumerated" (e.g. supervisor
 * states have no valid standard-format offset).
 */
static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX]   = { [ 0 ... XFEATURE_MAX - 1] = -1};
/* Offsets in the compacted layout, computed in setup_xstate_comp(): */
static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
45

46 47 48 49 50 51 52
/*
 * The XSAVE area of kernel can be in standard or compacted format;
 * it is always in standard format for user mode. This is the user
 * mode standard format size used for signal and ptrace frames.
 */
unsigned int fpu_user_xstate_size;

53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
/*
 * Clear all of the X86_FEATURE_* bits that are unavailable
 * when the CPU has no XSAVE support.
 */
void fpu__xstate_clear_all_cpu_caps(void)
{
	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
	setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
	setup_clear_cpu_cap(X86_FEATURE_AVX);
	setup_clear_cpu_cap(X86_FEATURE_AVX2);
	setup_clear_cpu_cap(X86_FEATURE_AVX512F);
	setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
	setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
	setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
69 70 71
	setup_clear_cpu_cap(X86_FEATURE_AVX512DQ);
	setup_clear_cpu_cap(X86_FEATURE_AVX512BW);
	setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
72
	setup_clear_cpu_cap(X86_FEATURE_MPX);
73
	setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
74
	setup_clear_cpu_cap(X86_FEATURE_PKU);
75 76
}

77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
/*
 * Return whether the system supports a given xfeature.
 *
 * Also return the name of the (most advanced) feature that the caller requested:
 */
int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
{
	u64 missing = xfeatures_needed & ~xfeatures_mask;

	if (unlikely(feature_name)) {
		long idx, max_idx;
		/*
		 * Name the highest (most advanced) requested bit.  If
		 * anything is missing, prefer the highest *missing* bit so
		 * the message identifies the feature that is actually
		 * absent - the most informative choice for the user.
		 */
		u64 report_mask = missing ? missing : xfeatures_needed;

		idx = fls64(report_mask) - 1;
		max_idx = ARRAY_SIZE(xfeature_names) - 1;
		/* Clamp to the "unknown xstate feature" fallback entry: */
		if (idx > max_idx)
			idx = max_idx;

		*feature_name = xfeature_names[idx];
	}

	return missing ? 0 : 1;
}
EXPORT_SYMBOL_GPL(cpu_has_xfeatures);

115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
static int xfeature_is_supervisor(int xfeature_nr)
{
	/*
	 * We currently do not support supervisor states, but if
	 * we did, we could find out like this.
	 *
	 * SDM says: If state component 'i' is a user state component,
	 * ECX[0] return 0; if state component i is a supervisor
	 * state component, ECX[0] returns 1.
	 */
	u32 eax, ebx, ecx, edx;

	cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);

	return (ecx & 1) != 0;
}

/* A component is a user state iff it is not a supervisor state: */
static int xfeature_is_user(int xfeature_nr)
{
	return xfeature_is_supervisor(xfeature_nr) == 0;
}

136
/*
 * When executing XSAVEOPT (or other optimized XSAVE instructions), if
 * a processor implementation detects that an FPU state component is still
 * (or is again) in its initialized state, it may clear the corresponding
 * bit in the header.xfeatures field, and can skip the writeout of registers
 * to the corresponding memory layout.
 *
 * This means that when the bit is zero, the state component might still contain
 * some previous - non-initialized register state.
 *
 * Before writing xstate information to user-space we sanitize those components,
 * to always ensure that the memory layout of a feature will be in the init state
 * if the corresponding header bit is zero. This is to ensure that user-space doesn't
 * see some stale state in the memory layout during signal handling, debugging etc.
 */
void fpstate_sanitize_xstate(struct fpu *fpu)
{
	struct fxregs_state *fx = &fpu->state.fxsave;
	int feature_bit;
	u64 xfeatures;

	/* Plain XSAVE/XSAVES never skip writeouts, nothing to sanitize: */
	if (!use_xsaveopt())
		return;

	xfeatures = fpu->state.xsave.header.xfeatures;

	/*
	 * None of the feature bits are in init state. So nothing else
	 * to do for us, as the memory layout is up to date.
	 */
	if ((xfeatures & xfeatures_mask) == xfeatures_mask)
		return;

	/*
	 * FP is in init state
	 */
	if (!(xfeatures & XFEATURE_MASK_FP)) {
		/* 0x37f is the x87 power-on default control word: */
		fx->cwd = 0x37f;
		fx->swd = 0;
		fx->twd = 0;
		fx->fop = 0;
		fx->rip = 0;
		fx->rdp = 0;
		memset(&fx->st_space[0], 0, 128);
	}

	/*
	 * SSE is in init state
	 */
	if (!(xfeatures & XFEATURE_MASK_SSE))
		memset(&fx->xmm_space[0], 0, 256);

	/*
	 * First two features are FPU and SSE, which above we handled
	 * in a special way already:
	 */
	feature_bit = 0x2;
	/* Bitmap of enabled features whose header bit says "init state": */
	xfeatures = (xfeatures_mask & ~xfeatures) >> 2;

	/*
	 * Update all the remaining memory layouts according to their
	 * standard xstate layout, if their header bit is in the init
	 * state:
	 */
	while (xfeatures) {
		if (xfeatures & 0x1) {
			int offset = xstate_comp_offsets[feature_bit];
			int size = xstate_sizes[feature_bit];

			/* Copy the canonical init image over the stale data: */
			memcpy((void *)fx + offset,
			       (void *)&init_fpstate.xsave + offset,
			       size);
		}

		xfeatures >>= 1;
		feature_bit++;
	}
}

215
/*
 * Enable the extended processor state save/restore feature.
 * Called once per CPU onlining.
 */
void fpu__init_cpu_xstate(void)
{
	if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask)
		return;

	/*
	 * CR4.OSXSAVE must be set before XSETBV is legal, so the
	 * order of these two operations matters:
	 */
	cr4_set_bits(X86_CR4_OSXSAVE);
	xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
}

228 229 230 231 232 233 234 235 236 237
/*
 * Note that in the future we will likely need a pair of
 * functions here: one for user xstates and the other for
 * system xstates.  For now, they are the same.
 */
static int xfeature_enabled(enum xfeature xfeature)
{
	return !!(xfeatures_mask & (1UL << xfeature));
}

238
/*
 * Record the offsets and sizes of various xstates contained
 * in the XSAVE state memory layout.
 */
static void __init setup_xstate_features(void)
{
	u32 eax, ebx, ecx, edx, i;
	/* start at the beginning of the "extended state" */
	unsigned int last_good_offset = offsetof(struct xregs_state,
						 extended_state_area);

	for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
		if (!xfeature_enabled(i))
			continue;

		/* CPUID leaf 0xD, subleaf i: EAX=size, EBX=offset: */
		cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);

		/*
		 * If an xfeature is supervisor state, the offset
		 * in EBX is invalid. We leave it to -1.
		 */
		if (xfeature_is_user(i))
			xstate_offsets[i] = ebx;

		xstate_sizes[i] = eax;
		/*
		 * In our xstate size checks, we assume that the
		 * highest-numbered xstate feature has the
		 * highest offset in the buffer.  Ensure it does.
		 */
		WARN_ONCE(last_good_offset > xstate_offsets[i],
			"x86/fpu: misordered xstate at %d\n", last_good_offset);
		last_good_offset = xstate_offsets[i];

		printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, ebx, i, eax);
	}
}

276
/* Print one boot-time line for 'xstate_mask' if the system supports it. */
static void __init print_xstate_feature(u64 xstate_mask)
{
	const char *feature_name;

	if (!cpu_has_xfeatures(xstate_mask, &feature_name))
		return;

	pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
}

/*
 * Print out all the supported xstate features:
 */
287
static void __init print_xstate_features(void)
288
{
D
Dave Hansen 已提交
289 290 291 292 293 294 295 296
	print_xstate_feature(XFEATURE_MASK_FP);
	print_xstate_feature(XFEATURE_MASK_SSE);
	print_xstate_feature(XFEATURE_MASK_YMM);
	print_xstate_feature(XFEATURE_MASK_BNDREGS);
	print_xstate_feature(XFEATURE_MASK_BNDCSR);
	print_xstate_feature(XFEATURE_MASK_OPMASK);
	print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
	print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
297
	print_xstate_feature(XFEATURE_MASK_PKRU);
298 299
}

300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326
/*
 * Sanity-check that 'nr' is an extended xfeature *number* (index),
 * not a mask.  This check is important because it is easy to get
 * XSTATE_* confused with XSTATE_BIT_*.
 */
#define CHECK_XFEATURE(nr) do {		\
	WARN_ON(nr < FIRST_EXTENDED_XFEATURE);	\
	WARN_ON(nr >= XFEATURE_MAX);	\
} while (0)

/*
 * We could cache this like xstate_size[], but we only use
 * it here, so it would be a waste of space.
 */
static int xfeature_is_aligned(int xfeature_nr)
{
	u32 eax, ebx, ecx, edx;

	CHECK_XFEATURE(xfeature_nr);
	cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);

	/*
	 * ECX[1] indicates whether state component 'xfeature_nr' must be
	 * 64-byte aligned when the compacted format of the extended
	 * region of an XSAVE area is used:
	 */
	return (ecx >> 1) & 1;
}

327 328 329 330 331
/*
 * This function sets up offsets and sizes of all extended states in
 * xsave area. This supports both standard format and compacted format
 * of the xsave area.
 */
static void __init setup_xstate_comp(void)
{
	unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8];
	int i;

	/*
	 * The FP xstates and SSE xstates are legacy states. They are always
	 * in the fixed offsets in the xsave area in either compacted form
	 * or standard form.
	 */
	xstate_comp_offsets[0] = 0;
	xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space);

	/*
	 * Without XSAVES the buffer uses the standard format, so the
	 * "compacted" offsets are just the CPUID-enumerated ones:
	 */
	if (!boot_cpu_has(X86_FEATURE_XSAVES)) {
		for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
			if (xfeature_enabled(i)) {
				xstate_comp_offsets[i] = xstate_offsets[i];
				xstate_comp_sizes[i] = xstate_sizes[i];
			}
		}
		return;
	}

	/* Compacted format: extended area starts right after the header: */
	xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] =
		FXSAVE_SIZE + XSAVE_HDR_SIZE;

	/*
	 * Each enabled component is packed directly after the previous
	 * one (disabled components occupy zero bytes), with optional
	 * 64-byte alignment as enumerated by CPUID:
	 */
	for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
		if (xfeature_enabled(i))
			xstate_comp_sizes[i] = xstate_sizes[i];
		else
			xstate_comp_sizes[i] = 0;

		if (i > FIRST_EXTENDED_XFEATURE) {
			xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
					+ xstate_comp_sizes[i-1];

			if (xfeature_is_aligned(i))
				xstate_comp_offsets[i] =
					ALIGN(xstate_comp_offsets[i], 64);
		}
	}
}

375 376 377
/*
 * setup the xstate image representing the init state
 */
static void __init setup_init_fpu_buf(void)
{
	/* Guard against being called more than once during boot: */
	static int on_boot_cpu __initdata = 1;

	WARN_ON_FPU(!on_boot_cpu);
	on_boot_cpu = 0;

	if (!boot_cpu_has(X86_FEATURE_XSAVE))
		return;

	setup_xstate_features();
	print_xstate_features();

	/* Compacted format needs a valid xcomp_bv (bit 63 + feature mask): */
	if (boot_cpu_has(X86_FEATURE_XSAVES))
		init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask;

	/*
	 * Init all the features state with header.xfeatures being 0x0
	 */
	copy_kernel_to_xregs_booting(&init_fpstate.xsave);

	/*
	 * Dump the init state again. This is to identify the init state
	 * of any feature which is not represented by all zero's.
	 */
	copy_xregs_to_kernel_booting(&init_fpstate.xsave);
}

406 407 408 409
static int xfeature_uncompacted_offset(int xfeature_nr)
{
	u32 eax, ebx, ecx, edx;

410 411 412 413 414 415 416 417 418 419
	/*
	 * Only XSAVES supports supervisor states and it uses compacted
	 * format. Checking a supervisor state's uncompacted offset is
	 * an error.
	 */
	if (XFEATURE_MASK_SUPERVISOR & (1 << xfeature_nr)) {
		WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr);
		return -1;
	}

420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442
	CHECK_XFEATURE(xfeature_nr);
	cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
	return ebx;
}

/* Return the CPUID-enumerated size (in bytes) of xstate component 'xfeature_nr'. */
static int xfeature_size(int xfeature_nr)
{
	u32 size, unused_b, unused_c, unused_d;

	CHECK_XFEATURE(xfeature_nr);
	cpuid_count(XSTATE_CPUID, xfeature_nr,
		    &size, &unused_b, &unused_c, &unused_d);
	return size;
}

/*
 * 'XSAVES' implies two different things:
 * 1. saving of supervisor/system state
 * 2. using the compacted format
 *
 * Use this function when dealing with the compacted format so
 * that it is obvious which aspect of 'XSAVES' is being handled
 * by the calling code.
 */
443
int using_compacted_format(void)
444
{
445
	return boot_cpu_has(X86_FEATURE_XSAVES);
446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473
}

/* Dump the raw CPUID xstate leaves once, for consistency-problem reports. */
static void __xstate_dump_leaves(void)
{
	static int should_dump = 1;
	u32 eax, ebx, ecx, edx;
	int leaf;

	/* Only the first caller gets a dump; later ones are silent: */
	if (!should_dump)
		return;
	should_dump = 0;

	/*
	 * Dump out a few leaves past the ones that we support
	 * just in case there are some goodies up there
	 */
	for (leaf = 0; leaf < XFEATURE_MAX + 10; leaf++) {
		cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
		pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
			XSTATE_CPUID, leaf, eax, ebx, ecx, edx);
	}
}

/*
 * Warn once when an XSAVE consistency check fails, and dump the raw
 * CPUID xstate leaves so the report contains the enumeration data.
 */
#define XSTATE_WARN_ON(x) do {							\
	if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) {	\
		__xstate_dump_leaves();						\
	}									\
} while (0)

474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503
/*
 * Compare the CPU-enumerated size 'sz' of component 'nr' against the
 * sizeof() of our C struct for it; fires only when 'nr' == 'nr_macro',
 * and dumps the CPUID leaves on a mismatch.
 */
#define XCHECK_SZ(sz, nr, nr_macro, __struct) do {			\
	if ((nr == nr_macro) &&						\
	    WARN_ONCE(sz != sizeof(__struct),				\
		"%s: struct is %zu bytes, cpu state %d bytes\n",	\
		__stringify(nr_macro), sizeof(__struct), sz)) {		\
		__xstate_dump_leaves();					\
	}								\
} while (0)

/*
 * We have a C struct for each 'xstate'.  We need to ensure
 * that our software representation matches what the CPU
 * tells us about the state's size.
 */
static void check_xstate_against_struct(int nr)
{
	/*
	 * Ask the CPU for the size of the state.
	 */
	int sz = xfeature_size(nr);
	/*
	 * Match each CPU state with the corresponding software
	 * structure.  Only the XCHECK_SZ() whose nr_macro equals
	 * 'nr' actually performs a comparison:
	 */
	XCHECK_SZ(sz, nr, XFEATURE_YMM,       struct ymmh_struct);
	XCHECK_SZ(sz, nr, XFEATURE_BNDREGS,   struct mpx_bndreg_state);
	XCHECK_SZ(sz, nr, XFEATURE_BNDCSR,    struct mpx_bndcsr_state);
	XCHECK_SZ(sz, nr, XFEATURE_OPMASK,    struct avx_512_opmask_state);
	XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
	XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM,  struct avx_512_hi16_state);
	XCHECK_SZ(sz, nr, XFEATURE_PKRU,      struct pkru_state);

	/*
	 * Make *SURE* to add any feature numbers in below if
	 * there are "holes" in the xsave state component
	 * numbers.
	 */
	if ((nr < XFEATURE_YMM) ||
	    (nr >= XFEATURE_MAX) ||
	    (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) {
		WARN_ONCE(1, "no structure for xstate: %d\n", nr);
		XSTATE_WARN_ON(1);
	}
}

519 520 521 522 523 524 525 526 527 528 529 530 531
/*
 * This essentially double-checks what the cpu told us about
 * how large the XSAVE buffer needs to be.  We are recalculating
 * it to be safe.
 */
static void do_extra_xstate_size_checks(void)
{
	/* Start after the legacy area plus the xstate header: */
	int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
	int i;

	for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
		if (!xfeature_enabled(i))
			continue;

		check_xstate_against_struct(i);
		/*
		 * Supervisor state components can be managed only by
		 * XSAVES, which is compacted-format only.
		 */
		if (!using_compacted_format())
			XSTATE_WARN_ON(xfeature_is_supervisor(i));

		/* Align from the end of the previous feature */
		if (xfeature_is_aligned(i))
			paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64);
		/*
		 * The offset of a given state in the non-compacted
		 * format is given to us in a CPUID leaf.  We check
		 * them for being ordered (increasing offsets) in
		 * setup_xstate_features().
		 *
		 * Note: this *replaces* (not adds to) the running size,
		 * re-basing it on the enumerated offset of component i.
		 */
		if (!using_compacted_format())
			paranoid_xstate_size = xfeature_uncompacted_offset(i);
		/*
		 * The compacted-format offset always depends on where
		 * the previous state ended.
		 */
		paranoid_xstate_size += xfeature_size(i);
	}
	XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size);
}

561

F
Fenghua Yu 已提交
562
/*
563
 * Get total size of enabled xstates in XCR0/xfeatures_mask.
564 565 566 567 568 569 570 571
 *
 * Note the SDM's wording here.  "sub-function 0" only enumerates
 * the size of the *user* states.  If we use it to size a buffer
 * that we use 'XSAVES' on, we could potentially overflow the
 * buffer because 'XSAVES' saves system states too.
 *
 * Note that we do not currently set any bits on IA32_XSS so
 * 'XCR0 | IA32_XSS == XCR0' for now.
F
Fenghua Yu 已提交
572
 */
573
static unsigned int __init get_xsaves_size(void)
F
Fenghua Yu 已提交
574 575
{
	unsigned int eax, ebx, ecx, edx;
576 577 578 579 580 581 582 583 584 585 586
	/*
	 * - CPUID function 0DH, sub-function 1:
	 *    EBX enumerates the size (in bytes) required by
	 *    the XSAVES instruction for an XSAVE area
	 *    containing all the state components
	 *    corresponding to bits currently set in
	 *    XCR0 | IA32_XSS.
	 */
	cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
	return ebx;
}
F
Fenghua Yu 已提交
587

588 589 590 591 592 593 594 595 596 597 598 599
/* Size of the standard-format user XSAVE area for the current XCR0. */
static unsigned int __init get_xsave_size(void)
{
	u32 eax, size, ecx, edx;

	/*
	 * - CPUID function 0DH, sub-function 0:
	 *    EBX enumerates the size (in bytes) required by
	 *    the XSAVE instruction for an XSAVE area
	 *    containing all the *user* state components
	 *    corresponding to bits currently set in XCR0.
	 */
	cpuid_count(XSTATE_CPUID, 0, &eax, &size, &ecx, &edx);

	return size;
}

/*
 * Will the runtime-enumerated 'xstate_size' fit in the init
 * task's statically-allocated buffer?
 */
static bool is_supported_xstate_size(unsigned int test_xstate_size)
{
	bool fits = test_xstate_size <= sizeof(union fpregs_state);

	if (!fits)
		pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n",
				sizeof(union fpregs_state), test_xstate_size);

	return fits;
}

/*
 * Compute and record the kernel and user XSAVE buffer sizes for the
 * currently-enabled features.  Returns 0 on success, -EINVAL if the
 * required size does not fit the statically-allocated buffer.
 */
static int init_xstate_size(void)
{
	/* Recompute the context size for enabled features: */
	unsigned int possible_xstate_size;
	unsigned int xsave_size;

	xsave_size = get_xsave_size();

	/* XSAVES may need extra room for supervisor states: */
	if (boot_cpu_has(X86_FEATURE_XSAVES))
		possible_xstate_size = get_xsaves_size();
	else
		possible_xstate_size = xsave_size;

	/* Ensure we have the space to store all enabled: */
	if (!is_supported_xstate_size(possible_xstate_size))
		return -EINVAL;

	/*
	 * The size is OK, we are definitely going to use xsave,
	 * make it known to the world that we need more space.
	 */
	fpu_kernel_xstate_size = possible_xstate_size;
	do_extra_xstate_size_checks();

	/*
	 * User space is always in standard format.
	 */
	fpu_user_xstate_size = xsave_size;
	return 0;
}

D
Dave Hansen 已提交
647 648 649 650 651
/*
 * We enabled the XSAVE hardware, but something went wrong and
 * we can not use it.  Disable it.
 */
static void fpu__init_disable_system_xstate(void)
{
	/* No features: subsequent xstate paths all become no-ops: */
	xfeatures_mask = 0;
	/* Undo what fpu__init_cpu_xstate() enabled: */
	cr4_clear_bits(X86_CR4_OSXSAVE);
	fpu__xstate_clear_all_cpu_caps();
}

658 659
/*
 * Enable and initialize the xsave feature.
 * Called once per system bootup.
 */
void __init fpu__init_system_xstate(void)
{
	unsigned int eax, ebx, ecx, edx;
	/* Guard against being run more than once: */
	static int on_boot_cpu __initdata = 1;
	int err;

	WARN_ON_FPU(!on_boot_cpu);
	on_boot_cpu = 0;

	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
		pr_info("x86/fpu: Legacy x87 FPU detected.\n");
		return;
	}

	/* XSAVE advertised but CPUID leaf 0xD missing - inconsistent CPU: */
	if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
		WARN_ON_FPU(1);
		return;
	}

	/* EDX:EAX of leaf 0xD/0 is the mask of supported user xfeatures: */
	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
	xfeatures_mask = eax + ((u64)edx << 32);

	if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
		pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask);
		BUG();
	}

	/* Drop features the kernel does not (yet) support: */
	xfeatures_mask &= fpu__get_supported_xfeatures_mask();

	/* Enable xstate instructions to be able to continue with initialization: */
	fpu__init_cpu_xstate();
	err = init_xstate_size();
	if (err) {
		/* something went wrong, boot without any XSAVE support */
		fpu__init_disable_system_xstate();
		return;
	}

	update_regset_xstate_info(fpu_kernel_xstate_size, xfeatures_mask);
	fpu__init_prepare_fx_sw_frame();
	setup_init_fpu_buf();
	setup_xstate_comp();

	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
		xfeatures_mask,
		fpu_kernel_xstate_size,
		boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
}
710

711 712 713 714 715 716 717 718
/*
 * Restore minimal FPU state after suspend:
 */
void fpu__resume_cpu(void)
{
	/*
	 * Restore XCR0 on xsave capable CPUs:
	 */
719
	if (boot_cpu_has(X86_FEATURE_XSAVE))
720 721 722
		xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
}

723 724 725 726 727 728 729 730 731 732 733 734 735
/*
 * Given an xstate feature mask, calculate where in the xsave
 * buffer the state is.  Callers should ensure that the buffer
 * is valid.
 *
 * Note: does not work for compacted buffers.
 */
void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask)
{
	/* Convert the single-bit mask to a feature number: */
	int nr = fls64(xstate_feature_mask) - 1;
	unsigned int offset = xstate_comp_offsets[nr];

	return (char *)xsave + offset;
}
736 737 738 739 740 741 742
/*
 * Given the xsave area and a state inside, this function returns the
 * address of the state.
 *
 * This is the API that is called to get xstate address in either
 * standard format or compacted format of xsave area.
 *
 * Note that if there is no data for the field in the xsave buffer
 * this will return NULL.
 *
 * Inputs:
 *	xstate: the thread's storage area for all FPU data
 *	xstate_feature: state which is defined in xsave.h (e.g.
 *	XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...)
 * Output:
 *	address of the state in the xsave area, or NULL if the
 *	field is not present in the xsave buffer.
 */
void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
{
	/*
	 * Do we even *have* xsave state?
	 */
	if (!boot_cpu_has(X86_FEATURE_XSAVE))
		return NULL;

	/*
	 * We should not ever be requesting features that we
	 * have not enabled.  Remember that pcntxt_mask is
	 * what we write to the XCR0 register.
	 */
	WARN_ONCE(!(xfeatures_mask & xstate_feature),
		  "get of unsupported state");
	/*
	 * This assumes the last 'xsave*' instruction to
	 * have requested that 'xstate_feature' be saved.
	 * If it did not, we might be seeing and old value
	 * of the field in the buffer.
	 *
	 * This can happen because the last 'xsave' did not
	 * request that this feature be saved (unlikely)
	 * or because the "init optimization" caused it
	 * to not be saved.
	 */
	if (!(xsave->header.xfeatures & xstate_feature))
		return NULL;

	return __raw_xsave_addr(xsave, xstate_feature);
}
EXPORT_SYMBOL_GPL(get_xsave_addr);
786 787 788 789 790 791 792 793 794 795 796 797

/*
 * This wraps up the common operations that need to occur when retrieving
 * data from xsave state.  It first ensures that the current task was
 * using the FPU and retrieves the data in to a buffer.  It then calculates
 * the offset of the requested field in the buffer.
 *
 * This function is safe to call whether the FPU is in use or not.
 *
 * Note that this only works on the current task.
 *
 * Inputs:
 *	@xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP,
 *	XFEATURE_MASK_SSE, etc...)
 * Output:
 *	address of the state in the xsave area or NULL if the state
 *	is not present or is in its 'init state'.
 */
const void *get_xsave_field_ptr(int xsave_state)
{
	struct fpu *fpu = &current->thread.fpu;

	/* No FPU state for this task means nothing to look up: */
	if (!fpu->fpstate_active)
		return NULL;
	/*
	 * fpu__save() takes the CPU's xstate registers
	 * and saves them off to the 'fpu memory buffer.
	 */
	fpu__save(fpu);

	return get_xsave_addr(&fpu->state.xsave, xsave_state);
}
818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899


/*
 * Set xfeatures (aka XSTATE_BV) bit for a feature that we want
 * to take out of its "init state".  This will ensure that an
 * XRSTOR actually restores the state.
 */
static void fpu__xfeature_set_non_init(struct xregs_state *xsave,
		int xstate_feature_mask)
{
	xsave->header.xfeatures |= xstate_feature_mask;
}

/*
 * This function is safe to call whether the FPU is in use or not.
 *
 * Note that this only works on the current task.
 *
 * Inputs:
 *	@xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP,
 *	XFEATURE_MASK_SSE, etc...)
 *	@xsave_state_ptr: a pointer to a copy of the state that you would
 *	like written in to the current task's FPU xsave state.  This pointer
 *	must not be located in the current tasks's xsave area.
 * Output:
 *	address of the state in the xsave area or NULL if the state
 *	is not present or is in its 'init state'.
 */
static void fpu__xfeature_set_state(int xstate_feature_mask,
		void *xstate_feature_src, size_t len)
{
	struct xregs_state *xsave = &current->thread.fpu.state.xsave;
	struct fpu *fpu = &current->thread.fpu;
	void *dst;

	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
		WARN_ONCE(1, "%s() attempted with no xsave support", __func__);
		return;
	}

	/*
	 * Tell the FPU code that we need the FPU state to be in
	 * 'fpu' (not in the registers), and that we need it to
	 * be stable while we write to it.
	 */
	fpu__current_fpstate_write_begin();

	/*
	 * This method *WILL* *NOT* work for compact-format
	 * buffers.  If the 'xstate_feature_mask' is unset in
	 * xcomp_bv then we may need to move other feature state
	 * "up" in the buffer.
	 */
	if (xsave->header.xcomp_bv & xstate_feature_mask) {
		WARN_ON_ONCE(1);
		goto out;
	}

	/* find the location in the xsave buffer of the desired state */
	dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask);

	/*
	 * Make sure that the pointer being passed in did not
	 * come from the xsave buffer itself.
	 */
	WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself");

	/* put the caller-provided data in the location */
	memcpy(dst, xstate_feature_src, len);

	/*
	 * Mark the xfeature so that the CPU knows there is state
	 * in the buffer now.
	 */
	fpu__xfeature_set_non_init(xsave, xstate_feature_mask);
out:
	/*
	 * We are done writing to the 'fpu'.  Reenable preeption
	 * and (possibly) move the fpstate back in to the fpregs.
	 */
	fpu__current_fpstate_write_end();
}
900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970

#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)

/*
 * This will go out and modify the XSAVE buffer so that PKRU is
 * set to a particular state for access to 'pkey'.
 *
 * PKRU state does affect kernel access to user memory.  We do
 * not modfiy PKRU *itself* here, only the XSAVE state that will
 * be restored in to PKRU when we return back to userspace.
 */
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
		unsigned long init_val)
{
	struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
	struct pkru_state *old_pkru_state;
	struct pkru_state new_pkru_state;
	int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
	u32 new_pkru_bits = 0;

	/*
	 * This check implies XSAVE support.  OSPKE only gets
	 * set if we enable XSAVE and we enable PKU in XCR0.
	 */
	if (!boot_cpu_has(X86_FEATURE_OSPKE))
		return -EINVAL;

	/* Set the bits we need in PKRU  */
	if (init_val & PKEY_DISABLE_ACCESS)
		new_pkru_bits |= PKRU_AD_BIT;
	if (init_val & PKEY_DISABLE_WRITE)
		new_pkru_bits |= PKRU_WD_BIT;

	/* Shift the bits in to the correct place in PKRU for pkey. */
	new_pkru_bits <<= pkey_shift;

	/* Locate old copy of the state in the xsave buffer */
	old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU);

	/*
	 * When state is not in the buffer, it is in the init
	 * state, set it manually.  Otherwise, copy out the old
	 * state.
	 */
	if (!old_pkru_state)
		new_pkru_state.pkru = 0;
	else
		new_pkru_state.pkru = old_pkru_state->pkru;

	/* mask off any old bits in place */
	new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
	/* Set the newly-requested bits */
	new_pkru_state.pkru |= new_pkru_bits;

	/*
	 * We could theoretically live without zeroing pkru.pad.
	 * The current XSAVE feature state definition says that
	 * only bytes 0->3 are used.  But we do not want to
	 * chance leaking kernel stack out to userspace in case a
	 * memcpy() of the whole xsave buffer was done.
	 *
	 * They're in the same cacheline anyway.
	 */
	new_pkru_state.pad = 0;

	fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state,
			sizeof(new_pkru_state));

	return 0;
}