kvm: x86: Add support for getting/setting expanded xstate buffer

mainline inclusion from mainline-v5.17-rc1 commit be50b206 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5RQLJ CVE: NA Intel-SIG: commit be50b206 kvm: x86: Add support for getting/setting expanded xstate buffer. -------------------------------- With KVM_CAP_XSAVE, userspace uses a hardcoded 4KB buffer to get/set xstate data from/to KVM. This doesn't work when dynamic xfeatures (e.g. AMX) are exposed to the guest as they require a larger buffer size. Introduce a new capability (KVM_CAP_XSAVE2). Userspace VMM gets the required xstate buffer size via KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2). KVM_SET_XSAVE is extended to work with both legacy and new capabilities by doing properly-sized memdup_user() based on the guest fpu container. KVM_GET_XSAVE is kept for backward-compatible reason. Instead, KVM_GET_XSAVE2 is introduced under KVM_CAP_XSAVE2 as the preferred interface for getting xstate buffer (4KB or larger size) from KVM (Link: https://lkml.org/lkml/2021/12/15/510) Also, update the api doc with the new KVM_GET_XSAVE2 ioctl. Signed-off-by: N Guang Zeng <guang.zeng@intel.com> Signed-off-by: N Wei Wang <wei.w.wang@intel.com> Signed-off-by: N Jing Liu <jing2.liu@intel.com> Signed-off-by: N Kevin Tian <kevin.tian@intel.com> Signed-off-by: N Yang Zhong <yang.zhong@intel.com> Message-Id: <20220105123532.12586-19-yang.zhong@intel.com> Signed-off-by: N Paolo Bonzini <pbonzini@redhat.com> Signed-off-by: N Lin Wang <lin.x.wang@intel.com>

kvm: x86: Add support for getting/setting expanded xstate buffer
mainline inclusion from mainline-v5.17-rc1 commit be50b206 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5RQLJ CVE: NA Intel-SIG: commit be50b206 kvm: x86: Add support for getting/setting expanded xstate buffer. -------------------------------- With KVM_CAP_XSAVE, userspace uses a hardcoded 4KB buffer to get/set xstate data from/to KVM. This doesn't work when dynamic xfeatures (e.g. AMX) are exposed to the guest as they require a larger buffer size. Introduce a new capability (KVM_CAP_XSAVE2). Userspace VMM gets the required xstate buffer size via KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2). KVM_SET_XSAVE is extended to work with both legacy and new capabilities by doing properly-sized memdup_user() based on the guest fpu container. KVM_GET_XSAVE is kept for backward-compatible reason. Instead, KVM_GET_XSAVE2 is introduced under KVM_CAP_XSAVE2 as the preferred interface for getting xstate buffer (4KB or larger size) from KVM (Link: https://lkml.org/lkml/2021/12/15/510) Also, update the api doc with the new KVM_GET_XSAVE2 ioctl. Signed-off-by: N Guang Zeng <guang.zeng@intel.com> Signed-off-by: N Wei Wang <wei.w.wang@intel.com> Signed-off-by: N Jing Liu <jing2.liu@intel.com> Signed-off-by: N Kevin Tian <kevin.tian@intel.com> Signed-off-by: N Yang Zhong <yang.zhong@intel.com> Message-Id: <20220105123532.12586-19-yang.zhong@intel.com> Signed-off-by: N Paolo Bonzini <pbonzini@redhat.com> Signed-off-by: N Lin Wang <lin.x.wang@intel.com>
0d9e57e1 · Guang Zeng · Lin Wang · 8462c730 · 0d9e57e1 · 0d9e57e1
6 changed file
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -1514,6 +1514,7 @@ is vcpu 0.

  struct kvm_xsave {
 	__u32 region[1024];
+	__u32 extra[0];
  };

 This ioctl would copy current vcpu's xsave struct to the userspace.
@@ -1522,7 +1523,7 @@ This ioctl would copy current vcpu's xsave struct to the userspace.
 4.43 KVM_SET_XSAVE
 ------------------

-:Capability: KVM_CAP_XSAVE
+:Capability: KVM_CAP_XSAVE and KVM_CAP_XSAVE2
 :Architectures: x86
 :Type: vcpu ioctl
 :Parameters: struct kvm_xsave (in)
@@ -1533,9 +1534,18 @@ This ioctl would copy current vcpu's xsave struct to the userspace.

  struct kvm_xsave {
 	__u32 region[1024];
+	__u32 extra[0];
  };

-This ioctl would copy userspace's xsave struct to the kernel.
+This ioctl would copy userspace's xsave struct to the kernel. It copies
+as many bytes as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2),
+when invoked on the vm file descriptor. The size value returned by
+KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) will always be at least 4096.
+Currently, it is only greater than 4096 if a dynamic feature has been
+enabled with ``arch_prctl()``, but this may change in the future.
+
+The offsets of the state save areas in struct kvm_xsave follow the
+contents of CPUID leaf 0xD on the host.


 4.44 KVM_GET_XCRS
@@ -4983,6 +4993,33 @@ KVM does guarantee that vCPUs will see either the previous filter or the new
 filter, e.g. MSRs with identical settings in both the old and new filter will
 have deterministic behavior.

+4.134 KVM_GET_XSAVE2
+--------------------
+
+:Capability: KVM_CAP_XSAVE2
+:Architectures: x86
+:Type: vcpu ioctl
+:Parameters: struct kvm_xsave (out)
+:Returns: 0 on success, -1 on error
+
+
+::
+
+  struct kvm_xsave {
+        __u32 region[1024];
+        __u32 extra[0];
+  };
+
+This ioctl would copy current vcpu's xsave struct to the userspace. It
+copies as many bytes as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+when invoked on the vm file descriptor. The size value returned by
+KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) will always be at least 4096.
+Currently, it is only greater than 4096 if a dynamic feature has been
+enabled with ``arch_prctl()``, but this may change in the future.
+
+The offsets of the state save areas in struct kvm_xsave follow the contents
+of CPUID leaf 0xD on the host.
+

 5. The kvm_run structure
 ========================

--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -362,9 +362,23 @@ struct kvm_debugregs {
 	__u64 reserved[9];
 };

-/* for KVM_CAP_XSAVE */
+/* for KVM_CAP_XSAVE and KVM_CAP_XSAVE2 */
 struct kvm_xsave {
+	/*
+	 * KVM_GET_XSAVE2 and KVM_SET_XSAVE write and read as many bytes
+	 * as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+	 * respectively, when invoked on the vm file descriptor.
+	 *
+	 * The size value returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+	 * will always be at least 4096. Currently, it is only greater
+	 * than 4096 if a dynamic feature has been enabled with
+	 * ``arch_prctl()``, but this may change in the future.
+	 *
+	 * The offsets of the state save areas in struct kvm_xsave follow
+	 * the contents of CPUID leaf 0xD on the host.
+	 */
 	__u32 region[1024];
+	__u32 extra[0];
 };

 #define KVM_MAX_XCRS	16

--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -32,7 +32,7 @@
 u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_cpu_caps);

-static u32 xstate_required_size(u64 xstate_bv, bool compacted)
+u32 xstate_required_size(u64 xstate_bv, bool compacted)
 {
 	int feature_bit = 0;
 	u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;

--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -47,6 +47,8 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
 	       u32 *ecx, u32 *edx, bool exact_only);

+u32 xstate_required_size(u64 xstate_bv, bool compacted);
+
 int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);

 static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)

--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3991,6 +3991,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		else
 			r = 0;
 		break;
+	case KVM_CAP_XSAVE2: {
+		u64 guest_perm = xstate_get_guest_group_perm();
+
+		r = xstate_required_size(supported_xcr0 & guest_perm, false);
+		if (r < sizeof(struct kvm_xsave))
+			r = sizeof(struct kvm_xsave);
+		break;
+	}
 	case KVM_CAP_X86_NOTIFY_VMEXIT:
 		r = kvm_has_notify_vmexit;
 		break;
@@ -4634,6 +4642,16 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
 				       vcpu->arch.pkru);
 }

+static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
+					  u8 *state, unsigned int size)
+{
+	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
+		return;
+
+	fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
+				       state, size, vcpu->arch.pkru);
+}
+
 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
 					struct kvm_xsave *guest_xsave)
 {
@@ -4953,6 +4971,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_GET_XSAVE: {
+		r = -EINVAL;
+		if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
+			break;
+
 		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
 		r = -ENOMEM;
 		if (!u.xsave)
@@ -4967,7 +4989,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_SET_XSAVE: {
-		u.xsave = memdup_user(argp, sizeof(*u.xsave));
+		int size = vcpu->arch.guest_fpu.uabi_size;
+
+		u.xsave = memdup_user(argp, size);
 		if (IS_ERR(u.xsave)) {
 			r = PTR_ERR(u.xsave);
 			goto out_nofree;
@@ -4976,6 +5000,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
 		break;
 	}
+
+	case KVM_GET_XSAVE2: {
+		int size = vcpu->arch.guest_fpu.uabi_size;
+
+		u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
+		r = -ENOMEM;
+		if (!u.xsave)
+			break;
+
+		kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+
+		r = -EFAULT;
+		if (copy_to_user(argp, u.xsave, size))
+			break;
+
+		r = 0;
+		break;
+	}
+
 	case KVM_GET_XCRS: {
 		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
 		r = -ENOMEM;

--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1070,6 +1070,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
 #define KVM_CAP_X86_BUS_LOCK_EXIT 193
 #define KVM_CAP_SGX_ATTRIBUTE 196
+#define KVM_CAP_XSAVE2 208
 #define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218
 #define KVM_CAP_X86_NOTIFY_VMEXIT 219

@@ -1558,6 +1559,9 @@ struct kvm_enc_region {
 #define KVM_S390_NORMAL_RESET	_IO(KVMIO,   0xc3)
 #define KVM_S390_CLEAR_RESET	_IO(KVMIO,   0xc4)

+/* Available with KVM_CAP_XSAVE2 */
+#define KVM_GET_XSAVE2		  _IOR(KVMIO,  0xcf, struct kvm_xsave)
+
 struct kvm_s390_pv_sec_parm {
 	__u64 origin;
 	__u64 length;