diff mbox series

[v3,19/22] kvm: x86: Get/set expanded xstate buffer

Message ID 20211222124052.644626-20-jing2.liu@intel.com (mailing list archive)
State New
Headers show
Series AMX Support in KVM | expand

Commit Message

Jing Liu Dec. 22, 2021, 12:40 p.m. UTC
From: Guang Zeng <guang.zeng@intel.com>

When AMX is enabled it requires a larger xstate buffer than
the legacy hardcoded 4KB one. Exising kvm ioctls
(KVM_[G|S]ET_XSAVE under KVM_CAP_XSAVE) are not suitable for
this purpose.

A new capability (KVM_CAP_XSAVE2) is introduced to mark an
extended kvm_xsave format to support >4KB fpstate. The expanded
fpstate size is returned to userspace when it checks the
KVM_CAP_XSAVE2 capability.

Introduce a new KVM_GET_XSAVE2 under this capability to use the
new format for copying guest fpstate to userspace. Reuse
KVM_SET_XSAVE for both old/new formats by reimplementing it to
do properly-sized memdup_user() based on the guest fpu container.

Also, update the api doc with the new KVM_GET_XSAVE2 ioctl.

Signed-off-by: Guang Zeng <guang.zeng@intel.com>
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Jing Liu <jing2.liu@intel.com>
---
 Documentation/virt/kvm/api.rst  | 42 +++++++++++++++++++++++++++++++--
 arch/x86/include/uapi/asm/kvm.h | 16 ++++++++++++-
 arch/x86/kvm/x86.c              | 39 +++++++++++++++++++++++++++++-
 include/uapi/linux/kvm.h        |  4 ++++
 4 files changed, 97 insertions(+), 4 deletions(-)

Comments

Sean Christopherson Dec. 29, 2021, 12:38 a.m. UTC | #1
Shortlog needs to have a verb somewhere.

On Wed, Dec 22, 2021, Jing Liu wrote:
> From: Guang Zeng <guang.zeng@intel.com>
> 
> When AMX is enabled it requires a larger xstate buffer than
> the legacy hardcoded 4KB one. Exising kvm ioctls

Existing

> (KVM_[G|S]ET_XSAVE under KVM_CAP_XSAVE) are not suitable for
> this purpose.

...

> Reuse KVM_SET_XSAVE for both old/new formats by reimplementing it to
> do properly-sized memdup_user() based on the guest fpu container.

I'm confused, the first sentence says KVM_SET_XSAVE isn't suitable, the second
says it can be reused with minimal effort.

> Also, update the api doc with the new KVM_GET_XSAVE2 ioctl.

...

> @@ -5367,7 +5382,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  		break;
>  	}
>  	case KVM_SET_XSAVE: {
> -		u.xsave = memdup_user(argp, sizeof(*u.xsave));
> +		int size = vcpu->arch.guest_fpu.uabi_size;

IIUC, reusing KVM_SET_XSAVE works by requiring that userspace use KVM_GET_XSAVE2
if userspace has expanded the guest FPU size by exposing relevant features to
the guest via guest CPUID.  If so, then that needs to be enforced in KVM_GET_XSAVE,
otherwise userspace will get subtle corruption by invoking the wrong ioctl, e.g.

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2c9606380bca..5d2acbd52df5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5386,6 +5386,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                break;
        }
        case KVM_GET_XSAVE: {
+               r -EINVAL;
+               if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
+                       break;
+
                u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
                r = -ENOMEM;
                if (!u.xsave)

> +
> +		u.xsave = memdup_user(argp, size);
>  		if (IS_ERR(u.xsave)) {
>  			r = PTR_ERR(u.xsave);
>  			goto out_nofree;
> @@ -5376,6 +5393,26 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
>  		break;
>  	}
> +
> +	case KVM_GET_XSAVE2: {
> +		int size = vcpu->arch.guest_fpu.uabi_size;
> +
> +		u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
> +		if (!u.xsave) {
> +			r = -ENOMEM;

I hate the odd patterns in this code as much as anyone, but for better or worse
the style throughout is:

		r = -ENOMEM;
		u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
		if (u.xsave)
			break;

> +			break;
> +		}
> +
> +		kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
> +
> +		if (copy_to_user(argp, u.xsave, size)) {
> +			r = -EFAULT;
> +			break;

Same style thing here.

> +		}
> +		r = 0;
> +		break;
> +	}
> +
>  	case KVM_GET_XCRS: {
>  		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
>  		r = -ENOMEM;
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 1daa45268de2..9d1c01669560 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1131,6 +1131,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
>  #define KVM_CAP_ARM_MTE 205
>  #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
> +#define KVM_CAP_XSAVE2 207
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -1610,6 +1611,9 @@ struct kvm_enc_region {
>  #define KVM_S390_NORMAL_RESET	_IO(KVMIO,   0xc3)
>  #define KVM_S390_CLEAR_RESET	_IO(KVMIO,   0xc4)
>  
> +/* Available with KVM_CAP_XSAVE2 */
> +#define KVM_GET_XSAVE2		  _IOR(KVMIO,  0xcf, struct kvm_xsave)
> +
>  struct kvm_s390_pv_sec_parm {
>  	__u64 origin;
>  	__u64 length;
> -- 
> 2.27.0
>
Wei Wang Dec. 29, 2021, 2:57 a.m. UTC | #2
On Wednesday, December 29, 2021 8:39 AM, Sean Christopherson wrote:
> To: Liu, Jing2 <jing2.liu@intel.com>
> Cc: x86@kernel.org; kvm@vger.kernel.org; linux-kernel@vger.kernel.org;
> linux-doc@vger.kernel.org; linux-kselftest@vger.kernel.org; tglx@linutronix.de;
> mingo@redhat.com; bp@alien8.de; dave.hansen@linux.intel.com;
> pbonzini@redhat.com; corbet@lwn.net; shuah@kernel.org; Nakajima, Jun
> <jun.nakajima@intel.com>; Tian, Kevin <kevin.tian@intel.com>;
> jing2.liu@linux.intel.com; Zeng, Guang <guang.zeng@intel.com>; Wang, Wei
> W <wei.w.wang@intel.com>; Zhong, Yang <yang.zhong@intel.com>
> Subject: Re: [PATCH v3 19/22] kvm: x86: Get/set expanded xstate buffer
> 
> Shortlog needs to have a verb somewhere.
> 
> On Wed, Dec 22, 2021, Jing Liu wrote:
> > From: Guang Zeng <guang.zeng@intel.com>
> >
> > When AMX is enabled it requires a larger xstate buffer than the legacy
> > hardcoded 4KB one. Exising kvm ioctls
> 
> Existing
> 
> > (KVM_[G|S]ET_XSAVE under KVM_CAP_XSAVE) are not suitable for this
> > purpose.
> 
> ...
> 
> > Reuse KVM_SET_XSAVE for both old/new formats by reimplementing it to
> > do properly-sized memdup_user() based on the guest fpu container.
> 
> I'm confused, the first sentence says KVM_SET_XSAVE isn't suitable, the
> second says it can be reused with minimal effort.

Probably "doesn't support" sounds better than "isn't suitable" above. But plan to reword a bit:

With KVM_CAP_XSAVE, userspace uses a hardcoded 4KB buffer to get/set xstate data from/to
KVM. This doesn't work when dynamic features (e.g. AMX) are used by the guest, as KVM uses
a full expanded xstate buffer for the guest fpu emulation, which is larger than 4KB.

Add KVM_CAP_XSAVE2, and userspace gets the required xstate buffer size from KVM via
KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2). KVM_SET_XSAVE is extended with the support to
work with larger xstate data size passed from userspace. KVM_GET_XSAVE2 is preferred to
extending KVM_GET_XSAVE to work with large buffer size for backward-compatible considerations.
(Link: https://lkml.org/lkml/2021/12/15/510)

Also, update the api doc with the new KVM_GET_XSAVE2 ioctl.




> 
> > Also, update the api doc with the new KVM_GET_XSAVE2 ioctl.
> 
> ...
> 
> > @@ -5367,7 +5382,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> >  		break;
> >  	}
> >  	case KVM_SET_XSAVE: {
> > -		u.xsave = memdup_user(argp, sizeof(*u.xsave));
> > +		int size = vcpu->arch.guest_fpu.uabi_size;
> 
> IIUC, reusing KVM_SET_XSAVE works by requiring that userspace use
> KVM_GET_XSAVE2 if userspace has expanded the guest FPU size by exposing
> relevant features to the guest via guest CPUID.  If so, then that needs to be
> enforced in KVM_GET_XSAVE, otherwise userspace will get subtle corruption
> by invoking the wrong ioctl, e.g.
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index
> 2c9606380bca..5d2acbd52df5 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5386,6 +5386,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>                 break;
>         }
>         case KVM_GET_XSAVE: {
> +               r -EINVAL;
> +               if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct
> kvm_xsave))
> +                       break;
> +

Looks good to me.

Thanks,
Wei
Tian, Kevin Dec. 29, 2021, 6:36 a.m. UTC | #3
> From: Wang, Wei W <wei.w.wang@intel.com>
> Sent: Wednesday, December 29, 2021 10:58 AM
> 
> > > Reuse KVM_SET_XSAVE for both old/new formats by reimplementing it to
> > > do properly-sized memdup_user() based on the guest fpu container.
> >
> > I'm confused, the first sentence says KVM_SET_XSAVE isn't suitable, the
> > second says it can be reused with minimal effort.
> 
> Probably "doesn't support" sounds better than "isn't suitable" above. But
> plan to reword a bit:
> 
> With KVM_CAP_XSAVE, userspace uses a hardcoded 4KB buffer to get/set
> xstate data from/to
> KVM. This doesn't work when dynamic features (e.g. AMX) are used by the
> guest, as KVM uses
> a full expanded xstate buffer for the guest fpu emulation, which is larger
> than 4KB.
> 
> Add KVM_CAP_XSAVE2, and userspace gets the required xstate buffer size
> from KVM via
> KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2). KVM_SET_XSAVE is extended
> with the support to
> work with larger xstate data size passed from userspace. KVM_GET_XSAVE2
> is preferred to
> extending KVM_GET_XSAVE to work with large buffer size for backward-
> compatible considerations.
> (Link: https://lkml.org/lkml/2021/12/15/510)
> 
> Also, update the api doc with the new KVM_GET_XSAVE2 ioctl.
> 

Revised to:

--

With KVM_CAP_XSAVE, userspace uses a hardcoded 4KB buffer to get/set
xstate data from/to KVM. This doesn't work when dynamic xfeatures
(e.g. AMX) are exposed to the guest as they require a larger buffer
size.

Introduce a new capability (KVM_CAP_XSAVE2). Userspace VMM gets the
required xstate buffer size via KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2).
KVM_SET_XSAVE is extended to work with both legacy and new capabilities
by doing properly-sized memdup_user() based on the guest fpu container.
KVM_GET_XSAVE is kept for backward-compatible reason. Instead,
KVM_GET_XSAVE2 is introduced under KVM_CAP_XSAVE2 as the preferred
interface for getting xstate buffer (4KB or larger size) from KVM.
(Link: https://lkml.org/lkml/2021/12/15/510)

Also, update the api doc with the new KVM_GET_XSAVE2 ioctl

--

Thanks
Kevin
diff mbox series

Patch

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 1cf2483246cd..e48f7de5f23a 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -1566,6 +1566,7 @@  otherwise it will return EBUSY error.
 
   struct kvm_xsave {
 	__u32 region[1024];
+	__u32 extra[0];
   };
 
 This ioctl would copy current vcpu's xsave struct to the userspace.
@@ -1574,7 +1575,7 @@  This ioctl would copy current vcpu's xsave struct to the userspace.
 4.43 KVM_SET_XSAVE
 ------------------
 
-:Capability: KVM_CAP_XSAVE
+:Capability: KVM_CAP_XSAVE and KVM_CAP_XSAVE2
 :Architectures: x86
 :Type: vcpu ioctl
 :Parameters: struct kvm_xsave (in)
@@ -1585,9 +1586,18 @@  This ioctl would copy current vcpu's xsave struct to the userspace.
 
   struct kvm_xsave {
 	__u32 region[1024];
+	__u32 extra[0];
   };
 
-This ioctl would copy userspace's xsave struct to the kernel.
+This ioctl would copy userspace's xsave struct to the kernel. It copies
+as many bytes as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2),
+when invoked on the vm file descriptor. The size value returned by
+KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) will always be at least 4096.
+Currently, it is only greater than 4096 if a dynamic feature has been
+enabled with ``arch_prctl()``, but this may change in the future.
+
+The offsets of the state save areas in struct kvm_xsave follow the
+contents of CPUID leaf 0xD on the host.
 
 
 4.44 KVM_GET_XCRS
@@ -5507,6 +5517,34 @@  the trailing ``'\0'``, is indicated by ``name_size`` in the header.
 The Stats Data block contains an array of 64-bit values in the same order
 as the descriptors in Descriptors block.
 
+4.42 KVM_GET_XSAVE2
+------------------
+
+:Capability: KVM_CAP_XSAVE2
+:Architectures: x86
+:Type: vcpu ioctl
+:Parameters: struct kvm_xsave (out)
+:Returns: 0 on success, -1 on error
+
+
+::
+
+  struct kvm_xsave {
+	__u32 region[1024];
+	__u32 extra[0];
+  };
+
+This ioctl would copy current vcpu's xsave struct to the userspace. It
+copies as many bytes as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+when invoked on the vm file descriptor. The size value returned by
+KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) will always be at least 4096.
+Currently, it is only greater than 4096 if a dynamic feature has been
+enabled with ``arch_prctl()``, but this may change in the future.
+
+The offsets of the state save areas in struct kvm_xsave follow the contents
+of CPUID leaf 0xD on the host.
+
+
 5. The kvm_run structure
 ========================
 
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 5a776a08f78c..2da3316bb559 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -373,9 +373,23 @@  struct kvm_debugregs {
 	__u64 reserved[9];
 };
 
-/* for KVM_CAP_XSAVE */
+/* for KVM_CAP_XSAVE and KVM_CAP_XSAVE2 */
 struct kvm_xsave {
+	/*
+	 * KVM_GET_XSAVE2 and KVM_SET_XSAVE write and read as many bytes
+	 * as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+	 * respectively, when invoked on the vm file descriptor.
+	 *
+	 * The size value returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+	 * will always be at least 4096. Currently, it is only greater
+	 * than 4096 if a dynamic feature has been enabled with
+	 * ``arch_prctl()``, but this may change in the future.
+	 *
+	 * The offsets of the state save areas in struct kvm_xsave follow
+	 * the contents of CPUID leaf 0xD on the host.
+	 */
 	__u32 region[1024];
+	__u32 extra[0];
 };
 
 #define KVM_MAX_XCRS	16
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c558c098979a..3b756ff13103 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4297,6 +4297,11 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		else
 			r = 0;
 		break;
+	case KVM_CAP_XSAVE2:
+		r = kvm->vcpus[0]->arch.guest_fpu.uabi_size;
+		if (r < sizeof(struct kvm_xsave))
+			r = sizeof(struct kvm_xsave);
+		break;
 	default:
 		break;
 	}
@@ -4900,6 +4905,16 @@  static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
 				       vcpu->arch.pkru);
 }
 
+static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
+					  u8 *state, unsigned int size)
+{
+	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
+		return;
+
+	fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
+				       state, size, vcpu->arch.pkru);
+}
+
 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
 					struct kvm_xsave *guest_xsave)
 {
@@ -5367,7 +5382,9 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_SET_XSAVE: {
-		u.xsave = memdup_user(argp, sizeof(*u.xsave));
+		int size = vcpu->arch.guest_fpu.uabi_size;
+
+		u.xsave = memdup_user(argp, size);
 		if (IS_ERR(u.xsave)) {
 			r = PTR_ERR(u.xsave);
 			goto out_nofree;
@@ -5376,6 +5393,26 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
 		break;
 	}
+
+	case KVM_GET_XSAVE2: {
+		int size = vcpu->arch.guest_fpu.uabi_size;
+
+		u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
+		if (!u.xsave) {
+			r = -ENOMEM;
+			break;
+		}
+
+		kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+
+		if (copy_to_user(argp, u.xsave, size)) {
+			r = -EFAULT;
+			break;
+		}
+		r = 0;
+		break;
+	}
+
 	case KVM_GET_XCRS: {
 		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
 		r = -ENOMEM;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 1daa45268de2..9d1c01669560 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1131,6 +1131,7 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
 #define KVM_CAP_ARM_MTE 205
 #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
+#define KVM_CAP_XSAVE2 207
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1610,6 +1611,9 @@  struct kvm_enc_region {
 #define KVM_S390_NORMAL_RESET	_IO(KVMIO,   0xc3)
 #define KVM_S390_CLEAR_RESET	_IO(KVMIO,   0xc4)
 
+/* Available with KVM_CAP_XSAVE2 */
+#define KVM_GET_XSAVE2		  _IOR(KVMIO,  0xcf, struct kvm_xsave)
+
 struct kvm_s390_pv_sec_parm {
 	__u64 origin;
 	__u64 length;