Message ID | 20211208000359.2853257-17-yang.zhong@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | AMX Support in KVM | expand |
On 12/8/21 01:03, Yang Zhong wrote: > From: Jing Liu <jing2.liu@intel.com> > > When dynamic XSTATE features are supported, the xsave states are > beyond 4KB. The current kvm_xsave structure and related > KVM_{G, S}ET_XSAVE only allows 4KB which is not enough for full > states. > > Introduce a new kvm_xsave2 structure and the corresponding > KVM_GET_XSAVE2 and KVM_SET_XSAVE2 ioctls so that userspace VMM > can get and set the full xsave states. > > Signed-off-by: Jing Liu <jing2.liu@intel.com> > Signed-off-by: Yang Zhong <yang.zhong@intel.com> > --- > arch/x86/include/uapi/asm/kvm.h | 6 ++++ > arch/x86/kvm/x86.c | 62 +++++++++++++++++++++++++++++++++ > include/uapi/linux/kvm.h | 7 +++- > 3 files changed, 74 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h > index 5a776a08f78c..de42a51e20c3 100644 > --- a/arch/x86/include/uapi/asm/kvm.h > +++ b/arch/x86/include/uapi/asm/kvm.h > @@ -47,6 +47,7 @@ > #define __KVM_HAVE_VCPU_EVENTS > #define __KVM_HAVE_DEBUGREGS > #define __KVM_HAVE_XSAVE > +#define __KVM_HAVE_XSAVE2 > #define __KVM_HAVE_XCRS > #define __KVM_HAVE_READONLY_MEM > > @@ -378,6 +379,11 @@ struct kvm_xsave { > __u32 region[1024]; > }; > > +struct kvm_xsave2 { > + __u32 size; > + __u8 state[0]; > +}; > + > #define KVM_MAX_XCRS 16 > > struct kvm_xcr { > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 8b033c9241d6..d212f6d2d39a 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -4216,6 +4216,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) > case KVM_CAP_DEBUGREGS: > case KVM_CAP_X86_ROBUST_SINGLESTEP: > case KVM_CAP_XSAVE: > + case KVM_CAP_XSAVE2: > case KVM_CAP_ASYNC_PF: > case KVM_CAP_ASYNC_PF_INT: > case KVM_CAP_GET_TSC_KHZ: > @@ -4940,6 +4941,17 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, > vcpu->arch.pkru); > } > > +static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, > + u8 *state, u32 size) > +{ > + if 
(fpstate_is_confidential(&vcpu->arch.guest_fpu)) > + return; > + > + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, > + state, size, > + vcpu->arch.pkru); > +} > + > static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, > struct kvm_xsave *guest_xsave) > { > @@ -4951,6 +4963,15 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, > supported_xcr0, &vcpu->arch.pkru); > } > > +static int kvm_vcpu_ioctl_x86_set_xsave2(struct kvm_vcpu *vcpu, u8 *state) > +{ > + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) > + return 0; > + > + return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, state, > + supported_xcr0, &vcpu->arch.pkru); > +} > + > static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, > struct kvm_xcrs *guest_xcrs) > { > @@ -5416,6 +5437,47 @@ long kvm_arch_vcpu_ioctl(struct file *filp, > r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); > break; > } > + case KVM_GET_XSAVE2: { > + struct kvm_xsave2 __user *xsave2_arg = argp; > + struct kvm_xsave2 xsave2; > + > + r = -EFAULT; > + if (copy_from_user(&xsave2, xsave2_arg, sizeof(struct kvm_xsave2))) > + break; > + > + u.buffer = kzalloc(xsave2.size, GFP_KERNEL_ACCOUNT); > + > + r = -ENOMEM; > + if (!u.buffer) > + break; > + > + kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, xsave2.size); > + > + r = -EFAULT; > + if (copy_to_user(xsave2_arg->state, u.buffer, xsave2.size)) > + break; > + > + r = 0; > + break; > + } > + case KVM_SET_XSAVE2: { > + struct kvm_xsave2 __user *xsave2_arg = argp; > + struct kvm_xsave2 xsave2; > + > + r = -EFAULT; > + if (copy_from_user(&xsave2, xsave2_arg, sizeof(struct kvm_xsave2))) > + break; > + > + u.buffer = memdup_user(xsave2_arg->state, xsave2.size); > + > + if (IS_ERR(u.buffer)) { > + r = PTR_ERR(u.buffer); > + goto out_nofree; > + } > + > + r = kvm_vcpu_ioctl_x86_set_xsave2(vcpu, u.buffer); > + break; > + } > case KVM_GET_XCRS: { > u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); > r = -ENOMEM; > diff --git 
a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h > index 0c7b301c7254..603e1ca9ba09 100644 > --- a/include/uapi/linux/kvm.h > +++ b/include/uapi/linux/kvm.h > @@ -1132,7 +1132,9 @@ struct kvm_ppc_resize_hpt { > #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204 > #define KVM_CAP_ARM_MTE 205 > #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206 > - > +#ifdef __KVM_HAVE_XSAVE2 > +#define KVM_CAP_XSAVE2 207 > +#endif > #ifdef KVM_CAP_IRQ_ROUTING > > struct kvm_irq_routing_irqchip { > @@ -1679,6 +1681,9 @@ struct kvm_xen_hvm_attr { > #define KVM_GET_SREGS2 _IOR(KVMIO, 0xcc, struct kvm_sregs2) > #define KVM_SET_SREGS2 _IOW(KVMIO, 0xcd, struct kvm_sregs2) > > +#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave2) > +#define KVM_SET_XSAVE2 _IOW(KVMIO, 0xd0, struct kvm_xsave2) > + > struct kvm_xen_vcpu_attr { > __u16 type; > __u16 pad[3]; > Please also modify KVM_GET/SET_XSAVE to fail with ENOSPC if the requested size is bigger than 4096. Paolo
On 12/8/21 01:03, Yang Zhong wrote: > +static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, > + u8 *state, u32 size) > +{ > + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) > + return; > + > + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, > + state, size, > + vcpu->arch.pkru); > +} > + > static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, > struct kvm_xsave *guest_xsave) > { > @@ -4951,6 +4963,15 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, > supported_xcr0, &vcpu->arch.pkru); > } > > +static int kvm_vcpu_ioctl_x86_set_xsave2(struct kvm_vcpu *vcpu, u8 *state) > +{ > + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) > + return 0; > + > + return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, state, > + supported_xcr0, &vcpu->arch.pkru); > +} > + I think fpu_copy_uabi_to_guest_fpstate (and therefore copy_uabi_from_kernel_to_xstate) needs to check that the size is compatible with the components in the input. Also, IIUC the size of the AMX state will vary in different processors. Is this correct? If so, this should be handled already by KVM_GET/SET_XSAVE2 and therefore should be part of the arch/x86/kernel/fpu APIs. In the future we want to support migrating a "small AMX" host to a "large AMX" host; and also migrating from a "large AMX" host to a "small AMX" host if the guest CPUID is compatible with the destination of the migration. Paolo
On 12/10/21 17:30, Paolo Bonzini wrote: >> >> +static int kvm_vcpu_ioctl_x86_set_xsave2(struct kvm_vcpu *vcpu, u8 >> *state) >> +{ >> + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) >> + return 0; >> + >> + return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, state, >> + supported_xcr0, &vcpu->arch.pkru); >> +} >> + > > I think fpu_copy_uabi_to_guest_fpstate (and therefore > copy_uabi_from_kernel_to_xstate) needs to check that the size is > compatible with the components in the input. > > Also, IIUC the size of the AMX state will vary in different processors. > Is this correct? If so, this should be handled already by > KVM_GET/SET_XSAVE2 and therefore should be part of the > arch/x86/kernel/fpu APIs. In the future we want to support migrating a > "small AMX" host to a "large AMX" host; and also migrating from a "large > AMX" host to a "small AMX" host if the guest CPUID is compatible with > the destination of the migration. So, the size of the AMX state will depend on the active "palette" in TILECONFIG, and on the CPUID information. I have a few questions on how Intel intends to handle future extensions to AMX: - can we assume that, in the future, palette 1 will always have the same value (bytes_per_row=64, max_names=8, max_rows=16), and basically that the only variable value is really the number of palettes? - how does Intel plan to handle bigger TILEDATA? Will it use more XCR0 bits or will it rather enlarge save state 18? If it will use more XCR0 bits, I suppose that XCR0 bits will control which palettes can be chosen by LDTILECFG. If not, on the other hand, this will be a first case of one system's XSAVE data not being XRSTOR-able on another system even if the destination system can set XCR0 to the same value as the source system. Likewise, if the size and offsets for save state 18 were to vary depending on the selected palette, then this would be novel, in that the save state size and offsets would not be in CPUID anymore. 
It would be particularly interesting for non-compacted format, where all save states after 18 would also move forward. So, I hope that save state 18 will be frozen to 8k. In that case, and if palette 1 is frozen to the same values as today, implementing migration will not be a problem; it will be essentially the same as SSE->AVX (horizontal extension of existing registers) and/or AVX->AVX512 (both horizontal and vertical extension). By the way, I think KVM_SET_XSAVE2 is not needed. Instead: - KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) should return the size of the buffer that is passed to KVM_GET_XSAVE2 - KVM_GET_XSAVE2 should fill in the buffer expecting that its size is whatever KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) passes - KVM_SET_XSAVE can just expect a buffer that is bigger than 4k if the save states recorded in the header point to offsets larger than 4k. Paolo
On Saturday, December 11, 2021 6:13 AM, Paolo Bonzini wrote: > > By the way, I think KVM_SET_XSAVE2 is not needed. Instead: > > - KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) should return the size of the > buffer that is passed to KVM_GET_XSAVE2 > > - KVM_GET_XSAVE2 should fill in the buffer expecting that its size is > whatever KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) passes > > - KVM_SET_XSAVE can just expect a buffer that is bigger than 4k if the > save states recorded in the header point to offsets larger than 4k. I think one issue is that KVM_SET_XSAVE works with "struct kvm_xsave" (hardcoded 4KB buffer), including kvm_vcpu_ioctl_x86_set_xsave. The states obtained via KVM_GET_XSAVE2 will be made using "struct kvm_xsave2". Did you mean that we could add a new code path under KVM_SET_XSAVE to make it work with the new "struct kvm_xsave2"? e.g.: (xsave2_enabled below is set when userspace calls to get KVM_CAP_XSAVE2) if (kvm->xsave2_enabled) { new implementation using "struct kvm_xsave2" } else { current implementation using "struct kvm_xsave" } (this seems like a new implementation which might deserve a new ioctl) Thanks, Wei
On 12/13/21 09:23, Wang, Wei W wrote: > On Saturday, December 11, 2021 6:13 AM, Paolo Bonzini wrote: >> >> By the way, I think KVM_SET_XSAVE2 is not needed. Instead: >> >> - KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) should return the size of the >> buffer that is passed to KVM_GET_XSAVE2 >> >> - KVM_GET_XSAVE2 should fill in the buffer expecting that its size is >> whatever KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) passes >> >> - KVM_SET_XSAVE can just expect a buffer that is bigger than 4k if the >> save states recorded in the header point to offsets larger than 4k. > > I think one issue is that KVM_SET_XSAVE works with "struct kvm_xsave" (hardcoded 4KB buffer), > including kvm_vcpu_ioctl_x86_set_xsave. The states obtained via KVM_GET_XSAVE2 will be made > using "struct kvm_xsave2". > > Did you mean that we could add a new code path under KVM_SET_XSAVE to make it work with > the new "struct kvm_xsave2"? There is no need for struct kvm_xsave2, because there is no need for a "size" argument. - KVM_GET_XSAVE2 *is* needed, and it can expect a buffer as big as the return value of KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) - but KVM_SET_XSAVE2 is not needed, because KVM_SET_XSAVE can use copy_from_user to read the XSTATE_BV, use it to deduce the size of the buffer, and use copy_from_user to read the full size of the buffer. For this to work you can redefine struct kvm_xsave to struct kvm_xsave { __u32 region[1024]; /* * KVM_GET_XSAVE only uses 4096 bytes and only returns * user save states up to save state 17 (TILECFG). * * For KVM_GET_XSAVE2, the total size of region + extra * must be the size that is communicated by * KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2). * * KVM_SET_XSAVE uses the extra field if the struct was * returned by KVM_GET_XSAVE2. */ __u32 extra[]; } Paolo
On Fri, Dec 10 2021 at 17:30, Paolo Bonzini wrote: > On 12/8/21 01:03, Yang Zhong wrote: >> +static int kvm_vcpu_ioctl_x86_set_xsave2(struct kvm_vcpu *vcpu, u8 *state) >> +{ >> + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) >> + return 0; >> + >> + return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, state, >> + supported_xcr0, &vcpu->arch.pkru); >> +} >> + > > I think fpu_copy_uabi_to_guest_fpstate (and therefore > copy_uabi_from_kernel_to_xstate) needs to check that the size is > compatible with the components in the input. fpu_copy_uabi_to_guest_fpstate() expects that the input buffer is correctly sized. We surely can add a size check there. > Also, IIUC the size of the AMX state will vary in different processors. > Is this correct? If so, this should be handled already by > KVM_GET/SET_XSAVE2 and therefore should be part of the > arch/x86/kernel/fpu APIs. In the future we want to support migrating a > "small AMX" host to a "large AMX" host; and also migrating from a "large > AMX" host to a "small AMX" host if the guest CPUID is compatible with > the destination of the migration. How is that supposed to work? If the AMX state size differs then the hosts are not compatible. Thanks, tglx
On 12/13/21 11:10, Thomas Gleixner wrote: > On Fri, Dec 10 2021 at 17:30, Paolo Bonzini wrote: >> On 12/8/21 01:03, Yang Zhong wrote: >>> +static int kvm_vcpu_ioctl_x86_set_xsave2(struct kvm_vcpu *vcpu, u8 *state) >>> +{ >>> + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) >>> + return 0; >>> + >>> + return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, state, >>> + supported_xcr0, &vcpu->arch.pkru); >>> +} >>> + >> >> I think fpu_copy_uabi_to_guest_fpstate (and therefore >> copy_uabi_from_kernel_to_xstate) needs to check that the size is >> compatible with the components in the input. > > fpu_copy_uabi_to_guest_fpstate() expects that the input buffer is > correctly sized. We surely can add a size check there. fpu_copy_guest_fpstate_to_uabi is more problematic because that one writes memory. For fpu_copy_uabi_to_guest_fpstate, we know the input buffer size from the components and we can use it to do a properly-sized memdup_user. For fpu_copy_guest_fpstate_to_uabi we can just decide that KVM_GET_XSAVE will only save up to the first 4K. 
Something like the following might actually be good for 5.16-rc; right now, header.xfeatures might lead userspace into reading uninitialized or unmapped memory: diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d28829403ed0..69609b8c3887 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1138,8 +1138,10 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, struct xstate_header header; unsigned int zerofrom; u64 mask; + u64 size; int i; + size = to->left; memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; @@ -1186,7 +1188,20 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, /* Copy xsave->i387.sw_reserved */ membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved)); - /* Copy the user space relevant state of @xsave->header */ + /* + * Copy the user space relevant state of @xsave->header. + * If not all features fit in the buffer, drop them from the + * saved state so that userspace does not read uninitialized or + * unmapped memory. + */ + mask = fpstate->user_xfeatures; + for_each_extended_xfeature(i, mask) { + if (xstate_offsets[i] + xstate_size[i] > size) { + header.xfeatures &= BIT(i) - 1; + mask &= BIT(i) - 1; + break; + } + } membuf_write(&to, &header, sizeof(header)); zerofrom = offsetof(struct xregs_state, extended_state_area); @@ -1197,7 +1212,6 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, * but there is no state to copy from in the compacted * init_fpstate. The gap tracking will zero these states. */ - mask = fpstate->user_xfeatures; for_each_extended_xfeature(i, mask) { /* >> Also, IIUC the size of the AMX state will vary in different processors. >> Is this correct? If so, this should be handled already by >> KVM_GET/SET_XSAVE2 and therefore should be part of the >> arch/x86/kernel/fpu APIs. 
In the future we want to support migrating a >> "small AMX" host to a "large AMX" host; and also migrating from a "large >> AMX" host to a "small AMX" host if the guest CPUID is compatible with >> the destination of the migration. > > How is that supposed to work? If the AMX state size differs then the > hosts are not compatible. I replied with some more questions later. Basically it depends on how Intel will define palettes that aren't part of the first implementation of AMX. Paolo
On Mon, Dec 13 2021 at 11:43, Paolo Bonzini wrote: > On 12/13/21 11:10, Thomas Gleixner wrote: >> On Fri, Dec 10 2021 at 17:30, Paolo Bonzini wrote: >>> I think fpu_copy_uabi_to_guest_fpstate (and therefore >>> copy_uabi_from_kernel_to_xstate) needs to check that the size is >>> compatible with the components in the input. >> >> fpu_copy_uabi_to_guest_fpstate() expects that the input buffer is >> correctly sized. We surely can add a size check there. > > fpu_copy_guest_fpstate_to_uabi is more problematic because that one > writes memory. For fpu_copy_uabi_to_guest_fpstate, we know the input > buffer size from the components and we can use it to do a properly-sized > memdup_user. > > For fpu_copy_guest_fpstate_to_uabi we can just decide that KVM_GET_XSAVE > will only save up to the first 4K. Something like the following might > actually be good for 5.16-rc; right now, header.xfeatures might lead > userspace into reading uninitialized or unmapped memory: If user space supplies a 4k buffer and reads beyond the end of the buffer then it's hardly a kernel problem. That function allows to provide a short buffer and fill it up to the point where the buffer ends with the real information. Thanks, tglx
On Monday, December 13, 2021 5:24 PM, Paolo Bonzini wrote: > There is no need for struct kvm_xsave2, because there is no need for a "size" > argument. > > - KVM_GET_XSAVE2 *is* needed, and it can expect a buffer as big as the return > value of KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) Why would KVM_GET_XSAVE2 still be needed in this case? I'm thinking it would also be possible to reuse KVM_GET_XSAVE: - If userspace calls to KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2), then KVM knows that the userspace is a new version and it works with larger xsave buffer using the "size" that it returns via KVM_CAP_XSAVE2. So we can add a flag "kvm->xsave2_enabled", which gets set upon userspace checks KVM_CAP_XSAVE2. - On KVM_GET_XSAVE, if "kvm->xsave2_enabled" is set, then KVM allocates buffer to load xstates and copies the loaded xstates data to the userspace buffer using the "size" that was returned to userspace on KVM_CAP_XSAVE2. If "kvm->xsave2_enabled" isn't set, using the legacy "4KB" size. Thanks, Wei
On 12/14/21 07:06, Wang, Wei W wrote: > On Monday, December 13, 2021 5:24 PM, Paolo Bonzini wrote: >> There is no need for struct kvm_xsave2, because there is no need for a "size" >> argument. >> >> - KVM_GET_XSAVE2 *is* needed, and it can expect a buffer as big as the return >> value of KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) > > Why would KVM_GET_XSAVE2 still be needed in this case? > > I'm thinking it would also be possible to reuse KVM_GET_XSAVE: > > - If userspace calls to KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2), > then KVM knows that the userspace is a new version and it works with larger xsave buffer using the "size" that it returns via KVM_CAP_XSAVE2. > So we can add a flag "kvm->xsave2_enabled", which gets set upon userspace checks KVM_CAP_XSAVE2. You can use KVM_ENABLE_CAP(KVM_CAP_XSAVE2) for that, yes. In that case you don't need KVM_GET_XSAVE2. Paolo > - On KVM_GET_XSAVE, if "kvm->xsave2_enabled" is set, > then KVM allocates buffer to load xstates and copies the loaded xstates data to the userspace buffer > using the "size" that was returned to userspace on KVM_CAP_XSAVE2. > If "kvm->xsave2_enabled" isn't set, using the legacy "4KB" size. > > Thanks, > Wei >
On Tuesday, December 14, 2021 2:19 PM, Paolo Bonzini wrote: > To: Wang, Wei W <wei.w.wang@intel.com>; Zhong, Yang > <yang.zhong@intel.com>; x86@kernel.org; kvm@vger.kernel.org; > linux-kernel@vger.kernel.org; tglx@linutronix.de; mingo@redhat.com; > bp@alien8.de; dave.hansen@linux.intel.com > Cc: seanjc@google.com; Nakajima, Jun <jun.nakajima@intel.com>; Tian, Kevin > <kevin.tian@intel.com>; jing2.liu@linux.intel.com; Liu, Jing2 > <jing2.liu@intel.com>; Zeng, Guang <guang.zeng@intel.com> > Subject: Re: [PATCH 16/19] kvm: x86: Introduce KVM_{G|S}ET_XSAVE2 ioctl > > On 12/14/21 07:06, Wang, Wei W wrote: > > On Monday, December 13, 2021 5:24 PM, Paolo Bonzini wrote: > >> There is no need for struct kvm_xsave2, because there is no need for a > "size" > >> argument. > >> > >> - KVM_GET_XSAVE2 *is* needed, and it can expect a buffer as big as > >> the return value of KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) > > > > Why would KVM_GET_XSAVE2 still be needed in this case? > > > > I'm thinking it would also be possible to reuse KVM_GET_XSAVE: > > > > - If userspace calls to KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2), > > then KVM knows that the userspace is a new version and it works with > larger xsave buffer using the "size" that it returns via KVM_CAP_XSAVE2. > > So we can add a flag "kvm->xsave2_enabled", which gets set upon > userspace checks KVM_CAP_XSAVE2. > > You can use KVM_ENABLE_CAP(KVM_CAP_XSAVE2) for that, yes. In that case > you don't need KVM_GET_XSAVE2. One more thing here, what size should KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) return? If the size still comes from the guest CPUID(0xd, 0)::RCX, would it be better to just return 1? This requires that the QEMU CPUID info has been set to KVM before checking the cap. QEMU already has this CPUID info to get the size (seems no need to inquire KVM for it). Thanks, Wei
On 12/15/21 03:39, Wang, Wei W wrote: >>> Why would KVM_GET_XSAVE2 still be needed in this case? >>> >>> I'm thinking it would also be possible to reuse KVM_GET_XSAVE: >>> >>> - If userspace calls to KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2), >>> then KVM knows that the userspace is a new version and it works with >> larger xsave buffer using the "size" that it returns via KVM_CAP_XSAVE2. >>> So we can add a flag "kvm->xsave2_enabled", which gets set upon >> userspace checks KVM_CAP_XSAVE2. >> >> You can use KVM_ENABLE_CAP(KVM_CAP_XSAVE2) for that, yes. In that case >> you don't need KVM_GET_XSAVE2. > > One more thing here, what size should KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) return? > If the size still comes from the guest CPUID(0xd, 0)::RCX, would it be better to just return 1? > This requires that the QEMU CPUID info has been set to KVM before checking the cap. > QEMU already has this CPUID info to get the size (seems no need to inquire KVM for it). It's still easier to return the full size of the buffer from KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2). It makes the userspace code a bit easier. I'm also thinking that I prefer KVM_GET_XSAVE2 to KVM_ENABLE_CAP(KVM_CAP_XSAVE2), after all. Since it would be a backwards-incompatible change to an _old_ ioctl (KVM_GET_XSAVE), I prefer to limit the ways that userspace can shoot itself in the foot. Paolo
On Wednesday, December 15, 2021 9:43 PM, Paolo Bonzini wrote: > It's still easier to return the full size of the buffer from > KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2). It makes the userspace code a > bit easier. OK. For the "full size" returned to userspace, would you prefer to directly use the value retrieved from guest CPUID(0xd), or get it from guest_fpu (i.e. fpstate->user_size)? (retrieved from CPUID will be the max size and should work fine as well) > > I'm also thinking that I prefer KVM_GET_XSAVE2 to > KVM_ENABLE_CAP(KVM_CAP_XSAVE2), after all. Since it would be a > backwards-incompatible change to an _old_ ioctl (KVM_GET_XSAVE), I prefer > to limit the ways that userspace can shoot itself in the foot. OK, we will use KVM_GET_XSAVE2. Thanks, Wei
On 12/16/21 09:25, Wang, Wei W wrote: >> It's still easier to return the full size of the buffer from >> KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2). It makes the userspace code >> a bit easier. > > OK. For the "full size" returned to userspace, would you prefer to > directly use the value retrieved from guest CPUID(0xd), or get it > from guest_fpu (i.e. fpstate->user_size)? (retrieved from CPUID will > be the max size and should work fine as well) It is okay to reflect only the bits that were enabled in prctl, but please document it in api.rst as well. Paolo
> On Dec 10, 2021, at 2:13 PM, Paolo Bonzini <pbonzini@redhat.com> wrote: > > On 12/10/21 17:30, Paolo Bonzini wrote: >>> >>> +static int kvm_vcpu_ioctl_x86_set_xsave2(struct kvm_vcpu *vcpu, u8 *state) >>> +{ >>> + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) >>> + return 0; >>> + >>> + return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, state, >>> + supported_xcr0, &vcpu->arch.pkru); >>> +} >>> + >> I think fpu_copy_uabi_to_guest_fpstate (and therefore copy_uabi_from_kernel_to_xstate) needs to check that the size is compatible with the components in the input. >> Also, IIUC the size of the AMX state will vary in different processors. Is this correct? If so, this should be handled already by KVM_GET/SET_XSAVE2 and therefore should be part of the arch/x86/kernel/fpu APIs. In the future we want to support migrating a "small AMX" host to a "large AMX" host; and also migrating from a "large AMX" host to a "small AMX" host if the guest CPUID is compatible with the destination of the migration. > > So, the size of the AMX state will depend on the active "palette" in TILECONFIG, and on the CPUID information. I have a few questions on how Intel intends to handle future extensions to AMX: > > - can we assume that, in the future, palette 1 will always have the same value (bytes_per_row=64, max_names=8, max_rows=16), and basically that the only variable value is really the number of palettes? > > - how does Intel plan to handle bigger TILEDATA? Will it use more XCR0 bits or will it rather enlarge save state 18? > > If it will use more XCR0 bits, I suppose that XCR0 bits will control which palettes can be chosen by LDTILECFG. > > If not, on the other hand, this will be a first case of one system's XSAVE data not being XRSTOR-able on another system even if the destination system can set XCR0 to the same value as the source system. 
> > Likewise, if the size and offsets for save state 18 were to vary depending on the selected palette, then this would be novel, in that the save state size and offsets would not be in CPUID anymore. It would be particularly interesting for non-compacted format, where all save states after 18 would also move forward. > > So, I hope that save state 18 will be frozen to 8k. In that case, and if palette 1 is frozen to the same values as today, implementing migration will not be a problem; it will be essentially the same as SSE->AVX (horizontal extension of existing registers) and/or AVX->AVX512 (both horizontal and vertical extension). Hi Paolo, I would like to confirm that the state component 18 will remain 8KB and palette 1 will remain the same. Thanks, --- Jun
On 12/20/21 18:54, Nakajima, Jun wrote: > Hi Paolo, > > I would like to confirm that the state component 18 will remain 8KB and palette 1 will remain the same. Great! Can you also confirm that XCR0 bits will control which palettes can be chosen by LDTILECFG? Thanks, Paolo
On 12/20/21 9:54 AM, Nakajima, Jun wrote: >> So, I hope that save state 18 will be frozen to 8k. In that case, >> and if palette 1 is frozen to the same values as today, >> implementing migration will not be a problem; it will be >> essentially the same as SSE->AVX (horizontal extension of existing >> registers) and/or AVX->AVX512 (both horizontal and vertical >> extension). > > I would like to confirm that the state component 18 will remain 8KB > and palette 1 will remain the same. Is that an architectural statement that will soon be making its way into the SDM?
> On Dec 22, 2021, at 6:44 AM, Paolo Bonzini <pbonzini@redhat.com> wrote: > > On 12/20/21 18:54, Nakajima, Jun wrote: >> Hi Paolo, >> I would like to confirm that the state component 18 will remain 8KB and palette 1 will remain the same. > > Great! Can you also confirm that XCR0 bits will control which palettes can be chosen by LDTILECFG? > I need to discuss with the H/W team more and come back. I think this request is plausible; I will suggest it. Thanks, --- Jun
> On Dec 22, 2021, at 6:52 AM, Hansen, Dave <dave.hansen@intel.com> wrote: > > On 12/20/21 9:54 AM, Nakajima, Jun wrote: >>> So, I hope that save state 18 will be frozen to 8k. In that case, >>> and if palette 1 is frozen to the same values as today, >>> implementing migration will not be a problem; it will be >>> essentially the same as SSE->AVX (horizontal extension of existing >>> registers) and/or AVX->AVX512 (both horizontal and vertical >>> extension). >> >> I would like to confirm that the state component 18 will remain 8KB >> and palette 1 will remain the same. > > Is that an architectural statement that will soon be making its way into > the SDM? Yes, with the other clarifications (e.g. setting IA32_XFD[18]). Thanks, --- Jun
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 5a776a08f78c..de42a51e20c3 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -47,6 +47,7 @@ #define __KVM_HAVE_VCPU_EVENTS #define __KVM_HAVE_DEBUGREGS #define __KVM_HAVE_XSAVE +#define __KVM_HAVE_XSAVE2 #define __KVM_HAVE_XCRS #define __KVM_HAVE_READONLY_MEM @@ -378,6 +379,11 @@ struct kvm_xsave { __u32 region[1024]; }; +struct kvm_xsave2 { + __u32 size; + __u8 state[0]; +}; + #define KVM_MAX_XCRS 16 struct kvm_xcr { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8b033c9241d6..d212f6d2d39a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4216,6 +4216,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: + case KVM_CAP_XSAVE2: case KVM_CAP_ASYNC_PF: case KVM_CAP_ASYNC_PF_INT: case KVM_CAP_GET_TSC_KHZ: @@ -4940,6 +4941,17 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, vcpu->arch.pkru); } +static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, + u8 *state, u32 size) +{ + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return; + + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, + state, size, + vcpu->arch.pkru); +} + static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { @@ -4951,6 +4963,15 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, supported_xcr0, &vcpu->arch.pkru); } +static int kvm_vcpu_ioctl_x86_set_xsave2(struct kvm_vcpu *vcpu, u8 *state) +{ + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return 0; + + return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, state, + supported_xcr0, &vcpu->arch.pkru); +} + static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, struct kvm_xcrs *guest_xcrs) { @@ -5416,6 +5437,47 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); 
break; } + case KVM_GET_XSAVE2: { + struct kvm_xsave2 __user *xsave2_arg = argp; + struct kvm_xsave2 xsave2; + + r = -EFAULT; + if (copy_from_user(&xsave2, xsave2_arg, sizeof(struct kvm_xsave2))) + break; + + u.buffer = kzalloc(xsave2.size, GFP_KERNEL_ACCOUNT); + + r = -ENOMEM; + if (!u.buffer) + break; + + kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, xsave2.size); + + r = -EFAULT; + if (copy_to_user(xsave2_arg->state, u.buffer, xsave2.size)) + break; + + r = 0; + break; + } + case KVM_SET_XSAVE2: { + struct kvm_xsave2 __user *xsave2_arg = argp; + struct kvm_xsave2 xsave2; + + r = -EFAULT; + if (copy_from_user(&xsave2, xsave2_arg, sizeof(struct kvm_xsave2))) + break; + + u.buffer = memdup_user(xsave2_arg->state, xsave2.size); + + if (IS_ERR(u.buffer)) { + r = PTR_ERR(u.buffer); + goto out_nofree; + } + + r = kvm_vcpu_ioctl_x86_set_xsave2(vcpu, u.buffer); + break; + } case KVM_GET_XCRS: { u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); r = -ENOMEM; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 0c7b301c7254..603e1ca9ba09 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1132,7 +1132,9 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204 #define KVM_CAP_ARM_MTE 205 #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206 - +#ifdef __KVM_HAVE_XSAVE2 +#define KVM_CAP_XSAVE2 207 +#endif #ifdef KVM_CAP_IRQ_ROUTING struct kvm_irq_routing_irqchip { @@ -1679,6 +1681,9 @@ struct kvm_xen_hvm_attr { #define KVM_GET_SREGS2 _IOR(KVMIO, 0xcc, struct kvm_sregs2) #define KVM_SET_SREGS2 _IOW(KVMIO, 0xcd, struct kvm_sregs2) +#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave2) +#define KVM_SET_XSAVE2 _IOW(KVMIO, 0xd0, struct kvm_xsave2) + struct kvm_xen_vcpu_attr { __u16 type; __u16 pad[3];