diff mbox series

[13/31] x86/fpu: Move KVMs FPU swapping to FPU core

Message ID 20211011223611.069324121@linutronix.de (mailing list archive)
State New, archived
Headers show
Series x86/fpu: Preparatory cleanups for AMX support (part 1) | expand

Commit Message

Thomas Gleixner Oct. 12, 2021, midnight UTC
Swapping the host/guest FPU is directly fiddling with FPU internals which
requires 5 exports. The upcoming support of dymanically enabled states
would even need more.

Implement a swap function in the FPU core code and export that instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm@vger.kernel.org
Cc: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/fpu/api.h      |    8 +++++
 arch/x86/include/asm/fpu/internal.h |   15 +---------
 arch/x86/kernel/fpu/core.c          |   30 ++++++++++++++++++---
 arch/x86/kernel/fpu/init.c          |    1 
 arch/x86/kernel/fpu/xstate.c        |    1 
 arch/x86/kvm/x86.c                  |   51 +++++++-----------------------------
 arch/x86/mm/extable.c               |    2 -
 7 files changed, 48 insertions(+), 60 deletions(-)

Comments

Borislav Petkov Oct. 12, 2021, 4:53 p.m. UTC | #1
Just typos:

On Tue, Oct 12, 2021 at 02:00:17AM +0200, Thomas Gleixner wrote:
> Swapping the host/guest FPU is directly fiddling with FPU internals which
> requires 5 exports. The upcoming support of dymanically enabled states

"dynamically"

>  /*
>   * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
>   * disables preemption so be careful if you intend to use it for long periods
> @@ -108,4 +110,10 @@ extern int cpu_has_xfeatures(u64 xfeatur
>  
>  static inline void update_pasid(void) { }
>  
> +/* FPSTATE related functions which are exported to KVM */

fpstate-related

> +extern void fpu_init_fpstate_user(struct fpu *fpu);
> +
> +/* KVM specific functions */
> +extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask);
> +
>  #endif /* _ASM_X86_FPU_API_H */

...

>  /* Swap (qemu) user FPU context for the guest FPU context. */
>  static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
>  {
> -	fpregs_lock();
> -
> -	kvm_save_current_fpu(vcpu->arch.user_fpu);
> -
>  	/*
> -	 * Guests with protected state can't have it set by the hypervisor,
> -	 * so skip trying to set it.
> +	 * Guest with protected state have guest_fpu == NULL which makes

"Guests ... "

> +	 * the swap only safe the host state. Exclude PKRU from restore as

"save"

> +	 * it is restored separately in kvm_x86_ops.run().
>  	 */
> -	if (vcpu->arch.guest_fpu)
> -		/* PKRU is separately restored in kvm_x86_ops.run. */
> -		__restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state,
> -					~XFEATURE_MASK_PKRU);
> -
> -	fpregs_mark_activate();
> -	fpregs_unlock();
> -
> +	fpu_swap_kvm_fpu(vcpu->arch.user_fpu, vcpu->arch.guest_fpu,
> +			 ~XFEATURE_MASK_PKRU);
>  	trace_kvm_fpu(1);
>  }
>  
>  /* When vcpu_run ends, restore user space FPU context. */
>  static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
>  {
> -	fpregs_lock();
> -
>  	/*
> -	 * Guests with protected state can't have it read by the hypervisor,
> -	 * so skip trying to save it.
> +	 * Guest with protected state have guest_fpu == NULL which makes

"Guests ... "

> +	 * swap only restore the host state.
>  	 */
> -	if (vcpu->arch.guest_fpu)
> -		kvm_save_current_fpu(vcpu->arch.guest_fpu);
> -
> -	restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state);
> -
> -	fpregs_mark_activate();
> -	fpregs_unlock();
> -
> +	fpu_swap_kvm_fpu(vcpu->arch.guest_fpu, vcpu->arch.user_fpu, ~0ULL);
>  	++vcpu->stat.fpu_reload;
>  	trace_kvm_fpu(0);
>  }
> --- a/arch/x86/mm/extable.c
> +++ b/arch/x86/mm/extable.c
> @@ -47,7 +47,7 @@ static bool ex_handler_fprestore(const s
>  	WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
>  		  (void *)instruction_pointer(regs));
>  
> -	__restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate());
> +	restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate());
>  	return true;
>  }
>  
>
Paolo Bonzini Oct. 12, 2021, 5:22 p.m. UTC | #2
On 12/10/21 02:00, Thomas Gleixner wrote:
> Swapping the host/guest FPU is directly fiddling with FPU internals which
> requires 5 exports. The upcoming support of dymanically enabled states
> would even need more.
> 
> Implement a swap function in the FPU core code and export that instead.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Cc: kvm@vger.kernel.org
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> ---
>   arch/x86/include/asm/fpu/api.h      |    8 +++++
>   arch/x86/include/asm/fpu/internal.h |   15 +---------
>   arch/x86/kernel/fpu/core.c          |   30 ++++++++++++++++++---
>   arch/x86/kernel/fpu/init.c          |    1
>   arch/x86/kernel/fpu/xstate.c        |    1
>   arch/x86/kvm/x86.c                  |   51 +++++++-----------------------------
>   arch/x86/mm/extable.c               |    2 -
>   7 files changed, 48 insertions(+), 60 deletions(-)
> 
> --- a/arch/x86/include/asm/fpu/api.h
> +++ b/arch/x86/include/asm/fpu/api.h
> @@ -12,6 +12,8 @@
>   #define _ASM_X86_FPU_API_H
>   #include <linux/bottom_half.h>
>   
> +#include <asm/fpu/types.h>
> +
>   /*
>    * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
>    * disables preemption so be careful if you intend to use it for long periods
> @@ -108,4 +110,10 @@ extern int cpu_has_xfeatures(u64 xfeatur
>   
>   static inline void update_pasid(void) { }
>   
> +/* FPSTATE related functions which are exported to KVM */
> +extern void fpu_init_fpstate_user(struct fpu *fpu);
> +
> +/* KVM specific functions */
> +extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask);
> +
>   #endif /* _ASM_X86_FPU_API_H */
> --- a/arch/x86/include/asm/fpu/internal.h
> +++ b/arch/x86/include/asm/fpu/internal.h
> @@ -74,14 +74,8 @@ static __always_inline __pure bool use_f
>   	return static_cpu_has(X86_FEATURE_FXSR);
>   }
>   
> -/*
> - * fpstate handling functions:
> - */
> -
>   extern union fpregs_state init_fpstate;
> -
>   extern void fpstate_init_user(union fpregs_state *state);
> -extern void fpu_init_fpstate_user(struct fpu *fpu);
>   
>   #ifdef CONFIG_MATH_EMULATION
>   extern void fpstate_init_soft(struct swregs_state *soft);
> @@ -381,12 +375,7 @@ static inline int os_xrstor_safe(struct
>   	return err;
>   }
>   
> -extern void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask);
> -
> -static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate)
> -{
> -	__restore_fpregs_from_fpstate(fpstate, xfeatures_mask_fpstate());
> -}
> +extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask);
>   
>   extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
>   
> @@ -467,7 +456,7 @@ static inline void fpregs_restore_userre
>   		 */
>   		mask = xfeatures_mask_restore_user() |
>   			xfeatures_mask_supervisor();
> -		__restore_fpregs_from_fpstate(&fpu->state, mask);
> +		restore_fpregs_from_fpstate(&fpu->state, mask);
>   
>   		fpregs_activate(fpu);
>   		fpu->last_cpu = cpu;
> --- a/arch/x86/kernel/fpu/core.c
> +++ b/arch/x86/kernel/fpu/core.c
> @@ -124,9 +124,8 @@ void save_fpregs_to_fpstate(struct fpu *
>   	asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
>   	frstor(&fpu->state.fsave);
>   }
> -EXPORT_SYMBOL(save_fpregs_to_fpstate);
>   
> -void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask)
> +void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask)
>   {
>   	/*
>   	 * AMD K7/K8 and later CPUs up to Zen don't save/restore
> @@ -151,7 +150,31 @@ void __restore_fpregs_from_fpstate(union
>   			frstor(&fpstate->fsave);
>   	}
>   }
> -EXPORT_SYMBOL_GPL(__restore_fpregs_from_fpstate);
> +
> +#if IS_ENABLED(CONFIG_KVM)
> +void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask)
> +{
> +	fpregs_lock();
> +
> +	if (save) {
> +		if (test_thread_flag(TIF_NEED_FPU_LOAD)) {
> +			memcpy(&save->state, &current->thread.fpu.state,
> +			       fpu_kernel_xstate_size);
> +		} else {
> +			save_fpregs_to_fpstate(save);
> +		}
> +	}
> +
> +	if (rstor) {
> +		restore_mask &= xfeatures_mask_fpstate();
> +		restore_fpregs_from_fpstate(&rstor->state, restore_mask);
> +	}
> +
> +	fpregs_mark_activate();
> +	fpregs_unlock();
> +}
> +EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpu);
> +#endif
>   
>   void kernel_fpu_begin_mask(unsigned int kfpu_mask)
>   {
> @@ -459,7 +482,6 @@ void fpregs_mark_activate(void)
>   	fpu->last_cpu = smp_processor_id();
>   	clear_thread_flag(TIF_NEED_FPU_LOAD);
>   }
> -EXPORT_SYMBOL_GPL(fpregs_mark_activate);
>   
>   /*
>    * x87 math exception handling:
> --- a/arch/x86/kernel/fpu/init.c
> +++ b/arch/x86/kernel/fpu/init.c
> @@ -136,7 +136,6 @@ static void __init fpu__init_system_gene
>    * components into a single, continuous memory block:
>    */
>   unsigned int fpu_kernel_xstate_size __ro_after_init;
> -EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size);
>   
>   /* Get alignment of the TYPE. */
>   #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
> --- a/arch/x86/kernel/fpu/xstate.c
> +++ b/arch/x86/kernel/fpu/xstate.c
> @@ -65,7 +65,6 @@ static short xsave_cpuid_features[] __in
>    * XSAVE buffer, both supervisor and user xstates.
>    */
>   u64 xfeatures_mask_all __ro_after_init;
> -EXPORT_SYMBOL_GPL(xfeatures_mask_all);
>   
>   static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
>   	{ [ 0 ... XFEATURE_MAX - 1] = -1};
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -68,7 +68,9 @@
>   #include <asm/mce.h>
>   #include <asm/pkru.h>
>   #include <linux/kernel_stat.h>
> -#include <asm/fpu/internal.h> /* Ugh! */
> +#include <asm/fpu/api.h>
> +#include <asm/fpu/xcr.h>
> +#include <asm/fpu/xstate.h>
>   #include <asm/pvclock.h>
>   #include <asm/div64.h>
>   #include <asm/irq_remapping.h>
> @@ -9899,58 +9901,27 @@ static int complete_emulated_mmio(struct
>   	return 0;
>   }
>   
> -static void kvm_save_current_fpu(struct fpu *fpu)
> -{
> -	/*
> -	 * If the target FPU state is not resident in the CPU registers, just
> -	 * memcpy() from current, else save CPU state directly to the target.
> -	 */
> -	if (test_thread_flag(TIF_NEED_FPU_LOAD))
> -		memcpy(&fpu->state, &current->thread.fpu.state,
> -		       fpu_kernel_xstate_size);
> -	else
> -		save_fpregs_to_fpstate(fpu);
> -}
> -
>   /* Swap (qemu) user FPU context for the guest FPU context. */
>   static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
>   {
> -	fpregs_lock();
> -
> -	kvm_save_current_fpu(vcpu->arch.user_fpu);
> -
>   	/*
> -	 * Guests with protected state can't have it set by the hypervisor,
> -	 * so skip trying to set it.
> +	 * Guest with protected state have guest_fpu == NULL which makes
> +	 * the swap only safe the host state. Exclude PKRU from restore as
> +	 * it is restored separately in kvm_x86_ops.run().
>   	 */
> -	if (vcpu->arch.guest_fpu)
> -		/* PKRU is separately restored in kvm_x86_ops.run. */
> -		__restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state,
> -					~XFEATURE_MASK_PKRU);
> -
> -	fpregs_mark_activate();
> -	fpregs_unlock();
> -
> +	fpu_swap_kvm_fpu(vcpu->arch.user_fpu, vcpu->arch.guest_fpu,
> +			 ~XFEATURE_MASK_PKRU);
>   	trace_kvm_fpu(1);
>   }
>   
>   /* When vcpu_run ends, restore user space FPU context. */
>   static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
>   {
> -	fpregs_lock();
> -
>   	/*
> -	 * Guests with protected state can't have it read by the hypervisor,
> -	 * so skip trying to save it.
> +	 * Guest with protected state have guest_fpu == NULL which makes
> +	 * swap only restore the host state.
>   	 */
> -	if (vcpu->arch.guest_fpu)
> -		kvm_save_current_fpu(vcpu->arch.guest_fpu);
> -
> -	restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state);
> -
> -	fpregs_mark_activate();
> -	fpregs_unlock();
> -
> +	fpu_swap_kvm_fpu(vcpu->arch.guest_fpu, vcpu->arch.user_fpu, ~0ULL);
>   	++vcpu->stat.fpu_reload;
>   	trace_kvm_fpu(0);
>   }
> --- a/arch/x86/mm/extable.c
> +++ b/arch/x86/mm/extable.c
> @@ -47,7 +47,7 @@ static bool ex_handler_fprestore(const s
>   	WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
>   		  (void *)instruction_pointer(regs));
>   
> -	__restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate());
> +	restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate());
>   	return true;
>   }
>   
> 

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Thomas Gleixner Oct. 12, 2021, 6:25 p.m. UTC | #3
On Tue, Oct 12 2021 at 18:53, Borislav Petkov wrote:
> On Tue, Oct 12, 2021 at 02:00:17AM +0200, Thomas Gleixner wrote:
>>  	/*
>> -	 * Guests with protected state can't have it set by the hypervisor,
>> -	 * so skip trying to set it.
>> +	 * Guest with protected state have guest_fpu == NULL which makes
>
> "Guests ... "
>
>> +	 * the swap only safe the host state. Exclude PKRU from restore as
>
> "save"

No, I meant "safe", but let me rephrase it. Swap does both save and
restore, but it's not safe to dereference a NULL pointer :)

 .... makes the swap only handle the host state. Exclude PKRU from restore as

Thanks,

        tglx
Thomas Gleixner Oct. 12, 2021, 6:26 p.m. UTC | #4
On Tue, Oct 12 2021 at 20:25, Thomas Gleixner wrote:

> On Tue, Oct 12 2021 at 18:53, Borislav Petkov wrote:
>> On Tue, Oct 12, 2021 at 02:00:17AM +0200, Thomas Gleixner wrote:
>>>  	/*
>>> -	 * Guests with protected state can't have it set by the hypervisor,
>>> -	 * so skip trying to set it.
>>> +	 * Guest with protected state have guest_fpu == NULL which makes
>>
>> "Guests ... "
>>
>>> +	 * the swap only safe the host state. Exclude PKRU from restore as
>>
>> "save"
>
> No I meant safe, but let me rephrase it, Swap does both save and
> restore. But it's not safe to dereference a NULL pointer :)
>
>  .... makes the swap only handle the host state. Exclude PKRU from restore as

Gah. I should have looked at the context first. "save" is correct
here. Oh well...
Jing Liu Oct. 13, 2021, 6:15 a.m. UTC | #5
> On 12/10/21 02:00, Thomas Gleixner wrote:
> > Swapping the host/guest FPU is directly fiddling with FPU internals
> > which requires 5 exports. The upcoming support of dymanically enabled
> > states would even need more.
> >
> > Implement a swap function in the FPU core code and export that instead.
> >
> > Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> > Cc: kvm@vger.kernel.org
> > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > ---
> >   arch/x86/include/asm/fpu/api.h      |    8 +++++
> >   arch/x86/include/asm/fpu/internal.h |   15 +---------
> >   arch/x86/kernel/fpu/core.c          |   30 ++++++++++++++++++---
> >   arch/x86/kernel/fpu/init.c          |    1
> >   arch/x86/kernel/fpu/xstate.c        |    1
> >   arch/x86/kvm/x86.c                  |   51 +++++++-----------------------------
> >   arch/x86/mm/extable.c               |    2 -
> >   7 files changed, 48 insertions(+), 60 deletions(-)
> >

When looking into the tglx/devel.git x86/fpu branch for the full #1-#4
series and the KVM AMX support, I'd like to discuss two things,
as follows:

1. KVM dynamic allocation API:
Since KVM also uses dynamic allocation, after KVM detects the guest
requesting AMX via an #NM trap, KVM needs to allocate an extra buffer for
this vcpu's current->thread.fpu.fpstate and the related guest_fpu state.
So far, the kernel itself has an API for this, fpstate_realloc(), but it's
static. How about making a common function usable by KVM?


2. There exists a case that *guest AMX state can be lost*:

After KVM passthrough XFD to guest, when vmexit opening
irq window and KVM is interrupted, kernel softirq path can call
kernel_fpu_begin() to touch xsave state. This function does
XSAVES. If guest XFD[18] is 1, and with guest AMX state in register,
then guest AMX state is lost by XSAVES.

The detailed example call trace in commit
commit 2620fe268e80d667a94553cd37a94ccaa2cb8c83
Author: Sean Christopherson <seanjc@google.com>
Date:   Fri Jan 17 11:30:51 2020 -0800

    KVM: x86: Revert "KVM: X86: Fix fpu state crash in kvm guest"

    Reload the current thread's FPU state, which contains the guest's FPU
    state, to the CPU registers if necessary during vcpu_enter_guest().
    TIF_NEED_FPU_LOAD can be set any time control is transferred out of
    KVM,
    e.g. if I/O is triggered during a KVM call to get_user_pages() or if a
    softirq occurs while KVM is scheduled in.
    ...
   A sample trace triggered by warning if TIF_NEED_FPU_LOAD is set while
    vcpu state is loaded:

     <IRQ>
      gcmaes_crypt_by_sg.constprop.12+0x26e/0x660
      ? 0xffffffffc024547d
      ? __qdisc_run+0x83/0x510
      ? __dev_queue_xmit+0x45e/0x990
      ...
      ? do_IRQ+0x7f/0xd0
      ? common_interrupt+0xf/0xf
      </IRQ>
      ? irq_entries_start+0x20/0x660
      ? vmx_get_interrupt_shadow+0x2f0/0x710 [kvm_intel]
      ? kvm_set_msr_common+0xfc7/0x2380 [kvm]
      ? recalibrate_cpu_khz+0x10/0x10
      ? ktime_get+0x3a/0xa0
      ? kvm_arch_vcpu_ioctl_run+0x107/0x560 [kvm]
      ? kvm_init+0x6bf/0xd00 [kvm]

For this case, I think one way is kernel doing something before XSAVES
for KVM thread; another way is let KVM fix: maintaining a zero XFD
value (by current->state.fpu.fpstate->xfd = 0) after vcpu fpu state is 
loaded and restore real guest XFD value before vmenter. 
Logic as follows.

after vmexit:
if XFD is passthrough
then
	sync guest XFD to vmx->xfd;
	set XFD to current->state.fpu.fpstate->xfd (= 0)
	__this_cpu_write(xfd_state, 0);

before vmenter (irq is disabled):
if passthrough
then
	restore to real guest XFD by vmx->xfd;

vcpu_run: (if XFD is passthrough)
load: swap from qemu's to a zero XFD
put: swap zero to qemu's


Thanks,
Jing

[...]
Paolo Bonzini Oct. 13, 2021, 6:26 a.m. UTC | #6
On 13/10/21 08:15, Liu, Jing2 wrote:
> After KVM passthrough XFD to guest, when vmexit opening
> irq window and KVM is interrupted, kernel softirq path can call
> kernel_fpu_begin() to touch xsave state. This function does
> XSAVES. If guest XFD[18] is 1, and with guest AMX state in register,
> then guest AMX state is lost by XSAVES.

Yes, the host value of XFD (which is zero) has to be restored after 
vmexit.  See how KVM already handles SPEC_CTRL.

Passthrough of XFD is only enabled after the guest has caused an #NM 
vmexit and the full XSAVE state has been dynamically allocated, 
therefore it is always possible to do an XSAVES even from atomic context.

Paolo
Jing Liu Oct. 13, 2021, 7:46 a.m. UTC | #7
> On 13/10/21 08:15, Liu, Jing2 wrote:
> > After KVM passthrough XFD to guest, when vmexit opening irq window and
> > KVM is interrupted, kernel softirq path can call
> > kernel_fpu_begin() to touch xsave state. This function does XSAVES. If
> > guest XFD[18] is 1, and with guest AMX state in register, then guest
> > AMX state is lost by XSAVES.
> 
> Yes, the host value of XFD (which is zero) has to be restored after vmexit.
> See how KVM already handles SPEC_CTRL.
> 

I'm trying to understand why qemu's XFD is zero after kernel supports AMX.
Do you mean that in the guest #NM trap KVM also allocates an extra user_fpu
buffer and clears qemu's XFD? But why do we need to do that?

I think only when qemu userspace requests an AMX permission and exec
AMX instruction generating host #NM, host kernel clears qemu's XFD[18].
If guest #NM being trapped, KVM *don't* need clear host's XFD, but only
allocate guest_fpu's buffer and current->thread.fpu 's buffer, and
clear guest's XFD.

 
> Passthrough of XFD is only enabled after the guest has caused an #NM
> vmexit 

Yes, passthrough is done by two cases: one is guest #NM trapped;
another is guest clearing XFD before it generates #NM (this is possible for
guest), then passthrough.
For the two cases, we passthrough and allocate buffer for guest_fpu, and
current->thread.fpu.

Thanks,
Jing

and the full XSAVE state has been dynamically allocated, therefore it
> is always possible to do an XSAVES even from atomic context.
> 
> Paolo
Paolo Bonzini Oct. 13, 2021, 8:42 a.m. UTC | #8
On 13/10/21 09:46, Liu, Jing2 wrote:
> 
>> On 13/10/21 08:15, Liu, Jing2 wrote:
>>> After KVM passthrough XFD to guest, when vmexit opening irq window and
>>> KVM is interrupted, kernel softirq path can call
>>> kernel_fpu_begin() to touch xsave state. This function does XSAVES. If
>>> guest XFD[18] is 1, and with guest AMX state in register, then guest
>>> AMX state is lost by XSAVES.
>>
>> Yes, the host value of XFD (which is zero) has to be restored after vmexit.
>> See how KVM already handles SPEC_CTRL.
> 
> I'm trying to understand why qemu's XFD is zero after kernel supports AMX.

There are three copies of XFD:

- the guest value stored in vcpu->arch.

- the "QEMU" value attached to host_fpu.  This one only becomes zero if 
QEMU requires AMX (which shouldn't happen).

- the internal KVM value attached to guest_fpu.  When #NM happens, this 
one becomes zero.


The CPU value is:

- the host_fpu value before kvm_load_guest_fpu and after 
kvm_put_guest_fpu.  This ensures that QEMU context switch is as cheap as 
possible.

- the guest_fpu value between kvm_load_guest_fpu and kvm_put_guest_fpu. 
  This ensures that no state is lost in the case you are describing.

- the OR of the guest value and the guest_fpu value while the guest runs 
(using either MSR load/save lists, or manual wrmsr like 
pt_guest_enter/pt_guest_exit).  This ensures that the host has the 
opportunity to get a #NM exception, and allocate AMX state in the 
guest_fpu and in current->thread.fpu.

> Yes, passthrough is done by two cases: one is guest #NM trapped;
> another is guest clearing XFD before it generates #NM (this is possible for
> guest), then passthrough.
> For the two cases, we passthrough and allocate buffer for guest_fpu, and
> current->thread.fpu.

I think it's simpler to always wait for #NM, it will only happen once 
per vCPU.  In other words, even if the guest clears XFD before it 
generates #NM, the guest_fpu's XFD remains nonzero and an #NM vmexit is 
possible.  After #NM the guest_fpu's XFD is zero; then passthrough can 
happen and the #NM vmexit trap can be disabled.

Paolo
Andy Lutomirski Oct. 13, 2021, 10:14 a.m. UTC | #9
On Wed, Oct 13, 2021, at 1:42 AM, Paolo Bonzini wrote:
> On 13/10/21 09:46, Liu, Jing2 wrote:
>> 
>>> On 13/10/21 08:15, Liu, Jing2 wrote:
>>>> After KVM passthrough XFD to guest, when vmexit opening irq window and
>>>> KVM is interrupted, kernel softirq path can call
>>>> kernel_fpu_begin() to touch xsave state. This function does XSAVES. If
>>>> guest XFD[18] is 1, and with guest AMX state in register, then guest
>>>> AMX state is lost by XSAVES.
>>>
>>> Yes, the host value of XFD (which is zero) has to be restored after vmexit.
>>> See how KVM already handles SPEC_CTRL.
>> 
>> I'm trying to understand why qemu's XFD is zero after kernel supports AMX.
>
> There are three copies of XFD:
>
> - the guest value stored in vcpu->arch.
>
> - the "QEMU" value attached to host_fpu.  This one only becomes zero if 
> QEMU requires AMX (which shouldn't happen).
>
> - the internal KVM value attached to guest_fpu.  When #NM happens, this 
> one becomes zero.
>
>
> The CPU value is:
>
> - the host_fpu value before kvm_load_guest_fpu and after 
> kvm_put_guest_fpu.  This ensures that QEMU context switch is as cheap as 
> possible.
>
> - the guest_fpu value between kvm_load_guest_fpu and kvm_put_guest_fpu. 
>   This ensures that no state is lost in the case you are describing.
>
> - the OR of the guest value and the guest_fpu value while the guest runs 
> (using either MSR load/save lists, or manual wrmsr like 
> pt_guest_enter/pt_guest_exit).  This ensures that the host has the 
> opportunity to get a #NM exception, and allocate AMX state in the 
> guest_fpu and in current->thread.fpu.
>
>> Yes, passthrough is done by two cases: one is guest #NM trapped;
>> another is guest clearing XFD before it generates #NM (this is possible for
>> guest), then passthrough.
>> For the two cases, we passthrough and allocate buffer for guest_fpu, and
>> current->thread.fpu.
>
> I think it's simpler to always wait for #NM, it will only happen once 
> per vCPU.  In other words, even if the guest clears XFD before it 
> generates #NM, the guest_fpu's XFD remains nonzero and an #NM vmexit is 
> possible.  After #NM the guest_fpu's XFD is zero; then passthrough can 
> happen and the #NM vmexit trap can be disabled.

This will stop being at all optimal when Intel inevitably adds another feature that uses XFD.  In the potentially infinite window in which the guest manages XFD and #NM on behalf of its userspace and when the guest allocates the other hypothetical feature, all the #NMs will have to be trapped by KVM.

Is it really worthwhile for KVM to use XFD at all instead of preallocating the state and being done with it?  KVM would still have to avoid data loss if the guest sets XFD with non-init state, but #NM could always pass through.
Jing Liu Oct. 13, 2021, 10:25 a.m. UTC | #10
On 13/10/21 10:42, Paolo Bonzini wrote:
> On 13/10/21 09:46, Liu, Jing2 wrote:
> >
> >> On 13/10/21 08:15, Liu, Jing2 wrote:
> >>> After KVM passthrough XFD to guest, when vmexit opening irq window
> >>> and KVM is interrupted, kernel softirq path can call
> >>> kernel_fpu_begin() to touch xsave state. This function does XSAVES.
> >>> If guest XFD[18] is 1, and with guest AMX state in register, then
> >>> guest AMX state is lost by XSAVES.
> >>
> >> Yes, the host value of XFD (which is zero) has to be restored after vmexit.
> >> See how KVM already handles SPEC_CTRL.
> >
> > I'm trying to understand why qemu's XFD is zero after kernel supports AMX.
> 
> There are three copies of XFD:
> 
> - the guest value stored in vcpu->arch.

OK, let's call it e.g. vcpu->arch.xfd

[...]
> - the internal KVM value attached to guest_fpu.  When #NM happens, this
> one becomes zero.

> The CPU value is:
> 
> - the guest_fpu value between kvm_load_guest_fpu and kvm_put_guest_fpu.
>   This ensures that no state is lost in the case you are describing.
> 

OK, you mean using guest_fpu as a KVM value. Let me describe the
flow to see if anything is missing.

When #NM trap which makes passthrough, guest_fpu XFD set to 0 and keeps
forever. (don't change HW XFD which is still 1)
In the #NM trap, KVM alloc buffer and regenerate a #NM exception to guest
to make guest kernel alloc its thread buffer. 
Then in next vmexit, KVM sync vcpu->arch.xfd, load guest_fpu value (=0) and
update current->thread.fpu XFD to 0 for kernel reference. 


> - the OR of the guest value and the guest_fpu value while the guest runs
> (using either MSR load/save lists, or manual wrmsr like
> pt_guest_enter/pt_guest_exit).  This ensures that the host has the
> opportunity to get a #NM exception, and allocate AMX state in the
> guest_fpu and in current->thread.fpu.
> 
> > Yes, passthrough is done by two cases: one is guest #NM trapped;
> > another is guest clearing XFD before it generates #NM (this is possible for
> > guest), then passthrough.
> > For the two cases, we passthrough and allocate buffer for guest_fpu, and
> > current->thread.fpu.
> 
> I think it's simpler to always wait for #NM, it will only happen once
> per vCPU.  In other words, even if the guest clears XFD before it
> generates #NM, the guest_fpu's XFD remains nonzero 

You mean a wrmsr trap doesn't do anything and just returns?
In this case, at the next vmenter, the OR of the guest value
(vcpu->arch.xfd) and the guest_fpu value is still 1, so doesn't this
break the guest's HW assumption? (the guest finds the wrmsr
didn't work)
 
Thanks,
Jing

and an #NM vmexit is
> possible.  After #NM the guest_fpu's XFD is zero; then passthrough can
> happen and the #NM vmexit trap can be disabled.

> 
> Paolo
Paolo Bonzini Oct. 13, 2021, 12:26 p.m. UTC | #11
On 13/10/21 12:14, Andy Lutomirski wrote:
>> I think it's simpler to always wait for #NM, it will only happen
>> once per vCPU.  In other words, even if the guest clears XFD before
>> it generates #NM, the guest_fpu's XFD remains nonzero and an #NM
>> vmexit is possible.  After #NM the guest_fpu's XFD is zero; then
>> passthrough can happen and the #NM vmexit trap can be disabled.
>
> This will stop being at all optimal when Intel inevitably adds
> another feature that uses XFD.  In the potentially infinite window in
> which the guest manages XFD and #NM on behalf of its userspace and
> when the guest allocates the other hypothetical feature, all the #NMs
> will have to be trapped by KVM.

The reason is that it's quite common to simply let the guest see all 
CPUID bits that KVM knows about.  But it's not unlikely that most guests 
will not ever use any XFD feature, and therefore will not ever see an 
#NM.  I wouldn't have any problem with allocating _all_ of the dynamic 
state space on the first #NM.

Thinking more about it, #NM only has to be trapped if XCR0 enables a 
dynamic feature.  In other words, the guest value of XFD can be limited 
to (host_XFD|guest_XFD) & guest_XCR0.  This avoids that KVM 
unnecessarily traps for old guests that use CR0.TS.

Paolo

> Is it really worthwhile for KVM to use XFD at all instead of
> preallocating the state and being done with it?  KVM would still have
> to avoid data loss if the guest sets XFD with non-init state, but #NM
> could always pass through.
>
Paolo Bonzini Oct. 13, 2021, 12:37 p.m. UTC | #12
On 13/10/21 12:25, Liu, Jing2 wrote:
> [...]
>> - the internal KVM value attached to guest_fpu.  When #NM happens, this
>> one becomes zero.
> 
>> The CPU value is:
>>
>> - the guest_fpu value between kvm_load_guest_fpu and kvm_put_guest_fpu.
>>    This ensures that no state is lost in the case you are describing.
>>
> 
> OK, you mean using guest_fpu as a KVM value. Let me describe the
> flow to see if anything missing.
> 
> When #NM trap which makes passthrough, guest_fpu XFD set to 0 and keeps
> forever. (don't change HW XFD which is still 1)
> In the #NM trap, KVM alloc buffer and regenerate a #NM exception to guest
> to make guest kernel alloc its thread buffer.
> Then in next vmexit, KVM sync vcpu->arch.xfd, load guest_fpu value (=0) and
> update current->thread.fpu XFD to 0 for kernel reference.

In the #NM handler, KVM allocates the buffer and the guest_fpu XFD 
becomes zero.  Also because the guest_fpu XFD is zero:

- #NM vmexits are disabled.  More precisely, trapping #NM is only 
necessary if guest_fpu->xfd & ~vcpu->arch.xfd & vcpu->arch.xcr0 is 
nonzero (i.e. only if there is a state that is guest_fpu-disabled, but 
enabled according to both XFD and XCR0).

- On the next vmentry XFD is set to just vcpu->arch.xfd and the 
instruction is retried.  If the instruction causes an #NM in the guest, 
it is not trapped and delivered normally to the guest.

>> I think it's simpler to always wait for #NM, it will only happen once
>> per vCPU.  In other words, even if the guest clears XFD before it
>> generates #NM, the guest_fpu's XFD remains nonzero
> 
> You mean a wrmsr trap doesn't do anything and return back?

The guest might run with the same XFD value as before (which is 
guest_fpu->xfd | vcpu->arch.xfd), but vcpu->arch.xfd is changed.  The 
value in vcpu->arch.xfd will be read back by an RDMSR, because 
passthrough is not enabled and the RDMSR will cause a vmexit.

Once an #NM is received and guest_fpu->xfd becomes zero, passthrough can 
be enabled.

Paolo

> In this case, when next vmenter, the OR of the guest value
> (vcpu->arch.xfd) and the guest_fpu value is still 1, so this
> doesn't obey guest's HW assumption? (guest finds the wrmsr
> didn't work)
Thomas Gleixner Oct. 13, 2021, 2:06 p.m. UTC | #13
Paolo,

On Wed, Oct 13 2021 at 10:42, Paolo Bonzini wrote:
> On 13/10/21 09:46, Liu, Jing2 wrote:
>>> Yes, the host value of XFD (which is zero) has to be restored after vmexit.
>>> See how KVM already handles SPEC_CTRL.
>> 
>> I'm trying to understand why qemu's XFD is zero after kernel supports AMX.
>
> There are three copies of XFD:
>
> - the guest value stored in vcpu->arch.
>
> - the "QEMU" value attached to host_fpu.  This one only becomes zero if 
> QEMU requires AMX (which shouldn't happen).

I don't think that makes sense.

First of all, if QEMU wants to expose AMX to guests, then it has to ask
for permission to do so as any other user space process. We're not going
to make that special just because.

The guest configuration will have to have a 'needs AMX' flag set. So
QEMU knows that it is required upfront.

Which also means that a guest configuration which has it not set will
never get AMX passed through.

That tells me, that we should not bother at all with on demand buffer
reallocations for that case and just keep things simple.

The on demand buffer allocation from the general OS point of view makes
sense because there it really matters whether we allocate $N kilobytes
per thread or not.

But does it matter for the QEMU process and its vCPU threads when the
guest is allowed to use AMX? I don't think so. It's an academic exercise
IMO and just makes the handling of this way more complex than required.

So the logic should be:

   qemu()
     read_config()
     if (dynamic_features_passthrough())
     	request_permission(feature)

     create_vcpu_threads()
       ....

       vcpu_thread()
         kvm_ioctl(ENABLE_DYN_FEATURE, feature)
           reallocate_buffers()
             realloc(tsk->fpu.fpstate, feature)
             realloc(guest_fpu.fpstate, feature)
             realloc(host_fpu.fpstate, feature)

             All of them will have

             fpstate.xfd = default_xfd & ~feature

That makes also resume and migration simple because that's going to use
exactly the same mechanism.

Yes, it _allows_ QEMU user space to use AMX, but that's not the end of
the world, really and avoids a ton of special cases to worry about.

Also the extra memory consumption per vCPU thread is probably just noise
compared to the rest of the vCPU state.

With that the only thing you have to take care of is in vmx_vcpu_run():

   local_irq_disable();
   ...
   vmx_vcpu_run()
     wrmsrl(XFD, guest->xfd)
     vmenter()
     guest->xfd = rdmsrl(XFD)
     wrmsrl(XFD, host->xfd)

It does not matter when at some day there is a XFD controlled bit 19 and
you want to selectively allow access to guests because we have two
mechanisms here:

  1) XCR0

    XSETBV in the guest is intercepted and checked against the allowed
    bits. If it tries to set one which is not allowed, then this is
    not any different from what KVM is doing today.

    I.e. Guest1 is allowed to set bit 18, but not 19
         Guest2 is allowed to set bit 19, but not 18
         Guest3 is allowed to set both 18 and 19

  2) XFD

     Intercepting XFD is optional I think. It does not matter what the
     guest writes into it, because if XCRO[i] = 0 then the state of
     XFD[i] is irrelevant according to the ISE:

     "(IA32_XFD[i] does not affect processor operations if XCR0[i] = 0.)"

     The only thing different vs. bare metal is that when guest writes
     XFD[i]=1 it won't get #GP despite the fact that virtualized CPUID
     suggest that it should get one:
     
     "Bit i of either MSR can be set to 1 only if CPUID.(EAX=0DH,ECX=i):ECX[2]
      is enumerated as 1.  An execution of WRMSR that attempts to set an
      unsupported bit in either MSR causes a general-protection fault
      (#GP)."

     Does it matter?  Probably not, all it can figure out is that
     component[i] is supported in hardware, but it can't do anything
     with that information because the VMM will not allow it to set the
     corresponding XCR0 bit...

     Sure you can intercept XFD, check the write against the allowed
     guest bits and inject #GP if not.

     But keep in mind that the guest kernel will context switch it and
     that will not be any better than context switching XCR0 in the
     guest kernel...

The thing we need to think about is the case where guest has XCR0[i] =
XFD[i] = 1 and host has XFD[i] = 0, because setting XFD[i] = 1 does not
bring the component[i] into init state.

In that case we have the following situation after a vmexit:

     guest->xfd = rdmsrl(XFD)         [i] = 1
     wrmsrl(XFD, host->xfd)           [i] = 0

If the component[i] is _not_ in init state then the next XSAVES on the
host will save it and therefore have xsave.header.XSAVE_BV[i] = 1 in the
buffer. A subsequent XRSTORS of that buffer on the host will restore the
saved data into component[i].

But the subsequent vmenter() will restore the guest XFD which will just
bring the guest into the exactly same state as before the VMEXIT.

Ergo it does not matter at all.

That also makes #NM handling trivial. Any #NM generated in the guest is
completely uninteresting for the host with that scheme and it's the
guests problem to deal with it.

But that brings me to another issue: XFD_ERR.

Assume guest takes #NM and before the handler can run and read/clear
XFD_ERR a VMEXIT happens which means XFD_ERR will have the guest error
bit set and nothing will clear it. So XFD_ERR has to be handled properly
otherwise a subsequent #NM on the host will see a stale bit from the
guest.

   vmx_vcpu_run()
     wrmsrl(XFD, guest->xfd)
     wrmsrl(XFD_ERR, guest->xfd_err)
     vmenter()
     guest->xfd_err = rdmsrl(XFD_ERR)
     guest->xfd = rdmsrl(XFD)
     wrmsrl(XFD_ERR, 0)
     wrmsrl(XFD, host->xfd)

Of course that wants to be conditional on the guest configuration and
you probably want all of that to be in the auto-load/store area, but
you get the idea.

Anything else will just create more problems than it solves. Especially
#NM handling (think nested guest) and the XFD_ERR additive behaviour
will be a nasty playground and easy to get wrong.

Not having that at all makes life way simpler, right?

Thanks,

        tglx
Thomas Gleixner Oct. 13, 2021, 2:14 p.m. UTC | #14
On Wed, Oct 13 2021 at 14:26, Paolo Bonzini wrote:

> On 13/10/21 12:14, Andy Lutomirski wrote:
>>> I think it's simpler to always wait for #NM, it will only happen
>>> once per vCPU.  In other words, even if the guest clears XFD before
>>> it generates #NM, the guest_fpu's XFD remains nonzero and an #NM
>>> vmexit is possible.  After #NM the guest_fpu's XFD is zero; then
>>> passthrough can happen and the #NM vmexit trap can be disabled.
>>
>> This will stop being at all optimal when Intel inevitably adds
>> another feature that uses XFD.  In the potentially infinite window in
>> which the guest manages XFD and #NM on behalf of its userspace and
>> when the guest allocates the other hypothetical feature, all the #NMs
>> will have to be trapped by KVM.
>
> The reason is that it's quite common to simply let the guest see all 
> CPUID bits that KVM knows about.

On fleets the cpu features exposed to guests matter a lot to ensure
migratability and I would be surprised when such a feature would just
be universally available to anyone.

Thanks,

        tglx
Thomas Gleixner Oct. 13, 2021, 2:24 p.m. UTC | #15
On Wed, Oct 13 2021 at 16:14, Thomas Gleixner wrote:

> On Wed, Oct 13 2021 at 14:26, Paolo Bonzini wrote:
>
>> On 13/10/21 12:14, Andy Lutomirski wrote:
>>>> I think it's simpler to always wait for #NM, it will only happen
>>>> once per vCPU.  In other words, even if the guest clears XFD before
>>>> it generates #NM, the guest_fpu's XFD remains nonzero and an #NM
>>>> vmexit is possible.  After #NM the guest_fpu's XFD is zero; then
>>>> passthrough can happen and the #NM vmexit trap can be disabled.
>>>
>>> This will stop being at all optimal when Intel inevitably adds
>>> another feature that uses XFD.  In the potentially infinite window in
>>> which the guest manages XFD and #NM on behalf of its userspace and
>>> when the guest allocates the other hypothetical feature, all the #NMs
>>> will have to be trapped by KVM.
>>
>> The reason is that it's quite common to simply let the guest see all 
>> CPUID bits that KVM knows about.
>
> On fleets the cpu features exposed to guests matter a lot to ensure
> migratability and I would be surprised when such a feature would just
> be universally available to anyone.

As a VM customer you get charged for RAM, CPUs, storage and whatever
extra features you need. So why would AMX be any different?

So far Intel ignored the fact that these accelerators are managed
resources even if they are accessible via instructions and do not
require to open(/dev/magic_accelerator). But that's just wrong and XFD
should already have happened with AVX512.

Trying to expose AMX unconditionally is just wrong and overengineered
and proliferating the mess we already have to suffer from.

As I said in the other mail. QEMU has to get permissions to use AMX
first and not doing it by circumventing the permission part via a KVM
hack.

Thanks,

        tglx
Andy Lutomirski Oct. 13, 2021, 2:59 p.m. UTC | #16
On Wed, Oct 13, 2021, at 5:26 AM, Paolo Bonzini wrote:
> On 13/10/21 12:14, Andy Lutomirski wrote:
>>> I think it's simpler to always wait for #NM, it will only happen
>>> once per vCPU.  In other words, even if the guest clears XFD before
>>> it generates #NM, the guest_fpu's XFD remains nonzero and an #NM
>>> vmexit is possible.  After #NM the guest_fpu's XFD is zero; then
>>> passthrough can happen and the #NM vmexit trap can be disabled.
>>
>> This will stop being at all optimal when Intel inevitably adds
>> another feature that uses XFD.  In the potentially infinite window in
>> which the guest manages XFD and #NM on behalf of its userspace and
>> when the guest allocates the other hypothetical feature, all the #NMs
>> will have to be trapped by KVM.
>
> The reason is that it's quite common to simply let the guest see all 
> CPUID bits that KVM knows about.  But it's not unlikely that most guests 
> will not ever use any XFD feature, and therefore will not ever see an 
> #NM.  I wouldn't have any problem with allocating _all_ of the dynamic 
> state space on the first #NM.
>
> Thinking more about it, #NM only has to be trapped if XCR0 enables a 
> dynamic feature.  In other words, the guest value of XFD can be limited 
> to (host_XFD|guest_XFD) & guest_XCR0.  This avoids that KVM 
> unnecessarily traps for old guests that use CR0.TS.
>

You could simplify this by allocating the state the first time XCR0 enables the feature in question.

(This is how regular non-virt userspace *should* work too, but it looks like I’ve probably been outvoted on that front…)

> Paolo
>
>> Is it really worthwhile for KVM to use XFD at all instead of
>> preallocating the state and being done with it?  KVM would still have
>> to avoid data loss if the guest sets XFD with non-init state, but #NM
>> could always pass through.
>>
Paolo Bonzini Oct. 13, 2021, 3:05 p.m. UTC | #17
On 13/10/21 16:59, Andy Lutomirski wrote:
>> 
>> Thinking more about it, #NM only has to be trapped if XCR0 enables
>> a dynamic feature.  In other words, the guest value of XFD can be
>> limited to (host_XFD|guest_XFD) & guest_XCR0.  This avoids that
>> KVM unnecessarily traps for old guests that use CR0.TS.
>> 
> You could simplify this by allocating the state the first time XCR0
> enables the feature in question.
> 
> (This is how regular non-virt userspace *should* work too, but it
> looks like I’ve probably been outvoted on that front…)

Good point, you could do that too and do the work on the XCR0 vmexit 
instead of #NM.

Paolo
Thomas Gleixner Oct. 13, 2021, 3:12 p.m. UTC | #18
Jing,

On Wed, Oct 13 2021 at 06:15, Jing2 Liu wrote:
>> On 12/10/21 02:00, Thomas Gleixner wrote:
> When looking into the tglx/devel.git x86/fpu for the full #1-#4 
> series and the KVM AMX support, I'd like to talk two things
>  as follows,
>
> 1. KVM dynamic allocation API:
> Since KVM also uses dynamic allocation, after KVM detects guest
> requesting AMX by #NM trap, KVM need alloc extra buffer for
> this vcpu's current->thread.fpu.fpstate and guest_fpu related.
> So far, the kernel itself has such API like fpstate_realloc(), but it's
> static. How about making a common function usable for KVM?

Just making that function usable without a proper design how this should
work at all does not solve anything.

We first need a conclusion vs. buffer reallocation.

Once that is sorted then we can create proper infrastructure for that in
the FPU core code and not just expose a random function to KVM and hack
it into submission.

Thanks,

        tglx
Paolo Bonzini Oct. 14, 2021, 6:50 a.m. UTC | #19
On 13/10/21 16:06, Thomas Gleixner wrote:
>> - the guest value stored in vcpu->arch.
>>
>> - the "QEMU" value attached to host_fpu.  This one only becomes zero if
>> QEMU requires AMX (which shouldn't happen).
> 
> I don't think that makes sense.
> 
> First of all, if QEMU wants to expose AMX to guests, then it has to ask
> for permission to do so as any other user space process. We're not going
> to make that special just because.

Hmm, I would have preferred if there was no need to enable AMX for the 
QEMU FPU.  But you're saying that guest_fpu needs to swap out to 
current->thread.fpu if the guest is preempted, which would require 
XFD=0; and affect QEMU operation as well.

In principle I don't like it very much; it would be nicer to say "you 
enable it for QEMU itself via arch_prctl(ARCH_SET_STATE_ENABLE), and for 
the guests via ioctl(KVM_SET_CPUID2)".  But I can see why you want to 
keep things simple, so it's not a strong objection at all.

> Anything else will just create more problems than it solves. Especially
> #NM handling (think nested guest) and the XFD_ERR additive behaviour
> will be a nasty playground and easy to get wrong.
> 
> Not having that at all makes life way simpler, right?

It is simpler indeed, and it makes sense to start simple.  I am not sure 
if it will hold, but I agree it's better for the first implementation.

Paolo
Jing Liu Oct. 14, 2021, 8:02 a.m. UTC | #20
On 10/14/2021 2:50 PM, Paolo Bonzini wrote:
> On 13/10/21 16:06, Thomas Gleixner wrote:
> >> - the guest value stored in vcpu->arch.
> >>
> >> - the "QEMU" value attached to host_fpu.  This one only becomes zero
> >> if QEMU requires AMX (which shouldn't happen).
> >
> > I don't think that makes sense.
> >
> > First of all, if QEMU wants to expose AMX to guests, then it has to
> > ask for permission to do so as any other user space process. We're not
> > going to make that special just because.
> 
> Hmm, I would have preferred if there was no need to enable AMX for the
> QEMU FPU.  But you're saying that guest_fpu needs to swap out to
> current->thread.fpu if the guest is preempted, which would require
> XFD=0; and affect QEMU operation as well.

For preemption, if guest_fpu XFD is used as KVM internal value, then
we can simply set current->thread.fpu XFD the same as KVM internal
value in vmexit so kernel preemption can refer to it.

Thus, I think this issue doesn't much effect if enabling AMX for Qemu
FPU or not.

> 
> In principle I don't like it very much; it would be nicer to say "you
> enable it for QEMU itself via arch_prctl(ARCH_SET_STATE_ENABLE), and for
> the guests via ioctl(KVM_SET_CPUID2)".  But I can see why you want to
> keep things simple, so it's not a strong objection at all.
> 

Does this mean that KVM allocate 3 buffers via 
1) Qemu's request, instead of via 2) guest XCR0 trap? 

For the two ways, I think what we need care is the same: a) allocation time;
b) lazy passthrough time which related to XFD handling and values. Because
we don't want to always rdmsr and clear XFD in vmexit, and don't want to
trap different XFD switching in guest.

For 1), Qemu need prctl() and ioctl(ENABLE_DYN_FEATURE).
But *when* does Qemu do ioctl(ENABLE_DYN_FEATURE)? I mean if
guest XCR0 doesn't set bit18, then KVM doesn't need alloc 3 buffers
at all.

Thus, XCR0 trap is a simple way?

Meanwhile, for lazy passthrough, do we want to make it when guest
wrmsr trap (i.e. guest changes XFD, not inits XFD) if using 1) qemu's
request?  Or using 2) via XCR0 trap, directly passthrough when XCR0
trap?

> > Anything else will just create more problems than it solves. Especially
> > #NM handling (think nested guest) and the XFD_ERR additive behaviour
> > will be a nasty playground and easy to get wrong.
> >
> > Not having that at all makes life way simpler, right?
> 
> It is simpler indeed, and it makes sense to start simple.  
I'd like to confirm which is the simpler way we'd like to :)

Thanks,
Jing

I am not sure
> if it will hold, but I agree it's better for the first implementation.
> 
> Paolo
Jing Liu Oct. 14, 2021, 8:21 a.m. UTC | #21
> > 1. KVM dynamic allocation API:
> > Since KVM also uses dynamic allocation, after KVM detects guest
> > requesting AMX by #NM trap, KVM need alloc extra buffer for this
> > vcpu's current->thread.fpu.fpstate and guest_fpu related.
> > So far, the kernel itself has such API like fpstate_realloc(), but
> > it's static. How about making a common function usable for KVM?
> 
> Just making that function usable without a proper design how this should
> work at all does not solve anything.
> 
> We first need a conclusion vs. buffer reallocation.
> 
> Once that is sorted then we can create proper infrastructure for that in the
> FPU core code and not just expose a random function to KVM and hack it into
> submssion.
Yes, we need a consensus on the way we choose and then to see if need a
kernel function for KVM usage.

Thanks,
Jing

> 
> Thanks,
> 
>         tglx
Paolo Bonzini Oct. 14, 2021, 9:01 a.m. UTC | #22
On 14/10/21 10:02, Liu, Jing2 wrote:
>> In principle I don't like it very much; it would be nicer to say "you
>> enable it for QEMU itself via arch_prctl(ARCH_SET_STATE_ENABLE), and for
>> the guests via ioctl(KVM_SET_CPUID2)".  But I can see why you want to
>> keep things simple, so it's not a strong objection at all.
> 
> Does this mean that KVM allocate 3 buffers via
> 1) Qemu's request, instead of via 2) guest XCR0 trap?

Based on the input from Andy and Thomas, the new way would be like this:

1) host_fpu must always be checked for reallocation in 
kvm_load_guest_fpu (or in the FPU functions that it calls, that depends 
on the rest of Thomas's patches).  That's because arch_prctl can enable 
AMX for QEMU at any point after KVM_CREATE_VCPU.

2) every use of vcpu->arch.guest_supported_xcr0 is changed to only 
include those dynamic-feature bits that were enabled via arch_prctl.
That is, something like:

static u64 kvm_guest_supported_cr0(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.guest_supported_xcr0 &
		(~xfeatures_mask_user_dynamic | \
		 current->thread.fpu.dynamic_state_perm);
}

3) Even with passthrough disabled, the guest can run with XFD set to 
vcpu->arch.guest_xfd (and likewise for XFD_ERR) which is much simpler 
than trapping #NM.  The traps for writing XCR0 and XFD are used to 
allocate dynamic state for guest_fpu, and start the passthrough of XFD 
and XFD_ERR.  What we need is:

- if a dynamic state has XCR0[n]=0, bit n will never be set in XFD_ERR 
and the state will never be dirtied by the guest.

- if a dynamic state has XCR0[n]=1, but all enabled dynamic states have 
XFD[n]=1, the guest is not able to dirty any dynamic XSAVE state, 
because they all have either XCR0[n]=0 or XFD[n]=1.  An attempt to do so 
will cause an #NM trap and set the bit in XFD_ERR.

- if a dynamic state has XCR0[n]=1 and XFD[n]=0, the state for bit n is 
allocated in guest_fpu, and it can also disable the vmexits for XFD and 
XFD_ERR.

Therefore:

- if passthrough is disabled, the XCR0 and XFD write traps can check 
guest_xcr0 & ~guest_xfd.  If it includes a dynamic state bit, dynamic 
state is allocated for all bits enabled in guest_xcr0 and passthrough is 
started; this should happen shortly after the guest gets its first #NM 
trap for AMX.

- if passthrough is enabled, the XCR0 write trap must still ensure that 
dynamic state is allocated for all bits enabled in guest_xcr0.

So something like this pseudocode is called by both XCR0 and XFD writes:

int kvm_alloc_fpu_dynamic_features(struct kvm_vcpu *vcpu)
{
	u64 allowed_dynamic = current->thread.fpu.dynamic_state_perm;
	u64 enabled_dynamic =
		vcpu->arch.xcr0 & xfeatures_mask_user_dynamic;

	/* All dynamic features have to be arch_prctl'd first.  */
	WARN_ON_ONCE(enabled_dynamic & ~allowed_dynamic);

	if (!vcpu->arch.xfd_passthrough) {
		/* All dynamic states will #NM?  Wait and see.  */
		if ((enabled_dynamic & ~vcpu->arch.xfd) == 0)
			return 0;

		kvm_x86_ops.enable_xfd_passthrough(vcpu);
	}

	/* current->thread.fpu was already handled by arch_prctl.  */
	return fpu_alloc_features(vcpu->guest_fpu,
		vcpu->guest_fpu.dynamic_state_perm | enabled_dynamic);
}

Paolo
Jing Liu Oct. 14, 2021, 11:21 a.m. UTC | #23
On 10/14/2021 5:01 PM, Paolo Bonzini wrote:

> On 14/10/21 10:02, Liu, Jing2 wrote:
> >> In principle I don't like it very much; it would be nicer to say "you
> >> enable it for QEMU itself via arch_prctl(ARCH_SET_STATE_ENABLE), and
> >> for the guests via ioctl(KVM_SET_CPUID2)".  But I can see why you
> >> want to keep things simple, so it's not a strong objection at all.
> >
> > Does this mean that KVM allocate 3 buffers via
> > 1) Qemu's request, instead of via 2) guest XCR0 trap?
> 
> Based on the input from Andy and Thomas, the new way would be like this:
> 
> 1) host_fpu must always be checked for reallocation in kvm_load_guest_fpu
> (or in the FPU functions that it calls, that depends on the rest of Thomas's
> patches).  That's because arch_prctl can enable AMX for QEMU at any point
> after KVM_CREATE_VCPU.

For Qemu's XFD, I'd like to confirm that:
Since the arch_prctl() onlys add current->group_leader->thread.fpu's  state_perm,
__state_size, (current->thread.fpu.* is not changed), thus in
kvm_load_guest_fpu, host_fpu->xfd is always 1. That is to say, Qemu's arch_prctl()
doesn't change any copies of XFD.

> 
> 2) every use of vcpu->arch.guest_supported_xcr0 is changed to only include
> those dynamic-feature bits that were enabled via arch_prctl.
> That is, something like:
> 
> static u64 kvm_guest_supported_cr0(struct kvm_vcpu *vcpu) {
> 	return vcpu->arch.guest_supported_xcr0 &
> 		(~xfeatures_mask_user_dynamic | \
> 		 current->thread.fpu.dynamic_state_perm);
> }
> 
> 3) Even with passthrough disabled, the guest can run with XFD set to
> vcpu->arch.guest_xfd (and likewise for XFD_ERR) which is much simpler
> than trapping #NM.  The traps for writing XCR0 and XFD are used to allocate
> dynamic state for guest_fpu, and start the passthrough of XFD and XFD_ERR.
> What we need is:
> 
> - if a dynamic state has XCR0[n]=0, bit n will never be set in XFD_ERR and the
> state will never be dirtied by the guest.
> 
> - if a dynamic state has XCR0[n]=1, but all enabled dynamic states have
> XFD[n]=1, the guest is not able to dirty any dynamic XSAVE state, because
> they all have either XCR0[n]=0 or XFD[n]=1.  An attempt to do so will cause an
> #NM trap and set the bit in XFD_ERR.
> 
> - if a dynamic state has XCR0[n]=1 and XFD[n]=0, the state for bit n is
> allocated in guest_fpu, and it can also disable the vmexits for XFD and
> XFD_ERR.
> 

Got it, the principle is once XCR0[n]=1 and XFD[n]=0, then guest is allowed
to use the dynamic XSAVE state, thus KVM must prepare all things well
before. This probably happens shortly after guest #NM.

Only one thing: it seems we assume that vcpu->arch.xfd is guest runtime
value. And before guest initializes XFD, KVM provides
vcpu->arch.xfd[18]=1, right? But the spec asks XFD reset value as zero.
If so, between guest init XCR0 to 1 and init XFD to 1, it's XCR0[n]=1 and
XFD[n]=0. If a guest never init XFD and directly use dynamic state...

Or do we want to provide guest a XFD[18]=1 value at the very beginning?

> Therefore:
> 
> - if passthrough is disabled, the XCR0 and XFD write traps can check
> guest_xcr0 & ~guest_xfd.  If it includes a dynamic state bit, dynamic state is
> allocated for all bits enabled in guest_xcr0 and passthrough is started; this
> should happen shortly after the guest gets its first #NM trap for AMX.
> 
> - if passthrough is enabled, the XCR0 write trap must still ensure that
> dynamic state is allocated for all bits enabled in guest_xcr0.
> 
> So something like this pseudocode is called by both XCR0 and XFD writes:
> 
> int kvm_alloc_fpu_dynamic_features(struct kvm_vcpu *vcpu) {
> 	u64 allowed_dynamic = current->thread.fpu.dynamic_state_perm;
> 	u64 enabled_dynamic =
> 		vcpu->arch.xcr0 & xfeatures_mask_user_dynamic;
> 
> 	/* All dynamic features have to be arch_prctl'd first.  */
> 	WARN_ON_ONCE(enabled_dynamic & ~allowed_dynamic);
> 
> 	if (!vcpu->arch.xfd_passthrough) {
> 		/* All dynamic states will #NM?  Wait and see.  */
> 		if ((enabled_dynamic & ~vcpu->arch.xfd) == 0)
Here, when guest init XCR0 to 1, vcpu->arch.xfd should be 1
otherwise XCR0 trap makes passthrough and allocates buffer, which
is not what we want.

> 			return 0;
> 
> 		kvm_x86_ops.enable_xfd_passthrough(vcpu);
> 	}
> 
> 	/* current->thread.fpu was already handled by arch_prctl.  */
It seems so far, arch_prctl does not change current->thread.fpu,
only #NM handler itself does it. We here alloc current too.

Thanks,
Jing
> 	return fpu_alloc_features(vcpu->guest_fpu,
> 		vcpu->guest_fpu.dynamic_state_perm | enabled_dynamic); }
> 
> Paolo
Jing Liu Oct. 14, 2021, 11:30 a.m. UTC | #24
On 10/14/2021 5:01 PM, Paolo Bonzini wrote:
> On 14/10/21 10:02, Liu, Jing2 wrote:
> >> In principle I don't like it very much; it would be nicer to say "you
> >> enable it for QEMU itself via arch_prctl(ARCH_SET_STATE_ENABLE), and
> >> for the guests via ioctl(KVM_SET_CPUID2)".  But I can see why you
> >> want to keep things simple, so it's not a strong objection at all.
> >
> > Does this mean that KVM allocate 3 buffers via
> > 1) Qemu's request, instead of via 2) guest XCR0 trap?
> 
> Based on the input from Andy and Thomas, the new way would be like this:
> 
> 1) host_fpu must always be checked for reallocation in kvm_load_guest_fpu
> (or in the FPU functions that it calls, that depends on the rest of Thomas's
> patches).  That's because arch_prctl can enable AMX for QEMU at any point
> after KVM_CREATE_VCPU.
> 
> 2) every use of vcpu->arch.guest_supported_xcr0 is changed to only include
> those dynamic-feature bits that were enabled via arch_prctl.
> That is, something like:
> 
> static u64 kvm_guest_supported_cr0(struct kvm_vcpu *vcpu) {
> 	return vcpu->arch.guest_supported_xcr0 &
> 		(~xfeatures_mask_user_dynamic | \
> 		 current->thread.fpu.dynamic_state_perm);
> }
> 
> 3) Even with passthrough disabled, the guest can run with XFD set to
> vcpu->arch.guest_xfd (and likewise for XFD_ERR) which is much simpler
> than trapping #NM.  The traps for writing XCR0 and XFD are used to allocate
> dynamic state for guest_fpu, and start the passthrough of XFD and XFD_ERR.

For XFD_ERR, since it can be auto changed by HW, write-protect is not
need I think. KVM also not need trap rdmsr of it because no use.

I guess we're worrying about is when KVM is sched_out, a nonzero XFD_ERR
can be lost by other host thread. We can save guest XFD_ERR in sched_out
and restore before next vmenter. Kernel is assumed not using AMX thus
softirq won't make it lost.
I think this solves the problem. So we can directly passthrough RW of it,
and no need to rdmsr(XFD_ERR) in vmexit.

Thanks,
Jing
Paolo Bonzini Oct. 14, 2021, 11:33 a.m. UTC | #25
On 14/10/21 13:21, Liu, Jing2 wrote:
> Got it, the principle is once XCR0[n]=1 and XFD[n]=0, then guest is allowed
> to use the dynamic XSAVE state, thus KVM must prepare all things well
> before. This probably happens shortly after guest #NM.
> 
> Only one thing: it seems we assume that vcpu->arch.xfd is guest runtime
> value. And before guest initializes XFD, KVM provides
> vcpu->arch.xfd[18]=1, right? But the spec asks XFD reset value as zero.
> If so, between guest init XCR0 to 1 and init XFD to 1, it's XCR0[n]=1 and
> XFD[n]=0. If a guest never init XFD and directly use dynamic state...
> 
> Or do we want to provide guest a XFD[18]=1 value at the very beginning?

On reset the guest value has to be zero.  For Linux, which we control, 
we probably want to write the bit in XFD before XSETBV.  For other OSes 
there's nothing we can do, but hopefully they make similar considerations.

Paolo
Paolo Bonzini Oct. 14, 2021, 11:39 a.m. UTC | #26
On 14/10/21 13:30, Liu, Jing2 wrote:
> I guess we're worrying about is when KVM is sched_out, a nonzero XFD_ERR
> can be lost by other host thread. We can save guest XFD_ERR in sched_out
> and restore before next vmenter. Kernel is assumed not using AMX thus
> softirq won't make it lost.
> I think this solves the problem. So we can directly passthrough RW of it,
> and no need to rdmsr(XFD_ERR) in vmexit.

Correct; you can also use the "user-return MSRs" machinery (until Linux 
starts using AMX in the kernel, but that shouldn't happen too soon).

Paolo
Thomas Gleixner Oct. 14, 2021, 12:23 p.m. UTC | #27
Paolo,

On Thu, Oct 14 2021 at 08:50, Paolo Bonzini wrote:
> On 13/10/21 16:06, Thomas Gleixner wrote:
>>> - the guest value stored in vcpu->arch.
>>>
>>> - the "QEMU" value attached to host_fpu.  This one only becomes zero if
>>> QEMU requires AMX (which shouldn't happen).
>> 
>> I don't think that makes sense.
>> 
>> First of all, if QEMU wants to expose AMX to guests, then it has to ask
>> for permission to do so as any other user space process. We're not going
>> to make that special just because.
>
> Hmm, I would have preferred if there was no need to enable AMX for the 
> QEMU FPU.  But you're saying that guest_fpu needs to swap out to 
> current->thread.fpu if the guest is preempted, which would require 
> XFD=0; and affect QEMU operation as well.

Exactly. If we don't enable it for QEMU itself, then this is creating
just a horrible inconsistency which requires nasty hacks. I'm not at
all interested in those as I just got rid of quite some and made the
code consistent.

> In principle I don't like it very much; it would be nicer to say "you 
> enable it for QEMU itself via arch_prctl(ARCH_SET_STATE_ENABLE), and for 
> the guests via ioctl(KVM_SET_CPUID2)".  But I can see why you want to 
> keep things simple, so it's not a strong objection at all.

Errm.

   qemu()
     read_config()
     if (dynamic_features_passthrough())
	request_permission(feature)             <- prctl(ARCH_SET_STATE_ENABLE)

     create_vcpu_threads()
       ....

       vcpu_thread()
	 kvm_ioctl(ENABLE_DYN_FEATURE, feature) <- KVM ioctl

That's what I lined out, right?

>> Anything else will just create more problems than it solves. Especially
>> #NM handling (think nested guest) and the XFD_ERR additive behaviour
>> will be a nasty playground and easy to get wrong.
>> 
>> Not having that at all makes life way simpler, right?
>
> It is simpler indeed, and it makes sense to start simple.  I am not sure 
> if it will hold, but I agree it's better for the first implementation.

KISS is a very reasonable engineering principle :)

If there is a real world use case and a proper technical justification
for doing the dynamic buffer allocation then I'm happy to discuss that.

Thanks,

        tglx
Paolo Bonzini Oct. 14, 2021, 12:26 p.m. UTC | #28
On 14/10/21 14:23, Thomas Gleixner wrote:
>> In principle I don't like it very much; it would be nicer to say "you
>> enable it for QEMU itself via arch_prctl(ARCH_SET_STATE_ENABLE), and for
>> the guests via ioctl(KVM_SET_CPUID2)".  But I can see why you want to
>> keep things simple, so it's not a strong objection at all.
> Errm.
> 
>     qemu()
>       read_config()
>       if (dynamic_features_passthrough())
> 	request_permission(feature)             <- prctl(ARCH_SET_STATE_ENABLE)
> 
>       create_vcpu_threads()
>         ....
> 
>         vcpu_thread()
> 	 kvm_ioctl(ENABLE_DYN_FEATURE, feature) <- KVM ioctl
> 
> That's what I lined out, right?
> 

I meant prctl for QEMU-in-user-mode vs. ioctl QEMU-in-guest-mode (i.e. 
no prctl if only the guest uses it).  But anyway it's just abstract 
"beauty", let's stick to simple. :)

Paolo
Thomas Gleixner Oct. 14, 2021, 1:08 p.m. UTC | #29
On Thu, Oct 14 2021 at 08:21, Jing2 Liu wrote:
>> 
>> Once that is sorted then we can create proper infrastructure for that in the
>> FPU core code and not just expose a random function to KVM and hack it into
>> submssion.
> Yes, we need a consensus on the way we choose and then to see if need a
> kernel function for KVM usage.

The question is not 'if'. The question is 'which' functionality we need.

Thanks,

        tglx
Thomas Gleixner Oct. 14, 2021, 2:09 p.m. UTC | #30
On Thu, Oct 14 2021 at 11:01, Paolo Bonzini wrote:
> On 14/10/21 10:02, Liu, Jing2 wrote:
> Based on the input from Andy and Thomas, the new way would be like this:
>
> 1) host_fpu must always be checked for reallocation in 
> kvm_load_guest_fpu (or in the FPU functions that it calls, that depends 
> on the rest of Thomas's patches).  That's because arch_prctl can enable 
> AMX for QEMU at any point after KVM_CREATE_VCPU.

No.

   1) QEMU starts
   2) QEMU requests permissions via prctl()
   3) QEMU creates vCPU threads

Doing it the other way around makes no sense at all and won't work.

> 2) every use of vcpu->arch.guest_supported_xcr0 is changed to only 
> include those dynamic-feature bits that were enabled via arch_prctl.
> That is, something like:
>
> static u64 kvm_guest_supported_xcr0(struct kvm_vcpu *vcpu)
> {
> 	return vcpu->arch.guest_supported_xcr0 &
> 		(~xfeatures_mask_user_dynamic | \
> 		 current->thread.fpu.dynamic_state_perm);

Bah. You can't get enough from poking in internals, right?

vcpu_create()

  fpu_init_fpstate_user(guest_fpu, supported_xcr0)

That will (it does not today) do:

     guest_fpu::__state_perm = supported_xcr0 & xstate_get_group_perm();

for you. Once.

Then you have the information you need right in the guest FPU.

See?

> So something like this pseudocode is called by both XCR0 and XFD writes:
>
> int kvm_alloc_fpu_dynamic_features(struct kvm_vcpu *vcpu)
> {
> 	u64 allowed_dynamic = current->thread.fpu.dynamic_state_perm;

That's not a valid assumption.

> 	u64 enabled_dynamic =
> 		vcpu->arch.xcr0 & xfeatures_mask_user_dynamic;
>
> 	/* All dynamic features have to be arch_prctl'd first.  */
> 	WARN_ON_ONCE(enabled_dynamic & ~allowed_dynamic);
>
> 	if (!vcpu->arch.xfd_passthrough) {
> 		/* All dynamic states will #NM?  Wait and see.  */
> 		if ((enabled_dynamic & ~vcpu->arch.xfd) == 0)
> 			return 0;
>
> 		kvm_x86_ops.enable_xfd_passthrough(vcpu);
> 	}
>
> 	/* current->thread.fpu was already handled by arch_prctl.  */

No. current->thread.fpu has the default buffer unless QEMU used AMX or
something forced it to allocate a larger buffer.

> 	return fpu_alloc_features(vcpu->guest_fpu,
> 		vcpu->guest_fpu.dynamic_state_perm | enabled_dynamic);

This unconditionally calls into that allocation for every XCR0/XFD
trap ?

> }

Also you really should not wait until _all_ dynamic states are cleared
in guest XFD. Because a guest which has bit 18 and 19 available but only
uses one of them is going to trap on every other context switch due to
XFD writes.

So you check for

   (guest_xfd & guest_perm) != guest_perm

and

   (guest_xcr0 & guest_perm) != 0

If both are true, then you reallocate the buffers for _all_ permitted
states _and_ set XFD to pass through.

Thanks,

        tglx
Thomas Gleixner Oct. 14, 2021, 2:23 p.m. UTC | #31
On Thu, Oct 14 2021 at 14:26, Paolo Bonzini wrote:
> On 14/10/21 14:23, Thomas Gleixner wrote:
>>> In principle I don't like it very much; it would be nicer to say "you
>>> enable it for QEMU itself via arch_prctl(ARCH_SET_STATE_ENABLE), and for
>>> the guests via ioctl(KVM_SET_CPUID2)".  But I can see why you want to
>>> keep things simple, so it's not a strong objection at all.
>> Errm.
>> 
>>     qemu()
>>       read_config()
>>       if (dynamic_features_passthrough())
>> 	request_permission(feature)             <- prctl(ARCH_SET_STATE_ENABLE)
>> 
>>       create_vcpu_threads()
>>         ....
>> 
>>         vcpu_thread()
>> 	 kvm_ioctl(ENABLE_DYN_FEATURE, feature) <- KVM ioctl
>> 
>> That's what I lined out, right?
>> 
>
> I meant prctl for QEMU-in-user-mode vs. ioctl QEMU-in-guest-mode (i.e. 
> no prctl if only the guest uses it).  But anyway it's just abstract 
> "beauty", let's stick to simple. :)

It's not about simple. It's about correctness in the first place.

The prctl() is process wide and grants permission. If that permission is
not granted, e.g. by a seccomp rule, then the vCPU threads cannot use it
either. We are _not_ making exceptions for KVM just because it's KVM.

Trying to pretend that the usermode thread does not need it is just
illusion. The kernel representation of that very usermode vCPU thread must
have a large fpstate. It still can have XFD set, but that's a detail.

So what you are trying to sell me has nothing to do with beauty at all
except when your definition of beauty originates from a tunnel of horrors.

Thanks,

        tglx
Thomas Gleixner Oct. 14, 2021, 2:37 p.m. UTC | #32
On Thu, Oct 14 2021 at 16:09, Thomas Gleixner wrote:
> On Thu, Oct 14 2021 at 11:01, Paolo Bonzini wrote:
>
> Also you really should not wait until _all_ dynamic states are cleared
> in guest XFD. Because a guest which has bit 18 and 19 available but only
> uses one of them is going to trap on every other context switch due to
> XFD writes.
>
> So you check for
>
>    (guest_xfd & guest_perm) != guest_perm)
>
> and
>
>    (guest_xr0 & guest_perm) != 0
>
> If both are true, then you reallocate the buffers for _all_ permitted
> states _and_ set XFD to pass through.

And for that to work we must write XFD _before_ XSETBV in the guest boot
phase.

Thanks,

        tglx
Paolo Bonzini Oct. 14, 2021, 3:01 p.m. UTC | #33
On 14/10/21 16:09, Thomas Gleixner wrote:
> On Thu, Oct 14 2021 at 11:01, Paolo Bonzini wrote:
>> On 14/10/21 10:02, Liu, Jing2 wrote:
>> Based on the input from Andy and Thomas, the new way would be like this:
>>
>> 1) host_fpu must always be checked for reallocation in
>> kvm_load_guest_fpu (or in the FPU functions that it calls, that depends
>> on the rest of Thomas's patches).  That's because arch_prctl can enable
>> AMX for QEMU at any point after KVM_CREATE_VCPU.
> 
> No.
> 
>     1) QEMU starts
>     2) QEMU requests permissions via prctl()
>     3) QEMU creates vCPU threads
> 
> Doing it the other way around makes no sense at all and wont work.

Sure, but KVM needs to do something that makes sense even for userspaces 
that are not QEMU.

For example, there could be a program that uses AMX *itself* and does 
not expose it to the guest.  In that case, the arch_prctl can come at 
the point AMX is needed, which can be after the program creates vCPU 
threads.  That's for host_fpu.

For the guest_fpu, I agree that the arch_prctl must come before creating 
vCPUs.

>> 2) every use of vcpu->arch.guest_supported_xcr0 is changed to only
>> include those dynamic-feature bits that were enabled via arch_prctl.
>> That is, something like:
>>
>> static u64 kvm_guest_supported_cr0(struct kvm_vcpu *vcpu)
>> {
>> 	return vcpu->arch.guest_supported_xcr0 &
>> 		(~xfeatures_mask_user_dynamic | \
>> 		 current->thread.fpu.dynamic_state_perm);
> 
> Bah. You can't get enough from poking in internals, right?
> 
> vcpu_create()
> 
>    fpu_init_fpstate_user(guest_fpu, supported_xcr0)
> 
> That will (it does not today) do:
> 
>       guest_fpu::__state_perm = supported_xcr0 & xstate_get_group_perm();
> 
> The you have the information you need right in the guest FPU.

Good, I wasn't aware of the APIs that will be there.

>> int kvm_alloc_fpu_dynamic_features(struct kvm_vcpu *vcpu)
>> {
>> 	u64 allowed_dynamic = current->thread.fpu.dynamic_state_perm;
> 
> That's not a valid assumption.
> 
>> 	u64 enabled_dynamic =
>> 		vcpu->arch.xcr0 & xfeatures_mask_user_dynamic;
>>
>> 	/* All dynamic features have to be arch_prctl'd first.  */
>> 	WARN_ON_ONCE(enabled_dynamic & ~allowed_dynamic);
>>
>> 	if (!vcpu->arch.xfd_passthrough) {
>> 		/* All dynamic states will #NM?  Wait and see.  */
>> 		if ((enabled_dynamic & ~vcpu->arch.xfd) == 0)
>> 			return 0;
>>
>> 		kvm_x86_ops.enable_xfd_passthrough(vcpu);
>> 	}
>>
>> 	/* current->thread.fpu was already handled by arch_prctl.  */
> 
> No. current->thread.fpu has the default buffer unless QEMU used AMX or
> something forced it to allocate a larger buffer.
> 
>> 	return fpu_alloc_features(vcpu->guest_fpu,
>> 		vcpu->guest_fpu.dynamic_state_perm | enabled_dynamic);
> 
> This unconditionally calls into that allocation for every XCR0/XFD
> trap ?

Calls into the function, but doesn't necessarily allocate anything. 
What you wrote below looks correct to me, thanks.

Paolo

> Also you really should not wait until _all_ dynamic states are cleared
> in guest XFD.  Because a guest which has bit 18 and 19 available but only
> uses one of them is going to trap on every other context switch due to
> XFD writes.
Thomas Gleixner Oct. 14, 2021, 7:14 p.m. UTC | #34
Paolo,

On Thu, Oct 14 2021 at 17:01, Paolo Bonzini wrote:
> On 14/10/21 16:09, Thomas Gleixner wrote:
>> On Thu, Oct 14 2021 at 11:01, Paolo Bonzini wrote:
>>> On 14/10/21 10:02, Liu, Jing2 wrote:
>>> Based on the input from Andy and Thomas, the new way would be like this:
>>>
>>> 1) host_fpu must always be checked for reallocation in
>>> kvm_load_guest_fpu (or in the FPU functions that it calls, that depends
>>> on the rest of Thomas's patches).  That's because arch_prctl can enable
>>> AMX for QEMU at any point after KVM_CREATE_VCPU.
>> 
>> No.
>> 
>>     1) QEMU starts
>>     2) QEMU requests permissions via prctl()
>>     3) QEMU creates vCPU threads
>> 
>> Doing it the other way around makes no sense at all and wont work.
>
> Sure, but KVM needs to do something that makes sense even for userspaces 
> that are not QEMU.
>
> For example, there could be a program that uses AMX *itself* and does 
> not expose it to the guest.  In that case, the arch_prctl can come at 
> the point AMX is needed, which can be after the program creates vCPU 
> threads.  That's for host_fpu.

That wont affect the vCPU threads unless they start to use AMX in user
space themself. Which means they have the default buffer and their vCPU
user/guest FPU's too.

The prctl() sets the permission nothing else.  As long as they don't use
AMX their XFD[18] stays set. Only when they start using AMX in user
space themself they trigger #NM which allocates a larger buffer for the
thread.

So then the point where it matters is fpu_swap_kvm_fpu() and that's
preemptible context so we can do allocations before fiddling with the
buffers. Not rocket science.

And that has nothing to do with the whole XCR0/XFD/XFD_ERR/#NM guest
mess.

> For the guest_fpu, I agree that the arch_prctl must come before creating 
> vCPUs.

Good :)

>> vcpu_create()
>> 
>>    fpu_init_fpstate_user(guest_fpu, supported_xcr0)
>> 
>> That will (it does not today) do:
>> 
>>       guest_fpu::__state_perm = supported_xcr0 & xstate_get_group_perm();
>> 
>> The you have the information you need right in the guest FPU.
>
> Good, I wasn't aware of the APIs that will be there.

Me neither, but that's a pretty obvious consequence of the work I'm
doing for AMX. So I made it up for you. :)

>> This unconditionally calls into that allocation for every XCR0/XFD
>> trap ?
>
> Calls into the function, but doesn't necessarily allocate anything.

Sure.

> What you wrote below looks correct to me, thanks.
>
> Paolo
>

Properly quoting mail is hard, right?

>> Also you really should not wait until _all_ dynamic states are cleared
>> in guest XFD.  Because a guest which has bit 18 and 19 available but only
>> uses one of them is going to trap on every other context switch due to
>> XFD writes.

Thanks,

        tglx
Jing Liu Oct. 15, 2021, 9 a.m. UTC | #35
On 10/14/2021 11:01 PM, Paolo Bonzini wrote:
[...]
> Calls into the function, but doesn't necessarily allocate anything.
> What you wrote below looks correct to me, thanks.
> 

For the guest dynamic state support, based on the latest discussion,
four copies of XFD need be cared and switched, I'd like to list as follows.

- vcpu->arch.xfd: this is the real guest value for running.
Since kernel init XFD before XCR0, so I think KVM can initialize it as
bit[n]=0, for a guest start value. Otherwise, kvm_arch_vcpu_create()
need initializes vcpu->arch.xfd=guest_fpu->xfd=user_fpu->xfd=1.
Guest wrmsr XFD trap will make it update.

- user_fpu->fpstate->xfd: Qemu itself and not for guest, which is
probably always set.

- guest_fpu->fpstate->xfd: this is for KVM internal value between time[*].
KVM reinitializes it as bit[n]=0 (not the same as user_fpu), and it will be
updated when guest wrmsr trap. Thus, before passthrough, it's the same
as vcpu->arch.xfd, thus vmenter/vmexit need not rewrite msr.
After passthrough, this keeps bit[n] as 0 forever.

- current_fpu->fpstate->xfd: it should be the same as KVM internal value
between time[*].
[*] this means between kvm_load_guest_fpu and kvm_put_guest_fpu.

From guest booting timeline,  the values are: 

Booting start...   # In this time, vcpu->arch.xfd[n]=guest_fpu->xfd[n]=0
Init XFD by WRMSR(XFD[n], 1)  	# Then, vcpu->arch.xfd[n]=guest_fpu->xfd[n]=1
Init XCR0 by XSETBV 	
...
#NM WRMSR(XFD[n], 0)  # Then, guest_fpu->xfd[n]=0, vcpu->arch.xfd[n]=0.
vcpu->arch.xfd will be updated in later vmexits. 

BTW, we only need lazy-passthrough XFD WRITE and passthrough
READ directly.

Thanks,
Jing

> Paolo
> 
> > Also you really should not wait until _all_ dynamic states are cleared
> > in guest XFD.  Because a guest which has bit 18 and 19 available but
> > only uses one of them is going to trap on every other context switch due
> > to XFD writes.
Jing Liu Oct. 15, 2021, 9:20 a.m. UTC | #36
On 10/15/2021 3:14 AM, Thomas Gleixner wrote:
> Paolo,
> 
[...]
> >> vcpu_create()
> >>
> >>    fpu_init_fpstate_user(guest_fpu, supported_xcr0)
> >>
> >> That will (it does not today) do:
> >>
> >>       guest_fpu::__state_perm = supported_xcr0 &
> >> xstate_get_group_perm();
> >>
> >> The you have the information you need right in the guest FPU.
> >
> > Good, I wasn't aware of the APIs that will be there.
> 
> Me neither, but that's a pretty obvious consequence of the work I'm doing
> for AMX. So I made it up for you. :)

Do you mean that fpu_init_fpstate_user() will be updated to add
supported_xcr0 later? :) 

I'm thinking if guest_fpu::xfd is good to directly initialize as user's
init_fpstate.xfd. Because before guest initializes XFD, "hardware"
is reset value. So it would be better to make guest_fpu::xfd the
same so no need to reload zero before vmenter during this time.

Thanks,
Jing
Thomas Gleixner Oct. 15, 2021, 9:36 a.m. UTC | #37
Paolo,

On Thu, Oct 14 2021 at 21:14, Thomas Gleixner wrote:
> On Thu, Oct 14 2021 at 17:01, Paolo Bonzini wrote:
>>> vcpu_create()
>>> 
>>>    fpu_init_fpstate_user(guest_fpu, supported_xcr0)
>>> 
>>> That will (it does not today) do:
>>> 
>>>       guest_fpu::__state_perm = supported_xcr0 & xstate_get_group_perm();
>>> 
>>> The you have the information you need right in the guest FPU.
>>
>> Good, I wasn't aware of the APIs that will be there.
>
> Me neither, but that's a pretty obvious consequence of the work I'm
> doing for AMX. So I made it up for you. :)

let me make some more up for you!

If you carefully look at part 2 of the rework, then you might notice
that there is a fundamental change which allows to do a real
simplification for KVM FPU handling:

   current->thread.fpu.fpstate

is now a pointer. So you can spare one FPU allocation because we can now
do:

fpu_attach_guest_fpu(supported_xcr0)
{
        guest_fpstate = alloc_fpstate(supported_xcr0);
        fpu_init_fpstate_user(guest_fpstate, supported_xcr0);
        current->thread.fpu.guest_fpstate = guest_fpstate;
}

fpu_swap_kvm_fpu() becomes in the first step:

fpu_swap_kvm_fpu(bool enter_guest)
{
        save_fpregs_to_fpstate(current->thread.fpu.fpstate);

        swap(current->thread.fpu.fpstate, current->thread.fpu.guest_fpstate);

        restore_fpregs_from_fpstate(current->thread.fpu.fpstate);
}

@enter_guest will allow us to do some sanity checks

In a second step:

fpu_swap_kvm_fpu(bool enter_guest, u64 guest_needs_features)
{
        possibly_reallocate(enter_guest, guest_needs_features);
        save_fpregs_to_fpstate(current->thread.fpu.fpstate);

        swap(current->thread.fpu.fpstate, current->thread.fpu.guest_fpstate);

        restore_fpregs_from_fpstate(current->thread.fpu.fpstate);
        possibly_reallocate(enter_guest, guest_needs_features);
}

@guest_needs_features is the information which you gather via guest XCR0
and guest XFD.

So fpu_swap_kvm_fpu() is going to be the place where reallocation happens
and that's good enough for both cases:

vcpu_run()

     fpu_swap_kvm_fpu(); <- 1

     while (...)
           vmenter();

     fpu_swap_kvm_fpu(); <- 2

#1 QEMU user space used feature and has already large fpstate

#2 Guest requires feature but has not used it yet (XCR0/XFD trapping)

See?

It's not only correct, it's also simple and truly beautiful.

Thanks,

        tglx
Thomas Gleixner Oct. 15, 2021, 10:50 a.m. UTC | #38
Jing,

On Fri, Oct 15 2021 at 09:00, Jing2 Liu wrote:
> On 10/14/2021 11:01 PM, Paolo Bonzini wrote:
> For the guest dynamic state support, based on the latest discussion,
> four copies of XFD need be cared and switched, I'd like to list as
> follows.

There will not be 4 copies. Read my last mail and think about the
consequences.

I'm really tired of this tinkering frenzy. There is only one correct
approach to this:

   1) Define the requirements

   2) Define the best trapping mechanism

   3) Sit down, look at the existing code including the FPU rework for
      AMX. Come up with a proper integration plan

   4) Clean up the existing KVM FPU mess further so the integration
      can be done smoothly

   5) Add the required infrastructure in FPU core and KVM

   6) Add the trapping mechanics

   7) Enable feature

What you are doing is looking for the quickest way to duct tape this
into the existing mess.

That might be matching the KVM expectations, but it's not going to
happen.

KVM already violates all well known rules of encapsulation and just
fiddles in the guts of FPU mechanism, duplicates code in buggy ways.

This has to stop now!

You are free to ignore me, but all you are going to achieve is to delay
AMX integration further. Seriously, I'm not even going to reply to
anything which is not based on the above approach.

I'm sure you can figure out at which point we are at the moment.

Thanks,

        tglx
Paolo Bonzini Oct. 15, 2021, 11:17 a.m. UTC | #39
On 15/10/21 12:50, Thomas Gleixner wrote:
> That might be matching the KVM expectations, but it's not going to
> happen.
> 
> KVM already violates all well known rules of encapsulation and just
> fiddles in the guts of FPU mechanism, duplicates code in buggy ways.
> 
> This has to stop now!

FWIW, I totally agree about that.  Over the years we've gotten more 
well-thought hooks in the kernel for KVM and less hacks, and I'll only 
be happy if that extends to the FPU code which I'm quite wary of 
touching.  Most of it has been unchanged since Ingo's last rewrite.

Paolo

> You are free to ignore me, but all you are going to achieve is to delay
> AMX integration further. Seriously, I'm not even going to reply to
> anything which is not based on the above approach.
Jing Liu Oct. 15, 2021, 1:01 p.m. UTC | #40
Hi Thomas,
On 10/15/2021 6:50 PM, Thomas Gleixner wrote:
> Jing,
> 
> On Fri, Oct 15 2021 at 09:00, Jing2 Liu wrote:
> > On 10/14/2021 11:01 PM, Paolo Bonzini wrote:
> > For the guest dynamic state support, based on the latest discussion,
> > four copies of XFD need be cared and switched, I'd like to list as
> > follows.
> 
> There will not be 4 copies. Read my last mail and think about the
> consequences.
> 
Actually I saw there are fpu_init_fpstate_user(vcpu->arch.user_fpu)
and fpu_init_fpstate_user(vcpu->arch.guest_fpu) in the full series,
so I understood that we'd keep it this way. (Your last mail corrects me)

But yes, these xstate copies really make things complex and bad,
and I'm glad to do for a good clean way. I'll reply the thinking
(based on your approach below) on that thread later.


> I'm really tired of this tinkering frenzy. There is only one correct approach to
> this:

> 
>    1) Define the requirements
> 
>    2) Define the best trapping mechanism
> 
>    3) Sit down, look at the existing code including the FPU rework for
>       AMX. Come up with a proper integration plan
> 
>    4) Clean up the existing KVM FPU mess further so the integration
>       can be done smoothly
> 
>    5) Add the required infrastructure in FPU core and KVM
> 
>    6) Add the trapping mechanics
> 
>    7) Enable feature
> 
> What you are doing is looking for the quickest way to duct tape this into the
> existing mess.
> 
> That might be matching the KVM expectations, but it's not going to happen.
> 
> KVM already violates all well known rules of encapsulation and just fiddles in
> the guts of FPU mechanism, duplicates code in buggy ways.
> 
> This has to stop now!
> 

Yes, this is an opportunity to make current KVM FPU better.  

> You are free to ignore me,
Of course I won't, because I also want to try a good way that both KVM 
and kernel are glad to use.  

Thanks,
Jing

>  but all you are going to achieve is to delay AMX
> integration further. Seriously, I'm not even going to reply to anything which is
> not based on the above approach.
> 
> I'm sure you can figure out at which point we are at the moment.
> 
> Thanks,
> 
>         tglx
Jing Liu Oct. 15, 2021, 2:24 p.m. UTC | #41
Hi Thomas,

On 10/15/2021 5:36 PM, Thomas Gleixner wrote:
> Paolo,
> 
> On Thu, Oct 14 2021 at 21:14, Thomas Gleixner wrote:
> > On Thu, Oct 14 2021 at 17:01, Paolo Bonzini wrote:
> >>> vcpu_create()
> >>>
> >>>    fpu_init_fpstate_user(guest_fpu, supported_xcr0)
> >>>
> >>> That will (it does not today) do:
> >>>
> >>>       guest_fpu::__state_perm = supported_xcr0 &
> >>> xstate_get_group_perm();
> >>>
> >>> The you have the information you need right in the guest FPU.
> >>
> >> Good, I wasn't aware of the APIs that will be there.
> >
> > Me neither, but that's a pretty obvious consequence of the work I'm
> > doing for AMX. So I made it up for you. :)
> 
> let me make some more up for you!
> 
> If you carefully look at part 2 of the rework, then you might notice that there
> is a fundamental change which allows to do a real simplification for KVM FPU
> handling:
> 
>    current->thread.fpu.fpstate
> 
> is now a pointer. So you can spare one FPU allocation because we can now
> do:

Trying to understand your point, seems struct fpu will add a guest_fpstate
pointer. And this will be allocated when vcpu_create() by the following
function. Swap between the two pointers in load/put. What I was thinking 
is that vcpu keeps having guest_fpu and delete user_fpu. 

> 
> fpu_attach_guest_fpu(supported_xcr0)
> {
>         guest_fpstate = alloc_fpstate(supported_xcr0);

I supposed this is called when vcpu_create(). Not sure the reason 
of supported_xcr0 input here. supported_xcr0[n]=1 and
guest _state_perm[n]=1 which is requested before vcpu_create(),
so this will allocate full buffer, at vcpu_create() stage? 
Or do you mean vcpu->arch.guest_supported_xcr0.

Please correct me if I misunderstood. Thanks.

>         fpu_init_fpstate_user(guest_fpstate, supported_xcr0);
>         current->thread.fpu.guest_fpstate = guest_fpstate; }
> 


> fpu_swap_kvm_fpu() becomes in the first step:
> 
> fpu_swap_kvm_fpu(bool enter_guest)
> {
>         safe_fpregs_to_fpstate(current->thread.fpu.fpstate);
> 
>         swap(current->thread.fpu.fpstate, current->thread.fpu.guest_fpstate);
> 
>         restore_fpregs_from_fpstate(current->thread.fpu.fpstate);
> }
> 
> @enter guest will allow to do some sanity checks
> 
> In a second step:
> 
> fpu_swap_kvm_fpu(bool enter_guest, u64 guest_needs_features) {
>         possibly_reallocate(enter_guest, guest_needs_features);

When KVM traps guest wrmsr XFD in #NM, I think KVM need allocate
fpstate buffer for full features.
Because in next vmexit, guest might have dynamic state and KVM
can be preempted before running fpu_swap_kvm_fpu().
Thus, here the current->thread.fpu.fpstate already has enough space
for saving guest.

Thanks,
Jing

>         safe_fpregs_to_fpstate(current->thread.fpu.fpstate);
> 
>         swap(current->thread.fpu.fpstate, current->thread.fpu.guest_fpstate);
> 
>         restore_fpregs_from_fpstate(current->thread.fpu.fpstate);
>         possibly_reallocate(enter_guest, guest_needs_features); }
> 
> @guest_needs_features is the information which you gather via guest XCR0
> and guest XFD.
> 
> So fpu_swap_kvm_fpu() is going to be the place where reallocation happens
> and that's good enough for both cases:
> 
> vcpu_run()
> 
>      fpu_swap_kvm_fpu(); <- 1
> 
>      while (...)
>            vmenter();
> 
>      fpu_swap_kvm_fpu(); <- 2
> 
> #1 QEMU user space used feature and has already large fpstate
> 
> #2 Guest requires feature but has not used it yet (XCR0/XFD trapping)
> 
> See?
> 
> It's not only correct, it's also simple and truly beautiful.
> 
> Thanks,
> 
>         tglx
Paolo Bonzini Oct. 15, 2021, 3:53 p.m. UTC | #42
On 15/10/21 16:24, Liu, Jing2 wrote:
>> fpu_swap_kvm_fpu(bool enter_guest, u64 guest_needs_features) {
>>          possibly_reallocate(enter_guest, guest_needs_features);
> When KVM traps guest wrmsr XFD in #NM, I think KVM need allocate
> fpstate buffer for full features.

You mean XCR0 and XFD (not XFD in #NM), but yeah at the point of 
fpu_swap_kvm_fpu we are in atomic context.

Still, for now the first pass of AMX implementation doesn't need to do 
anything but swap the pointers, and it can simply allocate the full 
buffer at vCPU creation.

Paolo

> Because in next vmexit, guest might have dynamic state and KVM
> can be preempted before running fpu_swap_kvm_fpu().
> Thus, here the current->thread.fpu.fpstate already has enough space
> for saving guest.
Thomas Gleixner Oct. 16, 2021, 2:45 p.m. UTC | #43
Jing,

On Fri, Oct 15 2021 at 14:24, Jing2 Liu wrote:
> On 10/15/2021 5:36 PM, Thomas Gleixner wrote:
>> If you carefully look at part 2 of the rework, then you might notice that there
>> is a fundamental change which allows to do a real simplification for KVM FPU
>> handling:
>> 
>>    current->thread.fpu.fpstate
>> 
>> is now a pointer. So you can spare one FPU allocation because we can now
>> do:
>
> Trying to understand your point, seems struct fpu will add a guest_fpstate
> pointer. And this will be allocated when vcpu_create() by the following
> function. Swap between the two pointers in load/put. What I was thinking 
> is that vcpu keeps having guest_fpu and delete user_fpu.

unfortunately we can't do that in vcpu_create() because the thread doing
that is not necessarily the vCPU thread which invokes vcpu_run()
later. But that does not matter much.

So vcpu_create() will do

   vcpu->arch.guest_fpstate = fpu_alloc_guest_fpstate();

and in vcpu_run() invoke

    fpu_swap_kvm_fpstate(guest_fpstate, ...)

which in turn does:

int fpu_swap_kvm_fpstate(struct fpstate *guest_fps, bool enter_guest,
			 u64 restore_mask)
{
	struct fpu *fpu = &current->thread.fpu;
	struct fpstate *fps = fpu->fpstate;

	fpregs_lock();
	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
		save_fpregs_to_fpstate(fpu);

	/* Swap fpstate */
	if (enter_guest) {
		fpu->__task_fpstate = fps;
		fpu->fpstate = guest_fps;
	} else {
		fpu->fpstate = fpu->__task_fpstate;
		fpu->__task_fpstate = NULL;
	}

	fps = fpu->fpstate;

	/*
	 * Once XFD support is added, XFD switching happens here
	 * right before the restore.
	 */
	restore_mask &= XFEATURE_MASK_FPSTATE;
	restore_fpregs_from_fpstate(fps, restore_mask);

	fpregs_mark_activate();
	fpregs_unlock();
	return 0;
}

That's a simplified version of what I have already running on top of the
FPU rework part 3, but you get the idea.

If you are curious:

  https://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git/log/?h=x86/fpu-3-kvm

If you compare that to the current KVM FPU swap handling then you'll
notice that there is only _one_ buffer required and in case that
TIF_NEED_FPU_LOAD is set there is no memcpy() required either because
the state is already saved in the to be swapped out buffer.

That's a valuable cleanup and improvement independent of AMX.

See?

This also makes the confidential computing case less awkward because we
can do:

	if (!fpstate->is-scratch && !test_thread_flag(TIF_NEED_FPU_LOAD))
		save_fpregs_to_fpstate(fpu);

instead of the current hack of freeing guest_fpu. See the git tree.

Though I'm not sure whether the logic for this "is_scratch" optimization
is correct as I implemented it, but I'm neither sure that the current
logic in KVM is correct. But that's just an implementation detail which
needs to be looked at.

XFD support will be also fully consistently integrated:

  XFD will be switched before the restore and this will be fully
  consistent with everything we are doing vs. host side support because
  current->thread.fpu.fpstate->xfd will always be the authoritative
  answer. No need to copy any information from one place to another.

Ergo: No 4 copies of XFD.

>> In a second step:
>> 
>> fpu_swap_kvm_fpu(bool enter_guest, u64 guest_needs_features) {
>>         possibly_reallocate(enter_guest, guest_needs_features);
>
> When KVM traps guest wrmsr XFD in #NM, I think KVM need allocate
> fpstate buffer for full features.
> Because in next vmexit, guest might have dynamic state and KVM
> can be preempted before running fpu_swap_kvm_fpu().
> Thus, here the current->thread.fpu.fpstate already has enough space
> for saving guest.

I think we are talking past each other.

You are looking at this from the point of view of what you have been
doing so far and I am looking at it from a design and abstraction point
of view.

That explains why we have different expectations vs. XCR0/XFD/#NM.

So the regular boring case will be:

H   vcpu_run()
H   	fpu_swap_kvm_fpstate() <- Sets guest_fpstate->xfd
H
H        while (..) {
H           vmenter()

G              ....
G              ....     -> vmexit (unrelated to XCR0/XFD)

H           ...
H        }
H
H  	fpu_swap_kvm_fpstate() <- Sets user (task) XFD

Now let's look at the XFD/XCR0 intercept case:

H   vcpu_run()
H   	fpu_swap_kvm_fpstate() <- Sets guest_fpstate->xfd
H
H        while (..) {
H           vmenter()

G              ....
G              write to XFD/XCR0;        -> intercept

H           ...
H           if (reason == write to XFD/XCR0)) {
H                if (needs_action(guest_fpstate, $XFDVAL, $XCR0VAL)) {
H                        fpstate_set_realloc_request(guest_fpstate);
H
H                        break;
H
H                }
H           }
H           .....
H        }
H
H  	fpu_swap_kvm_fpstate()

fpu_swap_kvm_fpstate() will see the reallocation request in
guest_fpstate and act accordingly.

Both user and guest state are fully consistent at that point. Why?

It does not matter at all whether the wrmsrl(XFD) or XSETBV affecting
XCR0 in the guest happens because the guest decided it is cool to enable
it just for fun or because the guest took a #NM and wrote to XFD.

In both cases the XFD controlled component is in init state at that
point. So there is _nothing_ to save and _nothing_ which can be lost and
no buffer size problem at all.

Therefore it does also not matter whether the vCPU thread gets preempted
or not on the way out to fpu_swap_kvm_fpstate(). It's all still
consistent.

So fpu_swap_kvm_fpstate() will do in case of a reallocation request:

  1) Allocate new guest fpstate

     If that fails, it does a save and restore with the existing
     fpstates and return an error code which makes KVM drop out to user
     space to decide what to do.

     On success initialize the state properly including the new XFD
     value.

  2) Save guest state into new guest fpstate

  3) Restore host (user) state from the existing fpstate

See?

It does not even have to allocate a new host (user) fpstate to do
that. Why?

  Because once the fpstate pointer is flipped the state is consistent in
  both directions including XFD.

See?

Now if you think about the other way round then the same principle
applies:

  If the host (user) side of the vCPU thread used a dynamic state it has
  a large buffer, but that does not require the guest side buffer to be
  large as well.

  So this is what Paolo wanted, right? I was fighting that because with
    the existing three buffer scheme this cannot work.

See?

The point is that: 

  - the simple state switching was impossible because the proposed host
    side infrastructure had the wrong abstraction:

    It just reallocated the register buffer, but did not give
    it a container which carries the other relevant information,
    i.e. features, sizes and importantly xfd.

  - the simple state switching was impossible because the user/guest FPU
    concept of KVM was preventing that.

  - it was tried to plug the reallocation into the wrong place:

    There is no point to do that from inside the vcpu_run() loop. It's a
    one off event and that extra overhead of going out to the place
    where this can be handled sanely does not matter at all.

Just to be clear: I'm not blaming you for any of this at all.

There have been enough senior people involved who should have seen the
obvious instead of misguiding you.

So please just forget the horrors which you went through due to lack of
proper guidance, sit back and think about it.

The code in that git branch is just a first step and requires a few
tweaks to get the reallocation handled correctly, but if you look at the
above then you might realize that there are two related but largely
independent problems to solve:

  1) What needs to be intercepted and analyzed in the intercept handlers
     of XCR0 and XFD

  2) Handling the reallocation problem

#1 is a KVM problem and #2 is a host FPU problem

As you are a KVM wizard, I let you sort out #1 with the KVM folks and
I'm looking at the #2 part together with Chang and Dave. Why?

  #1 is not my area of expertise, but I surely might have opinions.
  
  #2 is not any different from handling the host side lazy reallocation.

Can you spot the difference between the original approach and the approach
I have taken?

Maybe you understand now why I was explaining over and over that we need
consistent state and asked everyone to look at the AMX host series.

Just for the record. When I looked at that KVM FPU switching exactly two
weeks ago while I was thinking about the right abstraction for AMX, it
was bloody obvious that just reallocating the register state buffer is
wrong. And it was bloody obvious that the user/guest FPU concept of KVM
is nonsense to begin with and going to be in the way of doing a clean
integration.

Why?

Because when you switch buffers, you need to switch state information
which belongs to the buffer, i.e. features, sizes and xfd, as well
because otherwise you create inconsistent state. Sure you can copy tons
of stuff back and forth, but why would you do that if you just can
switch the full state by swapping the pointer to a container which
contains all the information which is needed and makes everything else
(KVM trap bits aside) just work.

So you can rightfully ask why I did not tell that plan right away?

The reason is that I wanted all of you look at the AMX host series and I
desperately hoped that my insisting on state consistency will make at
least one of the involved people come back and say:

  "Thomas, why don't you do the obvious in fpu_swap_kvm_fpu() and switch
   the fpstate pointers? That FPU switching with the three buffers you
   kept around is bloody stupid."

My answer would have been: "Doh, of course, it's obvious. Stupid me."

But that did not happen. Even when I brought up the

    vcpu_create() -> alloc_and_attach()
    vcpu_run() -> swap() -> vmenter_loop() -> swap()
    vcpu_destroy() -> detach_and_free()

proposal nobody told me:

     "Thomas, this can't work because the thread which creates the  vCPU
      is not necessarily the same as the one which runs it."

No, I had to ask the question myself because I had second thoughts when
I was starting to implement that scheme. I had not thought about that
when I wrote it up in mail simply because I'm not a KVM expert. But it
did not matter because the concept stayed the same, just the
implementation details changed:

    vcpu_create() -> alloc()
    vcpu_run() -> attach() -> vmenter_loop() -> detach()
    vcpu_destroy() -> free()

Why? Because everyone was busy trying to cram their hacks into the code
I just changed instead of rethinking the situation.

See?

Jing, as I said before, I'm not blaming you personally. What I blame is
the general approach to add new features to the kernel:

    Hack it into the existing mess until it works by some definition
    of works.

That simply cannot go anywhere because it makes the code slow and
unmaintainable in the long run.

If a new feature does not fit nicely into the existing code, then the
only viable approach is to sit back, look at the requirements of the new
feature and come up with proper abstractions and a plan how to refactor
the code so that the feature falls into place at the very end and does
not create mess and overhead all over the place.

If you look at the three series I posted, then you see not a single bit
which does not make sense on its own, except for the last series which
adds the fpu config data structure with the pairs of default_* and
max_*.

Even the fpstate pointer makes sense on its own because the resulting
cleanup of the KVM FPU switch code is already worthwhile even w/o AMX
and XFD in terms of memory consumption, performance and robustness.

See?

The real AMX stuff which still needs to be posted is just building upon
this refactoring. It adds the necessary infrastructure for AMX, which is
all slow path code.

In the hotpath it adds only the XFD update at exactly 4 places where
state is saved or restored. IOW, all hotpath operations are exactly the
same. If XFD is not available on a CPU then the overhead of the XFD
update code is a few extra NOPs due to the patched out static branch.
If enabled then yes, it has an extra conditional and when the XFD value
really changes then a wrmsrl, but that's inevitable.

See?

Now if you sit back and look at the KVM concepts I explained above then
you surely can see that the overhead for the KVM case is going to
exactly a few extra NOPs in the hotpath when XFD is not available.

When XFD is enabled then yes, it needs the extra setup for XFD, but the
common case in the vmenter_loop() will have only a minimalistic overhead
if at all. The common case in fpu_swap_kvm_fpstate() will only grow a
single conditional in the hotpath:

       if (unlikely(guest_fpstate->need_realloc)) {

       }

but that's not even measurable.

See?

Thanks,

        Thomas
Jing Liu Nov. 22, 2021, 8:50 a.m. UTC | #44
Hi Paolo,

> On 10/14/2021 7:39 PM, Paolo Bonzini wrote:
> 
> On 14/10/21 13:30, Liu, Jing2 wrote:
> > I guess we're worrying about is when KVM is sched_out, a nonzero
> > XFD_ERR can be lost by other host thread. We can save guest XFD_ERR in
> > sched_out and restore before next vmenter. Kernel is assumed not using
> > AMX thus softirq won't make it lost.
> > I think this solves the problem. So we can directly passthrough RW of
> > it, and no need to rdmsr(XFD_ERR) in vmexit.
> 
> Correct; you can also use the "user-return MSRs" machinery (until Linux
> starts using AMX in the kernel, but that shouldn't happen too soon).
> 
Thanks for the suggestion. For user-return MSR mechanism using by emulated 
MSRs, it calls kvm_set_user_return_msr() to wrmsr of guest value, update curr
value and switch host once kernel exit to userspace. 

For XFD_ERR, it's automatically changed by H/W in guest, so KVM need correctly 
update guest XFD_ERR value at a time, which is different from other user-return
MSRs, e.g., at KVM preemption and kvm_put_guest_fpu() time, and both cases 
need not do wrmsr. And for kvm_put_guest_fpu(), it does return to userspace.
Also, XFD_ERR cannot refer to vmx->guest_uret_msrs_loaded to update before 
vmenter, since curr may not an up-to-date value. My feeling is the mechanism
may not much suitable for this case and need special handling.

Since guest non-zero XFD_ERR is rare case at vmexit, how about saving XFD_ERR
when preemption, mark flag=true and restore if non-zero before vcpu enter? This 
seems simple and direct way, drawback is if XFD_ERR is not changed when scheduled
out, KVM needs a wrmsr, but this only happens when it's non-zero&&flag=true.

Thanks,
Jing

> Paolo
diff mbox series

Patch

--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -12,6 +12,8 @@ 
 #define _ASM_X86_FPU_API_H
 #include <linux/bottom_half.h>
 
+#include <asm/fpu/types.h>
+
 /*
  * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
  * disables preemption so be careful if you intend to use it for long periods
@@ -108,4 +110,10 @@  extern int cpu_has_xfeatures(u64 xfeatur
 
 static inline void update_pasid(void) { }
 
+/* FPSTATE related functions which are exported to KVM */
+extern void fpu_init_fpstate_user(struct fpu *fpu);
+
+/* KVM specific functions */
+extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask);
+
 #endif /* _ASM_X86_FPU_API_H */
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -74,14 +74,8 @@  static __always_inline __pure bool use_f
 	return static_cpu_has(X86_FEATURE_FXSR);
 }
 
-/*
- * fpstate handling functions:
- */
-
 extern union fpregs_state init_fpstate;
-
 extern void fpstate_init_user(union fpregs_state *state);
-extern void fpu_init_fpstate_user(struct fpu *fpu);
 
 #ifdef CONFIG_MATH_EMULATION
 extern void fpstate_init_soft(struct swregs_state *soft);
@@ -381,12 +375,7 @@  static inline int os_xrstor_safe(struct
 	return err;
 }
 
-extern void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask);
-
-static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate)
-{
-	__restore_fpregs_from_fpstate(fpstate, xfeatures_mask_fpstate());
-}
+extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask);
 
 extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
 
@@ -467,7 +456,7 @@  static inline void fpregs_restore_userre
 		 */
 		mask = xfeatures_mask_restore_user() |
 			xfeatures_mask_supervisor();
-		__restore_fpregs_from_fpstate(&fpu->state, mask);
+		restore_fpregs_from_fpstate(&fpu->state, mask);
 
 		fpregs_activate(fpu);
 		fpu->last_cpu = cpu;
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -124,9 +124,8 @@  void save_fpregs_to_fpstate(struct fpu *
 	asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
 	frstor(&fpu->state.fsave);
 }
-EXPORT_SYMBOL(save_fpregs_to_fpstate);
 
-void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask)
+void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask)
 {
 	/*
 	 * AMD K7/K8 and later CPUs up to Zen don't save/restore
@@ -151,7 +150,31 @@  void __restore_fpregs_from_fpstate(union
 			frstor(&fpstate->fsave);
 	}
 }
-EXPORT_SYMBOL_GPL(__restore_fpregs_from_fpstate);
+
+#if IS_ENABLED(CONFIG_KVM)
+void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask)
+{
+	fpregs_lock();
+
+	if (save) {
+		if (test_thread_flag(TIF_NEED_FPU_LOAD)) {
+			memcpy(&save->state, &current->thread.fpu.state,
+			       fpu_kernel_xstate_size);
+		} else {
+			save_fpregs_to_fpstate(save);
+		}
+	}
+
+	if (rstor) {
+		restore_mask &= xfeatures_mask_fpstate();
+		restore_fpregs_from_fpstate(&rstor->state, restore_mask);
+	}
+
+	fpregs_mark_activate();
+	fpregs_unlock();
+}
+EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpu);
+#endif
 
 void kernel_fpu_begin_mask(unsigned int kfpu_mask)
 {
@@ -459,7 +482,6 @@  void fpregs_mark_activate(void)
 	fpu->last_cpu = smp_processor_id();
 	clear_thread_flag(TIF_NEED_FPU_LOAD);
 }
-EXPORT_SYMBOL_GPL(fpregs_mark_activate);
 
 /*
  * x87 math exception handling:
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -136,7 +136,6 @@  static void __init fpu__init_system_gene
  * components into a single, continuous memory block:
  */
 unsigned int fpu_kernel_xstate_size __ro_after_init;
-EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size);
 
 /* Get alignment of the TYPE. */
 #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -65,7 +65,6 @@  static short xsave_cpuid_features[] __in
  * XSAVE buffer, both supervisor and user xstates.
  */
 u64 xfeatures_mask_all __ro_after_init;
-EXPORT_SYMBOL_GPL(xfeatures_mask_all);
 
 static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
 	{ [ 0 ... XFEATURE_MAX - 1] = -1};
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -68,7 +68,9 @@ 
 #include <asm/mce.h>
 #include <asm/pkru.h>
 #include <linux/kernel_stat.h>
-#include <asm/fpu/internal.h> /* Ugh! */
+#include <asm/fpu/api.h>
+#include <asm/fpu/xcr.h>
+#include <asm/fpu/xstate.h>
 #include <asm/pvclock.h>
 #include <asm/div64.h>
 #include <asm/irq_remapping.h>
@@ -9899,58 +9901,27 @@  static int complete_emulated_mmio(struct
 	return 0;
 }
 
-static void kvm_save_current_fpu(struct fpu *fpu)
-{
-	/*
-	 * If the target FPU state is not resident in the CPU registers, just
-	 * memcpy() from current, else save CPU state directly to the target.
-	 */
-	if (test_thread_flag(TIF_NEED_FPU_LOAD))
-		memcpy(&fpu->state, &current->thread.fpu.state,
-		       fpu_kernel_xstate_size);
-	else
-		save_fpregs_to_fpstate(fpu);
-}
-
 /* Swap (qemu) user FPU context for the guest FPU context. */
 static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
-	fpregs_lock();
-
-	kvm_save_current_fpu(vcpu->arch.user_fpu);
-
 	/*
-	 * Guests with protected state can't have it set by the hypervisor,
-	 * so skip trying to set it.
+	 * Guest with protected state have guest_fpu == NULL which makes
+	 * the swap only safe the host state. Exclude PKRU from restore as
+	 * it is restored separately in kvm_x86_ops.run().
 	 */
-	if (vcpu->arch.guest_fpu)
-		/* PKRU is separately restored in kvm_x86_ops.run. */
-		__restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state,
-					~XFEATURE_MASK_PKRU);
-
-	fpregs_mark_activate();
-	fpregs_unlock();
-
+	fpu_swap_kvm_fpu(vcpu->arch.user_fpu, vcpu->arch.guest_fpu,
+			 ~XFEATURE_MASK_PKRU);
 	trace_kvm_fpu(1);
 }
 
 /* When vcpu_run ends, restore user space FPU context. */
 static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
-	fpregs_lock();
-
 	/*
-	 * Guests with protected state can't have it read by the hypervisor,
-	 * so skip trying to save it.
+	 * Guest with protected state have guest_fpu == NULL which makes
+	 * swap only restore the host state.
 	 */
-	if (vcpu->arch.guest_fpu)
-		kvm_save_current_fpu(vcpu->arch.guest_fpu);
-
-	restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state);
-
-	fpregs_mark_activate();
-	fpregs_unlock();
-
+	fpu_swap_kvm_fpu(vcpu->arch.guest_fpu, vcpu->arch.user_fpu, ~0ULL);
 	++vcpu->stat.fpu_reload;
 	trace_kvm_fpu(0);
 }
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -47,7 +47,7 @@  static bool ex_handler_fprestore(const s
 	WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
 		  (void *)instruction_pointer(regs));
 
-	__restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate());
+	restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate());
 	return true;
 }