
KVM:SVM: Flush cache only on CPUs running SEV guest

Message ID 1880816055.4545532.1709260250219.JavaMail.zimbra@sjtu.edu.cn (mailing list archive)
State New, archived
Series KVM:SVM: Flush cache only on CPUs running SEV guest

Commit Message

Zheyun Shen March 1, 2024, 2:30 a.m. UTC
On AMD CPUs that do not ensure cache coherency for encrypted memory, each memory
page reclamation in an SEV guest triggers a call to wbinvd_on_all_cpus, thereby
affecting the performance of other programs on the host.

Typically, an AMD server may have 128 cores or more, while the SEV guest might only
utilize 8 of these cores. Meanwhile, the host can use qemu-affinity to bind these
8 vCPUs to specific physical CPUs.

Therefore, keeping a record of the physical cores each vCPU has run on can help
avoid flushing the caches of all CPUs every time.

Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn>
---
 arch/x86/include/asm/smp.h |  1 +
 arch/x86/kvm/svm/sev.c     | 28 ++++++++++++++++++++++++----
 arch/x86/kvm/svm/svm.c     |  4 ++++
 arch/x86/kvm/svm/svm.h     |  3 +++
 arch/x86/lib/cache-smp.c   |  7 +++++++
 5 files changed, 39 insertions(+), 4 deletions(-)

--
2.34.1

Comments

Sean Christopherson March 4, 2024, 5:55 p.m. UTC | #1
+Tom

"KVM: SVM:" for the shortlog scope.

On Fri, Mar 01, 2024, Zheyun Shen wrote:
> On AMD CPUs without ensuring cache consistency, each memory page reclamation in
> an SEV guest triggers a call to wbinvd_on_all_cpus, thereby affecting the
> performance of other programs on the host.
> 
> Typically, an AMD server may have 128 cores or more, while the SEV guest might only
> utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs
> to specific physical CPUs.
> 
> Therefore, keeping a record of the physical core numbers each time a vCPU runs
> can help avoid flushing the cache for all CPUs every time.

This needs an unequivocal statement from AMD that flushing caches only on CPUs
that do VMRUN is sufficient.  That sounds like it should be obviously correct,
as I don't see how else a cache line can be dirtied for the encrypted PA, but
this entire non-coherent caches mess makes me more than a bit paranoid.

> Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn>
> ---
>  arch/x86/include/asm/smp.h |  1 +
>  arch/x86/kvm/svm/sev.c     | 28 ++++++++++++++++++++++++----
>  arch/x86/kvm/svm/svm.c     |  4 ++++
>  arch/x86/kvm/svm/svm.h     |  3 +++
>  arch/x86/lib/cache-smp.c   |  7 +++++++
>  5 files changed, 39 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
> index 4fab2ed45..19297202b 100644
> --- a/arch/x86/include/asm/smp.h
> +++ b/arch/x86/include/asm/smp.h
> @@ -120,6 +120,7 @@ void native_play_dead(void);
>  void play_dead_common(void);
>  void wbinvd_on_cpu(int cpu);
>  int wbinvd_on_all_cpus(void);
> +int wbinvd_on_cpus(struct cpumask *cpumask);

KVM already has an internal helper that does this, see kvm_emulate_wbinvd_noskip().
I'm not necessarily advocating that we keep KVM's internal code, but I don't want
two ways of doing the same thing.
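
(For reference, the mask-based flush KVM already does internally looks roughly
like the sketch below; this is illustrative, not a verbatim copy of
kvm_emulate_wbinvd_noskip(), and wbinvd_on_cpumask() is a made-up name.)

        static void wbinvd_ipi(void *unused)
        {
                wbinvd();       /* flush and invalidate this CPU's caches */
        }

        static void wbinvd_on_cpumask(const struct cpumask *mask)
        {
                /* IPI only the CPUs in @mask and wait for the flushes. */
                on_each_cpu_mask(mask, wbinvd_ipi, NULL, 1);
        }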

>  void smp_kick_mwait_play_dead(void);
>  
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index f760106c3..b6ed9a878 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -215,6 +215,21 @@ static void sev_asid_free(struct kvm_sev_info *sev)
>          sev->misc_cg = NULL;
>  }
>  
> +struct cpumask *sev_get_cpumask(struct kvm *kvm)
> +{
> +        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> +
> +        return &sev->cpumask;
> +}
> +
> +void sev_clear_cpumask(struct kvm *kvm)
> +{
> +        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> +
> +        cpumask_clear(&sev->cpumask);
> +}
> +
> +

Unnecessary newline.  But I would just delete these helpers.

>  static void sev_decommission(unsigned int handle)
>  {
>          struct sev_data_decommission decommission;
> @@ -255,6 +270,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
>          if (unlikely(sev->active))
>                  return ret;
>  
> +        cpumask_clear(&sev->cpumask);

This is unnecessary, the mask is zero allocated.

>          sev->active = true;
>          sev->es_active = argp->id == KVM_SEV_ES_INIT;
>          asid = sev_asid_new(sev);
> @@ -2048,7 +2064,8 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
>           * releasing the pages back to the system for use. CLFLUSH will
>           * not do this, so issue a WBINVD.
>           */
> -        wbinvd_on_all_cpus();
> +        wbinvd_on_cpus(sev_get_cpumask(kvm));
> +        sev_clear_cpumask(kvm);

Instead of copy+paste WBINVD+cpumask_clear() everywhere, add a prep patch to
replace relevant open coded calls to wbinvd_on_all_cpus() with calls to
sev_guest_memory_reclaimed().  Then only sev_guest_memory_reclaimed() needs to
be updated, and IMO it helps document why KVM is blasting WBINVD.

That's why I recommend deleting sev_get_cpumask() and sev_clear_cpumask(), there
really should only be two places that touch the mask itself: svm

>          __unregister_enc_region_locked(kvm, region);
>  
> @@ -2152,7 +2169,8 @@ void sev_vm_destroy(struct kvm *kvm)
>           * releasing the pages back to the system for use. CLFLUSH will
>           * not do this, so issue a WBINVD.
>           */
> -        wbinvd_on_all_cpus();
> +        wbinvd_on_cpus(sev_get_cpumask(kvm));
> +        sev_clear_cpumask(kvm);
>  
>          /*
>           * if userspace was terminated before unregistering the memory regions
> @@ -2343,7 +2361,8 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
>          return;
>  
>  do_wbinvd:
> -        wbinvd_on_all_cpus();
> +        wbinvd_on_cpus(sev_get_cpumask(vcpu->kvm));
> +        sev_clear_cpumask(vcpu->kvm);
>  }
>  
>  void sev_guest_memory_reclaimed(struct kvm *kvm)
> @@ -2351,7 +2370,8 @@ void sev_guest_memory_reclaimed(struct kvm *kvm)
>          if (!sev_guest(kvm))
>                  return;
>  
> -        wbinvd_on_all_cpus();
> +        wbinvd_on_cpus(sev_get_cpumask(kvm));
> +        sev_clear_cpumask(kvm);

This is unsafe from a correctness perspective, as sev_guest_memory_reclaimed()
is called without holding any KVM locks.  E.g. if a vCPU runs between blasting
WBINVD and cpumask_clear(), KVM will fail to emit WBINVD on a future reclaim.

Making the mask per-vCPU, a la vcpu->arch.wbinvd_dirty_mask, doesn't solve the
problem as KVM can't take vcpu->mutex in this path (sleeping may not be allowed),
and that would create an unnecessary/unwanted bottleneck.

The simplest solution I can think of is to iterate over all possible CPUs using
cpumask_test_and_clear_cpu().
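
For illustration, a rough sketch of that approach (the field name have_run_cpus
is made up here; the posted patch calls it "cpumask"):

        void sev_guest_memory_reclaimed(struct kvm *kvm)
        {
                struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
                int cpu;

                if (!sev_guest(kvm))
                        return;

                /*
                 * Clear each bit atomically before flushing that CPU.  A vCPU
                 * doing VMRUN concurrently re-sets its bit, so dirty lines it
                 * creates after this point are covered by the next reclaim
                 * rather than being lost.
                 */
                for_each_possible_cpu(cpu) {
                        if (cpumask_test_and_clear_cpu(cpu, &sev->have_run_cpus))
                                wbinvd_on_cpu(cpu);
                }
        }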

>  }
>  
>  void sev_free_vcpu(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index e90b429c8..f9bfa6e57 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -4107,6 +4107,10 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
>  
>          amd_clear_divider();
>  
> +    if (sev_guest(vcpu->kvm))

Use tabs, not spaces.

> +                cpumask_set_cpu(smp_processor_id(), sev_get_cpumask(vcpu->kvm));

This does not need to be in the noinstr region, and it _shouldn't_ be in the
noinstr region.  There's already a handy dandy pre_sev_run() that provides a
convenient location to bury this stuff in SEV specific code.
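
For example (sketch only, not the submitted patch; the existing pre_sev_run()
logic is elided and the mask field name is illustrative):

        /* arch/x86/kvm/svm/sev.c */
        void pre_sev_run(struct vcpu_svm *svm, int cpu)
        {
                struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;

                /* Record that this pCPU may now hold dirty encrypted cache lines. */
                cpumask_set_cpu(cpu, &sev->have_run_cpus);

                /* ... existing ASID assignment and TLB flush handling ... */
        }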

> +    
>          if (sev_es_guest(vcpu->kvm))
>                  __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
>          else
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 8ef95139c..1577e200e 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -90,6 +90,7 @@ struct kvm_sev_info {
>          struct list_head mirror_entry; /* Use as a list entry of mirrors */
>          struct misc_cg *misc_cg; /* For misc cgroup accounting */
>          atomic_t migration_in_progress;
> +        struct cpumask cpumask; /* CPU list to flush */

That is not a helpful comment.  Flush what?  What adds to the list?  When is the
list cleared?  Even the name is fairly useless, e.g. "

I'm also pretty sure this should be a cpumask_var_t, and dynamically allocated
as appropriate.  And at that point, it should be allocated and filled if and only
if the CPU doesn't have X86_FEATURE_SME_COHERENT.
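
E.g., roughly (sketch; the field name, GFP flags, and placement in
sev_guest_init() are illustrative):

        /* Allocate the mask only when a cache flush can actually be needed. */
        if (!boot_cpu_has(X86_FEATURE_SME_COHERENT) &&
            !zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT))
                return -ENOMEM;

with a matching free_cpumask_var() in sev_vm_destroy().
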
Tom Lendacky March 5, 2024, 10:51 p.m. UTC | #2
On 3/4/24 11:55, Sean Christopherson wrote:
> +Tom
> 
> "KVM: SVM:" for the shortlog scope.
> 
> On Fri, Mar 01, 2024, Zheyun Shen wrote:
>> On AMD CPUs without ensuring cache consistency, each memory page reclamation in
>> an SEV guest triggers a call to wbinvd_on_all_cpus, thereby affecting the
>> performance of other programs on the host.
>>
>> Typically, an AMD server may have 128 cores or more, while the SEV guest might only
>> utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs
>> to specific physical CPUs.
>>
>> Therefore, keeping a record of the physical core numbers each time a vCPU runs
>> can help avoid flushing the cache for all CPUs every time.
> 
> This needs an unequivocal statement from AMD that flushing caches only on CPUs
> that do VMRUN is sufficient.  That sounds like it should be obviously correct,
> as I don't see how else a cache line can be dirtied for the encrypted PA, but
> this entire non-coherent caches mess makes me more than a bit paranoid.

As long as the wbinvd_on_all_cpus() related to the ASID flushing isn't 
changed, this should be ok. And the code currently flushes the source 
pages when doing LAUNCH_UPDATE commands and adding encrypted regions, so 
should be good there.

Would it make sense to make this configurable, with the current behavior 
the default, until testing looks good for a while?

Thanks,
Tom

> 
>> Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn>
>> ---
>>   arch/x86/include/asm/smp.h |  1 +
>>   arch/x86/kvm/svm/sev.c     | 28 ++++++++++++++++++++++++----
>>   arch/x86/kvm/svm/svm.c     |  4 ++++
>>   arch/x86/kvm/svm/svm.h     |  3 +++
>>   arch/x86/lib/cache-smp.c   |  7 +++++++
>>   5 files changed, 39 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
>> index 4fab2ed45..19297202b 100644
>> --- a/arch/x86/include/asm/smp.h
>> +++ b/arch/x86/include/asm/smp.h
>> @@ -120,6 +120,7 @@ void native_play_dead(void);
>>   void play_dead_common(void);
>>   void wbinvd_on_cpu(int cpu);
>>   int wbinvd_on_all_cpus(void);
>> +int wbinvd_on_cpus(struct cpumask *cpumask);
> 
> KVM already has an internal helper that does this, see kvm_emulate_wbinvd_noskip().
> I'm not necessarily advocating that we keep KVM's internal code, but I don't want
> two ways of doing the same thing.
> 
>>   void smp_kick_mwait_play_dead(void);
>>   
>> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
>> index f760106c3..b6ed9a878 100644
>> --- a/arch/x86/kvm/svm/sev.c
>> +++ b/arch/x86/kvm/svm/sev.c
>> @@ -215,6 +215,21 @@ static void sev_asid_free(struct kvm_sev_info *sev)
>>           sev->misc_cg = NULL;
>>   }
>>   
>> +struct cpumask *sev_get_cpumask(struct kvm *kvm)
>> +{
>> +        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>> +
>> +        return &sev->cpumask;
>> +}
>> +
>> +void sev_clear_cpumask(struct kvm *kvm)
>> +{
>> +        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>> +
>> +        cpumask_clear(&sev->cpumask);
>> +}
>> +
>> +
> 
> Unnecessary newline.  But I would just delete these helpers.
> 
>>   static void sev_decommission(unsigned int handle)
>>   {
>>           struct sev_data_decommission decommission;
>> @@ -255,6 +270,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
>>           if (unlikely(sev->active))
>>                   return ret;
>>   
>> +        cpumask_clear(&sev->cpumask);
> 
> This is unnecessary, the mask is zero allocated.
> 
>>           sev->active = true;
>>           sev->es_active = argp->id == KVM_SEV_ES_INIT;
>>           asid = sev_asid_new(sev);
>> @@ -2048,7 +2064,8 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
>>            * releasing the pages back to the system for use. CLFLUSH will
>>            * not do this, so issue a WBINVD.
>>            */
>> -        wbinvd_on_all_cpus();
>> +        wbinvd_on_cpus(sev_get_cpumask(kvm));
>> +        sev_clear_cpumask(kvm);
> 
> Instead of copy+paste WBINVD+cpumask_clear() everywhere, add a prep patch to
> replace relevant open coded calls to wbinvd_on_all_cpus() with calls to
> sev_guest_memory_reclaimed().  Then only sev_guest_memory_reclaimed() needs to
> updated, and IMO it helps document why KVM is blasting WBINVD.
> 
> That's why I recommend deleting sev_get_cpumask() and sev_clear_cpumask(), there
> really should only be two places that touch the mask itself: svm
> 
>>           __unregister_enc_region_locked(kvm, region);
>>   
>> @@ -2152,7 +2169,8 @@ void sev_vm_destroy(struct kvm *kvm)
>>            * releasing the pages back to the system for use. CLFLUSH will
>>            * not do this, so issue a WBINVD.
>>            */
>> -        wbinvd_on_all_cpus();
>> +        wbinvd_on_cpus(sev_get_cpumask(kvm));
>> +        sev_clear_cpumask(kvm);
>>   
>>           /*
>>            * if userspace was terminated before unregistering the memory regions
>> @@ -2343,7 +2361,8 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
>>           return;
>>   
>>   do_wbinvd:
>> -        wbinvd_on_all_cpus();
>> +        wbinvd_on_cpus(sev_get_cpumask(vcpu->kvm));
>> +        sev_clear_cpumask(vcpu->kvm);
>>   }
>>   
>>   void sev_guest_memory_reclaimed(struct kvm *kvm)
>> @@ -2351,7 +2370,8 @@ void sev_guest_memory_reclaimed(struct kvm *kvm)
>>           if (!sev_guest(kvm))
>>                   return;
>>   
>> -        wbinvd_on_all_cpus();
>> +        wbinvd_on_cpus(sev_get_cpumask(kvm));
>> +        sev_clear_cpumask(kvm);
> 
> This is unsafe from a correctness perspective, as sev_guest_memory_reclaimed()
> is called without holding any KVM locks.  E.g. if a vCPU runs between blasting
> WBINVD and cpumask_clear(), KVM will fail to emit WBINVD on a future reclaim.
> 
> Making the mask per-vCPU, a la vcpu->arch.wbinvd_dirty_mask, doesn't solve the
> problem as KVM can't take vcpu->mutex in this path (sleeping may not be allowed),
> and that would create an unnecessary/unwated bottleneck.
> 
> The simplest solution I can think of is to iterate over all possible CPUs using
> cpumask_test_and_clear_cpu().
> 
>>   }
>>   
>>   void sev_free_vcpu(struct kvm_vcpu *vcpu)
>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>> index e90b429c8..f9bfa6e57 100644
>> --- a/arch/x86/kvm/svm/svm.c
>> +++ b/arch/x86/kvm/svm/svm.c
>> @@ -4107,6 +4107,10 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
>>   
>>           amd_clear_divider();
>>   
>> +    if (sev_guest(vcpu->kvm))
> 
> Use tabs, not spaces.
> 
>> +                cpumask_set_cpu(smp_processor_id(), sev_get_cpumask(vcpu->kvm));
> 
> This does not need to be in the noinstr region, and it _shouldn't_ be in the
> noinstr region.  There's already a handy dandy pre_sev_run() that provides a
> convenient location to bury this stuff in SEV specific code.
> 
>> +
>>           if (sev_es_guest(vcpu->kvm))
>>                   __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
>>           else
>> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
>> index 8ef95139c..1577e200e 100644
>> --- a/arch/x86/kvm/svm/svm.h
>> +++ b/arch/x86/kvm/svm/svm.h
>> @@ -90,6 +90,7 @@ struct kvm_sev_info {
>>           struct list_head mirror_entry; /* Use as a list entry of mirrors */
>>           struct misc_cg *misc_cg; /* For misc cgroup accounting */
>>           atomic_t migration_in_progress;
>> +        struct cpumask cpumask; /* CPU list to flush */
> 
> That is not a helpful comment.  Flush what?  What adds to the list?  When is the
> list cleared.  Even the name is fairly useless, e.g. "
> 
> I'm also pretty sure this should be a cpumask_var_t, and dynamically allocated
> as appropriate.  And at that point, it should be allocated and filled if and only
> if the CPU doesn't have X86_FEATURE_SME_COHERENT.
Sean Christopherson March 6, 2024, 12:19 a.m. UTC | #3
On Tue, Mar 05, 2024, Tom Lendacky wrote:
> On 3/4/24 11:55, Sean Christopherson wrote:
> > +Tom
> > 
> > "KVM: SVM:" for the shortlog scope.
> > 
> > On Fri, Mar 01, 2024, Zheyun Shen wrote:
> > > On AMD CPUs without ensuring cache consistency, each memory page reclamation in
> > > an SEV guest triggers a call to wbinvd_on_all_cpus, thereby affecting the
> > > performance of other programs on the host.
> > > 
> > > Typically, an AMD server may have 128 cores or more, while the SEV guest might only
> > > utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs
> > > to specific physical CPUs.
> > > 
> > > Therefore, keeping a record of the physical core numbers each time a vCPU runs
> > > can help avoid flushing the cache for all CPUs every time.
> > 
> > This needs an unequivocal statement from AMD that flushing caches only on CPUs
> > that do VMRUN is sufficient.  That sounds like it should be obviously correct,
> > as I don't see how else a cache line can be dirtied for the encrypted PA, but
> > this entire non-coherent caches mess makes me more than a bit paranoid.
> 
> As long as the wbinvd_on_all_cpus() related to the ASID flushing isn't
> changed, this should be ok. And the code currently flushes the source pages
> when doing LAUNCH_UPDATE commands and adding encrypted regions, so should be
> good there.

Nice, thanks!

> Would it make sense to make this configurable, with the current behavior the
> default, until testing looks good for a while?

I don't hate the idea, but I'm inclined to hit the "I'm feeling lucky" button.
I would rather we put in effort to all but guarantee we can do a clean revert in
the future, at which point a kill switch doesn't add all that much value.  E.g.
it would allow for a non-disruptive fix, and maybe a slightly faster confirmation
of a bug, but that's about it.

And since the fallout from this would be host data corruption, _not_ rebooting
hosts that may have been corrupted is probably a bad idea, i.e. the whole
non-disruptive fix benefit is quite dubious.

The other issue is that it'd be extremely difficult to know when we could/should
remove the kill switch.  It might be months or even years before anyone starts
running high volume of SEV/SEV-ES VMs with this optimization.
Tom Lendacky March 6, 2024, 4:26 p.m. UTC | #4
On 3/5/24 18:19, Sean Christopherson wrote:
> On Tue, Mar 05, 2024, Tom Lendacky wrote:
>> On 3/4/24 11:55, Sean Christopherson wrote:
>>> +Tom
>>>
>>> "KVM: SVM:" for the shortlog scope.
>>>
>>> On Fri, Mar 01, 2024, Zheyun Shen wrote:
>>>> On AMD CPUs without ensuring cache consistency, each memory page reclamation in
>>>> an SEV guest triggers a call to wbinvd_on_all_cpus, thereby affecting the
>>>> performance of other programs on the host.
>>>>
>>>> Typically, an AMD server may have 128 cores or more, while the SEV guest might only
>>>> utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs
>>>> to specific physical CPUs.
>>>>
>>>> Therefore, keeping a record of the physical core numbers each time a vCPU runs
>>>> can help avoid flushing the cache for all CPUs every time.
>>>
>>> This needs an unequivocal statement from AMD that flushing caches only on CPUs
>>> that do VMRUN is sufficient.  That sounds like it should be obviously correct,
>>> as I don't see how else a cache line can be dirtied for the encrypted PA, but
>>> this entire non-coherent caches mess makes me more than a bit paranoid.
>>
>> As long as the wbinvd_on_all_cpus() related to the ASID flushing isn't
>> changed, this should be ok. And the code currently flushes the source pages
>> when doing LAUNCH_UPDATE commands and adding encrypted regions, so should be
>> good there.
> 
> Nice, thanks!
> 
>> Would it make sense to make this configurable, with the current behavior the
>> default, until testing looks good for a while?
> 
> I don't hate the idea, but I'm inclined to hit the "I'm feeling lucky" button.
> I would rather we put in effort to all but guarantee we can do a clean revert in
> the future, at which point a kill switch doesn't add all that much value.  E.g.
> it would allow for a non-disruptive fix, and maybe a slightly faster confirmation
> of a bug, but that's about it.
> 
> And since the fallout from this would be host data corruption, _not_ rebooting
> hosts that may have been corrupted is probably a bad idea, i.e. the whole
> non-disruptive fix benefit is quite dubious.
> 
> The other issue is that it'd be extremely difficult to know when we could/should
> remove the kill switch.  It might be months or even years before anyone starts
> running high volume of SEV/SEV-ES VMs with this optimization.

I can run the next version of the patch through our CI and see if it 
uncovers anything. I just worry about corner cases... but then that's just me.

Thanks,
Tom
Sean Christopherson March 6, 2024, 4:45 p.m. UTC | #5
On Wed, Mar 06, 2024, Tom Lendacky wrote:
> On 3/5/24 18:19, Sean Christopherson wrote:
> > On Tue, Mar 05, 2024, Tom Lendacky wrote:
> > > On 3/4/24 11:55, Sean Christopherson wrote:
> > > > +Tom
> > > > 
> > > > "KVM: SVM:" for the shortlog scope.
> > > > 
> > > > On Fri, Mar 01, 2024, Zheyun Shen wrote:
> > > > > On AMD CPUs without ensuring cache consistency, each memory page reclamation in
> > > > > an SEV guest triggers a call to wbinvd_on_all_cpus, thereby affecting the
> > > > > performance of other programs on the host.
> > > > > 
> > > > > Typically, an AMD server may have 128 cores or more, while the SEV guest might only
> > > > > utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs
> > > > > to specific physical CPUs.
> > > > > 
> > > > > Therefore, keeping a record of the physical core numbers each time a vCPU runs
> > > > > can help avoid flushing the cache for all CPUs every time.
> > > > 
> > > > This needs an unequivocal statement from AMD that flushing caches only on CPUs
> > > > that do VMRUN is sufficient.  That sounds like it should be obviously correct,
> > > > as I don't see how else a cache line can be dirtied for the encrypted PA, but
> > > > this entire non-coherent caches mess makes me more than a bit paranoid.
> > > 
> > > As long as the wbinvd_on_all_cpus() related to the ASID flushing isn't
> > > changed, this should be ok. And the code currently flushes the source pages
> > > when doing LAUNCH_UPDATE commands and adding encrypted regions, so should be
> > > good there.
> > 
> > Nice, thanks!
> > 
> > > Would it make sense to make this configurable, with the current behavior the
> > > default, until testing looks good for a while?
> > 
> > I don't hate the idea, but I'm inclined to hit the "I'm feeling lucky" button.
> > I would rather we put in effort to all but guarantee we can do a clean revert in
> > the future, at which point a kill switch doesn't add all that much value.  E.g.
> > it would allow for a non-disruptive fix, and maybe a slightly faster confirmation
> > of a bug, but that's about it.
> > 
> > And since the fallout from this would be host data corruption, _not_ rebooting
> > hosts that may have been corrupted is probably a bad idea, i.e. the whole
> > non-disruptive fix benefit is quite dubious.
> > 
> > The other issue is that it'd be extremely difficult to know when we could/should
> > remove the kill switch.  It might be months or even years before anyone starts
> > running high volume of SEV/SEV-ES VMs with this optimization.
> 
> I can run the next version of the patch through our CI and see if it
> uncovers anything. I just worry about corner cases... but then that's just
> me.

Heh, it's definitely not just you, this scares me more than a bit.

Doh, I realize I misread your suggestion (several times).  You're suggesting we
make this opt-in.  Hmm, that's definitely more valuable than a kill switch, though
it has the same problem of us having no idea when it's safe to enable by default.
And I'm not sure I like the idea of having a knob that basically says, "we think
this works, but we're too scared to enable it by default, so _you_ should totally
enable it and let us know if we've corrupted your data" :-)

Patch

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4fab2ed45..19297202b 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -120,6 +120,7 @@  void native_play_dead(void);
 void play_dead_common(void);
 void wbinvd_on_cpu(int cpu);
 int wbinvd_on_all_cpus(void);
+int wbinvd_on_cpus(struct cpumask *cpumask);
 
 void smp_kick_mwait_play_dead(void);
 
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index f760106c3..b6ed9a878 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -215,6 +215,21 @@  static void sev_asid_free(struct kvm_sev_info *sev)
         sev->misc_cg = NULL;
 }
 
+struct cpumask *sev_get_cpumask(struct kvm *kvm)
+{
+        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+        return &sev->cpumask;
+}
+
+void sev_clear_cpumask(struct kvm *kvm)
+{
+        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+        cpumask_clear(&sev->cpumask);
+}
+
+
 static void sev_decommission(unsigned int handle)
 {
         struct sev_data_decommission decommission;
@@ -255,6 +270,7 @@  static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
         if (unlikely(sev->active))
                 return ret;
 
+        cpumask_clear(&sev->cpumask);
         sev->active = true;
         sev->es_active = argp->id == KVM_SEV_ES_INIT;
         asid = sev_asid_new(sev);
@@ -2048,7 +2064,8 @@  int sev_mem_enc_unregister_region(struct kvm *kvm,
          * releasing the pages back to the system for use. CLFLUSH will
          * not do this, so issue a WBINVD.
          */
-        wbinvd_on_all_cpus();
+        wbinvd_on_cpus(sev_get_cpumask(kvm));
+        sev_clear_cpumask(kvm);
 
         __unregister_enc_region_locked(kvm, region);
 
@@ -2152,7 +2169,8 @@  void sev_vm_destroy(struct kvm *kvm)
          * releasing the pages back to the system for use. CLFLUSH will
          * not do this, so issue a WBINVD.
          */
-        wbinvd_on_all_cpus();
+        wbinvd_on_cpus(sev_get_cpumask(kvm));
+        sev_clear_cpumask(kvm);
 
         /*
          * if userspace was terminated before unregistering the memory regions
@@ -2343,7 +2361,8 @@  static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
         return;
 
 do_wbinvd:
-        wbinvd_on_all_cpus();
+        wbinvd_on_cpus(sev_get_cpumask(vcpu->kvm));
+        sev_clear_cpumask(vcpu->kvm);
 }
 
 void sev_guest_memory_reclaimed(struct kvm *kvm)
@@ -2351,7 +2370,8 @@  void sev_guest_memory_reclaimed(struct kvm *kvm)
         if (!sev_guest(kvm))
                 return;
 
-        wbinvd_on_all_cpus();
+        wbinvd_on_cpus(sev_get_cpumask(kvm));
+        sev_clear_cpumask(kvm);
 }
 
 void sev_free_vcpu(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e90b429c8..f9bfa6e57 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4107,6 +4107,10 @@  static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
 
         amd_clear_divider();
 
+    if (sev_guest(vcpu->kvm))
+                cpumask_set_cpu(smp_processor_id(), sev_get_cpumask(vcpu->kvm));
+    
         if (sev_es_guest(vcpu->kvm))
                 __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
         else
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 8ef95139c..1577e200e 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -90,6 +90,7 @@  struct kvm_sev_info {
         struct list_head mirror_entry; /* Use as a list entry of mirrors */
         struct misc_cg *misc_cg; /* For misc cgroup accounting */
         atomic_t migration_in_progress;
+        struct cpumask cpumask; /* CPU list to flush */
 };
 
 struct kvm_svm {
@@ -694,6 +695,8 @@  void sev_es_vcpu_reset(struct vcpu_svm *svm);
 void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
 void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa);
 void sev_es_unmap_ghcb(struct vcpu_svm *svm);
+struct cpumask *sev_get_cpumask(struct kvm *kvm);
+void sev_clear_cpumask(struct kvm *kvm);
 
 /* vmenter.S */
 
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
index 7af743bd3..8806f53ba 100644
--- a/arch/x86/lib/cache-smp.c
+++ b/arch/x86/lib/cache-smp.c
@@ -20,3 +20,10 @@  int wbinvd_on_all_cpus(void)
         return 0;
 }
 EXPORT_SYMBOL(wbinvd_on_all_cpus);
+
+int wbinvd_on_cpus(struct cpumask *mask)
+{
+    on_each_cpu_cond_mask(NULL, __wbinvd, NULL, 1, mask);
+    return 0;
+}
+EXPORT_SYMBOL(wbinvd_on_cpus);