diff mbox series

KVM: SVM: use vmsave/vmload for saving/restoring additional host state

Message ID 20201210174814.1122585-1-michael.roth@amd.com (mailing list archive)
State New, archived
Headers show
Series KVM: SVM: use vmsave/vmload for saving/restoring additional host state | expand

Commit Message

Michael Roth Dec. 10, 2020, 5:48 p.m. UTC
Using a guest workload which simply issues 'hlt' in a tight loop to
generate VMEXITs, it was observed (on a recent EPYC processor) that a
significant amount of the VMEXIT overhead measured on the host was the
result of MSR reads/writes in svm_vcpu_load/svm_vcpu_put according to
perf:

  67.49%--kvm_arch_vcpu_ioctl_run
          |
          |--23.13%--vcpu_put
          |          kvm_arch_vcpu_put
          |          |
          |          |--21.31%--native_write_msr
          |          |
          |           --1.27%--svm_set_cr4
          |
          |--16.11%--vcpu_load
          |          |
          |           --15.58%--kvm_arch_vcpu_load
          |                     |
          |                     |--13.97%--svm_set_cr4
          |                     |          |
          |                     |          |--12.64%--native_read_msr

Most of these MSRs relate to 'syscall'/'sysenter' and segment bases, and
can be saved/restored using 'vmsave'/'vmload' instructions rather than
explicit MSR reads/writes. In doing so there is a significant reduction
in the svm_vcpu_load/svm_vcpu_put overhead measured for the above
workload:

  50.92%--kvm_arch_vcpu_ioctl_run
          |
          |--19.28%--disable_nmi_singlestep
          |
          |--13.68%--vcpu_load
          |          kvm_arch_vcpu_load
          |          |
          |          |--9.19%--svm_set_cr4
          |          |          |
          |          |           --6.44%--native_read_msr
          |          |
          |           --3.55%--native_write_msr
          |
          |--6.05%--kvm_inject_nmi
          |--2.80%--kvm_sev_es_mmio_read
          |--2.19%--vcpu_put
          |          |
          |           --1.25%--kvm_arch_vcpu_put
          |                     native_write_msr

Quantifying this further, if we look at the raw cycle counts for a
normal iteration of the above workload (according to 'rdtscp'),
kvm_arch_vcpu_ioctl_run() takes ~4600 cycles from start to finish with
the current behavior. Using 'vmsave'/'vmload', this is reduced to
~2800 cycles, a savings of 39%.

While this approach doesn't seem to manifest in any noticeable
improvement for more realistic workloads like UnixBench, netperf, and
kernel builds, likely due to their exit paths generally involving IO
with comparatively high latencies, it does improve overall overhead
of KVM_RUN significantly, which may still be noticeable for certain
situations. It also simplifies some aspects of the code.

With this change, explicit save/restore is no longer needed for the
following host MSRs, since they are documented[1] as being part of the
VMCB State Save Area:

  MSR_STAR, MSR_LSTAR, MSR_CSTAR,
  MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
  MSR_IA32_SYSENTER_CS,
  MSR_IA32_SYSENTER_ESP,
  MSR_IA32_SYSENTER_EIP,
  MSR_FS_BASE, MSR_GS_BASE

and only the following MSR needs individual handling in
svm_vcpu_put/svm_vcpu_load:

  MSR_TSC_AUX

We could drop the host_save_user_msrs array/loop and instead handle
MSR read/write of MSR_TSC_AUX directly, but we leave that for now as
a potential follow-up.

Since 'vmsave'/'vmload' also handles the LDTR and FS/GS segment
registers (and associated hidden state)[2], some of the code
previously used to handle this is no longer needed, so we drop it
as well.

The first public release of the SVM spec[3] also documents the same
handling for the host state in question, so we make these changes
unconditionally.

Also worth noting is that we 'vmsave' to the same page that is
subsequently used by 'vmrun' to record some additional host state. This
is okay, since, in accordance with the spec[2], the additional state
written to the page by 'vmrun' does not overwrite any fields written by
'vmsave'. This has also been confirmed through testing (for the above
CPU, at least).

[1] AMD64 Architecture Programmer's Manual, Rev 3.33, Volume 2, Appendix B, Table B-2
[2] AMD64 Architecture Programmer's Manual, Rev 3.31, Volume 3, Chapter 4, VMSAVE/VMLOAD
[3] Secure Virtual Machine Architecture Reference Manual, Rev 3.01

Suggested-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
---
 arch/x86/kvm/svm/svm.c | 48 ++++++++++++++++++++++++++----------------
 arch/x86/kvm/svm/svm.h |  6 ------
 2 files changed, 30 insertions(+), 24 deletions(-)

Comments

Andy Lutomirski Dec. 10, 2020, 7:23 p.m. UTC | #1
On Thu, Dec 10, 2020 at 9:52 AM Michael Roth <michael.roth@amd.com> wrote:
>   MSR_STAR, MSR_LSTAR, MSR_CSTAR,
>   MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
>   MSR_IA32_SYSENTER_CS,
>   MSR_IA32_SYSENTER_ESP,
>   MSR_IA32_SYSENTER_EIP,
>   MSR_FS_BASE, MSR_GS_BASE

Can you get rid of all the old FS/GS manipulation at the same time?

> +       for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
> +               rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
> +       }
> +
> +       asm volatile(__ex("vmsave")
> +                    : : "a" (page_to_pfn(sd->save_area) << PAGE_SHIFT)
> +                    : "memory");
> +       /*
> +        * Host FS/GS segment registers might be restored soon after
> +        * vmexit, prior to vmload of host save area. Even though this
> +        * state is now saved in the host's save area, we cannot use
> +        * per-cpu accesses until these registers are restored, so we
> +        * store a copy in the VCPU struct to make sure they are
> +        * accessible.
> +        */
>  #ifdef CONFIG_X86_64
> -       rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
> +       svm->host.gs_base = hostsa->gs.base;
>  #endif

For example, this comment makes no sense to me.  Just let VMLOAD
restore FS/GS and be done with it.  Don't copy those gs_base and
gs.base fields -- just delete them please.  (Or are they needed for
nested virt for some reason?  If so, please document that.)

> -       savesegment(fs, svm->host.fs);
> -       savesegment(gs, svm->host.gs);
> -       svm->host.ldt = kvm_read_ldt();
> -
> -       for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
> -               rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
> +       svm->host.fs = hostsa->fs.selector;
> +       svm->host.gs = hostsa->gs.selector;

This too.  Why is the host code thinking about selectors at all?

> -       kvm_load_ldt(svm->host.ldt);

I have a patch that deletes this, too.  Don't worry about the conflict
-- I'll sort it out.

> @@ -120,7 +115,6 @@ struct vcpu_svm {
>         struct {
>                 u16 fs;
>                 u16 gs;
> -               u16 ldt;
>                 u64 gs_base;
>         } host;

Shouldn't you be about to delete fs, gs, and gs_base too?
Andy Lutomirski Dec. 10, 2020, 11:49 p.m. UTC | #2
> On Dec 10, 2020, at 1:27 PM, Michael Roth <michael.roth@amd.com> wrote:
>
> Quoting Andy Lutomirski (2020-12-10 13:23:19)
>>> On Thu, Dec 10, 2020 at 9:52 AM Michael Roth <michael.roth@amd.com> wrote:
>>>  MSR_STAR, MSR_LSTAR, MSR_CSTAR,
>>>  MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
>>>  MSR_IA32_SYSENTER_CS,
>>>  MSR_IA32_SYSENTER_ESP,
>>>  MSR_IA32_SYSENTER_EIP,
>>>  MSR_FS_BASE, MSR_GS_BASE
>>
>> Can you get rid of all the old FS/GS manipulation at the same time?
>>
>>> +       for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
>>> +               rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
>>> +       }
>>> +
>>> +       asm volatile(__ex("vmsave")
>>> +                    : : "a" (page_to_pfn(sd->save_area) << PAGE_SHIFT)
>>> +                    : "memory");
>>> +       /*
>>> +        * Host FS/GS segment registers might be restored soon after
>>> +        * vmexit, prior to vmload of host save area. Even though this
>>> +        * state is now saved in the host's save area, we cannot use
>>> +        * per-cpu accesses until these registers are restored, so we
>>> +        * store a copy in the VCPU struct to make sure they are
>>> +        * accessible.
>>> +        */
>>> #ifdef CONFIG_X86_64
>>> -       rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
>>> +       svm->host.gs_base = hostsa->gs.base;
>>> #endif
>>
>> For example, this comment makes no sense to me.  Just let VMLOAD
>> restore FS/GS and be done with it.  Don't copy those gs_base and
>> gs.base fields -- just delete them please.  (Or are they needed for
>> nested virt for some reason?  If so, please document that.)
>
> Hi Andy,
>
> The main issue is that we restore FS/GS immediately after a vmexit since
> we need them soon-after for things like per-cpu accesses, but the rest
> of the host state only needs to be restored if we're exiting all the way
> out to userspace. That's also why we store a copy of the values, since
> the host can't access the per-cpu save_area beforehand.

>
> In theory I think we probably could use vmload to restore this state
> immediately after vmexit as you're suggesting, but then we will end up
> taking a performance hit for cases where the vmexit can be handled within
> the kernel, which might leave us worse-off than the pre-patch behavior
> for those cases (handling an MMIO for a virtqueue notification when
> vhost_net is enabled, for instance)

Please benchmark this.  WRMSR to MSR_GS_BASE is serializing and may
well be slower than VMLOAD.

>
>>
>>> -       savesegment(fs, svm->host.fs);
>>> -       savesegment(gs, svm->host.gs);
>>> -       svm->host.ldt = kvm_read_ldt();
>>> -
>>> -       for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
>>> -               rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
>>> +       svm->host.fs = hostsa->fs.selector;
>>> +       svm->host.gs = hostsa->gs.selector;
>>
>> This too.  Why is the host code thinking about selectors at all?
>>
>>> -       kvm_load_ldt(svm->host.ldt);
>>
>> I have a patch that deletes this, too.  Don't worry about the conflict
>> -- I'll sort it out.
>>
>>> @@ -120,7 +115,6 @@ struct vcpu_svm {
>>>        struct {
>>>                u16 fs;
>>>                u16 gs;
>>> -               u16 ldt;
>>>                u64 gs_base;
>>>        } host;
>>
>> Shouldn't you be about to delete fs, gs, and gs_base too?
>
> For the reasons above it seems like they'd need to be there in some form,
> though we could maybe replace them with a pointer to the per-cpu save_area
> so that they can be accessed directly before GS segment is restored.

I’m confused. Why would you be accessing them before VMLOAD?  These
are host values.

I think there are two reasonable ways to do this:

1. VMLOAD before STGI.  This is obviously correct, and it's quite simple.

2. Save cpu_kernelmode_gs_base(cpu) before VM entry, and restore that
value to MSR_GS_BASE using code like this (or its asm equivalent)
before STGI:

if (static_cpu_has(X86_FEATURE_FSGSBASE))
  wrgsbase(base);
else
  wrmsr...

and then VMLOAD in the vcpu_put() path.

I can't think of any reason to use loadsegment(), load_gs_index(), or
savesegment() at all, nor can I think of any reason to touch
MSR_KERNEL_GS_BASE or MSR_FS_BASE.

--Andy
Andy Lutomirski Dec. 11, 2020, 1:10 a.m. UTC | #3
> On Dec 10, 2020, at 4:48 PM, Michael Roth <michael.roth@amd.com> wrote:
>

>> I think there are two reasonable ways to do this:
>>
>> 1. VMLOAD before STGI.  This is obviously correct, and it's quite simple.
>
> For the testing I ended up putting it immediately after __svm_vcpu_run()
> since that's where we restored GS base previously. Does that seem okay or did
> you have another place in mind?

Looks okay.  If we get an NMI or MCE with the wrong MSR_GS_BASE, then
we are toast, at least on Zen 2 and earlier.  But that spot has GI ==
0, so this won't happen.

>
>>
>> 2. Save cpu_kernelmode_gs_base(cpu) before VM entry, and restore that
>> value to MSR_GS_BASE using code like this (or its asm equivalent)
>> before STGI:
>>
>> if (static_cpu_has(X86_FEATURE_FSGSBASE))
>>  wrgsbase(base);
>> else
>>  wrmsr...
>>
>> and then VMLOAD in the vcpu_put() path.
>>
>> I can't think of any reason to use loadsegment(), load_gs_index(), or
>> savesegment() at all, nor can I think of any reason to touch
>> MSR_KERNEL_GS_BASE or MSR_FS_BASE.
>
> I'm sort of lumping MSR_GS_BASE restoration in with everything else since I
> don't fully understand what the original code was doing either and was content
> to leave it be if we couldn't use VMLOAD to handle it without a performance
> regression, but since it looks like we can use VMLOAD here instead I agree
> we should just drop it all.

The original code is entirely bogus. Don’t try too hard to understand
how it’s correct — I’m pretty sure it’s not. In fact, I was planning
to write a patch a lot like yours, but I don’t have an SVM-capable CPU
to test on.  In general, I suspect that you could delete all these
fields from the structs, see what fails to compile, and fix it pretty
easily.

MSR_GS_BASE is kernel state. (On 32-bit, fs and maybe gs are kernel
state.). Everything else is host *user* state and isn’t touched by
normal kernel code.
Sean Christopherson Dec. 11, 2020, 1:27 a.m. UTC | #4
Michael, please reply to all so that everyone can read along and so that the
conversation gets recorded in the various mailing list archives.

If you are replying all, then I think something funky is going on with AMD's
mail servers, as I'm not getting your responses (I double checked SPAM), nor are
they showing up on lore.

On Thu, Dec 10, 2020, Andy Lutomirski wrote:
> > On Dec 10, 2020, at 4:48 PM, Michael Roth <michael.roth@amd.com> wrote:
> >
> 
> >> I think there are two reasonable ways to do this:
> >>
> >> 1. VMLOAD before STGI.  This is obviously correct, and it's quite simple.
> >
> > For the testing I ended up putting it immediately after __svm_vcpu_run()
> > since that's where we restored GS base previously. Does that seem okay or did
> > you have another place in mind?
> 
> Looks okay.  If we get an NMI or MCE with the wrong MSR_GS_BASE, then
> we are toast, at least on Zen 2 and earlier.  But that spot has GI ==
> 0, so this won't happen.
Paolo Bonzini Dec. 11, 2020, 11:42 a.m. UTC | #5
On 11/12/20 02:27, Sean Christopherson wrote:
> Michael, please reply to all so that everyone can read along and so that the
> conversation gets recorded in the various mailing list archives.
> 
> If you are replying all, then I think something funky is going on with AMD's
> mail servers, as I'm not getting your responses (I double checked SPAM), nor are
> they showing up on lore.

I think somebody else reported similar issues with AMD's mail servers.

Paolo
Paolo Bonzini Dec. 11, 2020, 1:39 p.m. UTC | #6
On 11/12/20 14:28, Michael Roth wrote:
> Quoting Paolo Bonzini (2020-12-11 05:42:02)
>> On 11/12/20 02:27, Sean Christopherson wrote:
>>> Michael, please reply to all so that everyone can read along and so that the
>>> conversation gets recorded in the various mailing list archives.
>>>
>>> If you are replying all, then I think something funky is going on with AMD's
>>> mail servers, as I'm not getting your responses (I double checked SPAM), nor are
>>> they showing up on lore.
>> I think somebody else reported similar issues with AMD's mail servers.
> It looks like my messages did make their way to lore eventually:
> 
>    https://lore.kernel.org/patchwork/patch/1352128/

Ok, so maybe some graylisting.  If it fixes itself it's not a big deal.

Paolo
diff mbox series

Patch

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 6dc337b9c231..d6578bca0a77 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1360,6 +1360,8 @@  static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+	struct vmcb_save_area *hostsa =
+		(struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
 	int i;
 
 	if (unlikely(cpu != vcpu->cpu)) {
@@ -1367,15 +1369,26 @@  static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		vmcb_mark_all_dirty(svm->vmcb);
 	}
 
+	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
+		rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+	}
+
+	asm volatile(__ex("vmsave")
+		     : : "a" (page_to_pfn(sd->save_area) << PAGE_SHIFT)
+		     : "memory");
+	/*
+	 * Host FS/GS segment registers might be restored soon after
+	 * vmexit, prior to vmload of host save area. Even though this
+	 * state is now saved in the host's save area, we cannot use
+	 * per-cpu accesses until these registers are restored, so we
+	 * store a copy in the VCPU struct to make sure they are
+	 * accessible.
+	 */
 #ifdef CONFIG_X86_64
-	rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
+	svm->host.gs_base = hostsa->gs.base;
 #endif
-	savesegment(fs, svm->host.fs);
-	savesegment(gs, svm->host.gs);
-	svm->host.ldt = kvm_read_ldt();
-
-	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
-		rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+	svm->host.fs = hostsa->fs.selector;
+	svm->host.gs = hostsa->gs.selector;
 
 	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 		u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
@@ -1398,23 +1411,22 @@  static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
+	int cpu = get_cpu();
+	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
 	int i;
 
 	avic_vcpu_put(vcpu);
 
 	++vcpu->stat.host_state_reload;
-	kvm_load_ldt(svm->host.ldt);
-#ifdef CONFIG_X86_64
-	loadsegment(fs, svm->host.fs);
-	wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
-	load_gs_index(svm->host.gs);
-#else
-#ifdef CONFIG_X86_32_LAZY_GS
-	loadsegment(gs, svm->host.gs);
-#endif
-#endif
-	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+
+	asm volatile(__ex("vmload")
+		     : : "a" (page_to_pfn(sd->save_area) << PAGE_SHIFT));
+
+	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
 		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+	}
+
+	put_cpu();
 }
 
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index fdff76eb6ceb..fa7d7e4f9d3d 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -21,11 +21,6 @@ 
 #include <asm/svm.h>
 
 static const u32 host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
-	MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
-	MSR_FS_BASE,
-#endif
-	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 	MSR_TSC_AUX,
 };
 
@@ -120,7 +115,6 @@  struct vcpu_svm {
 	struct {
 		u16 fs;
 		u16 gs;
-		u16 ldt;
 		u64 gs_base;
 	} host;