diff mbox series

[PATCHV2,1/3] KVM: x86: introduce kvm_mmu_pte_prefetch structure

Message ID 20211019153214.109519-2-senozhatsky@chromium.org (mailing list archive)
State New, archived
Headers show
Series kvm: x86: make PTE_PREFETCH_NUM tunable | expand

Commit Message

Sergey Senozhatsky Oct. 19, 2021, 3:32 p.m. UTC
kvm_mmu_pte_prefetch is a per-VCPU structure that holds a PTE
prefetch pages array, lock and the number of PTE to prefetch.

This is needed to turn PTE_PREFETCH_NUM into a tunable VM
parameter.

Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
 arch/x86/include/asm/kvm_host.h | 12 +++++++
 arch/x86/kvm/mmu.h              |  4 +++
 arch/x86/kvm/mmu/mmu.c          | 57 ++++++++++++++++++++++++++++++---
 arch/x86/kvm/x86.c              |  9 +++++-
 4 files changed, 77 insertions(+), 5 deletions(-)

Comments

David Matlack Oct. 19, 2021, 10:44 p.m. UTC | #1
On Tue, Oct 19, 2021 at 8:32 AM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> kvm_mmu_pte_prefetch is a per-VCPU structure that holds a PTE
> prefetch pages array, lock and the number of PTE to prefetch.
>
> This is needed to turn PTE_PREFETCH_NUM into a tunable VM
> parameter.
>
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> ---
>  arch/x86/include/asm/kvm_host.h | 12 +++++++
>  arch/x86/kvm/mmu.h              |  4 +++
>  arch/x86/kvm/mmu/mmu.c          | 57 ++++++++++++++++++++++++++++++---
>  arch/x86/kvm/x86.c              |  9 +++++-
>  4 files changed, 77 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 5271fce6cd65..11400bc3c70d 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -607,6 +607,16 @@ struct kvm_vcpu_xen {
>         u64 runstate_times[4];
>  };
>
> +struct kvm_mmu_pte_prefetch {
> +       /*
> +        * This will be cast either to array of pointers to struct page,
> +        * or array of u64, or array of u32
> +        */
> +       void *ents;
> +       unsigned int num_ents;
> +       spinlock_t lock;

The spinlock is overkill. I'd suggest something like this:
- When VM-ioctl is invoked to update prefetch count, store it in
kvm_arch. No synchronization with vCPUs needed.
- When a vCPU takes a fault: Read the prefetch count from kvm_arch. If
different than count at last fault, re-allocate vCPU prefetch array.
(So you'll need to add prefetch array and count to kvm_vcpu_arch as
well.)

No extra locks are needed. vCPUs that fault after the VM-ioctl will
get the new prefetch count. We don't really care if a prefetch count
update races with a vCPU fault as long as vCPUs are careful to only
read the count once (i.e. use READ_ONCE(vcpu->kvm.prefetch_count)) and
use that. Assuming prefetch count ioctls are rare, the re-allocation
on the fault path will be rare as well.

Note: You could apply this same approach to a module param, except
vCPUs would be reading the module param rather than vcpu->kvm during
each fault.

And the other alternative, like you suggested in the other patch, is
to use a vCPU ioctl. That would side-step the synchronization issue
because vCPU ioctls require the vCPU mutex. So the reallocation could
be done in the ioctl and not at fault time.

Taking a step back, can you say a bit more about your usecase? The
module param approach would be simplest because you would not have to
add userspace support, but in v1 you did mention you wanted per-VM
control.

> +};
> +
>  struct kvm_vcpu_arch {
>         /*
>          * rip and regs accesses must go through
> @@ -682,6 +692,8 @@ struct kvm_vcpu_arch {
>         struct kvm_mmu_memory_cache mmu_gfn_array_cache;
>         struct kvm_mmu_memory_cache mmu_page_header_cache;
>
> +       struct kvm_mmu_pte_prefetch mmu_pte_prefetch;
> +
>         /*
>          * QEMU userspace and the guest each have their own FPU state.
>          * In vcpu_run, we switch between the user and guest FPU contexts.
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 75367af1a6d3..b953a3a4083a 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -68,6 +68,10 @@ static __always_inline u64 rsvd_bits(int s, int e)
>  void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
>  void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
>
> +int kvm_set_pte_prefetch(struct kvm_vcpu *vcpu, u64 num_ents);
> +int kvm_init_pte_prefetch(struct kvm_vcpu *vcpu);
> +void kvm_pte_prefetch_destroy(struct kvm_vcpu *vcpu);
> +
>  void kvm_init_mmu(struct kvm_vcpu *vcpu);
>  void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
>                              unsigned long cr4, u64 efer, gpa_t nested_cr3);
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 24a9f4c3f5e7..fed3a498a729 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -115,6 +115,7 @@ module_param(dbg, bool, 0644);
>  #endif
>
>  #define PTE_PREFETCH_NUM               8
> +#define MAX_PTE_PREFETCH_NUM           128
>
>  #define PT32_LEVEL_BITS 10
>
> @@ -732,7 +733,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>
>         /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
>         r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
> -                                      1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
> +                                      1 + PT64_ROOT_MAX_LEVEL + MAX_PTE_PREFETCH_NUM);
>         if (r)
>                 return r;
>         r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> @@ -2753,12 +2754,13 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
>                                     struct kvm_mmu_page *sp,
>                                     u64 *start, u64 *end)
>  {
> -       struct page *pages[PTE_PREFETCH_NUM];
> +       struct page **pages;
>         struct kvm_memory_slot *slot;
>         unsigned int access = sp->role.access;
>         int i, ret;
>         gfn_t gfn;
>
> +       pages = (struct page **)vcpu->arch.mmu_pte_prefetch.ents;
>         gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
>         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
>         if (!slot)
> @@ -2781,14 +2783,17 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
>                                   struct kvm_mmu_page *sp, u64 *sptep)
>  {
>         u64 *spte, *start = NULL;
> +       unsigned int pte_prefetch_num;
>         int i;
>
>         WARN_ON(!sp->role.direct);
>
> -       i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
> +       spin_lock(&vcpu->arch.mmu_pte_prefetch.lock);
> +       pte_prefetch_num = vcpu->arch.mmu_pte_prefetch.num_ents;
> +       i = (sptep - sp->spt) & ~(pte_prefetch_num - 1);
>         spte = sp->spt + i;
>
> -       for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
> +       for (i = 0; i < pte_prefetch_num; i++, spte++) {
>                 if (is_shadow_present_pte(*spte) || spte == sptep) {
>                         if (!start)
>                                 continue;
> @@ -2800,6 +2805,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
>         }
>         if (start)
>                 direct_pte_prefetch_many(vcpu, sp, start, spte);
> +       spin_unlock(&vcpu->arch.mmu_pte_prefetch.lock);
>  }
>
>  static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
> @@ -4914,6 +4920,49 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu)
>  }
>  EXPORT_SYMBOL_GPL(kvm_init_mmu);
>
> +int kvm_set_pte_prefetch(struct kvm_vcpu *vcpu, u64 num_ents)
> +{
> +       u64 *ents;
> +
> +       if (!num_ents)
> +               return -EINVAL;
> +
> +       if (!is_power_of_2(num_ents))
> +               return -EINVAL;
> +
> +       if (num_ents > MAX_PTE_PREFETCH_NUM)
> +               return -EINVAL;
> +
> +       ents = kmalloc_array(num_ents, sizeof(u64), GFP_KERNEL);
> +       if (!ents)
> +               return -ENOMEM;
> +
> +       spin_lock(&vcpu->arch.mmu_pte_prefetch.lock);
> +       kfree(vcpu->arch.mmu_pte_prefetch.ents);
> +       vcpu->arch.mmu_pte_prefetch.ents = ents;
> +       vcpu->arch.mmu_pte_prefetch.num_ents = num_ents;
> +       spin_unlock(&vcpu->arch.mmu_pte_prefetch.lock);
> +
> +       return 0;
> +}
> +EXPORT_SYMBOL_GPL(kvm_set_pte_prefetch);
> +
> +int kvm_init_pte_prefetch(struct kvm_vcpu *vcpu)
> +{
> +       spin_lock_init(&vcpu->arch.mmu_pte_prefetch.lock);
> +
> +       return kvm_set_pte_prefetch(vcpu, PTE_PREFETCH_NUM);
> +}
> +EXPORT_SYMBOL_GPL(kvm_init_pte_prefetch);
> +
> +void kvm_pte_prefetch_destroy(struct kvm_vcpu *vcpu)
> +{
> +       vcpu->arch.mmu_pte_prefetch.num_ents = 0;
> +       kfree(vcpu->arch.mmu_pte_prefetch.ents);
> +       vcpu->arch.mmu_pte_prefetch.ents = NULL;
> +}
> +EXPORT_SYMBOL_GPL(kvm_pte_prefetch_destroy);
> +
>  static union kvm_mmu_page_role
>  kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
>  {
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b0f99132d7d1..4805960a89e6 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -10707,10 +10707,14 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
>         vcpu->arch.hv_root_tdp = INVALID_PAGE;
>  #endif
>
> -       r = static_call(kvm_x86_vcpu_create)(vcpu);
> +       r = kvm_init_pte_prefetch(vcpu);
>         if (r)
>                 goto free_guest_fpu;
>
> +       r = static_call(kvm_x86_vcpu_create)(vcpu);
> +       if (r)
> +               goto free_pte_prefetch;
> +
>         vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
>         vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
>         kvm_vcpu_mtrr_init(vcpu);
> @@ -10721,6 +10725,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
>         vcpu_put(vcpu);
>         return 0;
>
> +free_pte_prefetch:
> +       kvm_pte_prefetch_destroy(vcpu);
>  free_guest_fpu:
>         kvm_free_guest_fpu(vcpu);
>  free_user_fpu:
> @@ -10782,6 +10788,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
>         kvm_free_lapic(vcpu);
>         idx = srcu_read_lock(&vcpu->kvm->srcu);
>         kvm_mmu_destroy(vcpu);
> +       kvm_pte_prefetch_destroy(vcpu);
>         srcu_read_unlock(&vcpu->kvm->srcu, idx);
>         free_page((unsigned long)vcpu->arch.pio_data);
>         kvfree(vcpu->arch.cpuid_entries);
> --
> 2.33.0.1079.g6e70778dc9-goog
>
Sergey Senozhatsky Oct. 20, 2021, 1:23 a.m. UTC | #2
On (21/10/19 15:44), David Matlack wrote:
> On Tue, Oct 19, 2021 at 8:32 AM Sergey Senozhatsky
> <senozhatsky@chromium.org> wrote:
> >
> > kvm_mmu_pte_prefetch is a per-VCPU structure that holds a PTE
> > prefetch pages array, lock and the number of PTE to prefetch.
> >
> > This is needed to turn PTE_PREFETCH_NUM into a tunable VM
> > parameter.
> >
> > Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> > ---
> >  arch/x86/include/asm/kvm_host.h | 12 +++++++
> >  arch/x86/kvm/mmu.h              |  4 +++
> >  arch/x86/kvm/mmu/mmu.c          | 57 ++++++++++++++++++++++++++++++---
> >  arch/x86/kvm/x86.c              |  9 +++++-
> >  4 files changed, 77 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 5271fce6cd65..11400bc3c70d 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -607,6 +607,16 @@ struct kvm_vcpu_xen {
> >         u64 runstate_times[4];
> >  };
> >
> > +struct kvm_mmu_pte_prefetch {
> > +       /*
> > +        * This will be cast either to array of pointers to struct page,
> > +        * or array of u64, or array of u32
> > +        */
> > +       void *ents;
> > +       unsigned int num_ents;
> > +       spinlock_t lock;
> 
> The spinlock is overkill. I'd suggest something like this:
> - When VM-ioctl is invoked to update prefetch count, store it in
> kvm_arch. No synchronization with vCPUs needed.
> - When a vCPU takes a fault: Read the prefetch count from kvm_arch. If
> different than count at last fault, re-allocate vCPU prefetch array.
> (So you'll need to add prefetch array and count to kvm_vcpu_arch as
> well.)
> 
> No extra locks are needed. vCPUs that fault after the VM-ioctl will
> get the new prefetch count. We don't really care if a prefetch count
> update races with a vCPU fault as long as vCPUs are careful to only
> read the count once (i.e. use READ_ONCE(vcpu->kvm.prefetch_count)) and
> use that. Assuming prefetch count ioctls are rare, the re-allocation
> on the fault path will be rare as well.

So reallocation from the faul-path should happen before vCPU takes the
mmu_lock? And READ_ONCE(prefetch_count) should also happen before vCPU
takes mmu_lock, I assume, so we need to pass it as a parameter to all
the functions that will access prefetch array.

> Note: You could apply this same approach to a module param, except
> vCPUs would be reading the module param rather than vcpu->kvm during
> each fault.
> 
> And the other alternative, like you suggested in the other patch, is
> to use a vCPU ioctl. That would side-step the synchronization issue
> because vCPU ioctls require the vCPU mutex. So the reallocation could
> be done in the ioctl and not at fault time.

One more idea, wonder what do you think:

There is an upper limit on the number of PTEs we prefault, which is 128 as of
now, but I think 64 will be good enough, or maybe even 32. So we can always
allocate MAX_PTE_PREFETCH_NUM arrays in vcpu->arch and ioctl() will change
->num_ents only, which is always in (0, MAX_PTE_PREFETCH_NUM - 1] range. This
way we never have to reallocate anything, we just adjust the "maximum index"
value.

> Taking a step back, can you say a bit more about your usecase?

We are looking at various ways of reducing the number of vm-exits. There
is only one VM running on the device (a pretty low-end laptop).
David Matlack Oct. 20, 2021, 3:56 p.m. UTC | #3
On Tue, Oct 19, 2021 at 6:24 PM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> On (21/10/19 15:44), David Matlack wrote:
> > On Tue, Oct 19, 2021 at 8:32 AM Sergey Senozhatsky
> > <senozhatsky@chromium.org> wrote:
> > >
> > > kvm_mmu_pte_prefetch is a per-VCPU structure that holds a PTE
> > > prefetch pages array, lock and the number of PTE to prefetch.
> > >
> > > This is needed to turn PTE_PREFETCH_NUM into a tunable VM
> > > parameter.
> > >
> > > Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> > > ---
> > >  arch/x86/include/asm/kvm_host.h | 12 +++++++
> > >  arch/x86/kvm/mmu.h              |  4 +++
> > >  arch/x86/kvm/mmu/mmu.c          | 57 ++++++++++++++++++++++++++++++---
> > >  arch/x86/kvm/x86.c              |  9 +++++-
> > >  4 files changed, 77 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > index 5271fce6cd65..11400bc3c70d 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -607,6 +607,16 @@ struct kvm_vcpu_xen {
> > >         u64 runstate_times[4];
> > >  };
> > >
> > > +struct kvm_mmu_pte_prefetch {
> > > +       /*
> > > +        * This will be cast either to array of pointers to struct page,
> > > +        * or array of u64, or array of u32
> > > +        */
> > > +       void *ents;
> > > +       unsigned int num_ents;
> > > +       spinlock_t lock;
> >
> > The spinlock is overkill. I'd suggest something like this:
> > - When VM-ioctl is invoked to update prefetch count, store it in
> > kvm_arch. No synchronization with vCPUs needed.
> > - When a vCPU takes a fault: Read the prefetch count from kvm_arch. If
> > different than count at last fault, re-allocate vCPU prefetch array.
> > (So you'll need to add prefetch array and count to kvm_vcpu_arch as
> > well.)
> >
> > No extra locks are needed. vCPUs that fault after the VM-ioctl will
> > get the new prefetch count. We don't really care if a prefetch count
> > update races with a vCPU fault as long as vCPUs are careful to only
> > read the count once (i.e. use READ_ONCE(vcpu->kvm.prefetch_count)) and
> > use that. Assuming prefetch count ioctls are rare, the re-allocation
> > on the fault path will be rare as well.
>
> So reallocation from the faul-path should happen before vCPU takes the
> mmu_lock?

Yes. Take a look at mmu_topup_memory_caches for an example of
allocating in the fault path prior to taking the mmu lock.

> And READ_ONCE(prefetch_count) should also happen before vCPU
> takes mmu_lock, I assume, so we need to pass it as a parameter to all
> the functions that will access prefetch array.

Store the value of READ_ONCE(prefetch_count) in struct kvm_vcpu_arch
because you also need to know if it changes on the next fault. Then
you also don't have to add a parameter to a bunch of functions in the
fault path.

>
> > Note: You could apply this same approach to a module param, except
> > vCPUs would be reading the module param rather than vcpu->kvm during
> > each fault.
> >
> > And the other alternative, like you suggested in the other patch, is
> > to use a vCPU ioctl. That would side-step the synchronization issue
> > because vCPU ioctls require the vCPU mutex. So the reallocation could
> > be done in the ioctl and not at fault time.
>
> One more idea, wonder what do you think:
>
> There is an upper limit on the number of PTEs we prefault, which is 128 as of
> now, but I think 64 will be good enough, or maybe even 32. So we can always
> allocate MAX_PTE_PREFETCH_NUM arrays in vcpu->arch and ioctl() will change
> ->num_ents only, which is always in (0, MAX_PTE_PREFETCH_NUM - 1] range. This
> way we never have to reallocate anything, we just adjust the "maximum index"
> value.

128 * 8 would be 1KB per vCPU. That is probably reasonable, but I
don't think the re-allocation would be that complex.

>
> > Taking a step back, can you say a bit more about your usecase?
>
> We are looking at various ways of reducing the number of vm-exits. There
> is only one VM running on the device (a pretty low-end laptop).

When you say reduce the number of vm-exits, can you be more specific?
Are you trying to reduce the time it takes for vCPUs to fault in
memory during VM startup? I just mention because there are likely
other techniques you can apply that would not require modifying KVM
code (e.g. prefaulting the host memory before running the VM, using
the TDP MMU instead of the legacy MMU to allow parallel faults, using
hugepages to map in more memory per fault, etc.)
Sergey Senozhatsky Oct. 21, 2021, 2:48 a.m. UTC | #4
On (21/10/20 08:56), David Matlack wrote:
> > > The spinlock is overkill. I'd suggest something like this:
> > > - When VM-ioctl is invoked to update prefetch count, store it in
> > > kvm_arch. No synchronization with vCPUs needed.
> > > - When a vCPU takes a fault: Read the prefetch count from kvm_arch. If
> > > different than count at last fault, re-allocate vCPU prefetch array.
> > > (So you'll need to add prefetch array and count to kvm_vcpu_arch as
> > > well.)
> > >
> > > No extra locks are needed. vCPUs that fault after the VM-ioctl will
> > > get the new prefetch count. We don't really care if a prefetch count
> > > update races with a vCPU fault as long as vCPUs are careful to only
> > > read the count once (i.e. use READ_ONCE(vcpu->kvm.prefetch_count)) and
> > > use that. Assuming prefetch count ioctls are rare, the re-allocation
> > > on the fault path will be rare as well.
> >
> > So reallocation from the faul-path should happen before vCPU takes the
> > mmu_lock?
> 
> Yes. Take a look at mmu_topup_memory_caches for an example of
> allocating in the fault path prior to taking the mmu lock.
> 
> > And READ_ONCE(prefetch_count) should also happen before vCPU
> > takes mmu_lock, I assume, so we need to pass it as a parameter to all
> > the functions that will access prefetch array.
> 
> Store the value of READ_ONCE(prefetch_count) in struct kvm_vcpu_arch
> because you also need to know if it changes on the next fault. Then
> you also don't have to add a parameter to a bunch of functions in the
> fault path.
> 
> >
> > > Note: You could apply this same approach to a module param, except
> > > vCPUs would be reading the module param rather than vcpu->kvm during
> > > each fault.
> > >
> > > And the other alternative, like you suggested in the other patch, is
> > > to use a vCPU ioctl. That would side-step the synchronization issue
> > > because vCPU ioctls require the vCPU mutex. So the reallocation could
> > > be done in the ioctl and not at fault time.
> >
> > One more idea, wonder what do you think:
> >
> > There is an upper limit on the number of PTEs we prefault, which is 128 as of
> > now, but I think 64 will be good enough, or maybe even 32. So we can always
> > allocate MAX_PTE_PREFETCH_NUM arrays in vcpu->arch and ioctl() will change
> > ->num_ents only, which is always in (0, MAX_PTE_PREFETCH_NUM - 1] range. This
> > way we never have to reallocate anything, we just adjust the "maximum index"
> > value.
> 
> 128 * 8 would be 1KB per vCPU. That is probably reasonable, but I
> don't think the re-allocation would be that complex.

128 is probably too large. What I'm thinking is "32 * 8" per-VCPU.
Then it can even be something like:

---

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5271fce6cd65..b3a436f8fdf5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -607,6 +607,11 @@ struct kvm_vcpu_xen {
        u64 runstate_times[4];
 };
 
+struct kvm_mmu_pte_prefetch {
+       u64 ents[32];
+       unsigned int num_ents;  /* max prefetch value for this vCPU */
+};
+
 struct kvm_vcpu_arch {
        /*
         * rip and regs accesses must go through
@@ -682,6 +687,8 @@ struct kvm_vcpu_arch {
        struct kvm_mmu_memory_cache mmu_gfn_array_cache;
        struct kvm_mmu_memory_cache mmu_page_header_cache;
 
+       struct kvm_mmu_pte_prefetch mmu_pte_prefetch;
+
        /*
         * QEMU userspace and the guest each have their own FPU state.
         * In vcpu_run, we switch between the user and guest FPU contexts.

---

> >
> > > Taking a step back, can you say a bit more about your usecase?
> >
> > We are looking at various ways of reducing the number of vm-exits. There
> > is only one VM running on the device (a pretty low-end laptop).
> 
> When you say reduce the number of vm-exits, can you be more specific?
> Are you trying to reduce the time it takes for vCPUs to fault in
> memory during VM startup?

VM Boot is the test I'm running, yes, and I see some improvements with
pte-prefault == 16.

> I just mention because there are likely other techniques you can apply
> that would not require modifying KVM code (e.g. prefaulting the host
> memory before running the VM, using the TDP MMU instead of the legacy
> MMU to allow parallel faults, using hugepages to map in more memory
> per fault, etc.)

THP would be awesome. We have THP enabled on devices that have 8-plus
gigabytes of RAM; but we don't have THP enabled on low end devices, that
only have 4 gigabytes of RAM. On low end devices KVM with THP causes host
memory regression, that we cannot accept, hence for 4 gig devices we try
various "other solutions".

We are using TDP. And somehow I never see (literally never) async PFs.
It's always either hva_to_pfn_fast() or hva_to_pfn_slow() or
__direct_map() from tdp_page_fault().
Sergey Senozhatsky Oct. 21, 2021, 3:28 a.m. UTC | #5
On (21/10/21 11:48), Sergey Senozhatsky wrote:
> 
> We are using TDP. And somehow I never see (literally never) async PFs.
> It's always either hva_to_pfn_fast() or hva_to_pfn_slow() or
> __direct_map() from tdp_page_fault().

Hmm, and tdp_page_fault()->fast_page_fault() always fails on
!is_access_allowed(error_code, new_spte), it never handles the faults.
And I see some ->mmu_lock contention:

	spin_lock(&vcpu->kvm->mmu_lock);
	__direct_map();
	spin_unlock(&vcpu->kvm->mmu_lock);

So it might be that we setup guest memory wrongly and never get
advantages of TPD and fast page faults?
Sergey Senozhatsky Oct. 21, 2021, 8:09 a.m. UTC | #6
On (21/10/21 12:28), Sergey Senozhatsky wrote:
> > 
> > We are using TDP. And somehow I never see (literally never) async PFs.
> > It's always either hva_to_pfn_fast() or hva_to_pfn_slow() or
> > __direct_map() from tdp_page_fault().
> 
> Hmm, and tdp_page_fault()->fast_page_fault() always fails on
> !is_access_allowed(error_code, new_spte), it never handles the faults.
> And I see some ->mmu_lock contention:
> 
> 	spin_lock(&vcpu->kvm->mmu_lock);
> 	__direct_map();
> 	spin_unlock(&vcpu->kvm->mmu_lock);
> 
> So it might be that we setup guest memory wrongly and never get
> advantages of TPD and fast page faults?

No, never mind, that's probably expected and ->mmu_lock contention is not
severe.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5271fce6cd65..11400bc3c70d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -607,6 +607,16 @@  struct kvm_vcpu_xen {
 	u64 runstate_times[4];
 };
 
+struct kvm_mmu_pte_prefetch {
+	/*
+	 * This will be cast either to array of pointers to struct page,
+	 * or array of u64, or array of u32
+	 */
+	void *ents;
+	unsigned int num_ents;
+	spinlock_t lock;
+};
+
 struct kvm_vcpu_arch {
 	/*
 	 * rip and regs accesses must go through
@@ -682,6 +692,8 @@  struct kvm_vcpu_arch {
 	struct kvm_mmu_memory_cache mmu_gfn_array_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
+	struct kvm_mmu_pte_prefetch mmu_pte_prefetch;
+
 	/*
 	 * QEMU userspace and the guest each have their own FPU state.
 	 * In vcpu_run, we switch between the user and guest FPU contexts.
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 75367af1a6d3..b953a3a4083a 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -68,6 +68,10 @@  static __always_inline u64 rsvd_bits(int s, int e)
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
 void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
 
+int kvm_set_pte_prefetch(struct kvm_vcpu *vcpu, u64 num_ents);
+int kvm_init_pte_prefetch(struct kvm_vcpu *vcpu);
+void kvm_pte_prefetch_destroy(struct kvm_vcpu *vcpu);
+
 void kvm_init_mmu(struct kvm_vcpu *vcpu);
 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
 			     unsigned long cr4, u64 efer, gpa_t nested_cr3);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 24a9f4c3f5e7..fed3a498a729 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -115,6 +115,7 @@  module_param(dbg, bool, 0644);
 #endif
 
 #define PTE_PREFETCH_NUM		8
+#define MAX_PTE_PREFETCH_NUM		128
 
 #define PT32_LEVEL_BITS 10
 
@@ -732,7 +733,7 @@  static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 
 	/* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
 	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
-				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
+				       1 + PT64_ROOT_MAX_LEVEL + MAX_PTE_PREFETCH_NUM);
 	if (r)
 		return r;
 	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
@@ -2753,12 +2754,13 @@  static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 				    struct kvm_mmu_page *sp,
 				    u64 *start, u64 *end)
 {
-	struct page *pages[PTE_PREFETCH_NUM];
+	struct page **pages;
 	struct kvm_memory_slot *slot;
 	unsigned int access = sp->role.access;
 	int i, ret;
 	gfn_t gfn;
 
+	pages = (struct page **)vcpu->arch.mmu_pte_prefetch.ents;
 	gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
 	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
 	if (!slot)
@@ -2781,14 +2783,17 @@  static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
 				  struct kvm_mmu_page *sp, u64 *sptep)
 {
 	u64 *spte, *start = NULL;
+	unsigned int pte_prefetch_num;
 	int i;
 
 	WARN_ON(!sp->role.direct);
 
-	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+	spin_lock(&vcpu->arch.mmu_pte_prefetch.lock);
+	pte_prefetch_num = vcpu->arch.mmu_pte_prefetch.num_ents;
+	i = (sptep - sp->spt) & ~(pte_prefetch_num - 1);
 	spte = sp->spt + i;
 
-	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+	for (i = 0; i < pte_prefetch_num; i++, spte++) {
 		if (is_shadow_present_pte(*spte) || spte == sptep) {
 			if (!start)
 				continue;
@@ -2800,6 +2805,7 @@  static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
 	}
 	if (start)
 		direct_pte_prefetch_many(vcpu, sp, start, spte);
+	spin_unlock(&vcpu->arch.mmu_pte_prefetch.lock);
 }
 
 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
@@ -4914,6 +4920,49 @@  void kvm_init_mmu(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_init_mmu);
 
+int kvm_set_pte_prefetch(struct kvm_vcpu *vcpu, u64 num_ents)
+{
+	u64 *ents;
+
+	if (!num_ents)
+		return -EINVAL;
+
+	if (!is_power_of_2(num_ents))
+		return -EINVAL;
+
+	if (num_ents > MAX_PTE_PREFETCH_NUM)
+		return -EINVAL;
+
+	ents = kmalloc_array(num_ents, sizeof(u64), GFP_KERNEL);
+	if (!ents)
+		return -ENOMEM;
+
+	spin_lock(&vcpu->arch.mmu_pte_prefetch.lock);
+	kfree(vcpu->arch.mmu_pte_prefetch.ents);
+	vcpu->arch.mmu_pte_prefetch.ents = ents;
+	vcpu->arch.mmu_pte_prefetch.num_ents = num_ents;
+	spin_unlock(&vcpu->arch.mmu_pte_prefetch.lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_pte_prefetch);
+
+int kvm_init_pte_prefetch(struct kvm_vcpu *vcpu)
+{
+	spin_lock_init(&vcpu->arch.mmu_pte_prefetch.lock);
+
+	return kvm_set_pte_prefetch(vcpu, PTE_PREFETCH_NUM);
+}
+EXPORT_SYMBOL_GPL(kvm_init_pte_prefetch);
+
+void kvm_pte_prefetch_destroy(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.mmu_pte_prefetch.num_ents = 0;
+	kfree(vcpu->arch.mmu_pte_prefetch.ents);
+	vcpu->arch.mmu_pte_prefetch.ents = NULL;
+}
+EXPORT_SYMBOL_GPL(kvm_pte_prefetch_destroy);
+
 static union kvm_mmu_page_role
 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b0f99132d7d1..4805960a89e6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10707,10 +10707,14 @@  int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 	vcpu->arch.hv_root_tdp = INVALID_PAGE;
 #endif
 
-	r = static_call(kvm_x86_vcpu_create)(vcpu);
+	r = kvm_init_pte_prefetch(vcpu);
 	if (r)
 		goto free_guest_fpu;
 
+	r = static_call(kvm_x86_vcpu_create)(vcpu);
+	if (r)
+		goto free_pte_prefetch;
+
 	vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
 	vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
 	kvm_vcpu_mtrr_init(vcpu);
@@ -10721,6 +10725,8 @@  int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 	vcpu_put(vcpu);
 	return 0;
 
+free_pte_prefetch:
+	kvm_pte_prefetch_destroy(vcpu);
 free_guest_fpu:
 	kvm_free_guest_fpu(vcpu);
 free_user_fpu:
@@ -10782,6 +10788,7 @@  void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_free_lapic(vcpu);
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	kvm_mmu_destroy(vcpu);
+	kvm_pte_prefetch_destroy(vcpu);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	free_page((unsigned long)vcpu->arch.pio_data);
 	kvfree(vcpu->arch.cpuid_entries);