
[RFC,04/15] KVM: x86/mmu: Factor out logic to atomically install a new page table

Message ID 20211119235759.1304274-5-dmatlack@google.com (mailing list archive)
State New, archived
Series KVM: x86/mmu: Eager Page Splitting for the TDP MMU

Commit Message

David Matlack Nov. 19, 2021, 11:57 p.m. UTC
Factor out the logic to atomically replace an SPTE with an SPTE that
points to a new page table. This will be used in a follow-up commit to
split a large page SPTE into a page table one level lower.

Signed-off-by: David Matlack <dmatlack@google.com>
---
 arch/x86/kvm/mmu/tdp_mmu.c | 53 ++++++++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 16 deletions(-)

Comments

Ben Gardon Nov. 22, 2021, 6:52 p.m. UTC | #1
On Fri, Nov 19, 2021 at 3:58 PM David Matlack <dmatlack@google.com> wrote:
>
> Factor out the logic to atomically replace an SPTE with an SPTE that
> points to a new page table. This will be used in a follow-up commit to
> split a large page SPTE into one level lower.
>
> Signed-off-by: David Matlack <dmatlack@google.com>
> ---
>  arch/x86/kvm/mmu/tdp_mmu.c | 53 ++++++++++++++++++++++++++------------
>  1 file changed, 37 insertions(+), 16 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index cc9fe33c9b36..9ee3f4f7fdf5 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -945,6 +945,39 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
>         return ret;
>  }
>
> +/*
> + * tdp_mmu_install_sp_atomic - Atomically replace the given spte with an
> + * spte pointing to the provided page table.
> + *
> + * @kvm: kvm instance
> + * @iter: a tdp_iter instance currently on the SPTE that should be set
> + * @sp: The new TDP page table to install.
> + * @account_nx: True if this page table is being installed to split a
> + *              non-executable huge page.
> + *
> + * Returns: True if the new page table was installed. False if spte being
> + *          replaced changed, causing the atomic compare-exchange to fail.
> + *          If this function returns false the sp will be freed before
> + *          returning.
> + */
> +static bool tdp_mmu_install_sp_atomic(struct kvm *kvm,
> +                                     struct tdp_iter *iter,
> +                                     struct kvm_mmu_page *sp,
> +                                     bool account_nx)
> +{
> +       u64 spte;
> +
> +       spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
> +
> +       if (tdp_mmu_set_spte_atomic(kvm, iter, spte)) {
> +               tdp_mmu_link_page(kvm, sp, account_nx);
> +               return true;
> +       } else {
> +               tdp_mmu_free_sp(sp);
> +               return false;
> +       }
> +}
> +
>  /*
>   * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
>   * page tables and SPTEs to translate the faulting guest physical address.
> @@ -954,8 +987,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>         struct kvm_mmu *mmu = vcpu->arch.mmu;
>         struct tdp_iter iter;
>         struct kvm_mmu_page *sp;
> -       u64 *child_pt;
> -       u64 new_spte;
>         int ret;
>
>         kvm_mmu_hugepage_adjust(vcpu, fault);
> @@ -983,6 +1014,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>                 }
>
>                 if (!is_shadow_present_pte(iter.old_spte)) {
> +                       bool account_nx = fault->huge_page_disallowed &&
> +                                         fault->req_level >= iter.level;
> +
>                         /*
>                          * If SPTE has been frozen by another thread, just
>                          * give up and retry, avoiding unnecessary page table
> @@ -992,21 +1026,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>                                 break;
>
>                         sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
> -                       child_pt = sp->spt;
> -
> -                       new_spte = make_nonleaf_spte(child_pt,
> -                                                    !shadow_accessed_mask);
> -
> -                       if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
> -                               tdp_mmu_link_page(vcpu->kvm, sp,
> -                                                 fault->huge_page_disallowed &&
> -                                                 fault->req_level >= iter.level);
> -
> -                               trace_kvm_mmu_get_page(sp, true);

This refactoring drops this trace point. Is that intentional?


> -                       } else {
> -                               tdp_mmu_free_sp(sp);
> +                       if (!tdp_mmu_install_sp_atomic(vcpu->kvm, &iter, sp, account_nx))
>                                 break;
> -                       }
>                 }
>         }
>
> --
> 2.34.0.rc2.393.gf8c9666880-goog
>
David Matlack Nov. 30, 2021, 11:27 p.m. UTC | #2
On Mon, Nov 22, 2021 at 10:53 AM Ben Gardon <bgardon@google.com> wrote:
>
> On Fri, Nov 19, 2021 at 3:58 PM David Matlack <dmatlack@google.com> wrote:
> >
> > Factor out the logic to atomically replace an SPTE with an SPTE that
> > points to a new page table. This will be used in a follow-up commit to
> > split a large page SPTE into one level lower.
> >
> > Signed-off-by: David Matlack <dmatlack@google.com>
> > ---
> >  arch/x86/kvm/mmu/tdp_mmu.c | 53 ++++++++++++++++++++++++++------------
> >  1 file changed, 37 insertions(+), 16 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index cc9fe33c9b36..9ee3f4f7fdf5 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -945,6 +945,39 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
> >         return ret;
> >  }
> >
> > +/*
> > + * tdp_mmu_install_sp_atomic - Atomically replace the given spte with an
> > + * spte pointing to the provided page table.
> > + *
> > + * @kvm: kvm instance
> > + * @iter: a tdp_iter instance currently on the SPTE that should be set
> > + * @sp: The new TDP page table to install.
> > + * @account_nx: True if this page table is being installed to split a
> > + *              non-executable huge page.
> > + *
> > + * Returns: True if the new page table was installed. False if spte being
> > + *          replaced changed, causing the atomic compare-exchange to fail.
> > + *          If this function returns false the sp will be freed before
> > + *          returning.
> > + */
> > +static bool tdp_mmu_install_sp_atomic(struct kvm *kvm,
> > +                                     struct tdp_iter *iter,
> > +                                     struct kvm_mmu_page *sp,
> > +                                     bool account_nx)
> > +{
> > +       u64 spte;
> > +
> > +       spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
> > +
> > +       if (tdp_mmu_set_spte_atomic(kvm, iter, spte)) {
> > +               tdp_mmu_link_page(kvm, sp, account_nx);
> > +               return true;
> > +       } else {
> > +               tdp_mmu_free_sp(sp);
> > +               return false;
> > +       }
> > +}
> > +
> >  /*
> >   * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
> >   * page tables and SPTEs to translate the faulting guest physical address.
> > @@ -954,8 +987,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> >         struct kvm_mmu *mmu = vcpu->arch.mmu;
> >         struct tdp_iter iter;
> >         struct kvm_mmu_page *sp;
> > -       u64 *child_pt;
> > -       u64 new_spte;
> >         int ret;
> >
> >         kvm_mmu_hugepage_adjust(vcpu, fault);
> > @@ -983,6 +1014,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> >                 }
> >
> >                 if (!is_shadow_present_pte(iter.old_spte)) {
> > +                       bool account_nx = fault->huge_page_disallowed &&
> > +                                         fault->req_level >= iter.level;
> > +
> >                         /*
> >                          * If SPTE has been frozen by another thread, just
> >                          * give up and retry, avoiding unnecessary page table
> > @@ -992,21 +1026,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> >                                 break;
> >
> >                         sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
> > -                       child_pt = sp->spt;
> > -
> > -                       new_spte = make_nonleaf_spte(child_pt,
> > -                                                    !shadow_accessed_mask);
> > -
> > -                       if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
> > -                               tdp_mmu_link_page(vcpu->kvm, sp,
> > -                                                 fault->huge_page_disallowed &&
> > -                                                 fault->req_level >= iter.level);
> > -
> > -                               trace_kvm_mmu_get_page(sp, true);
>
> This refactoring drops this trace point. Is that intentional?

Yes it was intentional, but I forgot to describe it in the commit
message. Good catch.

This tracepoint is redundant with the one in alloc_tdp_mmu_page().

I'll update the commit message for v1.
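
(For reference, the redundant tracepoint is the trace_kvm_mmu_get_page(sp, true)
call in alloc_tdp_mmu_page() itself. An abridged sketch of that helper, from
memory and with the role/gfn setup elided, so details may not match upstream
exactly:)

	static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu,
						       gfn_t gfn, int level)
	{
		struct kvm_mmu_page *sp;

		sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
		sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
		set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

		/* ... role, gfn and tdp_mmu_page setup elided ... */

		/* Every newly allocated TDP MMU page table is already traced here. */
		trace_kvm_mmu_get_page(sp, true);

		return sp;
	}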

>
>
> > -                       } else {
> > -                               tdp_mmu_free_sp(sp);
> > +                       if (!tdp_mmu_install_sp_atomic(vcpu->kvm, &iter, sp, account_nx))
> >                                 break;
> > -                       }
> >                 }
> >         }
> >
> > --
> > 2.34.0.rc2.393.gf8c9666880-goog
> >
Sean Christopherson Dec. 1, 2021, 7:13 p.m. UTC | #3
On Fri, Nov 19, 2021, David Matlack wrote:
> Factor out the logic to atomically replace an SPTE with an SPTE that
> points to a new page table. This will be used in a follow-up commit to
> split a large page SPTE into one level lower.
> 
> Signed-off-by: David Matlack <dmatlack@google.com>
> ---
>  arch/x86/kvm/mmu/tdp_mmu.c | 53 ++++++++++++++++++++++++++------------
>  1 file changed, 37 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index cc9fe33c9b36..9ee3f4f7fdf5 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -945,6 +945,39 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
>  	return ret;
>  }
>  
> +/*
> + * tdp_mmu_install_sp_atomic - Atomically replace the given spte with an
> + * spte pointing to the provided page table.
> + *
> + * @kvm: kvm instance
> + * @iter: a tdp_iter instance currently on the SPTE that should be set
> + * @sp: The new TDP page table to install.
> + * @account_nx: True if this page table is being installed to split a
> + *              non-executable huge page.
> + *
> + * Returns: True if the new page table was installed. False if spte being
> + *          replaced changed, causing the atomic compare-exchange to fail.
> + *          If this function returns false the sp will be freed before
> + *          returning.
> + */
> +static bool tdp_mmu_install_sp_atomic(struct kvm *kvm,
> +				      struct tdp_iter *iter,
> +				      struct kvm_mmu_page *sp,
> +				      bool account_nx)
> +{
> +	u64 spte;
> +
> +	spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);

This can easily go on one line.

	u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
> +
> +	if (tdp_mmu_set_spte_atomic(kvm, iter, spte)) {
> +		tdp_mmu_link_page(kvm, sp, account_nx);
> +		return true;
> +	} else {
> +		tdp_mmu_free_sp(sp);
> +		return false;

I don't think this helper should free the sp on failure, even if that's what all
paths end up doing.  When reading the calling code, it really looks like the sp
is being leaked because the allocation and free are in different contexts.  That
the sp is consumed on success is fairly intuitive given the "install" action, but
freeing on failure not so much.

And for the eager splitting, freeing on failure is wasteful.  It's extremely
unlikely to happen often, so in practice it's unlikely to be an issue, but it's
certainly odd since the loop is likely going to immediately allocate another sp,
either for the current spte or for the next spte.

Side topic, tdp_mmu_set_spte_atomic() and friends really should return 0/-EBUSY.
Boolean returns for errors usually end in tears sooner or later.
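
For illustration, with both of those changes applied the helper might look
roughly like the sketch below (assuming tdp_mmu_set_spte_atomic() still
returns a bool as in this patch; this is not the code that was actually
merged):

	static int tdp_mmu_install_sp_atomic(struct kvm *kvm,
					     struct tdp_iter *iter,
					     struct kvm_mmu_page *sp,
					     bool account_nx)
	{
		u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);

		/*
		 * Don't free the sp on failure; the caller retains ownership
		 * and decides whether to free or reuse it.
		 */
		if (!tdp_mmu_set_spte_atomic(kvm, iter, spte))
			return -EBUSY;

		tdp_mmu_link_page(kvm, sp, account_nx);
		return 0;
	}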
David Matlack Dec. 1, 2021, 9:52 p.m. UTC | #4
On Wed, Dec 1, 2021 at 11:14 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Fri, Nov 19, 2021, David Matlack wrote:
> > Factor out the logic to atomically replace an SPTE with an SPTE that
> > points to a new page table. This will be used in a follow-up commit to
> > split a large page SPTE into one level lower.
> >
> > Signed-off-by: David Matlack <dmatlack@google.com>
> > ---
> >  arch/x86/kvm/mmu/tdp_mmu.c | 53 ++++++++++++++++++++++++++------------
> >  1 file changed, 37 insertions(+), 16 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index cc9fe33c9b36..9ee3f4f7fdf5 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -945,6 +945,39 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
> >       return ret;
> >  }
> >
> > +/*
> > + * tdp_mmu_install_sp_atomic - Atomically replace the given spte with an
> > + * spte pointing to the provided page table.
> > + *
> > + * @kvm: kvm instance
> > + * @iter: a tdp_iter instance currently on the SPTE that should be set
> > + * @sp: The new TDP page table to install.
> > + * @account_nx: True if this page table is being installed to split a
> > + *              non-executable huge page.
> > + *
> > + * Returns: True if the new page table was installed. False if spte being
> > + *          replaced changed, causing the atomic compare-exchange to fail.
> > + *          If this function returns false the sp will be freed before
> > + *          returning.
> > + */
> > +static bool tdp_mmu_install_sp_atomic(struct kvm *kvm,
> > +                                   struct tdp_iter *iter,
> > +                                   struct kvm_mmu_page *sp,
> > +                                   bool account_nx)
> > +{
> > +     u64 spte;
> > +
> > +     spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
>
> This can easily go on one line.
>
>         u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
> > +
> > +     if (tdp_mmu_set_spte_atomic(kvm, iter, spte)) {
> > +             tdp_mmu_link_page(kvm, sp, account_nx);
> > +             return true;
> > +     } else {
> > +             tdp_mmu_free_sp(sp);
> > +             return false;
>
> I don't think this helper should free the sp on failure, even if that's what all
> paths end up doing.  When reading the calling code, it really looks like the sp
> is being leaked because the allocation and free are in different contexts.  That
> the sp is consumed on success is fairly intuitive given the "install" action, but
> freeing on failure not so much.
>
> And for the eager splitting, freeing on failure is wasteful.  It's extremely
> unlikely to happen often, so in practice it's unlikely to be an issue, but it's
> certainly odd since the loop is likely going to immediately allocate another sp,
> either for the current spte or for the next spte.

Good point. I'll fix this in v1.

>
> Side topic, tdp_mmu_set_spte_atomic() and friends really should return 0/-EBUSY.
> Boolean returns for errors usually end in tears sooner or later.

Agreed. I was sticking with local style here but would like to see
more of this code switch to returning ints. I'll take a look at
including that cleanup as well in v1, if not a separate pre-series.
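
For illustration, if the helper returns 0/-EBUSY and no longer frees the sp,
the call site in kvm_tdp_mmu_map() could end up roughly like this (a sketch
only; the eventual v1 may differ):

	sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);

	if (tdp_mmu_install_sp_atomic(vcpu->kvm, &iter, sp, account_nx)) {
		/* The helper no longer frees the sp on failure, so free it here. */
		tdp_mmu_free_sp(sp);
		break;
	}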

Patch

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index cc9fe33c9b36..9ee3f4f7fdf5 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -945,6 +945,39 @@  static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
 	return ret;
 }
 
+/*
+ * tdp_mmu_install_sp_atomic - Atomically replace the given spte with an
+ * spte pointing to the provided page table.
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @sp: The new TDP page table to install.
+ * @account_nx: True if this page table is being installed to split a
+ *              non-executable huge page.
+ *
+ * Returns: True if the new page table was installed. False if the SPTE being
+ *          replaced changed, causing the atomic compare-exchange to fail.
+ *          If this function returns false, the sp will be freed before
+ *          returning.
+ */
+static bool tdp_mmu_install_sp_atomic(struct kvm *kvm,
+				      struct tdp_iter *iter,
+				      struct kvm_mmu_page *sp,
+				      bool account_nx)
+{
+	u64 spte;
+
+	spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
+
+	if (tdp_mmu_set_spte_atomic(kvm, iter, spte)) {
+		tdp_mmu_link_page(kvm, sp, account_nx);
+		return true;
+	} else {
+		tdp_mmu_free_sp(sp);
+		return false;
+	}
+}
+
 /*
  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
  * page tables and SPTEs to translate the faulting guest physical address.
@@ -954,8 +987,6 @@  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
 	struct tdp_iter iter;
 	struct kvm_mmu_page *sp;
-	u64 *child_pt;
-	u64 new_spte;
 	int ret;
 
 	kvm_mmu_hugepage_adjust(vcpu, fault);
@@ -983,6 +1014,9 @@  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		}
 
 		if (!is_shadow_present_pte(iter.old_spte)) {
+			bool account_nx = fault->huge_page_disallowed &&
+					  fault->req_level >= iter.level;
+
 			/*
 			 * If SPTE has been frozen by another thread, just
 			 * give up and retry, avoiding unnecessary page table
@@ -992,21 +1026,8 @@  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 				break;
 
 			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
-			child_pt = sp->spt;
-
-			new_spte = make_nonleaf_spte(child_pt,
-						     !shadow_accessed_mask);
-
-			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
-				tdp_mmu_link_page(vcpu->kvm, sp,
-						  fault->huge_page_disallowed &&
-						  fault->req_level >= iter.level);
-
-				trace_kvm_mmu_get_page(sp, true);
-			} else {
-				tdp_mmu_free_sp(sp);
+			if (!tdp_mmu_install_sp_atomic(vcpu->kvm, &iter, sp, account_nx))
 				break;
-			}
 		}
 	}