Message ID | 20211119235759.1304274-5-dmatlack@google.com
---|---
State | New, archived
Series | KVM: x86/mmu: Eager Page Splitting for the TDP MMU
On Fri, Nov 19, 2021 at 3:58 PM David Matlack <dmatlack@google.com> wrote:
>
> Factor out the logic to atomically replace an SPTE with an SPTE that
> points to a new page table. This will be used in a follow-up commit to
> split a large page SPTE into one level lower.
>
> Signed-off-by: David Matlack <dmatlack@google.com>
> ---
>  arch/x86/kvm/mmu/tdp_mmu.c | 53 ++++++++++++++++++++++++++------------
>  1 file changed, 37 insertions(+), 16 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index cc9fe33c9b36..9ee3f4f7fdf5 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -945,6 +945,39 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
>         return ret;
>  }
>
> +/*
> + * tdp_mmu_install_sp_atomic - Atomically replace the given spte with an
> + * spte pointing to the provided page table.
> + *
> + * @kvm: kvm instance
> + * @iter: a tdp_iter instance currently on the SPTE that should be set
> + * @sp: The new TDP page table to install.
> + * @account_nx: True if this page table is being installed to split a
> + *              non-executable huge page.
> + *
> + * Returns: True if the new page table was installed. False if spte being
> + *          replaced changed, causing the atomic compare-exchange to fail.
> + *          If this function returns false the sp will be freed before
> + *          returning.
> + */
> +static bool tdp_mmu_install_sp_atomic(struct kvm *kvm,
> +                                      struct tdp_iter *iter,
> +                                      struct kvm_mmu_page *sp,
> +                                      bool account_nx)
> +{
> +        u64 spte;
> +
> +        spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
> +
> +        if (tdp_mmu_set_spte_atomic(kvm, iter, spte)) {
> +                tdp_mmu_link_page(kvm, sp, account_nx);
> +                return true;
> +        } else {
> +                tdp_mmu_free_sp(sp);
> +                return false;
> +        }
> +}
> +
>  /*
>   * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
>   * page tables and SPTEs to translate the faulting guest physical address.
> @@ -954,8 +987,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>         struct kvm_mmu *mmu = vcpu->arch.mmu;
>         struct tdp_iter iter;
>         struct kvm_mmu_page *sp;
> -       u64 *child_pt;
> -       u64 new_spte;
>         int ret;
>
>         kvm_mmu_hugepage_adjust(vcpu, fault);
> @@ -983,6 +1014,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>         }
>
>         if (!is_shadow_present_pte(iter.old_spte)) {
> +                bool account_nx = fault->huge_page_disallowed &&
> +                                  fault->req_level >= iter.level;
> +
>                 /*
>                  * If SPTE has been frozen by another thread, just
>                  * give up and retry, avoiding unnecessary page table
> @@ -992,21 +1026,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>                         break;
>
>                 sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
> -               child_pt = sp->spt;
> -
> -               new_spte = make_nonleaf_spte(child_pt,
> -                                            !shadow_accessed_mask);
> -
> -               if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
> -                       tdp_mmu_link_page(vcpu->kvm, sp,
> -                                         fault->huge_page_disallowed &&
> -                                         fault->req_level >= iter.level);
> -
> -                       trace_kvm_mmu_get_page(sp, true);

This refactoring drops this trace point. Is that intentional?

> -               } else {
> -                       tdp_mmu_free_sp(sp);
> +               if (!tdp_mmu_install_sp_atomic(vcpu->kvm, &iter, sp, account_nx))
>                         break;
> -               }
>         }
>  }
>
> --
> 2.34.0.rc2.393.gf8c9666880-goog
>
On Mon, Nov 22, 2021 at 10:53 AM Ben Gardon <bgardon@google.com> wrote:
>
> On Fri, Nov 19, 2021 at 3:58 PM David Matlack <dmatlack@google.com> wrote:
> >
> > Factor out the logic to atomically replace an SPTE with an SPTE that
> > points to a new page table. This will be used in a follow-up commit to
> > split a large page SPTE into one level lower.
> >
> > Signed-off-by: David Matlack <dmatlack@google.com>
> > ---
[...]
> > -               if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
> > -                       tdp_mmu_link_page(vcpu->kvm, sp,
> > -                                         fault->huge_page_disallowed &&
> > -                                         fault->req_level >= iter.level);
> > -
> > -                       trace_kvm_mmu_get_page(sp, true);
>
> This refactoring drops this trace point. Is that intentional?

Yes it was intentional, but I forgot to describe it in the commit
message. Good catch. This tracepoint is redundant with the one in
alloc_tdp_mmu_page(). I'll update the commit message for v1.

> > -               } else {
> > -                       tdp_mmu_free_sp(sp);
> > +               if (!tdp_mmu_install_sp_atomic(vcpu->kvm, &iter, sp, account_nx))
> >                         break;
> > -               }
> >         }
> >  }
> >
> > --
> > 2.34.0.rc2.393.gf8c9666880-goog
> >
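[Editorial note: the tracepoint in question is already emitted when the page
table is allocated, which is why it is called redundant above. The sketch of
alloc_tdp_mmu_page() below is reconstructed from memory of the tdp_mmu.c of
that era purely for illustration; field names and ordering are approximate and
not taken from this series. Only the trailing trace_kvm_mmu_get_page() call,
which the source message confirms, is the point being illustrated.]

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
                                               int level)
{
        struct kvm_mmu_page *sp;

        /* The per-vCPU caches are topped up before the fault is handled. */
        sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
        sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

        sp->role.word = page_role_for_level(vcpu, level).word;
        sp->gfn = gfn;
        sp->tdp_mmu_page = true;

        /* Same event the refactored kvm_tdp_mmu_map() no longer traces. */
        trace_kvm_mmu_get_page(sp, true);

        return sp;
}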
On Fri, Nov 19, 2021, David Matlack wrote:
> Factor out the logic to atomically replace an SPTE with an SPTE that
> points to a new page table. This will be used in a follow-up commit to
> split a large page SPTE into one level lower.
>
> Signed-off-by: David Matlack <dmatlack@google.com>
> ---
>  arch/x86/kvm/mmu/tdp_mmu.c | 53 ++++++++++++++++++++++++++------------
>  1 file changed, 37 insertions(+), 16 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index cc9fe33c9b36..9ee3f4f7fdf5 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -945,6 +945,39 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
>         return ret;
>  }
>
> +/*
> + * tdp_mmu_install_sp_atomic - Atomically replace the given spte with an
> + * spte pointing to the provided page table.
> + *
> + * @kvm: kvm instance
> + * @iter: a tdp_iter instance currently on the SPTE that should be set
> + * @sp: The new TDP page table to install.
> + * @account_nx: True if this page table is being installed to split a
> + *              non-executable huge page.
> + *
> + * Returns: True if the new page table was installed. False if spte being
> + *          replaced changed, causing the atomic compare-exchange to fail.
> + *          If this function returns false the sp will be freed before
> + *          returning.
> + */
> +static bool tdp_mmu_install_sp_atomic(struct kvm *kvm,
> +                                      struct tdp_iter *iter,
> +                                      struct kvm_mmu_page *sp,
> +                                      bool account_nx)
> +{
> +        u64 spte;
> +
> +        spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);

This can easily go on one line.

        u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);

> +
> +        if (tdp_mmu_set_spte_atomic(kvm, iter, spte)) {
> +                tdp_mmu_link_page(kvm, sp, account_nx);
> +                return true;
> +        } else {
> +                tdp_mmu_free_sp(sp);
> +                return false;

I don't think this helper should free the sp on failure, even if that's what all
paths end up doing.  When reading the calling code, it really looks like the sp
is being leaked because the allocation and free are in different contexts.  That
the sp is consumed on success is fairly intuitive given the "install" action, but
freeing on failure not so much.

And for the eager splitting, freeing on failure is wasteful.  It's extremely
unlikely to happen often, so in practice it's unlikely to be an issue, but it's
certainly odd since the loop is likely going to immediately allocate another sp,
either for the current spte or for the next spte.

Side topic, tdp_mmu_set_spte_atomic() and friends really should return 0/-EBUSY.
Boolean returns for errors usually end in tears sooner or later.
On Wed, Dec 1, 2021 at 11:14 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Fri, Nov 19, 2021, David Matlack wrote:
> > Factor out the logic to atomically replace an SPTE with an SPTE that
> > points to a new page table. This will be used in a follow-up commit to
> > split a large page SPTE into one level lower.
> >
> > Signed-off-by: David Matlack <dmatlack@google.com>
> > ---
[...]
> > +        u64 spte;
> > +
> > +        spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
>
> This can easily go on one line.
>
>         u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
>
> > +
> > +        if (tdp_mmu_set_spte_atomic(kvm, iter, spte)) {
> > +                tdp_mmu_link_page(kvm, sp, account_nx);
> > +                return true;
> > +        } else {
> > +                tdp_mmu_free_sp(sp);
> > +                return false;
>
> I don't think this helper should free the sp on failure, even if that's what all
> paths end up doing.  When reading the calling code, it really looks like the sp
> is being leaked because the allocation and free are in different contexts.  That
> the sp is consumed on success is fairly intuitive given the "install" action, but
> freeing on failure not so much.
>
> And for the eager splitting, freeing on failure is wasteful.  It's extremely
> unlikely to happen often, so in practice it's unlikely to be an issue, but it's
> certainly odd since the loop is likely going to immediately allocate another sp,
> either for the current spte or for the next spte.

Good point. I'll fix this in v1.

>
> Side topic, tdp_mmu_set_spte_atomic() and friends really should return 0/-EBUSY.
> Boolean returns for errors usually end in tears sooner or later.

Agreed. I was sticking with local style here but would like to see more of
this code switch to returning ints. I'll take a look at including that
cleanup as well in v1, if not a separate pre-series.
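[Editorial note: a minimal sketch of the direction agreed on above, i.e. the
helper returning 0/-EBUSY and leaving a failed install's sp for the caller to
free. This is not code from a later revision of the series; it assumes, as
discussed, that tdp_mmu_set_spte_atomic() has likewise been converted to
return 0 on success and -EBUSY when the compare-exchange loses the race.]

static int tdp_mmu_install_sp_atomic(struct kvm *kvm,
                                     struct tdp_iter *iter,
                                     struct kvm_mmu_page *sp,
                                     bool account_nx)
{
        u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
        int ret;

        /* Assumed to return 0 or -EBUSY rather than a bool (see above). */
        ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
        if (ret)
                return ret;

        tdp_mmu_link_page(kvm, sp, account_nx);
        return 0;
}

With this shape, the caller in kvm_tdp_mmu_map() that allocated the sp would
free or reuse it itself when the install fails, keeping the allocation and the
free in the same context.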
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index cc9fe33c9b36..9ee3f4f7fdf5 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -945,6 +945,39 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
        return ret;
 }
 
+/*
+ * tdp_mmu_install_sp_atomic - Atomically replace the given spte with an
+ * spte pointing to the provided page table.
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @sp: The new TDP page table to install.
+ * @account_nx: True if this page table is being installed to split a
+ *              non-executable huge page.
+ *
+ * Returns: True if the new page table was installed. False if spte being
+ *          replaced changed, causing the atomic compare-exchange to fail.
+ *          If this function returns false the sp will be freed before
+ *          returning.
+ */
+static bool tdp_mmu_install_sp_atomic(struct kvm *kvm,
+                                      struct tdp_iter *iter,
+                                      struct kvm_mmu_page *sp,
+                                      bool account_nx)
+{
+        u64 spte;
+
+        spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
+
+        if (tdp_mmu_set_spte_atomic(kvm, iter, spte)) {
+                tdp_mmu_link_page(kvm, sp, account_nx);
+                return true;
+        } else {
+                tdp_mmu_free_sp(sp);
+                return false;
+        }
+}
+
 /*
  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
  * page tables and SPTEs to translate the faulting guest physical address.
@@ -954,8 +987,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        struct tdp_iter iter;
        struct kvm_mmu_page *sp;
-       u64 *child_pt;
-       u64 new_spte;
        int ret;
 
        kvm_mmu_hugepage_adjust(vcpu, fault);
@@ -983,6 +1014,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
        }
 
        if (!is_shadow_present_pte(iter.old_spte)) {
+                bool account_nx = fault->huge_page_disallowed &&
+                                  fault->req_level >= iter.level;
+
                /*
                 * If SPTE has been frozen by another thread, just
                 * give up and retry, avoiding unnecessary page table
@@ -992,21 +1026,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
                        break;
 
                sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
-               child_pt = sp->spt;
-
-               new_spte = make_nonleaf_spte(child_pt,
-                                            !shadow_accessed_mask);
-
-               if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
-                       tdp_mmu_link_page(vcpu->kvm, sp,
-                                         fault->huge_page_disallowed &&
-                                         fault->req_level >= iter.level);
-
-                       trace_kvm_mmu_get_page(sp, true);
-               } else {
-                       tdp_mmu_free_sp(sp);
+               if (!tdp_mmu_install_sp_atomic(vcpu->kvm, &iter, sp, account_nx))
                        break;
-               }
        }
 }
Factor out the logic to atomically replace an SPTE with an SPTE that
points to a new page table. This will be used in a follow-up commit to
split a large page SPTE into one level lower.

Signed-off-by: David Matlack <dmatlack@google.com>
---
 arch/x86/kvm/mmu/tdp_mmu.c | 53 ++++++++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 16 deletions(-)