| Message ID | 20210611235701.3941724-6-dmatlack@google.com (mailing list archive) |
|---|---|
| State | New, archived |
| Series | KVM: x86/mmu: Fast page fault support for the TDP MMU |
On Fri, Jun 11, 2021 at 4:57 PM David Matlack <dmatlack@google.com> wrote:
>
> In order to use walk_shadow_page_lockless() in fast_page_fault() we need
> to also record the spteps.
>
> No functional change intended.
>
> Signed-off-by: David Matlack <dmatlack@google.com>

Reviewed-by: Ben Gardon <bgardon@google.com>

> ---
>  arch/x86/kvm/mmu/mmu.c          | 1 +
>  arch/x86/kvm/mmu/mmu_internal.h | 3 +++
>  arch/x86/kvm/mmu/tdp_mmu.c      | 1 +
>  3 files changed, 5 insertions(+)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 8140c262f4d3..765f5b01768d 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3538,6 +3538,7 @@ static bool walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
>                 spte = mmu_spte_get_lockless(it.sptep);
>                 walk->last_level = it.level;
>                 walk->sptes[it.level] = spte;
> +               walk->spteps[it.level] = it.sptep;
>
>                 if (!is_shadow_present_pte(spte))
>                         break;
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 26da6ca30fbf..0fefbd5d6c95 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -178,6 +178,9 @@ struct shadow_page_walk {
>
>         /* The spte value at each level. */
>         u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
> +
> +       /* The spte pointers at each level. */
> +       u64 *spteps[PT64_ROOT_MAX_LEVEL + 1];
>  };
>
>  #endif /* __KVM_X86_MMU_INTERNAL_H */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 36f4844a5f95..7279d17817a1 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1529,6 +1529,7 @@ bool kvm_tdp_mmu_walk_lockless(struct kvm_vcpu *vcpu, u64 addr,
>
>                 walk->last_level = iter.level;
>                 walk->sptes[iter.level] = iter.old_spte;
> +               walk->spteps[iter.level] = iter.sptep;
>         }
>
>         return walk_ok;
> --
> 2.32.0.272.g935e593368-goog
>
On Fri, Jun 11, 2021 at 11:56:58PM +0000, David Matlack wrote:
> In order to use walk_shadow_page_lockless() in fast_page_fault() we need
> to also record the spteps.
>
> No functional change intended.
>
> Signed-off-by: David Matlack <dmatlack@google.com>
> ---
>  arch/x86/kvm/mmu/mmu.c          | 1 +
>  arch/x86/kvm/mmu/mmu_internal.h | 3 +++
>  arch/x86/kvm/mmu/tdp_mmu.c      | 1 +
>  3 files changed, 5 insertions(+)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 8140c262f4d3..765f5b01768d 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3538,6 +3538,7 @@ static bool walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
>                 spte = mmu_spte_get_lockless(it.sptep);
>                 walk->last_level = it.level;
>                 walk->sptes[it.level] = spte;
> +               walk->spteps[it.level] = it.sptep;
>
>                 if (!is_shadow_present_pte(spte))
>                         break;
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 26da6ca30fbf..0fefbd5d6c95 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -178,6 +178,9 @@ struct shadow_page_walk {
>
>         /* The spte value at each level. */
>         u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
> +
> +       /* The spte pointers at each level. */
> +       u64 *spteps[PT64_ROOT_MAX_LEVEL + 1];
>  };
>
>  #endif /* __KVM_X86_MMU_INTERNAL_H */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 36f4844a5f95..7279d17817a1 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1529,6 +1529,7 @@ bool kvm_tdp_mmu_walk_lockless(struct kvm_vcpu *vcpu, u64 addr,
>
>                 walk->last_level = iter.level;
>                 walk->sptes[iter.level] = iter.old_spte;
> +               walk->spteps[iter.level] = iter.sptep;

I think this should technically be:

        walk->spteps[iter.level] = rcu_dereference(iter.sptep);

>         }
>
>         return walk_ok;
> --
> 2.32.0.272.g935e593368-goog
>
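For context on the suggestion in the reply above: in the TDP MMU, iter.sptep carries an __rcu annotation (via the tdp_ptep_t typedef), while walk->spteps[] is declared as plain u64 *, so a bare assignment mixes Sparse address spaces. A minimal, illustrative sketch of the mismatch, assuming tdp_ptep_t is roughly the typedef shown below (not code from the series):

/*
 * Illustration only: why the bare assignment draws a Sparse warning.
 * tdp_ptep_t is assumed to carry the __rcu annotation, roughly as below.
 */
typedef u64 __rcu *tdp_ptep_t;

struct walk_sketch {
        /* Plain pointers, no __rcu annotation (shared with the legacy MMU). */
        u64 *spteps[5 + 1];
};

static void record_sptep(struct walk_sketch *walk, int level, tdp_ptep_t sptep)
{
        /*
         * "walk->spteps[level] = sptep;" assigns an __rcu pointer to a plain
         * one, which Sparse flags.  rcu_dereference() strips the annotation
         * and documents that the access must happen inside an RCU read-side
         * critical section, which the lockless walk holds for the TDP MMU.
         */
        walk->spteps[level] = rcu_dereference(sptep);
}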
On Fri, Jun 11, 2021, David Matlack wrote:
> In order to use walk_shadow_page_lockless() in fast_page_fault() we need
> to also record the spteps.
>
> No functional change intended.
>
> Signed-off-by: David Matlack <dmatlack@google.com>
> ---
>  arch/x86/kvm/mmu/mmu.c          | 1 +
>  arch/x86/kvm/mmu/mmu_internal.h | 3 +++
>  arch/x86/kvm/mmu/tdp_mmu.c      | 1 +
>  3 files changed, 5 insertions(+)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 8140c262f4d3..765f5b01768d 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3538,6 +3538,7 @@ static bool walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
>                 spte = mmu_spte_get_lockless(it.sptep);
>                 walk->last_level = it.level;
>                 walk->sptes[it.level] = spte;
> +               walk->spteps[it.level] = it.sptep;
>
>                 if (!is_shadow_present_pte(spte))
>                         break;
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 26da6ca30fbf..0fefbd5d6c95 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -178,6 +178,9 @@ struct shadow_page_walk {
>
>         /* The spte value at each level. */
>         u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
> +
> +       /* The spte pointers at each level. */
> +       u64 *spteps[PT64_ROOT_MAX_LEVEL + 1];

Hrm.  I'm not sure how I feel about this patch, or about shadow_page_walk in
general.  On the one hand, I like having a common API.  On the other hand, I
really don't like mixing two different protection schemes, e.g. this really
should be

        tdp_ptep_t spteps;

in order to gain the RCU annotations for TDP, but those RCU annotations are
problematic because the legacy MMU doesn't use RCU to protect its page tables.

I also don't like forcing the caller to hold the "lock" for longer than is
necessary, e.g. get_mmio_spte() doesn't require access to the page tables after
the initial walk, but the common spteps[] kinda sorta forces its hand.

The two use cases (and the only common use cases I can see) have fairly different
requirements.  The MMIO check wants the SPTEs at _all_ levels, whereas the fast
page fault handler wants the SPTE _and_ its pointer at a single level.  So I
wonder if by providing a super generic API we'd actually increase complexity.

I.e. rather than provide a completely generic API, maybe it would be better to
have two distinct APIs.  That wouldn't fix the tdp_ptep_t issue, but it would at
least bound it to some degree and make the code more obvious.  I suspect it would
also reduce the code churn, though that's not necessarily a goal in and of itself.

E.g. for fast_page_fault():

        walk_shadow_page_lockless_begin(vcpu);

        do {
                sptep = get_spte_lockless(..., &spte);
                if (!is_shadow_present_pte(spte))
                        break;

                sp = sptep_to_sp(sptep);
                if (!is_last_spte(spte, sp->role.level))
                        break;

                ...
        } while(true);

        walk_shadow_page_lockless_end(vcpu);

and for get_mmio_spte():

        walk_shadow_page_lockless_begin(vcpu);
        leaf = get_sptes_lockless(vcpu, addr, sptes, &root);
        if (unlikely(leaf < 0)) {
                *sptep = 0ull;
                return reserved;
        }

        walk_shadow_page_lockless_end(vcpu);

And if we look at the TDP MMU implementations, they aren't sharing _that_ much
code, and the code that is shared isn't all that interesting, e.g. if we really
wanted to we could macro-magic away the boilerplate, but I think even I would
balk at the result :-)

int kvm_tdp_mmu_get_sptes_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
                                   int *root_level)
{
        struct tdp_iter iter;
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        gfn_t gfn = addr >> PAGE_SHIFT;
        int leaf = -1;

        *root_level = vcpu->arch.mmu->shadow_root_level;

        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                leaf = iter.level;
                sptes[leaf] = iter.old_spte;
        }

        return leaf;
}

u64 *kvm_tdp_mmu_get_spte_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *spte)
{
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        gfn_t gfn = addr >> PAGE_SHIFT;
        struct tdp_iter iter;
        u64 *sptep = NULL;

        *spte = 0ull;

        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                /*
                 * Here be a comment about the unfortunate differences between
                 * the TDP MMU and the legacy MMU.
                 */
                sptep = (u64 * __force)iter.sptep;
                *spte = iter.old_spte;
        }
        return sptep;
}
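As a counterpart to the TDP MMU helpers sketched above, the legacy (shadow) MMU half of such a split API could plausibly reuse the existing lockless walker in mmu.c. A rough sketch, assuming the for_each_shadow_entry_lockless() iterator and struct kvm_shadow_walk_iterator as they exist in mmu.c; this is illustrative only, not code from the series:

/*
 * Sketch of a shadow-MMU counterpart to the proposed get_spte_lockless().
 * The caller is assumed to hold walk_shadow_page_lockless_begin()/end()
 * around the call; no RCU annotations are involved because the legacy MMU
 * does not free its page tables via RCU.
 */
static u64 *get_shadow_spte_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *spte)
{
        struct kvm_shadow_walk_iterator iterator;
        u64 *sptep = NULL;
        u64 old_spte;

        *spte = 0ull;

        for_each_shadow_entry_lockless(vcpu, addr, iterator, old_spte) {
                /* Record the last visited entry's pointer and value. */
                sptep = iterator.sptep;
                *spte = old_spte;

                if (!is_shadow_present_pte(old_spte))
                        break;
        }

        return sptep;
}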
On Mon, Jun 14, 2021 at 10:59:14PM +0000, Sean Christopherson wrote:
> On Fri, Jun 11, 2021, David Matlack wrote:
> > In order to use walk_shadow_page_lockless() in fast_page_fault() we need
> > to also record the spteps.
> >
> > No functional change intended.
> >
> > Signed-off-by: David Matlack <dmatlack@google.com>
> > ---
> >  arch/x86/kvm/mmu/mmu.c          | 1 +
> >  arch/x86/kvm/mmu/mmu_internal.h | 3 +++
> >  arch/x86/kvm/mmu/tdp_mmu.c      | 1 +
> >  3 files changed, 5 insertions(+)
> >
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 8140c262f4d3..765f5b01768d 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -3538,6 +3538,7 @@ static bool walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
> >                 spte = mmu_spte_get_lockless(it.sptep);
> >                 walk->last_level = it.level;
> >                 walk->sptes[it.level] = spte;
> > +               walk->spteps[it.level] = it.sptep;
> >
> >                 if (!is_shadow_present_pte(spte))
> >                         break;
> > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> > index 26da6ca30fbf..0fefbd5d6c95 100644
> > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > @@ -178,6 +178,9 @@ struct shadow_page_walk {
> >
> >         /* The spte value at each level. */
> >         u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
> > +
> > +       /* The spte pointers at each level. */
> > +       u64 *spteps[PT64_ROOT_MAX_LEVEL + 1];
>
> Hrm.  I'm not sure how I feel about this patch, or about shadow_page_walk in
> general.  On the one hand, I like having a common API.  On the other hand, I
> really don't like mixing two different protection schemes, e.g. this really
> should be
>
>         tdp_ptep_t spteps;
>
> in order to gain the RCU annotations for TDP, but those RCU annotations are
> problematic because the legacy MMU doesn't use RCU to protect its page tables.
>
> I also don't like forcing the caller to hold the "lock" for longer than is
> necessary, e.g. get_mmio_spte() doesn't require access to the page tables after
> the initial walk, but the common spteps[] kinda sorta forces its hand.

Yeah this felt gross implementing. I like your idea to create separate
APIs instead.

>
> The two use cases (and the only common use cases I can see) have fairly different
> requirements.  The MMIO check wants the SPTEs at _all_ levels, whereas the fast
> page fault handler wants the SPTE _and_ its pointer at a single level.  So I
> wonder if by providing a super generic API we'd actually increase complexity.
>
> I.e. rather than provide a completely generic API, maybe it would be better to
> have two distinct APIs.  That wouldn't fix the tdp_ptep_t issue, but it would at
> least bound it to some degree and make the code more obvious.

Does the tdp_ptep_t issue go away if kvm_tdp_mmu_get_spte_lockless
returns an rcu_dereference'd version of the pointer? See below.

> I suspect it would
> also reduce the code churn, though that's not necessarily a goal in and of itself.

Makes sense. So keep walk_shadow_page_lockless_{begin,end}() as a generic
API but provide separate helper functions for get_mmio_spte and
fast_page_fault to get each exactly what each needs.

>
> E.g. for fast_page_fault():
>
>         walk_shadow_page_lockless_begin(vcpu);
>
>         do {
>                 sptep = get_spte_lockless(..., &spte);
>                 if (!is_shadow_present_pte(spte))
>                         break;
>
>                 sp = sptep_to_sp(sptep);
>                 if (!is_last_spte(spte, sp->role.level))
>                         break;
>
>                 ...
>         } while(true);
>
>         walk_shadow_page_lockless_end(vcpu);
>
>
> and for get_mmio_spte():
>
>         walk_shadow_page_lockless_begin(vcpu);
>         leaf = get_sptes_lockless(vcpu, addr, sptes, &root);
>         if (unlikely(leaf < 0)) {
>                 *sptep = 0ull;
>                 return reserved;
>         }
>
>         walk_shadow_page_lockless_end(vcpu);
>
>
> And if we look at the TDP MMU implementations, they aren't sharing _that_ much
> code, and the code that is shared isn't all that interesting, e.g. if we really
> wanted to we could macro-magic away the boilerplate, but I think even I would
> balk at the result :-)

Agreed.

>
> int kvm_tdp_mmu_get_sptes_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
>                                    int *root_level)
> {
>         struct tdp_iter iter;
>         struct kvm_mmu *mmu = vcpu->arch.mmu;
>         gfn_t gfn = addr >> PAGE_SHIFT;
>         int leaf = -1;
>
>         *root_level = vcpu->arch.mmu->shadow_root_level;
>
>         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
>                 leaf = iter.level;
>                 sptes[leaf] = iter.old_spte;
>         }
>
>         return leaf;
> }
>
> u64 *kvm_tdp_mmu_get_spte_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *spte)
> {
>         struct kvm_mmu *mmu = vcpu->arch.mmu;
>         gfn_t gfn = addr >> PAGE_SHIFT;
>         struct tdp_iter iter;
>         u64 *sptep = NULL;
>
>         *spte = 0ull;
>
>         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
>                 /*
>                  * Here be a comment about the unfortunate differences between
>                  * the TDP MMU and the legacy MMU.
>                  */
>                 sptep = (u64 * __force)iter.sptep;

Instead, should this be:

        sptep = rcu_dereference(iter.sptep);

?

>                 *spte = iter.old_spte;
>         }
>         return sptep;
> }
>
On Mon, Jun 14, 2021, David Matlack wrote:
> On Mon, Jun 14, 2021 at 10:59:14PM +0000, Sean Christopherson wrote:
> > The two use cases (and the only common use cases I can see) have fairly different
> > requirements.  The MMIO check wants the SPTEs at _all_ levels, whereas the fast
> > page fault handler wants the SPTE _and_ its pointer at a single level.  So I
> > wonder if by providing a super generic API we'd actually increase complexity.
> >
> > I.e. rather than provide a completely generic API, maybe it would be better to
> > have two distinct APIs.  That wouldn't fix the tdp_ptep_t issue, but it would at
> > least bound it to some degree and make the code more obvious.
>
> Does the tdp_ptep_t issue go away if kvm_tdp_mmu_get_spte_lockless
> returns an rcu_dereference'd version of the pointer? See below.

Sort of?

> > u64 *kvm_tdp_mmu_get_spte_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *spte)
> > {
> >         struct kvm_mmu *mmu = vcpu->arch.mmu;
> >         gfn_t gfn = addr >> PAGE_SHIFT;
> >         struct tdp_iter iter;
> >         u64 *sptep = NULL;
> >
> >         *spte = 0ull;
> >
> >         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
> >                 /*
> >                  * Here be a comment about the unfortunate differences between
> >                  * the TDP MMU and the legacy MMU.
> >                  */
> >                 sptep = (u64 * __force)iter.sptep;
>
> Instead, should this be:
>
>         sptep = rcu_dereference(iter.sptep);
>
> ?

It's not wrong per se, but it's cheating in some sense.  The problem is that
it's not the pointer itself that's RCU-protected, rather it's what it's pointing
at (the page tables) that's RCU-protected.  E.g. if this were reading the _value_,
it would be a-ok to do:

        spte = READ_ONCE(*rcu_dereference(iter.sptep));

and return the value because the caller gets a copy of the RCU-protected data.

Reading multiple times from a single dereference is also ok, e.g. see
kvm_bitmap_or_dest_vcpus() and several other APIC helpers, but note how they all
take and release the RCU lock in a single block and don't return the protected
pointer to the caller.

Stripping the __rcu annotation without actually being able to guarantee that
the caller is going to do the right thing with the pointer is why I say it's
"cheating".  E.g. if the caller were to do:

        u64 get_spte_lockless_broken(...)
        {
                u64 *sptep;

                rcu_read_lock();
                sptep = kvm_tdp_mmu_get_spte_lockless(...);
                rcu_read_unlock();

                return READ_ONCE(*sptep);
        }

it would violate RCU but Sparse would be none the wiser.

The __force ugliness is also cheating, but it's also loudly stating that we're
intentionally cheating, hence the request for a comment.

> >                 *spte = iter.old_spte;
> >         }
> >         return sptep;
> > }
> >
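For contrast with the broken caller above, a sketch of the pattern the reply describes as acceptable: the returned pointer is only dereferenced inside the same read-side critical section, and only the copied value escapes. This assumes walk_shadow_page_lockless_begin()/end() provide the RCU read-side section for the TDP MMU and uses the kvm_tdp_mmu_get_spte_lockless() helper sketched earlier in the thread; it is illustrative, not code from the series:

/*
 * Sketch of a well-behaved caller: the pointer returned by the lockless
 * helper is used only while the walk (and thus the RCU read-side critical
 * section for the TDP MMU) is still active; only the value escapes.
 */
static u64 get_spte_lockless_ok(struct kvm_vcpu *vcpu, u64 addr)
{
        u64 *sptep;
        u64 spte;

        walk_shadow_page_lockless_begin(vcpu);          /* rcu_read_lock() for the TDP MMU */

        sptep = kvm_tdp_mmu_get_spte_lockless(vcpu, addr, &spte);
        if (sptep)
                spte = READ_ONCE(*sptep);               /* still inside the critical section */

        walk_shadow_page_lockless_end(vcpu);            /* rcu_read_unlock() */

        /* The protected pointer does not leave the critical section. */
        return spte;
}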
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8140c262f4d3..765f5b01768d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3538,6 +3538,7 @@ static bool walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
                 spte = mmu_spte_get_lockless(it.sptep);
                 walk->last_level = it.level;
                 walk->sptes[it.level] = spte;
+                walk->spteps[it.level] = it.sptep;
 
                 if (!is_shadow_present_pte(spte))
                         break;
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 26da6ca30fbf..0fefbd5d6c95 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -178,6 +178,9 @@ struct shadow_page_walk {
 
         /* The spte value at each level. */
         u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
+
+        /* The spte pointers at each level. */
+        u64 *spteps[PT64_ROOT_MAX_LEVEL + 1];
 };
 
 #endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 36f4844a5f95..7279d17817a1 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1529,6 +1529,7 @@ bool kvm_tdp_mmu_walk_lockless(struct kvm_vcpu *vcpu, u64 addr,
 
                 walk->last_level = iter.level;
                 walk->sptes[iter.level] = iter.old_spte;
+                walk->spteps[iter.level] = iter.sptep;
         }
 
         return walk_ok;
In order to use walk_shadow_page_lockless() in fast_page_fault() we need
to also record the spteps.

No functional change intended.

Signed-off-by: David Matlack <dmatlack@google.com>
---
 arch/x86/kvm/mmu/mmu.c          | 1 +
 arch/x86/kvm/mmu/mmu_internal.h | 3 +++
 arch/x86/kvm/mmu/tdp_mmu.c      | 1 +
 3 files changed, 5 insertions(+)
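The consumer of walk->spteps[] is not part of this patch; it lands later in the series. Purely as an illustration of why fast_page_fault() needs the pointer and not just the value, a hypothetical user might look roughly like the sketch below. It assumes, for the sake of the sketch, that the walk helper leaves the walk_shadow_page_lockless_begin()/end() protection to the caller and that the pointer remains valid while that protection is held (exactly the lifetime question debated in the review above); the function and its shape are not taken from the series:

/*
 * Hypothetical consumer of the pointers recorded by this patch.  The pointer
 * is needed so the SPTE can be updated atomically in place; the recorded
 * value alone would not allow that.
 */
static bool fast_fixup_sketch(struct kvm_vcpu *vcpu, gpa_t gpa, u64 new_spte)
{
        struct shadow_page_walk walk;
        bool fixed = false;
        u64 *sptep;
        u64 spte;

        walk_shadow_page_lockless_begin(vcpu);

        if (!walk_shadow_page_lockless(vcpu, gpa, &walk))
                goto out;

        spte  = walk.sptes[walk.last_level];
        sptep = walk.spteps[walk.last_level];   /* pointer recorded by this patch */

        if (!is_shadow_present_pte(spte))
                goto out;

        /* Access/writable-bit fixups elided; attempt the in-place update. */
        if (cmpxchg64(sptep, spte, new_spte) == spte)
                fixed = true;
out:
        walk_shadow_page_lockless_end(vcpu);
        return fixed;
}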