| Message ID | 20210302184540.2829328-4-seanjc@google.com (mailing list archive) |
| --- | --- |
| State | New, archived |
| Series | KVM: x86/mmu: Lots of bug fixes |
On Tue, Mar 2, 2021 at 10:46 AM Sean Christopherson <seanjc@google.com> wrote:
>
> Hold the mmu_lock for write for the entire duration of allocating and
> initializing an MMU's roots.  This ensures there are MMU pages available
> and thus prevents root allocations from failing.  That in turn fixes a
> bug where KVM would fail to free valid PAE roots if one of the later
> roots failed to allocate.
>
> Note, KVM still leaks the PAE roots if the lm_root allocation fails.
> This will be addressed in a future commit.
>
> Cc: Ben Gardon <bgardon@google.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>

Reviewed-by: Ben Gardon <bgardon@google.com>

Very tidy cleanup!

> ---
>  arch/x86/kvm/mmu/mmu.c     | 41 ++++++++++++--------------------------
>  arch/x86/kvm/mmu/tdp_mmu.c | 23 +++++----------------
>  2 files changed, 18 insertions(+), 46 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 2ed3fac1244e..1f129001a30c 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -2398,6 +2398,9 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
>  {
>         unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
>
> +       /* Ensure all four PAE roots can be allocated in a single pass. */
> +       BUILD_BUG_ON(KVM_MIN_FREE_MMU_PAGES < 4);
> +

For a second I thought that this should be 5, since a page is needed to
hold the 4 PAE roots, but that page is allocated at vCPU creation and
reused, so there's no need to check for it here.

>         if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
>                 return 0;
>
> @@ -3220,16 +3223,9 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
>  {
>         struct kvm_mmu_page *sp;
>
> -       write_lock(&vcpu->kvm->mmu_lock);
> -
> -       if (make_mmu_pages_available(vcpu)) {
> -               write_unlock(&vcpu->kvm->mmu_lock);
> -               return INVALID_PAGE;
> -       }
>         sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
>         ++sp->root_count;
>
> -       write_unlock(&vcpu->kvm->mmu_lock);
>         return __pa(sp->spt);
>  }
>
> @@ -3241,16 +3237,10 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
>
>         if (is_tdp_mmu_enabled(vcpu->kvm)) {
>                 root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
> -
> -               if (!VALID_PAGE(root))
> -                       return -ENOSPC;
>                 vcpu->arch.mmu->root_hpa = root;
>         } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
>                 root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
>                                       true);
> -
> -               if (!VALID_PAGE(root))
> -                       return -ENOSPC;

There's so much going on in mmu_alloc_root that removing this check
makes me nervous, but I think it should be safe. I checked through the
function because I was worried it might yield somewhere in there, which
could result in the page cache being emptied and the allocation failing,
but I don't think mmu_alloc_root will yield.
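To make the yield concern concrete: the shadow MMU relies on a
topup-then-consume pattern, where memory caches are filled with sleeping
allocations before mmu_lock is taken, so that allocations under the lock
can neither sleep nor fail. A minimal sketch of that pattern follows;
kvm_mmu_topup_memory_cache(), kvm_mmu_memory_cache_alloc() and
vcpu->arch.mmu_shadow_page_cache are real KVM facilities, but the
function wrapping them is made up for illustration:

```c
#include <linux/kvm_host.h>

/* Illustrative only; not code from this patch. */
static int example_topup_then_alloc(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_memory_cache *mc = &vcpu->arch.mmu_shadow_page_cache;
	void *obj;
	int r;

	/* May sleep and may fail; mmu_lock must not be held yet. */
	r = kvm_mmu_topup_memory_cache(mc, 4);
	if (r)
		return r;

	write_lock(&vcpu->kvm->mmu_lock);

	/*
	 * Cannot sleep or fail: the object was pre-allocated above.  If
	 * anything dropped the lock and consumed the cache in between,
	 * this guarantee would evaporate, which is exactly the yield
	 * concern raised in the review.
	 */
	obj = kvm_mmu_memory_cache_alloc(mc);

	/* ... link obj into the page tables ... */

	write_unlock(&vcpu->kvm->mmu_lock);
	return 0;
}
```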
>                 vcpu->arch.mmu->root_hpa = root;
>         } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
>                 for (i = 0; i < 4; ++i) {
> @@ -3258,8 +3248,6 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
>
>                         root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
>                                               i << 30, PT32_ROOT_LEVEL, true);
> -                       if (!VALID_PAGE(root))
> -                               return -ENOSPC;
>                         vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
>                 }
>                 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
> @@ -3294,8 +3282,6 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>
>                 root = mmu_alloc_root(vcpu, root_gfn, 0,
>                                       vcpu->arch.mmu->shadow_root_level, false);
> -               if (!VALID_PAGE(root))
> -                       return -ENOSPC;
>                 vcpu->arch.mmu->root_hpa = root;
>                 goto set_root_pgd;
>         }
> @@ -3325,6 +3311,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>
>         for (i = 0; i < 4; ++i) {
>                 MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
> +
>                 if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
>                         pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
>                         if (!(pdptr & PT_PRESENT_MASK)) {
> @@ -3338,8 +3325,6 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>
>                 root = mmu_alloc_root(vcpu, root_gfn, i << 30,
>                                       PT32_ROOT_LEVEL, false);
> -               if (!VALID_PAGE(root))
> -                       return -ENOSPC;
>                 vcpu->arch.mmu->pae_root[i] = root | pm_mask;
>         }
>         vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
> @@ -3373,14 +3358,6 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>         return 0;
>  }
>
> -static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
> -{
> -       if (vcpu->arch.mmu->direct_map)
> -               return mmu_alloc_direct_roots(vcpu);
> -       else
> -               return mmu_alloc_shadow_roots(vcpu);
> -}
> -
>  void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
>  {
>         int i;
> @@ -4822,7 +4799,15 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
>         r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
>         if (r)
>                 goto out;
> -       r = mmu_alloc_roots(vcpu);
> +       write_lock(&vcpu->kvm->mmu_lock);
> +       if (make_mmu_pages_available(vcpu))
> +               r = -ENOSPC;
> +       else if (vcpu->arch.mmu->direct_map)
> +               r = mmu_alloc_direct_roots(vcpu);
> +       else
> +               r = mmu_alloc_shadow_roots(vcpu);
> +       write_unlock(&vcpu->kvm->mmu_lock);
> +
>         kvm_mmu_sync_roots(vcpu);
>         if (r)
>                 goto out;
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 70226e0875fe..50ef757c5586 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -137,22 +137,21 @@ static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
>         return sp;
>  }
>
> -static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
> +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
>  {
>         union kvm_mmu_page_role role;
>         struct kvm *kvm = vcpu->kvm;
>         struct kvm_mmu_page *root;
>
> +       lockdep_assert_held_write(&kvm->mmu_lock);
> +
>         role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
>
> -       write_lock(&kvm->mmu_lock);
> -
>         /* Check for an existing root before allocating a new one. */
>         for_each_tdp_mmu_root(kvm, root) {
>                 if (root->role.word == role.word) {
>                         kvm_mmu_get_root(kvm, root);
> -                       write_unlock(&kvm->mmu_lock);
> -                       return root;
> +                       goto out;
>                 }
>         }
>
> @@ -161,19 +160,7 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
>
>         list_add(&root->link, &kvm->arch.tdp_mmu_roots);
>
> -       write_unlock(&kvm->mmu_lock);
> -
> -       return root;
> -}
> -
> -hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
> -{
> -       struct kvm_mmu_page *root;
> -
> -       root = get_tdp_mmu_vcpu_root(vcpu);
> -       if (!root)
> -               return INVALID_PAGE;
> -
> +out:
>         return __pa(root->spt);
>  }
>
> --
> 2.30.1.766.gb4fecdf3b7-goog
>
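For readers skimming the tdp_mmu.c hunks: the patch converts
kvm_tdp_mmu_get_vcpu_root_hpa() from taking mmu_lock itself to asserting
that the caller already holds it. A generic, self-contained illustration
of that caller-holds-the-lock pattern (kernel-style example code, not
KVM's):

```c
#include <linux/lockdep.h>
#include <linux/spinlock.h>

static DEFINE_RWLOCK(example_lock);

/*
 * The callee no longer takes the lock itself; it documents and enforces
 * (on CONFIG_LOCKDEP kernels) that the caller holds it for write.
 */
static void modify_protected_state(void)
{
	lockdep_assert_held_write(&example_lock);
	/* ... mutate state guarded by example_lock ... */
}

/*
 * The caller holds the lock across every step, so no step can observe
 * half-initialized state or race with another writer.
 */
static void update(void)
{
	write_lock(&example_lock);
	modify_protected_state();
	write_unlock(&example_lock);
}
```

lockdep_assert_held_write() compiles to nothing without CONFIG_LOCKDEP
and turns a violated locking contract into an immediate splat on debug
kernels, which is why it is a cheap replacement for internal locking.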
On Tue, Mar 02, 2021, Ben Gardon wrote:
> > @@ -3241,16 +3237,10 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
> >
> >         if (is_tdp_mmu_enabled(vcpu->kvm)) {
> >                 root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
> > -
> > -               if (!VALID_PAGE(root))
> > -                       return -ENOSPC;
> >                 vcpu->arch.mmu->root_hpa = root;
> >         } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
> >                 root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
> >                                       true);
> > -
> > -               if (!VALID_PAGE(root))
> > -                       return -ENOSPC;
>
> There's so much going on in mmu_alloc_root that removing this check
> makes me nervous, but I think it should be safe.

Just think of it as a variant of kvm_mmu_get_page(), then all your fears
will melt away. ;-)

> I checked through the function because I was worried it might yield
> somewhere in there, which could result in the page cache being emptied
> and the allocation failing, but I don't think mmu_alloc_root will yield.

Ugh, mmu_alloc_root() won't yield, but the get_zeroed_page() calls used to
allocate pae_root and lm_root on-demand in mmu_alloc_shadow_roots() will.

The two options are (a) allocate the fake roots before taking the lock, and
(b) allocate them from vcpu->arch.mmu_shadow_page_cache.  I probably prefer
(a).  (b) would slide directly into the existing code, but would require
bumping the min number of objects for mmu_shadow_page_cache in
mmu_topup_memory_caches().  I'd prefer not to have yet more code that has
to deal with this insanity.
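A rough sketch of what option (a) could look like. The helper name
mmu_alloc_special_roots() and its trigger condition are invented for
illustration; only get_zeroed_page(), the pae_root/lm_root fields, and
the "allocate before write_lock()" placement come from the discussion
above:

```c
/* Hypothetical helper; not the actual follow-up patch. */
static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;

	/*
	 * Assumed condition: only shadow-paging configurations consume
	 * pae_root/lm_root; direct MMUs never touch them.
	 */
	if (mmu->direct_map || mmu->shadow_root_level < PT32E_ROOT_LEVEL)
		return 0;

	/* May sleep: legal here, illegal once mmu_lock is held. */
	if (!mmu->pae_root) {
		mmu->pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
		if (!mmu->pae_root)
			return -ENOMEM;
	}

	if (!mmu->lm_root) {
		mmu->lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
		if (!mmu->lm_root)
			return -ENOMEM;	/* pae_root is kept for the next attempt */
	}

	return 0;
}
```

kvm_mmu_load() would call such a helper after mmu_topup_memory_caches()
and before write_lock(), keeping every allocation that can sleep or fail
outside mmu_lock.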