
[05/14] KVM: x86/mmu: Consult max mapping level when zapping collapsible SPTEs

Message ID 20210213005015.1651772-6-seanjc@google.com
State New, archived
Series KVM: x86/mmu: Dirty logging fixes and improvements

Commit Message

Sean Christopherson Feb. 13, 2021, 12:50 a.m. UTC
When zapping SPTEs in order to rebuild them as huge pages, use the new
helper that computes the max mapping level to detect whether a SPTE
should be zapped.  Doing so avoids zapping SPTEs that can't possibly be
rebuilt as huge pages, e.g. due to hardware constraints, memslot
alignment, etc.

This also avoids zapping SPTEs that are still large, e.g. if migration
was canceled before write-protected huge pages were shattered to enable
dirty logging.  Note, such pages are still write-protected at this time,
i.e. a page fault VM-Exit will still occur.  This will hopefully be
addressed in a future patch.

Sadly, the TDP MMU loses its const on the memslot, but that's a
pervasive problem that's been around for quite some time.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu.c     | 11 ++++++-----
 arch/x86/kvm/mmu/tdp_mmu.c | 13 +++++++------
 arch/x86/kvm/mmu/tdp_mmu.h |  2 +-
 3 files changed, 14 insertions(+), 12 deletions(-)
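
For context, the per-slot restrictions mentioned above (memslot alignment,
write-protection, etc.) are tracked in KVM's lpage_info arrays.  The sketch
below shows how a max-mapping-level query consults them; it is a simplified
illustration, not the exact helper introduced earlier in this series, and the
additional cap based on how the host maps the backing pfn is only noted in a
comment.

	/*
	 * Simplified sketch, not the helper added earlier in the series:
	 * walk down from the largest candidate level and stop at the first
	 * level the slot does not disallow.
	 */
	static int max_mapping_level_sketch(struct kvm_memory_slot *slot,
					    gfn_t gfn, int max_level)
	{
		struct kvm_lpage_info *linfo;

		for ( ; max_level > PG_LEVEL_4K; max_level--) {
			linfo = lpage_info_slot(gfn, slot, max_level);
			if (!linfo->disallow_lpage)
				break;
		}

		/*
		 * The real helper additionally caps the result by the level
		 * at which the host maps the backing pfn (hardware and
		 * hugepage constraints); that step is omitted here.
		 */
		return max_level;
	}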

Comments

Paolo Bonzini Feb. 18, 2021, 12:43 p.m. UTC | #1
On 13/02/21 01:50, Sean Christopherson wrote:
> 
>  		pfn = spte_to_pfn(iter.old_spte);
>  		if (kvm_is_reserved_pfn(pfn) ||
> -		    (!PageTransCompoundMap(pfn_to_page(pfn)) &&
> -		     !kvm_is_zone_device_pfn(pfn)))
> +		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
> +							    pfn, PG_LEVEL_NUM))
>  			continue;
>  


This changes the test to PageCompound.  Is it worth moving the change to 
patch 1?

Paolo
Sean Christopherson Feb. 18, 2021, 4:23 p.m. UTC | #2
On Thu, Feb 18, 2021, Paolo Bonzini wrote:
> On 13/02/21 01:50, Sean Christopherson wrote:
> > 
> >  		pfn = spte_to_pfn(iter.old_spte);
> >  		if (kvm_is_reserved_pfn(pfn) ||
> > -		    (!PageTransCompoundMap(pfn_to_page(pfn)) &&
> > -		     !kvm_is_zone_device_pfn(pfn)))
> > +		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
> > +							    pfn, PG_LEVEL_NUM))
> >  			continue;
> 
> 
> This changes the test to PageCompound.  Is it worth moving the change to
> patch 1?

Yes?  I originally did that in a separate patch, then changed my mind.

If PageTransCompoundMap() also detects HugeTLB pages, then it is the "better"
option as it checks that the page is actually mapped huge.  I dropped the change
because PageTransCompound() is just a wrapper around PageCompound(), and so I
assumed PageTransCompoundMap() would detect HugeTLB pages, too.  I'm not so sure
about that after rereading the code, yet again.
Mike Kravetz Feb. 18, 2021, 10:30 p.m. UTC | #3
On 2/18/21 8:23 AM, Sean Christopherson wrote:
> On Thu, Feb 18, 2021, Paolo Bonzini wrote:
>> On 13/02/21 01:50, Sean Christopherson wrote:
>>>
>>>  		pfn = spte_to_pfn(iter.old_spte);
>>>  		if (kvm_is_reserved_pfn(pfn) ||
>>> -		    (!PageTransCompoundMap(pfn_to_page(pfn)) &&
>>> -		     !kvm_is_zone_device_pfn(pfn)))
>>> +		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
>>> +							    pfn, PG_LEVEL_NUM))
>>>  			continue;
>>
>>
>> This changes the test to PageCompound.  Is it worth moving the change to
>> patch 1?
> 
> Yes?  I originally did that in a separate patch, then changed my mind.
> 
> If PageTransCompoundMap() also detects HugeTLB pages, then it is the "better"
> option as it checks that the page is actually mapped huge.  I dropped the change
> because PageTransCompound() is just a wrapper around PageCompound(), and so I
> assumed PageTransCompoundMap() would detect HugeTLB pages, too.  I'm not so sure
> about that after rereading the code, yet again.

I have not followed this thread, but HugeTLB hit my mail filter and I can
help with this question.

No, PageTransCompoundMap() will not detect HugeTLB.  hugetlb pages do not
use the compound_mapcount_ptr field.  So, that final check/return in
PageTransCompoundMap() will always be false.
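
For reference, a rough paraphrase of PageTransCompoundMap() as it existed at
the time (a sketch, not a verbatim copy of include/linux/page-flags.h):

	static inline int PageTransCompoundMap_sketch(struct page *page)
	{
		struct page *head = compound_head(page);

		/* PageTransCompound() boils down to PageCompound(). */
		if (!PageTransCompound(page))
			return 0;

		/* A THP whose PMD mapping has been split is not mapped huge. */
		if (PageDoubleMap(head))
			return 0;

		/*
		 * The final check Mike refers to: hugetlb pages do not use
		 * the compound mapcount, so this never reports a huge
		 * mapping for them.
		 */
		return atomic_read(compound_mapcount_ptr(head)) > 0;
	}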
Sean Christopherson Feb. 19, 2021, 1:31 a.m. UTC | #4
On Thu, Feb 18, 2021, Mike Kravetz wrote:
> On 2/18/21 8:23 AM, Sean Christopherson wrote:
> > On Thu, Feb 18, 2021, Paolo Bonzini wrote:
> >> On 13/02/21 01:50, Sean Christopherson wrote:
> >>>
> >>>  		pfn = spte_to_pfn(iter.old_spte);
> >>>  		if (kvm_is_reserved_pfn(pfn) ||
> >>> -		    (!PageTransCompoundMap(pfn_to_page(pfn)) &&
> >>> -		     !kvm_is_zone_device_pfn(pfn)))
> >>> +		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
> >>> +							    pfn, PG_LEVEL_NUM))
> >>>  			continue;
> >>
> >>
> >> This changes the test to PageCompound.  Is it worth moving the change to
> >> patch 1?
> > 
> > Yes?  I originally did that in a separate patch, then changed my mind.
> > 
> > If PageTransCompoundMap() also detects HugeTLB pages, then it is the "better"
> > option as it checks that the page is actually mapped huge.  I dropped the change
> > because PageTransCompound() is just a wrapper around PageCompound(), and so I
> > assumed PageTransCompoundMap() would detect HugeTLB pages, too.  I'm not so sure
> > about that after rereading the code, yet again.
> 
> I have not followed this thread, but HugeTLB hit my mail filter and I can
> help with this question.
> 
> No, PageTransCompoundMap() will not detect HugeTLB.  hugetlb pages do not
> use the compound_mapcount_ptr field.  So, that final check/return in
> PageTransCompoundMap() will always be false.

Thanks Mike!

Paolo, I agree it makes sense to switch to PageCompound in the earlier patch, in
case this one needs to be reverted.
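
Concretely, folding the change into the earlier patch would leave the TDP MMU
check looking something like the following until this patch converts it to
kvm_mmu_max_mapping_level() (a sketch of the agreed direction, not a hunk from
a respun series):

		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    (!PageCompound(pfn_to_page(pfn)) &&
		     !kvm_is_zone_device_pfn(pfn)))
			continue;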

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index fb719e7a0cbb..d5849a0e3de1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5553,8 +5553,8 @@  static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 		 * mapping if the indirect sp has level = 1.
 		 */
 		if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
-		    (kvm_is_zone_device_pfn(pfn) ||
-		     PageCompound(pfn_to_page(pfn)))) {
+		    sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
+							       pfn, PG_LEVEL_NUM)) {
 			pte_list_remove(rmap_head, sptep);
 
 			if (kvm_available_flush_tlb_with_range())
@@ -5574,12 +5574,13 @@  void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 				   const struct kvm_memory_slot *memslot)
 {
 	/* FIXME: const-ify all uses of struct kvm_memory_slot.  */
+	struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot;
+
 	write_lock(&kvm->mmu_lock);
-	slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
-			 kvm_mmu_zap_collapsible_spte, true);
+	slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
 
 	if (is_tdp_mmu_enabled(kvm))
-		kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot);
+		kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
 	write_unlock(&kvm->mmu_lock);
 }
 
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 3cc332ed099d..f8fa1f64e10d 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1328,8 +1328,10 @@  bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
  */
 static void zap_collapsible_spte_range(struct kvm *kvm,
 				       struct kvm_mmu_page *root,
-				       gfn_t start, gfn_t end)
+				       struct kvm_memory_slot *slot)
 {
+	gfn_t start = slot->base_gfn;
+	gfn_t end = start + slot->npages;
 	struct tdp_iter iter;
 	kvm_pfn_t pfn;
 	bool spte_set = false;
@@ -1348,8 +1350,8 @@  static void zap_collapsible_spte_range(struct kvm *kvm,
 
 		pfn = spte_to_pfn(iter.old_spte);
 		if (kvm_is_reserved_pfn(pfn) ||
-		    (!PageTransCompoundMap(pfn_to_page(pfn)) &&
-		     !kvm_is_zone_device_pfn(pfn)))
+		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
+							    pfn, PG_LEVEL_NUM))
 			continue;
 
 		tdp_mmu_set_spte(kvm, &iter, 0);
@@ -1367,7 +1369,7 @@  static void zap_collapsible_spte_range(struct kvm *kvm,
  * be replaced by large mappings, for GFNs within the slot.
  */
 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-				       const struct kvm_memory_slot *slot)
+				       struct kvm_memory_slot *slot)
 {
 	struct kvm_mmu_page *root;
 	int root_as_id;
@@ -1377,8 +1379,7 @@  void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
 		if (root_as_id != slot->as_id)
 			continue;
 
-		zap_collapsible_spte_range(kvm, root, slot->base_gfn,
-					   slot->base_gfn + slot->npages);
+		zap_collapsible_spte_range(kvm, root, slot);
 	}
 }
 
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index b4b65e3699b3..d31c5ed81a18 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -35,7 +35,7 @@  void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 				       bool wrprot);
 bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot);
 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-				       const struct kvm_memory_slot *slot);
+				       struct kvm_memory_slot *slot);
 
 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
 				   struct kvm_memory_slot *slot, gfn_t gfn);