[v3,1/8] KVM: x86: Cache total page count to avoid traversing the memslot array

Message ID eb1c881ce814705c83813f02a1a13ced96f1b1d1.1621191551.git.maciej.szmigiero@oracle.com (mailing list archive)
State New, archived
Series KVM: Scalable memslots implementation

Commit Message

Maciej S. Szmigiero May 16, 2021, 9:44 p.m. UTC
From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>

There is no point in recalculating from scratch the total number of pages
in all memslots each time a memslot is created or deleted.

Just cache the value and update it accordingly on each such operation so
the code doesn't need to traverse the whole memslot array each time.

Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/mmu/mmu.c          | 24 ------------------------
 arch/x86/kvm/x86.c              | 18 +++++++++++++++---
 3 files changed, 16 insertions(+), 28 deletions(-)

Comments

Sean Christopherson May 19, 2021, 9 p.m. UTC | #1
On Sun, May 16, 2021, Maciej S. Szmigiero wrote:
> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
> 
> There is no point in recalculating from scratch the total number of pages
> in all memslots each time a memslot is created or deleted.
> 
> Just cache the value and update it accordingly on each such operation so
> the code doesn't need to traverse the whole memslot array each time.
> 
> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
> ---
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 5bd550eaf683..8c7738b75393 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -11112,9 +11112,21 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
>  				const struct kvm_memory_slot *new,
>  				enum kvm_mr_change change)
>  {
> -	if (!kvm->arch.n_requested_mmu_pages)
> -		kvm_mmu_change_mmu_pages(kvm,
> -				kvm_mmu_calculate_default_mmu_pages(kvm));
> +	if (change == KVM_MR_CREATE)
> +		kvm->arch.n_memslots_pages += new->npages;
> +	else if (change == KVM_MR_DELETE) {
> +		WARN_ON(kvm->arch.n_memslots_pages < old->npages);

Heh, so I think this WARN can be triggered at will by userspace on 32-bit KVM by
causing the running count to wrap.  KVM artificially caps the size of a single
memslot at ((1UL << 31) - 1), but userspace could create multiple gigantic slots
to overflow arch.n_memslots_pages.

I _think_ changing it to a u64 would fix the problem since KVM forbids overlapping
memslots in the GPA space.
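
For illustration, a minimal sketch of this suggestion applied to the field the
patch adds to struct kvm_arch (the wider type is only the suggestion here, not
what the posted patch does):

	struct kvm_arch {
		/*
		 * Cached total of npages across all memslots; u64 so that
		 * several maximum-size slots cannot wrap the count on
		 * 32-bit KVM, where unsigned long is only 32 bits wide.
		 */
		u64 n_memslots_pages;
		unsigned long n_used_mmu_pages;
		unsigned long n_requested_mmu_pages;
		unsigned long n_max_mmu_pages;
		/* ... */
	};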

Also, what about moving the check-and-WARN to prepare_memory_region() so that
KVM can error out if the check fails?  Doesn't really matter, but an explicit
error for userspace is preferable to underflowing the number of pages and getting
weird MMU errors/behavior down the line.
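
For illustration only, a minimal sketch of the error-out idea (the helper name
and error code are made up here, and the plumbing of the old slot into the
prepare path is not shown):

	/*
	 * Hypothetical helper: refuse the deletion up front instead of
	 * WARNing and letting the cached count underflow.
	 */
	static int kvm_check_memslot_delete(struct kvm *kvm,
					    const struct kvm_memory_slot *old)
	{
		if (WARN_ON(kvm->arch.n_memslots_pages < old->npages))
			return -EINVAL;

		return 0;
	}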

> +		kvm->arch.n_memslots_pages -= old->npages;
> +	}
> +
> +	if (!kvm->arch.n_requested_mmu_pages) {

If we're going to bother caching the number of pages then we should also skip
the update when the number of pages isn't changing, e.g.

	if (change == KVM_MR_CREATE || change == KVM_MR_DELETE) {
		if (change == KVM_MR_CREATE)
			kvm->arch.n_memslots_pages += new->npages;
		else
			kvm->arch.n_memslots_pages -= old->npages;

		if (!kvm->arch.n_requested_mmu_pages) {
			unsigned long nr_mmu_pages;

			nr_mmu_pages = kvm->arch.n_memslots_pages *
				       KVM_PERMILLE_MMU_PAGES / 1000;
			nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
			kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
		}
	}

> +		unsigned long nr_mmu_pages;
> +
> +		nr_mmu_pages = kvm->arch.n_memslots_pages *
> +			       KVM_PERMILLE_MMU_PAGES / 1000;
> +		nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
> +		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
> +	}
>  
>  	/*
>  	 * FIXME: const-ify all uses of struct kvm_memory_slot.
Maciej S. Szmigiero May 21, 2021, 7:03 a.m. UTC | #2
On 19.05.2021 23:00, Sean Christopherson wrote:
> On Sun, May 16, 2021, Maciej S. Szmigiero wrote:
>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>
>> There is no point in recalculating from scratch the total number of pages
>> in all memslots each time a memslot is created or deleted.
>>
>> Just cache the value and update it accordingly on each such operation so
>> the code doesn't need to traverse the whole memslot array each time.
>>
>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>> ---
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 5bd550eaf683..8c7738b75393 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -11112,9 +11112,21 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
>>   				const struct kvm_memory_slot *new,
>>   				enum kvm_mr_change change)
>>   {
>> -	if (!kvm->arch.n_requested_mmu_pages)
>> -		kvm_mmu_change_mmu_pages(kvm,
>> -				kvm_mmu_calculate_default_mmu_pages(kvm));
>> +	if (change == KVM_MR_CREATE)
>> +		kvm->arch.n_memslots_pages += new->npages;
>> +	else if (change == KVM_MR_DELETE) {
>> +		WARN_ON(kvm->arch.n_memslots_pages < old->npages);
> 
> Heh, so I think this WARN can be triggered at will by userspace on 32-bit KVM by
> causing the running count to wrap.  KVM artificially caps the size of a single
> memslot at ((1UL << 31) - 1), but userspace could create multiple gigantic slots
> to overflow arch.n_memslots_pages.
> 
> I _think_ changing it to a u64 would fix the problem since KVM forbids overlapping
> memslots in the GPA space.

You are right, n_memslots_pages needs to be u64 so it does not overflow
on 32-bit KVM.

The memslot count is limited to 32k in each of the 2 address spaces, so in
the worst case the variable needs to hold a 15-bit + 1-bit + 31-bit = 47-bit number.
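
Written out with the limits quoted above (a sketch of the arithmetic, not
code from the tree):

	2 address spaces * 32768 slots * ((1UL << 31) - 1) pages/slot
	  = 2 * 2^15 * (2^31 - 1) < 2^47 pages

so the total fits comfortably in a u64 but can wrap a 32-bit unsigned long.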

> Also, what about moving the check-and-WARN to prepare_memory_region() so that
> KVM can error out if the check fails?  Doesn't really matter, but an explicit
> error for userspace is preferable to underflowing the number of pages and getting
> weird MMU errors/behavior down the line.

In principle this seems like a possibility; however, it is a more
regression-risky option, in case something has (perhaps unintentionally)
relied on the kvm_mmu_zap_oldest_mmu_pages() call from
kvm_mmu_change_mmu_pages() only being made in the memslot commit
function.

>> +		kvm->arch.n_memslots_pages -= old->npages;
>> +	}
>> +
>> +	if (!kvm->arch.n_requested_mmu_pages) {
> 
> If we're going to bother caching the number of pages then we should also skip
> the update when the number of pages isn't changing, e.g.
> 
> 	if (change == KVM_MR_CREATE || change == KVM_MR_DELETE) {
> 		if (change == KVM_MR_CREATE)
> 			kvm->arch.n_memslots_pages += new->npages;
> 		else
> 			kvm->arch.n_memslots_pages -= old->npages;
> 
> 		if (!kvm->arch.n_requested_mmu_pages) {
> 			unsigned long nr_mmu_pages;
> 
> 			nr_mmu_pages = kvm->arch.n_memslots_pages *
> 				       KVM_PERMILLE_MMU_PAGES / 1000;
> 			nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
> 			kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
> 		}
> 	}

The old code did it that way (unconditionally) and, as in the case above,
I didn't want to risk a regression.
If we are going to change this behavior then I think it should happen in a
separate patch.

Thanks,
Maciej

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 55efbacfc244..e594f54a3875 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -975,6 +975,7 @@  struct kvm_x86_msr_filter {
 #define APICV_INHIBIT_REASON_X2APIC	5
 
 struct kvm_arch {
+	unsigned long n_memslots_pages;
 	unsigned long n_used_mmu_pages;
 	unsigned long n_requested_mmu_pages;
 	unsigned long n_max_mmu_pages;
@@ -1472,7 +1473,6 @@  void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot);
 void kvm_mmu_zap_all(struct kvm *kvm);
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
-unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
 
 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0144c40d09c7..1e46a0ce034b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5890,30 +5890,6 @@  int kvm_mmu_module_init(void)
 	return ret;
 }
 
-/*
- * Calculate mmu pages needed for kvm.
- */
-unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
-{
-	unsigned long nr_mmu_pages;
-	unsigned long nr_pages = 0;
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
-	int i;
-
-	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
-		slots = __kvm_memslots(kvm, i);
-
-		kvm_for_each_memslot(memslot, slots)
-			nr_pages += memslot->npages;
-	}
-
-	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
-	nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
-
-	return nr_mmu_pages;
-}
-
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 {
 	kvm_mmu_unload(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5bd550eaf683..8c7738b75393 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -11112,9 +11112,21 @@  void kvm_arch_commit_memory_region(struct kvm *kvm,
 				const struct kvm_memory_slot *new,
 				enum kvm_mr_change change)
 {
-	if (!kvm->arch.n_requested_mmu_pages)
-		kvm_mmu_change_mmu_pages(kvm,
-				kvm_mmu_calculate_default_mmu_pages(kvm));
+	if (change == KVM_MR_CREATE)
+		kvm->arch.n_memslots_pages += new->npages;
+	else if (change == KVM_MR_DELETE) {
+		WARN_ON(kvm->arch.n_memslots_pages < old->npages);
+		kvm->arch.n_memslots_pages -= old->npages;
+	}
+
+	if (!kvm->arch.n_requested_mmu_pages) {
+		unsigned long nr_mmu_pages;
+
+		nr_mmu_pages = kvm->arch.n_memslots_pages *
+			       KVM_PERMILLE_MMU_PAGES / 1000;
+		nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
+		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+	}
 
 	/*
 	 * FIXME: const-ify all uses of struct kvm_memory_slot.