
[bug report] KVM: arm64: vgic-its: Performance degradation on GICv3 LPI injection

Message ID bb4aeb68-c77b-4071-bca5-d129a1c5f6f3@huawei.com (mailing list archive)
State New
Series [bug report] KVM: arm64: vgic-its: Performance degradation on GICv3 LPI injection

Commit Message

Zhiqiang Ni Oct. 24, 2024, 5:06 a.m. UTC
Hi all,

I found a performance degradation in GICv3 LPI injection after commit
ad362fe07fecf0aba839ff2cc59a3617bd42c33f ("KVM: arm64: vgic-its:
Avoid potential UAF in LPI translation cache").

In my test case, the VM is configured with 60 vCPUs and 120G of RAM
plus a 32-queue NIC, and the kernel version is 5.10. After this patch,
the number of new TCP connections per second dropped from 150,000 to
50,000, while CPU %sys rose from 15% to 85%.

From ftrace, I found that vgic_put_irq() takes 13.320 us, which may
be the reason for the performance degradation.

The call stack looks like this:
    kvm_arch_set_irq_inatomic()
      vgic_has_its();
      vgic_its_inject_cached_translation()
        vgic_its_check_cache()
        vgic_queue_irq_unlock()
        vgic_put_irq()
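
Expanded a bit, the fast path after that commit roughly looks like the
following (a simplified sketch reconstructed from the hunks below, not
a verbatim copy of the upstream code). The point is that every cached
MSI injection takes dist->lpi_list_lock to grab a reference on the
cached vgic_irq and then drops that reference in vgic_put_irq() right
after queueing it:

    int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi)
    {
            struct vgic_irq *irq;
            unsigned long flags;
            phys_addr_t db;

            db = (u64)msi->address_hi << 32 | msi->address_lo;

            /* Takes dist->lpi_list_lock and vgic_get_irq_kref() on a cache hit. */
            irq = vgic_its_check_cache(kvm, db, msi->devid, msi->data);
            if (!irq)
                    return -EWOULDBLOCK;

            raw_spin_lock_irqsave(&irq->irq_lock, flags);
            irq->pending_latch = true;
            vgic_queue_irq_unlock(kvm, irq, flags);

            /* Drops the reference taken above; this is the call that ftrace shows taking ~13 us. */
            vgic_put_irq(kvm, irq);

            return 0;
    }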

So I tried modifying the code as below, and the performance
degradation is gone. Is this a good idea?

From 2bd98f8a2cc02a17423ced36e07f7bb3c7e044af Mon Sep 17 00:00:00 2001
From: Zhiqiang Ni <nizhiqiang1@huawei.com>
Date: Thu, 24 Oct 2024 12:29:28 +0800
Subject: [PATCH] KVM: arm64: vgic-its: Avoid vgic_put_irq in LPI translation
 cache

---
 arch/arm64/kvm/vgic/vgic-its.c | 32 ++++++++++----------------------
 1 file changed, 10 insertions(+), 22 deletions(-)

Comments

Zhiqiang Ni Oct. 24, 2024, 6:08 a.m. UTC | #1
On 2024/10/24 13:06, Zhiqiang Ni wrote:
> Hi all,
> 
> I found a performance degradation in GICv3 LPI injection after commit
> ad362fe07fecf0aba839ff2cc59a3617bd42c33f ("KVM: arm64: vgic-its:
> Avoid potential UAF in LPI translation cache").
> 
> In my test case, the VM is configured with 60 vCPUs and 120G of RAM
> plus a 32-queue NIC, and the kernel version is 5.10. After this patch,
> the number of new TCP connections per second dropped from 150,000 to
> 50,000, while CPU %sys rose from 15% to 85%.
> 
> From ftrace, I found that vgic_put_irq() takes 13.320 us, which may
> be the reason for the performance degradation.
> 
> The call stack looks like this:
>     kvm_arch_set_irq_inatomic()
>       vgic_has_its();
>       vgic_its_inject_cached_translation()
>         vgic_its_check_cache()
>         vgic_queue_irq_unlock()
>         vgic_put_irq()
> 
> So I tried modifying the code as below, and the performance
> degradation is gone. Is this a good idea?
> 

I am sorry. This is not the correct patch.

> From 2bd98f8a2cc02a17423ced36e07f7bb3c7e044af Mon Sep 17 00:00:00 2001
> From: Zhiqiang Ni <nizhiqiang1@huawei.com>
> Date: Thu, 24 Oct 2024 12:29:28 +0800
> Subject: [PATCH] KVM: arm64: vgic-its: Avoid vgic_put_irq in LPI translation
>  cache
> 
> ---
>  arch/arm64/kvm/vgic/vgic-its.c | 32 ++++++++++----------------------
>  1 file changed, 10 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
> index 93c0365cd..0efabe555 100644
> --- a/arch/arm64/kvm/vgic/vgic-its.c
> +++ b/arch/arm64/kvm/vgic/vgic-its.c
> @@ -579,24 +579,6 @@ static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
>  	return NULL;
>  }
> 
> -static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db,
> -					     u32 devid, u32 eventid)
> -{
> -	struct vgic_dist *dist = &kvm->arch.vgic;
> -	struct vgic_irq *irq;
> -	unsigned long flags;
> -
> -	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
> -
> -	irq = __vgic_its_check_cache(dist, db, devid, eventid);
> -	if (irq)
> -		vgic_get_irq_kref(irq);
> -
> -	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
> -
> -	return irq;
> -}
> -
>  static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
>  				       u32 devid, u32 eventid,
>  				       struct vgic_irq *irq)
> @@ -759,18 +741,24 @@ static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
>  int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi)
>  {
>  	struct vgic_irq *irq;
> -	unsigned long flags;
> +	unsigned long flags, flags2;
>  	phys_addr_t db;
> +	struct vgic_dist *dist = &kvm->arch.vgic;
> 
>  	db = (u64)msi->address_hi << 32 | msi->address_lo;
> -	irq = vgic_its_check_cache(kvm, db, msi->devid, msi->data);
> -	if (!irq)
> +	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags2);
> +	irq = __vgic_its_check_cache(dist, db, msi->devid, msi->data);
> +
> +	if (!irq) {
> +		raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags2);
>  		return -EWOULDBLOCK;
> +	}
> 
>  	raw_spin_lock_irqsave(&irq->irq_lock, flags);
>  	irq->pending_latch = true;
>  	vgic_queue_irq_unlock(kvm, irq, flags);
> -	vgic_put_irq(kvm, irq);
> +
> +	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags2);
> 
>  	return 0;
>  }
Marc Zyngier Oct. 24, 2024, 8 a.m. UTC | #2
On Thu, 24 Oct 2024 06:06:58 +0100,
Zhiqiang Ni <nizhiqiang1@huawei.com> wrote:
> 
> Hi all,
> 
> I found a performance degradation in GICv3 LPI injection after commit
> ad362fe07fecf0aba839ff2cc59a3617bd42c33f ("KVM: arm64: vgic-its:
> Avoid potential UAF in LPI translation cache").
>
> In my test case, the VM is configured with 60 vCPUs and 120G of RAM
> plus a 32-queue NIC, and the kernel version is 5.10. After this patch,
> the number of new TCP connections per second dropped from 150,000 to
> 50,000, while CPU %sys rose from 15% to 85%.

What is the VM running? How is the traffic generated? Without a
reproducer, I struggle to see how we are going to analyse this issue.

We can't go back to the previous situation anyway, as it has been
shown that what we had before was simply unsafe (the commit message
explains why).

> From ftrace, I found that vgic_put_irq() takes 13.320 us, which may
> be the reason for the performance degradation.
>
> The call stack looks like this:
>     kvm_arch_set_irq_inatomic()
>       vgic_has_its();
>       vgic_its_inject_cached_translation()
>         vgic_its_check_cache()
>         vgic_queue_irq_unlock()
>         vgic_put_irq()

Are you suggesting that it is the combination of vgic_get_irq_kref() +
vgic_put_irq() that leads to excessive latency? Both are essentially
atomic operations, which should be pretty cheap on a modern CPU
(anything with FEAT_LSE).

The patch below indicates that you are looking at a rather old kernel
(6.8). What is the result on a more recent kernel (from 6.10)?

Thanks,

	M.

Patch

diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 93c0365cd..0efabe555 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -579,24 +579,6 @@  static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
 	return NULL;
 }

-static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db,
-					     u32 devid, u32 eventid)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct vgic_irq *irq;
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-
-	irq = __vgic_its_check_cache(dist, db, devid, eventid);
-	if (irq)
-		vgic_get_irq_kref(irq);
-
-	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-
-	return irq;
-}
-
 static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
 				       u32 devid, u32 eventid,
 				       struct vgic_irq *irq)
@@ -759,18 +741,24 @@  static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
 int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi)
 {
 	struct vgic_irq *irq;
-	unsigned long flags;
+	unsigned long flags, flags2;
 	phys_addr_t db;
+	struct vgic_dist *dist = &kvm->arch.vgic;

 	db = (u64)msi->address_hi << 32 | msi->address_lo;
-	irq = vgic_its_check_cache(kvm, db, msi->devid, msi->data);
-	if (!irq)
+	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags2);
+	irq = __vgic_its_check_cache(dist, db, msi->devid, msi->data);
+
+	if (!irq) {
+		raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags2);
 		return -EWOULDBLOCK;
+	}

 	raw_spin_lock_irqsave(&irq->irq_lock, flags);
 	irq->pending_latch = true;
 	vgic_queue_irq_unlock(kvm, irq, flags);
-	vgic_put_irq(kvm, irq);
+
+	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags2);

 	return 0;
 }