
[v2,7/9] KVM: arm/arm64: vgic-its: Cache successful MSI->LPI translation

Message ID 20190611170336.121706-8-marc.zyngier@arm.com (mailing list archive)
State New, archived
Series KVM: arm/arm64: vgic: ITS translation cache

Commit Message

Marc Zyngier June 11, 2019, 5:03 p.m. UTC
On a successful translation, preserve the parameters in the LPI
translation cache. Each new translation reuses the last slot
in the list, naturally evicting the least recently used entry.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic/vgic-its.c | 86 ++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)
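
The scheme, in short: a fixed-size list with a move-to-front discipline,
where a lookup hit promotes the entry to the head and a new translation
recycles the tail entry, so list order doubles as recency order. Below is
a minimal user-space sketch of the same idiom (illustrative types and
names only, not the patch's own code):

#include <stdio.h>

#define CACHE_SIZE 4

struct cache_entry {
	unsigned int devid, eventid;
	int valid;
	struct cache_entry *prev, *next;
};

static struct cache_entry slots[CACHE_SIZE];
static struct cache_entry head = { .prev = &head, .next = &head }; /* sentinel */

static void list_del_entry(struct cache_entry *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
}

static void list_add_head(struct cache_entry *e)
{
	e->next = head.next;
	e->prev = &head;
	head.next->prev = e;
	head.next = e;
}

/* Hit: move the entry to the front (most recently used). */
static struct cache_entry *cache_lookup(unsigned int devid, unsigned int eventid)
{
	struct cache_entry *e;

	for (e = head.next; e != &head; e = e->next) {
		if (!e->valid)
			break;	/* nothing valid past this point */
		if (e->devid == devid && e->eventid == eventid) {
			list_del_entry(e);
			list_add_head(e);
			return e;
		}
	}
	return NULL;
}

/* Miss: recycle the tail entry (the LRU victim) and promote it. */
static void cache_insert(unsigned int devid, unsigned int eventid)
{
	struct cache_entry *e = head.prev;

	e->devid = devid;
	e->eventid = eventid;
	e->valid = 1;
	list_del_entry(e);
	list_add_head(e);
}

int main(void)
{
	int i;

	/* Pre-populate with invalid entries, as the patch series does
	 * at cache-init time. */
	for (i = 0; i < CACHE_SIZE; i++)
		list_add_head(&slots[i]);

	cache_insert(0, 1);
	cache_insert(0, 2);
	printf("hit: %d\n", cache_lookup(0, 1) != NULL);	/* prints 1 */
	printf("hit: %d\n", cache_lookup(9, 9) != NULL);	/* prints 0 */
	return 0;
}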

Comments

Zenghui Yu June 25, 2019, 11:50 a.m. UTC | #1
Hi Marc,

On 2019/6/12 1:03, Marc Zyngier wrote:
> On a successful translation, preserve the parameters in the LPI
> translation cache. Each translation is reusing the last slot
> in the list, naturally evincting the least recently used entry.
> 
> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> ---
>   virt/kvm/arm/vgic/vgic-its.c | 86 ++++++++++++++++++++++++++++++++++++
>   1 file changed, 86 insertions(+)
> 
> diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
> index 0aa0cbbc3af6..62932458476a 100644
> --- a/virt/kvm/arm/vgic/vgic-its.c
> +++ b/virt/kvm/arm/vgic/vgic-its.c
> @@ -546,6 +546,90 @@ static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
>   	return 0;
>   }
>   
> +static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
> +					       phys_addr_t db,
> +					       u32 devid, u32 eventid)
> +{
> +	struct vgic_translation_cache_entry *cte;
> +	struct vgic_irq *irq = NULL;
> +
> +	list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
> +		/*
> +		 * If we hit a NULL entry, there is nothing after this
> +		 * point.
> +		 */
> +		if (!cte->irq)
> +			break;
> +
> +		if (cte->db == db &&
> +		    cte->devid == devid &&
> +		    cte->eventid == eventid) {
> +			/*
> +			 * Move this entry to the head, as it is the
> +			 * most recently used.
> +			 */
> +			list_move(&cte->entry, &dist->lpi_translation_cache);

Only for performance reasons: if we hit at the "head" of the list, we
don't need to do a list_move().
In our tests, we found that a single list_move() takes nearly (sometimes
even more than) one microsecond, for some unknown reason...


Thanks,
zenghui

> +			irq = cte->irq;
> +			break;
> +		}
> +	}
> +
> +	return irq;
> +}
> +
> +static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
> +				       u32 devid, u32 eventid,
> +				       struct vgic_irq *irq)
> +{
> +	struct vgic_dist *dist = &kvm->arch.vgic;
> +	struct vgic_translation_cache_entry *cte;
> +	unsigned long flags;
> +	phys_addr_t db;
> +
> +	/* Do not cache a directly injected interrupt */
> +	if (irq->hw)
> +		return;
> +
> +	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
> +
> +	if (unlikely(list_empty(&dist->lpi_translation_cache)))
> +		goto out;
> +
> +	/*
> +	 * We could have raced with another CPU caching the same
> +	 * translation behind our back, so let's check it is not in
> +	 * already
> +	 */
> +	db = its->vgic_its_base + GITS_TRANSLATER;
> +	if (__vgic_its_check_cache(dist, db, devid, eventid))
> +		goto out;
> +
> +	/* Always reuse the last entry (LRU policy) */
> +	cte = list_last_entry(&dist->lpi_translation_cache,
> +			      typeof(*cte), entry);
> +
> +	/*
> +	 * Caching the translation implies having an extra reference
> +	 * to the interrupt, so drop the potential reference on what
> +	 * was in the cache, and increment it on the new interrupt.
> +	 */
> +	if (cte->irq)
> +		__vgic_put_lpi_locked(kvm, cte->irq);
> +
> +	vgic_get_irq_kref(irq);
> +
> +	cte->db		= db;
> +	cte->devid	= devid;
> +	cte->eventid	= eventid;
> +	cte->irq	= irq;
> +
> +	/* Move the new translation to the head of the list */
> +	list_move(&cte->entry, &dist->lpi_translation_cache);
> +
> +out:
> +	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
> +}
> +
>   void vgic_its_invalidate_cache(struct kvm *kvm)
>   {
>   	struct vgic_dist *dist = &kvm->arch.vgic;
> @@ -589,6 +673,8 @@ int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
>   	if (!vcpu->arch.vgic_cpu.lpis_enabled)
>   		return -EBUSY;
>   
> +	vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq);
> +
>   	*irq = ite->irq;
>   	return 0;
>   }
>
Marc Zyngier June 25, 2019, 12:31 p.m. UTC | #2
Hi Zenghui,

On 25/06/2019 12:50, Zenghui Yu wrote:
> Hi Marc,
> 
> On 2019/6/12 1:03, Marc Zyngier wrote:
>> On a successful translation, preserve the parameters in the LPI
>> translation cache. Each translation is reusing the last slot
>> in the list, naturally evincting the least recently used entry.
>>
>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
>> ---
>>   virt/kvm/arm/vgic/vgic-its.c | 86 ++++++++++++++++++++++++++++++++++++
>>   1 file changed, 86 insertions(+)
>>
>> diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
>> index 0aa0cbbc3af6..62932458476a 100644
>> --- a/virt/kvm/arm/vgic/vgic-its.c
>> +++ b/virt/kvm/arm/vgic/vgic-its.c
>> @@ -546,6 +546,90 @@ static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
>>   	return 0;
>>   }
>>   
>> +static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
>> +					       phys_addr_t db,
>> +					       u32 devid, u32 eventid)
>> +{
>> +	struct vgic_translation_cache_entry *cte;
>> +	struct vgic_irq *irq = NULL;
>> +
>> +	list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
>> +		/*
>> +		 * If we hit a NULL entry, there is nothing after this
>> +		 * point.
>> +		 */
>> +		if (!cte->irq)
>> +			break;
>> +
>> +		if (cte->db == db &&
>> +		    cte->devid == devid &&
>> +		    cte->eventid == eventid) {
>> +			/*
>> +			 * Move this entry to the head, as it is the
>> +			 * most recently used.
>> +			 */
>> +			list_move(&cte->entry, &dist->lpi_translation_cache);
> 
> Only for performance reasons: if we hit at the "head" of the list, we
> don't need to do a list_move().
> In our tests, we found that a single list_move() takes nearly (sometimes
> even more than) one microsecond, for some unknown reason...

Huh... That's odd.

Can you narrow down under which conditions this happens? I'm not sure if
checking for the list head would be more efficient, as you end up
fetching the head anyway. Can you try replacing this line with:

	if (!list_is_first(&cte->entry, &dist->lpi_translation_cache))
		list_move(&cte->entry, &dist->lpi_translation_cache);

and let me know whether it helps?

Thanks,

	M.
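
For context on why the guard matters: list_move() unconditionally rewrites
the neighbouring pointers, even when the entry is already first. A
user-space rendition of the helpers involved (paraphrased from
include/linux/list.h, not a verbatim copy; details vary by kernel
version):

struct list_head {
	struct list_head *next, *prev;
};

/* Unlink @entry: two stores, into its neighbours. */
static inline void __list_del_entry(struct list_head *entry)
{
	entry->next->prev = entry->prev;
	entry->prev->next = entry->next;
}

/* Link @entry right after @head: four stores. */
static inline void list_add(struct list_head *entry, struct list_head *head)
{
	head->next->prev = entry;
	entry->next = head->next;
	entry->prev = head;
	head->next = entry;
}

/*
 * Six stores in total, performed even when @list is already at the
 * head -- each one potentially dirtying a contended cache line.
 */
static inline void list_move(struct list_head *list, struct list_head *head)
{
	__list_del_entry(list);
	list_add(list, head);
}

/* The guard: a single load on the hit-at-the-head fast path. */
static inline int list_is_first(const struct list_head *list,
				const struct list_head *head)
{
	return list->prev == head;
}

That would explain a measurable saving when most hits land on the head
entry, as in the workload Zenghui describes below.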
Zenghui Yu June 25, 2019, 4 p.m. UTC | #3
Hi Marc,

On 2019/6/25 20:31, Marc Zyngier wrote:
> Hi Zenghui,
> 
> On 25/06/2019 12:50, Zenghui Yu wrote:
>> Hi Marc,
>>
>> On 2019/6/12 1:03, Marc Zyngier wrote:
>>> On a successful translation, preserve the parameters in the LPI
>>> translation cache. Each translation is reusing the last slot
>>> in the list, naturally evincting the least recently used entry.
>>>
>>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
>>> ---
>>>    virt/kvm/arm/vgic/vgic-its.c | 86 ++++++++++++++++++++++++++++++++++++
>>>    1 file changed, 86 insertions(+)
>>>
>>> diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
>>> index 0aa0cbbc3af6..62932458476a 100644
>>> --- a/virt/kvm/arm/vgic/vgic-its.c
>>> +++ b/virt/kvm/arm/vgic/vgic-its.c
>>> @@ -546,6 +546,90 @@ static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
>>>    	return 0;
>>>    }
>>>    
>>> +static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
>>> +					       phys_addr_t db,
>>> +					       u32 devid, u32 eventid)
>>> +{
>>> +	struct vgic_translation_cache_entry *cte;
>>> +	struct vgic_irq *irq = NULL;
>>> +
>>> +	list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
>>> +		/*
>>> +		 * If we hit a NULL entry, there is nothing after this
>>> +		 * point.
>>> +		 */
>>> +		if (!cte->irq)
>>> +			break;
>>> +
>>> +		if (cte->db == db &&
>>> +		    cte->devid == devid &&
>>> +		    cte->eventid == eventid) {
>>> +			/*
>>> +			 * Move this entry to the head, as it is the
>>> +			 * most recently used.
>>> +			 */
>>> +			list_move(&cte->entry, &dist->lpi_translation_cache);
>>
>> Only for performance reasons: if we hit at the "head" of the list, we
>> don't need to do a list_move().
>> In our tests, we found that a single list_move() takes nearly (sometimes
>> even more than) one microsecond, for some unknown reason...
> 
> Huh... That's odd.
> 
> Can you narrow down under which conditions this happens? I'm not sure if
> checking for the list head would be more efficient, as you end-up
> fetching the head anyway. Can you try replacing this line with:
> 
> 	if (!list_is_first(&cte->entry, &dist->lpi_translation_cache))
> 		list_move(&cte->entry, &dist->lpi_translation_cache);
> 
> and let me know whether it helps?

It helps. With this change, the overhead of list_move() is gone.

We run 16 4-vcpu VMs on the host, each with a vhost-user NIC, and run
"iperf" in pairs between them, so lookups are likely to hit at the head
of the cache list in our tests.
With this change, the sys% utilization of the vhostdpfwd threads
decreases by about 10%.  But I don't know exactly why (I haven't found
any clue in the implementation of list_move() yet...).


Thanks,
zenghui
Zenghui Yu June 26, 2019, 3:54 a.m. UTC | #4
On 2019/6/26 0:00, Zenghui Yu wrote:
> Hi Marc,
> 
> On 2019/6/25 20:31, Marc Zyngier wrote:
>> Hi Zenghui,
>>
>> On 25/06/2019 12:50, Zenghui Yu wrote:
>>> Hi Marc,
>>>
>>> On 2019/6/12 1:03, Marc Zyngier wrote:
>>>> On a successful translation, preserve the parameters in the LPI
>>>> translation cache. Each translation is reusing the last slot
>>>> in the list, naturally evincting the least recently used entry.
>>>>
>>>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
>>>> ---
>>>>    virt/kvm/arm/vgic/vgic-its.c | 86 ++++++++++++++++++++++++++++++++++++
>>>>    1 file changed, 86 insertions(+)
>>>>
>>>> diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
>>>> index 0aa0cbbc3af6..62932458476a 100644
>>>> --- a/virt/kvm/arm/vgic/vgic-its.c
>>>> +++ b/virt/kvm/arm/vgic/vgic-its.c
>>>> @@ -546,6 +546,90 @@ static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
>>>>        return 0;
>>>>    }
>>>> +static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
>>>> +                           phys_addr_t db,
>>>> +                           u32 devid, u32 eventid)
>>>> +{
>>>> +    struct vgic_translation_cache_entry *cte;
>>>> +    struct vgic_irq *irq = NULL;
>>>> +
>>>> +    list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
>>>> +        /*
>>>> +         * If we hit a NULL entry, there is nothing after this
>>>> +         * point.
>>>> +         */
>>>> +        if (!cte->irq)
>>>> +            break;
>>>> +
>>>> +        if (cte->db == db &&
>>>> +            cte->devid == devid &&
>>>> +            cte->eventid == eventid) {
>>>> +            /*
>>>> +             * Move this entry to the head, as it is the
>>>> +             * most recently used.
>>>> +             */
>>>> +            list_move(&cte->entry, &dist->lpi_translation_cache);
>>>
>>> Only for performance reasons: if we hit at the "head" of the list, we
>>> don't need to do a list_move().
>>> In our tests, we found that a single list_move() takes nearly (sometimes
>>> even more than) one microsecond, for some unknown reason...

s/one microsecond/500 nanoseconds/
(I got the value of CNTFRQ wrong, sorry.)

>>
>> Huh... That's odd.
>>
>> Can you narrow down under which conditions this happens? I'm not sure if
>> checking for the list head would be more efficient, as you end-up
>> fetching the head anyway. Can you try replacing this line with:
>>
>>     if (!list_is_first(&cte->entry, &dist->lpi_translation_cache))
>>         list_move(&cte->entry, &dist->lpi_translation_cache);
>>
>> and let me know whether it helps?
> 
> It helps. With this change, the overhead of list_move() is gone.
> 
> We run 16 4-vcpu VMs on the host, each with a vhost-user nic, and run
> "iperf" in pairs between them.  It's likely to hit at the head of the
> cache list in our tests.
> With this change, the sys% utilization of vhostdpfwd threads will
> decrease by about 10%.  But I don't know the reason exactly (I haven't
> found any clues in code yet, in implementation of list_move...).
> 
> 
> Thanks,
> zenghui
> 
>
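
An aside on the measurement correction above: latencies derived from the
arm64 generic counter scale with CNTFRQ_EL0, so a wrong frequency value
skews every reported number by a constant factor. A sketch of the
tick-to-time conversion (assumes EL0 access to the virtual counter is
enabled; names are illustrative):

#include <stdint.h>

static inline uint64_t read_cntvct(void)
{
	uint64_t ticks;

	asm volatile("mrs %0, cntvct_el0" : "=r" (ticks));
	return ticks;
}

static inline uint64_t read_cntfrq(void)
{
	uint64_t freq;

	asm volatile("mrs %0, cntfrq_el0" : "=r" (freq));
	return freq;
}

/*
 * Ticks -> nanoseconds. Purely for illustration: with a 100MHz counter,
 * misreading CNTFRQ as 50MHz would report ~1us for what is really
 * ~500ns -- the same factor-of-two slip corrected above.
 *
 * Usage: t0 = read_cntvct(); ...; ns = ticks_to_ns(read_cntvct() - t0);
 */
static inline uint64_t ticks_to_ns(uint64_t ticks)
{
	return ticks * 1000000000ULL / read_cntfrq();
}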
Eric Auger July 23, 2019, 3:21 p.m. UTC | #5
Hi Marc,

On 6/11/19 7:03 PM, Marc Zyngier wrote:
> On a successful translation, preserve the parameters in the LPI
> translation cache. Each translation is reusing the last slot
> in the list, naturally evincting the least recently used entry.
evicting
> 
> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> ---
>  virt/kvm/arm/vgic/vgic-its.c | 86 ++++++++++++++++++++++++++++++++++++
>  1 file changed, 86 insertions(+)
> 
> diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
> index 0aa0cbbc3af6..62932458476a 100644
> --- a/virt/kvm/arm/vgic/vgic-its.c
> +++ b/virt/kvm/arm/vgic/vgic-its.c
> @@ -546,6 +546,90 @@ static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
>  	return 0;
>  }
>  
> +static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
> +					       phys_addr_t db,
> +					       u32 devid, u32 eventid)
> +{
> +	struct vgic_translation_cache_entry *cte;
> +	struct vgic_irq *irq = NULL;
> +
> +	list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
> +		/*
> +		 * If we hit a NULL entry, there is nothing after this
> +		 * point.
> +		 */
> +		if (!cte->irq)
> +			break;
> +
> +		if (cte->db == db &&
> +		    cte->devid == devid &&
> +		    cte->eventid == eventid) {
> +			/*
> +			 * Move this entry to the head, as it is the
> +			 * most recently used.
> +			 */
> +			list_move(&cte->entry, &dist->lpi_translation_cache);
> +			irq = cte->irq;
> +			break;
> +		}
> +	}
> +
> +	return irq;
> +}
> +
> +static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
> +				       u32 devid, u32 eventid,
> +				       struct vgic_irq *irq)
> +{
> +	struct vgic_dist *dist = &kvm->arch.vgic;
> +	struct vgic_translation_cache_entry *cte;
> +	unsigned long flags;
> +	phys_addr_t db;
> +
> +	/* Do not cache a directly injected interrupt */
> +	if (irq->hw)
> +		return;
> +
> +	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
> +
> +	if (unlikely(list_empty(&dist->lpi_translation_cache)))
> +		goto out;
> +
> +	/*
> +	 * We could have raced with another CPU caching the same
> +	 * translation behind our back, so let's check it is not in
> +	 * already
> +	 */
> +	db = its->vgic_its_base + GITS_TRANSLATER;
> +	if (__vgic_its_check_cache(dist, db, devid, eventid))
> +		goto out;
> +
> +	/* Always reuse the last entry (LRU policy) */
> +	cte = list_last_entry(&dist->lpi_translation_cache,
> +			      typeof(*cte), entry);
> +
> +	/*
> +	 * Caching the translation implies having an extra reference
> +	 * to the interrupt, so drop the potential reference on what
> +	 * was in the cache, and increment it on the new interrupt.
> +	 */
> +	if (cte->irq)
> +		__vgic_put_lpi_locked(kvm, cte->irq);
> +
> +	vgic_get_irq_kref(irq);
> +
> +	cte->db		= db;
> +	cte->devid	= devid;
> +	cte->eventid	= eventid;
> +	cte->irq	= irq;
> +
> +	/* Move the new translation to the head of the list */
> +	list_move(&cte->entry, &dist->lpi_translation_cache);
> +
> +out:
> +	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
> +}
> +
>  void vgic_its_invalidate_cache(struct kvm *kvm)
>  {
>  	struct vgic_dist *dist = &kvm->arch.vgic;
> @@ -589,6 +673,8 @@ int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
>  	if (!vcpu->arch.vgic_cpu.lpis_enabled)
>  		return -EBUSY;
>  
> +	vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq);
> +
>  	*irq = ite->irq;
>  	return 0;
>  }
> 
Reviewed-by: Eric Auger <eric.auger@redhat.com>

Thanks

Eric

Patch

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 0aa0cbbc3af6..62932458476a 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -546,6 +546,90 @@  static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
 	return 0;
 }
 
+static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
+					       phys_addr_t db,
+					       u32 devid, u32 eventid)
+{
+	struct vgic_translation_cache_entry *cte;
+	struct vgic_irq *irq = NULL;
+
+	list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
+		/*
+		 * If we hit a NULL entry, there is nothing after this
+		 * point.
+		 */
+		if (!cte->irq)
+			break;
+
+		if (cte->db == db &&
+		    cte->devid == devid &&
+		    cte->eventid == eventid) {
+			/*
+			 * Move this entry to the head, as it is the
+			 * most recently used.
+			 */
+			list_move(&cte->entry, &dist->lpi_translation_cache);
+			irq = cte->irq;
+			break;
+		}
+	}
+
+	return irq;
+}
+
+static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
+				       u32 devid, u32 eventid,
+				       struct vgic_irq *irq)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct vgic_translation_cache_entry *cte;
+	unsigned long flags;
+	phys_addr_t db;
+
+	/* Do not cache a directly injected interrupt */
+	if (irq->hw)
+		return;
+
+	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+
+	if (unlikely(list_empty(&dist->lpi_translation_cache)))
+		goto out;
+
+	/*
+	 * We could have raced with another CPU caching the same
+	 * translation behind our back, so let's check it is not in
+	 * already
+	 */
+	db = its->vgic_its_base + GITS_TRANSLATER;
+	if (__vgic_its_check_cache(dist, db, devid, eventid))
+		goto out;
+
+	/* Always reuse the last entry (LRU policy) */
+	cte = list_last_entry(&dist->lpi_translation_cache,
+			      typeof(*cte), entry);
+
+	/*
+	 * Caching the translation implies having an extra reference
+	 * to the interrupt, so drop the potential reference on what
+	 * was in the cache, and increment it on the new interrupt.
+	 */
+	if (cte->irq)
+		__vgic_put_lpi_locked(kvm, cte->irq);
+
+	vgic_get_irq_kref(irq);
+
+	cte->db		= db;
+	cte->devid	= devid;
+	cte->eventid	= eventid;
+	cte->irq	= irq;
+
+	/* Move the new translation to the head of the list */
+	list_move(&cte->entry, &dist->lpi_translation_cache);
+
+out:
+	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+}
+
 void vgic_its_invalidate_cache(struct kvm *kvm)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
@@ -589,6 +673,8 @@  int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
 	if (!vcpu->arch.vgic_cpu.lpis_enabled)
 		return -EBUSY;
 
+	vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq);
+
 	*irq = ite->irq;
 	return 0;
 }