[12/15] KVM: arm64: vgic-its: Pick cache victim based on usage count

Message ID: 20240124204909.105952-13-oliver.upton@linux.dev (mailing list archive)
State: New, archived
Series: KVM: arm64: Improvements to GICv3 LPI injection

Commit Message

Oliver Upton Jan. 24, 2024, 8:49 p.m. UTC
To date, the translation cache LRU policy has relied on the ordering of
the linked list to pick the victim, as entries are moved to the head of
the list on every cache hit. This sort of transformation is incompatible
with an rculist, necessitating a different strategy for recording usage
in-place.

Count the number of cache hits since the last translation cache miss for
every entry. The preferences for selecting a victim are as follows:

 - Invalid entries over valid entries

 - Valid entry with the lowest usage count

 - In the case of a tie, pick the entry closest to the tail (oldest)
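
As a reading aid, here is the selection loop from vgic_its_cache_victim()
in the diff below, condensed and with editorial comments mapping it to the
preferences above:

	list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
		/* Invalid entries are preferred: reuse one immediately */
		if (!cte->irq)
			return cte;

		/*
		 * Lowest usage count wins; the exchange also resets the
		 * count, so it measures hits since the last cache miss.
		 */
		tmp = atomic64_xchg_relaxed(&cte->usage_count, 0);

		/*
		 * "<=" breaks ties in favour of entries closer to the tail
		 * (older), since the list is walked from head to tail.
		 */
		if (!victim || tmp <= min) {
			victim = cte;
			min = tmp;
		}
	}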

Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
 arch/arm64/kvm/vgic/vgic-its.c | 42 ++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 10 deletions(-)

Comments

Oliver Upton Jan. 25, 2024, 9:22 a.m. UTC | #1
On Wed, Jan 24, 2024 at 08:49:06PM +0000, Oliver Upton wrote:

[...]

> +static struct vgic_translation_cache_entry *vgic_its_cache_victim(struct vgic_dist *dist)
> +{
> +	struct vgic_translation_cache_entry *cte, *victim = NULL;
> +	u64 min, tmp;
> +
> +	/*
> +	 * Find the least used cache entry since the last cache miss, preferring
> +	 * older entries in the case of a tie. Note that usage accounting is
> +	 * deliberately non-atomic, so this is all best-effort.
> +	 */

This comment is stale, and does not reflect the fact that I'm very
obviously using atomics below.
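
A corrected wording might read something like the following (only a
suggestion, not part of the posted patch):

	/*
	 * Find the least used cache entry since the last cache miss, preferring
	 * older entries in the case of a tie. The usage counters are updated
	 * with relaxed atomics and reset on every search, so this is still
	 * best-effort.
	 */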
Marc Zyngier Jan. 25, 2024, 10:55 a.m. UTC | #2
On Wed, 24 Jan 2024 20:49:06 +0000,
Oliver Upton <oliver.upton@linux.dev> wrote:
> 
> To date, the translation cache LRU policy has relied on the ordering of
> the linked list to pick the victim, as entries are moved to the head of
> the list on every cache hit. This sort of transformation is incompatible
> with an rculist, necessitating a different strategy for recording usage
> in-place.
> 
> Count the number of cache hits since the last translation cache miss for
> every entry. The preferences for selecting a victim are as follows:
> 
>  - Invalid entries over valid entries
> 
>  - Valid entry with the lowest usage count
> 
>  - In the case of a tie, pick the entry closest to the tail (oldest)
> 
> Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
> ---
>  arch/arm64/kvm/vgic/vgic-its.c | 42 ++++++++++++++++++++++++++--------
>  1 file changed, 32 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
> index aec82d9a1b3c..ed0c6c333a6c 100644
> --- a/arch/arm64/kvm/vgic/vgic-its.c
> +++ b/arch/arm64/kvm/vgic/vgic-its.c
> @@ -154,6 +154,7 @@ struct vgic_translation_cache_entry {
>  	u32			devid;
>  	u32			eventid;
>  	struct vgic_irq		*irq;
> +	atomic64_t		usage_count;
>  };
>  
>  /**
> @@ -577,13 +578,7 @@ static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
>  		    cte->eventid != eventid)
>  			continue;
>  
> -		/*
> -		 * Move this entry to the head, as it is the most
> -		 * recently used.
> -		 */
> -		if (!list_is_first(&cte->entry, &dist->lpi_translation_cache))
> -			list_move(&cte->entry, &dist->lpi_translation_cache);
> -
> +		atomic64_inc(&cte->usage_count);
>  		return cte->irq;
>  	}
>  
> @@ -616,6 +611,30 @@ static unsigned int vgic_its_max_cache_size(struct kvm *kvm)
>  	return atomic_read(&kvm->online_vcpus) * LPI_DEFAULT_PCPU_CACHE_SIZE;
>  }
>  
> +static struct vgic_translation_cache_entry *vgic_its_cache_victim(struct vgic_dist *dist)
> +{
> +	struct vgic_translation_cache_entry *cte, *victim = NULL;
> +	u64 min, tmp;
> +
> +	/*
> +	 * Find the least used cache entry since the last cache miss, preferring
> +	 * older entries in the case of a tie. Note that usage accounting is
> +	 * deliberately non-atomic, so this is all best-effort.
> +	 */
> +	list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
> +		if (!cte->irq)
> +			return cte;
> +
> +		tmp = atomic64_xchg_relaxed(&cte->usage_count, 0);
> +		if (!victim || tmp <= min) {

min is not initialised until after the first round. Not great. How
come the compiler doesn't spot this?

> +			victim = cte;
> +			min = tmp;
> +		}
> +	}

So this resets all the counters on each search for a new insertion?
Seems expensive, especially on large VMs (512 * 16 = up to 8K SWP
instructions in a tight loop, and I'm not even mentioning the fun
without LSE). I can at least think of a box that will throw its
interconnect out of the pram if tickled that way.

I'd rather the new cache entry inherits the max of the current set,
making it a lot cheaper. We can always detect the overflow and do a
full invalidation in that case (worst case -- better options exist).
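
Something along the lines of this untested sketch (the helper name is
invented, and locking and the full-invalidation-on-overflow path are only
hinted at):

	/*
	 * Sketch of the "inherit the max" idea: seed a newly inserted entry
	 * with the largest usage count currently in the cache, so the victim
	 * search only has to read the counters instead of resetting them all.
	 */
	static void vgic_its_seed_usage_count(struct vgic_dist *dist,
					      struct vgic_translation_cache_entry *new)
	{
		struct vgic_translation_cache_entry *cte;
		u64 tmp, max_count = 0;

		list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
			tmp = atomic64_read(&cte->usage_count);
			if (tmp > max_count)
				max_count = tmp;
		}

		/*
		 * Worst case: if the counters are about to wrap, a full cache
		 * invalidation (elided here) would reset the ranking instead.
		 */
		if (unlikely(max_count == U64_MAX))
			max_count = 0;

		atomic64_set(&new->usage_count, max_count);
	}

vgic_its_cache_victim() would then only read the counters rather than
exchanging them back to zero on every search.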

> +
> +	return victim;
> +}
> +
>  static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
>  				       u32 devid, u32 eventid,
>  				       struct vgic_irq *irq)
> @@ -645,9 +664,12 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
>  		goto out;
>  
>  	if (dist->lpi_cache_count >= vgic_its_max_cache_size(kvm)) {
> -		/* Always reuse the last entry (LRU policy) */
> -		victim = list_last_entry(&dist->lpi_translation_cache,
> -				      typeof(*cte), entry);
> +		victim = vgic_its_cache_victim(dist);
> +		if (WARN_ON_ONCE(!victim)) {
> +			victim = new;
> +			goto out;
> +		}

I don't understand how this could happen. It sort of explains the
oddity I was mentioning earlier, but I don't think we need this
complexity.

Thanks,

	M.
Oliver Upton Jan. 25, 2024, 3:34 p.m. UTC | #3
On Thu, Jan 25, 2024 at 10:55:19AM +0000, Marc Zyngier wrote:
> On Wed, 24 Jan 2024 20:49:06 +0000, Oliver Upton <oliver.upton@linux.dev> wrote:

[...]

> > +static struct vgic_translation_cache_entry *vgic_its_cache_victim(struct vgic_dist *dist)
> > +{
> > +	struct vgic_translation_cache_entry *cte, *victim = NULL;
> > +	u64 min, tmp;
> > +
> > +	/*
> > +	 * Find the least used cache entry since the last cache miss, preferring
> > +	 * older entries in the case of a tie. Note that usage accounting is
> > +	 * deliberately non-atomic, so this is all best-effort.
> > +	 */
> > +	list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
> > +		if (!cte->irq)
> > +			return cte;
> > +
> > +		tmp = atomic64_xchg_relaxed(&cte->usage_count, 0);
> > +		if (!victim || tmp <= min) {
> 
> min is not initialised until after the first round. Not great. How
> come the compiler doesn't spot this?

min never gets read on the first iteration, since victim is known to be
NULL. Happy to initialize it though to keep this more obviously sane.

> > +			victim = cte;
> > +			min = tmp;
> > +		}
> > +	}
> 
> So this resets all the counters on each search for a new insertion?
> Seems expensive, especially on large VMs (512 * 16 = up to 8K SWP
> instructions in a tight loop, and I'm not even mentioning the fun
> without LSE). I can at least think of a box that will throw its
> interconnect out of the pram if tickled that way.

Well, each cache eviction after we hit the cache limit. I wrote this up
to have _something_ that allowed the rculist conversion to later come
back to rework further, but that obviously didn't happen.

> I'd rather the new cache entry inherits the max of the current set,
> making it a lot cheaper. We can always detect the overflow and do a
> full invalidation in that case (worst case -- better options exist).

Yeah, I like your suggested approach. I'll probably build a bit on top
of that.

> > +
> > +	return victim;
> > +}
> > +
> >  static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
> >  				       u32 devid, u32 eventid,
> >  				       struct vgic_irq *irq)
> > @@ -645,9 +664,12 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
> >  		goto out;
> >  
> >  	if (dist->lpi_cache_count >= vgic_its_max_cache_size(kvm)) {
> > -		/* Always reuse the last entry (LRU policy) */
> > -		victim = list_last_entry(&dist->lpi_translation_cache,
> > -				      typeof(*cte), entry);
> > +		victim = vgic_its_cache_victim(dist);
> > +		if (WARN_ON_ONCE(!victim)) {
> > +			victim = new;
> > +			goto out;
> > +		}
>
> I don't understand how this could happen. It sort of explains the
> oddity I was mentioning earlier, but I don't think we need this
> complexity.

The only way it could actually happen is if a bug were introduced where
lpi_cache_count is somehow nonzero but the list is empty. But yeah, we
can dump this and assume we find a victim, which ought to always be
true.
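
Concretely, dropping the check would reduce the insertion path to
something like this (a sketch of the simplification discussed here, not
the hunk as posted):

	if (dist->lpi_cache_count >= vgic_its_max_cache_size(kvm)) {
		/*
		 * lpi_cache_count is nonzero here, so the list cannot be
		 * empty and vgic_its_cache_victim() always finds an entry.
		 */
		victim = vgic_its_cache_victim(dist);
		list_del(&victim->entry);
		dist->lpi_cache_count--;
	}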
Marc Zyngier Jan. 25, 2024, 6:07 p.m. UTC | #4
On Thu, 25 Jan 2024 15:34:31 +0000,
Oliver Upton <oliver.upton@linux.dev> wrote:
> 
> On Thu, Jan 25, 2024 at 10:55:19AM +0000, Marc Zyngier wrote:
> > On Wed, 24 Jan 2024 20:49:06 +0000, Oliver Upton <oliver.upton@linux.dev> wrote:
> 
> [...]
> 
> > > +static struct vgic_translation_cache_entry *vgic_its_cache_victim(struct vgic_dist *dist)
> > > +{
> > > +	struct vgic_translation_cache_entry *cte, *victim = NULL;
> > > +	u64 min, tmp;
> > > +
> > > +	/*
> > > +	 * Find the least used cache entry since the last cache miss, preferring
> > > +	 * older entries in the case of a tie. Note that usage accounting is
> > > +	 * deliberately non-atomic, so this is all best-effort.
> > > +	 */
> > > +	list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
> > > +		if (!cte->irq)
> > > +			return cte;
> > > +
> > > +		tmp = atomic64_xchg_relaxed(&cte->usage_count, 0);
> > > +		if (!victim || tmp <= min) {
> > 
> > min is not initialised until after the first round. Not great. How
> > come the compiler doesn't spot this?
> 
> min never gets read on the first iteration, since victim is known to be
> NULL. Happy to initialize it though to keep this more obviously
> sane.

Ah, gotcha! Completely missed that. Sorry about the noise.

> 
> > > +			victim = cte;
> > > +			min = tmp;
> > > +		}
> > > +	}
> > 
> > So this resets all the counters on each search for a new insertion?
> > Seems expensive, especially on large VMs (512 * 16 = up to 8K SWP
> > instructions in a tight loop, and I'm not even mentioning the fun
> > without LSE). I can at least think of a box that will throw its
> > interconnect out of the pram if tickled that way.
> 
> Well, each cache eviction after we hit the cache limit. I wrote this up
> to have _something_ that allowed the rculist conversion to later come
> back to rework further, but that obviously didn't happen.
> 
> > I'd rather the new cache entry inherits the max of the current set,
> > making it a lot cheaper. We can always detect the overflow and do a
> > full invalidation in that case (worst case -- better options exist).
> 
> Yeah, I like your suggested approach. I'll probably build a bit on top
> of that.
> 
> > > +
> > > +	return victim;
> > > +}
> > > +
> > >  static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
> > >  				       u32 devid, u32 eventid,
> > >  				       struct vgic_irq *irq)
> > > @@ -645,9 +664,12 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
> > >  		goto out;
> > >  
> > >  	if (dist->lpi_cache_count >= vgic_its_max_cache_size(kvm)) {
> > > -		/* Always reuse the last entry (LRU policy) */
> > > -		victim = list_last_entry(&dist->lpi_translation_cache,
> > > -				      typeof(*cte), entry);
> > > +		victim = vgic_its_cache_victim(dist);
> > > +		if (WARN_ON_ONCE(!victim)) {
> > > +			victim = new;
> > > +			goto out;
> > > +		}
> >
> > I don't understand how this could happen. It sort of explains the
> > oddity I was mentioning earlier, but I don't think we need this
> > complexity.
> 
> The only way it could actually happen is if a bug were introduced where
> lpi_cache_count is somehow nonzero but the list is empty. But yeah, we
> can dump this and assume we find a victim, which ought to always be
> true.

Right, that was my impression as well, and I couldn't find a way to
fail it.

Thanks,

	M.

Patch

diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index aec82d9a1b3c..ed0c6c333a6c 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -154,6 +154,7 @@  struct vgic_translation_cache_entry {
 	u32			devid;
 	u32			eventid;
 	struct vgic_irq		*irq;
+	atomic64_t		usage_count;
 };
 
 /**
@@ -577,13 +578,7 @@  static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
 		    cte->eventid != eventid)
 			continue;
 
-		/*
-		 * Move this entry to the head, as it is the most
-		 * recently used.
-		 */
-		if (!list_is_first(&cte->entry, &dist->lpi_translation_cache))
-			list_move(&cte->entry, &dist->lpi_translation_cache);
-
+		atomic64_inc(&cte->usage_count);
 		return cte->irq;
 	}
 
@@ -616,6 +611,30 @@  static unsigned int vgic_its_max_cache_size(struct kvm *kvm)
 	return atomic_read(&kvm->online_vcpus) * LPI_DEFAULT_PCPU_CACHE_SIZE;
 }
 
+static struct vgic_translation_cache_entry *vgic_its_cache_victim(struct vgic_dist *dist)
+{
+	struct vgic_translation_cache_entry *cte, *victim = NULL;
+	u64 min, tmp;
+
+	/*
+	 * Find the least used cache entry since the last cache miss, preferring
+	 * older entries in the case of a tie. Note that usage accounting is
+	 * deliberately non-atomic, so this is all best-effort.
+	 */
+	list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
+		if (!cte->irq)
+			return cte;
+
+		tmp = atomic64_xchg_relaxed(&cte->usage_count, 0);
+		if (!victim || tmp <= min) {
+			victim = cte;
+			min = tmp;
+		}
+	}
+
+	return victim;
+}
+
 static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
 				       u32 devid, u32 eventid,
 				       struct vgic_irq *irq)
@@ -645,9 +664,12 @@  static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
 		goto out;
 
 	if (dist->lpi_cache_count >= vgic_its_max_cache_size(kvm)) {
-		/* Always reuse the last entry (LRU policy) */
-		victim = list_last_entry(&dist->lpi_translation_cache,
-				      typeof(*cte), entry);
+		victim = vgic_its_cache_victim(dist);
+		if (WARN_ON_ONCE(!victim)) {
+			victim = new;
+			goto out;
+		}
+
 		list_del(&victim->entry);
 		dist->lpi_cache_count--;
 	} else {