
[RFC] kvm: nv: Optimize the unmapping of shadow S2-MMU tables.

Message ID 20240305054606.13261-1-gankulkarni@os.amperecomputing.com (mailing list archive)
State New, archived
Series [RFC] kvm: nv: Optimize the unmapping of shadow S2-MMU tables.

Commit Message

Ganapatrao Kulkarni March 5, 2024, 5:46 a.m. UTC
As per commit 178a6915434c ("KVM: arm64: nv: Unmap/flush shadow stage 2
page tables"), whenever pages that are mapped to L1 are unmapped, they
are invalidated from both the L1 S2-MMU and from all of the active
shadow/L2 S2-MMU tables. Since there is no mapping from a page back to
the shadow-S2 IPAs that reference it, a complete S2-MMU page table walk
and invalidation is done, covering the complete address space allocated
to an L2. This has a performance impact and even causes soft lockups
for NV (L1 and L2) boots with a higher number of CPUs and large memory.

Add a lookup table of the mapping between Shadow IPA and Canonical IPA,
populated whenever a page is mapped into any of the L2s. When a page is
unmapped, this lookup lets us unmap it only if it is actually mapped in
one of the shadow S2-MMU tables, avoiding unnecessarily long iterations
of S2-MMU table walk-through and invalidation over the complete address
space.

Signed-off-by: Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com>
---
 arch/arm64/include/asm/kvm_emulate.h |   5 ++
 arch/arm64/include/asm/kvm_host.h    |  14 ++++
 arch/arm64/include/asm/kvm_nested.h  |   4 +
 arch/arm64/kvm/mmu.c                 |  19 ++++-
 arch/arm64/kvm/nested.c              | 113 +++++++++++++++++++++++++++
 5 files changed, 152 insertions(+), 3 deletions(-)

Comments

Oliver Upton March 5, 2024, 8:46 a.m. UTC | #1
-cc old kvmarm list
+cc new kvmarm list, reviewers

Please run scripts/get_maintainer.pl next time around so we get the
right people looking at a patch.

On Mon, Mar 04, 2024 at 09:46:06PM -0800, Ganapatrao Kulkarni wrote:
> @@ -216,6 +223,13 @@ struct kvm_s2_mmu {
>  	 * >0: Somebody is actively using this.
>  	 */
>  	atomic_t refcnt;
> +
> +	/*
> +	 * For a Canonical IPA to Shadow IPA mapping.
> +	 */
> +	struct rb_root nested_mapipa_root;

There isn't any benefit to tracking the canonical IPA -> shadow IPA(s)
mapping on a per-S2 basis, as there already exists a one-to-many problem
(more below). Maintaining a per-VM data structure (since this is keyed
by canonical IPA) makes a bit more sense.

> +	rwlock_t mmu_lock;
> +

Err, is there any reason the existing mmu_lock is insufficient here?
Surely taking a new reference on a canonical IPA for a shadow S2 must be
done behind the MMU lock for it to be safe against MMU notifiers...

Also, reusing the exact same name for it is sure to produce some lock
imbalance funnies.

>  };
>  
>  static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu)
> diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
> index da7ebd2f6e24..c31a59a1fdc6 100644
> --- a/arch/arm64/include/asm/kvm_nested.h
> +++ b/arch/arm64/include/asm/kvm_nested.h
> @@ -65,6 +65,9 @@ extern void kvm_init_nested(struct kvm *kvm);
>  extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu);
>  extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu);
>  extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu);
> +extern void add_shadow_ipa_map_node(
> +		struct kvm_s2_mmu *mmu,
> +		phys_addr_t ipa, phys_addr_t shadow_ipa, long size);

style nitpick: no newline between the open bracket and first parameter.
Wrap as needed at 80 (or a bit more) columns.

> +/*
> + * Create a node and add to lookup table, when a page is mapped to
> + * Canonical IPA and also mapped to Shadow IPA.
> + */
> +void add_shadow_ipa_map_node(struct kvm_s2_mmu *mmu,
> +			phys_addr_t ipa,
> +			phys_addr_t shadow_ipa, long size)
> +{
> +	struct rb_root *ipa_root = &(mmu->nested_mapipa_root);
> +	struct rb_node **node = &(ipa_root->rb_node), *parent = NULL;
> +	struct mapipa_node *new;
> +
> +	new = kzalloc(sizeof(struct mapipa_node), GFP_KERNEL);
> +	if (!new)
> +		return;

Should be GFP_KERNEL_ACCOUNT, you want to charge this to the user.
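For illustration, the same allocation charged to the user (untested,
only the flag changes):

	new = kzalloc(sizeof(struct mapipa_node), GFP_KERNEL_ACCOUNT);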

> +
> +	new->shadow_ipa = shadow_ipa;
> +	new->ipa = ipa;
> +	new->size = size;

What about aliasing? You could have multiple shadow IPAs that point to
the same canonical IPA, even within a single MMU.

> +	write_lock(&mmu->mmu_lock);
> +
> +	while (*node) {
> +		struct mapipa_node *tmp;
> +
> +		tmp = container_of(*node, struct mapipa_node, node);
> +		parent = *node;
> +		if (new->ipa < tmp->ipa) {
> +			node = &(*node)->rb_left;
> +		} else if (new->ipa > tmp->ipa) {
> +			node = &(*node)->rb_right;
> +		} else {
> +			write_unlock(&mmu->mmu_lock);
> +			kfree(new);
> +			return;
> +		}
> +	}
> +
> +	rb_link_node(&new->node, parent, node);
> +	rb_insert_color(&new->node, ipa_root);
> +	write_unlock(&mmu->mmu_lock);

Meh, one of the annoying things with rbtree is you have to build your
own search functions...

It would appear that the rbtree intends to express intervals (i.e. GPA +
size), but the search implementation treats GPA as an index. So I don't
think this works as intended.

Have you considered other abstract data types (e.g. xarray, maple tree)
and how they might apply here?
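For example, here is a purely illustrative (and untested) sketch of a
maple tree variant, keyed on the canonical interval rather than its
start address; the per-VM tree 'nested_rmap' and the helper name are
made up for the sake of the example:

#include <linux/maple_tree.h>

/*
 * Assumes a hypothetical per-VM 'struct maple_tree nested_rmap' in
 * struct kvm_arch, initialised with mt_init() at VM creation.
 */
static int add_shadow_ipa_mapping(struct kvm *kvm, phys_addr_t ipa,
				  phys_addr_t shadow_ipa, long size)
{
	struct mapipa_node *new;
	int ret;

	new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
	if (!new)
		return -ENOMEM;

	new->shadow_ipa = shadow_ipa;
	new->size = size;

	/* Key on the whole canonical range, not just its start address */
	ret = mtree_store_range(&kvm->arch.nested_rmap, ipa,
				ipa + size - 1, new, GFP_KERNEL_ACCOUNT);
	if (ret)
		kfree(new);
	return ret;
}

A lookup by any address inside the range then comes for free from
mtree_load()/mt_find(), and the hand-rolled rbtree search goes away.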

> +bool get_shadow_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, phys_addr_t *shadow_ipa, long *size)
> +{
> +	struct rb_node *node;
> +	struct mapipa_node *tmp = NULL;
> +
> +	read_lock(&mmu->mmu_lock);
> +	node = mmu->nested_mapipa_root.rb_node;
> +
> +	while (node) {
> +		tmp = container_of(node, struct mapipa_node, node);
> +
> +		if (tmp->ipa == ipa)
> +			break;
> +		else if (ipa > tmp->ipa)
> +			node = node->rb_right;
> +		else
> +			node = node->rb_left;
> +	}
> +
> +	read_unlock(&mmu->mmu_lock);
> +
> +	if (tmp && tmp->ipa == ipa) {
> +		*shadow_ipa = tmp->shadow_ipa;
> +		*size = tmp->size;
> +		write_lock(&mmu->mmu_lock);
> +		rb_erase(&tmp->node, &mmu->nested_mapipa_root);
> +		write_unlock(&mmu->mmu_lock);
> +		kfree(tmp);
> +		return true;
> +	}

Implicitly evicting the entry isn't going to work if we want to use it
for updates to a stage-2 that do not evict the mapping, like write
protection or access flag updates.
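For instance (untested, and assuming the range-keyed maple tree
sketched above), a lookup that leaves the entry in place and lets the
caller decide what to do with it:

/*
 * Returns the rmap entry covering @ipa, or NULL. Only a real unmap
 * would then erase it (mtree_erase()); write-protection or access-flag
 * updates can leave it where it is.
 */
static struct mapipa_node *find_shadow_ipa_mapping(struct kvm *kvm,
						   phys_addr_t ipa)
{
	return mtree_load(&kvm->arch.nested_rmap, ipa);
}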
Marc Zyngier March 5, 2024, 11:08 a.m. UTC | #2
On Tue, 05 Mar 2024 05:46:06 +0000,
Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
> 
> As per 'commit 178a6915434c ("KVM: arm64: nv: Unmap/flush shadow stage 2

$ git describe --contains 178a6915434c --match=v\*
fatal: cannot describe '178a6915434c141edefd116b8da3d55555ea3e63'

This commit simply doesn't exist upstream. It only lives in a
now deprecated branch that will never be merged.

> page tables")', when ever there is unmap of pages that
> are mapped to L1, they are invalidated from both L1 S2-MMU and from
> all the active shadow/L2 S2-MMU tables. Since there is no mapping
> to invalidate the IPAs of Shadow S2 to a page, there is a complete
> S2-MMU page table walk and invalidation is done covering complete
> address space allocated to a L2. This has performance impacts and
> even soft lockup for NV(L1 and L2) boots with higher number of
> CPUs and large Memory.
> 
> Adding a lookup table of mapping of Shadow IPA to Canonical IPA
> whenever a page is mapped to any of the L2. While any page is
> unmaped, this lookup is helpful to unmap only if it is mapped in
> any of the shadow S2-MMU tables. Hence avoids unnecessary long
> iterations of S2-MMU table walk-through and invalidation for the
> complete address space.

All of this falls in the "premature optimisation" bucket. Why should
we bother with any of this when not even 'AT S1' works correctly,
making it trivial to prevent a guest from making forward progress? You
also show no numbers that would hint at a measurable improvement under
any particular workload.

I am genuinely puzzled that you are wasting valuable engineering time
on *this*.

>
> Signed-off-by: Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com>
> ---
>  arch/arm64/include/asm/kvm_emulate.h |   5 ++
>  arch/arm64/include/asm/kvm_host.h    |  14 ++++
>  arch/arm64/include/asm/kvm_nested.h  |   4 +
>  arch/arm64/kvm/mmu.c                 |  19 ++++-
>  arch/arm64/kvm/nested.c              | 113 +++++++++++++++++++++++++++
>  5 files changed, 152 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index 5173f8cf2904..f503b2eaedc4 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -656,4 +656,9 @@ static inline bool kvm_is_shadow_s2_fault(struct kvm_vcpu *vcpu)
>  		vcpu->arch.hw_mmu->nested_stage2_enabled);
>  }
>  
> +static inline bool kvm_is_l1_using_shadow_s2(struct kvm_vcpu *vcpu)
> +{
> +	return (vcpu->arch.hw_mmu != &vcpu->kvm->arch.mmu);
> +}

Isn't that the very definition of "!in_hyp_ctxt()"? You are abusing
the hw_mmu pointer to derive something, but the source of truth is the
translation regime, as defined by HCR_EL2.{E2H,TGE} and PSTATE.M.

> +
>  #endif /* __ARM64_KVM_EMULATE_H__ */
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 8da3c9a81ae3..f61c674c300a 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -144,6 +144,13 @@ struct kvm_vmid {
>  	atomic64_t id;
>  };
>  
> +struct mapipa_node {
> +	struct rb_node node;
> +	phys_addr_t ipa;
> +	phys_addr_t shadow_ipa;
> +	long size;
> +};
> +
>  struct kvm_s2_mmu {
>  	struct kvm_vmid vmid;
>  
> @@ -216,6 +223,13 @@ struct kvm_s2_mmu {
>  	 * >0: Somebody is actively using this.
>  	 */
>  	atomic_t refcnt;
> +
> +	/*
> +	 * For a Canonical IPA to Shadow IPA mapping.
> +	 */
> +	struct rb_root nested_mapipa_root;

Why isn't this a maple tree? If there is no overlap between mappings
(and it really shouldn't be any), why should we use a bare-bone rb-tree?

> +	rwlock_t mmu_lock;

Hell no. We have plenty of locking already, and there is no reason why
this should gain its own locking. I can't see a case where you would
take this lock outside of holding the *real* mmu_lock -- extra bonus
point for the ill-chosen name.

> +
>  };
>  
>  static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu)
> diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
> index da7ebd2f6e24..c31a59a1fdc6 100644
> --- a/arch/arm64/include/asm/kvm_nested.h
> +++ b/arch/arm64/include/asm/kvm_nested.h
> @@ -65,6 +65,9 @@ extern void kvm_init_nested(struct kvm *kvm);
>  extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu);
>  extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu);
>  extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu);
> +extern void add_shadow_ipa_map_node(
> +		struct kvm_s2_mmu *mmu,
> +		phys_addr_t ipa, phys_addr_t shadow_ipa, long size);
>  
>  union tlbi_info;
>  
> @@ -123,6 +126,7 @@ extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu,
>  extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);
>  extern void kvm_nested_s2_wp(struct kvm *kvm);
>  extern void kvm_nested_s2_unmap(struct kvm *kvm);
> +extern void kvm_nested_s2_unmap_range(struct kvm *kvm, struct kvm_gfn_range *range);
>  extern void kvm_nested_s2_flush(struct kvm *kvm);
>  int handle_wfx_nested(struct kvm_vcpu *vcpu, bool is_wfe);
>  
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 61bdd8798f83..3948681426a0 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1695,6 +1695,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  					     memcache,
>  					     KVM_PGTABLE_WALK_HANDLE_FAULT |
>  					     KVM_PGTABLE_WALK_SHARED);
> +		if ((nested || kvm_is_l1_using_shadow_s2(vcpu)) && !ret) {

I don't understand this condition. If nested is non-NULL, it's because
we're using a shadow S2. So why the additional condition?

> +			struct kvm_s2_mmu *shadow_s2_mmu;
> +
> +			ipa &= ~(vma_pagesize - 1);
> +			shadow_s2_mmu = lookup_s2_mmu(vcpu);
> +			add_shadow_ipa_map_node(shadow_s2_mmu, ipa, fault_ipa, vma_pagesize);
> +		}
>  	}
>  
>  	/* Mark the page dirty only if the fault is handled successfully */
> @@ -1918,7 +1925,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>  			     (range->end - range->start) << PAGE_SHIFT,
>  			     range->may_block);
>  
> -	kvm_nested_s2_unmap(kvm);
> +	kvm_nested_s2_unmap_range(kvm, range);
>  	return false;
>  }
>  
> @@ -1953,7 +1960,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
>  			       PAGE_SIZE, __pfn_to_phys(pfn),
>  			       KVM_PGTABLE_PROT_R, NULL, 0);
>  
> -	kvm_nested_s2_unmap(kvm);
> +	kvm_nested_s2_unmap_range(kvm, range);
>  	return false;
>  }
>  
> @@ -2223,12 +2230,18 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
>  void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>  				   struct kvm_memory_slot *slot)
>  {
> +	struct kvm_gfn_range range;
> +
>  	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
>  	phys_addr_t size = slot->npages << PAGE_SHIFT;
>  
> +	range.start = gpa;
> +	range.end = gpa + size;
> +	range.may_block = true;
> +
>  	write_lock(&kvm->mmu_lock);
>  	kvm_unmap_stage2_range(&kvm->arch.mmu, gpa, size);
> -	kvm_nested_s2_unmap(kvm);
> +	kvm_nested_s2_unmap_range(kvm, &range);
>  	write_unlock(&kvm->mmu_lock);
>  }
>  
> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> index f88d9213c6b3..888ec9fba4a0 100644
> --- a/arch/arm64/kvm/nested.c
> +++ b/arch/arm64/kvm/nested.c
> @@ -565,6 +565,88 @@ void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid,
>  	write_unlock(&kvm->mmu_lock);
>  }
>  
> +/*
> + * Create a node and add to lookup table, when a page is mapped to
> + * Canonical IPA and also mapped to Shadow IPA.
> + */
> +void add_shadow_ipa_map_node(struct kvm_s2_mmu *mmu,
> +			phys_addr_t ipa,
> +			phys_addr_t shadow_ipa, long size)
> +{
> +	struct rb_root *ipa_root = &(mmu->nested_mapipa_root);
> +	struct rb_node **node = &(ipa_root->rb_node), *parent = NULL;
> +	struct mapipa_node *new;
> +
> +	new = kzalloc(sizeof(struct mapipa_node), GFP_KERNEL);
> +	if (!new)
> +		return;
> +
> +	new->shadow_ipa = shadow_ipa;
> +	new->ipa = ipa;
> +	new->size = size;
> +
> +	write_lock(&mmu->mmu_lock);
> +
> +	while (*node) {
> +		struct mapipa_node *tmp;
> +
> +		tmp = container_of(*node, struct mapipa_node, node);
> +		parent = *node;
> +		if (new->ipa < tmp->ipa) {
> +			node = &(*node)->rb_left;
> +		} else if (new->ipa > tmp->ipa) {
> +			node = &(*node)->rb_right;
> +		} else {
> +			write_unlock(&mmu->mmu_lock);
> +			kfree(new);
> +			return;
> +		}
> +	}
> +
> +	rb_link_node(&new->node, parent, node);
> +	rb_insert_color(&new->node, ipa_root);
> +	write_unlock(&mmu->mmu_lock);

All this should be removed in favour of simply using a maple tree.

> +}
> +
> +/*
> + * Iterate over the lookup table of Canonical IPA to Shadow IPA.
> + * Return Shadow IPA, if the page mapped to Canonical IPA is
> + * also mapped to a Shadow IPA.
> + *
> + */
> +bool get_shadow_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, phys_addr_t *shadow_ipa, long *size)

static?

> +{
> +	struct rb_node *node;
> +	struct mapipa_node *tmp = NULL;
> +
> +	read_lock(&mmu->mmu_lock);
> +	node = mmu->nested_mapipa_root.rb_node;
> +
> +	while (node) {
> +		tmp = container_of(node, struct mapipa_node, node);
> +
> +		if (tmp->ipa == ipa)

What guarantees that the mapping you have for L1 has the same starting
address as the one you have for L2? L1 could have a 2MB mapping and L2
only 4kB *in the middle*.

> +			break;
> +		else if (ipa > tmp->ipa)
> +			node = node->rb_right;
> +		else
> +			node = node->rb_left;
> +	}
> +
> +	read_unlock(&mmu->mmu_lock);

Why would you drop the lock here....

> +
> +	if (tmp && tmp->ipa == ipa) {
> +		*shadow_ipa = tmp->shadow_ipa;
> +		*size = tmp->size;
> +		write_lock(&mmu->mmu_lock);

... if taking it again here? What could have changed in between?

> +		rb_erase(&tmp->node, &mmu->nested_mapipa_root);
> +		write_unlock(&mmu->mmu_lock);
> +		kfree(tmp);
> +		return true;
> +	}
> +	return false;
> +}

So simply hitting in the reverse mapping structure *frees* it? Meaning
that you cannot use it as a way to update a mapping?

> +
>  /* Must be called with kvm->mmu_lock held */
>  struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu)
>  {
> @@ -674,6 +756,7 @@ void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
>  	mmu->tlb_vttbr = 1;
>  	mmu->nested_stage2_enabled = false;
>  	atomic_set(&mmu->refcnt, 0);
> +	mmu->nested_mapipa_root = RB_ROOT;
>  }
>  
>  void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
> @@ -760,6 +843,36 @@ void kvm_nested_s2_unmap(struct kvm *kvm)
>  	}
>  }
>  
> +void kvm_nested_s2_unmap_range(struct kvm *kvm, struct kvm_gfn_range *range)
> +{
> +	int i;
> +	long size;
> +	bool ret;
> +
> +	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
> +		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
> +
> +		if (kvm_s2_mmu_valid(mmu)) {
> +			phys_addr_t shadow_ipa, start, end;
> +
> +			start = range->start << PAGE_SHIFT;
> +			end = range->end << PAGE_SHIFT;
> +
> +			while (start < end) {
> +				size = PAGE_SIZE;
> +				/*
> +				 * get the Shadow IPA if the page is mapped
> +				 * to L1 and also mapped to any of active L2.
> +				 */

Why is L1 relevant here?

> +				ret = get_shadow_ipa(mmu, start, &shadow_ipa, &size);
> +				if (ret)
> +					kvm_unmap_stage2_range(mmu, shadow_ipa, size);
> +				start += size;
> +			}
> +		}
> +	}
> +}
> +
>  /* expects kvm->mmu_lock to be held */
>  void kvm_nested_s2_flush(struct kvm *kvm)
>  {

There are a bunch of worrying issues with this patch. But more
importantly, this looks like a waste of effort until the core issues
that NV still has are solved, and I will not consider anything of the
sort until then.

I get the ugly feeling that you are trying to make it look as if it
was "production ready", which it won't be for another few years,
specially if the few interested people (such as you) are ignoring the
core issues in favour of marketing driven features ("make it fast").

Thanks,

	M.
Marc Zyngier March 5, 2024, 11:13 a.m. UTC | #3
[re-sending with kvmarm@ fixed]

On Tue, 05 Mar 2024 05:46:06 +0000,
Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
> 
> As per 'commit 178a6915434c ("KVM: arm64: nv: Unmap/flush shadow stage 2

$ git describe --contains 178a6915434c --match=v\*
fatal: cannot describe '178a6915434c141edefd116b8da3d55555ea3e63'

This commit simply doesn't exist upstream. It only lives in a
now deprecated branch that will never be merged.

> page tables")', when ever there is unmap of pages that
> are mapped to L1, they are invalidated from both L1 S2-MMU and from
> all the active shadow/L2 S2-MMU tables. Since there is no mapping
> to invalidate the IPAs of Shadow S2 to a page, there is a complete
> S2-MMU page table walk and invalidation is done covering complete
> address space allocated to a L2. This has performance impacts and
> even soft lockup for NV(L1 and L2) boots with higher number of
> CPUs and large Memory.
> 
> Adding a lookup table of mapping of Shadow IPA to Canonical IPA
> whenever a page is mapped to any of the L2. While any page is
> unmaped, this lookup is helpful to unmap only if it is mapped in
> any of the shadow S2-MMU tables. Hence avoids unnecessary long
> iterations of S2-MMU table walk-through and invalidation for the
> complete address space.

All of this falls in the "premature optimisation" bucket. Why should
we bother with any of this when not even 'AT S1' works correctly,
making it trivial to prevent a guest from making forward progress? You
also show no numbers that would hint at a measurable improvement under
any particular workload.

I am genuinely puzzled that you are wasting valuable engineering time
on *this*.

>
> Signed-off-by: Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com>
> ---
>  arch/arm64/include/asm/kvm_emulate.h |   5 ++
>  arch/arm64/include/asm/kvm_host.h    |  14 ++++
>  arch/arm64/include/asm/kvm_nested.h  |   4 +
>  arch/arm64/kvm/mmu.c                 |  19 ++++-
>  arch/arm64/kvm/nested.c              | 113 +++++++++++++++++++++++++++
>  5 files changed, 152 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index 5173f8cf2904..f503b2eaedc4 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -656,4 +656,9 @@ static inline bool kvm_is_shadow_s2_fault(struct kvm_vcpu *vcpu)
>  		vcpu->arch.hw_mmu->nested_stage2_enabled);
>  }
>  
> +static inline bool kvm_is_l1_using_shadow_s2(struct kvm_vcpu *vcpu)
> +{
> +	return (vcpu->arch.hw_mmu != &vcpu->kvm->arch.mmu);
> +}

Isn't that the very definition of "!in_hyp_ctxt()"? You are abusing
the hw_mmu pointer to derive something, but the source of truth is the
translation regime, as defined by HCR_EL2.{E2H,TGE} and PSTATE.M.

> +
>  #endif /* __ARM64_KVM_EMULATE_H__ */
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 8da3c9a81ae3..f61c674c300a 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -144,6 +144,13 @@ struct kvm_vmid {
>  	atomic64_t id;
>  };
>  
> +struct mapipa_node {
> +	struct rb_node node;
> +	phys_addr_t ipa;
> +	phys_addr_t shadow_ipa;
> +	long size;
> +};
> +
>  struct kvm_s2_mmu {
>  	struct kvm_vmid vmid;
>  
> @@ -216,6 +223,13 @@ struct kvm_s2_mmu {
>  	 * >0: Somebody is actively using this.
>  	 */
>  	atomic_t refcnt;
> +
> +	/*
> +	 * For a Canonical IPA to Shadow IPA mapping.
> +	 */
> +	struct rb_root nested_mapipa_root;

Why isn't this a maple tree? If there is no overlap between mappings
(and it really shouldn't be any), why should we use a bare-bone rb-tree?

> +	rwlock_t mmu_lock;

Hell no. We have plenty of locking already, and there is no reason why
this should gain its own locking. I can't see a case where you would
take this lock outside of holding the *real* mmu_lock -- extra bonus
point for the ill-chosen name.

> +
>  };
>  
>  static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu)
> diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
> index da7ebd2f6e24..c31a59a1fdc6 100644
> --- a/arch/arm64/include/asm/kvm_nested.h
> +++ b/arch/arm64/include/asm/kvm_nested.h
> @@ -65,6 +65,9 @@ extern void kvm_init_nested(struct kvm *kvm);
>  extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu);
>  extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu);
>  extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu);
> +extern void add_shadow_ipa_map_node(
> +		struct kvm_s2_mmu *mmu,
> +		phys_addr_t ipa, phys_addr_t shadow_ipa, long size);
>  
>  union tlbi_info;
>  
> @@ -123,6 +126,7 @@ extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu,
>  extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);
>  extern void kvm_nested_s2_wp(struct kvm *kvm);
>  extern void kvm_nested_s2_unmap(struct kvm *kvm);
> +extern void kvm_nested_s2_unmap_range(struct kvm *kvm, struct kvm_gfn_range *range);
>  extern void kvm_nested_s2_flush(struct kvm *kvm);
>  int handle_wfx_nested(struct kvm_vcpu *vcpu, bool is_wfe);
>  
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 61bdd8798f83..3948681426a0 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1695,6 +1695,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  					     memcache,
>  					     KVM_PGTABLE_WALK_HANDLE_FAULT |
>  					     KVM_PGTABLE_WALK_SHARED);
> +		if ((nested || kvm_is_l1_using_shadow_s2(vcpu)) && !ret) {

I don't understand this condition. If nested is non-NULL, it's because
we're using a shadow S2. So why the additional condition?

> +			struct kvm_s2_mmu *shadow_s2_mmu;
> +
> +			ipa &= ~(vma_pagesize - 1);
> +			shadow_s2_mmu = lookup_s2_mmu(vcpu);
> +			add_shadow_ipa_map_node(shadow_s2_mmu, ipa, fault_ipa, vma_pagesize);
> +		}
>  	}
>  
>  	/* Mark the page dirty only if the fault is handled successfully */
> @@ -1918,7 +1925,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>  			     (range->end - range->start) << PAGE_SHIFT,
>  			     range->may_block);
>  
> -	kvm_nested_s2_unmap(kvm);
> +	kvm_nested_s2_unmap_range(kvm, range);
>  	return false;
>  }
>  
> @@ -1953,7 +1960,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
>  			       PAGE_SIZE, __pfn_to_phys(pfn),
>  			       KVM_PGTABLE_PROT_R, NULL, 0);
>  
> -	kvm_nested_s2_unmap(kvm);
> +	kvm_nested_s2_unmap_range(kvm, range);
>  	return false;
>  }
>  
> @@ -2223,12 +2230,18 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
>  void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>  				   struct kvm_memory_slot *slot)
>  {
> +	struct kvm_gfn_range range;
> +
>  	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
>  	phys_addr_t size = slot->npages << PAGE_SHIFT;
>  
> +	range.start = gpa;
> +	range.end = gpa + size;
> +	range.may_block = true;
> +
>  	write_lock(&kvm->mmu_lock);
>  	kvm_unmap_stage2_range(&kvm->arch.mmu, gpa, size);
> -	kvm_nested_s2_unmap(kvm);
> +	kvm_nested_s2_unmap_range(kvm, &range);
>  	write_unlock(&kvm->mmu_lock);
>  }
>  
> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> index f88d9213c6b3..888ec9fba4a0 100644
> --- a/arch/arm64/kvm/nested.c
> +++ b/arch/arm64/kvm/nested.c
> @@ -565,6 +565,88 @@ void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid,
>  	write_unlock(&kvm->mmu_lock);
>  }
>  
> +/*
> + * Create a node and add to lookup table, when a page is mapped to
> + * Canonical IPA and also mapped to Shadow IPA.
> + */
> +void add_shadow_ipa_map_node(struct kvm_s2_mmu *mmu,
> +			phys_addr_t ipa,
> +			phys_addr_t shadow_ipa, long size)
> +{
> +	struct rb_root *ipa_root = &(mmu->nested_mapipa_root);
> +	struct rb_node **node = &(ipa_root->rb_node), *parent = NULL;
> +	struct mapipa_node *new;
> +
> +	new = kzalloc(sizeof(struct mapipa_node), GFP_KERNEL);
> +	if (!new)
> +		return;
> +
> +	new->shadow_ipa = shadow_ipa;
> +	new->ipa = ipa;
> +	new->size = size;
> +
> +	write_lock(&mmu->mmu_lock);
> +
> +	while (*node) {
> +		struct mapipa_node *tmp;
> +
> +		tmp = container_of(*node, struct mapipa_node, node);
> +		parent = *node;
> +		if (new->ipa < tmp->ipa) {
> +			node = &(*node)->rb_left;
> +		} else if (new->ipa > tmp->ipa) {
> +			node = &(*node)->rb_right;
> +		} else {
> +			write_unlock(&mmu->mmu_lock);
> +			kfree(new);
> +			return;
> +		}
> +	}
> +
> +	rb_link_node(&new->node, parent, node);
> +	rb_insert_color(&new->node, ipa_root);
> +	write_unlock(&mmu->mmu_lock);

All this should be removed in favour of simply using a maple tree.

> +}
> +
> +/*
> + * Iterate over the lookup table of Canonical IPA to Shadow IPA.
> + * Return Shadow IPA, if the page mapped to Canonical IPA is
> + * also mapped to a Shadow IPA.
> + *
> + */
> +bool get_shadow_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, phys_addr_t *shadow_ipa, long *size)

static?

> +{
> +	struct rb_node *node;
> +	struct mapipa_node *tmp = NULL;
> +
> +	read_lock(&mmu->mmu_lock);
> +	node = mmu->nested_mapipa_root.rb_node;
> +
> +	while (node) {
> +		tmp = container_of(node, struct mapipa_node, node);
> +
> +		if (tmp->ipa == ipa)

What guarantees that the mapping you have for L1 has the same starting
address as the one you have for L2? L1 could have a 2MB mapping and L2
only 4kB *in the middle*.

> +			break;
> +		else if (ipa > tmp->ipa)
> +			node = node->rb_right;
> +		else
> +			node = node->rb_left;
> +	}
> +
> +	read_unlock(&mmu->mmu_lock);

Why would you drop the lock here....

> +
> +	if (tmp && tmp->ipa == ipa) {
> +		*shadow_ipa = tmp->shadow_ipa;
> +		*size = tmp->size;
> +		write_lock(&mmu->mmu_lock);

... if taking it again here? What could have changed in between?

> +		rb_erase(&tmp->node, &mmu->nested_mapipa_root);
> +		write_unlock(&mmu->mmu_lock);
> +		kfree(tmp);
> +		return true;
> +	}
> +	return false;
> +}

So simply hitting in the reverse mapping structure *frees* it? Meaning
that you cannot use it as a way to update a mapping?

> +
>  /* Must be called with kvm->mmu_lock held */
>  struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu)
>  {
> @@ -674,6 +756,7 @@ void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
>  	mmu->tlb_vttbr = 1;
>  	mmu->nested_stage2_enabled = false;
>  	atomic_set(&mmu->refcnt, 0);
> +	mmu->nested_mapipa_root = RB_ROOT;
>  }
>  
>  void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
> @@ -760,6 +843,36 @@ void kvm_nested_s2_unmap(struct kvm *kvm)
>  	}
>  }
>  
> +void kvm_nested_s2_unmap_range(struct kvm *kvm, struct kvm_gfn_range *range)
> +{
> +	int i;
> +	long size;
> +	bool ret;
> +
> +	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
> +		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
> +
> +		if (kvm_s2_mmu_valid(mmu)) {
> +			phys_addr_t shadow_ipa, start, end;
> +
> +			start = range->start << PAGE_SHIFT;
> +			end = range->end << PAGE_SHIFT;
> +
> +			while (start < end) {
> +				size = PAGE_SIZE;
> +				/*
> +				 * get the Shadow IPA if the page is mapped
> +				 * to L1 and also mapped to any of active L2.
> +				 */

Why is L1 relevant here?

> +				ret = get_shadow_ipa(mmu, start, &shadow_ipa, &size);
> +				if (ret)
> +					kvm_unmap_stage2_range(mmu, shadow_ipa, size);
> +				start += size;
> +			}
> +		}
> +	}
> +}
> +
>  /* expects kvm->mmu_lock to be held */
>  void kvm_nested_s2_flush(struct kvm *kvm)
>  {

There are a bunch of worrying issues with this patch. But more
importantly, this looks like a waste of effort until the core issues
that NV still has are solved, and I will not consider anything of the
sort until then.

I get the ugly feeling that you are trying to make it look as if it
was "production ready", which it won't be for another few years,
specially if the few interested people (such as you) are ignoring the
core issues in favour of marketing driven features ("make it fast").

Thanks,

	M.
Ganapatrao Kulkarni March 5, 2024, 1:29 p.m. UTC | #4
On 05-03-2024 04:43 pm, Marc Zyngier wrote:
> [re-sending with kvmarm@ fixed]
> 
> On Tue, 05 Mar 2024 05:46:06 +0000,
> Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
>>
>> As per 'commit 178a6915434c ("KVM: arm64: nv: Unmap/flush shadow stage 2
> 
> $ git describe --contains 178a6915434c --match=v\*
> fatal: cannot describe '178a6915434c141edefd116b8da3d55555ea3e63'
> 

My bad (I should have been more verbose); I missed mentioning that this
patch is on top of the NV-V11 patch series.

> This commit simply doesn't exist upstream. It only lives in a
> now deprecated branch that will never be merged.
> 
>> page tables")', when ever there is unmap of pages that
>> are mapped to L1, they are invalidated from both L1 S2-MMU and from
>> all the active shadow/L2 S2-MMU tables. Since there is no mapping
>> to invalidate the IPAs of Shadow S2 to a page, there is a complete
>> S2-MMU page table walk and invalidation is done covering complete
>> address space allocated to a L2. This has performance impacts and
>> even soft lockup for NV(L1 and L2) boots with higher number of
>> CPUs and large Memory.
>>
>> Adding a lookup table of mapping of Shadow IPA to Canonical IPA
>> whenever a page is mapped to any of the L2. While any page is
>> unmaped, this lookup is helpful to unmap only if it is mapped in
>> any of the shadow S2-MMU tables. Hence avoids unnecessary long
>> iterations of S2-MMU table walk-through and invalidation for the
>> complete address space.
> 
> All of this falls in the "premature optimisation" bucket. Why should
> we bother with any of this when not even 'AT S1' works correctly,

Hmm, I am not aware of this; is this a new issue in V11?

> making it trivial to prevent a guest from making forward progress? You
> also show no numbers that would hint at a measurable improvement under
> any particular workload.

This patch avoids the long unmap iterations that were resulting in
soft lockups when trying L1 and L2 with 192 cores.
Isn't fixing the soft lockup a required fix for feature enablement?

> 
> I am genuinely puzzled that you are wasting valuable engineering time
> on *this*.
> 
>>
>> Signed-off-by: Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com>
>> ---
>>   arch/arm64/include/asm/kvm_emulate.h |   5 ++
>>   arch/arm64/include/asm/kvm_host.h    |  14 ++++
>>   arch/arm64/include/asm/kvm_nested.h  |   4 +
>>   arch/arm64/kvm/mmu.c                 |  19 ++++-
>>   arch/arm64/kvm/nested.c              | 113 +++++++++++++++++++++++++++
>>   5 files changed, 152 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
>> index 5173f8cf2904..f503b2eaedc4 100644
>> --- a/arch/arm64/include/asm/kvm_emulate.h
>> +++ b/arch/arm64/include/asm/kvm_emulate.h
>> @@ -656,4 +656,9 @@ static inline bool kvm_is_shadow_s2_fault(struct kvm_vcpu *vcpu)
>>   		vcpu->arch.hw_mmu->nested_stage2_enabled);
>>   }
>>   
>> +static inline bool kvm_is_l1_using_shadow_s2(struct kvm_vcpu *vcpu)
>> +{
>> +	return (vcpu->arch.hw_mmu != &vcpu->kvm->arch.mmu);
>> +}
> 
> Isn't that the very definition of "!in_hyp_ctxt()"? You are abusing

"!in_hyp_ctxt()" isn't true for non-NV case also?
This function was added to know that L1 is NV enabled and is using a 
shadow S2.

> the hw_mmu pointer to derive something, but the source of truth is the
> translation regime, as defined by HCR_EL2.{E2H,TGE} and PSTATE.M.
> 

OK, I can try HCR_EL2.{E2H,TGE} and PSTATE.M instead of hw_mmu in the 
next version.

>> +
>>   #endif /* __ARM64_KVM_EMULATE_H__ */
>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>> index 8da3c9a81ae3..f61c674c300a 100644
>> --- a/arch/arm64/include/asm/kvm_host.h
>> +++ b/arch/arm64/include/asm/kvm_host.h
>> @@ -144,6 +144,13 @@ struct kvm_vmid {
>>   	atomic64_t id;
>>   };
>>   
>> +struct mapipa_node {
>> +	struct rb_node node;
>> +	phys_addr_t ipa;
>> +	phys_addr_t shadow_ipa;
>> +	long size;
>> +};
>> +
>>   struct kvm_s2_mmu {
>>   	struct kvm_vmid vmid;
>>   
>> @@ -216,6 +223,13 @@ struct kvm_s2_mmu {
>>   	 * >0: Somebody is actively using this.
>>   	 */
>>   	atomic_t refcnt;
>> +
>> +	/*
>> +	 * For a Canonical IPA to Shadow IPA mapping.
>> +	 */
>> +	struct rb_root nested_mapipa_root;
> 
> Why isn't this a maple tree? If there is no overlap between mappings
> (and it really shouldn't be any), why should we use a bare-bone rb-tree?
> 
>> +	rwlock_t mmu_lock;
> 
> Hell no. We have plenty of locking already, and there is no reason why
> this should gain its own locking. I can't see a case where you would
> take this lock outside of holding the *real* mmu_lock -- extra bonus
> point for the ill-chosen name.

OK, this should be avoided with maple tree.
> 
>> +
>>   };
>>   
>>   static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu)
>> diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
>> index da7ebd2f6e24..c31a59a1fdc6 100644
>> --- a/arch/arm64/include/asm/kvm_nested.h
>> +++ b/arch/arm64/include/asm/kvm_nested.h
>> @@ -65,6 +65,9 @@ extern void kvm_init_nested(struct kvm *kvm);
>>   extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu);
>>   extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu);
>>   extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu);
>> +extern void add_shadow_ipa_map_node(
>> +		struct kvm_s2_mmu *mmu,
>> +		phys_addr_t ipa, phys_addr_t shadow_ipa, long size);
>>   
>>   union tlbi_info;
>>   
>> @@ -123,6 +126,7 @@ extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu,
>>   extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);
>>   extern void kvm_nested_s2_wp(struct kvm *kvm);
>>   extern void kvm_nested_s2_unmap(struct kvm *kvm);
>> +extern void kvm_nested_s2_unmap_range(struct kvm *kvm, struct kvm_gfn_range *range);
>>   extern void kvm_nested_s2_flush(struct kvm *kvm);
>>   int handle_wfx_nested(struct kvm_vcpu *vcpu, bool is_wfe);
>>   
>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>> index 61bdd8798f83..3948681426a0 100644
>> --- a/arch/arm64/kvm/mmu.c
>> +++ b/arch/arm64/kvm/mmu.c
>> @@ -1695,6 +1695,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>   					     memcache,
>>   					     KVM_PGTABLE_WALK_HANDLE_FAULT |
>>   					     KVM_PGTABLE_WALK_SHARED);
>> +		if ((nested || kvm_is_l1_using_shadow_s2(vcpu)) && !ret) {
> 
> I don't understand this condition. If nested is non-NULL, it's because
> we're using a shadow S2. So why the additional condition?

No, nested is set only for L2; for L1 it is not.
I have added this condition to handle the L1 shadow S2 case.

> 
>> +			struct kvm_s2_mmu *shadow_s2_mmu;
>> +
>> +			ipa &= ~(vma_pagesize - 1);
>> +			shadow_s2_mmu = lookup_s2_mmu(vcpu);
>> +			add_shadow_ipa_map_node(shadow_s2_mmu, ipa, fault_ipa, vma_pagesize);
>> +		}
>>   	}
>>   
>>   	/* Mark the page dirty only if the fault is handled successfully */
>> @@ -1918,7 +1925,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>>   			     (range->end - range->start) << PAGE_SHIFT,
>>   			     range->may_block);
>>   
>> -	kvm_nested_s2_unmap(kvm);
>> +	kvm_nested_s2_unmap_range(kvm, range);
>>   	return false;
>>   }
>>   
>> @@ -1953,7 +1960,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
>>   			       PAGE_SIZE, __pfn_to_phys(pfn),
>>   			       KVM_PGTABLE_PROT_R, NULL, 0);
>>   
>> -	kvm_nested_s2_unmap(kvm);
>> +	kvm_nested_s2_unmap_range(kvm, range);
>>   	return false;
>>   }
>>   
>> @@ -2223,12 +2230,18 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
>>   void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>>   				   struct kvm_memory_slot *slot)
>>   {
>> +	struct kvm_gfn_range range;
>> +
>>   	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
>>   	phys_addr_t size = slot->npages << PAGE_SHIFT;
>>   
>> +	range.start = gpa;
>> +	range.end = gpa + size;
>> +	range.may_block = true;
>> +
>>   	write_lock(&kvm->mmu_lock);
>>   	kvm_unmap_stage2_range(&kvm->arch.mmu, gpa, size);
>> -	kvm_nested_s2_unmap(kvm);
>> +	kvm_nested_s2_unmap_range(kvm, &range);
>>   	write_unlock(&kvm->mmu_lock);
>>   }
>>   
>> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
>> index f88d9213c6b3..888ec9fba4a0 100644
>> --- a/arch/arm64/kvm/nested.c
>> +++ b/arch/arm64/kvm/nested.c
>> @@ -565,6 +565,88 @@ void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid,
>>   	write_unlock(&kvm->mmu_lock);
>>   }
>>   
>> +/*
>> + * Create a node and add to lookup table, when a page is mapped to
>> + * Canonical IPA and also mapped to Shadow IPA.
>> + */
>> +void add_shadow_ipa_map_node(struct kvm_s2_mmu *mmu,
>> +			phys_addr_t ipa,
>> +			phys_addr_t shadow_ipa, long size)
>> +{
>> +	struct rb_root *ipa_root = &(mmu->nested_mapipa_root);
>> +	struct rb_node **node = &(ipa_root->rb_node), *parent = NULL;
>> +	struct mapipa_node *new;
>> +
>> +	new = kzalloc(sizeof(struct mapipa_node), GFP_KERNEL);
>> +	if (!new)
>> +		return;
>> +
>> +	new->shadow_ipa = shadow_ipa;
>> +	new->ipa = ipa;
>> +	new->size = size;
>> +
>> +	write_lock(&mmu->mmu_lock);
>> +
>> +	while (*node) {
>> +		struct mapipa_node *tmp;
>> +
>> +		tmp = container_of(*node, struct mapipa_node, node);
>> +		parent = *node;
>> +		if (new->ipa < tmp->ipa) {
>> +			node = &(*node)->rb_left;
>> +		} else if (new->ipa > tmp->ipa) {
>> +			node = &(*node)->rb_right;
>> +		} else {
>> +			write_unlock(&mmu->mmu_lock);
>> +			kfree(new);
>> +			return;
>> +		}
>> +	}
>> +
>> +	rb_link_node(&new->node, parent, node);
>> +	rb_insert_color(&new->node, ipa_root);
>> +	write_unlock(&mmu->mmu_lock);
> 
> All this should be removed in favour of simply using a maple tree.
> 

Thanks for the suggestion to use a maple tree. I will use it in the 
next version, which helps to avoid the extra lock.

>> +}
>> +
>> +/*
>> + * Iterate over the lookup table of Canonical IPA to Shadow IPA.
>> + * Return Shadow IPA, if the page mapped to Canonical IPA is
>> + * also mapped to a Shadow IPA.
>> + *
>> + */
>> +bool get_shadow_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, phys_addr_t *shadow_ipa, long *size)
> 
> static?

It should be, thanks.
> 
>> +{
>> +	struct rb_node *node;
>> +	struct mapipa_node *tmp = NULL;
>> +
>> +	read_lock(&mmu->mmu_lock);
>> +	node = mmu->nested_mapipa_root.rb_node;
>> +
>> +	while (node) {
>> +		tmp = container_of(node, struct mapipa_node, node);
>> +
>> +		if (tmp->ipa == ipa)
> 
> What guarantees that the mapping you have for L1 has the same starting
> address as the one you have for L2? L1 could have a 2MB mapping and L2
> only 4kB *in the middle*.

IIUC, when a page is mapped as 2MB in L1, it won't be mapped to L2; we 
iterate in steps of PAGE_SIZE, so we should hit the L2's IPA in the 
lookup table, provided the L2 page falls in the unmap range.

> 
>> +			break;
>> +		else if (ipa > tmp->ipa)
>> +			node = node->rb_right;
>> +		else
>> +			node = node->rb_left;
>> +	}
>> +
>> +	read_unlock(&mmu->mmu_lock);
> 
> Why would you drop the lock here....
> 
>> +
>> +	if (tmp && tmp->ipa == ipa) {
>> +		*shadow_ipa = tmp->shadow_ipa;
>> +		*size = tmp->size;
>> +		write_lock(&mmu->mmu_lock);
> 
> ... if taking it again here? What could have changed in between?
> 
>> +		rb_erase(&tmp->node, &mmu->nested_mapipa_root);
>> +		write_unlock(&mmu->mmu_lock);
>> +		kfree(tmp);
>> +		return true;
>> +	}
>> +	return false;
>> +}
> 
> So simply hitting in the reverse mapping structure *frees* it? Meaning
> that you cannot use it as a way to update a mapping?

I am freeing it since this page is already unmapped/migrated on the 
host, and the same will be done on the shadow S2 after this lookup. I 
should have considered other cases as well, as Oliver mentioned.

> 
>> +
>>   /* Must be called with kvm->mmu_lock held */
>>   struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu)
>>   {
>> @@ -674,6 +756,7 @@ void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
>>   	mmu->tlb_vttbr = 1;
>>   	mmu->nested_stage2_enabled = false;
>>   	atomic_set(&mmu->refcnt, 0);
>> +	mmu->nested_mapipa_root = RB_ROOT;
>>   }
>>   
>>   void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
>> @@ -760,6 +843,36 @@ void kvm_nested_s2_unmap(struct kvm *kvm)
>>   	}
>>   }
>>   
>> +void kvm_nested_s2_unmap_range(struct kvm *kvm, struct kvm_gfn_range *range)
>> +{
>> +	int i;
>> +	long size;
>> +	bool ret;
>> +
>> +	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
>> +		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
>> +
>> +		if (kvm_s2_mmu_valid(mmu)) {
>> +			phys_addr_t shadow_ipa, start, end;
>> +
>> +			start = range->start << PAGE_SHIFT;
>> +			end = range->end << PAGE_SHIFT;
>> +
>> +			while (start < end) {
>> +				size = PAGE_SIZE;
>> +				/*
>> +				 * get the Shadow IPA if the page is mapped
>> +				 * to L1 and also mapped to any of active L2.
>> +				 */
> 
> Why is L1 relevant here?

We do map into the shadow S2 while L1 boots (early stage); if at that 
moment an L1-mapped page is unmapped/migrated, we need to unmap it from 
L1's S2 table as well.

> 
>> +				ret = get_shadow_ipa(mmu, start, &shadow_ipa, &size);
>> +				if (ret)
>> +					kvm_unmap_stage2_range(mmu, shadow_ipa, size);
>> +				start += size;
>> +			}
>> +		}
>> +	}
>> +}
>> +
>>   /* expects kvm->mmu_lock to be held */
>>   void kvm_nested_s2_flush(struct kvm *kvm)
>>   {
> 
> There are a bunch of worrying issues with this patch. But more
> importantly, this looks like a waste of effort until the core issues
> that NV still has are solved, and I will not consider anything of the
> sort until then.

OK, thanks for letting us know; I will pause the work on V2 of this 
patch until then.

> 
> I get the ugly feeling that you are trying to make it look as if it
> was "production ready", which it won't be for another few years,
> specially if the few interested people (such as you) are ignoring the
> core issues in favour of marketing driven features ("make it fast").
> 

What are the core issues (please forgive me if you have mentioned them 
already)? We will certainly prioritise them over this.

> Thanks,
> 
> 	M.
> 

Thanks,
Ganapat
Marc Zyngier March 5, 2024, 3:03 p.m. UTC | #5
On Tue, 05 Mar 2024 13:29:08 +0000,
Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
> 
> 
> 
> On 05-03-2024 04:43 pm, Marc Zyngier wrote:
> > [re-sending with kvmarm@ fixed]
> > 
> > On Tue, 05 Mar 2024 05:46:06 +0000,
> > Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
> >> 
> >> As per 'commit 178a6915434c ("KVM: arm64: nv: Unmap/flush shadow stage 2
> > 
> > $ git describe --contains 178a6915434c --match=v\*
> > fatal: cannot describe '178a6915434c141edefd116b8da3d55555ea3e63'
> > 
> 
> My bad(I would have been more verbose), I missed to mention that this
> patch is on top of NV-V11 patch series.
> 
> > This commit simply doesn't exist upstream. It only lives in a
> > now deprecated branch that will never be merged.
> > 
> >> page tables")', when ever there is unmap of pages that
> >> are mapped to L1, they are invalidated from both L1 S2-MMU and from
> >> all the active shadow/L2 S2-MMU tables. Since there is no mapping
> >> to invalidate the IPAs of Shadow S2 to a page, there is a complete
> >> S2-MMU page table walk and invalidation is done covering complete
> >> address space allocated to a L2. This has performance impacts and
> >> even soft lockup for NV(L1 and L2) boots with higher number of
> >> CPUs and large Memory.
> >> 
> >> Adding a lookup table of mapping of Shadow IPA to Canonical IPA
> >> whenever a page is mapped to any of the L2. While any page is
> >> unmaped, this lookup is helpful to unmap only if it is mapped in
> >> any of the shadow S2-MMU tables. Hence avoids unnecessary long
> >> iterations of S2-MMU table walk-through and invalidation for the
> >> complete address space.
> > 
> > All of this falls in the "premature optimisation" bucket. Why should
> > we bother with any of this when not even 'AT S1' works correctly,
> 
> Hmm, I am not aware of this, is this something new issue of V11?

it's been there since v0. All we have is a trivial implementation that
doesn't survive the S1 page-tables being swapped out. It requires a
full S1 PTW to be written.

> 
> > making it trivial to prevent a guest from making forward progress? You
> > also show no numbers that would hint at a measurable improvement under
> > any particular workload.
> 
> This patch is avoiding long iterations of unmap which was resulting in
> soft-lockup, when tried L1 and L2 with 192 cores.
> Fixing soft lockup isn't a required fix for feature enablement?

No. All we care about is correctness, not performance. Addressing
soft-lockups is *definitely* a performance issue, which I'm 100% happy
to ignore.

[...]

> >>   +static inline bool kvm_is_l1_using_shadow_s2(struct kvm_vcpu
> >> *vcpu)
> >> +{
> >> +	return (vcpu->arch.hw_mmu != &vcpu->kvm->arch.mmu);
> >> +}
> > 
> > Isn't that the very definition of "!in_hyp_ctxt()"? You are abusing
> 
> "!in_hyp_ctxt()" isn't true for non-NV case also?

Surely you don't try to use this in non-NV contexts, right? Why would
you try to populate a shadow reverse-map outside of a NV context?

> This function added to know that L1 is NV enabled and using shadow S2.
> 
> > the hw_mmu pointer to derive something, but the source of truth is the
> > translation regime, as defined by HCR_EL2.{E2H,TGE} and PSTATE.M.
> > 
> 
> OK, I can try HCR_EL2.{E2H,TGE} and PSTATE.M instead of hw_mmu in next
> version.

No. Use is_hyp_ctxt().

[...]

> >> index 61bdd8798f83..3948681426a0 100644
> >> --- a/arch/arm64/kvm/mmu.c
> >> +++ b/arch/arm64/kvm/mmu.c
> >> @@ -1695,6 +1695,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>   					     memcache,
> >>   					     KVM_PGTABLE_WALK_HANDLE_FAULT |
> >>   					     KVM_PGTABLE_WALK_SHARED);
> >> +		if ((nested || kvm_is_l1_using_shadow_s2(vcpu)) && !ret) {
> > 
> > I don't understand this condition. If nested is non-NULL, it's because
> > we're using a shadow S2. So why the additional condition?
> 
> No, nested is set only for L2, for L1 it is not.
> To handle L1 shadow S2 case, I have added this condition.

But there is *no shadow* for L1 at all. The only way to get a shadow
is to be outside of the EL2(&0) translation regime. EL2(&0) itself is
always backed by the canonical S2. By definition, L1 does not run with
a S2 it is in control of. No S2, no shadow.

[...]

> > What guarantees that the mapping you have for L1 has the same starting
> > address as the one you have for L2? L1 could have a 2MB mapping and L2
> > only 4kB *in the middle*.
> 
> IIUC, when a page is mapped to 2MB in L1, it won't be
> mapped to L2 and we iterate with the step of PAGE_SIZE and we should
> be hitting the L2's IPA in lookup table, provided the L2 page falls in
> unmap range.

But then how do you handle the reverse (4kB at L1, 2MB at L2)? Without
tracking of the *intersection*, this fails to be correctly detected.
This is TLB matching 101.
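Whatever data structure ends up being used, the lookup has to be by
intersection; as a trivial illustration:

/* True if [a, a + a_size) and [b, b + b_size) overlap at all. */
static bool ipa_ranges_intersect(phys_addr_t a, u64 a_size,
				 phys_addr_t b, u64 b_size)
{
	return a < (b + b_size) && b < (a + a_size);
}

A range-keyed structure (such as the maple tree mentioned earlier)
gives you that behaviour on lookup without open-coding it.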

[...]

> >> +			while (start < end) {
> >> +				size = PAGE_SIZE;
> >> +				/*
> >> +				 * get the Shadow IPA if the page is mapped
> >> +				 * to L1 and also mapped to any of active L2.
> >> +				 */
> > 
> > Why is L1 relevant here?
> 
> We do map while L1 boots(early stage) in shadow S2, at that moment
> if the L1 mapped page is unmapped/migrated we do need to unmap from
> L1's S2 table also.

Sure. But you can also get a page that is mapped in L2 and not mapped
in the canonical S2, which is L1's. I more and more feel that you have
a certain misconception of how L1 gets its pages mapped.

> 
> > 
> >> +				ret = get_shadow_ipa(mmu, start, &shadow_ipa, &size);
> >> +				if (ret)
> >> +					kvm_unmap_stage2_range(mmu, shadow_ipa, size);
> >> +				start += size;
> >> +			}
> >> +		}
> >> +	}
> >> +}
> >> +
> >>   /* expects kvm->mmu_lock to be held */
> >>   void kvm_nested_s2_flush(struct kvm *kvm)
> >>   {
> > 
> > There are a bunch of worrying issues with this patch. But more
> > importantly, this looks like a waste of effort until the core issues
> > that NV still has are solved, and I will not consider anything of the
> > sort until then.
> 
> OK thanks for letting us know, I will pause the work on V2 of this
> patch until then.
> 
> > 
> > I get the ugly feeling that you are trying to make it look as if it
> > was "production ready", which it won't be for another few years,
> > specially if the few interested people (such as you) are ignoring the
> > core issues in favour of marketing driven features ("make it fast").
> > 
> 
> What are the core issues (please forgive me if you mentioned already)?
> certainly we will prioritise them than this.

AT is a big one. Maintenance interrupts are more or less broken. I'm
slowly plugging PAuth, but there's no testing whatsoever (running
Linux doesn't count). Lack of SVE support is also definitely a
blocker.

Thanks,

	M.
Ganapatrao Kulkarni March 5, 2024, 6:33 p.m. UTC | #6
On 05-03-2024 08:33 pm, Marc Zyngier wrote:
> On Tue, 05 Mar 2024 13:29:08 +0000,
> Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
>>
>>
>>
>> On 05-03-2024 04:43 pm, Marc Zyngier wrote:
>>> [re-sending with kvmarm@ fixed]
>>>
>>> On Tue, 05 Mar 2024 05:46:06 +0000,
>>> Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
>>>>
>>>> As per 'commit 178a6915434c ("KVM: arm64: nv: Unmap/flush shadow stage 2
>>>
>>> $ git describe --contains 178a6915434c --match=v\*
>>> fatal: cannot describe '178a6915434c141edefd116b8da3d55555ea3e63'
>>>
>>
>> My bad(I would have been more verbose), I missed to mention that this
>> patch is on top of NV-V11 patch series.
>>
>>> This commit simply doesn't exist upstream. It only lives in a
>>> now deprecated branch that will never be merged.
>>>
>>>> page tables")', when ever there is unmap of pages that
>>>> are mapped to L1, they are invalidated from both L1 S2-MMU and from
>>>> all the active shadow/L2 S2-MMU tables. Since there is no mapping
>>>> to invalidate the IPAs of Shadow S2 to a page, there is a complete
>>>> S2-MMU page table walk and invalidation is done covering complete
>>>> address space allocated to a L2. This has performance impacts and
>>>> even soft lockup for NV(L1 and L2) boots with higher number of
>>>> CPUs and large Memory.
>>>>
>>>> Adding a lookup table of mapping of Shadow IPA to Canonical IPA
>>>> whenever a page is mapped to any of the L2. While any page is
>>>> unmaped, this lookup is helpful to unmap only if it is mapped in
>>>> any of the shadow S2-MMU tables. Hence avoids unnecessary long
>>>> iterations of S2-MMU table walk-through and invalidation for the
>>>> complete address space.
>>>
>>> All of this falls in the "premature optimisation" bucket. Why should
>>> we bother with any of this when not even 'AT S1' works correctly,
>>
>> Hmm, I am not aware of this, is this something new issue of V11?
> 
> it's been there since v0. All we have is a trivial implementation that
> doesn't survive the S1 page-tables being swapped out. It requires a
> full S1 PTW to be written.
> 
>>
>>> making it trivial to prevent a guest from making forward progress? You
>>> also show no numbers that would hint at a measurable improvement under
>>> any particular workload.
>>
>> This patch is avoiding long iterations of unmap which was resulting in
>> soft-lockup, when tried L1 and L2 with 192 cores.
>> Fixing soft lockup isn't a required fix for feature enablement?
> 
> No. All we care is correctness, not performance. Addressing
> soft-lockups is *definitely* a performance issue, which I'm 100% happy
> to ignore.
> 
> [...]
> 
>>>>    +static inline bool kvm_is_l1_using_shadow_s2(struct kvm_vcpu
>>>> *vcpu)
>>>> +{
>>>> +	return (vcpu->arch.hw_mmu != &vcpu->kvm->arch.mmu);
>>>> +}
>>>
>>> Isn't that the very definition of "!in_hyp_ctxt()"? You are abusing
>>
>> "!in_hyp_ctxt()" isn't true for non-NV case also?
> 
> Surely you don't try to use this in non-NV contexts, right? Why would
> you try to populate a shadow reverse-map outside of a NV context?
> 
>> This function added to know that L1 is NV enabled and using shadow S2.
>>
>>> the hw_mmu pointer to derive something, but the source of truth is the
>>> translation regime, as defined by HCR_EL2.{E2H,TGE} and PSTATE.M.
>>>
>>
>> OK, I can try HCR_EL2.{E2H,TGE} and PSTATE.M instead of hw_mmu in next
>> version.
> 
> No. Use is_hyp_ctxt().
> 
> [...]
> 
>>>> index 61bdd8798f83..3948681426a0 100644
>>>> --- a/arch/arm64/kvm/mmu.c
>>>> +++ b/arch/arm64/kvm/mmu.c
>>>> @@ -1695,6 +1695,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>    					     memcache,
>>>>    					     KVM_PGTABLE_WALK_HANDLE_FAULT |
>>>>    					     KVM_PGTABLE_WALK_SHARED);
>>>> +		if ((nested || kvm_is_l1_using_shadow_s2(vcpu)) && !ret) {
>>>
>>> I don't understand this condition. If nested is non-NULL, it's because
>>> we're using a shadow S2. So why the additional condition?
>>
>> No, nested is set only for L2, for L1 it is not.
>> To handle L1 shadow S2 case, I have added this condition.
> 
> But there is *no shadow* for L1 at all. The only way to get a shadow
> is to be outside of the EL2(&0) translation regime. El2(&0) itself is
> always backed by the canonical S2. By definition, L1 does not run with
> a S2 it is in control of. No S2, no shadow.

By "shadow" I mean nested_mmus[0], which is used (as the first consumer 
of the S2-MMU array) while L1 is booting, until it switches to NV2.
As per my tracing, nested_mmus[0] is used for L1 after the first ERET 
trap while L1 is booting, and L1 switches back to the canonical S2 when 
it is moved to NV2.

In this window, if the pages are unmapped, we need to unmap them from 
the nested_mmus[0] table as well.

> 
> [...]
> 
>>> What guarantees that the mapping you have for L1 has the same starting
>>> address as the one you have for L2? L1 could have a 2MB mapping and L2
>>> only 4kB *in the middle*.
>>
>> IIUC, when a page is mapped to 2MB in L1, it won't be
>> mapped to L2 and we iterate with the step of PAGE_SIZE and we should
>> be hitting the L2's IPA in lookup table, provided the L2 page falls in
>> unmap range.
> 
> But then how do you handle the reverse (4kB at L1, 2MB at L2)? Without
> tracking of the *intersection*, this fails to be correctly detected.
> This is TLB matching 101.
> 
> [...]
> 
>>>> +			while (start < end) {
>>>> +				size = PAGE_SIZE;
>>>> +				/*
>>>> +				 * get the Shadow IPA if the page is mapped
>>>> +				 * to L1 and also mapped to any of active L2.
>>>> +				 */
>>>
>>> Why is L1 relevant here?
>>
>> We do map while L1 boots(early stage) in shadow S2, at that moment
>> if the L1 mapped page is unmapped/migrated we do need to unmap from
>> L1's S2 table also.
> 
> Sure. But you can also get a page that is mapped in L2 and not mapped
> in the canonical S2, which is L1's. I more and more feel that you have
> a certain misconception of how L1 gets its pages mapped.
> 
>>
>>>
>>>> +				ret = get_shadow_ipa(mmu, start, &shadow_ipa, &size);
>>>> +				if (ret)
>>>> +					kvm_unmap_stage2_range(mmu, shadow_ipa, size);
>>>> +				start += size;
>>>> +			}
>>>> +		}
>>>> +	}
>>>> +}
>>>> +
>>>>    /* expects kvm->mmu_lock to be held */
>>>>    void kvm_nested_s2_flush(struct kvm *kvm)
>>>>    {
>>>
>>> There are a bunch of worrying issues with this patch. But more
>>> importantly, this looks like a waste of effort until the core issues
>>> that NV still has are solved, and I will not consider anything of the
>>> sort until then.
>>
>> OK thanks for letting us know, I will pause the work on V2 of this
>> patch until then.
>>
>>>
>>> I get the ugly feeling that you are trying to make it look as if it
>>> was "production ready", which it won't be for another few years,
>>> specially if the few interested people (such as you) are ignoring the
>>> core issues in favour of marketing driven features ("make it fast").
>>>
>>
>> What are the core issues (please forgive me if you mentioned already)?
>> certainly we will prioritise them than this.
> 
> AT is a big one. Maintenance interrupts are more or less broken. I'm
> slowly plugging PAuth, but there's no testing whatsoever (running
> Linux doesn't count). Lack of SVE support is also definitely a
> blocker.
> 
> Thanks,
> 
> 	M.
> 
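
Coming back to the 2MB vs 4kB intersection point above: below is a 
minimal, untested sketch of how the reverse lookup could clip the 
unmapped canonical range against each recorded mapping, instead of 
matching only on the start address. It reuses struct mapipa_node from 
this patch and the kernel's min()/max(); the helper name is made up and 
is not part of the series.

static bool mapipa_overlaps(struct mapipa_node *m, phys_addr_t start,
			    phys_addr_t end, phys_addr_t *shadow_ipa,
			    long *size)
{
	phys_addr_t m_end = m->ipa + m->size;
	phys_addr_t lo, hi;

	/* No intersection between [start, end) and [m->ipa, m_end)? */
	if (end <= m->ipa || start >= m_end)
		return false;

	/* Clip to the overlapping part of the two ranges. */
	lo = max(start, m->ipa);
	hi = min(end, m_end);

	/* The same offset applies in the shadow IPA space. */
	*shadow_ipa = m->shadow_ipa + (lo - m->ipa);
	*size = hi - lo;
	return true;
}

The caller would then unmap only the clipped [*shadow_ipa, *shadow_ipa + 
*size) range for every overlapping entry, rather than assuming a single 
PAGE_SIZE match.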

Thanks,
Ganapat
Ganapatrao Kulkarni March 6, 2024, 5:31 a.m. UTC | #7
On 05-03-2024 02:16 pm, Oliver Upton wrote:
> -cc old kvmarm list
> +cc new kvmarm list, reviewers
> 
> Please run scripts/get_maintainer.pl next time around so we get the
> right people looking at a patch.
> 

Of course I know this script -:)
I didn't cc them since I wanted to avoid unnecessarily overloading 
anyone's inbox. I don't think anyone (even ARM) is interested in this 
feature other than Marc and me/Ampere. Otherwise this would have been 
merged upstream by now.
BTW, NV feature development started way back in 2016/17.

> On Mon, Mar 04, 2024 at 09:46:06PM -0800, Ganapatrao Kulkarni wrote:
>> @@ -216,6 +223,13 @@ struct kvm_s2_mmu {
>>   	 * >0: Somebody is actively using this.
>>   	 */
>>   	atomic_t refcnt;
>> +
>> +	/*
>> +	 * For a Canonical IPA to Shadow IPA mapping.
>> +	 */
>> +	struct rb_root nested_mapipa_root;
> 
> There isn't any benefit to tracking the canonical IPA -> shadow IPA(s)
> mapping on a per-S2 basis, as there already exists a one-to-many problem
> (more below). Maintaining a per-VM data structure (since this is keyed
> by canonical IPA) makes a bit more sense.
> 
>> +	rwlock_t mmu_lock;
>> +
> 
> Err, is there any reason the existing mmu_lock is insufficient here?
> Surely taking a new reference on a canonical IPA for a shadow S2 must be
> done behind the MMU lock for it to be safe against MMU notifiers...
> 
> Also, Reusing the exact same name for it is sure to produce some lock
> imbalance funnies.
> 
>>   };
>>   
>>   static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu)
>> diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
>> index da7ebd2f6e24..c31a59a1fdc6 100644
>> --- a/arch/arm64/include/asm/kvm_nested.h
>> +++ b/arch/arm64/include/asm/kvm_nested.h
>> @@ -65,6 +65,9 @@ extern void kvm_init_nested(struct kvm *kvm);
>>   extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu);
>>   extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu);
>>   extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu);
>> +extern void add_shadow_ipa_map_node(
>> +		struct kvm_s2_mmu *mmu,
>> +		phys_addr_t ipa, phys_addr_t shadow_ipa, long size);
> 
> style nitpick: no newline between the open bracket and first parameter.
> Wrap as needed at 80 (or a bit more) columns.
> 
>> +/*
>> + * Create a node and add to lookup table, when a page is mapped to
>> + * Canonical IPA and also mapped to Shadow IPA.
>> + */
>> +void add_shadow_ipa_map_node(struct kvm_s2_mmu *mmu,
>> +			phys_addr_t ipa,
>> +			phys_addr_t shadow_ipa, long size)
>> +{
>> +	struct rb_root *ipa_root = &(mmu->nested_mapipa_root);
>> +	struct rb_node **node = &(ipa_root->rb_node), *parent = NULL;
>> +	struct mapipa_node *new;
>> +
>> +	new = kzalloc(sizeof(struct mapipa_node), GFP_KERNEL);
>> +	if (!new)
>> +		return;
> 
> Should be GFP_KERNEL_ACCOUNT, you want to charge this to the user.
> 
>> +
>> +	new->shadow_ipa = shadow_ipa;
>> +	new->ipa = ipa;
>> +	new->size = size;
> 
> What about aliasing? You could have multiple shadow IPAs that point to
> the same canonical IPA, even within a single MMU.
> 
>> +	write_lock(&mmu->mmu_lock);
>> +
>> +	while (*node) {
>> +		struct mapipa_node *tmp;
>> +
>> +		tmp = container_of(*node, struct mapipa_node, node);
>> +		parent = *node;
>> +		if (new->ipa < tmp->ipa) {
>> +			node = &(*node)->rb_left;
>> +		} else if (new->ipa > tmp->ipa) {
>> +			node = &(*node)->rb_right;
>> +		} else {
>> +			write_unlock(&mmu->mmu_lock);
>> +			kfree(new);
>> +			return;
>> +		}
>> +	}
>> +
>> +	rb_link_node(&new->node, parent, node);
>> +	rb_insert_color(&new->node, ipa_root);
>> +	write_unlock(&mmu->mmu_lock);
> 
> Meh, one of the annoying things with rbtree is you have to build your
> own search functions...
> 
> It would appear that the rbtree intends to express intervals (i.e. GPA +
> size), but the search implementation treats GPA as an index. So I don't
> think this works as intended.
> 
> Have you considered other abstract data types (e.g. xarray, maple tree)
> and how they might apply here?
> 

Thanks for suggesting the maple-tree-based lookup; I will try it in the 
next version.
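
As a very rough, untested sketch of that direction (everything below is 
hypothetical and not part of this patch: the per-VM 
kvm->arch.nested_mapipa_mt maple tree, struct shadow_ipa_map and the two 
helpers), the tree is keyed by canonical IPA range so overlaps are found 
directly, and each entry keeps a small list so that several shadow IPAs 
aliasing the same canonical IPA can be recorded:

#include <linux/list.h>
#include <linux/maple_tree.h>
#include <linux/slab.h>

/* One shadow mapping of a canonical IPA range; aliases are chained. */
struct shadow_ipa_map {
	struct list_head list;
	struct kvm_s2_mmu *mmu;
	phys_addr_t shadow_ipa;
	long size;
};

/*
 * Record [ipa, ipa + size) -> shadow_ipa for one shadow MMU. The tree
 * would be mt_init()'d at VM creation; locking and allocation context
 * are deliberately glossed over in this sketch.
 */
static int record_shadow_ipa(struct kvm *kvm, struct kvm_s2_mmu *mmu,
			     phys_addr_t ipa, phys_addr_t shadow_ipa,
			     long size)
{
	struct maple_tree *mt = &kvm->arch.nested_mapipa_mt;
	struct shadow_ipa_map *map, *head;

	map = kzalloc(sizeof(*map), GFP_KERNEL_ACCOUNT);
	if (!map)
		return -ENOMEM;

	map->mmu = mmu;
	map->shadow_ipa = shadow_ipa;
	map->size = size;
	INIT_LIST_HEAD(&map->list);

	/* Chain an alias if this canonical IPA is already recorded. */
	head = mtree_load(mt, ipa);
	if (head) {
		list_add(&map->list, &head->list);
		return 0;
	}

	return mtree_store_range(mt, ipa, ipa + size - 1, map,
				 GFP_KERNEL_ACCOUNT);
}

/*
 * Unmap every recorded shadow range overlapping [start, end). The
 * lookup itself does not erase entries, so the same table could also
 * back write-protection or access-flag updates.
 */
static void unmap_shadow_overlaps(struct kvm *kvm, phys_addr_t start,
				  phys_addr_t end)
{
	struct maple_tree *mt = &kvm->arch.nested_mapipa_mt;
	struct shadow_ipa_map *map, *alias;
	unsigned long index = start;

	mt_for_each(mt, map, index, end - 1) {
		kvm_unmap_stage2_range(map->mmu, map->shadow_ipa, map->size);
		list_for_each_entry(alias, &map->list, list)
			kvm_unmap_stage2_range(alias->mmu, alias->shadow_ipa,
					       alias->size);
	}
}

kvm_unmap_gfn_range() would then call something like 
unmap_shadow_overlaps() with the gfn range shifted to IPAs, and entries 
would only be erased (mtree_erase()) on an actual unmap.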

>> +bool get_shadow_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, phys_addr_t *shadow_ipa, long *size)
>> +{
>> +	struct rb_node *node;
>> +	struct mapipa_node *tmp = NULL;
>> +
>> +	read_lock(&mmu->mmu_lock);
>> +	node = mmu->nested_mapipa_root.rb_node;
>> +
>> +	while (node) {
>> +		tmp = container_of(node, struct mapipa_node, node);
>> +
>> +		if (tmp->ipa == ipa)
>> +			break;
>> +		else if (ipa > tmp->ipa)
>> +			node = node->rb_right;
>> +		else
>> +			node = node->rb_left;
>> +	}
>> +
>> +	read_unlock(&mmu->mmu_lock);
>> +
>> +	if (tmp && tmp->ipa == ipa) {
>> +		*shadow_ipa = tmp->shadow_ipa;
>> +		*size = tmp->size;
>> +		write_lock(&mmu->mmu_lock);
>> +		rb_erase(&tmp->node, &mmu->nested_mapipa_root);
>> +		write_unlock(&mmu->mmu_lock);
>> +		kfree(tmp);
>> +		return true;
>> +	}
> 
> Implicitly evicting the entry isn't going to work if we want to use it
> for updates to a stage-2 that do not evict the mapping, like write
> protection or access flag updates.
> 

Thanks,
Ganapat
Oliver Upton March 6, 2024, 8:39 a.m. UTC | #8
On Wed, Mar 06, 2024 at 11:01:09AM +0530, Ganapatrao Kulkarni wrote:
> Of course I know this script -:)
> I didn't cc since I felt to avoid unnecessary overloading someone's inbox.

People signed up as reviewers for a subsystem know what they've gotten
themselves into... or at least realize it after the fact with the pile
of patches in their inbox.

> I don't think anyone(even ARM) is interested in this feature other than Marc
> and me/Ampere.

... and based on this _presumption_ it's OK to leave folks in the dark?

> Otherwise this would have merged upstream by now.

Stuff lands upstream when it is ready, no sooner.
Marc Zyngier March 6, 2024, 10:23 a.m. UTC | #9
On Tue, 05 Mar 2024 18:33:27 +0000,
Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
> 
> >>>> index 61bdd8798f83..3948681426a0 100644
> >>>> --- a/arch/arm64/kvm/mmu.c
> >>>> +++ b/arch/arm64/kvm/mmu.c
> >>>> @@ -1695,6 +1695,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>>>    					     memcache,
> >>>>    					     KVM_PGTABLE_WALK_HANDLE_FAULT |
> >>>>    					     KVM_PGTABLE_WALK_SHARED);
> >>>> +		if ((nested || kvm_is_l1_using_shadow_s2(vcpu)) && !ret) {
> >>> 
> >>> I don't understand this condition. If nested is non-NULL, it's because
> >>> we're using a shadow S2. So why the additional condition?
> >> 
> >> No, nested is set only for L2, for L1 it is not.
> >> To handle L1 shadow S2 case, I have added this condition.
> > 
> > But there is *no shadow* for L1 at all. The only way to get a shadow
> > is to be outside of the EL2(&0) translation regime. EL2(&0) itself is
> > always backed by the canonical S2. By definition, L1 does not run with
> > a S2 it is in control of. No S2, no shadow.
> 
> Shadow, I mean nested_mmus[0] which is used(first consumer of the
> S2-MMU array) while L1 booting till it switches to NV2.

Please fix your terminology:

- if someone is using *any* of the nested_mmus[], then it is an L2. It
  may come from the same guest binary, but it doesn't change that it
  has changed translation regime to EL1&0. So by definition, it is an
  L2. Yes, booting a Linux guest at EL2 involve both an L1 (the EL2
  part) *and* an L2 (the EL1 part).

- I don't understand 'till it switches to NV2'. Do you mean EL2?

> As per my tracing, the nested_mmus[0] is used for L1 after first ERET
> trap while L1 is booting and switches back to canonical S2, when it is
> moved to NV2.
> 
> In this window, if the pages are unmapped, we need to unmap from the
> nested_mmus[0] table.

Well, we need to unmap things from all shadow PTs that target the same
PA. Index 0 isn't special.

Thanks,

	M.
Marc Zyngier March 6, 2024, 1:33 p.m. UTC | #10
On Wed, 06 Mar 2024 05:31:09 +0000,
Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
> 
> 
> 
> On 05-03-2024 02:16 pm, Oliver Upton wrote:
> > -cc old kvmarm list
> > +cc new kvmarm list, reviewers
> > 
> > Please run scripts/get_maintainer.pl next time around so we get the
> > right people looking at a patch.
> > 
> 
> Of course I know this script -:)
> I didn't cc since I felt to avoid unnecessary overloading someone's
> inbox. I don't think anyone(even ARM) is interested in this feature
> other than Marc and me/Ampere. Otherwise this would have merged
> upstream by now.

You're wrong on a lot of this. There is quite a significant interest,
actually. But in the fine corporate tradition, not even my employer
can be bothered to actively contribute. These people are waiting for
things to land. I'm happy to make them wait even longer (I'm paying
for my own HW, so I don't owe anyone anything).

The architecture also wasn't in a good enough state for the support to
be merged until very recently, and it still is broken in a lot of
respects.

> BTW, NV feature development started way back in 2016/17.

And? Cc'ing individuals and lists is basic courtesy, I'm afraid.
Please don't leave people in the dark.

Thanks,

	M.
Ganapatrao Kulkarni March 6, 2024, 2:57 p.m. UTC | #11
On 06-03-2024 07:03 pm, Marc Zyngier wrote:
> On Wed, 06 Mar 2024 05:31:09 +0000,
> Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
>>
>>
>>
>> On 05-03-2024 02:16 pm, Oliver Upton wrote:
>>> -cc old kvmarm list
>>> +cc new kvmarm list, reviewers
>>>
>>> Please run scripts/get_maintainer.pl next time around so we get the
>>> right people looking at a patch.
>>>
>>
>> Of course I know this script -:)
>> I didn't cc since I felt to avoid unnecessary overloading someone's
>> inbox. I don't think anyone(even ARM) is interested in this feature
>> other than Marc and me/Ampere. Otherwise this would have merged
>> upstream by now.
> 
> You're wrong on a lot of this. There is quite a significant interest,
> actually. But in the fine corporate tradition, not even my employer
> can be bothered to actively contribute. These people are waiting for
> things to land. I'm happy to make them wait even longer (I'm paying
> for my own HW, so I don't owe anyone anything).
> 
> The architecture also wasn't in a good enough state for the support to
> be merged until very recently, and it still is broken in a lot of
> respects.
> 
>> BTW, NV feature development started way back in 2016/17.
> 
> And? Cc'ing individuals and lists is basic courtesy, I'm afraid.
> Please don't leave people in the dark.

Sure, my apologies, I will not skip it in the future!

> 
> Thanks,
> 
> 	M.
> 

Thanks,
Ganapat
Ganapatrao Kulkarni March 26, 2024, 11:33 a.m. UTC | #12
Hi Marc,

On 05-03-2024 08:33 pm, Marc Zyngier wrote:
> On Tue, 05 Mar 2024 13:29:08 +0000,
> Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
>>
>>
>>
>> What are the core issues (please forgive me if you mentioned already)?
>> certainly we will prioritise them than this.
> 
> AT is a big one. Maintenance interrupts are more or less broken. I'm
> slowly plugging PAuth, but there's no testing whatsoever (running
> Linux doesn't count). Lack of SVE support is also definitely a
> blocker.
> 

I am debugging an issue where the EDK2 (ArmVirtPkg) boot hangs when 
trying to boot from L1 using QEMU.

The hang is due to a failure of the AT instruction, resulting in an 
immediate return to the guest (L2), and the loop continues...

The AT instruction is executed in __translate_far_to_hpfar(), called 
from __get_fault_info(), in L1 when a data abort is forwarded. The AT 
instruction is then trapped and emulated by L0 in __kvm_at_s1e01(), 
which fails, resulting in the return to the guest.

Is this also a manifestation of the AT issue you are referring to?

Thanks,
Ganapat
Marc Zyngier March 27, 2024, 12:12 p.m. UTC | #13
On Tue, 26 Mar 2024 11:33:27 +0000,
Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
> 
> 
> Hi Marc,
> 
> On 05-03-2024 08:33 pm, Marc Zyngier wrote:
> > On Tue, 05 Mar 2024 13:29:08 +0000,
> > Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
> >> 
> >> 
> >> 
> >> What are the core issues (please forgive me if you mentioned already)?
> >> certainly we will prioritise them than this.
> > 
> > AT is a big one. Maintenance interrupts are more or less broken. I'm
> > slowly plugging PAuth, but there's no testing whatsoever (running
> > Linux doesn't count). Lack of SVE support is also definitely a
> > blocker.
> > 
> 
> I am debugging an issue where EDK2(ArmVirtPkg) boot hangs when tried
> to boot from L1 using QEMU.
> 
> The hang is due to failure of AT instruction and resulting in
> immediate return to Guest(L2) and the loop continues...
>
> AT instruction is executed in function of
> __get_fault_info(__translate_far_to_hpfar) in L1 when data abort is
> forwarded. Then AT instruction is trapped and executed/emulated in L0
> in function "__kvm_at_s1e01" is failing and resulting in the return to
> guest.
> 
> Is this also the manifestation of the issue of AT that you are referring to?

It's possible, but you are looking at the symptom and not necessarily
the problem.

FWIW, I can boot EDK2 as built by debian as an L2 using QEMU without
any problem, but I'm not using QEMU for L1 (it shouldn't have much of
an impact anyway).

I expect AT S1E1R to fail if any of the following are true:

- the guest S1 page tables have been swapped out in the interval
  between the fault and the AT emulation

- the shadow S2 page tables do not have a translation for the output
  of the S1 page tables yet

You will need to work out which of these two are true. It is perfectly
possible that there are more edge cases that need addressing, as what
you describe just works with my setup. It could also be that 4kB page
support at L1 is broken (as I have no way to test it and only run with
a 16kB L1).

	M.

Patch

diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 5173f8cf2904..f503b2eaedc4 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -656,4 +656,9 @@  static inline bool kvm_is_shadow_s2_fault(struct kvm_vcpu *vcpu)
 		vcpu->arch.hw_mmu->nested_stage2_enabled);
 }
 
+static inline bool kvm_is_l1_using_shadow_s2(struct kvm_vcpu *vcpu)
+{
+	return (vcpu->arch.hw_mmu != &vcpu->kvm->arch.mmu);
+}
+
 #endif /* __ARM64_KVM_EMULATE_H__ */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 8da3c9a81ae3..f61c674c300a 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -144,6 +144,13 @@  struct kvm_vmid {
 	atomic64_t id;
 };
 
+struct mapipa_node {
+	struct rb_node node;
+	phys_addr_t ipa;
+	phys_addr_t shadow_ipa;
+	long size;
+};
+
 struct kvm_s2_mmu {
 	struct kvm_vmid vmid;
 
@@ -216,6 +223,13 @@  struct kvm_s2_mmu {
 	 * >0: Somebody is actively using this.
 	 */
 	atomic_t refcnt;
+
+	/*
+	 * For a Canonical IPA to Shadow IPA mapping.
+	 */
+	struct rb_root nested_mapipa_root;
+	rwlock_t mmu_lock;
+
 };
 
 static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu)
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index da7ebd2f6e24..c31a59a1fdc6 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -65,6 +65,9 @@  extern void kvm_init_nested(struct kvm *kvm);
 extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu);
 extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu);
 extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu);
+extern void add_shadow_ipa_map_node(
+		struct kvm_s2_mmu *mmu,
+		phys_addr_t ipa, phys_addr_t shadow_ipa, long size);
 
 union tlbi_info;
 
@@ -123,6 +126,7 @@  extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu,
 extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);
 extern void kvm_nested_s2_wp(struct kvm *kvm);
 extern void kvm_nested_s2_unmap(struct kvm *kvm);
+extern void kvm_nested_s2_unmap_range(struct kvm *kvm, struct kvm_gfn_range *range);
 extern void kvm_nested_s2_flush(struct kvm *kvm);
 int handle_wfx_nested(struct kvm_vcpu *vcpu, bool is_wfe);
 
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 61bdd8798f83..3948681426a0 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1695,6 +1695,13 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 					     memcache,
 					     KVM_PGTABLE_WALK_HANDLE_FAULT |
 					     KVM_PGTABLE_WALK_SHARED);
+		if ((nested || kvm_is_l1_using_shadow_s2(vcpu)) && !ret) {
+			struct kvm_s2_mmu *shadow_s2_mmu;
+
+			ipa &= ~(vma_pagesize - 1);
+			shadow_s2_mmu = lookup_s2_mmu(vcpu);
+			add_shadow_ipa_map_node(shadow_s2_mmu, ipa, fault_ipa, vma_pagesize);
+		}
 	}
 
 	/* Mark the page dirty only if the fault is handled successfully */
@@ -1918,7 +1925,7 @@  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 			     (range->end - range->start) << PAGE_SHIFT,
 			     range->may_block);
 
-	kvm_nested_s2_unmap(kvm);
+	kvm_nested_s2_unmap_range(kvm, range);
 	return false;
 }
 
@@ -1953,7 +1960,7 @@  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 			       PAGE_SIZE, __pfn_to_phys(pfn),
 			       KVM_PGTABLE_PROT_R, NULL, 0);
 
-	kvm_nested_s2_unmap(kvm);
+	kvm_nested_s2_unmap_range(kvm, range);
 	return false;
 }
 
@@ -2223,12 +2230,18 @@  void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 				   struct kvm_memory_slot *slot)
 {
+	struct kvm_gfn_range range;
+
 	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
 	phys_addr_t size = slot->npages << PAGE_SHIFT;
 
+	range.start = gpa;
+	range.end = gpa + size;
+	range.may_block = true;
+
 	write_lock(&kvm->mmu_lock);
 	kvm_unmap_stage2_range(&kvm->arch.mmu, gpa, size);
-	kvm_nested_s2_unmap(kvm);
+	kvm_nested_s2_unmap_range(kvm, &range);
 	write_unlock(&kvm->mmu_lock);
 }
 
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index f88d9213c6b3..888ec9fba4a0 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -565,6 +565,88 @@  void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid,
 	write_unlock(&kvm->mmu_lock);
 }
 
+/*
+ * Create a node and add to lookup table, when a page is mapped to
+ * Canonical IPA and also mapped to Shadow IPA.
+ */
+void add_shadow_ipa_map_node(struct kvm_s2_mmu *mmu,
+			phys_addr_t ipa,
+			phys_addr_t shadow_ipa, long size)
+{
+	struct rb_root *ipa_root = &(mmu->nested_mapipa_root);
+	struct rb_node **node = &(ipa_root->rb_node), *parent = NULL;
+	struct mapipa_node *new;
+
+	new = kzalloc(sizeof(struct mapipa_node), GFP_KERNEL);
+	if (!new)
+		return;
+
+	new->shadow_ipa = shadow_ipa;
+	new->ipa = ipa;
+	new->size = size;
+
+	write_lock(&mmu->mmu_lock);
+
+	while (*node) {
+		struct mapipa_node *tmp;
+
+		tmp = container_of(*node, struct mapipa_node, node);
+		parent = *node;
+		if (new->ipa < tmp->ipa) {
+			node = &(*node)->rb_left;
+		} else if (new->ipa > tmp->ipa) {
+			node = &(*node)->rb_right;
+		} else {
+			write_unlock(&mmu->mmu_lock);
+			kfree(new);
+			return;
+		}
+	}
+
+	rb_link_node(&new->node, parent, node);
+	rb_insert_color(&new->node, ipa_root);
+	write_unlock(&mmu->mmu_lock);
+}
+
+/*
+ * Iterate over the lookup table of Canonical IPA to Shadow IPA.
+ * Return Shadow IPA, if the page mapped to Canonical IPA is
+ * also mapped to a Shadow IPA.
+ *
+ */
+bool get_shadow_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, phys_addr_t *shadow_ipa, long *size)
+{
+	struct rb_node *node;
+	struct mapipa_node *tmp = NULL;
+
+	read_lock(&mmu->mmu_lock);
+	node = mmu->nested_mapipa_root.rb_node;
+
+	while (node) {
+		tmp = container_of(node, struct mapipa_node, node);
+
+		if (tmp->ipa == ipa)
+			break;
+		else if (ipa > tmp->ipa)
+			node = node->rb_right;
+		else
+			node = node->rb_left;
+	}
+
+	read_unlock(&mmu->mmu_lock);
+
+	if (tmp && tmp->ipa == ipa) {
+		*shadow_ipa = tmp->shadow_ipa;
+		*size = tmp->size;
+		write_lock(&mmu->mmu_lock);
+		rb_erase(&tmp->node, &mmu->nested_mapipa_root);
+		write_unlock(&mmu->mmu_lock);
+		kfree(tmp);
+		return true;
+	}
+	return false;
+}
+
 /* Must be called with kvm->mmu_lock held */
 struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu)
 {
@@ -674,6 +756,7 @@  void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
 	mmu->tlb_vttbr = 1;
 	mmu->nested_stage2_enabled = false;
 	atomic_set(&mmu->refcnt, 0);
+	mmu->nested_mapipa_root = RB_ROOT;
 }
 
 void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
@@ -760,6 +843,36 @@  void kvm_nested_s2_unmap(struct kvm *kvm)
 	}
 }
 
+void kvm_nested_s2_unmap_range(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	int i;
+	long size;
+	bool ret;
+
+	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+		if (kvm_s2_mmu_valid(mmu)) {
+			phys_addr_t shadow_ipa, start, end;
+
+			start = range->start << PAGE_SHIFT;
+			end = range->end << PAGE_SHIFT;
+
+			while (start < end) {
+				size = PAGE_SIZE;
+				/*
+				 * get the Shadow IPA if the page is mapped
+				 * to L1 and also mapped to any of active L2.
+				 */
+				ret = get_shadow_ipa(mmu, start, &shadow_ipa, &size);
+				if (ret)
+					kvm_unmap_stage2_range(mmu, shadow_ipa, size);
+				start += size;
+			}
+		}
+	}
+}
+
 /* expects kvm->mmu_lock to be held */
 void kvm_nested_s2_flush(struct kvm *kvm)
 {