
[v4,28/34] KVM: arm64: Use page-table to track page ownership

Message ID 20210310175751.3320106-29-qperret@google.com (mailing list archive)
State New, archived
Series KVM: arm64: A stage 2 for the host

Commit Message

Quentin Perret March 10, 2021, 5:57 p.m. UTC
As the host stage 2 will be identity mapped, all the .hyp memory regions
and/or memory pages donated to protected guests will have to be marked
invalid in the host stage 2 page-table. At the same time, the hypervisor
will need a way to track the ownership of each physical page to ensure
memory sharing or donation between entities (host, guests, hypervisor) is
legal.

In order to enable this tracking at EL2, let's use the host stage 2
page-table itself. The idea is to use the top bits of invalid mappings
to store the unique identifier of the page owner. The page-table owner
(the host) gets identifier 0 such that, at boot time, it owns the entire
IPA space as the pgd starts zeroed.

Provide kvm_pgtable_stage2_set_owner() which allows modifying the
ownership of pages in the host stage 2. It re-uses most of the map()
logic, but ends up creating invalid mappings instead. This impacts
how we do refcounting, as we now need to count invalid mappings when they
are used for ownership tracking.

Signed-off-by: Quentin Perret <qperret@google.com>
---
 arch/arm64/include/asm/kvm_pgtable.h | 21 +++++++
 arch/arm64/kvm/hyp/pgtable.c         | 92 ++++++++++++++++++++++++----
 2 files changed, 101 insertions(+), 12 deletions(-)
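
The scheme, in short: the owner ID lives in the top bits of an invalid
stage 2 descriptor. A minimal sketch of the encoding (the encode helper
matches the patch below; the FIELD_GET-based decode helper is hypothetical
and only shown to illustrate how the owner ID would be recovered):

	/* Requires <linux/bitfield.h> for FIELD_PREP()/FIELD_GET(). */
	#define KVM_INVALID_PTE_OWNER_MASK	GENMASK(63, 32)

	/* Owner ID in bits [63:32]; bit 0 (valid) is left clear, so the
	 * CPU faults on any access through this entry. */
	static kvm_pte_t kvm_init_invalid_leaf_owner(u32 owner_id)
	{
		return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
	}

	/* Hypothetical decode helper, not part of the patch. */
	static u32 kvm_invalid_pte_owner(kvm_pte_t pte)
	{
		return FIELD_GET(KVM_INVALID_PTE_OWNER_MASK, pte);
	}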

Comments

Will Deacon March 11, 2021, 6:38 p.m. UTC | #1
On Wed, Mar 10, 2021 at 05:57:45PM +0000, Quentin Perret wrote:
> As the host stage 2 will be identity mapped, all the .hyp memory regions
> and/or memory pages donated to protected guests will have to be marked
> invalid in the host stage 2 page-table. At the same time, the hypervisor
> will need a way to track the ownership of each physical page to ensure
> memory sharing or donation between entities (host, guests, hypervisor) is
> legal.
> 
> In order to enable this tracking at EL2, let's use the host stage 2
> page-table itself. The idea is to use the top bits of invalid mappings
> to store the unique identifier of the page owner. The page-table owner
> (the host) gets identifier 0 such that, at boot time, it owns the entire
> IPA space as the pgd starts zeroed.
> 
> Provide kvm_pgtable_stage2_set_owner() which allows modifying the
> ownership of pages in the host stage 2. It re-uses most of the map()
> logic, but ends up creating invalid mappings instead. This impacts
> how we do refcounting, as we now need to count invalid mappings when they
> are used for ownership tracking.
> 
> Signed-off-by: Quentin Perret <qperret@google.com>
> ---
>  arch/arm64/include/asm/kvm_pgtable.h | 21 +++++++
>  arch/arm64/kvm/hyp/pgtable.c         | 92 ++++++++++++++++++++++++----
>  2 files changed, 101 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index 4ae19247837b..b09af4612656 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -238,6 +238,27 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
>  			   u64 phys, enum kvm_pgtable_prot prot,
>  			   void *mc);
>  
> +/**
> + * kvm_pgtable_stage2_set_owner() - Annotate invalid mappings with metadata
> + *				    encoding the ownership of a page in the
> + *				    IPA space.
> + * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
> + * @addr:	Intermediate physical address at which to place the annotation.

This confused me a bit, as the annotation is stored in the page-table, not
at the memory identified by @addr. How about:

  "Base intermediate physical address to annotate"

> + * @size:	Size of the IPA range to annotate.

  "Size of the annotated range"

> + * @mc:		Cache of pre-allocated and zeroed memory from which to allocate
> + *		page-table pages.
> + * @owner_id:	Unique identifier for the owner of the page.
> + *
> + * The page-table owner has identifier 0.

Perhaps, "By default, all page-tables are owned by identifier 0"

> + *
> + * Return: 0 on success, negative error code on failure.
> + */
> +int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
> +				 void *mc, u32 owner_id);

Is there a need for the owner_id to be 32-bit rather than e.g. 16-bit? Just
strikes me that it might be difficult to recover these bits in future if we
give them out freely now.

> +
>  /**
>   * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table.
>   * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index f37b4179b880..e4670b639726 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -48,6 +48,8 @@
>  					 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
>  					 KVM_PTE_LEAF_ATTR_HI_S2_XN)
>  
> +#define KVM_INVALID_PTE_OWNER_MASK	GENMASK(63, 32)

Ah, so that '02' earlier was a typo for '32'.

> +
>  struct kvm_pgtable_walk_data {
>  	struct kvm_pgtable		*pgt;
>  	struct kvm_pgtable_walker	*walker;
> @@ -186,6 +188,11 @@ static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
>  	return pte;
>  }
>  
> +static kvm_pte_t kvm_init_invalid_leaf_owner(u32 owner_id)
> +{
> +	return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
> +}
> +
>  static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
>  				  u32 level, kvm_pte_t *ptep,
>  				  enum kvm_pgtable_walk_flags flag)
> @@ -440,6 +447,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
>  struct stage2_map_data {
>  	u64				phys;
>  	kvm_pte_t			attr;
> +	u32				owner_id;
>  
>  	kvm_pte_t			*anchor;
>  	kvm_pte_t			*childp;
> @@ -506,6 +514,24 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
>  	return 0;
>  }
>  
> +static bool stage2_is_permission_change(kvm_pte_t old, kvm_pte_t new)
> +{
> +	if (!kvm_pte_valid(old) || !kvm_pte_valid(new))
> +		return false;
> +
> +	return !((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS));
> +}

This new name throws me, because it will return true if old == new.

> +static bool stage2_pte_is_counted(kvm_pte_t pte)
> +{
> +	/*
> +	 * The refcount tracks valid entries as well as invalid entries if they
> +	 * encode ownership of a page to another entity than the page-table
> +	 * owner, whose id is 0.
> +	 */
> +	return !!pte;
> +}
> +
>  static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
>  				      kvm_pte_t *ptep,
>  				      struct stage2_map_data *data)
> @@ -517,28 +543,36 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
>  	if (!kvm_block_mapping_supported(addr, end, phys, level))
>  		return -E2BIG;
>  
> -	new = kvm_init_valid_leaf_pte(phys, data->attr, level);
> -	if (kvm_pte_valid(old)) {
> +	if (kvm_pte_valid(data->attr))

This feels like a bit of a hack to me: the 'attr' field in stage2_map_data
is intended to correspond directly to the lower/upper attributes of the
descriptor as per the architecture, so tagging the valid bit in there is
pretty grotty. However, I can see the significant advantage in being able
to re-use the stage2_map_walker functionality, so how about instead of nobbling
attr, you set phys to something invalid instead, e.g.:

	#define KVM_PHYS_SET_OWNER	(-1ULL)

> +		new = kvm_init_valid_leaf_pte(phys, data->attr, level);
> +	else
> +		new = kvm_init_invalid_leaf_owner(data->owner_id);
> +
> +	if (stage2_pte_is_counted(old)) {
>  		/*
>  		 * Skip updating the PTE if we are trying to recreate the exact
>  		 * same mapping or only change the access permissions. Instead,
>  		 * the vCPU will exit one more time from guest if still needed
>  		 * and then go through the path of relaxing permissions.
>  		 */
> -		if (!((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)))
> +		if (stage2_is_permission_change(old, new))
>  			return -EAGAIN;
>  
>  		/*
> -		 * There's an existing different valid leaf entry, so perform
> -		 * break-before-make.
> +		 * Clear the existing PTE, and perform break-before-make with
> +		 * TLB maintenance if it was valid.
>  		 */
>  		kvm_clear_pte(ptep);
> -		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
> +		if (kvm_pte_valid(old)) {
> +			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr,
> +				     level);
> +		}

Why do you clear the pte unconditionally here? I think you can move that
into the 'if' as well.
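
Something like this, perhaps (an untested sketch of that restructuring):

	/* The old entry can only be cached in the TLBs if it was valid,
	 * so break-before-make (and the TLBI) is only needed then. */
	if (kvm_pte_valid(old)) {
		kvm_clear_pte(ptep);
		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr,
			     level);
	}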

>  		mm_ops->put_page(ptep);
>  	}
>  
>  	smp_store_release(ptep, new);
> -	mm_ops->get_page(ptep);
> +	if (stage2_pte_is_counted(new))
> +		mm_ops->get_page(ptep);
>  	data->phys += granule;
>  	return 0;
>  }
> @@ -574,7 +608,7 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
>  	int ret;
>  
>  	if (data->anchor) {
> -		if (kvm_pte_valid(pte))
> +		if (stage2_pte_is_counted(pte))
>  			mm_ops->put_page(ptep);
>  
>  		return 0;
> @@ -599,9 +633,10 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
>  	 * a table. Accesses beyond 'end' that fall within the new table
>  	 * will be mapped lazily.
>  	 */
> -	if (kvm_pte_valid(pte)) {
> +	if (stage2_pte_is_counted(pte)) {
>  		kvm_clear_pte(ptep);
> -		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
> +		if (kvm_pte_valid(pte))
> +			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);

Same here.

>  		mm_ops->put_page(ptep);
>  	}
>  
> @@ -683,6 +718,7 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
>  		.mmu		= pgt->mmu,
>  		.memcache	= mc,
>  		.mm_ops		= pgt->mm_ops,
> +		.owner_id	= 0,

Not needed.

>  	};
>  	struct kvm_pgtable_walker walker = {
>  		.cb		= stage2_map_walker,
> @@ -696,6 +732,33 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
>  	if (ret)
>  		return ret;
>  
> +	/* Set the valid flag to distinguish with the set_owner() path. */
> +	map_data.attr |= KVM_PTE_VALID;

(see earlier comments)

> +	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
> +	dsb(ishst);
> +	return ret;
> +}
> +
> +int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
> +				 void *mc, u32 owner_id)
> +{
> +	int ret;
> +	struct stage2_map_data map_data = {
> +		.mmu		= pgt->mmu,
> +		.memcache	= mc,
> +		.mm_ops		= pgt->mm_ops,
> +		.owner_id	= owner_id,
> +		.attr		= 0,

Not needed.

> +	};
> +	struct kvm_pgtable_walker walker = {
> +		.cb		= stage2_map_walker,
> +		.flags		= KVM_PGTABLE_WALK_TABLE_PRE |
> +				  KVM_PGTABLE_WALK_LEAF |
> +				  KVM_PGTABLE_WALK_TABLE_POST,

Oh man, now I see why phys is zero -- so that the table callbacks will
put down the largest possible block. That needs a comment because it's
pretty horrible, and also means my idea of using -1 won't help you. Hmm.

Is there ever a reason to use kvm_pgtable_stage2_set_owner() to set an
owner of 0, or should you just use the map/unmap APIs for that? If so,
then maybe the key is simply if owner_id is non-zero, then an invalid
entry is installed?

Will
Quentin Perret March 12, 2021, 6:23 a.m. UTC | #2
On Thursday 11 Mar 2021 at 18:38:36 (+0000), Will Deacon wrote:
> On Wed, Mar 10, 2021 at 05:57:45PM +0000, Quentin Perret wrote:
> > As the host stage 2 will be identity mapped, all the .hyp memory regions
> > and/or memory pages donated to protected guests will have to be marked
> > invalid in the host stage 2 page-table. At the same time, the hypervisor
> > will need a way to track the ownership of each physical page to ensure
> > memory sharing or donation between entities (host, guests, hypervisor) is
> > legal.
> > 
> > In order to enable this tracking at EL2, let's use the host stage 2
> > page-table itself. The idea is to use the top bits of invalid mappings
> > to store the unique identifier of the page owner. The page-table owner
> > (the host) gets identifier 0 such that, at boot time, it owns the entire
> > IPA space as the pgd starts zeroed.
> > 
> > Provide kvm_pgtable_stage2_set_owner() which allows modifying the
> > ownership of pages in the host stage 2. It re-uses most of the map()
> > logic, but ends up creating invalid mappings instead. This impacts
> > how we do refcounting, as we now need to count invalid mappings when they
> > are used for ownership tracking.
> > 
> > Signed-off-by: Quentin Perret <qperret@google.com>
> > ---
> >  arch/arm64/include/asm/kvm_pgtable.h | 21 +++++++
> >  arch/arm64/kvm/hyp/pgtable.c         | 92 ++++++++++++++++++++++++----
> >  2 files changed, 101 insertions(+), 12 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> > index 4ae19247837b..b09af4612656 100644
> > --- a/arch/arm64/include/asm/kvm_pgtable.h
> > +++ b/arch/arm64/include/asm/kvm_pgtable.h
> > @@ -238,6 +238,27 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
> >  			   u64 phys, enum kvm_pgtable_prot prot,
> >  			   void *mc);
> >  
> > +/**
> > + * kvm_pgtable_stage2_set_owner() - Annotate invalid mappings with metadata
> > + *				    encoding the ownership of a page in the
> > + *				    IPA space.
> > + * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
> > + * @addr:	Intermediate physical address at which to place the annotation.
> 
> This confused me a bit, as the annotation is stored in the page-table, not
> at the memory identified by @addr. How about:
> 
>   "Base intermediate physical address to annotate"
> 
> > + * @size:	Size of the IPA range to annotate.
> 
>   "Size of the annotated range"
> 
> > + * @mc:		Cache of pre-allocated and zeroed memory from which to allocate
> > + *		page-table pages.
> > + * @owner_id:	Unique identifier for the owner of the page.
> > + *
> > + * The page-table owner has identifier 0.
> 
> Perhaps, "By default, all page-tables are owned by identifier 0"

Ack all of the above.

> > + *
> > + * Return: 0 on success, negative error code on failure.
> > + */
> > +int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
> > +				 void *mc, u32 owner_id);
> 
> Is there a need for the owner_id to be 32-bit rather than e.g. 16-bit? Just
> strikes me that it might be difficult to recover these bits in future if we
> give them out freely now.

I figured we might want to use identifiers that are stable for the
lifetime of protected VMs. I wasn't sure using e.g. VMIDs would be a
better choice here as re-using them will cause a lot of pain for the
host stage 2 pgtable maintenance.

> > +
> >  /**
> >   * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table.
> >   * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
> > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > index f37b4179b880..e4670b639726 100644
> > --- a/arch/arm64/kvm/hyp/pgtable.c
> > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > @@ -48,6 +48,8 @@
> >  					 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
> >  					 KVM_PTE_LEAF_ATTR_HI_S2_XN)
> >  
> > +#define KVM_INVALID_PTE_OWNER_MASK	GENMASK(63, 32)
> 
> Ah, so that '02' earlier was a typo for '32'.
> 
> > +
> >  struct kvm_pgtable_walk_data {
> >  	struct kvm_pgtable		*pgt;
> >  	struct kvm_pgtable_walker	*walker;
> > @@ -186,6 +188,11 @@ static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
> >  	return pte;
> >  }
> >  
> > +static kvm_pte_t kvm_init_invalid_leaf_owner(u32 owner_id)
> > +{
> > +	return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
> > +}
> > +
> >  static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
> >  				  u32 level, kvm_pte_t *ptep,
> >  				  enum kvm_pgtable_walk_flags flag)
> > @@ -440,6 +447,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
> >  struct stage2_map_data {
> >  	u64				phys;
> >  	kvm_pte_t			attr;
> > +	u32				owner_id;
> >  
> >  	kvm_pte_t			*anchor;
> >  	kvm_pte_t			*childp;
> > @@ -506,6 +514,24 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
> >  	return 0;
> >  }
> >  
> > +static bool stage2_is_permission_change(kvm_pte_t old, kvm_pte_t new)
> > +{
> > +	if (!kvm_pte_valid(old) || !kvm_pte_valid(new))
> > +		return false;
> > +
> > +	return !((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS));
> > +}
> 
> This new name throws me, because it will return true if old == new.

How about I invert it and call it 'stage2_pte_needs_update()' or so?

> > +static bool stage2_pte_is_counted(kvm_pte_t pte)
> > +{
> > +	/*
> > +	 * The refcount tracks valid entries as well as invalid entries if they
> > +	 * encode ownership of a page to another entity than the page-table
> > +	 * owner, whose id is 0.
> > +	 */
> > +	return !!pte;
> > +}
> > +
> >  static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
> >  				      kvm_pte_t *ptep,
> >  				      struct stage2_map_data *data)
> > @@ -517,28 +543,36 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
> >  	if (!kvm_block_mapping_supported(addr, end, phys, level))
> >  		return -E2BIG;
> >  
> > -	new = kvm_init_valid_leaf_pte(phys, data->attr, level);
> > -	if (kvm_pte_valid(old)) {
> > +	if (kvm_pte_valid(data->attr))
> 
> This feels like a bit of a hack to me: the 'attr' field in stage2_map_data
> is intended to correspond directly to the lower/upper attributes of the
> descriptor as per the architecture, so tagging the valid bit in there is
> pretty grotty. However, I can see the significant advantage in being able
> to re-use the stage2_map_walker functionality, so how about instead of nobbling
> attr, you set phys to something invalid instead, e.g.:
> 
> 	#define KVM_PHYS_SET_OWNER	(-1ULL)

That'll confuse kvm_block_mapping_supported() and friends I think, at
least in their current form. If you _really_ don't like this, maybe we
could have an extra 'flags' field in stage2_map_data?

> > +		new = kvm_init_valid_leaf_pte(phys, data->attr, level);
> > +	else
> > +		new = kvm_init_invalid_leaf_owner(data->owner_id);
> > +
> > +	if (stage2_pte_is_counted(old)) {
> >  		/*
> >  		 * Skip updating the PTE if we are trying to recreate the exact
> >  		 * same mapping or only change the access permissions. Instead,
> >  		 * the vCPU will exit one more time from guest if still needed
> >  		 * and then go through the path of relaxing permissions.
> >  		 */
> > -		if (!((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)))
> > +		if (stage2_is_permission_change(old, new))
> >  			return -EAGAIN;
> >  
> >  		/*
> > -		 * There's an existing different valid leaf entry, so perform
> > -		 * break-before-make.
> > +		 * Clear the existing PTE, and perform break-before-make with
> > +		 * TLB maintenance if it was valid.
> >  		 */
> >  		kvm_clear_pte(ptep);
> > -		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
> > +		if (kvm_pte_valid(old)) {
> > +			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr,
> > +				     level);
> > +		}
> 
> Why do you clear the pte unconditionally here? I think you can move that
> into the 'if' as well.

Yep that should work.

> >  		mm_ops->put_page(ptep);
> >  	}
> >  
> >  	smp_store_release(ptep, new);
> > -	mm_ops->get_page(ptep);
> > +	if (stage2_pte_is_counted(new))
> > +		mm_ops->get_page(ptep);
> >  	data->phys += granule;
> >  	return 0;
> >  }
> > @@ -574,7 +608,7 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
> >  	int ret;
> >  
> >  	if (data->anchor) {
> > -		if (kvm_pte_valid(pte))
> > +		if (stage2_pte_is_counted(pte))
> >  			mm_ops->put_page(ptep);
> >  
> >  		return 0;
> > @@ -599,9 +633,10 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
> >  	 * a table. Accesses beyond 'end' that fall within the new table
> >  	 * will be mapped lazily.
> >  	 */
> > -	if (kvm_pte_valid(pte)) {
> > +	if (stage2_pte_is_counted(pte)) {
> >  		kvm_clear_pte(ptep);
> > -		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
> > +		if (kvm_pte_valid(pte))
> > +			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
> 
> Same here.
> 
> >  		mm_ops->put_page(ptep);
> >  	}
> >  
> > @@ -683,6 +718,7 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
> >  		.mmu		= pgt->mmu,
> >  		.memcache	= mc,
> >  		.mm_ops		= pgt->mm_ops,
> > +		.owner_id	= 0,
> 
> Not needed.
> 
> >  	};
> >  	struct kvm_pgtable_walker walker = {
> >  		.cb		= stage2_map_walker,
> > @@ -696,6 +732,33 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
> >  	if (ret)
> >  		return ret;
> >  
> > +	/* Set the valid flag to distinguish with the set_owner() path. */
> > +	map_data.attr |= KVM_PTE_VALID;
> 
> (see earlier comments)
> 
> > +	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
> > +	dsb(ishst);
> > +	return ret;
> > +}
> > +
> > +int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
> > +				 void *mc, u32 owner_id)
> > +{
> > +	int ret;
> > +	struct stage2_map_data map_data = {
> > +		.mmu		= pgt->mmu,
> > +		.memcache	= mc,
> > +		.mm_ops		= pgt->mm_ops,
> > +		.owner_id	= owner_id,
> > +		.attr		= 0,
> 
> Not needed.
> 
> > +	};
> > +	struct kvm_pgtable_walker walker = {
> > +		.cb		= stage2_map_walker,
> > +		.flags		= KVM_PGTABLE_WALK_TABLE_PRE |
> > +				  KVM_PGTABLE_WALK_LEAF |
> > +				  KVM_PGTABLE_WALK_TABLE_POST,
> 
> Oh man, now I see why phys is zero -- so that the table callbacks will
> put down the largest possible block. That needs a comment because it's
> pretty horrible, and also means my idea of using -1 won't help you. Hmm.

We're creating invalid mappings which by definition don't map to
physical pages (the annotation is purely in the _IPA_ space), so yeah
the best thing to do was to take phys 'out of the way'. Setting it to
e.g. @addr would be quite confusing IMO -- imagine if we ever decide to
annotate the stage 2 of guests one day.

And as you noticed, weird values will also happen to confuse
kvm_block_mapping_supported(), and/or to cause range issues, so 0 seemed
like a reasonable choice.

I'll stick a comment in.

> Is there ever a reason to use kvm_pgtable_stage2_set_owner() to set an
> owner of 0, or should you just use the map/unmap APIs for that? If so,
> then maybe the key is simply if owner_id is non-zero, then an invalid
> entry is installed?

I couldn't find a good reason to restrict it, as that wouldn't change
the implementation much anyway. Also, if we added the right CMOs, we
could probably remove the unmap walker and re-express it in terms of
set_owner(0) ... But I suppose that is for later :-)

Thanks,
Quentin
Will Deacon March 12, 2021, 9:32 a.m. UTC | #3
On Fri, Mar 12, 2021 at 06:23:00AM +0000, Quentin Perret wrote:
> On Thursday 11 Mar 2021 at 18:38:36 (+0000), Will Deacon wrote:
> > On Wed, Mar 10, 2021 at 05:57:45PM +0000, Quentin Perret wrote:
> > > As the host stage 2 will be identity mapped, all the .hyp memory regions
> > > and/or memory pages donated to protected guests will have to be marked
> > > invalid in the host stage 2 page-table. At the same time, the hypervisor
> > > will need a way to track the ownership of each physical page to ensure
> > > memory sharing or donation between entities (host, guests, hypervisor) is
> > > legal.
> > > 
> > > In order to enable this tracking at EL2, let's use the host stage 2
> > > page-table itself. The idea is to use the top bits of invalid mappings
> > > to store the unique identifier of the page owner. The page-table owner
> > > (the host) gets identifier 0 such that, at boot time, it owns the entire
> > > IPA space as the pgd starts zeroed.
> > > 
> > > Provide kvm_pgtable_stage2_set_owner() which allows modifying the
> > > ownership of pages in the host stage 2. It re-uses most of the map()
> > > logic, but ends up creating invalid mappings instead. This impacts
> > > how we do refcounting, as we now need to count invalid mappings when they
> > > are used for ownership tracking.
> > > 
> > > Signed-off-by: Quentin Perret <qperret@google.com>
> > > ---
> > >  arch/arm64/include/asm/kvm_pgtable.h | 21 +++++++
> > >  arch/arm64/kvm/hyp/pgtable.c         | 92 ++++++++++++++++++++++++----
> > >  2 files changed, 101 insertions(+), 12 deletions(-)
> > > 
> > > diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> > > index 4ae19247837b..b09af4612656 100644
> > > --- a/arch/arm64/include/asm/kvm_pgtable.h
> > > +++ b/arch/arm64/include/asm/kvm_pgtable.h
> > > @@ -238,6 +238,27 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
> > >  			   u64 phys, enum kvm_pgtable_prot prot,
> > >  			   void *mc);
> > >  
> > > +/**
> > > + * kvm_pgtable_stage2_set_owner() - Annotate invalid mappings with metadata
> > > + *				    encoding the ownership of a page in the
> > > + *				    IPA space.
> > > + * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
> > > + * @addr:	Intermediate physical address at which to place the annotation.
> > 
> > This confused me a bit, as the annotation is stored in the page-table, not
> > at the memory identified by @addr. How about:
> > 
> >   "Base intermediate physical address to annotate"
> > 
> > > + * @size:	Size of the IPA range to annotate.
> > 
> >   "Size of the annotated range"
> > 
> > > + * @mc:		Cache of pre-allocated and zeroed memory from which to allocate
> > > + *		page-table pages.
> > > + * @owner_id:	Unique identifier for the owner of the page.
> > > + *
> > > + * The page-table owner has identifier 0.
> > 
> > Perhaps, "By default, all page-tables are owned by identifier 0"
> 
> Ack all of the above.
> 
> > > + *
> > > + * Return: 0 on success, negative error code on failure.
> > > + */
> > > +int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
> > > +				 void *mc, u32 owner_id);
> > 
> > Is there a need for the owner_id to be 32-bit rather than e.g. 16-bit? Just
> > strikes me that it might be difficult to recover these bits in future if we
> > give them out freely now.
> 
> I figured we might want to use identifiers that are stable for the
> lifetime of protected VMs. I wasn't sure using e.g. VMIDs would be a
> better choice here as re-using them will cause a lot of pain for the
> host stage 2 pgtable maintenance.

I'm not saying to use the VMID directly, just that allocating half of the
pte feels a bit OTT given that the state of things after this patch series
is that we're using exactly 1 bit.

> > > @@ -517,28 +543,36 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
> > >  	if (!kvm_block_mapping_supported(addr, end, phys, level))
> > >  		return -E2BIG;
> > >  
> > > -	new = kvm_init_valid_leaf_pte(phys, data->attr, level);
> > > -	if (kvm_pte_valid(old)) {
> > > +	if (kvm_pte_valid(data->attr))
> > 
> > This feels like a bit of a hack to me: the 'attr' field in stage2_map_data
> > is intended to correspond directly to the lower/upper attributes of the
> > descriptor as per the architecture, so tagging the valid bit in there is
> > pretty grotty. However, I can see the significant advantage in being able
> > to re-use the stage2_map_walker functionality, so how about instead of nobbling
> > attr, you set phys to something invalid instead, e.g.:
> > 
> > 	#define KVM_PHYS_SET_OWNER	(-1ULL)
> 
> That'll confuse kvm_block_mapping_supported() and friends I think, at
> least in their current form. If you _really_ don't like this, maybe we
> could have an extra 'flags' field in stage2_map_data?

I was pondering this last night and I thought of two ways to do it:

1. Add a 'bool valid' and then stick the owner and the phys in a union.
   (yes, you'll need to update the block mapping checks to look at the
    valid flag)

2. Go with my latter suggestion:

> > Is there ever a reason to use kvm_pgtable_stage2_set_owner() to set an
> > owner of 0, or should you just use the map/unmap APIs for that? If so,
> > then maybe the key is simply if owner_id is non-zero, then an invalid
> > entry is installed?
> 
> I couldn't find a good reason to restrict it, as that wouldn't change
> the implementation much anyway. Also, if we added the right CMOs, we
> could probably remove the unmap walker and re-express it in terms of
> set_owner(0) ... But I suppose that is for later :-)

The idea being that if owner is 0, then we install a mapping for phys, but
if owner is !0 then we set the invalid mapping.

(1) is probably the less hacky option... what do you reckon?
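
For (1), that could look something like the below (untested sketch; only
the head of the struct is shown, the remaining fields stay as-is):

	struct stage2_map_data {
		bool				valid;
		union {
			u64			phys;		/* valid */
			u32			owner_id;	/* !valid */
		};
		kvm_pte_t			attr;
		/* ... */
	};

The block-mapping checks would then consult 'valid' instead of relying
on a magic phys value.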

Will
Quentin Perret March 12, 2021, 10:13 a.m. UTC | #4
On Friday 12 Mar 2021 at 09:32:06 (+0000), Will Deacon wrote:
> I'm not saying to use the VMID directly, just that allocating half of the
> pte feels a bit OTT given that the state of things after this patch series
> is that we're using exactly 1 bit.

Right, and that was the reason for the PROT_NONE approach in the
previous version, but we agreed it'd be worth generalizing to allow for
future use-cases :-)

> > > > @@ -517,28 +543,36 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
> > > >  	if (!kvm_block_mapping_supported(addr, end, phys, level))
> > > >  		return -E2BIG;
> > > >  
> > > > -	new = kvm_init_valid_leaf_pte(phys, data->attr, level);
> > > > -	if (kvm_pte_valid(old)) {
> > > > +	if (kvm_pte_valid(data->attr))
> > > 
> > > This feels like a bit of a hack to me: the 'attr' field in stage2_map_data
> > > is intended to correspond directly to the lower/upper attributes of the
> > > descriptor as per the architecture, so tagging the valid bit in there is
> > > pretty grotty. However, I can see the significant advantage in being able
> > > to re-use the stage2_map_walker functionality, so how about instead of nobbling
> > > attr, you set phys to something invalid instead, e.g.:
> > > 
> > > 	#define KVM_PHYS_SET_OWNER	(-1ULL)
> > 
> > That'll confuse kvm_block_mapping_supported() and friends I think, at
> > least in their current form. If you _really_ don't like this, maybe we
> > could have an extra 'flags' field in stage2_map_data?
> 
> I was pondering this last night and I thought of two ways to do it:
> 
> 1. Add a 'bool valid' and then stick the owner and the phys in a union.
>    (yes, you'll need to update the block mapping checks to look at the
>     valid flag)

Right, though that is also used for the hyp s1 which doesn't use any of
this ATM. That shouldn't be too bad to change, I'll have a look.

> 2. Go with my latter suggestion:
> 
> > > Is there ever a reason to use kvm_pgtable_stage2_set_owner() to set an
> > > owner of 0, or should you just use the map/unmap APIs for that? If so,
> > > then maybe the key is simply if owner_id is non-zero, then an invalid
> > > entry is installed?
> > 
> > I couldn't find a good reason to restrict it, as that wouldn't change
> > the implementation much anyway. Also, if we added the right CMOs, we
> > could probably remove the unmap walker and re-express it in terms of
> > set_owner(0) ... But I suppose that is for later :-)
> 
> The idea being that if owner is 0, then we install a mapping for phys, but
> if owner is !0 then we set the invalid mapping.

And I could even special-case set_owner(0) by calling unmap under the
hood so the caller doesn't need to care, but it's a bit yuck.

> (1) is probably the less hacky option... what do you reckon?

Agreed, (1) is a bit nicer. I was also considering setting phys = BIT(63)
in the set_owner() path. That makes it obvious it is an invalid PA, and
it should retain the nice alignment properties I need. But I suppose an
explicit flag makes it easier to reason about, so I'll have a go at it.

Thanks,
Quentin
Will Deacon March 12, 2021, 11:18 a.m. UTC | #5
On Fri, Mar 12, 2021 at 10:13:26AM +0000, Quentin Perret wrote:
> On Friday 12 Mar 2021 at 09:32:06 (+0000), Will Deacon wrote:
> > I'm not saying to use the VMID directly, just that allocating half of the
> > pte feels a bit OTT given that the state of things after this patch series
> > is that we're using exactly 1 bit.
> 
> Right, and that was the reason for the PROT_NONE approach in the
> previous version, but we agreed it'd be worth generalizing to allow for
> future use-cases :-)

Yeah, just generalising to 32 bits feels like going too far! I dunno,
make it a u8 for now, or define the hypervisor owner ID as 1 and reject
owners greater than that? We can easily extend it later.
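
Something like this, maybe (illustrative):

	/* Only the host (0) and the hypervisor (1) exist for now. */
	#define KVM_MAX_OWNER_ID	1

	if (owner_id > KVM_MAX_OWNER_ID)
		return -EINVAL;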

> > > > > @@ -517,28 +543,36 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
> > > > >  	if (!kvm_block_mapping_supported(addr, end, phys, level))
> > > > >  		return -E2BIG;
> > > > >  
> > > > > -	new = kvm_init_valid_leaf_pte(phys, data->attr, level);
> > > > > -	if (kvm_pte_valid(old)) {
> > > > > +	if (kvm_pte_valid(data->attr))
> > > > 
> > > > This feels like a bit of a hack to me: the 'attr' field in stage2_map_data
> > > > is intended to correspond directly to the lower/upper attributes of the
> > > > descriptor as per the architecture, so tagging the valid bit in there is
> > > > pretty grotty. However, I can see the significant advantage in being able
> > > > to re-use the stage2_map_walker functionality, so how about instead of nobbling
> > > > attr, you set phys to something invalid instead, e.g.:
> > > > 
> > > > 	#define KVM_PHYS_SET_OWNER	(-1ULL)
> > > 
> > > That'll confuse kvm_block_mapping_supported() and friends I think, at
> > > least in their current form. If you _really_ don't like this, maybe we
> > > could have an extra 'flags' field in stage2_map_data?
> > 
> > I was pondering this last night and I thought of two ways to do it:
> > 
> > 1. Add a 'bool valid' and then stick the owner and the phys in a union.
> >    (yes, you'll need to update the block mapping checks to look at the
> >     valid flag)
> 
> Right, though that is also used for the hyp s1 which doesn't use any of
> this ATM. That shouldn't be too bad to change, I'll have a look.

Oh, I meant stick the bool in the stage2_map_data so that should be limited
to the stage2 path.

Will
Quentin Perret March 12, 2021, 11:45 a.m. UTC | #6
On Friday 12 Mar 2021 at 11:18:05 (+0000), Will Deacon wrote:
> On Fri, Mar 12, 2021 at 10:13:26AM +0000, Quentin Perret wrote:
> > On Friday 12 Mar 2021 at 09:32:06 (+0000), Will Deacon wrote:
> > > I'm not saying to use the VMID directly, just that allocating half of the
> > > pte feels a bit OTT given that the state of things after this patch series
> > > is that we're using exactly 1 bit.
> > 
> > Right, and that was the reason for the PROT_NONE approach in the
> > previous version, but we agreed it'd be worth generalizing to allow for
> > future use-cases :-)
> 
> Yeah, just generalising to 32 bits feels like going too far! I dunno,
> make it a u8 for now, or define the hypervisor owner ID as 1 and reject
> owners greater than that? We can easily extend it later.

Alrighty I'll do _both_

> > > > > > @@ -517,28 +543,36 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
> > > > > >  	if (!kvm_block_mapping_supported(addr, end, phys, level))
> > > > > >  		return -E2BIG;
> > > > > >  
> > > > > > -	new = kvm_init_valid_leaf_pte(phys, data->attr, level);
> > > > > > -	if (kvm_pte_valid(old)) {
> > > > > > +	if (kvm_pte_valid(data->attr))
> > > > > 
> > > > > This feels like a bit of a hack to me: the 'attr' field in stage2_map_data
> > > > > is intended to correspond directly to the lower/upper attributes of the
> > > > > descriptor as per the architecture, so tagging the valid bit in there is
> > > > > pretty grotty. However, I can see the significant advantage in being able
> > > > > to re-use the stage2_map_walker functionality, so how about instead of nobbling
> > > > > attr, you set phys to something invalid instead, e.g.:
> > > > > 
> > > > > 	#define KVM_PHYS_SET_OWNER	(-1ULL)
> > > > 
> > > > That'll confuse kvm_block_mapping_supported() and friends I think, at
> > > > least in their current form. If you _really_ don't like this, maybe we
> > > > could have an extra 'flags' field in stage2_map_data?
> > > 
> > > I was pondering this last night and I thought of two ways to do it:
> > > 
> > > 1. Add a 'bool valid' and then stick the owner and the phys in a union.
> > >    (yes, you'll need to update the block mapping checks to look at the
> > >     valid flag)
> > 
> > Right, though that is also used for the hyp s1 which doesn't use any of
> > this ATM. That shouldn't be too bad to change, I'll have a look.
> 
> Oh, I meant stick the bool in the stage2_map_data so that should be limited
> to the stage2 path.

I mean I still want to use kvm_block_mapping_supported() but ignore the
phys check when it's not valid. I find it ugly to add a 'valid'
parameter to the function itself, so maybe we're better off with just
special casing phys == -1ULL as you first suggested. How much do you hate
the below (totally untested)?

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 8e4599256969..9ec937462fd6 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -71,6 +71,13 @@ static u64 kvm_granule_size(u32 level)
        return BIT(kvm_granule_shift(level));
 }

+#define KVM_PHYS_INVALID (-1ULL)
+
+static bool kvm_phys_is_valid(u64 phys)
+{
+       return phys != KVM_PHYS_INVALID;
+}
+
 static bool kvm_level_support_block_mappings(u32 level)
 {
        /*
@@ -90,7 +97,10 @@ static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
        if (granule > (end - addr))
                return false;

-       return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
+       if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule))
+               return false;
+
+       return IS_ALIGNED(addr, granule);
 }

 static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
@@ -550,7 +560,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
        if (!kvm_block_mapping_supported(addr, end, phys, level))
                return -E2BIG;

-       if (kvm_pte_valid(data->attr))
+       if (kvm_phys_is_valid(phys))
                new = kvm_init_valid_leaf_pte(phys, data->attr, level);
        else
                new = kvm_init_invalid_leaf_owner(data->owner_id);
@@ -580,7 +590,8 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
        smp_store_release(ptep, new);
        if (stage2_pte_is_counted(new))
                mm_ops->get_page(ptep);
-       data->phys += granule;
+       if (kvm_phys_is_valid(phys))
+               data->phys += granule;
        return 0;
 }

@@ -739,9 +750,6 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
        if (ret)
                return ret;

-       /* Set the valid flag to distinguish with the set_owner() path. */
-       map_data.attr |= KVM_PTE_VALID;
-
        ret = kvm_pgtable_walk(pgt, addr, size, &walker);
        dsb(ishst);
        return ret;
@@ -752,6 +760,7 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
 {
        int ret;
        struct stage2_map_data map_data = {
+               .phys           = KVM_PHYS_INVALID,
                .mmu            = pgt->mmu,
                .memcache       = mc,
                .mm_ops         = pgt->mm_ops,

Patch

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 4ae19247837b..b09af4612656 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -238,6 +238,27 @@  int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
 			   u64 phys, enum kvm_pgtable_prot prot,
 			   void *mc);
 
+/**
+ * kvm_pgtable_stage2_set_owner() - Annotate invalid mappings with metadata
+ *				    encoding the ownership of a page in the
+ *				    IPA space.
+ * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:	Intermediate physical address at which to place the annotation.
+ * @size:	Size of the IPA range to annotate.
+ * @mc:		Cache of pre-allocated and zeroed memory from which to allocate
+ *		page-table pages.
+ * @owner_id:	Unique identifier for the owner of the page.
+ *
+ * The page-table owner has identifier 0. This function can be used to mark
+ * portions of the IPA space as owned by other entities. When a stage 2 is used
+ * with identity-mappings, these annotations allow the page-table data
+ * structure to be used as a simple rmap.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
+				 void *mc, u32 owner_id);
+
 /**
  * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table.
  * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index f37b4179b880..e4670b639726 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -48,6 +48,8 @@ 
 					 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
 					 KVM_PTE_LEAF_ATTR_HI_S2_XN)
 
+#define KVM_INVALID_PTE_OWNER_MASK	GENMASK(63, 32)
+
 struct kvm_pgtable_walk_data {
 	struct kvm_pgtable		*pgt;
 	struct kvm_pgtable_walker	*walker;
@@ -186,6 +188,11 @@  static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
 	return pte;
 }
 
+static kvm_pte_t kvm_init_invalid_leaf_owner(u32 owner_id)
+{
+	return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
+}
+
 static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
 				  u32 level, kvm_pte_t *ptep,
 				  enum kvm_pgtable_walk_flags flag)
@@ -440,6 +447,7 @@  void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
 struct stage2_map_data {
 	u64				phys;
 	kvm_pte_t			attr;
+	u32				owner_id;
 
 	kvm_pte_t			*anchor;
 	kvm_pte_t			*childp;
@@ -506,6 +514,24 @@  static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
 	return 0;
 }
 
+static bool stage2_is_permission_change(kvm_pte_t old, kvm_pte_t new)
+{
+	if (!kvm_pte_valid(old) || !kvm_pte_valid(new))
+		return false;
+
+	return !((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS));
+}
+
+static bool stage2_pte_is_counted(kvm_pte_t pte)
+{
+	/*
+	 * The refcount tracks valid entries as well as invalid entries if they
+	 * encode ownership of a page to another entity than the page-table
+	 * owner, whose id is 0.
+	 */
+	return !!pte;
+}
+
 static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
 				      kvm_pte_t *ptep,
 				      struct stage2_map_data *data)
@@ -517,28 +543,36 @@  static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
 	if (!kvm_block_mapping_supported(addr, end, phys, level))
 		return -E2BIG;
 
-	new = kvm_init_valid_leaf_pte(phys, data->attr, level);
-	if (kvm_pte_valid(old)) {
+	if (kvm_pte_valid(data->attr))
+		new = kvm_init_valid_leaf_pte(phys, data->attr, level);
+	else
+		new = kvm_init_invalid_leaf_owner(data->owner_id);
+
+	if (stage2_pte_is_counted(old)) {
 		/*
 		 * Skip updating the PTE if we are trying to recreate the exact
 		 * same mapping or only change the access permissions. Instead,
 		 * the vCPU will exit one more time from guest if still needed
 		 * and then go through the path of relaxing permissions.
 		 */
-		if (!((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)))
+		if (stage2_is_permission_change(old, new))
 			return -EAGAIN;
 
 		/*
-		 * There's an existing different valid leaf entry, so perform
-		 * break-before-make.
+		 * Clear the existing PTE, and perform break-before-make with
+		 * TLB maintenance if it was valid.
 		 */
 		kvm_clear_pte(ptep);
-		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+		if (kvm_pte_valid(old)) {
+			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr,
+				     level);
+		}
 		mm_ops->put_page(ptep);
 	}
 
 	smp_store_release(ptep, new);
-	mm_ops->get_page(ptep);
+	if (stage2_pte_is_counted(new))
+		mm_ops->get_page(ptep);
 	data->phys += granule;
 	return 0;
 }
@@ -574,7 +608,7 @@  static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 	int ret;
 
 	if (data->anchor) {
-		if (kvm_pte_valid(pte))
+		if (stage2_pte_is_counted(pte))
 			mm_ops->put_page(ptep);
 
 		return 0;
@@ -599,9 +633,10 @@  static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 	 * a table. Accesses beyond 'end' that fall within the new table
 	 * will be mapped lazily.
 	 */
-	if (kvm_pte_valid(pte)) {
+	if (stage2_pte_is_counted(pte)) {
 		kvm_clear_pte(ptep);
-		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+		if (kvm_pte_valid(pte))
+			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
 		mm_ops->put_page(ptep);
 	}
 
@@ -683,6 +718,7 @@  int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
 		.mmu		= pgt->mmu,
 		.memcache	= mc,
 		.mm_ops		= pgt->mm_ops,
+		.owner_id	= 0,
 	};
 	struct kvm_pgtable_walker walker = {
 		.cb		= stage2_map_walker,
@@ -696,6 +732,33 @@  int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
 	if (ret)
 		return ret;
 
+	/* Set the valid flag to distinguish with the set_owner() path. */
+	map_data.attr |= KVM_PTE_VALID;
+
+	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+	dsb(ishst);
+	return ret;
+}
+
+int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
+				 void *mc, u32 owner_id)
+{
+	int ret;
+	struct stage2_map_data map_data = {
+		.mmu		= pgt->mmu,
+		.memcache	= mc,
+		.mm_ops		= pgt->mm_ops,
+		.owner_id	= owner_id,
+		.attr		= 0,
+	};
+	struct kvm_pgtable_walker walker = {
+		.cb		= stage2_map_walker,
+		.flags		= KVM_PGTABLE_WALK_TABLE_PRE |
+				  KVM_PGTABLE_WALK_LEAF |
+				  KVM_PGTABLE_WALK_TABLE_POST,
+		.arg		= &map_data,
+	};
+
 	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
 	dsb(ishst);
 	return ret;
@@ -725,8 +788,13 @@  static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 	kvm_pte_t pte = *ptep, *childp = NULL;
 	bool need_flush = false;
 
-	if (!kvm_pte_valid(pte))
+	if (!kvm_pte_valid(pte)) {
+		if (stage2_pte_is_counted(pte)) {
+			kvm_clear_pte(ptep);
+			mm_ops->put_page(ptep);
+		}
 		return 0;
+	}
 
 	if (kvm_pte_table(pte, level)) {
 		childp = kvm_pte_follow(pte, mm_ops);
@@ -948,7 +1016,7 @@  static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 	struct kvm_pgtable_mm_ops *mm_ops = arg;
 	kvm_pte_t pte = *ptep;
 
-	if (!kvm_pte_valid(pte))
+	if (!stage2_pte_is_counted(pte))
 		return 0;
 
 	mm_ops->put_page(ptep);
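
For completeness, a hypothetical caller of the new API (the helper name
and the owner ID value are illustrative, not part of the series; the
thread above floats ID 1 for the hypervisor):

	#define PKVM_ID_HYP	1	/* illustrative owner ID */

	/*
	 * Mark [addr, addr + size) of the host's IPA space as owned by the
	 * hypervisor: set_owner() installs invalid mappings annotated with
	 * the owner ID, so subsequent host accesses to the range fault at
	 * stage 2.
	 */
	static int host_stage2_donate_to_hyp(struct kvm_pgtable *pgt, u64 addr,
					     u64 size, void *mc)
	{
		return kvm_pgtable_stage2_set_owner(pgt, addr, size, mc,
						    PKVM_ID_HYP);
	}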