
[v4,21/43] arm64: RME: Runtime faulting of memory

Message ID 20240821153844.60084-22-steven.price@arm.com (mailing list archive)
State New, archived
Series arm64: Support for Arm CCA in KVM

Commit Message

Steven Price Aug. 21, 2024, 3:38 p.m. UTC
At runtime if the realm guest accesses memory which hasn't yet been
mapped then KVM needs to either populate the region or fault the guest.

For memory in the lower (protected) region of IPA a fresh page is
provided to the RMM which will zero the contents. For memory in the
upper (shared) region of IPA, the memory from the memslot is mapped
into the realm VM non secure.

Signed-off-by: Steven Price <steven.price@arm.com>
---
Changes since v2:
 * Avoid leaking memory if failing to map it in the realm.
 * Correctly mask RTT based on LPA2 flag (see rtt_get_phys()).
 * Adapt to changes in previous patches.
---
 arch/arm64/include/asm/kvm_emulate.h |  10 ++
 arch/arm64/include/asm/kvm_rme.h     |  10 ++
 arch/arm64/kvm/mmu.c                 | 120 +++++++++++++++-
 arch/arm64/kvm/rme.c                 | 205 +++++++++++++++++++++++++--
 4 files changed, 325 insertions(+), 20 deletions(-)
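
For orientation: at fault time the new code boils down to a single dispatch
on whether the faulting IPA sits in the lower (protected) or upper (shared)
half of the guest's address space. Below is realm_map_ipa() as added by this
patch, with comments spelling out the two cases:

static int realm_map_ipa(struct kvm *kvm, phys_addr_t ipa,
			 kvm_pfn_t pfn, unsigned long map_size,
			 enum kvm_pgtable_prot prot,
			 struct kvm_mmu_memory_cache *memcache)
{
	struct realm *realm = &kvm->arch.realm;
	struct page *page = pfn_to_page(pfn);

	/* Realms cannot currently create read-only mappings */
	if (WARN_ON(!(prot & KVM_PGTABLE_PROT_W)))
		return -EFAULT;

	/* Upper (shared) IPA: map the memslot page into the realm non-secure */
	if (!realm_is_addr_protected(realm, ipa))
		return realm_map_non_secure(realm, ipa, page, map_size,
					    memcache);

	/* Lower (protected) IPA: provide a fresh page which the RMM zeroes */
	return realm_map_protected(realm, ipa, page, map_size, memcache);
}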

Comments

Aneesh Kumar K.V Aug. 22, 2024, 3:32 a.m. UTC | #1
Steven Price <steven.price@arm.com> writes:

> At runtime if the realm guest accesses memory which hasn't yet been
> mapped then KVM needs to either populate the region or fault the guest.
>
> For memory in the lower (protected) region of IPA a fresh page is
> provided to the RMM which will zero the contents. For memory in the
> upper (shared) region of IPA, the memory from the memslot is mapped
> into the realm VM non secure.
>
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> Changes since v2:
>  * Avoid leaking memory if failing to map it in the realm.
>  * Correctly mask RTT based on LPA2 flag (see rtt_get_phys()).
>  * Adapt to changes in previous patches.
>

....

> -	gfn = ipa >> PAGE_SHIFT;
> +	gfn = (ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
>  	memslot = gfn_to_memslot(vcpu->kvm, gfn);
> +
> +	if (kvm_slot_can_be_private(memslot)) {
> +		ret = private_memslot_fault(vcpu, fault_ipa, memslot);
> +		if (ret != -EAGAIN)
> +			goto out;
> +	}
>

Shouldn't this be s/fault_ipa/ipa ?

	ret = private_memslot_fault(vcpu, ipa, memslot);

-aneesh
Aneesh Kumar K.V Aug. 22, 2024, 3:50 a.m. UTC | #2
Steven Price <steven.price@arm.com> writes:

> At runtime if the realm guest accesses memory which hasn't yet been
> mapped then KVM needs to either populate the region or fault the guest.
>
> For memory in the lower (protected) region of IPA a fresh page is
> provided to the RMM which will zero the contents. For memory in the
> upper (shared) region of IPA, the memory from the memslot is mapped
> into the realm VM non secure.
>
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> Changes since v2:
>  * Avoid leaking memory if failing to map it in the realm.
>  * Correctly mask RTT based on LPA2 flag (see rtt_get_phys()).
>  * Adapt to changes in previous patches.
> ---
>  arch/arm64/include/asm/kvm_emulate.h |  10 ++
>  arch/arm64/include/asm/kvm_rme.h     |  10 ++
>  arch/arm64/kvm/mmu.c                 | 120 +++++++++++++++-
>  arch/arm64/kvm/rme.c                 | 205 +++++++++++++++++++++++++--
>  4 files changed, 325 insertions(+), 20 deletions(-)
>
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index 7430c77574e3..0b50572d3719 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -710,6 +710,16 @@ static inline bool kvm_realm_is_created(struct kvm *kvm)
>  	return kvm_is_realm(kvm) && kvm_realm_state(kvm) != REALM_STATE_NONE;
>  }
>  
> +static inline gpa_t kvm_gpa_stolen_bits(struct kvm *kvm)
> +{
> +	if (kvm_is_realm(kvm)) {
> +		struct realm *realm = &kvm->arch.realm;
> +
> +		return BIT(realm->ia_bits - 1);
> +	}
> +	return 0;
> +}
> +
>  static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu)
>  {
>  	if (static_branch_unlikely(&kvm_rme_is_available))
> diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h
> index 0e44b20cfa48..c50854f44674 100644
> --- a/arch/arm64/include/asm/kvm_rme.h
> +++ b/arch/arm64/include/asm/kvm_rme.h
> @@ -103,6 +103,16 @@ void kvm_realm_unmap_range(struct kvm *kvm,
>  			   unsigned long ipa,
>  			   u64 size,
>  			   bool unmap_private);
> +int realm_map_protected(struct realm *realm,
> +			unsigned long base_ipa,
> +			struct page *dst_page,
> +			unsigned long map_size,
> +			struct kvm_mmu_memory_cache *memcache);
> +int realm_map_non_secure(struct realm *realm,
> +			 unsigned long ipa,
> +			 struct page *page,
> +			 unsigned long map_size,
> +			 struct kvm_mmu_memory_cache *memcache);
>  int realm_set_ipa_state(struct kvm_vcpu *vcpu,
>  			unsigned long addr, unsigned long end,
>  			unsigned long ripas,
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 620d26810019..eb8b8d013f3e 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -325,8 +325,13 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
>  
>  	lockdep_assert_held_write(&kvm->mmu_lock);
>  	WARN_ON(size & ~PAGE_MASK);
> -	WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap,
> -				   may_block));
> +
> +	if (kvm_is_realm(kvm))
> +		kvm_realm_unmap_range(kvm, start, size, !only_shared);
> +	else
> +		WARN_ON(stage2_apply_range(mmu, start, end,
> +					   kvm_pgtable_stage2_unmap,
> +					   may_block));
>  }
>  
>  void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
> @@ -345,7 +350,10 @@ static void stage2_flush_memslot(struct kvm *kvm,
>  	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
>  	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
>  
> -	kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
> +	if (kvm_is_realm(kvm))
> +		kvm_realm_unmap_range(kvm, addr, end - addr, false);
> +	else
> +		kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
>  }
>  
>  /**
> @@ -1037,6 +1045,10 @@ void stage2_unmap_vm(struct kvm *kvm)
>  	struct kvm_memory_slot *memslot;
>  	int idx, bkt;
>  
> +	/* For realms this is handled by the RMM so nothing to do here */
> +	if (kvm_is_realm(kvm))
> +		return;
> +
>  	idx = srcu_read_lock(&kvm->srcu);
>  	mmap_read_lock(current->mm);
>  	write_lock(&kvm->mmu_lock);
> @@ -1062,6 +1074,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>  	if (kvm_is_realm(kvm) &&
>  	    (kvm_realm_state(kvm) != REALM_STATE_DEAD &&
>  	     kvm_realm_state(kvm) != REALM_STATE_NONE)) {
> +		kvm_stage2_unmap_range(mmu, 0, (~0ULL) & PAGE_MASK);
>  		write_unlock(&kvm->mmu_lock);
>  		kvm_realm_destroy_rtts(kvm, pgt->ia_bits);
>  		return;
> @@ -1428,6 +1441,71 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
>  	return vma->vm_flags & VM_MTE_ALLOWED;
>  }
>  
> +static int realm_map_ipa(struct kvm *kvm, phys_addr_t ipa,
> +			 kvm_pfn_t pfn, unsigned long map_size,
> +			 enum kvm_pgtable_prot prot,
> +			 struct kvm_mmu_memory_cache *memcache)
> +{
> +	struct realm *realm = &kvm->arch.realm;
> +	struct page *page = pfn_to_page(pfn);
> +
> +	if (WARN_ON(!(prot & KVM_PGTABLE_PROT_W)))
> +		return -EFAULT;
> +
> +	if (!realm_is_addr_protected(realm, ipa))
> +		return realm_map_non_secure(realm, ipa, page, map_size,
> +					    memcache);
> +
> +	return realm_map_protected(realm, ipa, page, map_size, memcache);
> +}
> +
> +static int private_memslot_fault(struct kvm_vcpu *vcpu,
> +				 phys_addr_t fault_ipa,
> +				 struct kvm_memory_slot *memslot)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(kvm);
> +	gfn_t gfn = (fault_ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
> +	bool is_priv_gfn = !((fault_ipa & gpa_stolen_mask) == gpa_stolen_mask);
> +	bool priv_exists = kvm_mem_is_private(kvm, gfn);
> +	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
> +	kvm_pfn_t pfn;
> +	int ret;
> +
> +	if (priv_exists != is_priv_gfn) {
> +		kvm_prepare_memory_fault_exit(vcpu,
> +					      fault_ipa & ~gpa_stolen_mask,
> +					      PAGE_SIZE,
> +					      kvm_is_write_fault(vcpu),
> +					      false, is_priv_gfn);
> +
> +		return 0;
> +	}
> +
> +	if (!is_priv_gfn) {
> +		/* Not a private mapping, handling normally */
> +		return -EAGAIN;
> +	}
>

Instead of that EAGAIN, is it better to handle it as below?

 arch/arm64/kvm/mmu.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 1eddbc7d7156..33ef95b5c94a 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1480,11 +1480,6 @@ static int private_memslot_fault(struct kvm_vcpu *vcpu,
 		return 0;
 	}
 
-	if (!is_priv_gfn) {
-		/* Not a private mapping, handling normally */
-		return -EAGAIN;
-	}
-
 	ret = kvm_mmu_topup_memory_cache(memcache,
 					 kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu));
 	if (ret)
@@ -1925,12 +1920,25 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
 	gfn = kvm_gpa_from_fault(vcpu->kvm, ipa) >> PAGE_SHIFT;
 	memslot = gfn_to_memslot(vcpu->kvm, gfn);
 
-	if (kvm_slot_can_be_private(memslot)) {
-		ret = private_memslot_fault(vcpu, fault_ipa, memslot);
-		if (ret != -EAGAIN)
+	if (kvm_slot_can_be_private(memslot) &&
+	    kvm_is_private_gpa(vcpu->kvm, ipa)) {
+		ret = private_memslot_fault(vcpu, ipa, memslot);
 			goto out;
 	}
+	/* attribute msimatch. shared access fault on a mem with private attribute */
+	if (kvm_mem_is_private(vcpu->kvm, gfn)) {
+		/* let VMM fixup the memory attribute */
+		kvm_prepare_memory_fault_exit(vcpu,
+					      kvm_gpa_from_fault(vcpu->kvm, ipa),
+					      PAGE_SIZE,
+					      kvm_is_write_fault(vcpu),
+					      false, false);
+
+		ret =  0;
+		goto out;
+	}
 
+	/* Slot can be be private, but fault addr is not, handle that as normal fault */
 	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
 	write_fault = kvm_is_write_fault(vcpu);
 	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
Steven Price Aug. 22, 2024, 3:14 p.m. UTC | #3
On 22/08/2024 04:32, Aneesh Kumar K.V wrote:
> Steven Price <steven.price@arm.com> writes:
> 
>> At runtime if the realm guest accesses memory which hasn't yet been
>> mapped then KVM needs to either populate the region or fault the guest.
>>
>> For memory in the lower (protected) region of IPA a fresh page is
>> provided to the RMM which will zero the contents. For memory in the
>> upper (shared) region of IPA, the memory from the memslot is mapped
>> into the realm VM non secure.
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> Changes since v2:
>>  * Avoid leaking memory if failing to map it in the realm.
>>  * Correctly mask RTT based on LPA2 flag (see rtt_get_phys()).
>>  * Adapt to changes in previous patches.
>>
> 
> ....
> 
>> -	gfn = ipa >> PAGE_SHIFT;
>> +	gfn = (ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
>>  	memslot = gfn_to_memslot(vcpu->kvm, gfn);
>> +
>> +	if (kvm_slot_can_be_private(memslot)) {
>> +		ret = private_memslot_fault(vcpu, fault_ipa, memslot);
>> +		if (ret != -EAGAIN)
>> +			goto out;
>> +	}
>>
> 
> Shouldn't this be s/fault_ipa/ipa ?

Well they should both be the same unless we're in some scary parallel
universe where we have nested virtualisation *and* realms at the same
time (shudder!). But yes "ipa" would be more consistent so I'll change it!

Steve

> 	ret = private_memslot_fault(vcpu, ipa, memslot);
> 
> -aneesh
Steven Price Aug. 22, 2024, 3:40 p.m. UTC | #4
On 22/08/2024 04:50, Aneesh Kumar K.V wrote:
> Steven Price <steven.price@arm.com> writes:
> 
>> At runtime if the realm guest accesses memory which hasn't yet been
>> mapped then KVM needs to either populate the region or fault the guest.
>>
>> For memory in the lower (protected) region of IPA a fresh page is
>> provided to the RMM which will zero the contents. For memory in the
>> upper (shared) region of IPA, the memory from the memslot is mapped
>> into the realm VM non secure.
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> Changes since v2:
>>  * Avoid leaking memory if failing to map it in the realm.
>>  * Correctly mask RTT based on LPA2 flag (see rtt_get_phys()).
>>  * Adapt to changes in previous patches.
>> ---
>>  arch/arm64/include/asm/kvm_emulate.h |  10 ++
>>  arch/arm64/include/asm/kvm_rme.h     |  10 ++
>>  arch/arm64/kvm/mmu.c                 | 120 +++++++++++++++-
>>  arch/arm64/kvm/rme.c                 | 205 +++++++++++++++++++++++++--
>>  4 files changed, 325 insertions(+), 20 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
>> index 7430c77574e3..0b50572d3719 100644
>> --- a/arch/arm64/include/asm/kvm_emulate.h
>> +++ b/arch/arm64/include/asm/kvm_emulate.h
>> @@ -710,6 +710,16 @@ static inline bool kvm_realm_is_created(struct kvm *kvm)
>>  	return kvm_is_realm(kvm) && kvm_realm_state(kvm) != REALM_STATE_NONE;
>>  }
>>  
>> +static inline gpa_t kvm_gpa_stolen_bits(struct kvm *kvm)
>> +{
>> +	if (kvm_is_realm(kvm)) {
>> +		struct realm *realm = &kvm->arch.realm;
>> +
>> +		return BIT(realm->ia_bits - 1);
>> +	}
>> +	return 0;
>> +}
>> +
>>  static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu)
>>  {
>>  	if (static_branch_unlikely(&kvm_rme_is_available))
>> diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h
>> index 0e44b20cfa48..c50854f44674 100644
>> --- a/arch/arm64/include/asm/kvm_rme.h
>> +++ b/arch/arm64/include/asm/kvm_rme.h
>> @@ -103,6 +103,16 @@ void kvm_realm_unmap_range(struct kvm *kvm,
>>  			   unsigned long ipa,
>>  			   u64 size,
>>  			   bool unmap_private);
>> +int realm_map_protected(struct realm *realm,
>> +			unsigned long base_ipa,
>> +			struct page *dst_page,
>> +			unsigned long map_size,
>> +			struct kvm_mmu_memory_cache *memcache);
>> +int realm_map_non_secure(struct realm *realm,
>> +			 unsigned long ipa,
>> +			 struct page *page,
>> +			 unsigned long map_size,
>> +			 struct kvm_mmu_memory_cache *memcache);
>>  int realm_set_ipa_state(struct kvm_vcpu *vcpu,
>>  			unsigned long addr, unsigned long end,
>>  			unsigned long ripas,
>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>> index 620d26810019..eb8b8d013f3e 100644
>> --- a/arch/arm64/kvm/mmu.c
>> +++ b/arch/arm64/kvm/mmu.c
>> @@ -325,8 +325,13 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
>>  
>>  	lockdep_assert_held_write(&kvm->mmu_lock);
>>  	WARN_ON(size & ~PAGE_MASK);
>> -	WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap,
>> -				   may_block));
>> +
>> +	if (kvm_is_realm(kvm))
>> +		kvm_realm_unmap_range(kvm, start, size, !only_shared);
>> +	else
>> +		WARN_ON(stage2_apply_range(mmu, start, end,
>> +					   kvm_pgtable_stage2_unmap,
>> +					   may_block));
>>  }
>>  
>>  void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
>> @@ -345,7 +350,10 @@ static void stage2_flush_memslot(struct kvm *kvm,
>>  	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
>>  	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
>>  
>> -	kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
>> +	if (kvm_is_realm(kvm))
>> +		kvm_realm_unmap_range(kvm, addr, end - addr, false);
>> +	else
>> +		kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
>>  }
>>  
>>  /**
>> @@ -1037,6 +1045,10 @@ void stage2_unmap_vm(struct kvm *kvm)
>>  	struct kvm_memory_slot *memslot;
>>  	int idx, bkt;
>>  
>> +	/* For realms this is handled by the RMM so nothing to do here */
>> +	if (kvm_is_realm(kvm))
>> +		return;
>> +
>>  	idx = srcu_read_lock(&kvm->srcu);
>>  	mmap_read_lock(current->mm);
>>  	write_lock(&kvm->mmu_lock);
>> @@ -1062,6 +1074,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>>  	if (kvm_is_realm(kvm) &&
>>  	    (kvm_realm_state(kvm) != REALM_STATE_DEAD &&
>>  	     kvm_realm_state(kvm) != REALM_STATE_NONE)) {
>> +		kvm_stage2_unmap_range(mmu, 0, (~0ULL) & PAGE_MASK);
>>  		write_unlock(&kvm->mmu_lock);
>>  		kvm_realm_destroy_rtts(kvm, pgt->ia_bits);
>>  		return;
>> @@ -1428,6 +1441,71 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
>>  	return vma->vm_flags & VM_MTE_ALLOWED;
>>  }
>>  
>> +static int realm_map_ipa(struct kvm *kvm, phys_addr_t ipa,
>> +			 kvm_pfn_t pfn, unsigned long map_size,
>> +			 enum kvm_pgtable_prot prot,
>> +			 struct kvm_mmu_memory_cache *memcache)
>> +{
>> +	struct realm *realm = &kvm->arch.realm;
>> +	struct page *page = pfn_to_page(pfn);
>> +
>> +	if (WARN_ON(!(prot & KVM_PGTABLE_PROT_W)))
>> +		return -EFAULT;
>> +
>> +	if (!realm_is_addr_protected(realm, ipa))
>> +		return realm_map_non_secure(realm, ipa, page, map_size,
>> +					    memcache);
>> +
>> +	return realm_map_protected(realm, ipa, page, map_size, memcache);
>> +}
>> +
>> +static int private_memslot_fault(struct kvm_vcpu *vcpu,
>> +				 phys_addr_t fault_ipa,
>> +				 struct kvm_memory_slot *memslot)
>> +{
>> +	struct kvm *kvm = vcpu->kvm;
>> +	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(kvm);
>> +	gfn_t gfn = (fault_ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
>> +	bool is_priv_gfn = !((fault_ipa & gpa_stolen_mask) == gpa_stolen_mask);
>> +	bool priv_exists = kvm_mem_is_private(kvm, gfn);
>> +	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
>> +	kvm_pfn_t pfn;
>> +	int ret;
>> +
>> +	if (priv_exists != is_priv_gfn) {
>> +		kvm_prepare_memory_fault_exit(vcpu,
>> +					      fault_ipa & ~gpa_stolen_mask,
>> +					      PAGE_SIZE,
>> +					      kvm_is_write_fault(vcpu),
>> +					      false, is_priv_gfn);
>> +
>> +		return 0;
>> +	}
>> +
>> +	if (!is_priv_gfn) {
>> +		/* Not a private mapping, handling normally */
>> +		return -EAGAIN;
>> +	}
>>
> 
> Instead of that EAGAIN, is it better to handle it as below?

I'm not finding the below easier to read.

>  arch/arm64/kvm/mmu.c | 24 ++++++++++++++++--------
>  1 file changed, 16 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 1eddbc7d7156..33ef95b5c94a 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1480,11 +1480,6 @@ static int private_memslot_fault(struct kvm_vcpu *vcpu,
>  		return 0;
>  	}
>  
> -	if (!is_priv_gfn) {
> -		/* Not a private mapping, handling normally */
> -		return -EAGAIN;
> -	}
> -
>  	ret = kvm_mmu_topup_memory_cache(memcache,
>  					 kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu));
>  	if (ret)
> @@ -1925,12 +1920,25 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>  	gfn = kvm_gpa_from_fault(vcpu->kvm, ipa) >> PAGE_SHIFT;
>  	memslot = gfn_to_memslot(vcpu->kvm, gfn);
>  
> -	if (kvm_slot_can_be_private(memslot)) {
> -		ret = private_memslot_fault(vcpu, fault_ipa, memslot);
> -		if (ret != -EAGAIN)
> +	if (kvm_slot_can_be_private(memslot) &&
> +	    kvm_is_private_gpa(vcpu->kvm, ipa)) {

I presume kvm_is_private_gpa() is defined as something like:

static bool kvm_is_private_gpa(kvm, phys_addr_t ipa)
{
	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(kvm);
	return  !((ipa & gpa_stolen_mask) == gpa_stolen_mask);
}

> +		ret = private_memslot_fault(vcpu, ipa, memslot);

This handles the case in private_memslot_fault() where is_priv_gfn is
true, so there's a little bit of simplification in
private_memslot_fault().

>  			goto out;
>  	}
> +	/* attribute mismatch: shared access fault on a mem with private attribute */
> +	if (kvm_mem_is_private(vcpu->kvm, gfn)) {
> +		/* let VMM fixup the memory attribute */
> +		kvm_prepare_memory_fault_exit(vcpu,
> +					      kvm_gpa_from_fault(vcpu->kvm, ipa),
> +					      PAGE_SIZE,
> +					      kvm_is_write_fault(vcpu),
> +					      false, false);

And then we have to duplicate the code here for calling
kvm_prepare_memory_fault_exit(), which seems a bit ugly to me. Am I
missing something? Your patch doesn't seem complete.

> +
> +		ret =  0;
> +		goto out;
> +	}
>  
> +	/* Slot can be private, but fault addr is not, handle that as normal fault */
>  	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
>  	write_fault = kvm_is_write_fault(vcpu);
>  	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {


Note your email had a signature line "--" here which causes my email 
client to remove the rest of your reply - it's worth dropping that from 
the git output when sending diffs. I've attempted to include your
second diff below manually.

> Instead of referring to this as stolen bits, is it better to do
> 
>  arch/arm64/include/asm/kvm_emulate.h | 20 +++++++++++++++++---
>  arch/arm64/kvm/mmu.c                 | 21 ++++++++-------------
>  2 files changed, 25 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index 0b50572d3719..790412fd53b8 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -710,14 +710,28 @@ static inline bool kvm_realm_is_created(struct kvm *kvm)
>  	return kvm_is_realm(kvm) && kvm_realm_state(kvm) != REALM_STATE_NONE;
>  }
>  
> -static inline gpa_t kvm_gpa_stolen_bits(struct kvm *kvm)
> +static inline gpa_t kvm_gpa_from_fault(struct kvm *kvm, phys_addr_t fault_addr)
>  {
> +	gpa_t addr_mask;
> +
>  	if (kvm_is_realm(kvm)) {
>  		struct realm *realm = &kvm->arch.realm;
>  
> -		return BIT(realm->ia_bits - 1);
> +		addr_mask = BIT(realm->ia_bits - 1);
> +		/* clear shared bit and return */
> +		return fault_addr & ~addr_mask;
>  	}
> -	return 0;
> +	return fault_addr;
> +}
> +
> +static inline bool kvm_is_private_gpa(struct kvm *kvm, phys_addr_t fault_addr)
> +{
> +	/*
> +	 * For Realms, the shared address is an alias of the private GPA
> +	 * with top bit set and we have a single address space. Thus if the
> +	 * fault address matches the GPA, it is the private GPA
> +	 */
> +	return fault_addr == kvm_gpa_from_fault(kvm, fault_addr);
>  }

Ah, so here's the missing function from above.

>  
>  static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu)
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index eb8b8d013f3e..1eddbc7d7156 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1464,20 +1464,18 @@ static int private_memslot_fault(struct kvm_vcpu *vcpu,
>  				 struct kvm_memory_slot *memslot)
>  {
>  	struct kvm *kvm = vcpu->kvm;
> -	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(kvm);
> -	gfn_t gfn = (fault_ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
> -	bool is_priv_gfn = !((fault_ipa & gpa_stolen_mask) == gpa_stolen_mask);
> -	bool priv_exists = kvm_mem_is_private(kvm, gfn);
> +	gfn_t gfn = kvm_gpa_from_fault(kvm, fault_ipa) >> PAGE_SHIFT;
>  	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
>  	kvm_pfn_t pfn;
>  	int ret;
>  
> -	if (priv_exists != is_priv_gfn) {
> +	if (!kvm_mem_is_private(kvm, gfn)) {
> +		/* let VMM fixup the memory attribute */
>  		kvm_prepare_memory_fault_exit(vcpu,
> -					      fault_ipa & ~gpa_stolen_mask,
> +					      kvm_gpa_from_fault(kvm, fault_ipa),
>  					      PAGE_SIZE,
>  					      kvm_is_write_fault(vcpu),
> -					      false, is_priv_gfn);
> +					      false, true);
>  
>  		return 0;
>  	}
> @@ -1527,7 +1525,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	long vma_pagesize, fault_granule;
>  	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
>  	struct kvm_pgtable *pgt;
> -	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(vcpu->kvm);
>  
>  	if (fault_is_perm)
>  		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
> @@ -1640,7 +1637,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
>  		fault_ipa &= ~(vma_pagesize - 1);
>  
> -	gfn = (ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
> +	gfn = kvm_gpa_from_fault(kvm, ipa) >> PAGE_SHIFT;
>  	mte_allowed = kvm_vma_mte_allowed(vma);
>  
>  	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> @@ -1835,7 +1832,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>  	struct kvm_memory_slot *memslot;
>  	unsigned long hva;
>  	bool is_iabt, write_fault, writable;
> -	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(vcpu->kvm);
>  	gfn_t gfn;
>  	int ret, idx;
>  
> @@ -1926,7 +1922,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>  		nested = &nested_trans;
>  	}
>  
> -	gfn = (ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
> +	gfn = kvm_gpa_from_fault(vcpu->kvm, ipa) >> PAGE_SHIFT;
>  	memslot = gfn_to_memslot(vcpu->kvm, gfn);
>  
>  	if (kvm_slot_can_be_private(memslot)) {
> @@ -1978,8 +1974,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>  		 * of the page size.
>  		 */
>  		ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
> -		ipa &= ~gpa_stolen_mask;
> -		ret = io_mem_abort(vcpu, ipa);
> +		ret = io_mem_abort(vcpu, kvm_gpa_from_fault(vcpu->kvm, ipa));
>  		goto out_unlock;
>  	}

I can see your point that kvm_gpa_from_fault() makes sense. I'm still
not convinced about the duplication of the kvm_prepare_memory_fault_exit()
call though.

How about the following (untested):

diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 0b50572d3719..fa03520d7933 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -710,14 +710,14 @@ static inline bool kvm_realm_is_created(struct kvm *kvm)
 	return kvm_is_realm(kvm) && kvm_realm_state(kvm) != REALM_STATE_NONE;
 }
 
-static inline gpa_t kvm_gpa_stolen_bits(struct kvm *kvm)
+static inline gpa_t kvm_gpa_from_fault(struct kvm *kvm, phys_addr_t fault_ipa)
 {
 	if (kvm_is_realm(kvm)) {
 		struct realm *realm = &kvm->arch.realm;
 
-		return BIT(realm->ia_bits - 1);
+		return fault_ipa & ~BIT(realm->ia_bits - 1);
 	}
-	return 0;
+	return fault_ipa;
 }
 
 static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index d7e8b0c4f2a3..c0a3054201a9 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1468,9 +1468,9 @@ static int private_memslot_fault(struct kvm_vcpu *vcpu,
 				 struct kvm_memory_slot *memslot)
 {
 	struct kvm *kvm = vcpu->kvm;
-	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(kvm);
-	gfn_t gfn = (fault_ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
-	bool is_priv_gfn = !((fault_ipa & gpa_stolen_mask) == gpa_stolen_mask);
+	gpa_t gpa = kvm_gpa_from_fault(kvm, fault_ipa);
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	bool is_priv_gfn = (gpa == fault_ipa);
 	bool priv_exists = kvm_mem_is_private(kvm, gfn);
 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
 	kvm_pfn_t pfn;
@@ -1478,7 +1478,7 @@ static int private_memslot_fault(struct kvm_vcpu *vcpu,
 
 	if (priv_exists != is_priv_gfn) {
 		kvm_prepare_memory_fault_exit(vcpu,
-					      fault_ipa & ~gpa_stolen_mask,
+					      gpa,
 					      PAGE_SIZE,
 					      kvm_is_write_fault(vcpu),
 					      false, is_priv_gfn);
@@ -1531,7 +1531,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	long vma_pagesize, fault_granule;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
 	struct kvm_pgtable *pgt;
-	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(vcpu->kvm);
 
 	if (fault_is_perm)
 		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
@@ -1648,7 +1647,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
 		fault_ipa &= ~(vma_pagesize - 1);
 
-	gfn = (ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
+	gfn = kvm_gpa_from_fault(kvm, ipa) >> PAGE_SHIFT;
 	mte_allowed = kvm_vma_mte_allowed(vma);
 
 	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
@@ -1843,7 +1842,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
 	struct kvm_memory_slot *memslot;
 	unsigned long hva;
 	bool is_iabt, write_fault, writable;
-	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(vcpu->kvm);
 	gfn_t gfn;
 	int ret, idx;
 
@@ -1934,7 +1932,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
 		nested = &nested_trans;
 	}
 
-	gfn = (ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
+	gfn = kvm_gpa_from_fault(vcpu->kvm, ipa) >> PAGE_SHIFT;
 	memslot = gfn_to_memslot(vcpu->kvm, gfn);
 
 	if (kvm_slot_can_be_private(memslot)) {
@@ -1986,8 +1984,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
 		 * of the page size.
 		 */
 		ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
-		ipa &= ~gpa_stolen_mask;
-		ret = io_mem_abort(vcpu, ipa);
+		ret = io_mem_abort(vcpu, kvm_gpa_from_fault(vcpu->kvm, ipa));
 		goto out_unlock;
 	}
 

Thanks,
Steve
Aneesh Kumar K.V Aug. 23, 2024, 4:30 a.m. UTC | #5
Steven Price <steven.price@arm.com> writes:

> On 22/08/2024 04:50, Aneesh Kumar K.V wrote:
>> Steven Price <steven.price@arm.com> writes:
>>
>>> At runtime if the realm guest accesses memory which hasn't yet been
>>> mapped then KVM needs to either populate the region or fault the guest.
>>>
>>> For memory in the lower (protected) region of IPA a fresh page is
>>> provided to the RMM which will zero the contents. For memory in the
>>> upper (shared) region of IPA, the memory from the memslot is mapped
>>> into the realm VM non secure.
>>>
>>> Signed-off-by: Steven Price <steven.price@arm.com>
>>> ---
>>> Changes since v2:
>>>  * Avoid leaking memory if failing to map it in the realm.
>>>  * Correctly mask RTT based on LPA2 flag (see rtt_get_phys()).
>>>  * Adapt to changes in previous patches.
>>> ---
>>>  arch/arm64/include/asm/kvm_emulate.h |  10 ++
>>>  arch/arm64/include/asm/kvm_rme.h     |  10 ++
>>>  arch/arm64/kvm/mmu.c                 | 120 +++++++++++++++-
>>>  arch/arm64/kvm/rme.c                 | 205 +++++++++++++++++++++++++--
>>>  4 files changed, 325 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
>>> index 7430c77574e3..0b50572d3719 100644
>>> --- a/arch/arm64/include/asm/kvm_emulate.h
>>> +++ b/arch/arm64/include/asm/kvm_emulate.h
>>> @@ -710,6 +710,16 @@ static inline bool kvm_realm_is_created(struct kvm *kvm)
>>>  	return kvm_is_realm(kvm) && kvm_realm_state(kvm) != REALM_STATE_NONE;
>>>  }
>>>
>>> +static inline gpa_t kvm_gpa_stolen_bits(struct kvm *kvm)
>>> +{
>>> +	if (kvm_is_realm(kvm)) {
>>> +		struct realm *realm = &kvm->arch.realm;
>>> +
>>> +		return BIT(realm->ia_bits - 1);
>>> +	}
>>> +	return 0;
>>> +}
>>> +
>>>  static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu)
>>>  {
>>>  	if (static_branch_unlikely(&kvm_rme_is_available))
>>> diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h
>>> index 0e44b20cfa48..c50854f44674 100644
>>> --- a/arch/arm64/include/asm/kvm_rme.h
>>> +++ b/arch/arm64/include/asm/kvm_rme.h
>>> @@ -103,6 +103,16 @@ void kvm_realm_unmap_range(struct kvm *kvm,
>>>  			   unsigned long ipa,
>>>  			   u64 size,
>>>  			   bool unmap_private);
>>> +int realm_map_protected(struct realm *realm,
>>> +			unsigned long base_ipa,
>>> +			struct page *dst_page,
>>> +			unsigned long map_size,
>>> +			struct kvm_mmu_memory_cache *memcache);
>>> +int realm_map_non_secure(struct realm *realm,
>>> +			 unsigned long ipa,
>>> +			 struct page *page,
>>> +			 unsigned long map_size,
>>> +			 struct kvm_mmu_memory_cache *memcache);
>>>  int realm_set_ipa_state(struct kvm_vcpu *vcpu,
>>>  			unsigned long addr, unsigned long end,
>>>  			unsigned long ripas,
>>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>>> index 620d26810019..eb8b8d013f3e 100644
>>> --- a/arch/arm64/kvm/mmu.c
>>> +++ b/arch/arm64/kvm/mmu.c
>>> @@ -325,8 +325,13 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
>>>
>>>  	lockdep_assert_held_write(&kvm->mmu_lock);
>>>  	WARN_ON(size & ~PAGE_MASK);
>>> -	WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap,
>>> -				   may_block));
>>> +
>>> +	if (kvm_is_realm(kvm))
>>> +		kvm_realm_unmap_range(kvm, start, size, !only_shared);
>>> +	else
>>> +		WARN_ON(stage2_apply_range(mmu, start, end,
>>> +					   kvm_pgtable_stage2_unmap,
>>> +					   may_block));
>>>  }
>>>
>>>  void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
>>> @@ -345,7 +350,10 @@ static void stage2_flush_memslot(struct kvm *kvm,
>>>  	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
>>>  	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
>>>
>>> -	kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
>>> +	if (kvm_is_realm(kvm))
>>> +		kvm_realm_unmap_range(kvm, addr, end - addr, false);
>>> +	else
>>> +		kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
>>>  }
>>>
>>>  /**
>>> @@ -1037,6 +1045,10 @@ void stage2_unmap_vm(struct kvm *kvm)
>>>  	struct kvm_memory_slot *memslot;
>>>  	int idx, bkt;
>>>
>>> +	/* For realms this is handled by the RMM so nothing to do here */
>>> +	if (kvm_is_realm(kvm))
>>> +		return;
>>> +
>>>  	idx = srcu_read_lock(&kvm->srcu);
>>>  	mmap_read_lock(current->mm);
>>>  	write_lock(&kvm->mmu_lock);
>>> @@ -1062,6 +1074,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>>>  	if (kvm_is_realm(kvm) &&
>>>  	    (kvm_realm_state(kvm) != REALM_STATE_DEAD &&
>>>  	     kvm_realm_state(kvm) != REALM_STATE_NONE)) {
>>> +		kvm_stage2_unmap_range(mmu, 0, (~0ULL) & PAGE_MASK);
>>>  		write_unlock(&kvm->mmu_lock);
>>>  		kvm_realm_destroy_rtts(kvm, pgt->ia_bits);
>>>  		return;
>>> @@ -1428,6 +1441,71 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
>>>  	return vma->vm_flags & VM_MTE_ALLOWED;
>>>  }
>>>
>>> +static int realm_map_ipa(struct kvm *kvm, phys_addr_t ipa,
>>> +			 kvm_pfn_t pfn, unsigned long map_size,
>>> +			 enum kvm_pgtable_prot prot,
>>> +			 struct kvm_mmu_memory_cache *memcache)
>>> +{
>>> +	struct realm *realm = &kvm->arch.realm;
>>> +	struct page *page = pfn_to_page(pfn);
>>> +
>>> +	if (WARN_ON(!(prot & KVM_PGTABLE_PROT_W)))
>>> +		return -EFAULT;
>>> +
>>> +	if (!realm_is_addr_protected(realm, ipa))
>>> +		return realm_map_non_secure(realm, ipa, page, map_size,
>>> +					    memcache);
>>> +
>>> +	return realm_map_protected(realm, ipa, page, map_size, memcache);
>>> +}
>>> +
>>> +static int private_memslot_fault(struct kvm_vcpu *vcpu,
>>> +				 phys_addr_t fault_ipa,
>>> +				 struct kvm_memory_slot *memslot)
>>> +{
>>> +	struct kvm *kvm = vcpu->kvm;
>>> +	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(kvm);
>>> +	gfn_t gfn = (fault_ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
>>> +	bool is_priv_gfn = !((fault_ipa & gpa_stolen_mask) == gpa_stolen_mask);
>>> +	bool priv_exists = kvm_mem_is_private(kvm, gfn);
>>> +	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
>>> +	kvm_pfn_t pfn;
>>> +	int ret;
>>> +
>>> +	if (priv_exists != is_priv_gfn) {
>>> +		kvm_prepare_memory_fault_exit(vcpu,
>>> +					      fault_ipa & ~gpa_stolen_mask,
>>> +					      PAGE_SIZE,
>>> +					      kvm_is_write_fault(vcpu),
>>> +					      false, is_priv_gfn);
>>> +
>>> +		return 0;
>>> +	}
>>> +
>>> +	if (!is_priv_gfn) {
>>> +		/* Not a private mapping, handling normally */
>>> +		return -EAGAIN;
>>> +	}
>>>
>>
>> Instead of that EAGAIN, is it better to handle it as below?
>
> I'm not finding the below easier to read.
>
>>  arch/arm64/kvm/mmu.c | 24 ++++++++++++++++--------
>>  1 file changed, 16 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>> index 1eddbc7d7156..33ef95b5c94a 100644
>> --- a/arch/arm64/kvm/mmu.c
>> +++ b/arch/arm64/kvm/mmu.c
>> @@ -1480,11 +1480,6 @@ static int private_memslot_fault(struct kvm_vcpu *vcpu,
>>  		return 0;
>>  	}
>>
>> -	if (!is_priv_gfn) {
>> -		/* Not a private mapping, handling normally */
>> -		return -EAGAIN;
>> -	}
>> -
>>  	ret = kvm_mmu_topup_memory_cache(memcache,
>>  					 kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu));
>>  	if (ret)
>> @@ -1925,12 +1920,25 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>>  	gfn = kvm_gpa_from_fault(vcpu->kvm, ipa) >> PAGE_SHIFT;
>>  	memslot = gfn_to_memslot(vcpu->kvm, gfn);
>>
>> -	if (kvm_slot_can_be_private(memslot)) {
>> -		ret = private_memslot_fault(vcpu, fault_ipa, memslot);
>> -		if (ret != -EAGAIN)
>> +	if (kvm_slot_can_be_private(memslot) &&
>> +	    kvm_is_private_gpa(vcpu->kvm, ipa)) {
>
> I presume kvm_is_private_gpa() is defined as something like:
>
> static bool kvm_is_private_gpa(kvm, phys_addr_t ipa)
> {
> 	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(kvm);
> 	return  !((ipa & gpa_stolen_mask) == gpa_stolen_mask);
> }
>
>> +		ret = private_memslot_fault(vcpu, ipa, memslot);
>
> This handles the case in private_memslot_fault() where is_priv_gfn is
> true, so there's a little bit of simplification in
> private_memslot_fault().
>
>>  			goto out;
>>  	}
>> +	/* attribute mismatch: shared access fault on a mem with private attribute */
>> +	if (kvm_mem_is_private(vcpu->kvm, gfn)) {
>> +		/* let VMM fixup the memory attribute */
>> +		kvm_prepare_memory_fault_exit(vcpu,
>> +					      kvm_gpa_from_fault(vcpu->kvm, ipa),
>> +					      PAGE_SIZE,
>> +					      kvm_is_write_fault(vcpu),
>> +					      false, false);
>
> And then we have to duplicate the code here for calling
> kvm_prepare_memory_fault_exit(), which seems a bit ugly to me. Am I
> missing something? Your patch doesn't seem complete.
>


What confused me was that I was reading EAGAIN as "retry the access".
But here it is not a retry; it is an error condition meaning the fault
should instead be handled as a shared access fault. IMHO having that
check in the caller makes it simpler, i.e.:

if it is a private fault, private_memslot_fault() handles it, with the
fault exit condition indicating an attribute mismatch;

if it is a shared fault, the existing fault handling code handles it,
with the fault exit condition indicating an attribute mismatch.

If you find the change not clean, can we rename the error to EINVAL?
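
Putting my two diffs together, the net flow in kvm_handle_guest_abort()
would be roughly the below (untested sketch; kvm_is_private_gpa() and
kvm_gpa_from_fault() as defined in my second diff):

	gfn = kvm_gpa_from_fault(vcpu->kvm, ipa) >> PAGE_SHIFT;
	memslot = gfn_to_memslot(vcpu->kvm, gfn);

	/* Private GPA in a slot that can be private: populate via the RMM */
	if (kvm_slot_can_be_private(memslot) &&
	    kvm_is_private_gpa(vcpu->kvm, ipa)) {
		ret = private_memslot_fault(vcpu, ipa, memslot);
		goto out;
	}

	/*
	 * Shared access to memory whose attribute is private: exit to the
	 * VMM with KVM_EXIT_MEMORY_FAULT so it can fix up the attribute.
	 */
	if (kvm_mem_is_private(vcpu->kvm, gfn)) {
		kvm_prepare_memory_fault_exit(vcpu,
					      kvm_gpa_from_fault(vcpu->kvm, ipa),
					      PAGE_SIZE,
					      kvm_is_write_fault(vcpu),
					      false, false);
		ret = 0;
		goto out;
	}

	/* Shared access to shared memory: existing fault handling below */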

>
>> +
>> +		ret =  0;
>> +		goto out;
>> +	}
>>
>> +	/* Slot can be private, but fault addr is not, handle that as normal fault */
>>  	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
>>  	write_fault = kvm_is_write_fault(vcpu);
>>  	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
>
>
> Note your email had a signature line "--" here which causes my email
> client to remove the rest of your reply - it's worth dropping that from
> the git output when sending diffs. I've attempted to include your
> second diff below manually.
>
>> Instead of referring to this as stolen bits, is it better to do
>>
>>  arch/arm64/include/asm/kvm_emulate.h | 20 +++++++++++++++++---
>>  arch/arm64/kvm/mmu.c                 | 21 ++++++++-------------
>>  2 files changed, 25 insertions(+), 16 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
>> index 0b50572d3719..790412fd53b8 100644
>> --- a/arch/arm64/include/asm/kvm_emulate.h
>> +++ b/arch/arm64/include/asm/kvm_emulate.h
>> @@ -710,14 +710,28 @@ static inline bool kvm_realm_is_created(struct kvm *kvm)
>>  	return kvm_is_realm(kvm) && kvm_realm_state(kvm) != REALM_STATE_NONE;
>>  }
>>
>> -static inline gpa_t kvm_gpa_stolen_bits(struct kvm *kvm)
>> +static inline gpa_t kvm_gpa_from_fault(struct kvm *kvm, phys_addr_t fault_addr)
>>  {
>> +	gpa_t addr_mask;
>> +
>>  	if (kvm_is_realm(kvm)) {
>>  		struct realm *realm = &kvm->arch.realm;
>>
>> -		return BIT(realm->ia_bits - 1);
>> +		addr_mask = BIT(realm->ia_bits - 1);
>> +		/* clear shared bit and return */
>> +		return fault_addr & ~addr_mask;
>>  	}
>> -	return 0;
>> +	return fault_addr;
>> +}
>> +
>> +static inline bool kvm_is_private_gpa(struct kvm *kvm, phys_addr_t fault_addr)
>> +{
>> +	/*
>> +	 * For Realms, the shared address is an alias of the private GPA
>> +	 * with top bit set and we have a single address space. Thus if the
>> +	 * fault address matches the GPA, it is the private GPA
>> +	 */
>> +	return fault_addr == kvm_gpa_from_fault(kvm, fault_addr);
>>  }
>
> Ah, so here's the missing function from above.
>
>>
>>  static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu)
>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>> index eb8b8d013f3e..1eddbc7d7156 100644
>> --- a/arch/arm64/kvm/mmu.c
>> +++ b/arch/arm64/kvm/mmu.c
>> @@ -1464,20 +1464,18 @@ static int private_memslot_fault(struct kvm_vcpu *vcpu,
>>  				 struct kvm_memory_slot *memslot)
>>  {
>>  	struct kvm *kvm = vcpu->kvm;
>> -	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(kvm);
>> -	gfn_t gfn = (fault_ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
>> -	bool is_priv_gfn = !((fault_ipa & gpa_stolen_mask) == gpa_stolen_mask);
>> -	bool priv_exists = kvm_mem_is_private(kvm, gfn);
>> +	gfn_t gfn = kvm_gpa_from_fault(kvm, fault_ipa) >> PAGE_SHIFT;
>>  	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
>>  	kvm_pfn_t pfn;
>>  	int ret;
>>
>> -	if (priv_exists != is_priv_gfn) {
>> +	if (!kvm_mem_is_private(kvm, gfn)) {
>> +		/* let VMM fixup the memory attribute */
>>  		kvm_prepare_memory_fault_exit(vcpu,
>> -					      fault_ipa & ~gpa_stolen_mask,
>> +					      kvm_gpa_from_fault(kvm, fault_ipa),
>>  					      PAGE_SIZE,
>>  					      kvm_is_write_fault(vcpu),
>> -					      false, is_priv_gfn);
>> +					      false, true);
>>
>>  		return 0;
>>  	}
>> @@ -1527,7 +1525,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  	long vma_pagesize, fault_granule;
>>  	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
>>  	struct kvm_pgtable *pgt;
>> -	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(vcpu->kvm);
>>
>>  	if (fault_is_perm)
>>  		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
>> @@ -1640,7 +1637,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
>>  		fault_ipa &= ~(vma_pagesize - 1);
>>
>> -	gfn = (ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
>> +	gfn = kvm_gpa_from_fault(kvm, ipa) >> PAGE_SHIFT;
>>  	mte_allowed = kvm_vma_mte_allowed(vma);
>>
>>  	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
>> @@ -1835,7 +1832,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>>  	struct kvm_memory_slot *memslot;
>>  	unsigned long hva;
>>  	bool is_iabt, write_fault, writable;
>> -	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(vcpu->kvm);
>>  	gfn_t gfn;
>>  	int ret, idx;
>>
>> @@ -1926,7 +1922,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>>  		nested = &nested_trans;
>>  	}
>>
>> -	gfn = (ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
>> +	gfn = kvm_gpa_from_fault(vcpu->kvm, ipa) >> PAGE_SHIFT;
>>  	memslot = gfn_to_memslot(vcpu->kvm, gfn);
>>
>>  	if (kvm_slot_can_be_private(memslot)) {
>> @@ -1978,8 +1974,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>>  		 * of the page size.
>>  		 */
>>  		ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
>> -		ipa &= ~gpa_stolen_mask;
>> -		ret = io_mem_abort(vcpu, ipa);
>> +		ret = io_mem_abort(vcpu, kvm_gpa_from_fault(vcpu->kvm, ipa));
>>  		goto out_unlock;
>>  	}
>
> I can see your point that kvm_gpa_from_fault() makes sense. I'm still
> not convinced about the duplication of the kvm_prepare_memory_fault_exit()
> call though.
>
> How about the following (untested):
>
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index 0b50572d3719..fa03520d7933 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -710,14 +710,14 @@ static inline bool kvm_realm_is_created(struct kvm *kvm)
>  	return kvm_is_realm(kvm) && kvm_realm_state(kvm) != REALM_STATE_NONE;
>  }
>
> -static inline gpa_t kvm_gpa_stolen_bits(struct kvm *kvm)
> +static inline gpa_t kvm_gpa_from_fault(struct kvm *kvm, phys_addr_t fault_ipa)
>  {
>  	if (kvm_is_realm(kvm)) {
>  		struct realm *realm = &kvm->arch.realm;
>
> -		return BIT(realm->ia_bits - 1);
> +		return fault_ipa & ~BIT(realm->ia_bits - 1);
>  	}
> -	return 0;
> +	return fault_ipa;
>  }
>
>  static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu)
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index d7e8b0c4f2a3..c0a3054201a9 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1468,9 +1468,9 @@ static int private_memslot_fault(struct kvm_vcpu *vcpu,
>  				 struct kvm_memory_slot *memslot)
>  {
>  	struct kvm *kvm = vcpu->kvm;
> -	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(kvm);
> -	gfn_t gfn = (fault_ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
> -	bool is_priv_gfn = !((fault_ipa & gpa_stolen_mask) == gpa_stolen_mask);
> +	gpa_t gpa = kvm_gpa_from_fault(kvm, fault_ipa);
> +	gfn_t gfn = gpa >> PAGE_SHIFT;
> +	bool is_priv_gfn = (gpa == fault_ipa);
>  	bool priv_exists = kvm_mem_is_private(kvm, gfn);
>  	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
>  	kvm_pfn_t pfn;
> @@ -1478,7 +1478,7 @@ static int private_memslot_fault(struct kvm_vcpu *vcpu,
>
>  	if (priv_exists != is_priv_gfn) {
>

Can we also have a documented helper for this?

static inline bool kvm_is_private_gpa(struct kvm *kvm, phys_addr_t fault_addr)
{
	/*
	 * For Realms, the shared address is an alias of the private GPA
	 * with top bit set and we have a single address space. Thus if the
	 * fault address matches the GPA, it is the private GPA
	 */
	return fault_addr == kvm_gpa_from_fault(kvm, fault_addr);
}


>  		kvm_prepare_memory_fault_exit(vcpu,
> -					      fault_ipa & ~gpa_stolen_mask,
> +					      gpa,
>  					      PAGE_SIZE,
>  					      kvm_is_write_fault(vcpu),
>  					      false, is_priv_gfn);
> @@ -1531,7 +1531,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	long vma_pagesize, fault_granule;
>  	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
>  	struct kvm_pgtable *pgt;
> -	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(vcpu->kvm);
>
>  	if (fault_is_perm)
>  		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
> @@ -1648,7 +1647,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
>  		fault_ipa &= ~(vma_pagesize - 1);
>
> -	gfn = (ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
> +	gfn = kvm_gpa_from_fault(kvm, ipa) >> PAGE_SHIFT;
>  	mte_allowed = kvm_vma_mte_allowed(vma);
>
>  	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> @@ -1843,7 +1842,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>  	struct kvm_memory_slot *memslot;
>  	unsigned long hva;
>  	bool is_iabt, write_fault, writable;
> -	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(vcpu->kvm);
>  	gfn_t gfn;
>  	int ret, idx;
>
> @@ -1934,7 +1932,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>  		nested = &nested_trans;
>  	}
>
> -	gfn = (ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
> +	gfn = kvm_gpa_from_fault(vcpu->kvm, ipa) >> PAGE_SHIFT;
>  	memslot = gfn_to_memslot(vcpu->kvm, gfn);
>
>  	if (kvm_slot_can_be_private(memslot)) {
> @@ -1986,8 +1984,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>  		 * of the page size.
>  		 */
>  		ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
> -		ipa &= ~gpa_stolen_mask;
> -		ret = io_mem_abort(vcpu, ipa);
> +		ret = io_mem_abort(vcpu, kvm_gpa_from_fault(vcpu->kvm, ipa));
>  		goto out_unlock;
>  	}
>
>
> Thanks,
> Steve
Matias Ezequiel Vara Larsen Sept. 2, 2024, 1:25 p.m. UTC | #6
Hello Steven, 

On Wed, Aug 21, 2024 at 04:38:22PM +0100, Steven Price wrote:
> At runtime if the realm guest accesses memory which hasn't yet been
> mapped then KVM needs to either populate the region or fault the guest.
> 
> For memory in the lower (protected) region of IPA a fresh page is
> provided to the RMM which will zero the contents. For memory in the
> upper (shared) region of IPA, the memory from the memslot is mapped
> into the realm VM non secure.
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> Changes since v2:
>  * Avoid leaking memory if failing to map it in the realm.
>  * Correctly mask RTT based on LPA2 flag (see rtt_get_phys()).
>  * Adapt to changes in previous patches.
> ---
>  arch/arm64/include/asm/kvm_emulate.h |  10 ++
>  arch/arm64/include/asm/kvm_rme.h     |  10 ++
>  arch/arm64/kvm/mmu.c                 | 120 +++++++++++++++-
>  arch/arm64/kvm/rme.c                 | 205 +++++++++++++++++++++++++--
>  4 files changed, 325 insertions(+), 20 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index 7430c77574e3..0b50572d3719 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -710,6 +710,16 @@ static inline bool kvm_realm_is_created(struct kvm *kvm)
>  	return kvm_is_realm(kvm) && kvm_realm_state(kvm) != REALM_STATE_NONE;
>  }
>  
> +static inline gpa_t kvm_gpa_stolen_bits(struct kvm *kvm)
> +{
> +	if (kvm_is_realm(kvm)) {
> +		struct realm *realm = &kvm->arch.realm;
> +
> +		return BIT(realm->ia_bits - 1);
> +	}
> +	return 0;
> +}
> +
>  static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu)
>  {
>  	if (static_branch_unlikely(&kvm_rme_is_available))
> diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h
> index 0e44b20cfa48..c50854f44674 100644
> --- a/arch/arm64/include/asm/kvm_rme.h
> +++ b/arch/arm64/include/asm/kvm_rme.h
> @@ -103,6 +103,16 @@ void kvm_realm_unmap_range(struct kvm *kvm,
>  			   unsigned long ipa,
>  			   u64 size,
>  			   bool unmap_private);
> +int realm_map_protected(struct realm *realm,
> +			unsigned long base_ipa,
> +			struct page *dst_page,
> +			unsigned long map_size,
> +			struct kvm_mmu_memory_cache *memcache);
> +int realm_map_non_secure(struct realm *realm,
> +			 unsigned long ipa,
> +			 struct page *page,
> +			 unsigned long map_size,
> +			 struct kvm_mmu_memory_cache *memcache);
>  int realm_set_ipa_state(struct kvm_vcpu *vcpu,
>  			unsigned long addr, unsigned long end,
>  			unsigned long ripas,
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 620d26810019..eb8b8d013f3e 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -325,8 +325,13 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
>  
>  	lockdep_assert_held_write(&kvm->mmu_lock);
>  	WARN_ON(size & ~PAGE_MASK);
> -	WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap,
> -				   may_block));
> +
> +	if (kvm_is_realm(kvm))
> +		kvm_realm_unmap_range(kvm, start, size, !only_shared);
> +	else
> +		WARN_ON(stage2_apply_range(mmu, start, end,
> +					   kvm_pgtable_stage2_unmap,
> +					   may_block));
>  }
>  
>  void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
> @@ -345,7 +350,10 @@ static void stage2_flush_memslot(struct kvm *kvm,
>  	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
>  	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
>  
> -	kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
> +	if (kvm_is_realm(kvm))
> +		kvm_realm_unmap_range(kvm, addr, end - addr, false);
> +	else
> +		kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
>  }
>  
>  /**
> @@ -1037,6 +1045,10 @@ void stage2_unmap_vm(struct kvm *kvm)
>  	struct kvm_memory_slot *memslot;
>  	int idx, bkt;
>  
> +	/* For realms this is handled by the RMM so nothing to do here */
> +	if (kvm_is_realm(kvm))
> +		return;
> +
>  	idx = srcu_read_lock(&kvm->srcu);
>  	mmap_read_lock(current->mm);
>  	write_lock(&kvm->mmu_lock);
> @@ -1062,6 +1074,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>  	if (kvm_is_realm(kvm) &&
>  	    (kvm_realm_state(kvm) != REALM_STATE_DEAD &&
>  	     kvm_realm_state(kvm) != REALM_STATE_NONE)) {
> +		kvm_stage2_unmap_range(mmu, 0, (~0ULL) & PAGE_MASK);
>  		write_unlock(&kvm->mmu_lock);
>  		kvm_realm_destroy_rtts(kvm, pgt->ia_bits);
>  		return;
> @@ -1428,6 +1441,71 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
>  	return vma->vm_flags & VM_MTE_ALLOWED;
>  }
>  
> +static int realm_map_ipa(struct kvm *kvm, phys_addr_t ipa,
> +			 kvm_pfn_t pfn, unsigned long map_size,
> +			 enum kvm_pgtable_prot prot,
> +			 struct kvm_mmu_memory_cache *memcache)
> +{
> +	struct realm *realm = &kvm->arch.realm;
> +	struct page *page = pfn_to_page(pfn);
> +
> +	if (WARN_ON(!(prot & KVM_PGTABLE_PROT_W)))
> +		return -EFAULT;
> +
> +	if (!realm_is_addr_protected(realm, ipa))
> +		return realm_map_non_secure(realm, ipa, page, map_size,
> +					    memcache);
> +
> +	return realm_map_protected(realm, ipa, page, map_size, memcache);
> +}
> +
> +static int private_memslot_fault(struct kvm_vcpu *vcpu,
> +				 phys_addr_t fault_ipa,
> +				 struct kvm_memory_slot *memslot)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(kvm);
> +	gfn_t gfn = (fault_ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
> +	bool is_priv_gfn = !((fault_ipa & gpa_stolen_mask) == gpa_stolen_mask);
> +	bool priv_exists = kvm_mem_is_private(kvm, gfn);
> +	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
> +	kvm_pfn_t pfn;
> +	int ret;
> +
> +	if (priv_exists != is_priv_gfn) {
> +		kvm_prepare_memory_fault_exit(vcpu,
> +					      fault_ipa & ~gpa_stolen_mask,
> +					      PAGE_SIZE,
> +					      kvm_is_write_fault(vcpu),
> +					      false, is_priv_gfn);
> +
> +		return 0;
> +	}

If I understand correctly, `kvm_prepare_memory_fault_exit()` ends up
returning to the VMM with the KVM_EXIT_MEMORY_FAULT exit reason. The
documentation says (https://docs.kernel.org/virt/kvm/api.html#kvm-run):

"Note! KVM_EXIT_MEMORY_FAULT is unique among all KVM exit reasons in that
it accompanies a return code of ‘-1’, not ‘0’! errno will always be set
to EFAULT or EHWPOISON when KVM exits with KVM_EXIT_MEMORY_FAULT,
userspace should assume kvm_run.exit_reason is stale/undefined for all
other error numbers."

Should the return code be different for KVM_EXIT_MEMORY_FAULT?
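
To make the question concrete: if the handler is meant to follow that
documented convention, I would expect the attribute-mismatch path to end
up looking something like the below (just an illustration on my side, not
taken from the patch):

	if (priv_exists != is_priv_gfn) {
		kvm_prepare_memory_fault_exit(vcpu,
					      fault_ipa & ~gpa_stolen_mask,
					      PAGE_SIZE,
					      kvm_is_write_fault(vcpu),
					      false, is_priv_gfn);
		/*
		 * KVM_EXIT_MEMORY_FAULT is documented to accompany a -1
		 * return with errno set to EFAULT (or EHWPOISON), so
		 * presumably this should return an error rather than 0.
		 */
		return -EFAULT;
	}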

Thanks, Matias.

> +
> +	if (!is_priv_gfn) {
> +		/* Not a private mapping, handling normally */
> +		return -EAGAIN;
> +	}
> +
> +	ret = kvm_mmu_topup_memory_cache(memcache,
> +					 kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu));
> +	if (ret)
> +		return ret;
> +
> +	ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, NULL);
> +	if (ret)
> +		return ret;
> +
> +	/* FIXME: Should be able to use bigger than PAGE_SIZE mappings */
> +	ret = realm_map_ipa(kvm, fault_ipa, pfn, PAGE_SIZE, KVM_PGTABLE_PROT_W,
> +			    memcache);
> +	if (!ret)
> +		return 1; /* Handled */
> +
> +	put_page(pfn_to_page(pfn));
> +	return ret;
> +}
> +
>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  			  struct kvm_s2_trans *nested,
>  			  struct kvm_memory_slot *memslot, unsigned long hva,
> @@ -1449,10 +1527,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	long vma_pagesize, fault_granule;
>  	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
>  	struct kvm_pgtable *pgt;
> +	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(vcpu->kvm);
>  
>  	if (fault_is_perm)
>  		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
>  	write_fault = kvm_is_write_fault(vcpu);
> +
> +	/*
> +	 * Realms cannot map protected pages read-only
> +	 * FIXME: It should be possible to map unprotected pages read-only
> +	 */
> +	if (vcpu_is_rec(vcpu))
> +		write_fault = true;
> +
>  	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
>  	VM_BUG_ON(write_fault && exec_fault);
>  
> @@ -1553,7 +1640,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
>  		fault_ipa &= ~(vma_pagesize - 1);
>  
> -	gfn = ipa >> PAGE_SHIFT;
> +	gfn = (ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
>  	mte_allowed = kvm_vma_mte_allowed(vma);
>  
>  	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> @@ -1634,7 +1721,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	 * If we are not forced to use page mapping, check if we are
>  	 * backed by a THP and thus use block mapping if possible.
>  	 */
> -	if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
> +	/* FIXME: We shouldn't need to disable this for realms */
> +	if (vma_pagesize == PAGE_SIZE && !(force_pte || device || kvm_is_realm(kvm))) {
>  		if (fault_is_perm && fault_granule > PAGE_SIZE)
>  			vma_pagesize = fault_granule;
>  		else
> @@ -1686,6 +1774,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		 */
>  		prot &= ~KVM_NV_GUEST_MAP_SZ;
>  		ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
> +	} else if (kvm_is_realm(kvm)) {
> +		ret = realm_map_ipa(kvm, fault_ipa, pfn, vma_pagesize,
> +				    prot, memcache);
>  	} else {
>  		ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
>  					     __pfn_to_phys(pfn), prot,
> @@ -1744,6 +1835,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>  	struct kvm_memory_slot *memslot;
>  	unsigned long hva;
>  	bool is_iabt, write_fault, writable;
> +	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(vcpu->kvm);
>  	gfn_t gfn;
>  	int ret, idx;
>  
> @@ -1834,8 +1926,15 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>  		nested = &nested_trans;
>  	}
>  
> -	gfn = ipa >> PAGE_SHIFT;
> +	gfn = (ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
>  	memslot = gfn_to_memslot(vcpu->kvm, gfn);
> +
> +	if (kvm_slot_can_be_private(memslot)) {
> +		ret = private_memslot_fault(vcpu, fault_ipa, memslot);
> +		if (ret != -EAGAIN)
> +			goto out;
> +	}
> +
>  	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
>  	write_fault = kvm_is_write_fault(vcpu);
>  	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
> @@ -1879,6 +1978,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>  		 * of the page size.
>  		 */
>  		ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
> +		ipa &= ~gpa_stolen_mask;
>  		ret = io_mem_abort(vcpu, ipa);
>  		goto out_unlock;
>  	}
> @@ -1927,6 +2027,10 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
>  	if (!kvm->arch.mmu.pgt)
>  		return false;
>  
> +	/* We don't support aging for Realms */
> +	if (kvm_is_realm(kvm))
> +		return true;
> +
>  	return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
>  						   range->start << PAGE_SHIFT,
>  						   size, true);
> @@ -1943,6 +2047,10 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
>  	if (!kvm->arch.mmu.pgt)
>  		return false;
>  
> +	/* We don't support aging for Realms */
> +	if (kvm_is_realm(kvm))
> +		return true;
> +
>  	return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
>  						   range->start << PAGE_SHIFT,
>  						   size, false);
> diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c
> index 2c4e28b457be..337b3dd1e00c 100644
> --- a/arch/arm64/kvm/rme.c
> +++ b/arch/arm64/kvm/rme.c
> @@ -627,6 +627,181 @@ static int fold_rtt(struct realm *realm, unsigned long addr, int level)
>  	return 0;
>  }
>  
> +static phys_addr_t rtt_get_phys(struct realm *realm, struct rtt_entry *rtt)
> +{
> +	bool lpa2 = realm->params->flags & RMI_REALM_PARAM_FLAG_LPA2;
> +
> +	if (lpa2)
> +		return rtt->desc & GENMASK(49, 12);
> +	return rtt->desc & GENMASK(47, 12);
> +}
> +
> +int realm_map_protected(struct realm *realm,
> +			unsigned long base_ipa,
> +			struct page *dst_page,
> +			unsigned long map_size,
> +			struct kvm_mmu_memory_cache *memcache)
> +{
> +	phys_addr_t dst_phys = page_to_phys(dst_page);
> +	phys_addr_t rd = virt_to_phys(realm->rd);
> +	unsigned long phys = dst_phys;
> +	unsigned long ipa = base_ipa;
> +	unsigned long size;
> +	int map_level;
> +	int ret = 0;
> +
> +	if (WARN_ON(!IS_ALIGNED(ipa, map_size)))
> +		return -EINVAL;
> +
> +	switch (map_size) {
> +	case PAGE_SIZE:
> +		map_level = 3;
> +		break;
> +	case RME_L2_BLOCK_SIZE:
> +		map_level = 2;
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	if (map_level < RME_RTT_MAX_LEVEL) {
> +		/*
> +		 * A temporary RTT is needed during the map, precreate it,
> +		 * however if there is an error (e.g. missing parent tables)
> +		 * this will be handled below.
> +		 */
> +		realm_create_rtt_levels(realm, ipa, map_level,
> +					RME_RTT_MAX_LEVEL, memcache);
> +	}
> +
> +	for (size = 0; size < map_size; size += PAGE_SIZE) {
> +		if (rmi_granule_delegate(phys)) {
> +			struct rtt_entry rtt;
> +
> +			/*
> +			 * It's possible we raced with another VCPU on the same
> +			 * fault. If the entry exists and matches then exit
> +			 * early and assume the other VCPU will handle the
> +			 * mapping.
> +			 */
> +			if (rmi_rtt_read_entry(rd, ipa, RME_RTT_MAX_LEVEL, &rtt))
> +				goto err;
> +
> +			/*
> +			 * FIXME: For a block mapping this could race at level
> +			 * 2 or 3... currently we don't support block mappings
> +			 */
> +			if (WARN_ON((rtt.walk_level != RME_RTT_MAX_LEVEL ||
> +				     rtt.state != RMI_ASSIGNED ||
> +				     rtt_get_phys(realm, &rtt) != phys))) {
> +				goto err;
> +			}
> +
> +			return 0;
> +		}
> +
> +		ret = rmi_data_create_unknown(rd, phys, ipa);
> +
> +		if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
> +			/* Create missing RTTs and retry */
> +			int level = RMI_RETURN_INDEX(ret);
> +
> +			ret = realm_create_rtt_levels(realm, ipa, level,
> +						      RME_RTT_MAX_LEVEL,
> +						      memcache);
> +			WARN_ON(ret);
> +			if (ret)
> +				goto err_undelegate;
> +
> +			ret = rmi_data_create_unknown(rd, phys, ipa);
> +		}
> +		WARN_ON(ret);
> +
> +		if (ret)
> +			goto err_undelegate;
> +
> +		phys += PAGE_SIZE;
> +		ipa += PAGE_SIZE;
> +	}
> +
> +	if (map_size == RME_L2_BLOCK_SIZE)
> +		ret = fold_rtt(realm, base_ipa, map_level);
> +	if (WARN_ON(ret))
> +		goto err;
> +
> +	return 0;
> +
> +err_undelegate:
> +	if (WARN_ON(rmi_granule_undelegate(phys))) {
> +		/* Page can't be returned to NS world so is lost */
> +		get_page(phys_to_page(phys));
> +	}
> +err:
> +	while (size > 0) {
> +		unsigned long data, top;
> +
> +		phys -= PAGE_SIZE;
> +		size -= PAGE_SIZE;
> +		ipa -= PAGE_SIZE;
> +
> +		WARN_ON(rmi_data_destroy(rd, ipa, &data, &top));
> +
> +		if (WARN_ON(rmi_granule_undelegate(phys))) {
> +			/* Page can't be returned to NS world so is lost */
> +			get_page(phys_to_page(phys));
> +		}
> +	}
> +	return -ENXIO;
> +}
> +
> +int realm_map_non_secure(struct realm *realm,
> +			 unsigned long ipa,
> +			 struct page *page,
> +			 unsigned long map_size,
> +			 struct kvm_mmu_memory_cache *memcache)
> +{
> +	phys_addr_t rd = virt_to_phys(realm->rd);
> +	int map_level;
> +	int ret = 0;
> +	unsigned long desc = page_to_phys(page) |
> +			     PTE_S2_MEMATTR(MT_S2_FWB_NORMAL) |
> +			     /* FIXME: Read+Write permissions for now */
> +			     (3 << 6) |
> +			     PTE_SHARED;
> +
> +	if (WARN_ON(!IS_ALIGNED(ipa, map_size)))
> +		return -EINVAL;
> +
> +	switch (map_size) {
> +	case PAGE_SIZE:
> +		map_level = 3;
> +		break;
> +	case RME_L2_BLOCK_SIZE:
> +		map_level = 2;
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	ret = rmi_rtt_map_unprotected(rd, ipa, map_level, desc);
> +
> +	if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
> +		/* Create missing RTTs and retry */
> +		int level = RMI_RETURN_INDEX(ret);
> +
> +		ret = realm_create_rtt_levels(realm, ipa, level, map_level,
> +					      memcache);
> +		if (WARN_ON(ret))
> +			return -ENXIO;
> +
> +		ret = rmi_rtt_map_unprotected(rd, ipa, map_level, desc);
> +	}
> +	if (WARN_ON(ret))
> +		return -ENXIO;
> +
> +	return 0;
> +}
> +
>  static int populate_par_region(struct kvm *kvm,
>  			       phys_addr_t ipa_base,
>  			       phys_addr_t ipa_end,
> @@ -638,7 +813,6 @@ static int populate_par_region(struct kvm *kvm,
>  	int idx;
>  	phys_addr_t ipa;
>  	int ret = 0;
> -	struct page *tmp_page;
>  	unsigned long data_flags = 0;
>  
>  	base_gfn = gpa_to_gfn(ipa_base);
> @@ -660,9 +834,8 @@ static int populate_par_region(struct kvm *kvm,
>  		goto out;
>  	}
>  
> -	tmp_page = alloc_page(GFP_KERNEL);
> -	if (!tmp_page) {
> -		ret = -ENOMEM;
> +	if (!kvm_slot_can_be_private(memslot)) {
> +		ret = -EINVAL;
>  		goto out;
>  	}
>  
> @@ -729,28 +902,32 @@ static int populate_par_region(struct kvm *kvm,
>  		for (offset = 0; offset < map_size && !ret;
>  		     offset += PAGE_SIZE, page++) {
>  			phys_addr_t page_ipa = ipa + offset;
> +			kvm_pfn_t priv_pfn;
> +			int order;
> +
> +			ret = kvm_gmem_get_pfn(kvm, memslot,
> +					       page_ipa >> PAGE_SHIFT,
> +					       &priv_pfn, &order);
> +			if (ret)
> +				break;
>  
>  			ret = realm_create_protected_data_page(realm, page_ipa,
> -							       page, tmp_page,
> -							       data_flags);
> +							       pfn_to_page(priv_pfn),
> +							       page, data_flags);
>  		}
> +
> +		kvm_release_pfn_clean(pfn);
> +
>  		if (ret)
> -			goto err_release_pfn;
> +			break;
>  
>  		if (level == 2)
>  			fold_rtt(realm, ipa, level);
>  
>  		ipa += map_size;
> -		kvm_release_pfn_dirty(pfn);
> -err_release_pfn:
> -		if (ret) {
> -			kvm_release_pfn_clean(pfn);
> -			break;
> -		}
>  	}
>  
>  	mmap_read_unlock(current->mm);
> -	__free_page(tmp_page);
>  
>  out:
>  	srcu_read_unlock(&kvm->srcu, idx);
> -- 
> 2.34.1
> 
>
Steven Price Sept. 2, 2024, 3:34 p.m. UTC | #7
On 02/09/2024 14:25, Matias Ezequiel Vara Larsen wrote:
> Hello Steven, 
> 
> On Wed, Aug 21, 2024 at 04:38:22PM +0100, Steven Price wrote:

...

>> +static int private_memslot_fault(struct kvm_vcpu *vcpu,
>> +				 phys_addr_t fault_ipa,
>> +				 struct kvm_memory_slot *memslot)
>> +{
>> +	struct kvm *kvm = vcpu->kvm;
>> +	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(kvm);
>> +	gfn_t gfn = (fault_ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
>> +	bool is_priv_gfn = !((fault_ipa & gpa_stolen_mask) == gpa_stolen_mask);
>> +	bool priv_exists = kvm_mem_is_private(kvm, gfn);
>> +	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
>> +	kvm_pfn_t pfn;
>> +	int ret;
>> +
>> +	if (priv_exists != is_priv_gfn) {
>> +		kvm_prepare_memory_fault_exit(vcpu,
>> +					      fault_ipa & ~gpa_stolen_mask,
>> +					      PAGE_SIZE,
>> +					      kvm_is_write_fault(vcpu),
>> +					      false, is_priv_gfn);
>> +
>> +		return 0;
>> +	}
> 
> If I understand correctly, `kvm_prepare_memory_fault_exit()` ends up
> returning to the VMM with the KVM_EXIT_MEMORY_FAULT exit reason. The
> documentation says (https://docs.kernel.org/virt/kvm/api.html#kvm-run):
> 
> "Note! KVM_EXIT_MEMORY_FAULT is unique among all KVM exit reasons in that
> it accompanies a return code of ‘-1’, not ‘0’! errno will always be set
> to EFAULT or EHWPOISON when KVM exits with KVM_EXIT_MEMORY_FAULT,
> userspace should assume kvm_run.exit_reason is stale/undefined for all
> other error numbers."
> 
> Shall the return code be different for KVM_EXIT_MEMORY_FAULT?
> 
> Thanks, Matias.

Ah, good spot - I've no idea why KVM_EXIT_MEMORY_FAULT is special in
this regard, but yes I guess we should be returning -EFAULT here.
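
For reference, the change is tiny - just a sketch, assuming the negative
return propagates straight out of kvm_handle_guest_abort() back to
userspace as -1 with errno set:

	if (priv_exists != is_priv_gfn) {
		kvm_prepare_memory_fault_exit(vcpu,
					      fault_ipa & ~gpa_stolen_mask,
					      PAGE_SIZE,
					      kvm_is_write_fault(vcpu),
					      false, is_priv_gfn);

		/* KVM_EXIT_MEMORY_FAULT is reported alongside errno EFAULT */
		return -EFAULT;
	}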

Thanks,
Steve
Jean-Philippe Brucker Sept. 4, 2024, 2:48 p.m. UTC | #8
On Wed, Aug 21, 2024 at 04:38:22PM +0100, Steven Price wrote:
> diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c
> index 2c4e28b457be..337b3dd1e00c 100644
> --- a/arch/arm64/kvm/rme.c
> +++ b/arch/arm64/kvm/rme.c
> @@ -627,6 +627,181 @@ static int fold_rtt(struct realm *realm, unsigned long addr, int level)
>  	return 0;
>  }
>  
> +static phys_addr_t rtt_get_phys(struct realm *realm, struct rtt_entry *rtt)
> +{
> +	bool lpa2 = realm->params->flags & RMI_REALM_PARAM_FLAG_LPA2;

At this point realm->params is NULL, cleared by kvm_create_realm()

Thanks,
Jean

> +
> +	if (lpa2)
> +		return rtt->desc & GENMASK(49, 12);
> +	return rtt->desc & GENMASK(47, 12);
> +}
> +
> +int realm_map_protected(struct realm *realm,
> +			unsigned long base_ipa,
> +			struct page *dst_page,
> +			unsigned long map_size,
> +			struct kvm_mmu_memory_cache *memcache)
> +{
> +	phys_addr_t dst_phys = page_to_phys(dst_page);
> +	phys_addr_t rd = virt_to_phys(realm->rd);
> +	unsigned long phys = dst_phys;
> +	unsigned long ipa = base_ipa;
> +	unsigned long size;
> +	int map_level;
> +	int ret = 0;
> +
> +	if (WARN_ON(!IS_ALIGNED(ipa, map_size)))
> +		return -EINVAL;
> +
> +	switch (map_size) {
> +	case PAGE_SIZE:
> +		map_level = 3;
> +		break;
> +	case RME_L2_BLOCK_SIZE:
> +		map_level = 2;
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	if (map_level < RME_RTT_MAX_LEVEL) {
> +		/*
> +		 * A temporary RTT is needed during the map, precreate it,
> +		 * however if there is an error (e.g. missing parent tables)
> +		 * this will be handled below.
> +		 */
> +		realm_create_rtt_levels(realm, ipa, map_level,
> +					RME_RTT_MAX_LEVEL, memcache);
> +	}
> +
> +	for (size = 0; size < map_size; size += PAGE_SIZE) {
> +		if (rmi_granule_delegate(phys)) {
> +			struct rtt_entry rtt;
> +
> +			/*
> +			 * It's possible we raced with another VCPU on the same
> +			 * fault. If the entry exists and matches then exit
> +			 * early and assume the other VCPU will handle the
> +			 * mapping.
> +			 */
> +			if (rmi_rtt_read_entry(rd, ipa, RME_RTT_MAX_LEVEL, &rtt))
> +				goto err;
> +
> +			/*
> +			 * FIXME: For a block mapping this could race at level
> +			 * 2 or 3... currently we don't support block mappings
> +			 */
> +			if (WARN_ON((rtt.walk_level != RME_RTT_MAX_LEVEL ||
> +				     rtt.state != RMI_ASSIGNED ||
> +				     rtt_get_phys(realm, &rtt) != phys))) {
Steven Price Sept. 4, 2024, 3:59 p.m. UTC | #9
On 04/09/2024 15:48, Jean-Philippe Brucker wrote:
> On Wed, Aug 21, 2024 at 04:38:22PM +0100, Steven Price wrote:
>> diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c
>> index 2c4e28b457be..337b3dd1e00c 100644
>> --- a/arch/arm64/kvm/rme.c
>> +++ b/arch/arm64/kvm/rme.c
>> @@ -627,6 +627,181 @@ static int fold_rtt(struct realm *realm, unsigned long addr, int level)
>>  	return 0;
>>  }
>>  
>> +static phys_addr_t rtt_get_phys(struct realm *realm, struct rtt_entry *rtt)
>> +{
>> +	bool lpa2 = realm->params->flags & RMI_REALM_PARAM_FLAG_LPA2;
> 
> At this point realm->params is NULL, cleared by kvm_create_realm()

Ah, indeed so. Also, LPA2 isn't yet supported (we have no way of setting
that flag).

This code is only called for block mappings (also not yet supported),
which explains why I've never seen the issue.

Thanks,
Steve

> Thanks,
> Jean
> 
>> +
>> +	if (lpa2)
>> +		return rtt->desc & GENMASK(49, 12);
>> +	return rtt->desc & GENMASK(47, 12);
>> +}
>> +
>> +int realm_map_protected(struct realm *realm,
>> +			unsigned long base_ipa,
>> +			struct page *dst_page,
>> +			unsigned long map_size,
>> +			struct kvm_mmu_memory_cache *memcache)
>> +{
>> +	phys_addr_t dst_phys = page_to_phys(dst_page);
>> +	phys_addr_t rd = virt_to_phys(realm->rd);
>> +	unsigned long phys = dst_phys;
>> +	unsigned long ipa = base_ipa;
>> +	unsigned long size;
>> +	int map_level;
>> +	int ret = 0;
>> +
>> +	if (WARN_ON(!IS_ALIGNED(ipa, map_size)))
>> +		return -EINVAL;
>> +
>> +	switch (map_size) {
>> +	case PAGE_SIZE:
>> +		map_level = 3;
>> +		break;
>> +	case RME_L2_BLOCK_SIZE:
>> +		map_level = 2;
>> +		break;
>> +	default:
>> +		return -EINVAL;
>> +	}
>> +
>> +	if (map_level < RME_RTT_MAX_LEVEL) {
>> +		/*
>> +		 * A temporary RTT is needed during the map, precreate it,
>> +		 * however if there is an error (e.g. missing parent tables)
>> +		 * this will be handled below.
>> +		 */
>> +		realm_create_rtt_levels(realm, ipa, map_level,
>> +					RME_RTT_MAX_LEVEL, memcache);
>> +	}
>> +
>> +	for (size = 0; size < map_size; size += PAGE_SIZE) {
>> +		if (rmi_granule_delegate(phys)) {
>> +			struct rtt_entry rtt;
>> +
>> +			/*
>> +			 * It's possible we raced with another VCPU on the same
>> +			 * fault. If the entry exists and matches then exit
>> +			 * early and assume the other VCPU will handle the
>> +			 * mapping.
>> +			 */
>> +			if (rmi_rtt_read_entry(rd, ipa, RME_RTT_MAX_LEVEL, &rtt))
>> +				goto err;
>> +
>> +			/*
>> +			 * FIXME: For a block mapping this could race at level
>> +			 * 2 or 3... currently we don't support block mappings
>> +			 */
>> +			if (WARN_ON((rtt.walk_level != RME_RTT_MAX_LEVEL ||
>> +				     rtt.state != RMI_ASSIGNED ||
>> +				     rtt_get_phys(realm, &rtt) != phys))) {
>

Patch

diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 7430c77574e3..0b50572d3719 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -710,6 +710,16 @@  static inline bool kvm_realm_is_created(struct kvm *kvm)
 	return kvm_is_realm(kvm) && kvm_realm_state(kvm) != REALM_STATE_NONE;
 }
 
+static inline gpa_t kvm_gpa_stolen_bits(struct kvm *kvm)
+{
+	if (kvm_is_realm(kvm)) {
+		struct realm *realm = &kvm->arch.realm;
+
+		return BIT(realm->ia_bits - 1);
+	}
+	return 0;
+}
+
 static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu)
 {
 	if (static_branch_unlikely(&kvm_rme_is_available))
diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h
index 0e44b20cfa48..c50854f44674 100644
--- a/arch/arm64/include/asm/kvm_rme.h
+++ b/arch/arm64/include/asm/kvm_rme.h
@@ -103,6 +103,16 @@  void kvm_realm_unmap_range(struct kvm *kvm,
 			   unsigned long ipa,
 			   u64 size,
 			   bool unmap_private);
+int realm_map_protected(struct realm *realm,
+			unsigned long base_ipa,
+			struct page *dst_page,
+			unsigned long map_size,
+			struct kvm_mmu_memory_cache *memcache);
+int realm_map_non_secure(struct realm *realm,
+			 unsigned long ipa,
+			 struct page *page,
+			 unsigned long map_size,
+			 struct kvm_mmu_memory_cache *memcache);
 int realm_set_ipa_state(struct kvm_vcpu *vcpu,
 			unsigned long addr, unsigned long end,
 			unsigned long ripas,
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 620d26810019..eb8b8d013f3e 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -325,8 +325,13 @@  static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
 
 	lockdep_assert_held_write(&kvm->mmu_lock);
 	WARN_ON(size & ~PAGE_MASK);
-	WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap,
-				   may_block));
+
+	if (kvm_is_realm(kvm))
+		kvm_realm_unmap_range(kvm, start, size, !only_shared);
+	else
+		WARN_ON(stage2_apply_range(mmu, start, end,
+					   kvm_pgtable_stage2_unmap,
+					   may_block));
 }
 
 void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
@@ -345,7 +350,10 @@  static void stage2_flush_memslot(struct kvm *kvm,
 	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
 	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
 
-	kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
+	if (kvm_is_realm(kvm))
+		kvm_realm_unmap_range(kvm, addr, end - addr, false);
+	else
+		kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
 }
 
 /**
@@ -1037,6 +1045,10 @@  void stage2_unmap_vm(struct kvm *kvm)
 	struct kvm_memory_slot *memslot;
 	int idx, bkt;
 
+	/* For realms this is handled by the RMM so nothing to do here */
+	if (kvm_is_realm(kvm))
+		return;
+
 	idx = srcu_read_lock(&kvm->srcu);
 	mmap_read_lock(current->mm);
 	write_lock(&kvm->mmu_lock);
@@ -1062,6 +1074,7 @@  void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
 	if (kvm_is_realm(kvm) &&
 	    (kvm_realm_state(kvm) != REALM_STATE_DEAD &&
 	     kvm_realm_state(kvm) != REALM_STATE_NONE)) {
+		kvm_stage2_unmap_range(mmu, 0, (~0ULL) & PAGE_MASK);
 		write_unlock(&kvm->mmu_lock);
 		kvm_realm_destroy_rtts(kvm, pgt->ia_bits);
 		return;
@@ -1428,6 +1441,71 @@  static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
 	return vma->vm_flags & VM_MTE_ALLOWED;
 }
 
+static int realm_map_ipa(struct kvm *kvm, phys_addr_t ipa,
+			 kvm_pfn_t pfn, unsigned long map_size,
+			 enum kvm_pgtable_prot prot,
+			 struct kvm_mmu_memory_cache *memcache)
+{
+	struct realm *realm = &kvm->arch.realm;
+	struct page *page = pfn_to_page(pfn);
+
+	if (WARN_ON(!(prot & KVM_PGTABLE_PROT_W)))
+		return -EFAULT;
+
+	if (!realm_is_addr_protected(realm, ipa))
+		return realm_map_non_secure(realm, ipa, page, map_size,
+					    memcache);
+
+	return realm_map_protected(realm, ipa, page, map_size, memcache);
+}
+
+static int private_memslot_fault(struct kvm_vcpu *vcpu,
+				 phys_addr_t fault_ipa,
+				 struct kvm_memory_slot *memslot)
+{
+	struct kvm *kvm = vcpu->kvm;
+	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(kvm);
+	gfn_t gfn = (fault_ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
+	bool is_priv_gfn = !((fault_ipa & gpa_stolen_mask) == gpa_stolen_mask);
+	bool priv_exists = kvm_mem_is_private(kvm, gfn);
+	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+	kvm_pfn_t pfn;
+	int ret;
+
+	if (priv_exists != is_priv_gfn) {
+		kvm_prepare_memory_fault_exit(vcpu,
+					      fault_ipa & ~gpa_stolen_mask,
+					      PAGE_SIZE,
+					      kvm_is_write_fault(vcpu),
+					      false, is_priv_gfn);
+
+		return 0;
+	}
+
+	if (!is_priv_gfn) {
+		/* Not a private mapping, handling normally */
+		return -EAGAIN;
+	}
+
+	ret = kvm_mmu_topup_memory_cache(memcache,
+					 kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu));
+	if (ret)
+		return ret;
+
+	ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, NULL);
+	if (ret)
+		return ret;
+
+	/* FIXME: Should be able to use bigger than PAGE_SIZE mappings */
+	ret = realm_map_ipa(kvm, fault_ipa, pfn, PAGE_SIZE, KVM_PGTABLE_PROT_W,
+			    memcache);
+	if (!ret)
+		return 1; /* Handled */
+
+	put_page(pfn_to_page(pfn));
+	return ret;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_s2_trans *nested,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
@@ -1449,10 +1527,19 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	long vma_pagesize, fault_granule;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
 	struct kvm_pgtable *pgt;
+	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(vcpu->kvm);
 
 	if (fault_is_perm)
 		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
 	write_fault = kvm_is_write_fault(vcpu);
+
+	/*
+	 * Realms cannot map protected pages read-only
+	 * FIXME: It should be possible to map unprotected pages read-only
+	 */
+	if (vcpu_is_rec(vcpu))
+		write_fault = true;
+
 	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
 	VM_BUG_ON(write_fault && exec_fault);
 
@@ -1553,7 +1640,7 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
 		fault_ipa &= ~(vma_pagesize - 1);
 
-	gfn = ipa >> PAGE_SHIFT;
+	gfn = (ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
 	mte_allowed = kvm_vma_mte_allowed(vma);
 
 	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
@@ -1634,7 +1721,8 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	 * If we are not forced to use page mapping, check if we are
 	 * backed by a THP and thus use block mapping if possible.
 	 */
-	if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
+	/* FIXME: We shouldn't need to disable this for realms */
+	if (vma_pagesize == PAGE_SIZE && !(force_pte || device || kvm_is_realm(kvm))) {
 		if (fault_is_perm && fault_granule > PAGE_SIZE)
 			vma_pagesize = fault_granule;
 		else
@@ -1686,6 +1774,9 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		 */
 		prot &= ~KVM_NV_GUEST_MAP_SZ;
 		ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
+	} else if (kvm_is_realm(kvm)) {
+		ret = realm_map_ipa(kvm, fault_ipa, pfn, vma_pagesize,
+				    prot, memcache);
 	} else {
 		ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
 					     __pfn_to_phys(pfn), prot,
@@ -1744,6 +1835,7 @@  int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
 	struct kvm_memory_slot *memslot;
 	unsigned long hva;
 	bool is_iabt, write_fault, writable;
+	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(vcpu->kvm);
 	gfn_t gfn;
 	int ret, idx;
 
@@ -1834,8 +1926,15 @@  int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
 		nested = &nested_trans;
 	}
 
-	gfn = ipa >> PAGE_SHIFT;
+	gfn = (ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
 	memslot = gfn_to_memslot(vcpu->kvm, gfn);
+
+	if (kvm_slot_can_be_private(memslot)) {
+		ret = private_memslot_fault(vcpu, fault_ipa, memslot);
+		if (ret != -EAGAIN)
+			goto out;
+	}
+
 	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
 	write_fault = kvm_is_write_fault(vcpu);
 	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
@@ -1879,6 +1978,7 @@  int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
 		 * of the page size.
 		 */
 		ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
+		ipa &= ~gpa_stolen_mask;
 		ret = io_mem_abort(vcpu, ipa);
 		goto out_unlock;
 	}
@@ -1927,6 +2027,10 @@  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 	if (!kvm->arch.mmu.pgt)
 		return false;
 
+	/* We don't support aging for Realms */
+	if (kvm_is_realm(kvm))
+		return true;
+
 	return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
 						   range->start << PAGE_SHIFT,
 						   size, true);
@@ -1943,6 +2047,10 @@  bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 	if (!kvm->arch.mmu.pgt)
 		return false;
 
+	/* We don't support aging for Realms */
+	if (kvm_is_realm(kvm))
+		return true;
+
 	return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
 						   range->start << PAGE_SHIFT,
 						   size, false);
diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c
index 2c4e28b457be..337b3dd1e00c 100644
--- a/arch/arm64/kvm/rme.c
+++ b/arch/arm64/kvm/rme.c
@@ -627,6 +627,181 @@  static int fold_rtt(struct realm *realm, unsigned long addr, int level)
 	return 0;
 }
 
+static phys_addr_t rtt_get_phys(struct realm *realm, struct rtt_entry *rtt)
+{
+	bool lpa2 = realm->params->flags & RMI_REALM_PARAM_FLAG_LPA2;
+
+	if (lpa2)
+		return rtt->desc & GENMASK(49, 12);
+	return rtt->desc & GENMASK(47, 12);
+}
+
+int realm_map_protected(struct realm *realm,
+			unsigned long base_ipa,
+			struct page *dst_page,
+			unsigned long map_size,
+			struct kvm_mmu_memory_cache *memcache)
+{
+	phys_addr_t dst_phys = page_to_phys(dst_page);
+	phys_addr_t rd = virt_to_phys(realm->rd);
+	unsigned long phys = dst_phys;
+	unsigned long ipa = base_ipa;
+	unsigned long size;
+	int map_level;
+	int ret = 0;
+
+	if (WARN_ON(!IS_ALIGNED(ipa, map_size)))
+		return -EINVAL;
+
+	switch (map_size) {
+	case PAGE_SIZE:
+		map_level = 3;
+		break;
+	case RME_L2_BLOCK_SIZE:
+		map_level = 2;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (map_level < RME_RTT_MAX_LEVEL) {
+		/*
+		 * A temporary RTT is needed during the map, precreate it,
+		 * however if there is an error (e.g. missing parent tables)
+		 * this will be handled below.
+		 */
+		realm_create_rtt_levels(realm, ipa, map_level,
+					RME_RTT_MAX_LEVEL, memcache);
+	}
+
+	for (size = 0; size < map_size; size += PAGE_SIZE) {
+		if (rmi_granule_delegate(phys)) {
+			struct rtt_entry rtt;
+
+			/*
+			 * It's possible we raced with another VCPU on the same
+			 * fault. If the entry exists and matches then exit
+			 * early and assume the other VCPU will handle the
+			 * mapping.
+			 */
+			if (rmi_rtt_read_entry(rd, ipa, RME_RTT_MAX_LEVEL, &rtt))
+				goto err;
+
+			/*
+			 * FIXME: For a block mapping this could race at level
+			 * 2 or 3... currently we don't support block mappings
+			 */
+			if (WARN_ON((rtt.walk_level != RME_RTT_MAX_LEVEL ||
+				     rtt.state != RMI_ASSIGNED ||
+				     rtt_get_phys(realm, &rtt) != phys))) {
+				goto err;
+			}
+
+			return 0;
+		}
+
+		ret = rmi_data_create_unknown(rd, phys, ipa);
+
+		if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
+			/* Create missing RTTs and retry */
+			int level = RMI_RETURN_INDEX(ret);
+
+			ret = realm_create_rtt_levels(realm, ipa, level,
+						      RME_RTT_MAX_LEVEL,
+						      memcache);
+			WARN_ON(ret);
+			if (ret)
+				goto err_undelegate;
+
+			ret = rmi_data_create_unknown(rd, phys, ipa);
+		}
+		WARN_ON(ret);
+
+		if (ret)
+			goto err_undelegate;
+
+		phys += PAGE_SIZE;
+		ipa += PAGE_SIZE;
+	}
+
+	if (map_size == RME_L2_BLOCK_SIZE)
+		ret = fold_rtt(realm, base_ipa, map_level);
+	if (WARN_ON(ret))
+		goto err;
+
+	return 0;
+
+err_undelegate:
+	if (WARN_ON(rmi_granule_undelegate(phys))) {
+		/* Page can't be returned to NS world so is lost */
+		get_page(phys_to_page(phys));
+	}
+err:
+	while (size > 0) {
+		unsigned long data, top;
+
+		phys -= PAGE_SIZE;
+		size -= PAGE_SIZE;
+		ipa -= PAGE_SIZE;
+
+		WARN_ON(rmi_data_destroy(rd, ipa, &data, &top));
+
+		if (WARN_ON(rmi_granule_undelegate(phys))) {
+			/* Page can't be returned to NS world so is lost */
+			get_page(phys_to_page(phys));
+		}
+	}
+	return -ENXIO;
+}
+
+int realm_map_non_secure(struct realm *realm,
+			 unsigned long ipa,
+			 struct page *page,
+			 unsigned long map_size,
+			 struct kvm_mmu_memory_cache *memcache)
+{
+	phys_addr_t rd = virt_to_phys(realm->rd);
+	int map_level;
+	int ret = 0;
+	unsigned long desc = page_to_phys(page) |
+			     PTE_S2_MEMATTR(MT_S2_FWB_NORMAL) |
+			     /* FIXME: Read+Write permissions for now */
+			     (3 << 6) |
+			     PTE_SHARED;
+
+	if (WARN_ON(!IS_ALIGNED(ipa, map_size)))
+		return -EINVAL;
+
+	switch (map_size) {
+	case PAGE_SIZE:
+		map_level = 3;
+		break;
+	case RME_L2_BLOCK_SIZE:
+		map_level = 2;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ret = rmi_rtt_map_unprotected(rd, ipa, map_level, desc);
+
+	if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
+		/* Create missing RTTs and retry */
+		int level = RMI_RETURN_INDEX(ret);
+
+		ret = realm_create_rtt_levels(realm, ipa, level, map_level,
+					      memcache);
+		if (WARN_ON(ret))
+			return -ENXIO;
+
+		ret = rmi_rtt_map_unprotected(rd, ipa, map_level, desc);
+	}
+	if (WARN_ON(ret))
+		return -ENXIO;
+
+	return 0;
+}
+
 static int populate_par_region(struct kvm *kvm,
 			       phys_addr_t ipa_base,
 			       phys_addr_t ipa_end,
@@ -638,7 +813,6 @@  static int populate_par_region(struct kvm *kvm,
 	int idx;
 	phys_addr_t ipa;
 	int ret = 0;
-	struct page *tmp_page;
 	unsigned long data_flags = 0;
 
 	base_gfn = gpa_to_gfn(ipa_base);
@@ -660,9 +834,8 @@  static int populate_par_region(struct kvm *kvm,
 		goto out;
 	}
 
-	tmp_page = alloc_page(GFP_KERNEL);
-	if (!tmp_page) {
-		ret = -ENOMEM;
+	if (!kvm_slot_can_be_private(memslot)) {
+		ret = -EINVAL;
 		goto out;
 	}
 
@@ -729,28 +902,32 @@  static int populate_par_region(struct kvm *kvm,
 		for (offset = 0; offset < map_size && !ret;
 		     offset += PAGE_SIZE, page++) {
 			phys_addr_t page_ipa = ipa + offset;
+			kvm_pfn_t priv_pfn;
+			int order;
+
+			ret = kvm_gmem_get_pfn(kvm, memslot,
+					       page_ipa >> PAGE_SHIFT,
+					       &priv_pfn, &order);
+			if (ret)
+				break;
 
 			ret = realm_create_protected_data_page(realm, page_ipa,
-							       page, tmp_page,
-							       data_flags);
+							       pfn_to_page(priv_pfn),
+							       page, data_flags);
 		}
+
+		kvm_release_pfn_clean(pfn);
+
 		if (ret)
-			goto err_release_pfn;
+			break;
 
 		if (level == 2)
 			fold_rtt(realm, ipa, level);
 
 		ipa += map_size;
-		kvm_release_pfn_dirty(pfn);
-err_release_pfn:
-		if (ret) {
-			kvm_release_pfn_clean(pfn);
-			break;
-		}
 	}
 
 	mmap_read_unlock(current->mm);
-	__free_page(tmp_page);
 
 out:
 	srcu_read_unlock(&kvm->srcu, idx);