
[RFC,16/28] arm64: RME: Allow populating initial contents

Message ID 20230127112932.38045-17-steven.price@arm.com (mailing list archive)
State New, archived
Series arm64: Support for Arm CCA in KVM

Commit Message

Steven Price Jan. 27, 2023, 11:29 a.m. UTC
The VMM needs to populate the realm with some data before starting (e.g.
a kernel and initrd). This is measured by the RMM and used as part of
the attestation later on.

Signed-off-by: Steven Price <steven.price@arm.com>
---
 arch/arm64/kvm/rme.c | 366 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 366 insertions(+)
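
For reference, a minimal sketch of how a VMM might invoke this from
userspace, based only on the kvm_realm_enable_cap() handling below. The
exact struct layout, the multiplexing of RME commands through cap->args[0]
and the IPA values are assumptions, not taken from this patch:

	struct kvm_cap_arm_rme_populate_realm_args args = {
		.populate_ipa_base = 0x80000000,	/* placeholder, PAGE_SIZE aligned */
		.populate_ipa_size = 0x02000000,	/* placeholder, PAGE_SIZE aligned */
	};
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_ARM_RME,		/* assumed top-level capability */
		.args[0] = KVM_CAP_ARM_RME_POPULATE_REALM,
		.args[1] = (__u64)(unsigned long)&args,	/* read via u64_to_user_ptr() */
	};

	/* Must be issued while the realm is still REALM_STATE_NEW, and the
	 * IPA range must be covered by a single memslot. */
	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
		perror("KVM_CAP_ARM_RME_POPULATE_REALM");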

Comments

Zhi Wang March 6, 2023, 5:34 p.m. UTC | #1
On Fri, 27 Jan 2023 11:29:20 +0000
Steven Price <steven.price@arm.com> wrote:

> The VMM needs to populate the realm with some data before starting (e.g.
> a kernel and initrd). This is measured by the RMM and used as part of
> the attestation later on.
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
>  arch/arm64/kvm/rme.c | 366 +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 366 insertions(+)
> 
> diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c
> index 16e0bfea98b1..3405b43e1421 100644
> --- a/arch/arm64/kvm/rme.c
> +++ b/arch/arm64/kvm/rme.c
> @@ -4,6 +4,7 @@
>   */
>  
>  #include <linux/kvm_host.h>
> +#include <linux/hugetlb.h>
>  
>  #include <asm/kvm_emulate.h>
>  #include <asm/kvm_mmu.h>
> @@ -426,6 +427,359 @@ void kvm_realm_unmap_range(struct kvm *kvm, unsigned long ipa, u64 size)
>  	}
>  }
>  
> +static int realm_create_protected_data_page(struct realm *realm,
> +					    unsigned long ipa,
> +					    struct page *dst_page,
> +					    struct page *tmp_page)
> +{
> +	phys_addr_t dst_phys, tmp_phys;
> +	int ret;
> +
> +	copy_page(page_address(tmp_page), page_address(dst_page));
> +
> +	dst_phys = page_to_phys(dst_page);
> +	tmp_phys = page_to_phys(tmp_page);
> +
> +	if (rmi_granule_delegate(dst_phys))
> +		return -ENXIO;
> +
> +	ret = rmi_data_create(dst_phys, virt_to_phys(realm->rd), ipa, tmp_phys,
> +			      RMI_MEASURE_CONTENT);
> +
> +	if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
> +		/* Create missing RTTs and retry */
> +		int level = RMI_RETURN_INDEX(ret);
> +
> +		ret = realm_create_rtt_levels(realm, ipa, level,
> +					      RME_RTT_MAX_LEVEL, NULL);
> +		if (ret)
> +			goto err;
> +
> +		ret = rmi_data_create(dst_phys, virt_to_phys(realm->rd), ipa,
> +				      tmp_phys, RMI_MEASURE_CONTENT);
> +	}
> +
> +	if (ret)
> +		goto err;
> +
> +	return 0;
> +
> +err:
> +	if (WARN_ON(rmi_granule_undelegate(dst_phys))) {
> +		/* Page can't be returned to NS world so is lost */
> +		get_page(dst_page);
> +	}
> +	return -ENXIO;
> +}
> +
> +static int fold_rtt(phys_addr_t rd, unsigned long addr, int level,
> +		    struct realm *realm)
> +{
> +	struct rtt_entry rtt;
> +	phys_addr_t rtt_addr;
> +
> +	if (rmi_rtt_read_entry(rd, addr, level, &rtt))
> +		return -ENXIO;
> +
> +	if (rtt.state != RMI_TABLE)
> +		return -EINVAL;
> +
> +	rtt_addr = rmi_rtt_get_phys(&rtt);
> +	if (rmi_rtt_fold(rtt_addr, rd, addr, level + 1))
> +		return -ENXIO;
> +
> +	free_delegated_page(realm, rtt_addr);
> +
> +	return 0;
> +}
> +
> +int realm_map_protected(struct realm *realm,
> +			unsigned long hva,
> +			unsigned long base_ipa,
> +			struct page *dst_page,
> +			unsigned long map_size,
> +			struct kvm_mmu_memory_cache *memcache)
> +{
> +	phys_addr_t dst_phys = page_to_phys(dst_page);
> +	phys_addr_t rd = virt_to_phys(realm->rd);
> +	unsigned long phys = dst_phys;
> +	unsigned long ipa = base_ipa;
> +	unsigned long size;
> +	int map_level;
> +	int ret = 0;
> +
> +	if (WARN_ON(!IS_ALIGNED(ipa, map_size)))
> +		return -EINVAL;
> +
> +	switch (map_size) {
> +	case PAGE_SIZE:
> +		map_level = 3;
> +		break;
> +	case RME_L2_BLOCK_SIZE:
> +		map_level = 2;
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	if (map_level < RME_RTT_MAX_LEVEL) {
> +		/*
> +		 * A temporary RTT is needed during the map, precreate it,
> +		 * however if there is an error (e.g. missing parent tables)
> +		 * this will be handled below.
> +		 */
> +		realm_create_rtt_levels(realm, ipa, map_level,
> +					RME_RTT_MAX_LEVEL, memcache);
> +	}
> +
> +	for (size = 0; size < map_size; size += PAGE_SIZE) {
> +		if (rmi_granule_delegate(phys)) {
> +			struct rtt_entry rtt;
> +
> +			/*
> +			 * It's possible we raced with another VCPU on the same
> +			 * fault. If the entry exists and matches then exit
> +			 * early and assume the other VCPU will handle the
> +			 * mapping.
> +			 */
> +			if (rmi_rtt_read_entry(rd, ipa, RME_RTT_MAX_LEVEL, &rtt))
> +				goto err;
> +
> +			// FIXME: For a block mapping this could race at level
> +			// 2 or 3...
> +			if (WARN_ON((rtt.walk_level != RME_RTT_MAX_LEVEL ||
> +				     rtt.state != RMI_ASSIGNED ||
> +				     rtt.desc != phys))) {
> +				goto err;
> +			}
> +
> +			return 0;
> +		}
> +
> +		ret = rmi_data_create_unknown(phys, rd, ipa);
> +
> +		if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
> +			/* Create missing RTTs and retry */
> +			int level = RMI_RETURN_INDEX(ret);
> +
> +			ret = realm_create_rtt_levels(realm, ipa, level,
> +						      RME_RTT_MAX_LEVEL,
> +						      memcache);
> +			WARN_ON(ret);
> +			if (ret)
> +				goto err_undelegate;
> +
> +			ret = rmi_data_create_unknown(phys, rd, ipa);
> +		}
> +		WARN_ON(ret);
> +
> +		if (ret)
> +			goto err_undelegate;
> +
> +		phys += PAGE_SIZE;
> +		ipa += PAGE_SIZE;
> +	}
> +
> +	if (map_size == RME_L2_BLOCK_SIZE)
> +		ret = fold_rtt(rd, base_ipa, map_level, realm);
> +	if (WARN_ON(ret))
> +		goto err;
> +
> +	return 0;
> +
> +err_undelegate:
> +	if (WARN_ON(rmi_granule_undelegate(phys))) {
> +		/* Page can't be returned to NS world so is lost */
> +		get_page(phys_to_page(phys));
> +	}
> +err:
> +	while (size > 0) {
> +		phys -= PAGE_SIZE;
> +		size -= PAGE_SIZE;
> +		ipa -= PAGE_SIZE;
> +
> +		rmi_data_destroy(rd, ipa);
> +
> +		if (WARN_ON(rmi_granule_undelegate(phys))) {
> +			/* Page can't be returned to NS world so is lost */
> +			get_page(phys_to_page(phys));
> +		}
> +	}
> +	return -ENXIO;
> +}
> +

There seems to be no caller of the function above. Better to move it to
the related patch.

> +static int populate_par_region(struct kvm *kvm,
> +			       phys_addr_t ipa_base,
> +			       phys_addr_t ipa_end)
> +{
> +	struct realm *realm = &kvm->arch.realm;
> +	struct kvm_memory_slot *memslot;
> +	gfn_t base_gfn, end_gfn;
> +	int idx;
> +	phys_addr_t ipa;
> +	int ret = 0;
> +	struct page *tmp_page;
> +	phys_addr_t rd = virt_to_phys(realm->rd);
> +
> +	base_gfn = gpa_to_gfn(ipa_base);
> +	end_gfn = gpa_to_gfn(ipa_end);
> +
> +	idx = srcu_read_lock(&kvm->srcu);
> +	memslot = gfn_to_memslot(kvm, base_gfn);
> +	if (!memslot) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	/* We require the region to be contained within a single memslot */
> +	if (memslot->base_gfn + memslot->npages < end_gfn) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	tmp_page = alloc_page(GFP_KERNEL);
> +	if (!tmp_page) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	mmap_read_lock(current->mm);
> +
> +	ipa = ipa_base;
> +
> +	while (ipa < ipa_end) {
> +		struct vm_area_struct *vma;
> +		unsigned long map_size;
> +		unsigned int vma_shift;
> +		unsigned long offset;
> +		unsigned long hva;
> +		struct page *page;
> +		kvm_pfn_t pfn;
> +		int level;
> +
> +		hva = gfn_to_hva_memslot(memslot, gpa_to_gfn(ipa));
> +		vma = vma_lookup(current->mm, hva);
> +		if (!vma) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +
> +		if (is_vm_hugetlb_page(vma))
> +			vma_shift = huge_page_shift(hstate_vma(vma));
> +		else
> +			vma_shift = PAGE_SHIFT;
> +
> +		map_size = 1 << vma_shift;
> +
> +		/*
> +		 * FIXME: This causes over mapping, but there's no good
> +		 * solution here with the ABI as it stands
> +		 */
> +		ipa = ALIGN_DOWN(ipa, map_size);
> +
> +		switch (map_size) {
> +		case RME_L2_BLOCK_SIZE:
> +			level = 2;
> +			break;
> +		case PAGE_SIZE:
> +			level = 3;
> +			break;
> +		default:
> +			WARN_ONCE(1, "Unsupported vma_shift %d", vma_shift);
> +			ret = -EFAULT;
> +			break;
> +		}
> +
> +		pfn = gfn_to_pfn_memslot(memslot, gpa_to_gfn(ipa));
> +
> +		if (is_error_pfn(pfn)) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +
> +		ret = rmi_rtt_init_ripas(rd, ipa, level);
> +		if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
> +			ret = realm_create_rtt_levels(realm, ipa,
> +						      RMI_RETURN_INDEX(ret),
> +						      level, NULL);
> +			if (ret)
> +				break;
> +			ret = rmi_rtt_init_ripas(rd, ipa, level);
> +			if (ret) {
> +				ret = -ENXIO;
> +				break;
> +			}
> +		}
> +
> +		if (level < RME_RTT_MAX_LEVEL) {
> +			/*
> +			 * A temporary RTT is needed during the map, precreate
> +			 * it, however if there is an error (e.g. missing
> +			 * parent tables) this will be handled in the
> +			 * realm_create_protected_data_page() call.
> +			 */
> +			realm_create_rtt_levels(realm, ipa, level,
> +						RME_RTT_MAX_LEVEL, NULL);
> +		}
> +
> +		page = pfn_to_page(pfn);
> +
> +		for (offset = 0; offset < map_size && !ret;
> +		     offset += PAGE_SIZE, page++) {
> +			phys_addr_t page_ipa = ipa + offset;
> +
> +			ret = realm_create_protected_data_page(realm, page_ipa,
> +							       page, tmp_page);
> +		}
> +		if (ret)
> +			goto err_release_pfn;
> +
> +		if (level == 2) {
> +			ret = fold_rtt(rd, ipa, level, realm);
> +			if (ret)
> +				goto err_release_pfn;
> +		}
> +
> +		ipa += map_size;

> +		kvm_set_pfn_accessed(pfn);
> +		kvm_set_pfn_dirty(pfn);

kvm_release_pfn_dirty() already calls kvm_set_pfn_{accessed,dirty}().

> +		kvm_release_pfn_dirty(pfn);
> +err_release_pfn:
> +		if (ret) {
> +			kvm_release_pfn_clean(pfn);
> +			break;
> +		}
> +	}
> +
> +	mmap_read_unlock(current->mm);
> +	__free_page(tmp_page);
> +
> +out:
> +	srcu_read_unlock(&kvm->srcu, idx);
> +	return ret;
> +}
> +
> +static int kvm_populate_realm(struct kvm *kvm,
> +			      struct kvm_cap_arm_rme_populate_realm_args *args)
> +{
> +	phys_addr_t ipa_base, ipa_end;
> +

Check kvm_is_realm(kvm) here or in kvm_realm_enable_cap().

> +	if (kvm_realm_state(kvm) != REALM_STATE_NEW)
> +		return -EBUSY;

Maybe -EINVAL? The realm hasn't been created (RMI_REALM_CREATE has not been
called yet). Userspace shouldn't reach this path.

> +
> +	if (!IS_ALIGNED(args->populate_ipa_base, PAGE_SIZE) ||
> +	    !IS_ALIGNED(args->populate_ipa_size, PAGE_SIZE))
> +		return -EINVAL;
> +
> +	ipa_base = args->populate_ipa_base;
> +	ipa_end = ipa_base + args->populate_ipa_size;
> +
> +	if (ipa_end < ipa_base)
> +		return -EINVAL;
> +
> +	return populate_par_region(kvm, ipa_base, ipa_end);
> +}
> +
>  static int set_ipa_state(struct kvm_vcpu *vcpu,
>  			 unsigned long ipa,
>  			 unsigned long end,
> @@ -748,6 +1102,18 @@ int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
>  		r = kvm_init_ipa_range_realm(kvm, &args);
>  		break;
>  	}
> +	case KVM_CAP_ARM_RME_POPULATE_REALM: {
> +		struct kvm_cap_arm_rme_populate_realm_args args;
> +		void __user *argp = u64_to_user_ptr(cap->args[1]);
> +
> +		if (copy_from_user(&args, argp, sizeof(args))) {
> +			r = -EFAULT;
> +			break;
> +		}
> +
> +		r = kvm_populate_realm(kvm, &args);
> +		break;
> +	}
>  	default:
>  		r = -EINVAL;
>  		break;
Steven Price March 10, 2023, 3:47 p.m. UTC | #2
On 06/03/2023 17:34, Zhi Wang wrote:
> On Fri, 27 Jan 2023 11:29:20 +0000
> Steven Price <steven.price@arm.com> wrote:

<snip>

> There seems to be no caller of the function above. Better to move it to
> the related patch.

Indeed this should really be in the next patch - will move as it's very
confusing having it in this patch (sorry about that).

<snip>

>> +		kvm_set_pfn_accessed(pfn);
>> +		kvm_set_pfn_dirty(pfn);
> 
> kvm_release_pfn_dirty() already calls kvm_set_pfn_{accessed,dirty}().

Will remove those calls.
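
The tail of the loop body then simply becomes (a sketch of the agreed
cleanup):

		ipa += map_size;
		kvm_release_pfn_dirty(pfn);	/* also marks the pfn accessed and dirty */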

<snip>

>> +static int kvm_populate_realm(struct kvm *kvm,
>> +			      struct kvm_cap_arm_rme_populate_realm_args *args)
>> +{
>> +	phys_addr_t ipa_base, ipa_end;
>> +
> 
> Check kvm_is_realm(kvm) here or in kvm_realm_enable_cap().

I'm going to update kvm_vm_ioctl_enable_cap() to check kvm_is_realm() so
we won't get here.
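
Something along these lines, presumably (a sketch only; the exact dispatch
and locking in kvm_vm_ioctl_enable_cap() are assumptions):

	case KVM_CAP_ARM_RME:
		r = -EINVAL;
		if (!kvm_is_realm(kvm))
			break;
		mutex_lock(&kvm->lock);
		r = kvm_realm_enable_cap(kvm, cap);
		mutex_unlock(&kvm->lock);
		break;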

>> +	if (kvm_realm_state(kvm) != REALM_STATE_NEW)
>> +		return -EBUSY;
> 
> Maybe -EINVAL? The realm hasn't been created (RMI_REALM_CREATE has not been
> called yet). Userspace shouldn't reach this path.

Well user space can attempt to populate in the ACTIVE state - which is
where the idea of 'busy' comes from. Admittedly it's a little confusing
when RMI_REALM_CREATE hasn't been called.

I'm not particularly bothered about the return code, but it's useful to
have a different code to -EINVAL as it's not an invalid argument, but
calling at the wrong time. I can't immediately see a better error code
though.

Steve

Zhi Wang March 14, 2023, 3:31 p.m. UTC | #3
On Fri, 10 Mar 2023 15:47:16 +0000
Steven Price <steven.price@arm.com> wrote:

> On 06/03/2023 17:34, Zhi Wang wrote:
> > On Fri, 27 Jan 2023 11:29:20 +0000
> > Steven Price <steven.price@arm.com> wrote:

<snip>

> >> +	if (kvm_realm_state(kvm) != REALM_STATE_NEW)
> >> +		return -EBUSY;  
> > 
> > Maybe -EINVAL? The realm hasn't been created (RMI_REALM_CREATE has not been
> > called yet). Userspace shouldn't reach this path.
> 
> Well user space can attempt to populate in the ACTIVE state - which is
> where the idea of 'busy' comes from. Admittedly it's a little confusing
> when RMI_REALM_CREATE hasn't been called.
> 
> I'm not particularly bothered about the return code, but it's useful to
> have a different code to -EINVAL as it's not an invalid argument, but
> calling at the wrong time. I can't immediately see a better error code
> though.
> 
The reason I feel -EBUSY is a little off is that EBUSY usually indicates
something is already initialized and currently running when another
calling path wants to operate on it.

I took a look at the ioctls in arch/arm64/kvm/arm.c. It seems people have
different opinions on the error code for calling a path at the wrong time:

For example:

long kvm_arch_vcpu_ioctl()
...
        case KVM_GET_REG_LIST: {
                struct kvm_reg_list __user *user_list = argp;
                struct kvm_reg_list reg_list;
                unsigned n;

                r = -ENOEXEC;
                if (unlikely(!kvm_vcpu_initialized(vcpu)))
                        break;

                r = -EPERM;
                if (!kvm_arm_vcpu_is_finalized(vcpu))
                        break;

If we have to choose one of these, I prefer -ENOEXEC, as -EPERM seems
stranger. But personally my vote goes to -EINVAL.

Steven Price March 22, 2023, 11:51 a.m. UTC | #4
On 14/03/2023 15:31, Zhi Wang wrote:
> On Fri, 10 Mar 2023 15:47:16 +0000
> Steven Price <steven.price@arm.com> wrote:
> 
>> On 06/03/2023 17:34, Zhi Wang wrote:
>>> On Fri, 27 Jan 2023 11:29:20 +0000
>>> Steven Price <steven.price@arm.com> wrote:

<snip>

>>>> +	if (kvm_realm_state(kvm) != REALM_STATE_NEW)
>>>> +		return -EBUSY;  
>>>
>>> Maybe -EINVAL? The realm hasn't been created (RMI_REALM_CREATE has not been
>>> called yet). Userspace shouldn't reach this path.
>>
>> Well user space can attempt to populate in the ACTIVE state - which is
>> where the idea of 'busy' comes from. Admittedly it's a little confusing
>> when RMI_REALM_CREATE hasn't been called.
>>
>> I'm not particularly bothered about the return code, but it's useful to
>> have a different code to -EINVAL as it's not an invalid argument, but
>> calling at the wrong time. I can't immediately see a better error code
>> though.
>>
> The reason I feel -EBUSY is a little off is that EBUSY usually indicates
> something is already initialized and currently running when another
> calling path wants to operate on it.
> 
> I took a look at the ioctls in arch/arm64/kvm/arm.c. It seems people have
> different opinions on the error code for calling a path at the wrong time:
> 
> For example:
> 
> long kvm_arch_vcpu_ioctl()
> ...
>         case KVM_GET_REG_LIST: {
>                 struct kvm_reg_list __user *user_list = argp;
>                 struct kvm_reg_list reg_list;
>                 unsigned n;
> 
>                 r = -ENOEXEC;
>                 if (unlikely(!kvm_vcpu_initialized(vcpu)))
>                         break;
> 
>                 r = -EPERM;
>                 if (!kvm_arm_vcpu_is_finalized(vcpu))
>                         break;
> 
> If we have to choose one of these, I prefer -ENOEXEC, as -EPERM seems
> stranger. But personally my vote goes to -EINVAL.

Ok, I think you've convinced me - I'll change to -EINVAL. It is invalid
use of the API and none of the other error codes seem a great fit.
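
As a diff on top of this patch the agreed change is just:

 	if (kvm_realm_state(kvm) != REALM_STATE_NEW)
-		return -EBUSY;
+		return -EINVAL;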

Although I do wish Linux had more descriptive error codes - I often end
up peppering the kernel with a few printks when using a new API to find
out what I'm doing wrong.

Steve


Patch

diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c
index 16e0bfea98b1..3405b43e1421 100644
--- a/arch/arm64/kvm/rme.c
+++ b/arch/arm64/kvm/rme.c
@@ -4,6 +4,7 @@ 
  */
 
 #include <linux/kvm_host.h>
+#include <linux/hugetlb.h>
 
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_mmu.h>
@@ -426,6 +427,359 @@  void kvm_realm_unmap_range(struct kvm *kvm, unsigned long ipa, u64 size)
 	}
 }
 
+static int realm_create_protected_data_page(struct realm *realm,
+					    unsigned long ipa,
+					    struct page *dst_page,
+					    struct page *tmp_page)
+{
+	phys_addr_t dst_phys, tmp_phys;
+	int ret;
+
+	copy_page(page_address(tmp_page), page_address(dst_page));
+
+	dst_phys = page_to_phys(dst_page);
+	tmp_phys = page_to_phys(tmp_page);
+
+	if (rmi_granule_delegate(dst_phys))
+		return -ENXIO;
+
+	ret = rmi_data_create(dst_phys, virt_to_phys(realm->rd), ipa, tmp_phys,
+			      RMI_MEASURE_CONTENT);
+
+	if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
+		/* Create missing RTTs and retry */
+		int level = RMI_RETURN_INDEX(ret);
+
+		ret = realm_create_rtt_levels(realm, ipa, level,
+					      RME_RTT_MAX_LEVEL, NULL);
+		if (ret)
+			goto err;
+
+		ret = rmi_data_create(dst_phys, virt_to_phys(realm->rd), ipa,
+				      tmp_phys, RMI_MEASURE_CONTENT);
+	}
+
+	if (ret)
+		goto err;
+
+	return 0;
+
+err:
+	if (WARN_ON(rmi_granule_undelegate(dst_phys))) {
+		/* Page can't be returned to NS world so is lost */
+		get_page(dst_page);
+	}
+	return -ENXIO;
+}
+
+static int fold_rtt(phys_addr_t rd, unsigned long addr, int level,
+		    struct realm *realm)
+{
+	struct rtt_entry rtt;
+	phys_addr_t rtt_addr;
+
+	if (rmi_rtt_read_entry(rd, addr, level, &rtt))
+		return -ENXIO;
+
+	if (rtt.state != RMI_TABLE)
+		return -EINVAL;
+
+	rtt_addr = rmi_rtt_get_phys(&rtt);
+	if (rmi_rtt_fold(rtt_addr, rd, addr, level + 1))
+		return -ENXIO;
+
+	free_delegated_page(realm, rtt_addr);
+
+	return 0;
+}
+
+int realm_map_protected(struct realm *realm,
+			unsigned long hva,
+			unsigned long base_ipa,
+			struct page *dst_page,
+			unsigned long map_size,
+			struct kvm_mmu_memory_cache *memcache)
+{
+	phys_addr_t dst_phys = page_to_phys(dst_page);
+	phys_addr_t rd = virt_to_phys(realm->rd);
+	unsigned long phys = dst_phys;
+	unsigned long ipa = base_ipa;
+	unsigned long size;
+	int map_level;
+	int ret = 0;
+
+	if (WARN_ON(!IS_ALIGNED(ipa, map_size)))
+		return -EINVAL;
+
+	switch (map_size) {
+	case PAGE_SIZE:
+		map_level = 3;
+		break;
+	case RME_L2_BLOCK_SIZE:
+		map_level = 2;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (map_level < RME_RTT_MAX_LEVEL) {
+		/*
+		 * A temporary RTT is needed during the map, precreate it,
+		 * however if there is an error (e.g. missing parent tables)
+		 * this will be handled below.
+		 */
+		realm_create_rtt_levels(realm, ipa, map_level,
+					RME_RTT_MAX_LEVEL, memcache);
+	}
+
+	for (size = 0; size < map_size; size += PAGE_SIZE) {
+		if (rmi_granule_delegate(phys)) {
+			struct rtt_entry rtt;
+
+			/*
+			 * It's possible we raced with another VCPU on the same
+			 * fault. If the entry exists and matches then exit
+			 * early and assume the other VCPU will handle the
+			 * mapping.
+			 */
+			if (rmi_rtt_read_entry(rd, ipa, RME_RTT_MAX_LEVEL, &rtt))
+				goto err;
+
+			// FIXME: For a block mapping this could race at level
+			// 2 or 3...
+			if (WARN_ON((rtt.walk_level != RME_RTT_MAX_LEVEL ||
+				     rtt.state != RMI_ASSIGNED ||
+				     rtt.desc != phys))) {
+				goto err;
+			}
+
+			return 0;
+		}
+
+		ret = rmi_data_create_unknown(phys, rd, ipa);
+
+		if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
+			/* Create missing RTTs and retry */
+			int level = RMI_RETURN_INDEX(ret);
+
+			ret = realm_create_rtt_levels(realm, ipa, level,
+						      RME_RTT_MAX_LEVEL,
+						      memcache);
+			WARN_ON(ret);
+			if (ret)
+				goto err_undelegate;
+
+			ret = rmi_data_create_unknown(phys, rd, ipa);
+		}
+		WARN_ON(ret);
+
+		if (ret)
+			goto err_undelegate;
+
+		phys += PAGE_SIZE;
+		ipa += PAGE_SIZE;
+	}
+
+	if (map_size == RME_L2_BLOCK_SIZE)
+		ret = fold_rtt(rd, base_ipa, map_level, realm);
+	if (WARN_ON(ret))
+		goto err;
+
+	return 0;
+
+err_undelegate:
+	if (WARN_ON(rmi_granule_undelegate(phys))) {
+		/* Page can't be returned to NS world so is lost */
+		get_page(phys_to_page(phys));
+	}
+err:
+	while (size > 0) {
+		phys -= PAGE_SIZE;
+		size -= PAGE_SIZE;
+		ipa -= PAGE_SIZE;
+
+		rmi_data_destroy(rd, ipa);
+
+		if (WARN_ON(rmi_granule_undelegate(phys))) {
+			/* Page can't be returned to NS world so is lost */
+			get_page(phys_to_page(phys));
+		}
+	}
+	return -ENXIO;
+}
+
+static int populate_par_region(struct kvm *kvm,
+			       phys_addr_t ipa_base,
+			       phys_addr_t ipa_end)
+{
+	struct realm *realm = &kvm->arch.realm;
+	struct kvm_memory_slot *memslot;
+	gfn_t base_gfn, end_gfn;
+	int idx;
+	phys_addr_t ipa;
+	int ret = 0;
+	struct page *tmp_page;
+	phys_addr_t rd = virt_to_phys(realm->rd);
+
+	base_gfn = gpa_to_gfn(ipa_base);
+	end_gfn = gpa_to_gfn(ipa_end);
+
+	idx = srcu_read_lock(&kvm->srcu);
+	memslot = gfn_to_memslot(kvm, base_gfn);
+	if (!memslot) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* We require the region to be contained within a single memslot */
+	if (memslot->base_gfn + memslot->npages < end_gfn) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	tmp_page = alloc_page(GFP_KERNEL);
+	if (!tmp_page) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	mmap_read_lock(current->mm);
+
+	ipa = ipa_base;
+
+	while (ipa < ipa_end) {
+		struct vm_area_struct *vma;
+		unsigned long map_size;
+		unsigned int vma_shift;
+		unsigned long offset;
+		unsigned long hva;
+		struct page *page;
+		kvm_pfn_t pfn;
+		int level;
+
+		hva = gfn_to_hva_memslot(memslot, gpa_to_gfn(ipa));
+		vma = vma_lookup(current->mm, hva);
+		if (!vma) {
+			ret = -EFAULT;
+			break;
+		}
+
+		if (is_vm_hugetlb_page(vma))
+			vma_shift = huge_page_shift(hstate_vma(vma));
+		else
+			vma_shift = PAGE_SHIFT;
+
+		map_size = 1 << vma_shift;
+
+		/*
+		 * FIXME: This causes over mapping, but there's no good
+		 * solution here with the ABI as it stands
+		 */
+		ipa = ALIGN_DOWN(ipa, map_size);
+
+		switch (map_size) {
+		case RME_L2_BLOCK_SIZE:
+			level = 2;
+			break;
+		case PAGE_SIZE:
+			level = 3;
+			break;
+		default:
+			WARN_ONCE(1, "Unsupported vma_shift %d", vma_shift);
+			ret = -EFAULT;
+			break;
+		}
+
+		pfn = gfn_to_pfn_memslot(memslot, gpa_to_gfn(ipa));
+
+		if (is_error_pfn(pfn)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = rmi_rtt_init_ripas(rd, ipa, level);
+		if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
+			ret = realm_create_rtt_levels(realm, ipa,
+						      RMI_RETURN_INDEX(ret),
+						      level, NULL);
+			if (ret)
+				break;
+			ret = rmi_rtt_init_ripas(rd, ipa, level);
+			if (ret) {
+				ret = -ENXIO;
+				break;
+			}
+		}
+
+		if (level < RME_RTT_MAX_LEVEL) {
+			/*
+			 * A temporary RTT is needed during the map, precreate
+			 * it, however if there is an error (e.g. missing
+			 * parent tables) this will be handled in the
+			 * realm_create_protected_data_page() call.
+			 */
+			realm_create_rtt_levels(realm, ipa, level,
+						RME_RTT_MAX_LEVEL, NULL);
+		}
+
+		page = pfn_to_page(pfn);
+
+		for (offset = 0; offset < map_size && !ret;
+		     offset += PAGE_SIZE, page++) {
+			phys_addr_t page_ipa = ipa + offset;
+
+			ret = realm_create_protected_data_page(realm, page_ipa,
+							       page, tmp_page);
+		}
+		if (ret)
+			goto err_release_pfn;
+
+		if (level == 2) {
+			ret = fold_rtt(rd, ipa, level, realm);
+			if (ret)
+				goto err_release_pfn;
+		}
+
+		ipa += map_size;
+		kvm_set_pfn_accessed(pfn);
+		kvm_set_pfn_dirty(pfn);
+		kvm_release_pfn_dirty(pfn);
+err_release_pfn:
+		if (ret) {
+			kvm_release_pfn_clean(pfn);
+			break;
+		}
+	}
+
+	mmap_read_unlock(current->mm);
+	__free_page(tmp_page);
+
+out:
+	srcu_read_unlock(&kvm->srcu, idx);
+	return ret;
+}
+
+static int kvm_populate_realm(struct kvm *kvm,
+			      struct kvm_cap_arm_rme_populate_realm_args *args)
+{
+	phys_addr_t ipa_base, ipa_end;
+
+	if (kvm_realm_state(kvm) != REALM_STATE_NEW)
+		return -EBUSY;
+
+	if (!IS_ALIGNED(args->populate_ipa_base, PAGE_SIZE) ||
+	    !IS_ALIGNED(args->populate_ipa_size, PAGE_SIZE))
+		return -EINVAL;
+
+	ipa_base = args->populate_ipa_base;
+	ipa_end = ipa_base + args->populate_ipa_size;
+
+	if (ipa_end < ipa_base)
+		return -EINVAL;
+
+	return populate_par_region(kvm, ipa_base, ipa_end);
+}
+
 static int set_ipa_state(struct kvm_vcpu *vcpu,
 			 unsigned long ipa,
 			 unsigned long end,
@@ -748,6 +1102,18 @@  int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 		r = kvm_init_ipa_range_realm(kvm, &args);
 		break;
 	}
+	case KVM_CAP_ARM_RME_POPULATE_REALM: {
+		struct kvm_cap_arm_rme_populate_realm_args args;
+		void __user *argp = u64_to_user_ptr(cap->args[1]);
+
+		if (copy_from_user(&args, argp, sizeof(args))) {
+			r = -EFAULT;
+			break;
+		}
+
+		r = kvm_populate_realm(kvm, &args);
+		break;
+	}
 	default:
 		r = -EINVAL;
 		break;