
[v12,7/8] KVM: arm64: ioctl to fetch/store tags in a guest

Message ID 20210517123239.8025-8-steven.price@arm.com (mailing list archive)
State New, archived
Series MTE support for KVM guest

Commit Message

Steven Price May 17, 2021, 12:32 p.m. UTC
The VMM may not wish to have its own mapping of guest memory mapped
with PROT_MTE because this causes problems if the VMM has tag checking
enabled (the guest controls the tags in physical RAM and it's unlikely
the tags are correct for the VMM).

Instead add a new ioctl which allows the VMM to easily read/write the
tags from guest memory, allowing the VMM's mapping to be non-PROT_MTE
while the VMM can still read/write the tags for the purpose of
migration.

Signed-off-by: Steven Price <steven.price@arm.com>
---
 arch/arm64/include/uapi/asm/kvm.h | 11 +++++
 arch/arm64/kvm/arm.c              | 69 +++++++++++++++++++++++++++++++
 include/uapi/linux/kvm.h          |  1 +
 3 files changed, 81 insertions(+)
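
For illustration, a minimal usage sketch of the new ioctl as a VMM
might call it to save tags during migration (assumes kernel headers
with this patch applied; the vm fd and the IPA range come from the
VMM's own state, and the destination buffer holds one tag byte per
16-byte MTE granule, i.e. length / 16 bytes):

----8<----
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int save_guest_tags(int vm_fd, uint64_t guest_ipa, uint64_t length,
			   void *tag_buf)
{
	struct kvm_arm_copy_mte_tags copy = {
		.guest_ipa = guest_ipa,		/* must be page aligned */
		.length    = length,		/* multiple of PAGE_SIZE */
		.addr      = tag_buf,		/* length / 16 bytes of tags */
		.flags     = KVM_ARM_TAGS_FROM_GUEST,
		/* .reserved[] must remain zero */
	};

	return ioctl(vm_fd, KVM_ARM_MTE_COPY_TAGS, &copy);
}
----8<----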

Comments

Marc Zyngier May 17, 2021, 6:04 p.m. UTC | #1
On Mon, 17 May 2021 13:32:38 +0100,
Steven Price <steven.price@arm.com> wrote:
> 
> The VMM may not wish to have it's own mapping of guest memory mapped
> with PROT_MTE because this causes problems if the VMM has tag checking
> enabled (the guest controls the tags in physical RAM and it's unlikely
> the tags are correct for the VMM).
> 
> Instead add a new ioctl which allows the VMM to easily read/write the
> tags from guest memory, allowing the VMM's mapping to be non-PROT_MTE
> while the VMM can still read/write the tags for the purpose of
> migration.
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
>  arch/arm64/include/uapi/asm/kvm.h | 11 +++++
>  arch/arm64/kvm/arm.c              | 69 +++++++++++++++++++++++++++++++
>  include/uapi/linux/kvm.h          |  1 +
>  3 files changed, 81 insertions(+)
> 
> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
> index 24223adae150..b3edde68bc3e 100644
> --- a/arch/arm64/include/uapi/asm/kvm.h
> +++ b/arch/arm64/include/uapi/asm/kvm.h
> @@ -184,6 +184,17 @@ struct kvm_vcpu_events {
>  	__u32 reserved[12];
>  };
>  
> +struct kvm_arm_copy_mte_tags {
> +	__u64 guest_ipa;
> +	__u64 length;
> +	void __user *addr;
> +	__u64 flags;
> +	__u64 reserved[2];
> +};
> +
> +#define KVM_ARM_TAGS_TO_GUEST		0
> +#define KVM_ARM_TAGS_FROM_GUEST		1
> +
>  /* If you need to interpret the index values, here is the key: */
>  #define KVM_REG_ARM_COPROC_MASK		0x000000000FFF0000
>  #define KVM_REG_ARM_COPROC_SHIFT	16
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index e89a5e275e25..4b6c83beb75d 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -1309,6 +1309,65 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
>  	}
>  }
>  
> +static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
> +				      struct kvm_arm_copy_mte_tags *copy_tags)
> +{
> +	gpa_t guest_ipa = copy_tags->guest_ipa;
> +	size_t length = copy_tags->length;
> +	void __user *tags = copy_tags->addr;
> +	gpa_t gfn;
> +	bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST);
> +	int ret = 0;
> +
> +	if (copy_tags->reserved[0] || copy_tags->reserved[1])
> +		return -EINVAL;
> +
> +	if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST)
> +		return -EINVAL;
> +
> +	if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK)
> +		return -EINVAL;
> +
> +	gfn = gpa_to_gfn(guest_ipa);
> +
> +	mutex_lock(&kvm->slots_lock);
> +
> +	while (length > 0) {
> +		kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
> +		void *maddr;
> +		unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE;

nit: this is a compile time constant, make it a #define. This will
avoid the confusing overloading of "num_tags" as both an input and an
output for the mte_copy_tags_*() functions.
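
Something along these lines, using the MTE_GRANULES_PER_PAGE name that
comes up later in this thread (the exact header it lives in is an
assumption):

	/* e.g. in arch/arm64/include/asm/mte-def.h */
	#define MTE_GRANULES_PER_PAGE	(PAGE_SIZE / MTE_GRANULE_SIZE)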

> +
> +		if (is_error_noslot_pfn(pfn)) {
> +			ret = -EFAULT;
> +			goto out;
> +		}
> +
> +		maddr = page_address(pfn_to_page(pfn));
> +
> +		if (!write) {
> +			num_tags = mte_copy_tags_to_user(tags, maddr, num_tags);
> +			kvm_release_pfn_clean(pfn);
> +		} else {
> +			num_tags = mte_copy_tags_from_user(maddr, tags,
> +							   num_tags);
> +			kvm_release_pfn_dirty(pfn);
> +		}
> +
> +		if (num_tags != PAGE_SIZE / MTE_GRANULE_SIZE) {
> +			ret = -EFAULT;
> +			goto out;
> +		}
> +
> +		gfn++;
> +		tags += num_tags;
> +		length -= PAGE_SIZE;
> +	}
> +
> +out:
> +	mutex_unlock(&kvm->slots_lock);
> +	return ret;
> +}
> +

nit again: I'd really prefer it if you moved this to guest.c, where we
already have a bunch of the save/restore stuff.

>  long kvm_arch_vm_ioctl(struct file *filp,
>  		       unsigned int ioctl, unsigned long arg)
>  {
> @@ -1345,6 +1404,16 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  
>  		return 0;
>  	}
> +	case KVM_ARM_MTE_COPY_TAGS: {
> +		struct kvm_arm_copy_mte_tags copy_tags;
> +
> +		if (!kvm_has_mte(kvm))
> +			return -EINVAL;
> +
> +		if (copy_from_user(&copy_tags, argp, sizeof(copy_tags)))
> +			return -EFAULT;
> +		return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags);
> +	}
>  	default:
>  		return -EINVAL;
>  	}
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 8c95ba0fadda..4c011c60d468 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1428,6 +1428,7 @@ struct kvm_s390_ucas_mapping {
>  /* Available with KVM_CAP_PMU_EVENT_FILTER */
>  #define KVM_SET_PMU_EVENT_FILTER  _IOW(KVMIO,  0xb2, struct kvm_pmu_event_filter)
>  #define KVM_PPC_SVM_OFF		  _IO(KVMIO,  0xb3)
> +#define KVM_ARM_MTE_COPY_TAGS	  _IOR(KVMIO,  0xb4, struct kvm_arm_copy_mte_tags)
>  
>  /* ioctl for vm fd */
>  #define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)

Thanks,

	M.
Steven Price May 19, 2021, 1:51 p.m. UTC | #2
On 17/05/2021 19:04, Marc Zyngier wrote:
> On Mon, 17 May 2021 13:32:38 +0100,
> Steven Price <steven.price@arm.com> wrote:
>>
>> The VMM may not wish to have it's own mapping of guest memory mapped
>> with PROT_MTE because this causes problems if the VMM has tag checking
>> enabled (the guest controls the tags in physical RAM and it's unlikely
>> the tags are correct for the VMM).
>>
>> Instead add a new ioctl which allows the VMM to easily read/write the
>> tags from guest memory, allowing the VMM's mapping to be non-PROT_MTE
>> while the VMM can still read/write the tags for the purpose of
>> migration.
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>>  arch/arm64/include/uapi/asm/kvm.h | 11 +++++
>>  arch/arm64/kvm/arm.c              | 69 +++++++++++++++++++++++++++++++
>>  include/uapi/linux/kvm.h          |  1 +
>>  3 files changed, 81 insertions(+)
>>
>> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
>> index 24223adae150..b3edde68bc3e 100644
>> --- a/arch/arm64/include/uapi/asm/kvm.h
>> +++ b/arch/arm64/include/uapi/asm/kvm.h
>> @@ -184,6 +184,17 @@ struct kvm_vcpu_events {
>>  	__u32 reserved[12];
>>  };
>>  
>> +struct kvm_arm_copy_mte_tags {
>> +	__u64 guest_ipa;
>> +	__u64 length;
>> +	void __user *addr;
>> +	__u64 flags;
>> +	__u64 reserved[2];
>> +};
>> +
>> +#define KVM_ARM_TAGS_TO_GUEST		0
>> +#define KVM_ARM_TAGS_FROM_GUEST		1
>> +
>>  /* If you need to interpret the index values, here is the key: */
>>  #define KVM_REG_ARM_COPROC_MASK		0x000000000FFF0000
>>  #define KVM_REG_ARM_COPROC_SHIFT	16
>> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
>> index e89a5e275e25..4b6c83beb75d 100644
>> --- a/arch/arm64/kvm/arm.c
>> +++ b/arch/arm64/kvm/arm.c
>> @@ -1309,6 +1309,65 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
>>  	}
>>  }
>>  
>> +static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
>> +				      struct kvm_arm_copy_mte_tags *copy_tags)
>> +{
>> +	gpa_t guest_ipa = copy_tags->guest_ipa;
>> +	size_t length = copy_tags->length;
>> +	void __user *tags = copy_tags->addr;
>> +	gpa_t gfn;
>> +	bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST);
>> +	int ret = 0;
>> +
>> +	if (copy_tags->reserved[0] || copy_tags->reserved[1])
>> +		return -EINVAL;
>> +
>> +	if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST)
>> +		return -EINVAL;
>> +
>> +	if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK)
>> +		return -EINVAL;
>> +
>> +	gfn = gpa_to_gfn(guest_ipa);
>> +
>> +	mutex_lock(&kvm->slots_lock);
>> +
>> +	while (length > 0) {
>> +		kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
>> +		void *maddr;
>> +		unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE;
> 
> nit: this is a compile time constant, make it a #define. This will
> avoid the confusing overloading of "num_tags" as both an input and an
> output for the mte_copy_tags-* functions.

No problem, I agree my usage of num_tags wasn't very clear.

>> +
>> +		if (is_error_noslot_pfn(pfn)) {
>> +			ret = -EFAULT;
>> +			goto out;
>> +		}
>> +
>> +		maddr = page_address(pfn_to_page(pfn));
>> +
>> +		if (!write) {
>> +			num_tags = mte_copy_tags_to_user(tags, maddr, num_tags);
>> +			kvm_release_pfn_clean(pfn);
>> +		} else {
>> +			num_tags = mte_copy_tags_from_user(maddr, tags,
>> +							   num_tags);
>> +			kvm_release_pfn_dirty(pfn);
>> +		}
>> +
>> +		if (num_tags != PAGE_SIZE / MTE_GRANULE_SIZE) {
>> +			ret = -EFAULT;
>> +			goto out;
>> +		}
>> +
>> +		gfn++;
>> +		tags += num_tags;
>> +		length -= PAGE_SIZE;
>> +	}
>> +
>> +out:
>> +	mutex_unlock(&kvm->slots_lock);
>> +	return ret;
>> +}
>> +
> 
> nit again: I'd really prefer it if you moved this to guest.c, where we
> already have a bunch of the save/restore stuff.

Sure - I'll move it across.

Thanks,

Steve
Catalin Marinas May 20, 2021, 12:05 p.m. UTC | #3
On Mon, May 17, 2021 at 01:32:38PM +0100, Steven Price wrote:
> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
> index 24223adae150..b3edde68bc3e 100644
> --- a/arch/arm64/include/uapi/asm/kvm.h
> +++ b/arch/arm64/include/uapi/asm/kvm.h
> @@ -184,6 +184,17 @@ struct kvm_vcpu_events {
>  	__u32 reserved[12];
>  };
>  
> +struct kvm_arm_copy_mte_tags {
> +	__u64 guest_ipa;
> +	__u64 length;
> +	void __user *addr;
> +	__u64 flags;
> +	__u64 reserved[2];

I forgot the past discussions, but what are the reserved fields for?
Future expansion?

> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index e89a5e275e25..4b6c83beb75d 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -1309,6 +1309,65 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
>  	}
>  }
>  
> +static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
> +				      struct kvm_arm_copy_mte_tags *copy_tags)
> +{
> +	gpa_t guest_ipa = copy_tags->guest_ipa;
> +	size_t length = copy_tags->length;
> +	void __user *tags = copy_tags->addr;
> +	gpa_t gfn;
> +	bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST);
> +	int ret = 0;
> +
> +	if (copy_tags->reserved[0] || copy_tags->reserved[1])
> +		return -EINVAL;
> +
> +	if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST)
> +		return -EINVAL;
> +
> +	if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK)
> +		return -EINVAL;
> +
> +	gfn = gpa_to_gfn(guest_ipa);
> +
> +	mutex_lock(&kvm->slots_lock);
> +
> +	while (length > 0) {
> +		kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
> +		void *maddr;
> +		unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE;
> +
> +		if (is_error_noslot_pfn(pfn)) {
> +			ret = -EFAULT;
> +			goto out;
> +		}
> +
> +		maddr = page_address(pfn_to_page(pfn));
> +
> +		if (!write) {
> +			num_tags = mte_copy_tags_to_user(tags, maddr, num_tags);
> +			kvm_release_pfn_clean(pfn);

Do we need to check if PG_mte_tagged is set? If the page was not faulted
into the guest address space but the VMM has the page, does the
gfn_to_pfn_prot() guarantee that a kvm_set_spte_gfn() was called? If
not, this may read stale tags.

> +		} else {
> +			num_tags = mte_copy_tags_from_user(maddr, tags,
> +							   num_tags);
> +			kvm_release_pfn_dirty(pfn);
> +		}

Same question here: if we can't guarantee the stage 2 pte being set,
we'd need to set PG_mte_tagged.

> +
> +		if (num_tags != PAGE_SIZE / MTE_GRANULE_SIZE) {
> +			ret = -EFAULT;
> +			goto out;
> +		}
> +
> +		gfn++;
> +		tags += num_tags;
> +		length -= PAGE_SIZE;
> +	}
> +
> +out:
> +	mutex_unlock(&kvm->slots_lock);
> +	return ret;
> +}
> +
Steven Price May 20, 2021, 3:58 p.m. UTC | #4
On 20/05/2021 13:05, Catalin Marinas wrote:
> On Mon, May 17, 2021 at 01:32:38PM +0100, Steven Price wrote:
>> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
>> index 24223adae150..b3edde68bc3e 100644
>> --- a/arch/arm64/include/uapi/asm/kvm.h
>> +++ b/arch/arm64/include/uapi/asm/kvm.h
>> @@ -184,6 +184,17 @@ struct kvm_vcpu_events {
>>  	__u32 reserved[12];
>>  };
>>  
>> +struct kvm_arm_copy_mte_tags {
>> +	__u64 guest_ipa;
>> +	__u64 length;
>> +	void __user *addr;
>> +	__u64 flags;
>> +	__u64 reserved[2];
> 
> I forgot the past discussions, what's the reserved for? Future
> expansion?

Yes - for future expansion. Marc asked for them[1]:

> I'd be keen on a couple of reserved __u64s. Just in case...

[1] https://lore.kernel.org/r/87ft14xl9e.wl-maz%40kernel.org

>> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
>> index e89a5e275e25..4b6c83beb75d 100644
>> --- a/arch/arm64/kvm/arm.c
>> +++ b/arch/arm64/kvm/arm.c
>> @@ -1309,6 +1309,65 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
>>  	}
>>  }
>>  
>> +static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
>> +				      struct kvm_arm_copy_mte_tags *copy_tags)
>> +{
>> +	gpa_t guest_ipa = copy_tags->guest_ipa;
>> +	size_t length = copy_tags->length;
>> +	void __user *tags = copy_tags->addr;
>> +	gpa_t gfn;
>> +	bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST);
>> +	int ret = 0;
>> +
>> +	if (copy_tags->reserved[0] || copy_tags->reserved[1])
>> +		return -EINVAL;
>> +
>> +	if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST)
>> +		return -EINVAL;
>> +
>> +	if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK)
>> +		return -EINVAL;
>> +
>> +	gfn = gpa_to_gfn(guest_ipa);
>> +
>> +	mutex_lock(&kvm->slots_lock);
>> +
>> +	while (length > 0) {
>> +		kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
>> +		void *maddr;
>> +		unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE;
>> +
>> +		if (is_error_noslot_pfn(pfn)) {
>> +			ret = -EFAULT;
>> +			goto out;
>> +		}
>> +
>> +		maddr = page_address(pfn_to_page(pfn));
>> +
>> +		if (!write) {
>> +			num_tags = mte_copy_tags_to_user(tags, maddr, num_tags);
>> +			kvm_release_pfn_clean(pfn);
> 
> Do we need to check if PG_mte_tagged is set? If the page was not faulted
> into the guest address space but the VMM has the page, does the
> gfn_to_pfn_prot() guarantee that a kvm_set_spte_gfn() was called? If
> not, this may read stale tags.

Ah, I hadn't thought about that... No I don't believe gfn_to_pfn_prot()
will fault it into the guest.

>> +		} else {
>> +			num_tags = mte_copy_tags_from_user(maddr, tags,
>> +							   num_tags);
>> +			kvm_release_pfn_dirty(pfn);
>> +		}
> 
> Same question here, if the we can't guarantee the stage 2 pte being set,
> we'd need to set PG_mte_tagged.

This is arguably worse as we'll be writing tags into the guest but
without setting PG_mte_tagged - so they'll be lost when the guest then
faults the pages in. Which sounds like it should break migration.

I think the below should be safe, and avoids the overhead of setting the
flag just for reads.

Thanks,

Steve

----8<----
		page = pfn_to_page(pfn);
		maddr = page_address(page);

		if (!write) {
			if (test_bit(PG_mte_tagged, &page->flags))
				num_tags = mte_copy_tags_to_user(tags, maddr,
							MTE_GRANULES_PER_PAGE);
			else
				/* No tags in memory, so write zeros */
				num_tags = MTE_GRANULES_PER_PAGE -
					clear_user(tags, MTE_GRANULES_PER_PAGE);
			kvm_release_pfn_clean(pfn);
		} else {
			num_tags = mte_copy_tags_from_user(maddr, tags,
							MTE_GRANULES_PER_PAGE);
			kvm_release_pfn_dirty(pfn);
		}

		if (num_tags != MTE_GRANULES_PER_PAGE) {
			ret = -EFAULT;
			goto out;
		}

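		/*
		 * Set the flag only after the tags were fully written, so a
		 * failed partial write never exposes tags which were not
		 * written.
		 */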
		if (write)
			test_and_set_bit(PG_mte_tagged, &page->flags);
Catalin Marinas May 20, 2021, 5:27 p.m. UTC | #5
On Thu, May 20, 2021 at 04:58:01PM +0100, Steven Price wrote:
> On 20/05/2021 13:05, Catalin Marinas wrote:
> > On Mon, May 17, 2021 at 01:32:38PM +0100, Steven Price wrote:
> >> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> >> index e89a5e275e25..4b6c83beb75d 100644
> >> --- a/arch/arm64/kvm/arm.c
> >> +++ b/arch/arm64/kvm/arm.c
> >> @@ -1309,6 +1309,65 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
> >>  	}
> >>  }
> >>  
> >> +static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
> >> +				      struct kvm_arm_copy_mte_tags *copy_tags)
> >> +{
> >> +	gpa_t guest_ipa = copy_tags->guest_ipa;
> >> +	size_t length = copy_tags->length;
> >> +	void __user *tags = copy_tags->addr;
> >> +	gpa_t gfn;
> >> +	bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST);
> >> +	int ret = 0;
> >> +
> >> +	if (copy_tags->reserved[0] || copy_tags->reserved[1])
> >> +		return -EINVAL;
> >> +
> >> +	if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST)
> >> +		return -EINVAL;
> >> +
> >> +	if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK)
> >> +		return -EINVAL;
> >> +
> >> +	gfn = gpa_to_gfn(guest_ipa);
> >> +
> >> +	mutex_lock(&kvm->slots_lock);
> >> +
> >> +	while (length > 0) {
> >> +		kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
> >> +		void *maddr;
> >> +		unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE;
> >> +
> >> +		if (is_error_noslot_pfn(pfn)) {
> >> +			ret = -EFAULT;
> >> +			goto out;
> >> +		}
> >> +
> >> +		maddr = page_address(pfn_to_page(pfn));
> >> +
> >> +		if (!write) {
> >> +			num_tags = mte_copy_tags_to_user(tags, maddr, num_tags);
> >> +			kvm_release_pfn_clean(pfn);
> > 
> > Do we need to check if PG_mte_tagged is set? If the page was not faulted
> > into the guest address space but the VMM has the page, does the
> > gfn_to_pfn_prot() guarantee that a kvm_set_spte_gfn() was called? If
> > not, this may read stale tags.
> 
> Ah, I hadn't thought about that... No I don't believe gfn_to_pfn_prot()
> will fault it into the guest.

It doesn't indeed. What it does is a get_user_pages() but it's not of
much help since the VMM pte wouldn't be tagged (we would have solved
lots of problems if we required PROT_MTE in the VMM...)

> >> +		} else {
> >> +			num_tags = mte_copy_tags_from_user(maddr, tags,
> >> +							   num_tags);
> >> +			kvm_release_pfn_dirty(pfn);
> >> +		}
> > 
> > Same question here, if the we can't guarantee the stage 2 pte being set,
> > we'd need to set PG_mte_tagged.
> 
> This is arguably worse as we'll be writing tags into the guest but
> without setting PG_mte_tagged - so they'll be lost when the guest then
> faults the pages in. Which sounds like it should break migration.
> 
> I think the below should be safe, and avoids the overhead of setting the
> flag just for reads.
> 
> Thanks,
> 
> Steve
> 
> ----8<----
> 		page = pfn_to_page(pfn);
> 		maddr = page_address(page);
> 
> 		if (!write) {
> 			if (test_bit(PG_mte_tagged, &page->flags))
> 				num_tags = mte_copy_tags_to_user(tags, maddr,
> 							MTE_GRANULES_PER_PAGE);
> 			else
> 				/* No tags in memory, so write zeros */
> 				num_tags = MTE_GRANULES_PER_PAGE -
> 					clear_user(tag, MTE_GRANULES_PER_PAGE);
> 			kvm_release_pfn_clean(pfn);

For ptrace we return a -EOPNOTSUPP if the address doesn't have VM_MTE
but I don't think it makes sense here, so I'm fine with clearing the
destination and assuming that the tags are zero (as they'd be on
faulting into the guest).

Another thing I forgot to ask: what's guaranteeing that the page
supports tags? Could this ioctl end up attempting the tag copying on
some device mapping? Do we need a kvm_is_device_pfn() check? I guess
ZONE_DEVICE memory we just refuse to map in an earlier patch.

> 		} else {
> 			num_tags = mte_copy_tags_from_user(maddr, tags,
> 							MTE_GRANULES_PER_PAGE);
> 			kvm_release_pfn_dirty(pfn);
> 		}
> 
> 		if (num_tags != MTE_GRANULES_PER_PAGE) {
> 			ret = -EFAULT;
> 			goto out;
> 		}
> 
> 		if (write)
> 			test_and_set_bit(PG_mte_tagged, &page->flags);

I think a set_bit() would do, I doubt it's any more efficient. But why
not add it in the 'else' block above where we actually wrote the tags?
The copy function may have failed part-way through. Maybe your logic is
correct though, there are invalid tags in the page. Just add a comment.
Steven Price May 21, 2021, 9:42 a.m. UTC | #6
On 20/05/2021 18:27, Catalin Marinas wrote:
> On Thu, May 20, 2021 at 04:58:01PM +0100, Steven Price wrote:
>> On 20/05/2021 13:05, Catalin Marinas wrote:
>>> On Mon, May 17, 2021 at 01:32:38PM +0100, Steven Price wrote:
>>>> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
>>>> index e89a5e275e25..4b6c83beb75d 100644
>>>> --- a/arch/arm64/kvm/arm.c
>>>> +++ b/arch/arm64/kvm/arm.c
>>>> @@ -1309,6 +1309,65 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
>>>>  	}
>>>>  }
>>>>  
>>>> +static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
>>>> +				      struct kvm_arm_copy_mte_tags *copy_tags)
>>>> +{
>>>> +	gpa_t guest_ipa = copy_tags->guest_ipa;
>>>> +	size_t length = copy_tags->length;
>>>> +	void __user *tags = copy_tags->addr;
>>>> +	gpa_t gfn;
>>>> +	bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST);
>>>> +	int ret = 0;
>>>> +
>>>> +	if (copy_tags->reserved[0] || copy_tags->reserved[1])
>>>> +		return -EINVAL;
>>>> +
>>>> +	if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST)
>>>> +		return -EINVAL;
>>>> +
>>>> +	if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK)
>>>> +		return -EINVAL;
>>>> +
>>>> +	gfn = gpa_to_gfn(guest_ipa);
>>>> +
>>>> +	mutex_lock(&kvm->slots_lock);
>>>> +
>>>> +	while (length > 0) {
>>>> +		kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
>>>> +		void *maddr;
>>>> +		unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE;
>>>> +
>>>> +		if (is_error_noslot_pfn(pfn)) {
>>>> +			ret = -EFAULT;
>>>> +			goto out;
>>>> +		}
>>>> +
>>>> +		maddr = page_address(pfn_to_page(pfn));
>>>> +
>>>> +		if (!write) {
>>>> +			num_tags = mte_copy_tags_to_user(tags, maddr, num_tags);
>>>> +			kvm_release_pfn_clean(pfn);
>>>
>>> Do we need to check if PG_mte_tagged is set? If the page was not faulted
>>> into the guest address space but the VMM has the page, does the
>>> gfn_to_pfn_prot() guarantee that a kvm_set_spte_gfn() was called? If
>>> not, this may read stale tags.
>>
>> Ah, I hadn't thought about that... No I don't believe gfn_to_pfn_prot()
>> will fault it into the guest.
> 
> It doesn't indeed. What it does is a get_user_pages() but it's not of
> much help since the VMM pte wouldn't be tagged (we would have solved
> lots of problems if we required PROT_MTE in the VMM...)

Sadly it solves some problems and creates others :(

>>>> +		} else {
>>>> +			num_tags = mte_copy_tags_from_user(maddr, tags,
>>>> +							   num_tags);
>>>> +			kvm_release_pfn_dirty(pfn);
>>>> +		}
>>>
>>> Same question here, if the we can't guarantee the stage 2 pte being set,
>>> we'd need to set PG_mte_tagged.
>>
>> This is arguably worse as we'll be writing tags into the guest but
>> without setting PG_mte_tagged - so they'll be lost when the guest then
>> faults the pages in. Which sounds like it should break migration.
>>
>> I think the below should be safe, and avoids the overhead of setting the
>> flag just for reads.
>>
>> Thanks,
>>
>> Steve
>>
>> ----8<----
>> 		page = pfn_to_page(pfn);
>> 		maddr = page_address(page);
>>
>> 		if (!write) {
>> 			if (test_bit(PG_mte_tagged, &page->flags))
>> 				num_tags = mte_copy_tags_to_user(tags, maddr,
>> 							MTE_GRANULES_PER_PAGE);
>> 			else
>> 				/* No tags in memory, so write zeros */
>> 				num_tags = MTE_GRANULES_PER_PAGE -
>> 					clear_user(tag, MTE_GRANULES_PER_PAGE);
>> 			kvm_release_pfn_clean(pfn);
> 
> For ptrace we return a -EOPNOTSUPP if the address doesn't have VM_MTE
> but I don't think it makes sense here, so I'm fine with clearing the
> destination and assuming that the tags are zero (as they'd be on
> faulting into the guest.

Yeah - conceptually all pages in an MTE-enabled guest have tags. It's
just we don't actually populate the physical memory until the guest
tries to touch them. So it makes sense to just return zeros here.
Alternatively we could populate the physical tags but that seems
unnecessary.

> Another thing I forgot to ask, what's guaranteeing that the page
> supports tags? Does this ioctl ensure that it would attempt the tag
> copying from some device mapping? Do we need some kvm_is_device_pfn()
> check? I guess ZONE_DEVICE memory we just refuse to map in an earlier
> patch.

Hmm, nothing much. While reads are now fine (the memory won't have
PG_mte_tagged), writes could potentially happen on ZONE_DEVICE memory.

The fix is to just replace pfn_to_page() with pfn_to_online_page() and
handle the error.
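
i.e. roughly this in the loop above (sketch only, error path as in the
existing code):

		struct page *page = pfn_to_online_page(pfn);

		if (!page) {
			/* Reject ZONE_DEVICE and other unusual memory */
			ret = -EFAULT;
			goto out;
		}

		maddr = page_address(page);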

>> 		} else {
>> 			num_tags = mte_copy_tags_from_user(maddr, tags,
>> 							MTE_GRANULES_PER_PAGE);
>> 			kvm_release_pfn_dirty(pfn);
>> 		}
>>
>> 		if (num_tags != MTE_GRANULES_PER_PAGE) {
>> 			ret = -EFAULT;
>> 			goto out;
>> 		}
>>
>> 		if (write)
>> 			test_and_set_bit(PG_mte_tagged, &page->flags);
> 
> I think a set_bit() would do, I doubt it's any more efficient. But why

I'd seen test_and_set_bit() used elsewhere (I forget where now) as a
slightly more efficient approach. It compiles down to a READ_ONCE and a
conditional atomic, vs a single non-conditional atomic. But I don't have
any actual data on the performance and this isn't a hot path, so I'll
switch to the more obvious set_bit().

> not add it in the 'else' block above where we actually wrote the tags?
> The copy function may have failed part-way through. Maybe your logic is
> correct though, there are invalid tags in the page. Just add a comment.

Yeah it's in case the write fails part way through - we don't want to
expose the tags which were not written. I'll add a comment to explain that.

Steve
Catalin Marinas May 24, 2021, 6:11 p.m. UTC | #7
On Fri, May 21, 2021 at 10:42:09AM +0100, Steven Price wrote:
> On 20/05/2021 18:27, Catalin Marinas wrote:
> > On Thu, May 20, 2021 at 04:58:01PM +0100, Steven Price wrote:
> >> On 20/05/2021 13:05, Catalin Marinas wrote:
> >>> On Mon, May 17, 2021 at 01:32:38PM +0100, Steven Price wrote:
> >>>> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> >>>> index e89a5e275e25..4b6c83beb75d 100644
> >>>> --- a/arch/arm64/kvm/arm.c
> >>>> +++ b/arch/arm64/kvm/arm.c
> >>>> @@ -1309,6 +1309,65 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
> >>>>  	}
> >>>>  }
> >>>>  
> >>>> +static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
> >>>> +				      struct kvm_arm_copy_mte_tags *copy_tags)
> >>>> +{
> >>>> +	gpa_t guest_ipa = copy_tags->guest_ipa;
> >>>> +	size_t length = copy_tags->length;
> >>>> +	void __user *tags = copy_tags->addr;
> >>>> +	gpa_t gfn;
> >>>> +	bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST);
> >>>> +	int ret = 0;
> >>>> +
> >>>> +	if (copy_tags->reserved[0] || copy_tags->reserved[1])
> >>>> +		return -EINVAL;
> >>>> +
> >>>> +	if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST)
> >>>> +		return -EINVAL;
> >>>> +
> >>>> +	if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK)
> >>>> +		return -EINVAL;
> >>>> +
> >>>> +	gfn = gpa_to_gfn(guest_ipa);
> >>>> +
> >>>> +	mutex_lock(&kvm->slots_lock);
> >>>> +
> >>>> +	while (length > 0) {
> >>>> +		kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
> >>>> +		void *maddr;
> >>>> +		unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE;
> >>>> +
> >>>> +		if (is_error_noslot_pfn(pfn)) {
> >>>> +			ret = -EFAULT;
> >>>> +			goto out;
> >>>> +		}
> >>>> +
> >>>> +		maddr = page_address(pfn_to_page(pfn));
> >>>> +
> >>>> +		if (!write) {
> >>>> +			num_tags = mte_copy_tags_to_user(tags, maddr, num_tags);
> >>>> +			kvm_release_pfn_clean(pfn);
> >>>
> >>> Do we need to check if PG_mte_tagged is set? If the page was not faulted
> >>> into the guest address space but the VMM has the page, does the
> >>> gfn_to_pfn_prot() guarantee that a kvm_set_spte_gfn() was called? If
> >>> not, this may read stale tags.
> >>
> >> Ah, I hadn't thought about that... No I don't believe gfn_to_pfn_prot()
> >> will fault it into the guest.
> > 
> > It doesn't indeed. What it does is a get_user_pages() but it's not of
> > much help since the VMM pte wouldn't be tagged (we would have solved
> > lots of problems if we required PROT_MTE in the VMM...)
> 
> Sadly it solves some problems and creates others :(

I had some (random) thoughts on how to make things simpler, maybe. I
think most of these races would have been solved if we required PROT_MTE
in the VMM but this has an impact on the VMM if it wants to use MTE
itself. If such requirement was in place, all KVM needed to do is check
PG_mte_tagged.

So what we actually need is a set_pte_at() in the VMM to clear the tags
and set PG_mte_tagged. Currently, we only do this if the memory type is
tagged (PROT_MTE) but it's not strictly necessary.

As an optimisation for normal programs, we don't want to do this all the
time but the visible behaviour wouldn't change (well, maybe for ptrace
slightly). However, it doesn't mean we couldn't for a VMM, with an
opt-in via prctl(). This would add a MMCF_MTE_TAG_INIT bit (couldn't
think of a better name) to mm_context_t.flags and set_pte_at() would
behave as if the pte was tagged without actually mapping the memory in
user space as tagged (protection flags not changed). Pages that don't
support tagging are still safe, just some unnecessary ignored tag
writes. This would need to be set before the mmap() for the guest
memory.

If we want finer-grained control we'd have to store this information in
the vma flags, in addition to VM_MTE (e.g. VM_MTE_TAG_INIT) but without
affecting the actual memory type. The easiest would be another pte bit,
though we are short on them. A more intrusive (not too bad) approach is
to introduce a set_pte_at_vma() and read the flags directly in the arch
code. In most places where set_pte_at() is called on a user mm, the vma
is also available.

Anyway, I'm not saying we go this route, just thinking out loud, get
some opinions.
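
A very rough sketch of that idea, purely illustrative (MMCF_MTE_TAG_INIT
and the exact hook point only exist in the description above, not in the
kernel):

	static bool mte_tag_init_needed(struct mm_struct *mm, pte_t pte)
	{
		/* Either the mapping itself is PROT_MTE ... */
		if (pte_tagged(pte))
			return true;
		/* ... or the VMM opted in via the hypothetical prctl() bit. */
		return mm->context.flags & MMCF_MTE_TAG_INIT;
	}

with set_pte_at() clearing the tags and setting PG_mte_tagged whenever
this returns true, without changing the memory type the VMM sees.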

> > Another thing I forgot to ask, what's guaranteeing that the page
> > supports tags? Does this ioctl ensure that it would attempt the tag
> > copying from some device mapping? Do we need some kvm_is_device_pfn()
> > check? I guess ZONE_DEVICE memory we just refuse to map in an earlier
> > patch.
> 
> Hmm, nothing much. While reads are now fine (the memory won't have
> PG_mte_tagged), writes could potentially happen on ZONE_DEVICE memory.

I don't think it's a problem for writes either as the host wouldn't map
such memory as tagged. It's just that it returns zeros and writes are
ignored, so we could instead return an error (I haven't checked your
latest series yet).

> >> 		} else {
> >> 			num_tags = mte_copy_tags_from_user(maddr, tags,
> >> 							MTE_GRANULES_PER_PAGE);
> >> 			kvm_release_pfn_dirty(pfn);
> >> 		}
> >>
> >> 		if (num_tags != MTE_GRANULES_PER_PAGE) {
> >> 			ret = -EFAULT;
> >> 			goto out;
> >> 		}
> >>
> >> 		if (write)
> >> 			test_and_set_bit(PG_mte_tagged, &page->flags);
> > 
> > I think a set_bit() would do, I doubt it's any more efficient. But why
> 
> I'd seen test_and_set_bit() used elsewhere (I forget where now) as a
> slightly more efficient approach. It complies down to a READ_ONCE and a
> conditional atomic, vs a single non-conditional atomic. But I don't have
> any actual data on the performance and this isn't a hot path, so I'll
> switch to the more obvious set_bit().

Yeah, I think I've seen this as well. Anyway, it's probably lost in the
noise of tag writing here.
Steven Price May 27, 2021, 7:50 a.m. UTC | #8
On 24/05/2021 19:11, Catalin Marinas wrote:
> On Fri, May 21, 2021 at 10:42:09AM +0100, Steven Price wrote:
>> On 20/05/2021 18:27, Catalin Marinas wrote:
>>> On Thu, May 20, 2021 at 04:58:01PM +0100, Steven Price wrote:
>>>> On 20/05/2021 13:05, Catalin Marinas wrote:
>>>>> On Mon, May 17, 2021 at 01:32:38PM +0100, Steven Price wrote:
>>>>>> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
>>>>>> index e89a5e275e25..4b6c83beb75d 100644
>>>>>> --- a/arch/arm64/kvm/arm.c
>>>>>> +++ b/arch/arm64/kvm/arm.c
>>>>>> @@ -1309,6 +1309,65 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
>>>>>>  	}
>>>>>>  }
>>>>>>  
>>>>>> +static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
>>>>>> +				      struct kvm_arm_copy_mte_tags *copy_tags)
>>>>>> +{
>>>>>> +	gpa_t guest_ipa = copy_tags->guest_ipa;
>>>>>> +	size_t length = copy_tags->length;
>>>>>> +	void __user *tags = copy_tags->addr;
>>>>>> +	gpa_t gfn;
>>>>>> +	bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST);
>>>>>> +	int ret = 0;
>>>>>> +
>>>>>> +	if (copy_tags->reserved[0] || copy_tags->reserved[1])
>>>>>> +		return -EINVAL;
>>>>>> +
>>>>>> +	if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST)
>>>>>> +		return -EINVAL;
>>>>>> +
>>>>>> +	if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK)
>>>>>> +		return -EINVAL;
>>>>>> +
>>>>>> +	gfn = gpa_to_gfn(guest_ipa);
>>>>>> +
>>>>>> +	mutex_lock(&kvm->slots_lock);
>>>>>> +
>>>>>> +	while (length > 0) {
>>>>>> +		kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
>>>>>> +		void *maddr;
>>>>>> +		unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE;
>>>>>> +
>>>>>> +		if (is_error_noslot_pfn(pfn)) {
>>>>>> +			ret = -EFAULT;
>>>>>> +			goto out;
>>>>>> +		}
>>>>>> +
>>>>>> +		maddr = page_address(pfn_to_page(pfn));
>>>>>> +
>>>>>> +		if (!write) {
>>>>>> +			num_tags = mte_copy_tags_to_user(tags, maddr, num_tags);
>>>>>> +			kvm_release_pfn_clean(pfn);
>>>>>
>>>>> Do we need to check if PG_mte_tagged is set? If the page was not faulted
>>>>> into the guest address space but the VMM has the page, does the
>>>>> gfn_to_pfn_prot() guarantee that a kvm_set_spte_gfn() was called? If
>>>>> not, this may read stale tags.
>>>>
>>>> Ah, I hadn't thought about that... No I don't believe gfn_to_pfn_prot()
>>>> will fault it into the guest.
>>>
>>> It doesn't indeed. What it does is a get_user_pages() but it's not of
>>> much help since the VMM pte wouldn't be tagged (we would have solved
>>> lots of problems if we required PROT_MTE in the VMM...)
>>
>> Sadly it solves some problems and creates others :(
> 
> I had some (random) thoughts on how to make things simpler, maybe. I
> think most of these races would have been solved if we required PROT_MTE
> in the VMM but this has an impact on the VMM if it wants to use MTE
> itself. If such requirement was in place, all KVM needed to do is check
> PG_mte_tagged.
> 
> So what we actually need is a set_pte_at() in the VMM to clear the tags
> and set PG_mte_tagged. Currently, we only do this if the memory type is
> tagged (PROT_MTE) but it's not strictly necessary.
> 
> As an optimisation for normal programs, we don't want to do this all the
> time but the visible behaviour wouldn't change (well, maybe for ptrace
> slightly). However, it doesn't mean we couldn't for a VMM, with an
> opt-in via prctl(). This would add a MMCF_MTE_TAG_INIT bit (couldn't
> think of a better name) to mm_context_t.flags and set_pte_at() would
> behave as if the pte was tagged without actually mapping the memory in
> user space as tagged (protection flags not changed). Pages that don't
> support tagging are still safe, just some unnecessary ignored tag
> writes. This would need to be set before the mmap() for the guest
> memory.
> 
> If we want finer-grained control we'd have to store this information in
> the vma flags, in addition to VM_MTE (e.g. VM_MTE_TAG_INIT) but without
> affecting the actual memory type. The easiest would be another pte bit,
> though we are short on them. A more intrusive (not too bad) approach is
> to introduce a set_pte_at_vma() and read the flags directly in the arch
> code. In most places where set_pte_at() is called on a user mm, the vma
> is also available.
> 
> Anyway, I'm not saying we go this route, just thinking out loud, get
> some opinions.

Does get_user_pages() actually end up calling set_pte_at() normally? If
not, then on the normal user_mem_abort() route, although we can easily
check VM_MTE_TAG_INIT, there's no obvious place to hook in to ensure
that the pages actually allocated have the PG_mte_tagged flag set.

I'm also not sure how well this would work with the MMU notifiers path
in KVM. With MMU notifiers (i.e. the VMM replacing a page in the
memslot) there's not even an obvious hook to enforce the VMA flag. So I
think we'd end up with something like the sanitise_mte_tags() function
to at least check that the PG_mte_tagged flag is set on the pages
(assuming that the trigger for the MMU notifier has done the
corresponding set_pte_at()). Admittedly this might close the current
race documented there.

It also feels wrong to me to tie this to a process with prctl(); it
seems much more normal to implement this as a new mprotect() flag as
this is really a memory property, not a process property. And I think
we'll find some scary corner cases if we try to associate everything
back to a process - although I can't instantly think of anything that
will actually break.

>>> Another thing I forgot to ask, what's guaranteeing that the page
>>> supports tags? Does this ioctl ensure that it would attempt the tag
>>> copying from some device mapping? Do we need some kvm_is_device_pfn()
>>> check? I guess ZONE_DEVICE memory we just refuse to map in an earlier
>>> patch.
>>
>> Hmm, nothing much. While reads are now fine (the memory won't have
>> PG_mte_tagged), writes could potentially happen on ZONE_DEVICE memory.
> 
> I don't think it's a problem for writes either as the host wouldn't map
> such memory as tagged. It's just that it returns zeros and writes are
> ignored, so we could instead return an error (I haven't checked your
> latest series yet).

The latest series uses pfn_to_online_page() to reject ZONE_DEVICE early.

>>>> 		} else {
>>>> 			num_tags = mte_copy_tags_from_user(maddr, tags,
>>>> 							MTE_GRANULES_PER_PAGE);
>>>> 			kvm_release_pfn_dirty(pfn);
>>>> 		}
>>>>
>>>> 		if (num_tags != MTE_GRANULES_PER_PAGE) {
>>>> 			ret = -EFAULT;
>>>> 			goto out;
>>>> 		}
>>>>
>>>> 		if (write)
>>>> 			test_and_set_bit(PG_mte_tagged, &page->flags);
>>>
>>> I think a set_bit() would do, I doubt it's any more efficient. But why
>>
>> I'd seen test_and_set_bit() used elsewhere (I forget where now) as a
>> slightly more efficient approach. It complies down to a READ_ONCE and a
>> conditional atomic, vs a single non-conditional atomic. But I don't have
>> any actual data on the performance and this isn't a hot path, so I'll
>> switch to the more obvious set_bit().
> 
> Yeah, I think I've seen this as well. Anyway, it's probably lost in the
> noise of tag writing here.
> 

Agreed.

Thanks,

Steve
Catalin Marinas May 27, 2021, 1:08 p.m. UTC | #9
On Thu, May 27, 2021 at 08:50:30AM +0100, Steven Price wrote:
> On 24/05/2021 19:11, Catalin Marinas wrote:
> > I had some (random) thoughts on how to make things simpler, maybe. I
> > think most of these races would have been solved if we required PROT_MTE
> > in the VMM but this has an impact on the VMM if it wants to use MTE
> > itself. If such requirement was in place, all KVM needed to do is check
> > PG_mte_tagged.
> > 
> > So what we actually need is a set_pte_at() in the VMM to clear the tags
> > and set PG_mte_tagged. Currently, we only do this if the memory type is
> > tagged (PROT_MTE) but it's not strictly necessary.
> > 
> > As an optimisation for normal programs, we don't want to do this all the
> > time but the visible behaviour wouldn't change (well, maybe for ptrace
> > slightly). However, it doesn't mean we couldn't for a VMM, with an
> > opt-in via prctl(). This would add a MMCF_MTE_TAG_INIT bit (couldn't
> > think of a better name) to mm_context_t.flags and set_pte_at() would
> > behave as if the pte was tagged without actually mapping the memory in
> > user space as tagged (protection flags not changed). Pages that don't
> > support tagging are still safe, just some unnecessary ignored tag
> > writes. This would need to be set before the mmap() for the guest
> > memory.
> > 
> > If we want finer-grained control we'd have to store this information in
> > the vma flags, in addition to VM_MTE (e.g. VM_MTE_TAG_INIT) but without
> > affecting the actual memory type. The easiest would be another pte bit,
> > though we are short on them. A more intrusive (not too bad) approach is
> > to introduce a set_pte_at_vma() and read the flags directly in the arch
> > code. In most places where set_pte_at() is called on a user mm, the vma
> > is also available.
> > 
> > Anyway, I'm not saying we go this route, just thinking out loud, get
> > some opinions.
> 
> Does get_user_pages() actually end up calling set_pte_at() normally?

Not always, at least not in the way it's called from hva_to_pfn(). My reading of
the get_user_page_fast_only() is that it doesn't touch the pte, just
walks the page tables and pins the page. Of course, it expects a valid
pte to have been set in the VMM already, otherwise it doesn't pin any
page and the caller falls back to the slow path.

The slow path, get_user_pages_unlocked(), passes FOLL_TOUCH and
set_pte_at() will be called either in follow_pfn_pte() if it was valid
or via faultin_page() -> handle_mm_fault().

> If not then on the normal user_mem_abort() route although we can
> easily check VM_MTE_TAG_INIT there's no obvious place to hook in to
> ensure that the pages actually allocated have the PG_mte_tagged flag.

I don't think it helps if we checked such a vma flag in user_mem_abort();
we'd still have the race with set_pte_at() on the page flags. What I was
trying to avoid is touching the page flags in too many places, so
deferring this always to set_pte_at() in the VMM.

> I'm also not sure how well this would work with the MMU notifiers path
> in KVM. With MMU notifiers (i.e. the VMM replacing a page in the
> memslot) there's not even an obvious hook to enforce the VMA flag. So I
> think we'd end up with something like the sanitise_mte_tags() function
> to at least check that the PG_mte_tagged flag is set on the pages
> (assuming that the trigger for the MMU notifier has done the
> corresponding set_pte_at()). Admittedly this might close the current
> race documented there.

If we kept this check to the VMM set_pte_at(), I think we can ignore the
notifiers.

> It also feels wrong to me to tie this to a process with prctl(), it
> seems much more normal to implement this as a new mprotect() flag as
> this is really a memory property not a process property. And I think
> we'll find some scary corner cases if we try to associate everything
> back to a process - although I can't instantly think of anything that
> will actually break.

I agree, tying it to the process looks wrong; it's just less
intrusive. I don't think it would break anything, only a potential
performance regression. A process would still need to pass PROT_MTE to
be able to get tag checking. That's basically what I had in an early MTE
implementation with clear_user_page() always zeroing the tags.

I agree with you that a vma flag would be better but it's more
complicated without an additional pte bit. We could also miss some
updates as mprotect() for example checks for pte_same() before calling
set_pte_at() (it would need to check the updated vma flags).

I'll review the latest series but I'm tempted to move the logic in
sanitise_mte_tags() to mte.c and take the big lock in there if
PG_mte_tagged is not already set. If we hit performance issues, we can
optimise this later to have the page flag set already on creation (new
PROT flag, prctl etc.).

Patch

diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 24223adae150..b3edde68bc3e 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -184,6 +184,17 @@  struct kvm_vcpu_events {
 	__u32 reserved[12];
 };
 
+struct kvm_arm_copy_mte_tags {
+	__u64 guest_ipa;
+	__u64 length;
+	void __user *addr;
+	__u64 flags;
+	__u64 reserved[2];
+};
+
+#define KVM_ARM_TAGS_TO_GUEST		0
+#define KVM_ARM_TAGS_FROM_GUEST		1
+
 /* If you need to interpret the index values, here is the key: */
 #define KVM_REG_ARM_COPROC_MASK		0x000000000FFF0000
 #define KVM_REG_ARM_COPROC_SHIFT	16
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index e89a5e275e25..4b6c83beb75d 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1309,6 +1309,65 @@  static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
 	}
 }
 
+static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
+				      struct kvm_arm_copy_mte_tags *copy_tags)
+{
+	gpa_t guest_ipa = copy_tags->guest_ipa;
+	size_t length = copy_tags->length;
+	void __user *tags = copy_tags->addr;
+	gpa_t gfn;
+	bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST);
+	int ret = 0;
+
+	if (copy_tags->reserved[0] || copy_tags->reserved[1])
+		return -EINVAL;
+
+	if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST)
+		return -EINVAL;
+
+	if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK)
+		return -EINVAL;
+
+	gfn = gpa_to_gfn(guest_ipa);
+
+	mutex_lock(&kvm->slots_lock);
+
+	while (length > 0) {
+		kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
+		void *maddr;
+		unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE;
+
+		if (is_error_noslot_pfn(pfn)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		maddr = page_address(pfn_to_page(pfn));
+
+		if (!write) {
+			num_tags = mte_copy_tags_to_user(tags, maddr, num_tags);
+			kvm_release_pfn_clean(pfn);
+		} else {
+			num_tags = mte_copy_tags_from_user(maddr, tags,
+							   num_tags);
+			kvm_release_pfn_dirty(pfn);
+		}
+
+		if (num_tags != PAGE_SIZE / MTE_GRANULE_SIZE) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		gfn++;
+		tags += num_tags;
+		length -= PAGE_SIZE;
+	}
+
+out:
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
 		       unsigned int ioctl, unsigned long arg)
 {
@@ -1345,6 +1404,16 @@  long kvm_arch_vm_ioctl(struct file *filp,
 
 		return 0;
 	}
+	case KVM_ARM_MTE_COPY_TAGS: {
+		struct kvm_arm_copy_mte_tags copy_tags;
+
+		if (!kvm_has_mte(kvm))
+			return -EINVAL;
+
+		if (copy_from_user(&copy_tags, argp, sizeof(copy_tags)))
+			return -EFAULT;
+		return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags);
+	}
 	default:
 		return -EINVAL;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 8c95ba0fadda..4c011c60d468 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1428,6 +1428,7 @@  struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_PMU_EVENT_FILTER */
 #define KVM_SET_PMU_EVENT_FILTER  _IOW(KVMIO,  0xb2, struct kvm_pmu_event_filter)
 #define KVM_PPC_SVM_OFF		  _IO(KVMIO,  0xb3)
+#define KVM_ARM_MTE_COPY_TAGS	  _IOR(KVMIO,  0xb4, struct kvm_arm_copy_mte_tags)
 
 /* ioctl for vm fd */
 #define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)