
[v13,4/8] KVM: arm64: Introduce MTE VM feature

Message ID 20210524104513.13258-5-steven.price@arm.com (mailing list archive)
State New, archived
Series MTE support for KVM guest

Commit Message

Steven Price May 24, 2021, 10:45 a.m. UTC
Add a new VM feature 'KVM_CAP_ARM_MTE' which enables memory tagging
for a VM. This will expose the feature to the guest and automatically
mark memory pages touched by the VM as PG_mte_tagged (and clear their
tag storage) to ensure that the guest cannot see stale tags, and so
that the tags are correctly saved/restored across swap.

Actually exposing the new capability to user space happens in a later
patch.
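
For orientation, a VMM will eventually need to turn the capability on
for its VM; below is a minimal sketch of what that could look like once
the later patch wires KVM_CAP_ARM_MTE into KVM_ENABLE_CAP (the helper
name and error handling are illustrative only, not part of this series):

#include <err.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch only: enable MTE for a VM before any vCPUs are created.
 * Assumes the later patch accepts the capability via KVM_ENABLE_CAP
 * on the VM file descriptor. */
static void enable_guest_mte(int vm_fd)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_ARM_MTE };

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
		err(1, "KVM_ENABLE_CAP(KVM_CAP_ARM_MTE)");
}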

Signed-off-by: Steven Price <steven.price@arm.com>
---
 arch/arm64/include/asm/kvm_emulate.h |  3 ++
 arch/arm64/include/asm/kvm_host.h    |  3 ++
 arch/arm64/kvm/hyp/exception.c       |  3 +-
 arch/arm64/kvm/mmu.c                 | 48 +++++++++++++++++++++++++++-
 arch/arm64/kvm/sys_regs.c            |  7 ++++
 include/uapi/linux/kvm.h             |  1 +
 6 files changed, 63 insertions(+), 2 deletions(-)

Comments

Catalin Marinas June 3, 2021, 4 p.m. UTC | #1
On Mon, May 24, 2021 at 11:45:09AM +0100, Steven Price wrote:
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index c5d1f3c87dbd..226035cf7d6c 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -822,6 +822,42 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
>  	return PAGE_SIZE;
>  }
>  
> +static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
> +			     unsigned long size)
> +{
> +	if (kvm_has_mte(kvm)) {

Nitpick (less indentation):

	if (!kvm_has_mte(kvm))
		return 0;

> +		/*
> +		 * The page will be mapped in stage 2 as Normal Cacheable, so
> +		 * the VM will be able to see the page's tags and therefore
> +		 * they must be initialised first. If PG_mte_tagged is set,
> +		 * tags have already been initialised.
> +		 * pfn_to_online_page() is used to reject ZONE_DEVICE pages
> +		 * that may not support tags.
> +		 */
> +		unsigned long i, nr_pages = size >> PAGE_SHIFT;
> +		struct page *page = pfn_to_online_page(pfn);
> +
> +		if (!page)
> +			return -EFAULT;
> +
> +		for (i = 0; i < nr_pages; i++, page++) {
> +			/*
> +			 * There is a potential (but very unlikely) race
> +			 * between two VMs which are sharing a physical page
> +			 * entering this at the same time. However by splitting
> +			 * the test/set the only risk is tags being overwritten
> +			 * by the mte_clear_page_tags() call.
> +			 */

And I think the real risk here is when the page is writable by at least
one of the VMs sharing the page. This excludes KSM, so it only leaves
the MAP_SHARED mappings.

> +			if (!test_bit(PG_mte_tagged, &page->flags)) {
> +				mte_clear_page_tags(page_address(page));
> +				set_bit(PG_mte_tagged, &page->flags);
> +			}
> +		}

If we want to cover this race (I'd say in a separate patch), we can call
mte_sync_page_tags(page, __pte(0), false, true) directly (hopefully I
got the arguments right). We can avoid the big lock in most cases if
kvm_arch_prepare_memory_region() sets a VM_MTE_RESET (tag clear etc.)
and __alloc_zeroed_user_highpage() clears the tags on allocation (as we
do for VM_MTE but the new flag would not affect the stage 1 VMM page
attributes).

> +	}
> +
> +	return 0;
> +}
> +
>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  			  struct kvm_memory_slot *memslot, unsigned long hva,
>  			  unsigned long fault_status)
> @@ -971,8 +1007,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	if (writable)
>  		prot |= KVM_PGTABLE_PROT_W;
>  
> -	if (fault_status != FSC_PERM && !device)
> +	if (fault_status != FSC_PERM && !device) {
> +		ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
> +		if (ret)
> +			goto out_unlock;

Maybe it was discussed in a previous version, why do we need this in
addition to kvm_set_spte_gfn()?

> +
>  		clean_dcache_guest_page(pfn, vma_pagesize);
> +	}
>  
>  	if (exec_fault) {
>  		prot |= KVM_PGTABLE_PROT_X;
> @@ -1168,12 +1209,17 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
>  {
>  	kvm_pfn_t pfn = pte_pfn(range->pte);
> +	int ret;
>  
>  	if (!kvm->arch.mmu.pgt)
>  		return 0;
>  
>  	WARN_ON(range->end - range->start != 1);
>  
> +	ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
> +	if (ret)
> +		return false;
> +
>  	/*
>  	 * We've moved a page around, probably through CoW, so let's treat it
>  	 * just like a translation fault and clean the cache to the PoC.

Otherwise the patch looks fine.
Catalin Marinas June 4, 2021, 9:01 a.m. UTC | #2
On Thu, Jun 03, 2021 at 05:00:31PM +0100, Catalin Marinas wrote:
> On Mon, May 24, 2021 at 11:45:09AM +0100, Steven Price wrote:
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index c5d1f3c87dbd..226035cf7d6c 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -822,6 +822,42 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
> >  	return PAGE_SIZE;
> >  }
> >  
> > +static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
> > +			     unsigned long size)
> > +{
> > +	if (kvm_has_mte(kvm)) {
> > +		/*
> > +		 * The page will be mapped in stage 2 as Normal Cacheable, so
> > +		 * the VM will be able to see the page's tags and therefore
> > +		 * they must be initialised first. If PG_mte_tagged is set,
> > +		 * tags have already been initialised.
> > +		 * pfn_to_online_page() is used to reject ZONE_DEVICE pages
> > +		 * that may not support tags.
> > +		 */
> > +		unsigned long i, nr_pages = size >> PAGE_SHIFT;
> > +		struct page *page = pfn_to_online_page(pfn);
> > +
> > +		if (!page)
> > +			return -EFAULT;
> > +
> > +		for (i = 0; i < nr_pages; i++, page++) {
> > +			/*
> > +			 * There is a potential (but very unlikely) race
> > +			 * between two VMs which are sharing a physical page
> > +			 * entering this at the same time. However by splitting
> > +			 * the test/set the only risk is tags being overwritten
> > +			 * by the mte_clear_page_tags() call.
> > +			 */
> 
> And I think the real risk here is when the page is writable by at least
> one of the VMs sharing the page. This excludes KSM, so it only leaves
> the MAP_SHARED mappings.
> 
> > +			if (!test_bit(PG_mte_tagged, &page->flags)) {
> > +				mte_clear_page_tags(page_address(page));
> > +				set_bit(PG_mte_tagged, &page->flags);
> > +			}
> > +		}
> 
> If we want to cover this race (I'd say in a separate patch), we can call
> mte_sync_page_tags(page, __pte(0), false, true) directly (hopefully I
> got the arguments right). We can avoid the big lock in most cases if
> kvm_arch_prepare_memory_region() sets a VM_MTE_RESET (tag clear etc.)
> and __alloc_zeroed_user_highpage() clears the tags on allocation (as we
> do for VM_MTE but the new flag would not affect the stage 1 VMM page
> attributes).

Another idea: if VM_SHARED is found for any vma within a region in
kvm_arch_prepare_memory_region(), we either prevent the enabling of MTE
for the guest or reject the memory slot if MTE was already enabled.

An alternative here would be to clear VM_MTE_ALLOWED so that any
subsequent mprotect(PROT_MTE) in the VMM would fail in
arch_validate_flags(). MTE would still be allowed in the guest but not in
the VMM for the guest memory regions. We can probably do this
irrespective of VM_SHARED. Of course, the VMM can still mmap() the
memory initially with PROT_MTE but that's not an issue IIRC, only the
concurrent mprotect().
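
To make the first idea concrete, the check in
kvm_arch_prepare_memory_region() could look roughly like the sketch
below (the helper name, error code and locking details are illustrative
only, not something proposed verbatim here):

/*
 * Sketch only: reject a memslot for an MTE-enabled guest if any VMA
 * backing it is MAP_SHARED.
 */
static int kvm_mte_check_memslot_vmas(struct kvm *kvm,
				      const struct kvm_memory_slot *slot)
{
	unsigned long hva = slot->userspace_addr;
	unsigned long end = hva + (slot->npages << PAGE_SHIFT);
	int ret = 0;

	if (!kvm_has_mte(kvm))
		return 0;

	mmap_read_lock(current->mm);
	while (hva < end) {
		struct vm_area_struct *vma;

		vma = find_vma_intersection(current->mm, hva, end);
		if (!vma)
			break;
		if (vma->vm_flags & VM_SHARED) {
			ret = -EINVAL;
			break;
		}
		hva = vma->vm_end;
	}
	mmap_read_unlock(current->mm);

	return ret;
}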
Steven Price June 4, 2021, 10:42 a.m. UTC | #3
On 03/06/2021 17:00, Catalin Marinas wrote:
> On Mon, May 24, 2021 at 11:45:09AM +0100, Steven Price wrote:
>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>> index c5d1f3c87dbd..226035cf7d6c 100644
>> --- a/arch/arm64/kvm/mmu.c
>> +++ b/arch/arm64/kvm/mmu.c
>> @@ -822,6 +822,42 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
>>  	return PAGE_SIZE;
>>  }
>>  
>> +static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
>> +			     unsigned long size)
>> +{
>> +	if (kvm_has_mte(kvm)) {
> 
> Nitpick (less indentation):
> 
> 	if (!kvm_has_mte(kvm))
> 		return 0;

Thanks, will change.

>> +		/*
>> +		 * The page will be mapped in stage 2 as Normal Cacheable, so
>> +		 * the VM will be able to see the page's tags and therefore
>> +		 * they must be initialised first. If PG_mte_tagged is set,
>> +		 * tags have already been initialised.
>> +		 * pfn_to_online_page() is used to reject ZONE_DEVICE pages
>> +		 * that may not support tags.
>> +		 */
>> +		unsigned long i, nr_pages = size >> PAGE_SHIFT;
>> +		struct page *page = pfn_to_online_page(pfn);
>> +
>> +		if (!page)
>> +			return -EFAULT;
>> +
>> +		for (i = 0; i < nr_pages; i++, page++) {
>> +			/*
>> +			 * There is a potential (but very unlikely) race
>> +			 * between two VMs which are sharing a physical page
>> +			 * entering this at the same time. However by splitting
>> +			 * the test/set the only risk is tags being overwritten
>> +			 * by the mte_clear_page_tags() call.
>> +			 */
> 
> And I think the real risk here is when the page is writable by at least
> one of the VMs sharing the page. This excludes KSM, so it only leaves
> the MAP_SHARED mappings.
> 
>> +			if (!test_bit(PG_mte_tagged, &page->flags)) {
>> +				mte_clear_page_tags(page_address(page));
>> +				set_bit(PG_mte_tagged, &page->flags);
>> +			}
>> +		}
> 
> If we want to cover this race (I'd say in a separate patch), we can call
> mte_sync_page_tags(page, __pte(0), false, true) directly (hopefully I
> got the arguments right). We can avoid the big lock in most cases if
> kvm_arch_prepare_memory_region() sets a VM_MTE_RESET (tag clear etc.)
> and __alloc_zeroed_user_highpage() clears the tags on allocation (as we
> do for VM_MTE but the new flag would not affect the stage 1 VMM page
> attributes).

To be honest I'm coming round to just exporting a
mte_prepare_page_tags() function which does the clear/set with the lock
held. I doubt it's such a performance critical path that it will cause
any noticeable issues. Then if we run into performance problems in the
future we can start experimenting with extra VM flags etc as necessary.
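
To sketch what that could look like (the function name and the lock
below are illustrative only; nothing with these names exists today):

/*
 * Illustrative sketch of the proposed helper: initialise a page's tags
 * and set PG_mte_tagged under a lock so two racing callers cannot
 * interleave the tag clear with the flag update.
 */
static DEFINE_SPINLOCK(mte_tag_init_lock);

void mte_prepare_page_tags(struct page *page)
{
	spin_lock(&mte_tag_init_lock);

	if (!test_bit(PG_mte_tagged, &page->flags)) {
		mte_clear_page_tags(page_address(page));
		set_bit(PG_mte_tagged, &page->flags);
	}

	spin_unlock(&mte_tag_init_lock);
}
EXPORT_SYMBOL_GPL(mte_prepare_page_tags);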

And from your later email:
> Another idea: if VM_SHARED is found for any vma within a region in
> kvm_arch_prepare_memory_region(), we either prevent the enabling of MTE
> for the guest or reject the memory slot if MTE was already enabled.
> 
> An alternative here would be to clear VM_MTE_ALLOWED so that any
> subsequent mprotect(PROT_MTE) in the VMM would fail in
> arch_validate_flags(). MTE would still be allowed in the guest but not in
> the VMM for the guest memory regions. We can probably do this
> irrespective of VM_SHARED. Of course, the VMM can still mmap() the
> memory initially with PROT_MTE but that's not an issue IIRC, only the
> concurrent mprotect().

This could work, but I worry that it's potentially fragile. Also the rules
for what user space can do are not obvious and may be surprising. I'd
also want to look into the likes of mremap() to see how easy it would be
to ensure that we couldn't end up with VM_SHARED (or VM_MTE_ALLOWED)
memory sneaking into a memslot.

Unless you think it's worth complicating the ABI in the hope of avoiding
the big lock overhead I think it's probably best to stick with the big
lock at least until we have more data on the overhead.

>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  			  struct kvm_memory_slot *memslot, unsigned long hva,
>>  			  unsigned long fault_status)
>> @@ -971,8 +1007,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  	if (writable)
>>  		prot |= KVM_PGTABLE_PROT_W;
>>  
>> -	if (fault_status != FSC_PERM && !device)
>> +	if (fault_status != FSC_PERM && !device) {
>> +		ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
>> +		if (ret)
>> +			goto out_unlock;
> 
> Maybe it was discussed in a previous version, why do we need this in
> addition to kvm_set_spte_gfn()?

kvm_set_spte_gfn() is only used for the MMU notifier path (e.g. if a
memslot is changed by the VMM). For the initial access we will normally
fault the page into stage 2 with user_mem_abort().

>> +
>>  		clean_dcache_guest_page(pfn, vma_pagesize);
>> +	}
>>  
>>  	if (exec_fault) {
>>  		prot |= KVM_PGTABLE_PROT_X;
>> @@ -1168,12 +1209,17 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>>  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
>>  {
>>  	kvm_pfn_t pfn = pte_pfn(range->pte);
>> +	int ret;
>>  
>>  	if (!kvm->arch.mmu.pgt)
>>  		return 0;
>>  
>>  	WARN_ON(range->end - range->start != 1);
>>  
>> +	ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
>> +	if (ret)
>> +		return false;
>> +
>>  	/*
>>  	 * We've moved a page around, probably through CoW, so let's treat it
>>  	 * just like a translation fault and clean the cache to the PoC.
> 
> Otherwise the patch looks fine.
> 

Thanks for the review.

Steve
Catalin Marinas June 4, 2021, 11:36 a.m. UTC | #4
On Fri, Jun 04, 2021 at 11:42:11AM +0100, Steven Price wrote:
> On 03/06/2021 17:00, Catalin Marinas wrote:
> > On Mon, May 24, 2021 at 11:45:09AM +0100, Steven Price wrote:
> >> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> >> index c5d1f3c87dbd..226035cf7d6c 100644
> >> --- a/arch/arm64/kvm/mmu.c
> >> +++ b/arch/arm64/kvm/mmu.c
> >> @@ -822,6 +822,42 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
> >>  	return PAGE_SIZE;
> >>  }
> >>  
> >> +static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
> >> +			     unsigned long size)
> >> +{
> >> +	if (kvm_has_mte(kvm)) {
> >> +		/*
> >> +		 * The page will be mapped in stage 2 as Normal Cacheable, so
> >> +		 * the VM will be able to see the page's tags and therefore
> >> +		 * they must be initialised first. If PG_mte_tagged is set,
> >> +		 * tags have already been initialised.
> >> +		 * pfn_to_online_page() is used to reject ZONE_DEVICE pages
> >> +		 * that may not support tags.
> >> +		 */
> >> +		unsigned long i, nr_pages = size >> PAGE_SHIFT;
> >> +		struct page *page = pfn_to_online_page(pfn);
> >> +
> >> +		if (!page)
> >> +			return -EFAULT;
> >> +
> >> +		for (i = 0; i < nr_pages; i++, page++) {
> >> +			/*
> >> +			 * There is a potential (but very unlikely) race
> >> +			 * between two VMs which are sharing a physical page
> >> +			 * entering this at the same time. However by splitting
> >> +			 * the test/set the only risk is tags being overwritten
> >> +			 * by the mte_clear_page_tags() call.
> >> +			 */
> > 
> > And I think the real risk here is when the page is writable by at least
> > one of the VMs sharing the page. This excludes KSM, so it only leaves
> > the MAP_SHARED mappings.
> > 
> >> +			if (!test_bit(PG_mte_tagged, &page->flags)) {
> >> +				mte_clear_page_tags(page_address(page));
> >> +				set_bit(PG_mte_tagged, &page->flags);
> >> +			}
> >> +		}
> > 
> > If we want to cover this race (I'd say in a separate patch), we can call
> > mte_sync_page_tags(page, __pte(0), false, true) directly (hopefully I
> > got the arguments right). We can avoid the big lock in most cases if
> > kvm_arch_prepare_memory_region() sets a VM_MTE_RESET (tag clear etc.)
> > and __alloc_zeroed_user_highpage() clears the tags on allocation (as we
> > do for VM_MTE but the new flag would not affect the stage 1 VMM page
> > attributes).
> 
> To be honest I'm coming round to just exporting a
> mte_prepare_page_tags() function which does the clear/set with the lock
> held. I doubt it's such a performance critical path that it will cause
> any noticeable issues. Then if we run into performance problems in the
> future we can start experimenting with extra VM flags etc as necessary.

It works for me.

> And from your later email:
> > Another idea: if VM_SHARED is found for any vma within a region in
> > kvm_arch_prepare_memory_region(), we either prevent the enabling of MTE
> > for the guest or reject the memory slot if MTE was already enabled.
> > 
> > An alternative here would be to clear VM_MTE_ALLOWED so that any
> > subsequent mprotect(PROT_MTE) in the VMM would fail in
> > arch_validate_flags(). MTE would still be allowed in the guest but not in
> > the VMM for the guest memory regions. We can probably do this
> > irrespective of VM_SHARED. Of course, the VMM can still mmap() the
> > memory initially with PROT_MTE but that's not an issue IIRC, only the
> > concurrent mprotect().
> 
> This could work, but I worry that it's potentially fragile. Also the rules
> for what user space can do are not obvious and may be surprising. I'd
> also want to look into the likes of mremap() to see how easy it would be
> to ensure that we couldn't end up with VM_SHARED (or VM_MTE_ALLOWED)
> memory sneaking into a memslot.
> 
> Unless you think it's worth complicating the ABI in the hope of avoiding
> the big lock overhead I think it's probably best to stick with the big
> lock at least until we have more data on the overhead.

It's up to Marc but I think for now just make it safe and once we get
our hands on hardware, we can assess the impact. For example, starting
multiple VMs simultaneously will contend on such a big lock, but we have an
option to optimise it by setting PG_mte_tagged on allocation via a new
VM_* flag.

For my last suggestion above, changing the VMM ABI afterwards is a bit
tricky, so we could state now that VM_SHARED and MTE are not allowed
(though it needs a patch to enforce it). That's assuming that mprotect()
in the VMM cannot race with the user_mem_abort() on another CPU which
makes the lock necessary anyway.

> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >> +
> >>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>  			  struct kvm_memory_slot *memslot, unsigned long hva,
> >>  			  unsigned long fault_status)
> >> @@ -971,8 +1007,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>  	if (writable)
> >>  		prot |= KVM_PGTABLE_PROT_W;
> >>  
> >> -	if (fault_status != FSC_PERM && !device)
> >> +	if (fault_status != FSC_PERM && !device) {
> >> +		ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
> >> +		if (ret)
> >> +			goto out_unlock;
> > 
> > Maybe it was discussed in a previous version, why do we need this in
> > addition to kvm_set_spte_gfn()?
> 
> kvm_set_spte_gfn() is only used for the MMU notifier path (e.g. if a
> memslot is changed by the VMM). For the initial access we will normally
> fault the page into stage 2 with user_mem_abort().

Right. Can we move the sanitise_mte_tags() call to
kvm_pgtable_stage2_map() instead, or do we not have all the
information needed?
Steven Price June 4, 2021, 12:51 p.m. UTC | #5
On 04/06/2021 12:36, Catalin Marinas wrote:
> On Fri, Jun 04, 2021 at 11:42:11AM +0100, Steven Price wrote:
>> On 03/06/2021 17:00, Catalin Marinas wrote:
>>> On Mon, May 24, 2021 at 11:45:09AM +0100, Steven Price wrote:
>>>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>>>> index c5d1f3c87dbd..226035cf7d6c 100644
>>>> --- a/arch/arm64/kvm/mmu.c
>>>> +++ b/arch/arm64/kvm/mmu.c
>>>> @@ -822,6 +822,42 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
>>>>  	return PAGE_SIZE;
>>>>  }
>>>>  
>>>> +static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
>>>> +			     unsigned long size)
>>>> +{
>>>> +	if (kvm_has_mte(kvm)) {
>>>> +		/*
>>>> +		 * The page will be mapped in stage 2 as Normal Cacheable, so
>>>> +		 * the VM will be able to see the page's tags and therefore
>>>> +		 * they must be initialised first. If PG_mte_tagged is set,
>>>> +		 * tags have already been initialised.
>>>> +		 * pfn_to_online_page() is used to reject ZONE_DEVICE pages
>>>> +		 * that may not support tags.
>>>> +		 */
>>>> +		unsigned long i, nr_pages = size >> PAGE_SHIFT;
>>>> +		struct page *page = pfn_to_online_page(pfn);
>>>> +
>>>> +		if (!page)
>>>> +			return -EFAULT;
>>>> +
>>>> +		for (i = 0; i < nr_pages; i++, page++) {
>>>> +			/*
>>>> +			 * There is a potential (but very unlikely) race
>>>> +			 * between two VMs which are sharing a physical page
>>>> +			 * entering this at the same time. However by splitting
>>>> +			 * the test/set the only risk is tags being overwritten
>>>> +			 * by the mte_clear_page_tags() call.
>>>> +			 */
>>>
>>> And I think the real risk here is when the page is writable by at least
>>> one of the VMs sharing the page. This excludes KSM, so it only leaves
>>> the MAP_SHARED mappings.
>>>
>>>> +			if (!test_bit(PG_mte_tagged, &page->flags)) {
>>>> +				mte_clear_page_tags(page_address(page));
>>>> +				set_bit(PG_mte_tagged, &page->flags);
>>>> +			}
>>>> +		}
>>>
>>> If we want to cover this race (I'd say in a separate patch), we can call
>>> mte_sync_page_tags(page, __pte(0), false, true) directly (hopefully I
>>> got the arguments right). We can avoid the big lock in most cases if
>>> kvm_arch_prepare_memory_region() sets a VM_MTE_RESET (tag clear etc.)
>>> and __alloc_zeroed_user_highpage() clears the tags on allocation (as we
>>> do for VM_MTE but the new flag would not affect the stage 1 VMM page
>>> attributes).
>>
>> To be honest I'm coming round to just exporting a
>> mte_prepare_page_tags() function which does the clear/set with the lock
>> held. I doubt it's such a performance critical path that it will cause
>> any noticeable issues. Then if we run into performance problems in the
>> future we can start experimenting with extra VM flags etc as necessary.
> 
> It works for me.
> 
>> And from your later email:
>>> Another idea: if VM_SHARED is found for any vma within a region in
>>> kvm_arch_prepare_memory_region(), we either prevent the enabling of MTE
>>> for the guest or reject the memory slot if MTE was already enabled.
>>>
>>> An alternative here would be to clear VM_MTE_ALLOWED so that any
>>> subsequent mprotect(PROT_MTE) in the VMM would fail in
>>> arch_validate_flags(). MTE would still be allowed in the guest but not in
>>> the VMM for the guest memory regions. We can probably do this
>>> irrespective of VM_SHARED. Of course, the VMM can still mmap() the
>>> memory initially with PROT_MTE but that's not an issue IIRC, only the
>>> concurrent mprotect().
>>
>> This could work, but I worry that it's potentially fragile. Also the rules
>> for what user space can do are not obvious and may be surprising. I'd
>> also want to look into the likes of mremap() to see how easy it would be
>> to ensure that we couldn't end up with VM_SHARED (or VM_MTE_ALLOWED)
>> memory sneaking into a memslot.
>>
>> Unless you think it's worth complicating the ABI in the hope of avoiding
>> the big lock overhead I think it's probably best to stick with the big
>> lock at least until we have more data on the overhead.
> 
> It's up to Marc but I think for now just make it safe and once we get
> our hands on hardware, we can assess the impact. For example, starting
> multiple VMs simultaneously will contend on such a big lock, but we have an
> option to optimise it by setting PG_mte_tagged on allocation via a new
> VM_* flag.
> 
> For my last suggestion above, changing the VMM ABI afterwards is a bit
> tricky, so we could state now that VM_SHARED and MTE are not allowed
> (though it needs a patch to enforce it). That's assuming that mprotect()
> in the VMM cannot race with the user_mem_abort() on another CPU which
> makes the lock necessary anyway.
> 
>>>> +	}
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  			  struct kvm_memory_slot *memslot, unsigned long hva,
>>>>  			  unsigned long fault_status)
>>>> @@ -971,8 +1007,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  	if (writable)
>>>>  		prot |= KVM_PGTABLE_PROT_W;
>>>>  
>>>> -	if (fault_status != FSC_PERM && !device)
>>>> +	if (fault_status != FSC_PERM && !device) {
>>>> +		ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
>>>> +		if (ret)
>>>> +			goto out_unlock;
>>>
>>> Maybe it was discussed in a previous version, why do we need this in
>>> addition to kvm_set_spte_gfn()?
>>
>> kvm_set_spte_gfn() is only used for the MMU notifier path (e.g. if a
>> memslot is changed by the VMM). For the initial access we will normally
>> fault the page into stage 2 with user_mem_abort().
> 
> Right. Can we move the sanitise_mte_tags() call to
> kvm_pgtable_stage2_map() instead, or do we not have all the
> information needed?

I tried that before: kvm_pgtable_stage2_map() is shared with the
hypervisor so sadly we can't go poking around in the host as this breaks
on nVHE. I mentioned it in the v12 cover letter but it was in a wall of
text:

 * Move the code to sanitise tags out of user_mem_abort() into its own
   function. Also call this new function from kvm_set_spte_gfn() as that
   path was missing the sanitising.

   Originally I was going to move the code all the way down to
   kvm_pgtable_stage2_map(). Sadly, as that is also part of the EL2
   hypervisor, this breaks nVHE because the code needs to perform actions in
   the host.

The only other option I could see would be to provide a wrapper for
kvm_pgtable_stage2_map() in mmu.c which could do the sanitising as
necessary. But considering we know the call site in
kvm_phys_addr_ioremap() doesn't need handling (PROT_DEVICE is always
specified) and there are only two more, it seemed easier just to add the
two calls necessary to the new sanitise_mte_tags().

We also have a direct pointer to 'kvm' this way which is much nicer than
pointer chasing it out of the kvm_pgtable structure.
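
For completeness, the wrapper alternative could have looked roughly
like this (the name, prototype and device-memory check are illustrative
only; it is not part of this series):

/*
 * Sketch: sanitise tags on the host side, then call into the
 * page-table code that is shared with the EL2 hypervisor.
 */
static int kvm_stage2_map_with_mte(struct kvm *kvm, struct kvm_pgtable *pgt,
				   u64 ipa, u64 size, kvm_pfn_t pfn,
				   enum kvm_pgtable_prot prot, void *memcache)
{
	if (!(prot & KVM_PGTABLE_PROT_DEVICE)) {
		int ret = sanitise_mte_tags(kvm, pfn, size);

		if (ret)
			return ret;
	}

	return kvm_pgtable_stage2_map(pgt, ipa, size, __pfn_to_phys(pfn),
				      prot, memcache);
}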

Steve
Catalin Marinas June 4, 2021, 2:05 p.m. UTC | #6
On Fri, Jun 04, 2021 at 01:51:38PM +0100, Steven Price wrote:
> On 04/06/2021 12:36, Catalin Marinas wrote:
> > On Fri, Jun 04, 2021 at 11:42:11AM +0100, Steven Price wrote:
> >> On 03/06/2021 17:00, Catalin Marinas wrote:
> >>> On Mon, May 24, 2021 at 11:45:09AM +0100, Steven Price wrote:
> >>>> @@ -971,8 +1007,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>>>  	if (writable)
> >>>>  		prot |= KVM_PGTABLE_PROT_W;
> >>>>  
> >>>> -	if (fault_status != FSC_PERM && !device)
> >>>> +	if (fault_status != FSC_PERM && !device) {
> >>>> +		ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
> >>>> +		if (ret)
> >>>> +			goto out_unlock;
> >>>
> >>> Maybe it was discussed in a previous version, why do we need this in
> >>> addition to kvm_set_spte_gfn()?
> >>
> >> kvm_set_spte_gfn() is only used for the MMU notifier path (e.g. if a
> >> memslot is changed by the VMM). For the initial access we will normally
> >> fault the page into stage 2 with user_mem_abort().
> > 
> > Right. Can we move the sanitise_mte_tags() call to
> > kvm_pgtable_stage2_map() instead, or do we not have all the
> > information needed?
> 
> I tried that before: kvm_pgtable_stage2_map() is shared with the
> hypervisor so sadly we can't go poking around in the host as this breaks
> on nVHE. I mentioned it in the v12 cover letter but it was in a wall of
> text:

Ah, I missed this in the cover letter (haven't read it).

So, apart from the nitpick with the early return for less indentation,
feel free to add:

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

Patch

diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index f612c090f2e4..6bf776c2399c 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -84,6 +84,9 @@  static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
 	if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) ||
 	    vcpu_el1_is_32bit(vcpu))
 		vcpu->arch.hcr_el2 |= HCR_TID2;
+
+	if (kvm_has_mte(vcpu->kvm))
+		vcpu->arch.hcr_el2 |= HCR_ATA;
 }
 
 static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 7cd7d5c8c4bc..afaa5333f0e4 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -132,6 +132,8 @@  struct kvm_arch {
 
 	u8 pfr0_csv2;
 	u8 pfr0_csv3;
+	/* Memory Tagging Extension enabled for the guest */
+	bool mte_enabled;
 };
 
 struct kvm_vcpu_fault_info {
@@ -769,6 +771,7 @@  bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
 #define kvm_arm_vcpu_sve_finalized(vcpu) \
 	((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED)
 
+#define kvm_has_mte(kvm) (system_supports_mte() && (kvm)->arch.mte_enabled)
 #define kvm_vcpu_has_pmu(vcpu)					\
 	(test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
 
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
index 73629094f903..56426565600c 100644
--- a/arch/arm64/kvm/hyp/exception.c
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -112,7 +112,8 @@  static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
 	new |= (old & PSR_C_BIT);
 	new |= (old & PSR_V_BIT);
 
-	// TODO: TCO (if/when ARMv8.5-MemTag is exposed to guests)
+	if (kvm_has_mte(vcpu->kvm))
+		new |= PSR_TCO_BIT;
 
 	new |= (old & PSR_DIT_BIT);
 
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index c5d1f3c87dbd..226035cf7d6c 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -822,6 +822,42 @@  transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
 	return PAGE_SIZE;
 }
 
+static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
+			     unsigned long size)
+{
+	if (kvm_has_mte(kvm)) {
+		/*
+		 * The page will be mapped in stage 2 as Normal Cacheable, so
+		 * the VM will be able to see the page's tags and therefore
+		 * they must be initialised first. If PG_mte_tagged is set,
+		 * tags have already been initialised.
+		 * pfn_to_online_page() is used to reject ZONE_DEVICE pages
+		 * that may not support tags.
+		 */
+		unsigned long i, nr_pages = size >> PAGE_SHIFT;
+		struct page *page = pfn_to_online_page(pfn);
+
+		if (!page)
+			return -EFAULT;
+
+		for (i = 0; i < nr_pages; i++, page++) {
+			/*
+			 * There is a potential (but very unlikely) race
+			 * between two VMs which are sharing a physical page
+			 * entering this at the same time. However by splitting
+			 * the test/set the only risk is tags being overwritten
+			 * by the mte_clear_page_tags() call.
+			 */
+			if (!test_bit(PG_mte_tagged, &page->flags)) {
+				mte_clear_page_tags(page_address(page));
+				set_bit(PG_mte_tagged, &page->flags);
+			}
+		}
+	}
+
+	return 0;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
 			  unsigned long fault_status)
@@ -971,8 +1007,13 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (writable)
 		prot |= KVM_PGTABLE_PROT_W;
 
-	if (fault_status != FSC_PERM && !device)
+	if (fault_status != FSC_PERM && !device) {
+		ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
+		if (ret)
+			goto out_unlock;
+
 		clean_dcache_guest_page(pfn, vma_pagesize);
+	}
 
 	if (exec_fault) {
 		prot |= KVM_PGTABLE_PROT_X;
@@ -1168,12 +1209,17 @@  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	kvm_pfn_t pfn = pte_pfn(range->pte);
+	int ret;
 
 	if (!kvm->arch.mmu.pgt)
 		return 0;
 
 	WARN_ON(range->end - range->start != 1);
 
+	ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
+	if (ret)
+		return false;
+
 	/*
 	 * We've moved a page around, probably through CoW, so let's treat it
 	 * just like a translation fault and clean the cache to the PoC.
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 76ea2800c33e..4a98902eaf1a 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1047,6 +1047,13 @@  static u64 read_id_reg(const struct kvm_vcpu *vcpu,
 		break;
 	case SYS_ID_AA64PFR1_EL1:
 		val &= ~FEATURE(ID_AA64PFR1_MTE);
+		if (kvm_has_mte(vcpu->kvm)) {
+			u64 pfr, mte;
+
+			pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
+			mte = cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR1_MTE_SHIFT);
+			val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE), mte);
+		}
 		break;
 	case SYS_ID_AA64ISAR1_EL1:
 		if (!vcpu_has_ptrauth(vcpu))
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 3fd9a7e9d90c..8c95ba0fadda 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1082,6 +1082,7 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_SGX_ATTRIBUTE 196
 #define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197
 #define KVM_CAP_PTP_KVM 198
+#define KVM_CAP_ARM_MTE 199
 
 #ifdef KVM_CAP_IRQ_ROUTING