[v17,2/6] KVM: arm64: Introduce MTE VM feature

Message ID 20210621111716.37157-3-steven.price@arm.com
State New, archived
Series MTE support for KVM guest

Commit Message

Steven Price June 21, 2021, 11:17 a.m. UTC
Add a new VM feature 'KVM_CAP_ARM_MTE' which enables memory tagging
for a VM. This will expose the feature to the guest and automatically
tag memory pages touched by the VM as PG_mte_tagged (and clear the tag
storage) to ensure that the guest cannot see stale tags, and that the
tags are correctly saved/restored across swap.

Actually exposing the new capability to user space happens in a later
patch.

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Steven Price <steven.price@arm.com>
---
 arch/arm64/include/asm/kvm_emulate.h |  3 ++
 arch/arm64/include/asm/kvm_host.h    |  3 ++
 arch/arm64/kvm/hyp/exception.c       |  3 +-
 arch/arm64/kvm/mmu.c                 | 64 +++++++++++++++++++++++++++-
 arch/arm64/kvm/sys_regs.c            |  7 +++
 include/uapi/linux/kvm.h             |  1 +
 6 files changed, 79 insertions(+), 2 deletions(-)
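
For context on the userspace side (the actual plumbing arrives in a
later patch of this series), here is a rough sketch of how a VMM might
probe and enable the capability. The enable sequence shown (a plain
VM-scoped KVM_ENABLE_CAP with no arguments, issued before any vCPU is
created) is an assumption for illustration, not something this patch
defines:

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	/* Hypothetical VMM helper: returns 0 if MTE was enabled for the VM. */
	static int vm_enable_mte(int kvm_fd, int vm_fd)
	{
		struct kvm_enable_cap cap = { .cap = KVM_CAP_ARM_MTE };

		if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_MTE) <= 0)
			return -1;	/* host lacks MTE support for guests */

		/* Assumption: must happen before any vCPU is created. */
		return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
	}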

Comments

Fuad Tabba June 21, 2021, 5 p.m. UTC | #1
Hi,

On Mon, Jun 21, 2021 at 12:18 PM Steven Price <steven.price@arm.com> wrote:
>
> Add a new VM feature 'KVM_CAP_ARM_MTE' which enables memory tagging
> for a VM. This will expose the feature to the guest and automatically
> tag memory pages touched by the VM as PG_mte_tagged (and clear the tag
> storage) to ensure that the guest cannot see stale tags, and that the
> tags are correctly saved/restored across swap.
>
> Actually exposing the new capability to user space happens in a later
> patch.
>
> [...]
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 7cd7d5c8c4bc..afaa5333f0e4 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -132,6 +132,8 @@ struct kvm_arch {
>
>         u8 pfr0_csv2;
>         u8 pfr0_csv3;
> +       /* Memory Tagging Extension enabled for the guest */
> +       bool mte_enabled;
>  };

nit: newline before the comment/new member

>
> [...]
>
> @@ -971,8 +1010,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>         if (writable)
>                 prot |= KVM_PGTABLE_PROT_W;
>
> -       if (fault_status != FSC_PERM && !device)
> +       if (fault_status != FSC_PERM && !device) {
> +               /* Check the VMM hasn't introduced a new VM_SHARED VMA */
> +               if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) {
> +                       ret = -EFAULT;
> +                       goto out_unlock;
> +               }
> +               ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
> +               if (ret)
> +                       goto out_unlock;
> +

nit: Would it make sense to bring the sanitise_mte_tags() call under
the kvm_has_mte() check? I know kvm_has_mte() performs the same check
internally, but since you're already checking it here, it might make
the code a bit clearer.

>                 clean_dcache_guest_page(pfn, vma_pagesize);
> +       }
>
> [...]
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index 1a7968ad078c..36f67f8deae1 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -1047,6 +1047,13 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
>                 break;
>         case SYS_ID_AA64PFR1_EL1:
>                 val &= ~FEATURE(ID_AA64PFR1_MTE);
> +               if (kvm_has_mte(vcpu->kvm)) {
> +                       u64 pfr, mte;
> +
> +                       pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);

nit: why reread the sanitised register? Wouldn't it be clearer to
rework the masking of val together with the kvm_has_mte() check?
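
Something along these lines, as an untested sketch. It assumes that
val in read_id_reg() already starts from the sanitised register value,
so the MTE field only needs to be cleared when MTE is disabled for the
guest:

	case SYS_ID_AA64PFR1_EL1:
		/* Hypothetical rework: keep the sanitised MTE field when
		 * MTE is enabled for the guest, hide it otherwise. */
		if (!kvm_has_mte(vcpu->kvm))
			val &= ~FEATURE(ID_AA64PFR1_MTE);
		break;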

Cheers,
/fuad



> +                       mte = cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR1_MTE_SHIFT);
> +                       val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE), mte);
> +               }
> [...]
Marc Zyngier June 22, 2021, 11:29 a.m. UTC | #2
On Mon, 21 Jun 2021 18:00:20 +0100,
Fuad Tabba <tabba@google.com> wrote:
> 
> Hi,
> 
> On Mon, Jun 21, 2021 at 12:18 PM Steven Price <steven.price@arm.com> wrote:
> >
> > [...]
> >
> > @@ -971,8 +1010,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >         if (writable)
> >                 prot |= KVM_PGTABLE_PROT_W;
> >
> > -       if (fault_status != FSC_PERM && !device)
> > +       if (fault_status != FSC_PERM && !device) {
> > +               /* Check the VMM hasn't introduced a new VM_SHARED VMA */
> > +               if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) {
> > +                       ret = -EFAULT;
> > +                       goto out_unlock;
> > +               }
> > +               ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
> > +               if (ret)
> > +                       goto out_unlock;
> > +
> 
> nit: Would it make sense to bring the sanitise_mte_tags() call under
> the kvm_has_mte() check? I know kvm_has_mte() performs the same check
> internally, but since you're already checking it here, it might make
> the code a bit clearer.

I think it makes more sense once merged with -next, as the CMO has
been moved into the PT code. I came up with the following resolution:

	if (vma_pagesize == PAGE_SIZE && !(force_pte || device))
		vma_pagesize = transparent_hugepage_adjust(memslot, hva,
							   &pfn, &fault_ipa);

	if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
		/* Check the VMM hasn't introduced a new VM_SHARED VMA */
		if (!(vma->vm_flags & VM_SHARED))
			ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
		else
			ret = -EFAULT;
		if (ret)
			goto out_unlock;
	}

	if (writable)
		prot |= KVM_PGTABLE_PROT_W;


However, there is a more annoying issue here, which is that the vma is
accessed outside of the mm lock. I *think* we're safe because if an
unmap happens in parallel, the MMU notifier will kick and we will be
in one of two cases:

- the unmap occurs before we take the kvm->mmu_lock, and the mmu
  notifier seq_lock is what saves us (we will drop everything and take
  the fault again),

- it occurs once we hold the lock, and this blocks the unmap.

Either way, I'd be more confident if the shared state was sampled
inside the locked section.

Thoughts?

	M.
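
One way to sample the shared state inside the locked section, as Marc
suggests, would be to capture the VMA flags while mmap_lock is still
held and use the cached value later. A sketch only, not necessarily
the resolution that eventually landed; the mte_allowed local is a name
made up for this sketch:

	/*
	 * In user_mem_abort(), while mmap_read_lock(current->mm) is
	 * still held, cache the only VMA property the MTE path needs.
	 */
	bool mte_allowed = !(vma->vm_flags & VM_SHARED);

	/* ... after mmap_read_unlock(), with kvm->mmu_lock held ... */

	if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
		if (mte_allowed)
			ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
		else
			ret = -EFAULT;
		if (ret)
			goto out_unlock;
	}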

Patch

diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 01b9857757f2..fd418955e31e 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -84,6 +84,9 @@  static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
 	if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) ||
 	    vcpu_el1_is_32bit(vcpu))
 		vcpu->arch.hcr_el2 |= HCR_TID2;
+
+	if (kvm_has_mte(vcpu->kvm))
+		vcpu->arch.hcr_el2 |= HCR_ATA;
 }
 
 static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 7cd7d5c8c4bc..afaa5333f0e4 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -132,6 +132,8 @@  struct kvm_arch {
 
 	u8 pfr0_csv2;
 	u8 pfr0_csv3;
+	/* Memory Tagging Extension enabled for the guest */
+	bool mte_enabled;
 };
 
 struct kvm_vcpu_fault_info {
@@ -769,6 +771,7 @@  bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
 #define kvm_arm_vcpu_sve_finalized(vcpu) \
 	((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED)
 
+#define kvm_has_mte(kvm) (system_supports_mte() && (kvm)->arch.mte_enabled)
 #define kvm_vcpu_has_pmu(vcpu)					\
 	(test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
 
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
index 11541b94b328..0418399e0a20 100644
--- a/arch/arm64/kvm/hyp/exception.c
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -112,7 +112,8 @@  static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
 	new |= (old & PSR_C_BIT);
 	new |= (old & PSR_V_BIT);
 
-	// TODO: TCO (if/when ARMv8.5-MemTag is exposed to guests)
+	if (kvm_has_mte(vcpu->kvm))
+		new |= PSR_TCO_BIT;
 
 	new |= (old & PSR_DIT_BIT);
 
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index c10207fed2f3..52326b739357 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -822,6 +822,45 @@  transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
 	return PAGE_SIZE;
 }
 
+/*
+ * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
+ * able to see the page's tags and therefore they must be initialised first. If
+ * PG_mte_tagged is set, tags have already been initialised.
+ *
+ * The race in the test/set of the PG_mte_tagged flag is handled by:
+ * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
+ *   racing to sanitise the same page
+ * - mmap_lock protects between a VM faulting a page in and the VMM performing
+ *   an mprotect() to add VM_MTE
+ */
+static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
+			     unsigned long size)
+{
+	unsigned long i, nr_pages = size >> PAGE_SHIFT;
+	struct page *page;
+
+	if (!kvm_has_mte(kvm))
+		return 0;
+
+	/*
+	 * pfn_to_online_page() is used to reject ZONE_DEVICE pages
+	 * that may not support tags.
+	 */
+	page = pfn_to_online_page(pfn);
+
+	if (!page)
+		return -EFAULT;
+
+	for (i = 0; i < nr_pages; i++, page++) {
+		if (!test_bit(PG_mte_tagged, &page->flags)) {
+			mte_clear_page_tags(page_address(page));
+			set_bit(PG_mte_tagged, &page->flags);
+		}
+	}
+
+	return 0;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
 			  unsigned long fault_status)
@@ -971,8 +1010,18 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (writable)
 		prot |= KVM_PGTABLE_PROT_W;
 
-	if (fault_status != FSC_PERM && !device)
+	if (fault_status != FSC_PERM && !device) {
+		/* Check the VMM hasn't introduced a new VM_SHARED VMA */
+		if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) {
+			ret = -EFAULT;
+			goto out_unlock;
+		}
+		ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
+		if (ret)
+			goto out_unlock;
+
 		clean_dcache_guest_page(pfn, vma_pagesize);
+	}
 
 	if (exec_fault) {
 		prot |= KVM_PGTABLE_PROT_X;
@@ -1168,12 +1217,17 @@  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	kvm_pfn_t pfn = pte_pfn(range->pte);
+	int ret;
 
 	if (!kvm->arch.mmu.pgt)
 		return false;
 
 	WARN_ON(range->end - range->start != 1);
 
+	ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
+	if (ret)
+		return false;
+
 	/*
 	 * We've moved a page around, probably through CoW, so let's treat it
 	 * just like a translation fault and clean the cache to the PoC.
@@ -1381,6 +1435,14 @@  int kvm_arch_prepare_memory_region(struct kvm *kvm,
 		if (!vma)
 			break;
 
+		/*
+		 * VM_SHARED mappings are not allowed with MTE to avoid races
+		 * when updating the PG_mte_tagged page flag, see
+		 * sanitise_mte_tags for more details.
+		 */
+		if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED)
+			return -EINVAL;
+
 		/*
 		 * Take the intersection of this VMA with the memory region
 		 */
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 1a7968ad078c..36f67f8deae1 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1047,6 +1047,13 @@  static u64 read_id_reg(const struct kvm_vcpu *vcpu,
 		break;
 	case SYS_ID_AA64PFR1_EL1:
 		val &= ~FEATURE(ID_AA64PFR1_MTE);
+		if (kvm_has_mte(vcpu->kvm)) {
+			u64 pfr, mte;
+
+			pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
+			mte = cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR1_MTE_SHIFT);
+			val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE), mte);
+		}
 		break;
 	case SYS_ID_AA64ISAR1_EL1:
 		if (!vcpu_has_ptrauth(vcpu))
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 79d9c44d1ad7..d4da58ddcad7 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1083,6 +1083,7 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_SGX_ATTRIBUTE 196
 #define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197
 #define KVM_CAP_PTP_KVM 198
+#define KVM_CAP_ARM_MTE 199
 
 #ifdef KVM_CAP_IRQ_ROUTING