[v17,5/6] KVM: arm64: ioctl to fetch/store tags in a guest

Message ID: 20210621111716.37157-6-steven.price@arm.com (mailing list archive)
State: New, archived
Series: MTE support for KVM guest

Commit Message

Steven Price June 21, 2021, 11:17 a.m. UTC
The VMM may not wish to have its own mapping of guest memory mapped
with PROT_MTE because this causes problems if the VMM has tag checking
enabled (the guest controls the tags in physical RAM and it's unlikely
the tags are correct for the VMM).

Instead, add a new ioctl that allows the VMM to easily read/write the
tags from guest memory, so that the VMM's mapping can be non-PROT_MTE
while the VMM can still read/write the tags for the purpose of
migration.

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Steven Price <steven.price@arm.com>
---
 arch/arm64/include/asm/kvm_host.h |  3 ++
 arch/arm64/include/asm/mte-def.h  |  1 +
 arch/arm64/include/uapi/asm/kvm.h | 11 +++++
 arch/arm64/kvm/arm.c              |  7 +++
 arch/arm64/kvm/guest.c            | 82 +++++++++++++++++++++++++++++++
 include/uapi/linux/kvm.h          |  1 +
 6 files changed, 105 insertions(+)
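
For illustration, a VMM would drive the new ioctl roughly as follows (a
minimal userspace sketch, not part of the patch; it assumes 4K pages, a
vm_fd obtained from KVM_CREATE_VM, and uapi headers that include the
definitions added by this patch):

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    #define GUEST_PAGE_SIZE 4096UL  /* assuming 4K pages */

    /*
     * Fetch the MTE tags for one page of guest memory, e.g. during
     * migration. tag_buf receives one tag byte per 16-byte granule,
     * i.e. GUEST_PAGE_SIZE / 16 bytes for a full page.
     */
    static long fetch_page_tags(int vm_fd, uint64_t ipa, void *tag_buf)
    {
        struct kvm_arm_copy_mte_tags copy = {
            .guest_ipa = ipa,               /* must be page aligned */
            .length    = GUEST_PAGE_SIZE,   /* must be a page multiple */
            .addr      = tag_buf,
            .flags     = KVM_ARM_TAGS_FROM_GUEST,
        };

        /* >= 0: bytes of guest memory processed; -1/errno on failure */
        return ioctl(vm_fd, KVM_ARM_MTE_COPY_TAGS, &copy);
    }

Restoring tags on the destination side is the same call with
KVM_ARM_TAGS_TO_GUEST and the tag bytes supplied in tag_buf.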

Comments

Fuad Tabba June 22, 2021, 8:56 a.m. UTC | #1
Hi,


On Mon, Jun 21, 2021 at 12:18 PM Steven Price <steven.price@arm.com> wrote:
>
> [...]
> +out:
> +       mutex_unlock(&kvm->slots_lock);
> +       /* If some data has been copied report the number of bytes copied */
> +       if (length != copy_tags->length)
> +               return copy_tags->length - length;

I'm not sure if this is actually an issue, but a couple of comments on
the return value if there is an error after a partial copy has been
done. If mte_copy_tags_to_user or mte_copy_tags_from_user don't return
MTE_GRANULES_PER_PAGE, then the check for num_tags would fail, but
some of the tags would have been copied, which wouldn't be reflected
in length. That said, on a write the tagged bit wouldn't be set, and
on a read the return value would be conservative, but not incorrect.

Still, even though it is described that way in the documentation
(though rather deep in the description), it might be confusing to
return a non-negative value on an error. The other KVM ioctl I could
find that does something similar, KVM_S390_GET_IRQ_STATE, seems to
always return a -ERROR on error, rather than the number of bytes
copied.
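
To make that concrete, here is roughly what a caller would see (just a
sketch, not from the patch; classify() is a made-up helper):

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Classify the result of one KVM_ARM_MTE_COPY_TAGS call. */
    static int classify(int vm_fd, struct kvm_arm_copy_mte_tags *copy)
    {
        long n = ioctl(vm_fd, KVM_ARM_MTE_COPY_TAGS, copy);

        if (n < 0)
            return -1;  /* hard error up front: nothing was copied */
        if ((__u64)n < copy->length)
            return 0;   /* partial: tags for the first n bytes of guest
                           memory are valid, but the reason for stopping
                           early is not visible here */
        return 1;       /* everything requested was copied */
    }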

Cheers,
/fuad

Marc Zyngier June 22, 2021, 10:25 a.m. UTC | #2
Hi Fuad,

On Tue, 22 Jun 2021 09:56:22 +0100,
Fuad Tabba <tabba@google.com> wrote:
> 
> Hi,
> 
> 
> On Mon, Jun 21, 2021 at 12:18 PM Steven Price <steven.price@arm.com> wrote:
> > [...]
> > +out:
> > +       mutex_unlock(&kvm->slots_lock);
> > +       /* If some data has been copied report the number of bytes copied */
> > +       if (length != copy_tags->length)
> > +               return copy_tags->length - length;
> 
> I'm not sure if this is actually an issue, but a couple of comments on
> the return value if there is an error after a partial copy has been
> done. If mte_copy_tags_to_user or mte_copy_tags_from_user don't return
> MTE_GRANULES_PER_PAGE, then the check for num_tags would fail, but
> some of the tags would have been copied, which wouldn't be reflected
> in length. That said, on a write the tagged bit wouldn't be set, and
> on a read the return value would be conservative, but not incorrect.
>
> Still, even though it is described that way in the documentation
> (though rather deep in the description), it might be confusing to
> return a non-negative value on an error. The other KVM ioctl I could
> find that does something similar, KVM_S390_GET_IRQ_STATE, seems to
> always return a -ERROR on error, rather than the number of bytes
> copied.

My mental analogy for this ioctl is the read()/write() syscalls, which
return the number of bytes that have been transferred in either
direction.

I agree that there are some corner cases (a tag copy that fails
because of a faulty page adjacent to a valid page will still report
some degree of success), but it is also important to report what has
actually been done in either direction.

Thanks,

	M.
Fuad Tabba June 22, 2021, 10:56 a.m. UTC | #3
Hi Marc,

On Tue, Jun 22, 2021 at 11:25 AM Marc Zyngier <maz@kernel.org> wrote:
>
> Hi Fuad,
>
> On Tue, 22 Jun 2021 09:56:22 +0100,
> Fuad Tabba <tabba@google.com> wrote:
> >
> > Hi,
> >
> >
> > On Mon, Jun 21, 2021 at 12:18 PM Steven Price <steven.price@arm.com> wrote:
> > > [...]
> > > +out:
> > > +       mutex_unlock(&kvm->slots_lock);
> > > +       /* If some data has been copied report the number of bytes copied */
> > > +       if (length != copy_tags->length)
> > > +               return copy_tags->length - length;
> >
> > I'm not sure if this is actually an issue, but a couple of comments on
> > the return value if there is an error after a partial copy has been
> > done. If mte_copy_tags_to_user or mte_copy_tags_from_user don't return
> > MTE_GRANULES_PER_PAGE, then the check for num_tags would fail, but
> > some of the tags would have been copied, which wouldn't be reflected
> > in length. That said, on a write the tagged bit wouldn't be set, and
> > on a read the return value would be conservative, but not incorrect.
> >
> > Still, even though it is described that way in the documentation
> > (though rather deep in the description), it might be confusing to
> > return a non-negative value on an error. The other KVM ioctl I could
> > find that does something similar, KVM_S390_GET_IRQ_STATE, seems to
> > always return a -ERROR on error, rather than the number of bytes
> > copied.
>
> My mental analogy for this ioctl is the read()/write() syscalls, which
> return the number of bytes that have been transferred in either
> direction.
>
> I agree that there are some corner cases (a tag copy that fails
> because of a faulty page adjacent to a valid page will still report
> some degree of success), but it is also important to report what has
> actually been done in either direction.

I believe read()/write() return an error (-1), not the amount copied,
if there is an actual error:

https://man7.org/linux/man-pages/man2/read.2.html

> It is not an error if this number is smaller than the number of
> bytes requested; this may happen for example because fewer bytes
> are actually available right now (maybe because we were close to
> end-of-file, or because we are reading from a pipe, or from a
> terminal), or because read() was interrupted by a signal.
>
> On error, -1 is returned, and errno is set to indicate the error.
> In this case, it is left unspecified whether the file position
> (if any) changes.

I think that, given the current return value, it would be good for
the documentation in patch 6/6 to be more explicit. There it says:

> :Returns: number of bytes copied, < 0 on error (-EINVAL for incorrect
>           arguments, -EFAULT if memory cannot be accessed).

Later on it does state that if an error happens after some copying has
been done, it returns the number copied. But that's at the end of the
section. I think it would be less confusing to have it in the summary
(with the "Returns").

Thanks,
/fuad
Steven Price June 23, 2021, 2:07 p.m. UTC | #4
On 22/06/2021 11:56, Fuad Tabba wrote:
> Hi Marc,
> 
> On Tue, Jun 22, 2021 at 11:25 AM Marc Zyngier <maz@kernel.org> wrote:
>>
>> Hi Fuad,
>>
>> On Tue, 22 Jun 2021 09:56:22 +0100,
>> Fuad Tabba <tabba@google.com> wrote:
>>>
>>> Hi,
>>>
>>>
>>> On Mon, Jun 21, 2021 at 12:18 PM Steven Price <steven.price@arm.com> wrote:
>>>> [...]
>>>> +out:
>>>> +       mutex_unlock(&kvm->slots_lock);
>>>> +       /* If some data has been copied report the number of bytes copied */
>>>> +       if (length != copy_tags->length)
>>>> +               return copy_tags->length - length;
>>>
>>> I'm not sure if this is actually an issue, but a couple of comments on
>>> the return value if there is an error after a partial copy has been
>>> done. If mte_copy_tags_to_user or mte_copy_tags_from_user don't return
>>> MTE_GRANULES_PER_PAGE, then the check for num_tags would fail, but
>>> some of the tags would have been copied, which wouldn't be reflected
>>> in length. That said, on a write the tagged bit wouldn't be set, and
>>> on a read the return value would be conservative, but not incorrect.
>>>
>>> Still, even though it is described that way in the documentation
>>> (though rather deep in the description), it might be confusing to
>>> return a non-negative value on an error. The other KVM ioctl I could
>>> find that does something similar, KVM_S390_GET_IRQ_STATE, seems to
>>> always return a -ERROR on error, rather than the number of bytes
>>> copied.
>>
>> My mental analogy for this ioctl is the read()/write() syscalls, which
>> return the number of bytes that have been transferred in either
>> direction.
>>
>> I agree that there are some corner cases (a tag copy that fails
>> because of a faulty page adjacent to a valid page will still report
>> some degree of success), but it is also important to report what has
>> actually been done in either direction.
> 
> I believe read()/write() return an error (-1), not the amount copied,
> if there is an actual error:
> 
> https://man7.org/linux/man-pages/man2/read.2.html
> 
>> It is not an error if this number is smaller than the number of
>> bytes requested; this may happen for example because fewer bytes
>> are actually available right now (maybe because we were close to
>> end-of-file, or because we are reading from a pipe, or from a
>> terminal), or because read() was interrupted by a signal.
>>
>> On error, -1 is returned, and errno is set to indicate the error.
>> In this case, it is left unspecified whether the file position
>> (if any) changes.
> 
> I think that, given the current return value, it would be good for
> the documentation in patch 6/6 to be more explicit. There it says:

read() isn't quite as interesting because (usually) there's no effect
if you abort a read part way through (you can easily roll back the
file position and return an error), although I believe it does have
the same behaviour as write() in cases where rollback doesn't happen.
write()[1] has the following text:

> Note that a successful write() may transfer fewer than count
> bytes.  Such partial writes can occur for various reasons; for
> example, because there was insufficient space on the disk device
> to write all of the requested bytes, or because a blocked write()
> to a socket, pipe, or similar was interrupted by a signal handler
> after it had transferred some, but before it had transferred all
> of the requested bytes.  In the event of a partial write, the
> caller can make another write() call to transfer the remaining
> bytes.  The subsequent call will either transfer further bytes or
> may result in an error (e.g., if the disk is now full).

which matches the behaviour here: if there's an error (e.g. disk full)
then the partial success is reported (and the error effectively
ignored). The caller can then make another call to attempt the
remaining part, at which point the error will actually be reported.

[1] https://man7.org/linux/man-pages/man2/write.2.html
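
So a caller that wants to see the underlying error can loop in the same
way as a classic write() loop, roughly like this sketch (not from the
patch; copy_all_tags() is a made-up helper built on the uapi
definitions added here, and -1/errno carries the eventual error):

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Retry until the whole range is done: 0 on success, -1 on error. */
    static int copy_all_tags(int vm_fd, struct kvm_arm_copy_mte_tags *copy)
    {
        while (copy->length > 0) {
            long n = ioctl(vm_fd, KVM_ARM_MTE_COPY_TAGS, copy);

            if (n <= 0)
                return -1;  /* the previously swallowed error surfaces here */

            /*
             * n is in bytes of guest memory; the tag buffer holds one
             * byte per 16-byte granule, so it advances by n / 16.
             */
            copy->guest_ipa += n;
            copy->addr       = (char *)copy->addr + n / 16;
            copy->length    -= n;
        }
        return 0;
    }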

>> :Returns: number of bytes copied, < 0 on error (-EINVAL for incorrect
>>           arguments, -EFAULT if memory cannot be accessed).
> 
> Later on it does state that if an error happens after some copying has
> been done, it returns the number copied. But that's at the end of the
> section. I think it would be less confusing to have it in the summary
> (with the "Returns").

I tried to keep the Returns section reasonably small. It does state
"number of bytes copied", which I think gives the reader a hint that
the return value could differ from the number of bytes requested.

Thanks,

Steve
Marc Zyngier June 24, 2021, 1:35 p.m. UTC | #5
Hi Steven,

On Mon, 21 Jun 2021 12:17:15 +0100,
Steven Price <steven.price@arm.com> wrote:
> 
> [...]
> +		} else {
> +			num_tags = mte_copy_tags_from_user(maddr, tags,
> +							MTE_GRANULES_PER_PAGE);
> +			kvm_release_pfn_dirty(pfn);
> +		}
> +
> +		if (num_tags != MTE_GRANULES_PER_PAGE) {
> +			ret = -EFAULT;
> +			goto out;
> +		}
> +
> +		/* Set the flag after checking the write completed fully */
> +		if (write)
> +			set_bit(PG_mte_tagged, &page->flags);

This ended up catching my eye as I was merging some other patches.

This set_bit() occurs *after* the page has been released, meaning it
could have been evicted and reused in the interval. I plan to fix it
as below. Please let me know if that works for you.

Thanks,

	M.

From a78d3206378a7101659fbc2a4bf01cb9376c4793 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 24 Jun 2021 14:21:05 +0100
Subject: [PATCH] KVM: arm64: Set the MTE tag bit before releasing the page

Setting a page flag without holding a reference to the page
is living dangerously. In the tag-writing path, we drop the
reference to the page by calling kvm_release_pfn_dirty(),
and only then set the PG_mte_tagged bit.

It would be safer to do it the other way round.

Fixes: f0376edb1ddca ("KVM: arm64: Add ioctl to fetch/store tags in a guest")
Cc: Steven Price <steven.price@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/guest.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 4ddb20017b2f..60815ae477cf 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -1053,6 +1053,14 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
 		} else {
 			num_tags = mte_copy_tags_from_user(maddr, tags,
 							MTE_GRANULES_PER_PAGE);
+
+			/*
+			 * Set the flag after checking the write
+			 * completed fully
+			 */
+			if (num_tags == MTE_GRANULES_PER_PAGE)
+				set_bit(PG_mte_tagged, &page->flags);
+
 			kvm_release_pfn_dirty(pfn);
 		}
 
@@ -1061,10 +1069,6 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
 			goto out;
 		}
 
-		/* Set the flag after checking the write completed fully */
-		if (write)
-			set_bit(PG_mte_tagged, &page->flags);
-
 		gfn++;
 		tags += num_tags;
 		length -= PAGE_SIZE;
Steven Price June 24, 2021, 1:42 p.m. UTC | #6
On 24/06/2021 14:35, Marc Zyngier wrote:
> Hi Steven,
> 
> On Mon, 21 Jun 2021 12:17:15 +0100,
> Steven Price <steven.price@arm.com> wrote:
>>
>> [...]
>> +		} else {
>> +			num_tags = mte_copy_tags_from_user(maddr, tags,
>> +							MTE_GRANULES_PER_PAGE);
>> +			kvm_release_pfn_dirty(pfn);
>> +		}
>> +
>> +		if (num_tags != MTE_GRANULES_PER_PAGE) {
>> +			ret = -EFAULT;
>> +			goto out;
>> +		}
>> +
>> +		/* Set the flag after checking the write completed fully */
>> +		if (write)
>> +			set_bit(PG_mte_tagged, &page->flags);
> 
> This ended up catching my eye as I was merging some other patches.
> 
> This set_bit() occurs *after* the page has been released, meaning it
> could have been evicted and reused in the interval. I plan to fix it
> as below. Please let me know if that works for you.
> 
> Thanks,
> 
> 	M.
> 
> From a78d3206378a7101659fbc2a4bf01cb9376c4793 Mon Sep 17 00:00:00 2001
> From: Marc Zyngier <maz@kernel.org>
> Date: Thu, 24 Jun 2021 14:21:05 +0100
> Subject: [PATCH] KVM: arm64: Set the MTE tag bit before releasing the page
> 
> Setting a page flag without holding a reference to the page
> is living dangerously. In the tag-writing path, we drop the
> reference to the page by calling kvm_release_pfn_dirty(),
> and only then set the PG_mte_tagged bit.
> 
> It would be safer to do it the other way round.
> 
> Fixes: f0376edb1ddca ("KVM: arm64: Add ioctl to fetch/store tags in a guest")
> Cc: Steven Price <steven.price@arm.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Signed-off-by: Marc Zyngier <maz@kernel.org>

Well spotted - I'd originally had the set_bit() there, but moved it
down when I realised it should only be set if
mte_copy_tags_from_user() completed successfully. Obviously I hadn't
noticed that the page reference had gone by that point.
Thanks for fixing it. FWIW:

Reviewed-by: Steven Price <steven.price@arm.com>


Patch

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 309e36cc1b42..6a2ac4636d42 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -729,6 +729,9 @@  int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
 int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
 			       struct kvm_device_attr *attr);
 
+long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
+				struct kvm_arm_copy_mte_tags *copy_tags);
+
 /* Guest/host FPSIMD coordination helpers */
 int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/mte-def.h b/arch/arm64/include/asm/mte-def.h
index cf241b0f0a42..626d359b396e 100644
--- a/arch/arm64/include/asm/mte-def.h
+++ b/arch/arm64/include/asm/mte-def.h
@@ -7,6 +7,7 @@ 
 
 #define MTE_GRANULE_SIZE	UL(16)
 #define MTE_GRANULE_MASK	(~(MTE_GRANULE_SIZE - 1))
+#define MTE_GRANULES_PER_PAGE	(PAGE_SIZE / MTE_GRANULE_SIZE)
 #define MTE_TAG_SHIFT		56
 #define MTE_TAG_SIZE		4
 #define MTE_TAG_MASK		GENMASK((MTE_TAG_SHIFT + (MTE_TAG_SIZE - 1)), MTE_TAG_SHIFT)
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 24223adae150..b3edde68bc3e 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -184,6 +184,17 @@  struct kvm_vcpu_events {
 	__u32 reserved[12];
 };
 
+struct kvm_arm_copy_mte_tags {
+	__u64 guest_ipa;
+	__u64 length;
+	void __user *addr;
+	__u64 flags;
+	__u64 reserved[2];
+};
+
+#define KVM_ARM_TAGS_TO_GUEST		0
+#define KVM_ARM_TAGS_FROM_GUEST		1
+
 /* If you need to interpret the index values, here is the key: */
 #define KVM_REG_ARM_COPROC_MASK		0x000000000FFF0000
 #define KVM_REG_ARM_COPROC_SHIFT	16
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 28ce26a68f09..511f3716fe33 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1359,6 +1359,13 @@  long kvm_arch_vm_ioctl(struct file *filp,
 
 		return 0;
 	}
+	case KVM_ARM_MTE_COPY_TAGS: {
+		struct kvm_arm_copy_mte_tags copy_tags;
+
+		if (copy_from_user(&copy_tags, argp, sizeof(copy_tags)))
+			return -EFAULT;
+		return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags);
+	}
 	default:
 		return -EINVAL;
 	}
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 5cb4a1cd5603..4ddb20017b2f 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -995,3 +995,85 @@  int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
 
 	return ret;
 }
+
+long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
+				struct kvm_arm_copy_mte_tags *copy_tags)
+{
+	gpa_t guest_ipa = copy_tags->guest_ipa;
+	size_t length = copy_tags->length;
+	void __user *tags = copy_tags->addr;
+	gpa_t gfn;
+	bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST);
+	int ret = 0;
+
+	if (!kvm_has_mte(kvm))
+		return -EINVAL;
+
+	if (copy_tags->reserved[0] || copy_tags->reserved[1])
+		return -EINVAL;
+
+	if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST)
+		return -EINVAL;
+
+	if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK)
+		return -EINVAL;
+
+	gfn = gpa_to_gfn(guest_ipa);
+
+	mutex_lock(&kvm->slots_lock);
+
+	while (length > 0) {
+		kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
+		void *maddr;
+		unsigned long num_tags;
+		struct page *page;
+
+		if (is_error_noslot_pfn(pfn)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		page = pfn_to_online_page(pfn);
+		if (!page) {
+			/* Reject ZONE_DEVICE memory */
+			ret = -EFAULT;
+			goto out;
+		}
+		maddr = page_address(page);
+
+		if (!write) {
+			if (test_bit(PG_mte_tagged, &page->flags))
+				num_tags = mte_copy_tags_to_user(tags, maddr,
+							MTE_GRANULES_PER_PAGE);
+			else
+				/* No tags in memory, so write zeros */
+				num_tags = MTE_GRANULES_PER_PAGE -
+					clear_user(tags, MTE_GRANULES_PER_PAGE);
+			kvm_release_pfn_clean(pfn);
+		} else {
+			num_tags = mte_copy_tags_from_user(maddr, tags,
+							MTE_GRANULES_PER_PAGE);
+			kvm_release_pfn_dirty(pfn);
+		}
+
+		if (num_tags != MTE_GRANULES_PER_PAGE) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		/* Set the flag after checking the write completed fully */
+		if (write)
+			set_bit(PG_mte_tagged, &page->flags);
+
+		gfn++;
+		tags += num_tags;
+		length -= PAGE_SIZE;
+	}
+
+out:
+	mutex_unlock(&kvm->slots_lock);
+	/* If some data has been copied report the number of bytes copied */
+	if (length != copy_tags->length)
+		return copy_tags->length - length;
+	return ret;
+}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d4da58ddcad7..da1edd2b4046 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1429,6 +1429,7 @@  struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_PMU_EVENT_FILTER */
 #define KVM_SET_PMU_EVENT_FILTER  _IOW(KVMIO,  0xb2, struct kvm_pmu_event_filter)
 #define KVM_PPC_SVM_OFF		  _IO(KVMIO,  0xb3)
+#define KVM_ARM_MTE_COPY_TAGS	  _IOR(KVMIO,  0xb4, struct kvm_arm_copy_mte_tags)
 
 /* ioctl for vm fd */
 #define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)