Message ID | 20210517123239.8025-8-steven.price@arm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | MTE support for KVM guest | expand |
On Mon, 17 May 2021 13:32:38 +0100, Steven Price <steven.price@arm.com> wrote: > > The VMM may not wish to have it's own mapping of guest memory mapped > with PROT_MTE because this causes problems if the VMM has tag checking > enabled (the guest controls the tags in physical RAM and it's unlikely > the tags are correct for the VMM). > > Instead add a new ioctl which allows the VMM to easily read/write the > tags from guest memory, allowing the VMM's mapping to be non-PROT_MTE > while the VMM can still read/write the tags for the purpose of > migration. > > Signed-off-by: Steven Price <steven.price@arm.com> > --- > arch/arm64/include/uapi/asm/kvm.h | 11 +++++ > arch/arm64/kvm/arm.c | 69 +++++++++++++++++++++++++++++++ > include/uapi/linux/kvm.h | 1 + > 3 files changed, 81 insertions(+) > > diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h > index 24223adae150..b3edde68bc3e 100644 > --- a/arch/arm64/include/uapi/asm/kvm.h > +++ b/arch/arm64/include/uapi/asm/kvm.h > @@ -184,6 +184,17 @@ struct kvm_vcpu_events { > __u32 reserved[12]; > }; > > +struct kvm_arm_copy_mte_tags { > + __u64 guest_ipa; > + __u64 length; > + void __user *addr; > + __u64 flags; > + __u64 reserved[2]; > +}; > + > +#define KVM_ARM_TAGS_TO_GUEST 0 > +#define KVM_ARM_TAGS_FROM_GUEST 1 > + > /* If you need to interpret the index values, here is the key: */ > #define KVM_REG_ARM_COPROC_MASK 0x000000000FFF0000 > #define KVM_REG_ARM_COPROC_SHIFT 16 > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c > index e89a5e275e25..4b6c83beb75d 100644 > --- a/arch/arm64/kvm/arm.c > +++ b/arch/arm64/kvm/arm.c > @@ -1309,6 +1309,65 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, > } > } > > +static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, > + struct kvm_arm_copy_mte_tags *copy_tags) > +{ > + gpa_t guest_ipa = copy_tags->guest_ipa; > + size_t length = copy_tags->length; > + void __user *tags = copy_tags->addr; > + gpa_t gfn; > + bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST); > + int ret = 0; > + > + if (copy_tags->reserved[0] || copy_tags->reserved[1]) > + return -EINVAL; > + > + if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST) > + return -EINVAL; > + > + if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK) > + return -EINVAL; > + > + gfn = gpa_to_gfn(guest_ipa); > + > + mutex_lock(&kvm->slots_lock); > + > + while (length > 0) { > + kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL); > + void *maddr; > + unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE; nit: this is a compile time constant, make it a #define. This will avoid the confusing overloading of "num_tags" as both an input and an output for the mte_copy_tags-* functions. > + > + if (is_error_noslot_pfn(pfn)) { > + ret = -EFAULT; > + goto out; > + } > + > + maddr = page_address(pfn_to_page(pfn)); > + > + if (!write) { > + num_tags = mte_copy_tags_to_user(tags, maddr, num_tags); > + kvm_release_pfn_clean(pfn); > + } else { > + num_tags = mte_copy_tags_from_user(maddr, tags, > + num_tags); > + kvm_release_pfn_dirty(pfn); > + } > + > + if (num_tags != PAGE_SIZE / MTE_GRANULE_SIZE) { > + ret = -EFAULT; > + goto out; > + } > + > + gfn++; > + tags += num_tags; > + length -= PAGE_SIZE; > + } > + > +out: > + mutex_unlock(&kvm->slots_lock); > + return ret; > +} > + nit again: I'd really prefer it if you moved this to guest.c, where we already have a bunch of the save/restore stuff. > long kvm_arch_vm_ioctl(struct file *filp, > unsigned int ioctl, unsigned long arg) > { > @@ -1345,6 +1404,16 @@ long kvm_arch_vm_ioctl(struct file *filp, > > return 0; > } > + case KVM_ARM_MTE_COPY_TAGS: { > + struct kvm_arm_copy_mte_tags copy_tags; > + > + if (!kvm_has_mte(kvm)) > + return -EINVAL; > + > + if (copy_from_user(©_tags, argp, sizeof(copy_tags))) > + return -EFAULT; > + return kvm_vm_ioctl_mte_copy_tags(kvm, ©_tags); > + } > default: > return -EINVAL; > } > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h > index 8c95ba0fadda..4c011c60d468 100644 > --- a/include/uapi/linux/kvm.h > +++ b/include/uapi/linux/kvm.h > @@ -1428,6 +1428,7 @@ struct kvm_s390_ucas_mapping { > /* Available with KVM_CAP_PMU_EVENT_FILTER */ > #define KVM_SET_PMU_EVENT_FILTER _IOW(KVMIO, 0xb2, struct kvm_pmu_event_filter) > #define KVM_PPC_SVM_OFF _IO(KVMIO, 0xb3) > +#define KVM_ARM_MTE_COPY_TAGS _IOR(KVMIO, 0xb4, struct kvm_arm_copy_mte_tags) > > /* ioctl for vm fd */ > #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) Thanks, M.
On 17/05/2021 19:04, Marc Zyngier wrote: > On Mon, 17 May 2021 13:32:38 +0100, > Steven Price <steven.price@arm.com> wrote: >> >> The VMM may not wish to have it's own mapping of guest memory mapped >> with PROT_MTE because this causes problems if the VMM has tag checking >> enabled (the guest controls the tags in physical RAM and it's unlikely >> the tags are correct for the VMM). >> >> Instead add a new ioctl which allows the VMM to easily read/write the >> tags from guest memory, allowing the VMM's mapping to be non-PROT_MTE >> while the VMM can still read/write the tags for the purpose of >> migration. >> >> Signed-off-by: Steven Price <steven.price@arm.com> >> --- >> arch/arm64/include/uapi/asm/kvm.h | 11 +++++ >> arch/arm64/kvm/arm.c | 69 +++++++++++++++++++++++++++++++ >> include/uapi/linux/kvm.h | 1 + >> 3 files changed, 81 insertions(+) >> >> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h >> index 24223adae150..b3edde68bc3e 100644 >> --- a/arch/arm64/include/uapi/asm/kvm.h >> +++ b/arch/arm64/include/uapi/asm/kvm.h >> @@ -184,6 +184,17 @@ struct kvm_vcpu_events { >> __u32 reserved[12]; >> }; >> >> +struct kvm_arm_copy_mte_tags { >> + __u64 guest_ipa; >> + __u64 length; >> + void __user *addr; >> + __u64 flags; >> + __u64 reserved[2]; >> +}; >> + >> +#define KVM_ARM_TAGS_TO_GUEST 0 >> +#define KVM_ARM_TAGS_FROM_GUEST 1 >> + >> /* If you need to interpret the index values, here is the key: */ >> #define KVM_REG_ARM_COPROC_MASK 0x000000000FFF0000 >> #define KVM_REG_ARM_COPROC_SHIFT 16 >> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c >> index e89a5e275e25..4b6c83beb75d 100644 >> --- a/arch/arm64/kvm/arm.c >> +++ b/arch/arm64/kvm/arm.c >> @@ -1309,6 +1309,65 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, >> } >> } >> >> +static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, >> + struct kvm_arm_copy_mte_tags *copy_tags) >> +{ >> + gpa_t guest_ipa = copy_tags->guest_ipa; >> + size_t length = copy_tags->length; >> + void __user *tags = copy_tags->addr; >> + gpa_t gfn; >> + bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST); >> + int ret = 0; >> + >> + if (copy_tags->reserved[0] || copy_tags->reserved[1]) >> + return -EINVAL; >> + >> + if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST) >> + return -EINVAL; >> + >> + if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK) >> + return -EINVAL; >> + >> + gfn = gpa_to_gfn(guest_ipa); >> + >> + mutex_lock(&kvm->slots_lock); >> + >> + while (length > 0) { >> + kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL); >> + void *maddr; >> + unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE; > > nit: this is a compile time constant, make it a #define. This will > avoid the confusing overloading of "num_tags" as both an input and an > output for the mte_copy_tags-* functions. No problem, I agree my usage of num_tags wasn't very clear. >> + >> + if (is_error_noslot_pfn(pfn)) { >> + ret = -EFAULT; >> + goto out; >> + } >> + >> + maddr = page_address(pfn_to_page(pfn)); >> + >> + if (!write) { >> + num_tags = mte_copy_tags_to_user(tags, maddr, num_tags); >> + kvm_release_pfn_clean(pfn); >> + } else { >> + num_tags = mte_copy_tags_from_user(maddr, tags, >> + num_tags); >> + kvm_release_pfn_dirty(pfn); >> + } >> + >> + if (num_tags != PAGE_SIZE / MTE_GRANULE_SIZE) { >> + ret = -EFAULT; >> + goto out; >> + } >> + >> + gfn++; >> + tags += num_tags; >> + length -= PAGE_SIZE; >> + } >> + >> +out: >> + mutex_unlock(&kvm->slots_lock); >> + return ret; >> +} >> + > > nit again: I'd really prefer it if you moved this to guest.c, where we > already have a bunch of the save/restore stuff. Sure - I'll move it across. Thanks, Steve
On Mon, May 17, 2021 at 01:32:38PM +0100, Steven Price wrote: > diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h > index 24223adae150..b3edde68bc3e 100644 > --- a/arch/arm64/include/uapi/asm/kvm.h > +++ b/arch/arm64/include/uapi/asm/kvm.h > @@ -184,6 +184,17 @@ struct kvm_vcpu_events { > __u32 reserved[12]; > }; > > +struct kvm_arm_copy_mte_tags { > + __u64 guest_ipa; > + __u64 length; > + void __user *addr; > + __u64 flags; > + __u64 reserved[2]; I forgot the past discussions, what's the reserved for? Future expansion? > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c > index e89a5e275e25..4b6c83beb75d 100644 > --- a/arch/arm64/kvm/arm.c > +++ b/arch/arm64/kvm/arm.c > @@ -1309,6 +1309,65 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, > } > } > > +static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, > + struct kvm_arm_copy_mte_tags *copy_tags) > +{ > + gpa_t guest_ipa = copy_tags->guest_ipa; > + size_t length = copy_tags->length; > + void __user *tags = copy_tags->addr; > + gpa_t gfn; > + bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST); > + int ret = 0; > + > + if (copy_tags->reserved[0] || copy_tags->reserved[1]) > + return -EINVAL; > + > + if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST) > + return -EINVAL; > + > + if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK) > + return -EINVAL; > + > + gfn = gpa_to_gfn(guest_ipa); > + > + mutex_lock(&kvm->slots_lock); > + > + while (length > 0) { > + kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL); > + void *maddr; > + unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE; > + > + if (is_error_noslot_pfn(pfn)) { > + ret = -EFAULT; > + goto out; > + } > + > + maddr = page_address(pfn_to_page(pfn)); > + > + if (!write) { > + num_tags = mte_copy_tags_to_user(tags, maddr, num_tags); > + kvm_release_pfn_clean(pfn); Do we need to check if PG_mte_tagged is set? If the page was not faulted into the guest address space but the VMM has the page, does the gfn_to_pfn_prot() guarantee that a kvm_set_spte_gfn() was called? If not, this may read stale tags. > + } else { > + num_tags = mte_copy_tags_from_user(maddr, tags, > + num_tags); > + kvm_release_pfn_dirty(pfn); > + } Same question here, if the we can't guarantee the stage 2 pte being set, we'd need to set PG_mte_tagged. > + > + if (num_tags != PAGE_SIZE / MTE_GRANULE_SIZE) { > + ret = -EFAULT; > + goto out; > + } > + > + gfn++; > + tags += num_tags; > + length -= PAGE_SIZE; > + } > + > +out: > + mutex_unlock(&kvm->slots_lock); > + return ret; > +} > +
On 20/05/2021 13:05, Catalin Marinas wrote: > On Mon, May 17, 2021 at 01:32:38PM +0100, Steven Price wrote: >> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h >> index 24223adae150..b3edde68bc3e 100644 >> --- a/arch/arm64/include/uapi/asm/kvm.h >> +++ b/arch/arm64/include/uapi/asm/kvm.h >> @@ -184,6 +184,17 @@ struct kvm_vcpu_events { >> __u32 reserved[12]; >> }; >> >> +struct kvm_arm_copy_mte_tags { >> + __u64 guest_ipa; >> + __u64 length; >> + void __user *addr; >> + __u64 flags; >> + __u64 reserved[2]; > > I forgot the past discussions, what's the reserved for? Future > expansion? Yes - for future expansion. Marc asked for them[1]: > I'd be keen on a couple of reserved __64s. Just in case... [1] https://lore.kernel.org/r/87ft14xl9e.wl-maz%40kernel.org >> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c >> index e89a5e275e25..4b6c83beb75d 100644 >> --- a/arch/arm64/kvm/arm.c >> +++ b/arch/arm64/kvm/arm.c >> @@ -1309,6 +1309,65 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, >> } >> } >> >> +static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, >> + struct kvm_arm_copy_mte_tags *copy_tags) >> +{ >> + gpa_t guest_ipa = copy_tags->guest_ipa; >> + size_t length = copy_tags->length; >> + void __user *tags = copy_tags->addr; >> + gpa_t gfn; >> + bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST); >> + int ret = 0; >> + >> + if (copy_tags->reserved[0] || copy_tags->reserved[1]) >> + return -EINVAL; >> + >> + if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST) >> + return -EINVAL; >> + >> + if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK) >> + return -EINVAL; >> + >> + gfn = gpa_to_gfn(guest_ipa); >> + >> + mutex_lock(&kvm->slots_lock); >> + >> + while (length > 0) { >> + kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL); >> + void *maddr; >> + unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE; >> + >> + if (is_error_noslot_pfn(pfn)) { >> + ret = -EFAULT; >> + goto out; >> + } >> + >> + maddr = page_address(pfn_to_page(pfn)); >> + >> + if (!write) { >> + num_tags = mte_copy_tags_to_user(tags, maddr, num_tags); >> + kvm_release_pfn_clean(pfn); > > Do we need to check if PG_mte_tagged is set? If the page was not faulted > into the guest address space but the VMM has the page, does the > gfn_to_pfn_prot() guarantee that a kvm_set_spte_gfn() was called? If > not, this may read stale tags. Ah, I hadn't thought about that... No I don't believe gfn_to_pfn_prot() will fault it into the guest. >> + } else { >> + num_tags = mte_copy_tags_from_user(maddr, tags, >> + num_tags); >> + kvm_release_pfn_dirty(pfn); >> + } > > Same question here, if the we can't guarantee the stage 2 pte being set, > we'd need to set PG_mte_tagged. This is arguably worse as we'll be writing tags into the guest but without setting PG_mte_tagged - so they'll be lost when the guest then faults the pages in. Which sounds like it should break migration. I think the below should be safe, and avoids the overhead of setting the flag just for reads. Thanks, Steve ----8<---- page = pfn_to_page(pfn); maddr = page_address(page); if (!write) { if (test_bit(PG_mte_tagged, &page->flags)) num_tags = mte_copy_tags_to_user(tags, maddr, MTE_GRANULES_PER_PAGE); else /* No tags in memory, so write zeros */ num_tags = MTE_GRANULES_PER_PAGE - clear_user(tag, MTE_GRANULES_PER_PAGE); kvm_release_pfn_clean(pfn); } else { num_tags = mte_copy_tags_from_user(maddr, tags, MTE_GRANULES_PER_PAGE); kvm_release_pfn_dirty(pfn); } if (num_tags != MTE_GRANULES_PER_PAGE) { ret = -EFAULT; goto out; } if (write) test_and_set_bit(PG_mte_tagged, &page->flags);
On Thu, May 20, 2021 at 04:58:01PM +0100, Steven Price wrote: > On 20/05/2021 13:05, Catalin Marinas wrote: > > On Mon, May 17, 2021 at 01:32:38PM +0100, Steven Price wrote: > >> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c > >> index e89a5e275e25..4b6c83beb75d 100644 > >> --- a/arch/arm64/kvm/arm.c > >> +++ b/arch/arm64/kvm/arm.c > >> @@ -1309,6 +1309,65 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, > >> } > >> } > >> > >> +static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, > >> + struct kvm_arm_copy_mte_tags *copy_tags) > >> +{ > >> + gpa_t guest_ipa = copy_tags->guest_ipa; > >> + size_t length = copy_tags->length; > >> + void __user *tags = copy_tags->addr; > >> + gpa_t gfn; > >> + bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST); > >> + int ret = 0; > >> + > >> + if (copy_tags->reserved[0] || copy_tags->reserved[1]) > >> + return -EINVAL; > >> + > >> + if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST) > >> + return -EINVAL; > >> + > >> + if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK) > >> + return -EINVAL; > >> + > >> + gfn = gpa_to_gfn(guest_ipa); > >> + > >> + mutex_lock(&kvm->slots_lock); > >> + > >> + while (length > 0) { > >> + kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL); > >> + void *maddr; > >> + unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE; > >> + > >> + if (is_error_noslot_pfn(pfn)) { > >> + ret = -EFAULT; > >> + goto out; > >> + } > >> + > >> + maddr = page_address(pfn_to_page(pfn)); > >> + > >> + if (!write) { > >> + num_tags = mte_copy_tags_to_user(tags, maddr, num_tags); > >> + kvm_release_pfn_clean(pfn); > > > > Do we need to check if PG_mte_tagged is set? If the page was not faulted > > into the guest address space but the VMM has the page, does the > > gfn_to_pfn_prot() guarantee that a kvm_set_spte_gfn() was called? If > > not, this may read stale tags. > > Ah, I hadn't thought about that... No I don't believe gfn_to_pfn_prot() > will fault it into the guest. It doesn't indeed. What it does is a get_user_pages() but it's not of much help since the VMM pte wouldn't be tagged (we would have solved lots of problems if we required PROT_MTE in the VMM...) > >> + } else { > >> + num_tags = mte_copy_tags_from_user(maddr, tags, > >> + num_tags); > >> + kvm_release_pfn_dirty(pfn); > >> + } > > > > Same question here, if the we can't guarantee the stage 2 pte being set, > > we'd need to set PG_mte_tagged. > > This is arguably worse as we'll be writing tags into the guest but > without setting PG_mte_tagged - so they'll be lost when the guest then > faults the pages in. Which sounds like it should break migration. > > I think the below should be safe, and avoids the overhead of setting the > flag just for reads. > > Thanks, > > Steve > > ----8<---- > page = pfn_to_page(pfn); > maddr = page_address(page); > > if (!write) { > if (test_bit(PG_mte_tagged, &page->flags)) > num_tags = mte_copy_tags_to_user(tags, maddr, > MTE_GRANULES_PER_PAGE); > else > /* No tags in memory, so write zeros */ > num_tags = MTE_GRANULES_PER_PAGE - > clear_user(tag, MTE_GRANULES_PER_PAGE); > kvm_release_pfn_clean(pfn); For ptrace we return a -EOPNOTSUPP if the address doesn't have VM_MTE but I don't think it makes sense here, so I'm fine with clearing the destination and assuming that the tags are zero (as they'd be on faulting into the guest. Another thing I forgot to ask, what's guaranteeing that the page supports tags? Does this ioctl ensure that it would attempt the tag copying from some device mapping? Do we need some kvm_is_device_pfn() check? I guess ZONE_DEVICE memory we just refuse to map in an earlier patch. > } else { > num_tags = mte_copy_tags_from_user(maddr, tags, > MTE_GRANULES_PER_PAGE); > kvm_release_pfn_dirty(pfn); > } > > if (num_tags != MTE_GRANULES_PER_PAGE) { > ret = -EFAULT; > goto out; > } > > if (write) > test_and_set_bit(PG_mte_tagged, &page->flags); I think a set_bit() would do, I doubt it's any more efficient. But why not add it in the 'else' block above where we actually wrote the tags? The copy function may have failed part-way through. Maybe your logic is correct though, there are invalid tags in the page. Just add a comment.
On 20/05/2021 18:27, Catalin Marinas wrote: > On Thu, May 20, 2021 at 04:58:01PM +0100, Steven Price wrote: >> On 20/05/2021 13:05, Catalin Marinas wrote: >>> On Mon, May 17, 2021 at 01:32:38PM +0100, Steven Price wrote: >>>> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c >>>> index e89a5e275e25..4b6c83beb75d 100644 >>>> --- a/arch/arm64/kvm/arm.c >>>> +++ b/arch/arm64/kvm/arm.c >>>> @@ -1309,6 +1309,65 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, >>>> } >>>> } >>>> >>>> +static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, >>>> + struct kvm_arm_copy_mte_tags *copy_tags) >>>> +{ >>>> + gpa_t guest_ipa = copy_tags->guest_ipa; >>>> + size_t length = copy_tags->length; >>>> + void __user *tags = copy_tags->addr; >>>> + gpa_t gfn; >>>> + bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST); >>>> + int ret = 0; >>>> + >>>> + if (copy_tags->reserved[0] || copy_tags->reserved[1]) >>>> + return -EINVAL; >>>> + >>>> + if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST) >>>> + return -EINVAL; >>>> + >>>> + if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK) >>>> + return -EINVAL; >>>> + >>>> + gfn = gpa_to_gfn(guest_ipa); >>>> + >>>> + mutex_lock(&kvm->slots_lock); >>>> + >>>> + while (length > 0) { >>>> + kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL); >>>> + void *maddr; >>>> + unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE; >>>> + >>>> + if (is_error_noslot_pfn(pfn)) { >>>> + ret = -EFAULT; >>>> + goto out; >>>> + } >>>> + >>>> + maddr = page_address(pfn_to_page(pfn)); >>>> + >>>> + if (!write) { >>>> + num_tags = mte_copy_tags_to_user(tags, maddr, num_tags); >>>> + kvm_release_pfn_clean(pfn); >>> >>> Do we need to check if PG_mte_tagged is set? If the page was not faulted >>> into the guest address space but the VMM has the page, does the >>> gfn_to_pfn_prot() guarantee that a kvm_set_spte_gfn() was called? If >>> not, this may read stale tags. >> >> Ah, I hadn't thought about that... No I don't believe gfn_to_pfn_prot() >> will fault it into the guest. > > It doesn't indeed. What it does is a get_user_pages() but it's not of > much help since the VMM pte wouldn't be tagged (we would have solved > lots of problems if we required PROT_MTE in the VMM...) Sadly it solves some problems and creates others :( >>>> + } else { >>>> + num_tags = mte_copy_tags_from_user(maddr, tags, >>>> + num_tags); >>>> + kvm_release_pfn_dirty(pfn); >>>> + } >>> >>> Same question here, if the we can't guarantee the stage 2 pte being set, >>> we'd need to set PG_mte_tagged. >> >> This is arguably worse as we'll be writing tags into the guest but >> without setting PG_mte_tagged - so they'll be lost when the guest then >> faults the pages in. Which sounds like it should break migration. >> >> I think the below should be safe, and avoids the overhead of setting the >> flag just for reads. >> >> Thanks, >> >> Steve >> >> ----8<---- >> page = pfn_to_page(pfn); >> maddr = page_address(page); >> >> if (!write) { >> if (test_bit(PG_mte_tagged, &page->flags)) >> num_tags = mte_copy_tags_to_user(tags, maddr, >> MTE_GRANULES_PER_PAGE); >> else >> /* No tags in memory, so write zeros */ >> num_tags = MTE_GRANULES_PER_PAGE - >> clear_user(tag, MTE_GRANULES_PER_PAGE); >> kvm_release_pfn_clean(pfn); > > For ptrace we return a -EOPNOTSUPP if the address doesn't have VM_MTE > but I don't think it makes sense here, so I'm fine with clearing the > destination and assuming that the tags are zero (as they'd be on > faulting into the guest. Yeah - conceptually all pages in an MTE-enabled guest have tags. It's just we don't actually populate the physical memory until the guest tries to touch them. So it makes sense to just return zeros here. Alternatively we could populate the physical tags but that seems unnecessary. > Another thing I forgot to ask, what's guaranteeing that the page > supports tags? Does this ioctl ensure that it would attempt the tag > copying from some device mapping? Do we need some kvm_is_device_pfn() > check? I guess ZONE_DEVICE memory we just refuse to map in an earlier > patch. Hmm, nothing much. While reads are now fine (the memory won't have PG_mte_tagged), writes could potentially happen on ZONE_DEVICE memory. The fix is to just replace pfn_to_page() with pfn_to_online_page() and handle the error. >> } else { >> num_tags = mte_copy_tags_from_user(maddr, tags, >> MTE_GRANULES_PER_PAGE); >> kvm_release_pfn_dirty(pfn); >> } >> >> if (num_tags != MTE_GRANULES_PER_PAGE) { >> ret = -EFAULT; >> goto out; >> } >> >> if (write) >> test_and_set_bit(PG_mte_tagged, &page->flags); > > I think a set_bit() would do, I doubt it's any more efficient. But why I'd seen test_and_set_bit() used elsewhere (I forget where now) as a slightly more efficient approach. It complies down to a READ_ONCE and a conditional atomic, vs a single non-conditional atomic. But I don't have any actual data on the performance and this isn't a hot path, so I'll switch to the more obvious set_bit(). > not add it in the 'else' block above where we actually wrote the tags? > The copy function may have failed part-way through. Maybe your logic is > correct though, there are invalid tags in the page. Just add a comment. Yeah it's in case the write fails part way through - we don't want to expose the tags which were not written. I'll add a comment to explain that. Steve
On Fri, May 21, 2021 at 10:42:09AM +0100, Steven Price wrote: > On 20/05/2021 18:27, Catalin Marinas wrote: > > On Thu, May 20, 2021 at 04:58:01PM +0100, Steven Price wrote: > >> On 20/05/2021 13:05, Catalin Marinas wrote: > >>> On Mon, May 17, 2021 at 01:32:38PM +0100, Steven Price wrote: > >>>> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c > >>>> index e89a5e275e25..4b6c83beb75d 100644 > >>>> --- a/arch/arm64/kvm/arm.c > >>>> +++ b/arch/arm64/kvm/arm.c > >>>> @@ -1309,6 +1309,65 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, > >>>> } > >>>> } > >>>> > >>>> +static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, > >>>> + struct kvm_arm_copy_mte_tags *copy_tags) > >>>> +{ > >>>> + gpa_t guest_ipa = copy_tags->guest_ipa; > >>>> + size_t length = copy_tags->length; > >>>> + void __user *tags = copy_tags->addr; > >>>> + gpa_t gfn; > >>>> + bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST); > >>>> + int ret = 0; > >>>> + > >>>> + if (copy_tags->reserved[0] || copy_tags->reserved[1]) > >>>> + return -EINVAL; > >>>> + > >>>> + if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST) > >>>> + return -EINVAL; > >>>> + > >>>> + if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK) > >>>> + return -EINVAL; > >>>> + > >>>> + gfn = gpa_to_gfn(guest_ipa); > >>>> + > >>>> + mutex_lock(&kvm->slots_lock); > >>>> + > >>>> + while (length > 0) { > >>>> + kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL); > >>>> + void *maddr; > >>>> + unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE; > >>>> + > >>>> + if (is_error_noslot_pfn(pfn)) { > >>>> + ret = -EFAULT; > >>>> + goto out; > >>>> + } > >>>> + > >>>> + maddr = page_address(pfn_to_page(pfn)); > >>>> + > >>>> + if (!write) { > >>>> + num_tags = mte_copy_tags_to_user(tags, maddr, num_tags); > >>>> + kvm_release_pfn_clean(pfn); > >>> > >>> Do we need to check if PG_mte_tagged is set? If the page was not faulted > >>> into the guest address space but the VMM has the page, does the > >>> gfn_to_pfn_prot() guarantee that a kvm_set_spte_gfn() was called? If > >>> not, this may read stale tags. > >> > >> Ah, I hadn't thought about that... No I don't believe gfn_to_pfn_prot() > >> will fault it into the guest. > > > > It doesn't indeed. What it does is a get_user_pages() but it's not of > > much help since the VMM pte wouldn't be tagged (we would have solved > > lots of problems if we required PROT_MTE in the VMM...) > > Sadly it solves some problems and creates others :( I had some (random) thoughts on how to make things simpler, maybe. I think most of these races would have been solved if we required PROT_MTE in the VMM but this has an impact on the VMM if it wants to use MTE itself. If such requirement was in place, all KVM needed to do is check PG_mte_tagged. So what we actually need is a set_pte_at() in the VMM to clear the tags and set PG_mte_tagged. Currently, we only do this if the memory type is tagged (PROT_MTE) but it's not strictly necessary. As an optimisation for normal programs, we don't want to do this all the time but the visible behaviour wouldn't change (well, maybe for ptrace slightly). However, it doesn't mean we couldn't for a VMM, with an opt-in via prctl(). This would add a MMCF_MTE_TAG_INIT bit (couldn't think of a better name) to mm_context_t.flags and set_pte_at() would behave as if the pte was tagged without actually mapping the memory in user space as tagged (protection flags not changed). Pages that don't support tagging are still safe, just some unnecessary ignored tag writes. This would need to be set before the mmap() for the guest memory. If we want finer-grained control we'd have to store this information in the vma flags, in addition to VM_MTE (e.g. VM_MTE_TAG_INIT) but without affecting the actual memory type. The easiest would be another pte bit, though we are short on them. A more intrusive (not too bad) approach is to introduce a set_pte_at_vma() and read the flags directly in the arch code. In most places where set_pte_at() is called on a user mm, the vma is also available. Anyway, I'm not saying we go this route, just thinking out loud, get some opinions. > > Another thing I forgot to ask, what's guaranteeing that the page > > supports tags? Does this ioctl ensure that it would attempt the tag > > copying from some device mapping? Do we need some kvm_is_device_pfn() > > check? I guess ZONE_DEVICE memory we just refuse to map in an earlier > > patch. > > Hmm, nothing much. While reads are now fine (the memory won't have > PG_mte_tagged), writes could potentially happen on ZONE_DEVICE memory. I don't think it's a problem for writes either as the host wouldn't map such memory as tagged. It's just that it returns zeros and writes are ignored, so we could instead return an error (I haven't checked your latest series yet). > >> } else { > >> num_tags = mte_copy_tags_from_user(maddr, tags, > >> MTE_GRANULES_PER_PAGE); > >> kvm_release_pfn_dirty(pfn); > >> } > >> > >> if (num_tags != MTE_GRANULES_PER_PAGE) { > >> ret = -EFAULT; > >> goto out; > >> } > >> > >> if (write) > >> test_and_set_bit(PG_mte_tagged, &page->flags); > > > > I think a set_bit() would do, I doubt it's any more efficient. But why > > I'd seen test_and_set_bit() used elsewhere (I forget where now) as a > slightly more efficient approach. It complies down to a READ_ONCE and a > conditional atomic, vs a single non-conditional atomic. But I don't have > any actual data on the performance and this isn't a hot path, so I'll > switch to the more obvious set_bit(). Yeah, I think I've seen this as well. Anyway, it's probably lost in the noise of tag writing here.
On 24/05/2021 19:11, Catalin Marinas wrote: > On Fri, May 21, 2021 at 10:42:09AM +0100, Steven Price wrote: >> On 20/05/2021 18:27, Catalin Marinas wrote: >>> On Thu, May 20, 2021 at 04:58:01PM +0100, Steven Price wrote: >>>> On 20/05/2021 13:05, Catalin Marinas wrote: >>>>> On Mon, May 17, 2021 at 01:32:38PM +0100, Steven Price wrote: >>>>>> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c >>>>>> index e89a5e275e25..4b6c83beb75d 100644 >>>>>> --- a/arch/arm64/kvm/arm.c >>>>>> +++ b/arch/arm64/kvm/arm.c >>>>>> @@ -1309,6 +1309,65 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, >>>>>> } >>>>>> } >>>>>> >>>>>> +static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, >>>>>> + struct kvm_arm_copy_mte_tags *copy_tags) >>>>>> +{ >>>>>> + gpa_t guest_ipa = copy_tags->guest_ipa; >>>>>> + size_t length = copy_tags->length; >>>>>> + void __user *tags = copy_tags->addr; >>>>>> + gpa_t gfn; >>>>>> + bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST); >>>>>> + int ret = 0; >>>>>> + >>>>>> + if (copy_tags->reserved[0] || copy_tags->reserved[1]) >>>>>> + return -EINVAL; >>>>>> + >>>>>> + if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST) >>>>>> + return -EINVAL; >>>>>> + >>>>>> + if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK) >>>>>> + return -EINVAL; >>>>>> + >>>>>> + gfn = gpa_to_gfn(guest_ipa); >>>>>> + >>>>>> + mutex_lock(&kvm->slots_lock); >>>>>> + >>>>>> + while (length > 0) { >>>>>> + kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL); >>>>>> + void *maddr; >>>>>> + unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE; >>>>>> + >>>>>> + if (is_error_noslot_pfn(pfn)) { >>>>>> + ret = -EFAULT; >>>>>> + goto out; >>>>>> + } >>>>>> + >>>>>> + maddr = page_address(pfn_to_page(pfn)); >>>>>> + >>>>>> + if (!write) { >>>>>> + num_tags = mte_copy_tags_to_user(tags, maddr, num_tags); >>>>>> + kvm_release_pfn_clean(pfn); >>>>> >>>>> Do we need to check if PG_mte_tagged is set? If the page was not faulted >>>>> into the guest address space but the VMM has the page, does the >>>>> gfn_to_pfn_prot() guarantee that a kvm_set_spte_gfn() was called? If >>>>> not, this may read stale tags. >>>> >>>> Ah, I hadn't thought about that... No I don't believe gfn_to_pfn_prot() >>>> will fault it into the guest. >>> >>> It doesn't indeed. What it does is a get_user_pages() but it's not of >>> much help since the VMM pte wouldn't be tagged (we would have solved >>> lots of problems if we required PROT_MTE in the VMM...) >> >> Sadly it solves some problems and creates others :( > > I had some (random) thoughts on how to make things simpler, maybe. I > think most of these races would have been solved if we required PROT_MTE > in the VMM but this has an impact on the VMM if it wants to use MTE > itself. If such requirement was in place, all KVM needed to do is check > PG_mte_tagged. > > So what we actually need is a set_pte_at() in the VMM to clear the tags > and set PG_mte_tagged. Currently, we only do this if the memory type is > tagged (PROT_MTE) but it's not strictly necessary. > > As an optimisation for normal programs, we don't want to do this all the > time but the visible behaviour wouldn't change (well, maybe for ptrace > slightly). However, it doesn't mean we couldn't for a VMM, with an > opt-in via prctl(). This would add a MMCF_MTE_TAG_INIT bit (couldn't > think of a better name) to mm_context_t.flags and set_pte_at() would > behave as if the pte was tagged without actually mapping the memory in > user space as tagged (protection flags not changed). Pages that don't > support tagging are still safe, just some unnecessary ignored tag > writes. This would need to be set before the mmap() for the guest > memory. > > If we want finer-grained control we'd have to store this information in > the vma flags, in addition to VM_MTE (e.g. VM_MTE_TAG_INIT) but without > affecting the actual memory type. The easiest would be another pte bit, > though we are short on them. A more intrusive (not too bad) approach is > to introduce a set_pte_at_vma() and read the flags directly in the arch > code. In most places where set_pte_at() is called on a user mm, the vma > is also available. > > Anyway, I'm not saying we go this route, just thinking out loud, get > some opinions. Does get_user_pages() actually end up calling set_pte_at() normally? If not then on the normal user_mem_abort() route although we can easily check VM_MTE_TAG_INIT there's no obvious place to hook in to ensure that the pages actually allocated have the PG_mte_tagged flag. I'm also not sure how well this would work with the MMU notifiers path in KVM. With MMU notifiers (i.e. the VMM replacing a page in the memslot) there's not even an obvious hook to enforce the VMA flag. So I think we'd end up with something like the sanitise_mte_tags() function to at least check that the PG_mte_tagged flag is set on the pages (assuming that the trigger for the MMU notifier has done the corresponding set_pte_at()). Admittedly this might close the current race documented there. It also feels wrong to me to tie this to a process with prctl(), it seems much more normal to implement this as a new mprotect() flag as this is really a memory property not a process property. And I think we'll find some scary corner cases if we try to associate everything back to a process - although I can't instantly think of anything that will actually break. >>> Another thing I forgot to ask, what's guaranteeing that the page >>> supports tags? Does this ioctl ensure that it would attempt the tag >>> copying from some device mapping? Do we need some kvm_is_device_pfn() >>> check? I guess ZONE_DEVICE memory we just refuse to map in an earlier >>> patch. >> >> Hmm, nothing much. While reads are now fine (the memory won't have >> PG_mte_tagged), writes could potentially happen on ZONE_DEVICE memory. > > I don't think it's a problem for writes either as the host wouldn't map > such memory as tagged. It's just that it returns zeros and writes are > ignored, so we could instead return an error (I haven't checked your > latest series yet). The latest series uses pfn_to_online_page() to reject ZONE_DEVICE early. >>>> } else { >>>> num_tags = mte_copy_tags_from_user(maddr, tags, >>>> MTE_GRANULES_PER_PAGE); >>>> kvm_release_pfn_dirty(pfn); >>>> } >>>> >>>> if (num_tags != MTE_GRANULES_PER_PAGE) { >>>> ret = -EFAULT; >>>> goto out; >>>> } >>>> >>>> if (write) >>>> test_and_set_bit(PG_mte_tagged, &page->flags); >>> >>> I think a set_bit() would do, I doubt it's any more efficient. But why >> >> I'd seen test_and_set_bit() used elsewhere (I forget where now) as a >> slightly more efficient approach. It complies down to a READ_ONCE and a >> conditional atomic, vs a single non-conditional atomic. But I don't have >> any actual data on the performance and this isn't a hot path, so I'll >> switch to the more obvious set_bit(). > > Yeah, I think I've seen this as well. Anyway, it's probably lost in the > noise of tag writing here. > Agreed. Thanks, Steve
On Thu, May 27, 2021 at 08:50:30AM +0100, Steven Price wrote: > On 24/05/2021 19:11, Catalin Marinas wrote: > > I had some (random) thoughts on how to make things simpler, maybe. I > > think most of these races would have been solved if we required PROT_MTE > > in the VMM but this has an impact on the VMM if it wants to use MTE > > itself. If such requirement was in place, all KVM needed to do is check > > PG_mte_tagged. > > > > So what we actually need is a set_pte_at() in the VMM to clear the tags > > and set PG_mte_tagged. Currently, we only do this if the memory type is > > tagged (PROT_MTE) but it's not strictly necessary. > > > > As an optimisation for normal programs, we don't want to do this all the > > time but the visible behaviour wouldn't change (well, maybe for ptrace > > slightly). However, it doesn't mean we couldn't for a VMM, with an > > opt-in via prctl(). This would add a MMCF_MTE_TAG_INIT bit (couldn't > > think of a better name) to mm_context_t.flags and set_pte_at() would > > behave as if the pte was tagged without actually mapping the memory in > > user space as tagged (protection flags not changed). Pages that don't > > support tagging are still safe, just some unnecessary ignored tag > > writes. This would need to be set before the mmap() for the guest > > memory. > > > > If we want finer-grained control we'd have to store this information in > > the vma flags, in addition to VM_MTE (e.g. VM_MTE_TAG_INIT) but without > > affecting the actual memory type. The easiest would be another pte bit, > > though we are short on them. A more intrusive (not too bad) approach is > > to introduce a set_pte_at_vma() and read the flags directly in the arch > > code. In most places where set_pte_at() is called on a user mm, the vma > > is also available. > > > > Anyway, I'm not saying we go this route, just thinking out loud, get > > some opinions. > > Does get_user_pages() actually end up calling set_pte_at() normally? Not always, at least as how it's called from hva_to_pfn(). My reading of the get_user_page_fast_only() is that it doesn't touch the pte, just walks the page tables and pins the page. Of course, it expects a valid pte to have been set in the VMM already, otherwise it doesn't pin any page and the caller falls back to the slow path. The slow path, get_user_pages_unlocked(), passes FOLL_TOUCH and set_pte_at() will be called either in follow_pfn_pte() if it was valid or via faultin_page() -> handle_mm_fault(). > If not then on the normal user_mem_abort() route although we can > easily check VM_MTE_TAG_INIT there's no obvious place to hook in to > ensure that the pages actually allocated have the PG_mte_tagged flag. I don't think it helps if we checked such vma flag in user_mem_abort(), we'd still have the race with set_pte_at() on the page flags. What I was trying to avoid is touching the page flags in too many places, so deferring this always to set_pte_at() in the VMM. > I'm also not sure how well this would work with the MMU notifiers path > in KVM. With MMU notifiers (i.e. the VMM replacing a page in the > memslot) there's not even an obvious hook to enforce the VMA flag. So I > think we'd end up with something like the sanitise_mte_tags() function > to at least check that the PG_mte_tagged flag is set on the pages > (assuming that the trigger for the MMU notifier has done the > corresponding set_pte_at()). Admittedly this might close the current > race documented there. If we kept this check to the VMM set_pte_at(), I think we can ignore the notifiers. > It also feels wrong to me to tie this to a process with prctl(), it > seems much more normal to implement this as a new mprotect() flag as > this is really a memory property not a process property. And I think > we'll find some scary corner cases if we try to associate everything > back to a process - although I can't instantly think of anything that > will actually break. I agree, tying it to the process looks wrong, only that it's less intrusive. I don't think it would break anything, only potential performance regression. A process would still need to pass PROT_MTE to be able to get tag checking. That's basically what I had in an early MTE implementation with clear_user_page() always zeroing the tags. I agree with you that a vma flag would be better but it's more complicated without an additional pte bit. We could also miss some updates as mprotect() for example checks for pte_same() before calling set_pte_at() (it would need to check the updated vma flags). I'll review the latest series but I'm tempted to move the logic in santise_mte_tags() to mte.c and take the big lock in there if PG_mte_tagged is not already set. If we hit performance issues, we can optimise this later to have the page flag set already on creation (new PROT flag, prctl etc.).
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 24223adae150..b3edde68bc3e 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -184,6 +184,17 @@ struct kvm_vcpu_events { __u32 reserved[12]; }; +struct kvm_arm_copy_mte_tags { + __u64 guest_ipa; + __u64 length; + void __user *addr; + __u64 flags; + __u64 reserved[2]; +}; + +#define KVM_ARM_TAGS_TO_GUEST 0 +#define KVM_ARM_TAGS_FROM_GUEST 1 + /* If you need to interpret the index values, here is the key: */ #define KVM_REG_ARM_COPROC_MASK 0x000000000FFF0000 #define KVM_REG_ARM_COPROC_SHIFT 16 diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e89a5e275e25..4b6c83beb75d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1309,6 +1309,65 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, } } +static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, + struct kvm_arm_copy_mte_tags *copy_tags) +{ + gpa_t guest_ipa = copy_tags->guest_ipa; + size_t length = copy_tags->length; + void __user *tags = copy_tags->addr; + gpa_t gfn; + bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST); + int ret = 0; + + if (copy_tags->reserved[0] || copy_tags->reserved[1]) + return -EINVAL; + + if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST) + return -EINVAL; + + if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK) + return -EINVAL; + + gfn = gpa_to_gfn(guest_ipa); + + mutex_lock(&kvm->slots_lock); + + while (length > 0) { + kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL); + void *maddr; + unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE; + + if (is_error_noslot_pfn(pfn)) { + ret = -EFAULT; + goto out; + } + + maddr = page_address(pfn_to_page(pfn)); + + if (!write) { + num_tags = mte_copy_tags_to_user(tags, maddr, num_tags); + kvm_release_pfn_clean(pfn); + } else { + num_tags = mte_copy_tags_from_user(maddr, tags, + num_tags); + kvm_release_pfn_dirty(pfn); + } + + if (num_tags != PAGE_SIZE / MTE_GRANULE_SIZE) { + ret = -EFAULT; + goto out; + } + + gfn++; + tags += num_tags; + length -= PAGE_SIZE; + } + +out: + mutex_unlock(&kvm->slots_lock); + return ret; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1345,6 +1404,16 @@ long kvm_arch_vm_ioctl(struct file *filp, return 0; } + case KVM_ARM_MTE_COPY_TAGS: { + struct kvm_arm_copy_mte_tags copy_tags; + + if (!kvm_has_mte(kvm)) + return -EINVAL; + + if (copy_from_user(©_tags, argp, sizeof(copy_tags))) + return -EFAULT; + return kvm_vm_ioctl_mte_copy_tags(kvm, ©_tags); + } default: return -EINVAL; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 8c95ba0fadda..4c011c60d468 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1428,6 +1428,7 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_PMU_EVENT_FILTER */ #define KVM_SET_PMU_EVENT_FILTER _IOW(KVMIO, 0xb2, struct kvm_pmu_event_filter) #define KVM_PPC_SVM_OFF _IO(KVMIO, 0xb3) +#define KVM_ARM_MTE_COPY_TAGS _IOR(KVMIO, 0xb4, struct kvm_arm_copy_mte_tags) /* ioctl for vm fd */ #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device)
The VMM may not wish to have it's own mapping of guest memory mapped with PROT_MTE because this causes problems if the VMM has tag checking enabled (the guest controls the tags in physical RAM and it's unlikely the tags are correct for the VMM). Instead add a new ioctl which allows the VMM to easily read/write the tags from guest memory, allowing the VMM's mapping to be non-PROT_MTE while the VMM can still read/write the tags for the purpose of migration. Signed-off-by: Steven Price <steven.price@arm.com> --- arch/arm64/include/uapi/asm/kvm.h | 11 +++++ arch/arm64/kvm/arm.c | 69 +++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 3 files changed, 81 insertions(+)