[RFC,v2,01/13] mm/shmem: Introduce F_SEAL_GUEST

Message ID 20211119134739.20218-2-chao.p.peng@linux.intel.com (mailing list archive)
State New, archived
Series KVM: mm: fd-based approach for supporting KVM guest private memory

Commit Message

Chao Peng Nov. 19, 2021, 1:47 p.m. UTC
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>

The new seal type provides the semantics required for KVM guest private
memory support. A file descriptor with the seal set is going to be used
as the source of guest memory in confidential computing environments such
as Intel TDX and AMD SEV.

F_SEAL_GUEST can only be set on an empty memfd. After the seal is set,
userspace cannot read, write or mmap the memfd.

Userspace is in charge of the guest memory lifecycle: it can allocate
memory with fallocate() or punch a hole to free memory from the guest.

The file descriptor is passed down to KVM as the guest memory backend.
KVM registers itself as the owner of the memfd via memfd_register_guest().

KVM provides callbacks that need to be called on fallocate and punch
hole.

memfd_register_guest() returns callbacks that need to be used for
requesting a new page from the memfd.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
 include/linux/memfd.h      |  24 ++++++++
 include/linux/shmem_fs.h   |   9 +++
 include/uapi/linux/fcntl.h |   1 +
 mm/memfd.c                 |  33 +++++++++-
 mm/shmem.c                 | 123 ++++++++++++++++++++++++++++++++++++-
 5 files changed, 186 insertions(+), 4 deletions(-)
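
For illustration, the intended userspace flow described above might look
roughly like the following minimal sketch (error handling elided;
F_SEAL_GUEST's value is taken from the uapi hunk in the patch below):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef F_SEAL_GUEST
#define F_SEAL_GUEST 0x0020	/* from include/uapi/linux/fcntl.h below */
#endif

int main(void)
{
	/* The seal can only be set on an empty memfd created with
	 * sealing allowed. */
	int fd = memfd_create("guest-mem", MFD_CLOEXEC | MFD_ALLOW_SEALING);
	fcntl(fd, F_ADD_SEALS, F_SEAL_GUEST);

	/* Userspace stays in charge of the lifecycle: allocate backing
	 * pages with fallocate(), free them back with a punch-hole. */
	fallocate(fd, 0, 0, 2UL << 20);
	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		  0, 1UL << 20);

	/* The fd is then handed to KVM as the guest memory backend. */
	close(fd);
	return 0;
}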

Comments

David Hildenbrand Nov. 19, 2021, 1:51 p.m. UTC | #1
On 19.11.21 14:47, Chao Peng wrote:
> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> 
> The new seal type provides the semantics required for KVM guest private
> memory support. A file descriptor with the seal set is going to be used
> as the source of guest memory in confidential computing environments such
> as Intel TDX and AMD SEV.
> 
> F_SEAL_GUEST can only be set on an empty memfd. After the seal is set,
> userspace cannot read, write or mmap the memfd.
> 
> Userspace is in charge of the guest memory lifecycle: it can allocate
> memory with fallocate() or punch a hole to free memory from the guest.
> 
> The file descriptor is passed down to KVM as the guest memory backend.
> KVM registers itself as the owner of the memfd via memfd_register_guest().
> 
> KVM provides callbacks that need to be called on fallocate and punch
> hole.
> 
> memfd_register_guest() returns callbacks that need to be used for
> requesting a new page from the memfd.
> 

Repeating the feedback I already shared in a private mail thread:


As long as page migration / swapping is not supported, these pages
behave like any longterm pinned pages (e.g., VFIO) or secretmem pages.

1. These pages are not MOVABLE. They must not end up on ZONE_MOVABLE or
MIGRATE_CMA.

That should be easy to handle, you have to adjust the gfp_mask to
	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
just as mm/secretmem.c:secretmem_file_create() does.
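
A sketch of what that could look like here (hypothetical placement in the
sealing path; adding mapping_set_unevictable() would likely make sense too):

	/* With F_SEAL_GUEST: keep pages off ZONE_MOVABLE/MIGRATE_CMA and
	 * away from LRU reclaim, as mm/secretmem.c does. */
	if (seals & F_SEAL_GUEST) {
		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
		mapping_set_unevictable(inode->i_mapping);
	}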

2. These pages behave like mlocked pages and should be accounted as such.

This is probably where the accounting "fun" starts, but maybe it's
easier than I think to handle.

See mm/secretmem.c:secretmem_mmap(), where we account the pages as
VM_LOCKED and will consequently check per-process mlock limits. As we
don't mmap(), the same approach cannot be reused.

See drivers/vfio/vfio_iommu_type1.c:vfio_pin_map_dma() and
vfio_pin_pages_remote() on how to manually account via mm->locked_vm .

But it's a bit hairy because these pages are not actually mapped into
the page tables of the MM, so it might need some thought. Similarly,
these pages actually behave like "pinned" (as in mm->pinned_vm), but we
just don't increase the refcount AFAIR. Again, accounting really is a
bit hairy ...
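
For reference, a rough sketch of manual accounting along those lines (the
helper name and its placement are hypothetical):

static int guest_mem_charge_locked(struct mm_struct *mm, long npages)
{
	/* Charge npages against RLIMIT_MEMLOCK via mm->locked_vm, roughly
	 * as vfio_pin_pages_remote() does. */
	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	int ret = 0;

	mmap_write_lock(mm);
	if (mm->locked_vm + npages > limit && !capable(CAP_IPC_LOCK))
		ret = -ENOMEM;
	else
		mm->locked_vm += npages;
	mmap_write_unlock(mm);

	return ret;
}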
Jason Gunthorpe Nov. 19, 2021, 3:19 p.m. UTC | #2
On Fri, Nov 19, 2021 at 09:47:27PM +0800, Chao Peng wrote:
> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> 
> The new seal type provides the semantics required for KVM guest private
> memory support. A file descriptor with the seal set is going to be used
> as the source of guest memory in confidential computing environments such
> as Intel TDX and AMD SEV.
> 
> F_SEAL_GUEST can only be set on an empty memfd. After the seal is set,
> userspace cannot read, write or mmap the memfd.
> 
> Userspace is in charge of the guest memory lifecycle: it can allocate
> memory with fallocate() or punch a hole to free memory from the guest.
> 
> The file descriptor is passed down to KVM as the guest memory backend.
> KVM registers itself as the owner of the memfd via memfd_register_guest().
> 
> KVM provides callbacks that need to be called on fallocate and punch
> hole.
> 
> memfd_register_guest() returns callbacks that need to be used for
> requesting a new page from the memfd.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
>  include/linux/memfd.h      |  24 ++++++++
>  include/linux/shmem_fs.h   |   9 +++
>  include/uapi/linux/fcntl.h |   1 +
>  mm/memfd.c                 |  33 +++++++++-
>  mm/shmem.c                 | 123 ++++++++++++++++++++++++++++++++++++-
>  5 files changed, 186 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/memfd.h b/include/linux/memfd.h
> index 4f1600413f91..ff920ef28688 100644
> --- a/include/linux/memfd.h
> +++ b/include/linux/memfd.h
> @@ -4,13 +4,37 @@
>  
>  #include <linux/file.h>
>  
> +struct guest_ops {
> +	void (*invalidate_page_range)(struct inode *inode, void *owner,
> +				      pgoff_t start, pgoff_t end);
> +	void (*fallocate)(struct inode *inode, void *owner,
> +			  pgoff_t start, pgoff_t end);
> +};
> +
> +struct guest_mem_ops {
> +	unsigned long (*get_lock_pfn)(struct inode *inode, pgoff_t offset,
> +				      bool alloc, int *order);
> +	void (*put_unlock_pfn)(unsigned long pfn);
> +
> +};

Ignoring confidential compute for a moment

If qemu can put all the guest memory in a memfd and not map it, then
I'd also like to see that the IOMMU can use this interface too so we
can have VFIO working in this configuration.

As designed the above looks useful to import a memfd to a VFIO
container, but could you consider some more generic naming than calling
this 'guest'?

Along the same lines, to support fast migration, we'd want to be able
to send these things to the RDMA subsystem as well so we can do data
xfer. Very similar to VFIO.

Also, shouldn't this be two patches? F_SEAL is not really related to
these accessors, is it?

> +extern inline int memfd_register_guest(struct inode *inode, void *owner,
> +				       const struct guest_ops *guest_ops,
> +				       const struct guest_mem_ops **guest_mem_ops);

Why does this take an inode and not a file *?

> +int shmem_register_guest(struct inode *inode, void *owner,
> +			 const struct guest_ops *guest_ops,
> +			 const struct guest_mem_ops **guest_mem_ops)
> +{
> +	struct shmem_inode_info *info = SHMEM_I(inode);
> +
> +	if (!owner)
> +		return -EINVAL;
> +
> +	if (info->guest_owner && info->guest_owner != owner)
> +		return -EPERM;

And this looks like it means only a single subsystem can use this API
at once, not so nice..

Jason
David Hildenbrand Nov. 19, 2021, 3:39 p.m. UTC | #3
On 19.11.21 16:19, Jason Gunthorpe wrote:
> On Fri, Nov 19, 2021 at 09:47:27PM +0800, Chao Peng wrote:
>> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
>>
>> The new seal type provides the semantics required for KVM guest private
>> memory support. A file descriptor with the seal set is going to be used
>> as the source of guest memory in confidential computing environments such
>> as Intel TDX and AMD SEV.
>>
>> F_SEAL_GUEST can only be set on an empty memfd. After the seal is set,
>> userspace cannot read, write or mmap the memfd.
>>
>> Userspace is in charge of the guest memory lifecycle: it can allocate
>> memory with fallocate() or punch a hole to free memory from the guest.
>>
>> The file descriptor is passed down to KVM as the guest memory backend.
>> KVM registers itself as the owner of the memfd via memfd_register_guest().
>>
>> KVM provides callbacks that need to be called on fallocate and punch
>> hole.
>>
>> memfd_register_guest() returns callbacks that need to be used for
>> requesting a new page from the memfd.
>>
>> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
>> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
>>  include/linux/memfd.h      |  24 ++++++++
>>  include/linux/shmem_fs.h   |   9 +++
>>  include/uapi/linux/fcntl.h |   1 +
>>  mm/memfd.c                 |  33 +++++++++-
>>  mm/shmem.c                 | 123 ++++++++++++++++++++++++++++++++++++-
>>  5 files changed, 186 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/linux/memfd.h b/include/linux/memfd.h
>> index 4f1600413f91..ff920ef28688 100644
>> --- a/include/linux/memfd.h
>> +++ b/include/linux/memfd.h
>> @@ -4,13 +4,37 @@
>>  
>>  #include <linux/file.h>
>>  
>> +struct guest_ops {
>> +	void (*invalidate_page_range)(struct inode *inode, void *owner,
>> +				      pgoff_t start, pgoff_t end);
>> +	void (*fallocate)(struct inode *inode, void *owner,
>> +			  pgoff_t start, pgoff_t end);
>> +};
>> +
>> +struct guest_mem_ops {
>> +	unsigned long (*get_lock_pfn)(struct inode *inode, pgoff_t offset,
>> +				      bool alloc, int *order);
>> +	void (*put_unlock_pfn)(unsigned long pfn);
>> +
>> +};
> 
> Ignoring confidential compute for a moment
> 
> If qemu can put all the guest memory in a memfd and not map it, then
> I'd also like to see that the IOMMU can use this interface too so we
> can have VFIO working in this configuration.

In QEMU we usually want to (and must) be able to access guest memory
from user space; with the current design we wouldn't even be able to
temporarily mmap it -- which makes sense for encrypted memory only. The
corner case really is encrypted memory. So I don't think we'll see a
broad use of this feature outside of encrypted VMs in QEMU. I might be
wrong, most probably I am :)

> 
> As designed the above looks useful to import a memfd to a VFIO
> container, but could you consider some more generic naming than calling
> this 'guest'?

+1 the guest terminology is somewhat sub-optimal.

> 
> Along the same lines, to support fast migration, we'd want to be able
> to send these things to the RDMA subsystem as well so we can do data
> xfer. Very similar to VFIO.
> 
> Also, shouldn't this be two patches? F_SEAL is not really related to
> these accessors, is it?


Apart from the special "encrypted memory" semantics, I assume nothing
speaks against allowing for mmapping these memfds, for example, for any
other VFIO use cases.
Jason Gunthorpe Nov. 19, 2021, 4 p.m. UTC | #4
On Fri, Nov 19, 2021 at 04:39:15PM +0100, David Hildenbrand wrote:

> > If qemu can put all the guest memory in a memfd and not map it, then
> > I'd also like to see that the IOMMU can use this interface too so we
> > can have VFIO working in this configuration.
> 
> In QEMU we usually want to (and must) be able to access guest memory
> from user space; with the current design we wouldn't even be able to
> temporarily mmap it -- which makes sense for encrypted memory only. The
> corner case really is encrypted memory. So I don't think we'll see a
> broad use of this feature outside of encrypted VMs in QEMU. I might be
> wrong, most probably I am :)

Interesting..

The non-encrypted case I had in mind is the horrible flow in VFIO to
support qemu re-execing itself (VFIO_DMA_UNMAP_FLAG_VADDR).

Here VFIO is connected to a VA in a mm_struct that will become invalid
during the kexec period, but VFIO needs to continue to access it. For
IOMMU cases this is OK because the memory is already pinned, but for
the 'emulated iommu' used by mdevs, pages are pinned dynamically. qemu
needs to ensure that VFIO can continue to access the pages across the
kexec, even though there is nothing to pin_user_pages() on.

This flow would work a lot better if VFIO was connected to the memfd
that is storing the guest memory. Then it naturally doesn't get
disrupted by exec() and we don't need the mess in the kernel..

I was wondering if we could get here using the direct_io APIs but this
would do the job too.

> Apart from the special "encrypted memory" semantics, I assume nothing
> speaks against allowing for mmapping these memfds, for example, for any
> other VFIO use cases.

We will eventually have VFIO with "encrypted memory". There was a talk
in LPC about the enabling work for this.

So, if the plan is to put fully encrypted memory inside a memfd, then
we still will eventually need a way to pull the pfns into the
IOMMU, presumably along with the access control parameters needed to
pass to the secure monitor to join a PCI device to the secure memory.

Jason
Sean Christopherson Nov. 19, 2021, 7:18 p.m. UTC | #5
On Fri, Nov 19, 2021, David Hildenbrand wrote:
> On 19.11.21 16:19, Jason Gunthorpe wrote:
> > As designed the above looks useful to import a memfd to a VFIO
> > container, but could you consider some more generic naming than calling
> > this 'guest'?
> 
> > +1 the guest terminology is somewhat sub-optimal.

For the F_SEAL part, maybe F_SEAL_UNMAPPABLE?

No ideas for the kernel API, but that's also less concerning since it's not set
in stone.  I'm also not sure that dedicated APIs for each high-ish level use case
would be a bad thing, as the semantics are likely to differ to some extent.
E.g. for the KVM use case, there can be at most one guest associated with the fd,
but there can be any number of VFIO devices attached to the fd.
Jason Gunthorpe Nov. 19, 2021, 7:47 p.m. UTC | #6
On Fri, Nov 19, 2021 at 07:18:00PM +0000, Sean Christopherson wrote:
> On Fri, Nov 19, 2021, David Hildenbrand wrote:
> > On 19.11.21 16:19, Jason Gunthorpe wrote:
> > > As designed the above looks useful to import a memfd to a VFIO
> > > container, but could you consider some more generic naming than calling
> > > this 'guest'?
> > 
> > +1 the guest terminology is somewhat sub-optimal.
> 
> For the F_SEAL part, maybe F_SEAL_UNMAPPABLE?

Perhaps INACCESSIBLE?

> No ideas for the kernel API, but that's also less concerning since
> it's not set in stone.  I'm also not sure that dedicated APIs for
> each high-ish level use case would be a bad thing, as the semantics
> are likely to differ to some extent.  E.g. for the KVM use
> case, there can be at most one guest associated with the fd, but
> there can be any number of VFIO devices attached to the fd.

Even the kvm thing is not a hard restriction when you take away
confidential compute.

Why can't we have multiple KVMs linked to the same FD if the memory
isn't encrypted? Sure it isn't actually useful but it should work
fine.

Supporting only one thing is just a way to avoid having a linked list
of clients to broadcast invalidations to - for instance by using a
standard notifier block...
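
Something like this sketch, for instance (all names here hypothetical):

#include <linux/notifier.h>

struct memfd_range {
	pgoff_t start, end;
};

/* One chain per fd would let any number of consumers (kvm, iommu,
 * rdma, ...) subscribe to invalidations instead of a single owner. */
static BLOCKING_NOTIFIER_HEAD(guest_mem_notifiers);

static void guest_mem_notify_invalidate(pgoff_t start, pgoff_t end)
{
	struct memfd_range range = { .start = start, .end = end };

	blocking_notifier_call_chain(&guest_mem_notifiers, 0, &range);
}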

Also, how does dirty tracking work on this memory?

Jason
Sean Christopherson Nov. 19, 2021, 10:21 p.m. UTC | #7
On Fri, Nov 19, 2021, Jason Gunthorpe wrote:
> On Fri, Nov 19, 2021 at 07:18:00PM +0000, Sean Christopherson wrote:
> > No ideas for the kernel API, but that's also less concerning since
> > it's not set in stone.  I'm also not sure that dedicated APIs for
> > each high-ish level use case would be a bad thing, as the semantics
> > are likely to differ to some extent.  E.g. for the KVM use
> > case, there can be at most one guest associated with the fd, but
> > there can be any number of VFIO devices attached to the fd.
> 
> Even the kvm thing is not a hard restriction when you take away
> confidential compute.
> 
> Why can't we have multiple KVMs linked to the same FD if the memory
> isn't encrypted? Sure it isn't actually useful but it should work
> fine.

Hmm, true, but I want the KVM semantics to be 1:1 even if memory isn't encrypted.
Encrypting memory with a key that isn't available to the host is necessary to
(mostly) remove the host kernel from the guest's TCB, but it's not necessary to
remove host userspace from the TCB.  KVM absolutely can and should be able to do
that without relying on additional hardware/firmware.  Ignoring attestation and
whether or not the guest fully trusts the host kernel, there's value in preventing
a buggy or compromised userspace from attacking/corrupting the guest by remapping
guest memory or by mapping the same memory into multiple guests.

> Supporting only one thing is just a way to avoid having a linked list
> of clients to broadcast invalidations to - for instance by using a
> standard notifier block...

It's not just avoiding the linked list, there's a trust element as well.  E.g. in
the scenario where a device can access a confidential VM's encrypted private memory,
the guest is still the "owner" of the memory and needs to explicitly grant access to
a third party, e.g. the device or perhaps another VM.

That said, I'm certainly not dead set on having "guest" in the name, nor am I
opposed to implementing multi-consumer support from the get-go so we don't end
up with a mess later on.

> Also, how does dirty tracking work on this memory?

For KVM usage, KVM would provide the dirty bit info.  No idea how VFIO or other
use cases would work.
Jason Gunthorpe Nov. 19, 2021, 11:33 p.m. UTC | #8
On Fri, Nov 19, 2021 at 10:21:39PM +0000, Sean Christopherson wrote:
> On Fri, Nov 19, 2021, Jason Gunthorpe wrote:
> > On Fri, Nov 19, 2021 at 07:18:00PM +0000, Sean Christopherson wrote:
> > > No ideas for the kernel API, but that's also less concerning since
> > > it's not set in stone.  I'm also not sure that dedicated APIs for
> > > each high-ish level use case would be a bad thing, as the semantics
> > > are likely to differ to some extent.  E.g. for the KVM use
> > > case, there can be at most one guest associated with the fd, but
> > > there can be any number of VFIO devices attached to the fd.
> > 
> > Even the kvm thing is not a hard restriction when you take away
> > confidential compute.
> > 
> > Why can't we have multiple KVMs linked to the same FD if the memory
> > isn't encrypted? Sure it isn't actually useful but it should work
> > fine.
> 
> Hmm, true, but I want the KVM semantics to be 1:1 even if memory
> isn't encrypted.

That is policy and it doesn't belong hardwired into the kernel.

Your explanation makes me think that the F_SEAL_XX isn't defined
properly. It should be a userspace trap door to prevent any new
external accesses, including establishing new kvms, iommu's, rdmas,
mmaps, read/write, etc.

> It's not just avoiding the linked list, there's a trust element as
> well.  E.g. in the scenario where a device can access a confidential
> VM's encrypted private memory, the guest is still the "owner" of the
> memory and needs to explicitly grant access to a third party,
> e.g. the device or perhaps another VM.

Authorization is some other issue - the internal kAPI should be able
to indicate it is secured memory and the API user should do whatever
dance to gain access to it. E.g. for VFIO, ask the realm manager to
associate the pci_device with the owner realm.

Jason
Sean Christopherson Nov. 20, 2021, 1:23 a.m. UTC | #9
On Fri, Nov 19, 2021, Jason Gunthorpe wrote:
> On Fri, Nov 19, 2021 at 10:21:39PM +0000, Sean Christopherson wrote:
> > On Fri, Nov 19, 2021, Jason Gunthorpe wrote:
> > > On Fri, Nov 19, 2021 at 07:18:00PM +0000, Sean Christopherson wrote:
> > > > No ideas for the kernel API, but that's also less concerning since
> > > > it's not set in stone.  I'm also not sure that dedicated APIs for
> > > > each high-ish level use case would be a bad thing, as the semantics
> > > > are likely to differ to some extent.  E.g. for the KVM use
> > > > case, there can be at most one guest associated with the fd, but
> > > > there can be any number of VFIO devices attached to the fd.
> > > 
> > > Even the kvm thing is not a hard restriction when you take away
> > > confidential compute.
> > > 
> > > Why can't we have multiple KVMs linked to the same FD if the memory
> > > isn't encrypted? Sure it isn't actually useful but it should work
> > > fine.
> > 
> > Hmm, true, but I want the KVM semantics to be 1:1 even if memory
> > isn't encrypted.
> 
> That is policy and it doesn't belong hardwired into the kernel.

Agreed.  I had a blurb typed up about that policy just being an "exclusive" flag
in the kernel API that KVM would set when creating a confidential VM, but deleted
it and forgot to restore it when I went down the tangent of removing userspace
from the TCB without an assist from hardware/firmware.

> Your explanation makes me think that the F_SEAL_XX isn't defined
> properly. It should be a userspace trap door to prevent any new
> external accesses, including establishing new kvms, iommu's, rdmas,
> mmaps, read/write, etc.

Hmm, the way I was thinking of it is that the F_SEAL_XX itself would prevent
mapping/accessing it from userspace, and that any policy beyond that would be
done via kernel APIs and thus handled by whatever in-kernel agent can access the
memory.  E.g. in the confidential VM case, without support for trusted devices,
KVM would require that it be the sole owner of the file.
Jason Gunthorpe Nov. 21, 2021, 12:05 a.m. UTC | #10
On Sat, Nov 20, 2021 at 01:23:16AM +0000, Sean Christopherson wrote:
> On Fri, Nov 19, 2021, Jason Gunthorpe wrote:
> > On Fri, Nov 19, 2021 at 10:21:39PM +0000, Sean Christopherson wrote:
> > > On Fri, Nov 19, 2021, Jason Gunthorpe wrote:
> > > > On Fri, Nov 19, 2021 at 07:18:00PM +0000, Sean Christopherson wrote:
> > > > > No ideas for the kernel API, but that's also less concerning since
> > > > > it's not set in stone.  I'm also not sure that dedicated APIs for
> > > > > each high-ish level use case would be a bad thing, as the semantics
> > > > > are likely to differ to some extent.  E.g. for the KVM use
> > > > > case, there can be at most one guest associated with the fd, but
> > > > > there can be any number of VFIO devices attached to the fd.
> > > > 
> > > > Even the kvm thing is not a hard restriction when you take away
> > > > confidential compute.
> > > > 
> > > > Why can't we have multiple KVMs linked to the same FD if the memory
> > > > isn't encrypted? Sure it isn't actually useful but it should work
> > > > fine.
> > > 
> > > Hmm, true, but I want the KVM semantics to be 1:1 even if memory
> > > isn't encrypted.
> > 
> > That is policy and it doesn't belong hardwired into the kernel.
> 
> Agreed.  I had a blurb typed up about that policy just being an "exclusive" flag
> in the kernel API that KVM would set when creating a confidential
> VM,

I still think that is policy in the kernel; what is wrong with
userspace doing it?

> > Your explanation makes me think that the F_SEAL_XX isn't defined
> > properly. It should be a userspace trap door to prevent any new
> > external accesses, including establishing new kvms, iommu's, rdmas,
> > mmaps, read/write, etc.
> 
> Hmm, the way I was thinking of it is that the F_SEAL_XX itself would prevent
> mapping/accessing it from userspace, and that any policy beyond that would be
> done via kernel APIs and thus handled by whatever in-kernel agent can access the
> memory.  E.g. in the confidential VM case, without support for trusted devices,
> KVM would require that it be the sole owner of the file.

And how would kvm know if there is support for trusted devices?
Again, seems like policy choices that should be left in userspace.

Especially for what could be a general in-kernel mechanism with many
users and not tightly linked to KVM as imagined here.

Jason
David Hildenbrand Nov. 22, 2021, 9:26 a.m. UTC | #11
On 19.11.21 17:00, Jason Gunthorpe wrote:
> On Fri, Nov 19, 2021 at 04:39:15PM +0100, David Hildenbrand wrote:
> 
>>> If qemu can put all the guest memory in a memfd and not map it, then
>>> I'd also like to see that the IOMMU can use this interface too so we
>>> can have VFIO working in this configuration.
>>
>> In QEMU we usually want to (and must) be able to access guest memory
>> from user space; with the current design we wouldn't even be able to
>> temporarily mmap it -- which makes sense for encrypted memory only. The
>> corner case really is encrypted memory. So I don't think we'll see a
>> broad use of this feature outside of encrypted VMs in QEMU. I might be
>> wrong, most probably I am :)
> 
> Interesting..
> 
> The non-encrypted case I had in mind is the horrible flow in VFIO to
> support qemu re-execing itself (VFIO_DMA_UNMAP_FLAG_VADDR).

Thanks for sharing!

> 
> Here VFIO is connected to a VA in a mm_struct that will become invalid
> during the kexec period, but VFIO needs to continue to access it. For
> IOMMU cases this is OK because the memory is already pinned, but for
> the 'emulated iommu' used by mdevs pages are pinned dynamically. qemu
> needs to ensure that VFIO can continue to access the pages across the
> kexec, even though there is nothing to pin_user_pages() on.
> 
> This flow would work a lot better if VFIO was connected to the memfd
> that is storing the guest memory. Then it naturally doesn't get
> disrupted by exec() and we don't need the mess in the kernel..

I do wonder if we want to support sharing such memfds between processes
in all cases ... we most certainly don't want to be able to share
encrypted memory between VMs (I heard that the kernel has to forbid
that). It would make sense in the use case you describe, though.

> 
> I was wondering if we could get here using the direct_io APIs but this
> would do the job too.
> 
>> Apart from the special "encrypted memory" semantics, I assume nothing
>> speaks against allowing for mmapping these memfds, for example, for any
>> other VFIO use cases.
> 
> We will eventually have VFIO with "encrypted memory". There was a talk
> in LPC about the enabling work for this.

Yes, I heard about that as well. In the foreseeable future, we'll have
shared memory only visible for VFIO devices.

> 
> So, if the plan is to put fully encrypted memory inside a memfd, then
> we still will eventually need a way to pull the pfns into the
> IOMMU, presumably along with the access control parameters needed to
> pass to the secure monitor to join a PCI device to the secure memory.

Long-term, agreed.
Jason Gunthorpe Nov. 22, 2021, 1:31 p.m. UTC | #12
On Mon, Nov 22, 2021 at 10:26:12AM +0100, David Hildenbrand wrote:

> I do wonder if we want to support sharing such memfds between processes
> in all cases ... we most certainly don't want to be able to share
> encrypted memory between VMs (I heard that the kernel has to forbid
> that). It would make sense in the use case you describe, though.

If there is a F_SEAL_XX that blocks every kind of new access, who
cares if userspace passes the FD around or not?

Jason
David Hildenbrand Nov. 22, 2021, 1:35 p.m. UTC | #13
On 22.11.21 14:31, Jason Gunthorpe wrote:
> On Mon, Nov 22, 2021 at 10:26:12AM +0100, David Hildenbrand wrote:
> 
>> I do wonder if we want to support sharing such memfds between processes
>> in all cases ... we most certainly don't want to be able to share
>> encrypted memory between VMs (I heard that the kernel has to forbid
>> that). It would make sense in the use case you describe, though.
> 
> If there is a F_SEAL_XX that blocks every kind of new access, who
> cares if userspace passes the FD around or not?
I was imagining that you actually would want to do some kind of "change
ownership". But yeah, the intended semantics and all use cases we have
in mind are not fully clear to me yet. If it's really "no new access"
(side note: is "access" the right word?) then sure, we can pass the fd
around.
Kirill A. Shutemov Nov. 22, 2021, 1:59 p.m. UTC | #14
On Fri, Nov 19, 2021 at 02:51:11PM +0100, David Hildenbrand wrote:
> On 19.11.21 14:47, Chao Peng wrote:
> > From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> > 
> > The new seal type provides the semantics required for KVM guest private
> > memory support. A file descriptor with the seal set is going to be used
> > as the source of guest memory in confidential computing environments such
> > as Intel TDX and AMD SEV.
> > 
> > F_SEAL_GUEST can only be set on an empty memfd. After the seal is set,
> > userspace cannot read, write or mmap the memfd.
> > 
> > Userspace is in charge of the guest memory lifecycle: it can allocate
> > memory with fallocate() or punch a hole to free memory from the guest.
> > 
> > The file descriptor is passed down to KVM as the guest memory backend.
> > KVM registers itself as the owner of the memfd via memfd_register_guest().
> > 
> > KVM provides callbacks that need to be called on fallocate and punch
> > hole.
> > 
> > memfd_register_guest() returns callbacks that need to be used for
> > requesting a new page from the memfd.
> > 
> 
> Repeating the feedback I already shared in a private mail thread:
> 
> 
> As long as page migration / swapping is not supported, these pages
> behave like any longterm pinned pages (e.g., VFIO) or secretmem pages.
> 
> 1. These pages are not MOVABLE. They must not end up on ZONE_MOVABLE or
> MIGRATE_CMA.
> 
> That should be easy to handle, you have to adjust the gfp_mask to
> 	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
> just as mm/secretmem.c:secretmem_file_create() does.

Okay, fair enough. mapping_set_unevictable() also makes sense.

> 2. These pages behave like mlocked pages and should be accounted as such.
> 
> This is probably where the accounting "fun" starts, but maybe it's
> easier than I think to handle.
> 
> See mm/secretmem.c:secretmem_mmap(), where we account the pages as
> VM_LOCKED and will consequently check per-process mlock limits. As we
> don't mmap(), the same approach cannot be reused.
> 
> See drivers/vfio/vfio_iommu_type1.c:vfio_pin_map_dma() and
> vfio_pin_pages_remote() on how to manually account via mm->locked_vm .
> 
> But it's a bit hairy because these pages are not actually mapped into
> the page tables of the MM, so it might need some thought. Similarly,
> these pages actually behave like "pinned" (as in mm->pinned_vm), but we
> just don't increase the refcount AFAIR. Again, accounting really is a
> bit hairy ...

Accounting is fun indeed. Non-mapped mlocked memory is going to be
confusing. Hm...

I will look closer.
Jason Gunthorpe Nov. 22, 2021, 2:01 p.m. UTC | #15
On Mon, Nov 22, 2021 at 02:35:49PM +0100, David Hildenbrand wrote:
> On 22.11.21 14:31, Jason Gunthorpe wrote:
> > On Mon, Nov 22, 2021 at 10:26:12AM +0100, David Hildenbrand wrote:
> > 
> >> I do wonder if we want to support sharing such memfds between processes
> >> in all cases ... we most certainly don't want to be able to share
> >> encrypted memory between VMs (I heard that the kernel has to forbid
> >> that). It would make sense in the use case you describe, though.
> > 
> > If there is a F_SEAL_XX that blocks every kind of new access, who
> > cares if userspace passes the FD around or not?
> I was imagining that you actually would want to do some kind of "change
> ownership". But yeah, the intended semantics and all use cases we have
> in mind are not fully clear to me yet. If it's really "no new access"
> (side note: is "access" the right word?) then sure, we can pass the fd
> around.

What is "ownership" in a world with kvm and iommu are reading pages
out of the same fd?

"no new access" makes sense to me, we have access through
read/write/mmap/splice/etc and access to pages through the private in
kernel interface (kvm, iommu)

Jason
David Hildenbrand Nov. 22, 2021, 2:57 p.m. UTC | #16
On 22.11.21 15:01, Jason Gunthorpe wrote:
> On Mon, Nov 22, 2021 at 02:35:49PM +0100, David Hildenbrand wrote:
>> On 22.11.21 14:31, Jason Gunthorpe wrote:
>>> On Mon, Nov 22, 2021 at 10:26:12AM +0100, David Hildenbrand wrote:
>>>
>>>> I do wonder if we want to support sharing such memfds between processes
>>>> in all cases ... we most certainly don't want to be able to share
>>>> encrypted memory between VMs (I heard that the kernel has to forbid
>>>> that). It would make sense in the use case you describe, though.
>>>
>>> If there is a F_SEAL_XX that blocks every kind of new access, who
>>> cares if userspace passes the FD around or not?
>> I was imagining that you actually would want to do some kind of "change
>> ownership". But yeah, the intended semantics and all use cases we have
>> in mind are not fully clear to me yet. If it's really "no new access"
>> (side note: is "access" the right word?) then sure, we can pass the fd
>> around.
> 
> What is "ownership" in a world where kvm and iommu are reading pages
> out of the same fd?

In the world of encrypted memory / TDX, KVM somewhat "owns" that memory
IMHO (for example, only it can migrate or swap out these pages; it
might be debatable whether the TDX module or KVM actually "owns" these pages).
Jason Gunthorpe Nov. 22, 2021, 3:09 p.m. UTC | #17
On Mon, Nov 22, 2021 at 03:57:17PM +0100, David Hildenbrand wrote:
> On 22.11.21 15:01, Jason Gunthorpe wrote:
> > On Mon, Nov 22, 2021 at 02:35:49PM +0100, David Hildenbrand wrote:
> >> On 22.11.21 14:31, Jason Gunthorpe wrote:
> >>> On Mon, Nov 22, 2021 at 10:26:12AM +0100, David Hildenbrand wrote:
> >>>
> >>>> I do wonder if we want to support sharing such memfds between processes
> >>>> in all cases ... we most certainly don't want to be able to share
> >>>> encrypted memory between VMs (I heard that the kernel has to forbid
> >>>> that). It would make sense in the use case you describe, though.
> >>>
> >>> If there is a F_SEAL_XX that blocks every kind of new access, who
> >>> cares if userspace passes the FD around or not?
> >> I was imagining that you actually would want to do some kind of "change
> >> ownership". But yeah, the intended semantics and all use cases we have
> >> in mind are not fully clear to me yet. If it's really "no new access"
> >> (side note: is "access" the right word?) then sure, we can pass the fd
> >> around.
> > 
> > What is "ownership" in a world where kvm and iommu are reading pages
> > out of the same fd?
> 
> In the world of encrypted memory / TDX, KVM somewhat "owns" that memory
> IMHO (for example, only it can migrate or swap out these pages; it
> might be debatable whether the TDX module or KVM actually "owns" these pages).

Sounds like it is a swap provider more than an owner?

Jason
David Hildenbrand Nov. 22, 2021, 3:15 p.m. UTC | #18
On 22.11.21 16:09, Jason Gunthorpe wrote:
> On Mon, Nov 22, 2021 at 03:57:17PM +0100, David Hildenbrand wrote:
>> On 22.11.21 15:01, Jason Gunthorpe wrote:
>>> On Mon, Nov 22, 2021 at 02:35:49PM +0100, David Hildenbrand wrote:
>>>> On 22.11.21 14:31, Jason Gunthorpe wrote:
>>>>> On Mon, Nov 22, 2021 at 10:26:12AM +0100, David Hildenbrand wrote:
>>>>>
>>>>>> I do wonder if we want to support sharing such memfds between processes
>>>>>> in all cases ... we most certainly don't want to be able to share
>>>>>> encrypted memory between VMs (I heard that the kernel has to forbid
>>>>>> that). It would make sense in the use case you describe, though.
>>>>>
>>>>> If there is a F_SEAL_XX that blocks every kind of new access, who
>>>>> cares if userspace passes the FD around or not?
>>>> I was imagining that you actually would want to do some kind of "change
>>>> ownership". But yeah, the intended semantics and all use cases we have
>>>> in mind are not fully clear to me yet. If it's really "no new access"
>>>> (side note: is "access" the right word?) then sure, we can pass the fd
>>>> around.
>>>
>>> What is "ownership" in a world where kvm and iommu are reading pages
>>> out of the same fd?
>>
>> In the world of encrypted memory / TDX, KVM somewhat "owns" that memory
>> IMHO (for example, only it can migrate or swap out these pages; it
>> might be debatable whether the TDX module or KVM actually "owns" these pages).
> 
> Sounds like it is a swap provider more than an owner?

Yes, I think we can phrase it that way, + "migrate provider"
Paolo Bonzini Nov. 23, 2021, 8:54 a.m. UTC | #19
On 11/19/21 14:47, Chao Peng wrote:
> +static void guest_invalidate_page(struct inode *inode,
> +				  struct page *page, pgoff_t start, pgoff_t end)
> +{
> +	struct shmem_inode_info *info = SHMEM_I(inode);
> +
> +	if (!info->guest_ops || !info->guest_ops->invalidate_page_range)
> +		return;
> +
> +	start = max(start, page->index);
> +	end = min(end, page->index + thp_nr_pages(page)) - 1;
> +
> +	info->guest_ops->invalidate_page_range(inode, info->guest_owner,
> +					       start, end);
> +}

The lack of protection makes the API quite awkward to use;
the usual way to do this is with refcount_inc_not_zero (aka 
kvm_get_kvm_safe).

Can you use the shmem_inode_info spinlock to protect against this?  If 
register/unregister take the spinlock, the invalidate and fallocate can 
take a reference under the same spinlock, like this:

	const struct guest_ops *ops;
	int r;

	if (!info->guest_ops)
		return;

	spin_lock(&info->lock);
	ops = info->guest_ops;
	if (!ops) {
		spin_unlock(&info->lock);
		return;
	}

	/* Calls kvm_get_kvm_safe.  */
	r = ops->get_guest_owner(info->guest_owner);
	spin_unlock(&info->lock);
	if (r < 0)
		return;

	start = max(start, page->index);
	end = min(end, page->index + thp_nr_pages(page)) - 1;

	ops->invalidate_page_range(inode, info->guest_owner,
					       start, end);
	ops->put_guest_owner(info->guest_owner);

Considering that you have to take a mutex anyway in patch 13, and that 
the critical section here is very small, the extra indirect calls are 
cheaper than walking the vm_list; and it makes the API clearer.
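
The register side could then look something like this sketch (same names
as the patch, assuming get/put_guest_owner are added to guest_ops):

int shmem_register_guest(struct inode *inode, void *owner,
			 const struct guest_ops *guest_ops,
			 const struct guest_mem_ops **guest_mem_ops)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	int ret = 0;

	if (!owner)
		return -EINVAL;

	spin_lock(&info->lock);
	if (info->guest_owner && info->guest_owner != owner) {
		ret = -EPERM;
	} else {
		info->guest_owner = owner;
		info->guest_ops = guest_ops;
	}
	spin_unlock(&info->lock);

	if (!ret)
		*guest_mem_ops = &shmem_guest_ops;
	return ret;
}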

Paolo
Paolo Bonzini Nov. 23, 2021, 9:06 a.m. UTC | #20
On 11/19/21 16:39, David Hildenbrand wrote:
>> If qemu can put all the guest memory in a memfd and not map it, then
>> I'd also like to see that the IOMMU can use this interface too so we
>> can have VFIO working in this configuration.
> 
> In QEMU we usually want to (and must) be able to access guest memory
> from user space; with the current design we wouldn't even be able to
> temporarily mmap it -- which makes sense for encrypted memory only. The
> corner case really is encrypted memory. So I don't think we'll see a
> broad use of this feature outside of encrypted VMs in QEMU. I might be
> wrong, most probably I am :)

It's not _that_ crazy an idea, but it's going to be some work to teach 
KVM that it has to kmap/kunmap around all memory accesses.

I think it's great that memfd hooks are usable by more than one 
subsystem, OTOH it's fair that whoever needs it does the work---and VFIO 
does not need it for confidential VMs, yet, so it should be fine for now 
to have a single user.

On the other hand, as I commented already, the lack of locking in the 
register/unregister functions has to be fixed even with a single user. 
Another thing we can do already is change the guest_ops/guest_mem_ops to 
something like memfd_falloc_notifier_ops/memfd_pfn_ops, and the 
register/unregister functions to memfd_register/unregister_falloc_notifier.

Chao, can you also put this under a new CONFIG such as "bool MEMFD_OPS", 
and select it from KVM?
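
Something along these lines, presumably (a sketch, exact placement to be
decided):

config MEMFD_OPS
	bool

with a corresponding "select MEMFD_OPS" under config KVM.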

Thanks,

Paolo
Chao Peng Nov. 23, 2021, 2:33 p.m. UTC | #21
On Tue, Nov 23, 2021 at 10:06:02AM +0100, Paolo Bonzini wrote:
> On 11/19/21 16:39, David Hildenbrand wrote:
> > > If qemu can put all the guest memory in a memfd and not map it, then
> > > I'd also like to see that the IOMMU can use this interface too so we
> > > can have VFIO working in this configuration.
> > 
> > In QEMU we usually want to (and must) be able to access guest memory
> > from user space; with the current design we wouldn't even be able to
> > temporarily mmap it -- which makes sense for encrypted memory only. The
> > corner case really is encrypted memory. So I don't think we'll see a
> > broad use of this feature outside of encrypted VMs in QEMU. I might be
> > wrong, most probably I am :)
> 
> It's not _that_ crazy an idea, but it's going to be some work to teach KVM
> that it has to kmap/kunmap around all memory accesses.
> 
> I think it's great that memfd hooks are usable by more than one subsystem,
> OTOH it's fair that whoever needs it does the work---and VFIO does not need
> it for confidential VMs, yet, so it should be fine for now to have a single
> user.
> 
> On the other hand, as I commented already, the lack of locking in the
> register/unregister functions has to be fixed even with a single user.
> Another thing we can do already is change the guest_ops/guest_mem_ops to
> something like memfd_falloc_notifier_ops/memfd_pfn_ops, and the
> register/unregister functions to memfd_register/unregister_falloc_notifier.

I'm satisfied with this naming ;)

> 
> Chao, can you also put this under a new CONFIG such as "bool MEMFD_OPS", and
> select it from KVM?

Yes, reasonable.

> 
> Thanks,
> 
> Paolo
David Hildenbrand Nov. 23, 2021, 3:20 p.m. UTC | #22
On 23.11.21 10:06, Paolo Bonzini wrote:
> On 11/19/21 16:39, David Hildenbrand wrote:
>>> If qemu can put all the guest memory in a memfd and not map it, then
>>> I'd also like to see that the IOMMU can use this interface too so we
>>> can have VFIO working in this configuration.
>>
>> In QEMU we usually want to (and must) be able to access guest memory
>> from user space; with the current design we wouldn't even be able to
>> temporarily mmap it -- which makes sense for encrypted memory only. The
>> corner case really is encrypted memory. So I don't think we'll see a
>> broad use of this feature outside of encrypted VMs in QEMU. I might be
>> wrong, most probably I am :)
> 
> It's not _that_ crazy an idea, but it's going to be some work to teach 
> KVM that it has to kmap/kunmap around all memory accesses.

I'm also concerned about userspace access. But you sound like you have a
plan :)
Jason Gunthorpe Nov. 23, 2021, 5:17 p.m. UTC | #23
On Tue, Nov 23, 2021 at 10:06:02AM +0100, Paolo Bonzini wrote:

> I think it's great that memfd hooks are usable by more than one subsystem,
> OTOH it's fair that whoever needs it does the work---and VFIO does not need
> it for confidential VMs, yet, so it should be fine for now to have a single
> user.

I think adding a new interface to a core kernel subsystem should come
with a greater requirement to work out something generally useful and
not be overly wedded to a single use case (e.g. F_SEAL_GUEST).

Especially if something like 'single user' is not just a small
implementation artifact but a key design tenet of the whole eventual
solution.

Jason
Andy Lutomirski Dec. 3, 2021, 1:11 a.m. UTC | #24
On 11/19/21 05:47, Chao Peng wrote:
> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> 
> The new seal type provides the semantics required for KVM guest private
> memory support. A file descriptor with the seal set is going to be used
> as the source of guest memory in confidential computing environments such
> as Intel TDX and AMD SEV.
> 
> F_SEAL_GUEST can only be set on an empty memfd. After the seal is set,
> userspace cannot read, write or mmap the memfd.

I don't have a strong objection here, but, given that you're only 
supporting it for memfd, would a memfd_create() flag be more 
straightforward?  If nothing else, it would avoid any possible locking 
issue.

I'm also very very slightly nervous about a situation in which one 
program sends a memfd to an untrusted other process and that process 
truncates the memfd and then F_SEAL_GUESTs it.  This could be mostly 
mitigated by also requiring that no other seals be set when F_SEAL_GUEST 
happens, but the alternative MFD_GUEST would eliminate this issue too.
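
What the MFD_GUEST alternative could look like from userspace (a sketch;
MFD_GUEST and its value are hypothetical):

#define _GNU_SOURCE
#include <sys/mman.h>

#ifndef MFD_GUEST
#define MFD_GUEST 0x0008U	/* hypothetical: next free MFD_* bit */
#endif

static int create_guest_memfd(void)
{
	/* Guest-private from creation: no window for another process to
	 * truncate or seal the memfd first, and no seal-ordering rules. */
	return memfd_create("guest-mem", MFD_CLOEXEC | MFD_GUEST);
}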

Patch

diff --git a/include/linux/memfd.h b/include/linux/memfd.h
index 4f1600413f91..ff920ef28688 100644
--- a/include/linux/memfd.h
+++ b/include/linux/memfd.h
@@ -4,13 +4,37 @@ 
 
 #include <linux/file.h>
 
+struct guest_ops {
+	void (*invalidate_page_range)(struct inode *inode, void *owner,
+				      pgoff_t start, pgoff_t end);
+	void (*fallocate)(struct inode *inode, void *owner,
+			  pgoff_t start, pgoff_t end);
+};
+
+struct guest_mem_ops {
+	unsigned long (*get_lock_pfn)(struct inode *inode, pgoff_t offset,
+				      bool alloc, int *order);
+	void (*put_unlock_pfn)(unsigned long pfn);
+
+};
+
 #ifdef CONFIG_MEMFD_CREATE
 extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
+
+extern inline int memfd_register_guest(struct inode *inode, void *owner,
+				       const struct guest_ops *guest_ops,
+				       const struct guest_mem_ops **guest_mem_ops);
 #else
 static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a)
 {
 	return -EINVAL;
 }
+static inline int memfd_register_guest(struct inode *inode, void *owner,
+				       const struct guest_ops *guest_ops,
+				       const struct guest_mem_ops **guest_mem_ops)
+{
+	return -EINVAL;
+}
 #endif
 
 #endif /* __LINUX_MEMFD_H */
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 166158b6e917..8280c918775a 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -12,6 +12,9 @@ 
 
 /* inode in-kernel data */
 
+struct guest_ops;
+struct guest_mem_ops;
+
 struct shmem_inode_info {
 	spinlock_t		lock;
 	unsigned int		seals;		/* shmem seals */
@@ -25,6 +28,8 @@  struct shmem_inode_info {
 	struct simple_xattrs	xattrs;		/* list of xattrs */
 	atomic_t		stop_eviction;	/* hold when working on inode */
 	struct inode		vfs_inode;
+	void			*guest_owner;
+	const struct guest_ops	*guest_ops;
 };
 
 struct shmem_sb_info {
@@ -96,6 +101,10 @@  extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
 extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
 						pgoff_t start, pgoff_t end);
 
+extern int shmem_register_guest(struct inode *inode, void *owner,
+				const struct guest_ops *guest_ops,
+				const struct guest_mem_ops **guest_mem_ops);
+
 /* Flag allocation requirements to shmem_getpage */
 enum sgp_type {
 	SGP_READ,	/* don't exceed i_size, don't allocate page */
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 2f86b2ad6d7e..c79bc8572721 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -43,6 +43,7 @@ 
 #define F_SEAL_GROW	0x0004	/* prevent file from growing */
 #define F_SEAL_WRITE	0x0008	/* prevent writes */
 #define F_SEAL_FUTURE_WRITE	0x0010  /* prevent future writes while mapped */
+#define F_SEAL_GUEST		0x0020
 /* (1U << 31) is reserved for signed error codes */
 
 /*
diff --git a/mm/memfd.c b/mm/memfd.c
index 081dd33e6a61..a98b30bcf982 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -130,11 +130,25 @@  static unsigned int *memfd_file_seals_ptr(struct file *file)
 	return NULL;
 }
 
+int memfd_register_guest(struct inode *inode, void *owner,
+			 const struct guest_ops *guest_ops,
+			 const struct guest_mem_ops **guest_mem_ops)
+{
+	if (shmem_mapping(inode->i_mapping)) {
+		return shmem_register_guest(inode, owner,
+					    guest_ops, guest_mem_ops);
+	}
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(memfd_register_guest);
+
 #define F_ALL_SEALS (F_SEAL_SEAL | \
 		     F_SEAL_SHRINK | \
 		     F_SEAL_GROW | \
 		     F_SEAL_WRITE | \
-		     F_SEAL_FUTURE_WRITE)
+		     F_SEAL_FUTURE_WRITE | \
+		     F_SEAL_GUEST)
 
 static int memfd_add_seals(struct file *file, unsigned int seals)
 {
@@ -203,10 +217,27 @@  static int memfd_add_seals(struct file *file, unsigned int seals)
 		}
 	}
 
+	if (seals & F_SEAL_GUEST) {
+		i_mmap_lock_read(inode->i_mapping);
+
+		if (!RB_EMPTY_ROOT(&inode->i_mapping->i_mmap.rb_root)) {
+			error = -EBUSY;
+			goto unlock;
+		}
+
+		if (i_size_read(inode)) {
+			error = -EBUSY;
+			goto unlock;
+		}
+	}
+
 	*file_seals |= seals;
 	error = 0;
 
 unlock:
+	if (seals & F_SEAL_GUEST)
+		i_mmap_unlock_read(inode->i_mapping);
+
 	inode_unlock(inode);
 	return error;
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index 23c91a8beb78..38b3b6b9a3a5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -78,6 +78,7 @@  static struct vfsmount *shm_mnt;
 #include <linux/userfaultfd_k.h>
 #include <linux/rmap.h>
 #include <linux/uuid.h>
+#include <linux/memfd.h>
 
 #include <linux/uaccess.h>
 
@@ -906,6 +907,21 @@  static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
 	return split_huge_page(page) >= 0;
 }
 
+static void guest_invalidate_page(struct inode *inode,
+				  struct page *page, pgoff_t start, pgoff_t end)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+
+	if (!info->guest_ops || !info->guest_ops->invalidate_page_range)
+		return;
+
+	start = max(start, page->index);
+	end = min(end, page->index + thp_nr_pages(page)) - 1;
+
+	info->guest_ops->invalidate_page_range(inode, info->guest_owner,
+					       start, end);
+}
+
 /*
  * Remove range of pages and swap entries from page cache, and free them.
  * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
@@ -949,6 +965,8 @@  static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			}
 			index += thp_nr_pages(page) - 1;
 
+			guest_invalidate_page(inode, page, start, end);
+
 			if (!unfalloc || !PageUptodate(page))
 				truncate_inode_page(mapping, page);
 			unlock_page(page);
@@ -1025,6 +1043,9 @@  static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 					index--;
 					break;
 				}
+
+				guest_invalidate_page(inode, page, start, end);
+
 				VM_BUG_ON_PAGE(PageWriteback(page), page);
 				if (shmem_punch_compound(page, start, end))
 					truncate_inode_page(mapping, page);
@@ -1098,6 +1119,9 @@  static int shmem_setattr(struct user_namespace *mnt_userns,
 		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
 			return -EPERM;
 
+		if ((info->seals & F_SEAL_GUEST) && (newsize & ~PAGE_MASK))
+			return -EINVAL;
+
 		if (newsize != oldsize) {
 			error = shmem_reacct_size(SHMEM_I(inode)->flags,
 					oldsize, newsize);
@@ -1364,6 +1388,8 @@  static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 		goto redirty;
 	if (!total_swap_pages)
 		goto redirty;
+	if (info->seals & F_SEAL_GUEST)
+		goto redirty;
 
 	/*
 	 * Our capabilities prevent regular writeback or sync from ever calling
@@ -2262,6 +2288,9 @@  static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 	if (ret)
 		return ret;
 
+	if (info->seals & F_SEAL_GUEST)
+		return -EPERM;
+
 	/* arm64 - allow memory tagging on RAM-based files */
 	vma->vm_flags |= VM_MTE_ALLOWED;
 
@@ -2459,12 +2488,14 @@  shmem_write_begin(struct file *file, struct address_space *mapping,
 	int ret = 0;
 
 	/* i_rwsem is held by caller */
-	if (unlikely(info->seals & (F_SEAL_GROW |
-				   F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
+	if (unlikely(info->seals & (F_SEAL_GROW | F_SEAL_WRITE |
+				    F_SEAL_FUTURE_WRITE | F_SEAL_GUEST))) {
 		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
 			return -EPERM;
 		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
 			return -EPERM;
+		if (info->seals & F_SEAL_GUEST)
+			return -EPERM;
 	}
 
 	ret = shmem_getpage(inode, index, pagep, SGP_WRITE);
@@ -2546,6 +2577,20 @@  static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		end_index = i_size >> PAGE_SHIFT;
 		if (index > end_index)
 			break;
+
+		/*
+		 * inode_lock protects setting up seals as well as write to
+		 * i_size. Setting F_SEAL_GUEST only allowed with i_size == 0.
+		 *
+		 * Check F_SEAL_GUEST after i_size. It effectively serialize
+		 * read vs. setting F_SEAL_GUEST without taking inode_lock in
+		 * read path.
+		 */
+		if (SHMEM_I(inode)->seals & F_SEAL_GUEST) {
+			error = -EPERM;
+			break;
+		}
+
 		if (index == end_index) {
 			nr = i_size & ~PAGE_MASK;
 			if (nr <= offset)
@@ -2677,6 +2722,12 @@  static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 			goto out;
 		}
 
+		if ((info->seals & F_SEAL_GUEST) &&
+		    (offset & ~PAGE_MASK || len & ~PAGE_MASK)) {
+			error = -EINVAL;
+			goto out;
+		}
+
 		shmem_falloc.waitq = &shmem_falloc_waitq;
 		shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
 		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
@@ -2796,6 +2847,8 @@  static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
 		i_size_write(inode, offset + len);
 	inode->i_ctime = current_time(inode);
+	if (info->guest_ops && info->guest_ops->fallocate)
+		info->guest_ops->fallocate(inode, info->guest_owner, start, end);
 undone:
 	spin_lock(&inode->i_lock);
 	inode->i_private = NULL;
@@ -3800,6 +3853,20 @@  static int shmem_error_remove_page(struct address_space *mapping,
 	return 0;
 }
 
+#ifdef CONFIG_MIGRATION
+int shmem_migrate_page(struct address_space *mapping,
+		struct page *newpage, struct page *page,
+		enum migrate_mode mode)
+{
+	struct inode *inode = mapping->host;
+	struct shmem_inode_info *info = SHMEM_I(inode);
+
+	if (info->seals & F_SEAL_GUEST)
+		return -ENOTSUPP;
+	return migrate_page(mapping, newpage, page, mode);
+}
+#endif
+
 const struct address_space_operations shmem_aops = {
 	.writepage	= shmem_writepage,
 	.set_page_dirty	= __set_page_dirty_no_writeback,
@@ -3808,12 +3875,62 @@  const struct address_space_operations shmem_aops = {
 	.write_end	= shmem_write_end,
 #endif
 #ifdef CONFIG_MIGRATION
-	.migratepage	= migrate_page,
+	.migratepage	= shmem_migrate_page,
 #endif
 	.error_remove_page = shmem_error_remove_page,
 };
 EXPORT_SYMBOL(shmem_aops);
 
+static unsigned long shmem_get_lock_pfn(struct inode *inode, pgoff_t offset,
+					bool alloc, int *order)
+{
+	struct page *page;
+	int ret;
+	enum sgp_type sgp = alloc ? SGP_WRITE : SGP_READ;
+
+	ret = shmem_getpage(inode, offset, &page, sgp);
+	if (ret)
+		return ret;
+
+	*order = thp_order(compound_head(page));
+
+	return page_to_pfn(page);
+}
+
+static void shmem_put_unlock_pfn(unsigned long pfn)
+{
+	struct page *page = pfn_to_page(pfn);
+
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+	set_page_dirty(page);
+	unlock_page(page);
+	put_page(page);
+}
+
+static const struct guest_mem_ops shmem_guest_ops = {
+	.get_lock_pfn = shmem_get_lock_pfn,
+	.put_unlock_pfn = shmem_put_unlock_pfn,
+};
+
+int shmem_register_guest(struct inode *inode, void *owner,
+			 const struct guest_ops *guest_ops,
+			 const struct guest_mem_ops **guest_mem_ops)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+
+	if (!owner)
+		return -EINVAL;
+
+	if (info->guest_owner && info->guest_owner != owner)
+		return -EPERM;
+
+	info->guest_owner = owner;
+	info->guest_ops = guest_ops;
+	*guest_mem_ops = &shmem_guest_ops;
+	return 0;
+}
+
 static const struct file_operations shmem_file_operations = {
 	.mmap		= shmem_mmap,
 	.get_unmapped_area = shmem_get_unmapped_area,