
[RFC,4/4] mm: guest_memfd: Add ability for mmap'ing pages

Message ID 20240805-guest-memfd-lib-v1-4-e5a29a4ff5d7@quicinc.com (mailing list archive)
State New
Series mm: Introduce guest_memfd library

Commit Message

Elliot Berman Aug. 5, 2024, 6:34 p.m. UTC
Confidential/protected guest virtual machines want to share some memory
back with the host Linux. For example, virtqueues allow host and
protected guest to exchange data. In MMU-only isolation of protected
guest virtual machines, the transition between "shared" and "private"
can be done in-place without a trusted hypervisor copying pages.

Add support for this feature and allow Linux to mmap host-accessible
pages. When the owner provides an ->accessible() callback in
struct guest_memfd_operations, guest_memfd allows folios to be mapped
whenever that callback returns 0.
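
For illustration, a trivial ->accessible() implementation could look
like this (a sketch only; my_vm_from_inode() and my_vm_is_shared() are
hypothetical owner helpers, not part of this series):

```
static int my_gmem_accessible(struct inode *inode, struct folio *folio,
			      pgoff_t offset, unsigned long nr)
{
	/* Hypothetical lookup of the owner's per-VM state. */
	struct my_vm *vm = my_vm_from_inode(inode);
	unsigned long i;

	for (i = 0; i < nr; i++) {
		/* Only pages the guest shared back may be host-mapped. */
		if (!my_vm_is_shared(vm, folio->index + offset + i))
			return -EPERM;
	}

	return 0;
}
```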

To safely make inaccessible:

```
folio = guest_memfd_grab_folio(file, index, flags);
r = guest_memfd_make_inaccessible(file, folio);
if (r)
        goto err;

hypervisor_does_guest_mapping(folio);

folio_unlock(folio);
```

hypervisor_does_guest_mapping(folio) (i.e., the guest's stage-2 mapping)
should make it so that ops->accessible(...) on those folios fails.

The folio lock ensures atomicity: it serializes transitions between the
accessible and inaccessible states against concurrent faults.

Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
---
 include/linux/guest_memfd.h |  7 ++++
 mm/guest_memfd.c            | 81 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 87 insertions(+), 1 deletion(-)

Comments

David Hildenbrand Aug. 6, 2024, 1:51 p.m. UTC | #1
>   
> -	if (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
> +	if (!ops->accessible && (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)) {
>   		r = guest_memfd_folio_private(folio);
>   		if (r)
>   			goto out_err;
> @@ -107,6 +109,82 @@ struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags
>   }
>   EXPORT_SYMBOL_GPL(guest_memfd_grab_folio);
>   
> +int guest_memfd_make_inaccessible(struct file *file, struct folio *folio)
> +{
> +	unsigned long gmem_flags = (unsigned long)file->private_data;
> +	unsigned long i;
> +	int r;
> +
> +	unmap_mapping_folio(folio);
> +
> +	/**
> +	 * We can't use the refcount. It might be elevated due to
> +	 * guest/vcpu trying to access same folio as another vcpu
> +	 * or because userspace is trying to access folio for same reason

As discussed, that's insufficient. We really have to drive the refcount 
to 1 -- the single reference we expect.

What is the exact problem you are running into here? Who can just grab a 
reference and maybe do nasty things with it?
Patrick Roy Aug. 6, 2024, 3:48 p.m. UTC | #2
On Mon, 2024-08-05 at 19:34 +0100, Elliot Berman wrote:
> Confidential/protected guest virtual machines want to share some memory
> back with the host Linux. For example, virtqueues allow host and
> protected guest to exchange data. In MMU-only isolation of protected
> guest virtual machines, the transition between "shared" and "private"
> can be done in-place without a trusted hypervisor copying pages.
> 
> Add support for this feature and allow Linux to mmap host-accessible
> pages. When the owner provides an ->accessible() callback in the
> struct guest_memfd_operations, guest_memfd allows folios to be mapped
> when the ->accessible() callback returns 0.

Wouldn't the set of inaccessible folios always match exactly the set of
folios that have PG_private=1 set? At least for guest_memfd instances
that have GUEST_MEMFD_FLAG_NO_DIRECT_MAP set, having folios without
direct map entries marked "accessible" sounds like it may cause a lot of
mayhem (as those folios would essentially be secretmem folios, but this
time without the GUP checks). But even more generally, wouldn't tracking
accessibility via PG_private be enough?
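
If accessibility were keyed off PG_private, the callback could reduce to
something like this (a sketch, assuming PG_private is kept in sync on
every shared<->private conversion):

```
static int pg_private_accessible(struct inode *inode, struct folio *folio,
				 pgoff_t offset, unsigned long nr)
{
	/* PG_private set == guest-private, i.e. not host-accessible. */
	return folio_test_private(folio) ? -EPERM : 0;
}
```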

> To safely make inaccessible:
> 
> ```
> folio = guest_memfd_grab_folio(inode, index, flags);
> r = guest_memfd_make_inaccessible(inode, folio);
> if (r)
>         goto err;
> 
> hypervisor_does_guest_mapping(folio);
> 
> folio_unlock(folio);
> ```
> 
> hypervisor_does_s2_mapping(folio) should make it so
> ops->accessible(...) on those folios fails.
> 
> The folio lock ensures atomicity.
> 
> Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
> ---
>  include/linux/guest_memfd.h |  7 ++++
>  mm/guest_memfd.c            | 81 ++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 87 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/guest_memfd.h b/include/linux/guest_memfd.h
> index f9e4a27aed67..edcb4ba60cb0 100644
> --- a/include/linux/guest_memfd.h
> +++ b/include/linux/guest_memfd.h
> @@ -16,12 +16,18 @@
>   * @invalidate_end: called after invalidate_begin returns success. Optional.
>   * @prepare: called before a folio is mapped into the guest address space.
>   *           Optional.
> + * @accessible: called after prepare returns success and before it's mapped
> + *              into the guest address space. Returns 0 if the folio can be
> + *              accessed.
> + *              Optional. If not present, assumes folios are never accessible.
>   * @release: Called when releasing the guest_memfd file. Required.
>   */
>  struct guest_memfd_operations {
>         int (*invalidate_begin)(struct inode *inode, pgoff_t offset, unsigned long nr);
>         void (*invalidate_end)(struct inode *inode, pgoff_t offset, unsigned long nr);
>         int (*prepare)(struct inode *inode, pgoff_t offset, struct folio *folio);
> +       int (*accessible)(struct inode *inode, struct folio *folio,
> +                         pgoff_t offset, unsigned long nr);
>         int (*release)(struct inode *inode);
>  };
> 
> @@ -48,5 +54,6 @@ struct file *guest_memfd_alloc(const char *name,
>                                const struct guest_memfd_operations *ops,
>                                loff_t size, unsigned long flags);
>  bool is_guest_memfd(struct file *file, const struct guest_memfd_operations *ops);
> +int guest_memfd_make_inaccessible(struct file *file, struct folio *folio);
> 
>  #endif
> diff --git a/mm/guest_memfd.c b/mm/guest_memfd.c
> index e9d8cab72b28..6b5609932ca5 100644
> --- a/mm/guest_memfd.c
> +++ b/mm/guest_memfd.c
> @@ -9,6 +9,8 @@
>  #include <linux/pagemap.h>
>  #include <linux/set_memory.h>
> 
> +#include "internal.h"
> +
>  static inline int guest_memfd_folio_private(struct folio *folio)
>  {
>         unsigned long nr_pages = folio_nr_pages(folio);
> @@ -89,7 +91,7 @@ struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags
>                         goto out_err;
>         }
> 
> -       if (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
> +       if (!ops->accessible && (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)) {
>                 r = guest_memfd_folio_private(folio);
>                 if (r)
>                         goto out_err;
> @@ -107,6 +109,82 @@ struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags
>  }
>  EXPORT_SYMBOL_GPL(guest_memfd_grab_folio);
> 
> +int guest_memfd_make_inaccessible(struct file *file, struct folio *folio)
> +{
> +       unsigned long gmem_flags = (unsigned long)file->private_data;
> +       unsigned long i;
> +       int r;
> +
> +       unmap_mapping_folio(folio);
> +
> +       /**
> +        * We can't use the refcount. It might be elevated due to
> +        * guest/vcpu trying to access same folio as another vcpu
> +        * or because userspace is trying to access folio for same reason
> +        *
> +        * folio_lock serializes the transitions between (in)accessible
> +        */
> +       if (folio_maybe_dma_pinned(folio))
> +               return -EBUSY;
> +
> +       if (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
> +               r = guest_memfd_folio_private(folio);
> +               if (r)
> +                       return r;
> +       }
> +
> +       return 0;
> +}
> +
> +static vm_fault_t gmem_fault(struct vm_fault *vmf)
> +{
> +       struct file *file = vmf->vma->vm_file;
> +       struct inode *inode = file_inode(file);
> +       const struct guest_memfd_operations *ops = inode->i_private;
> +       struct folio *folio;
> +       pgoff_t off;
> +       int r;
> +
> +       folio = guest_memfd_grab_folio(file, vmf->pgoff, GUEST_MEMFD_GRAB_UPTODATE);
> +       if (!folio)
> +               return VM_FAULT_SIGBUS;
> +
> +       off = vmf->pgoff & (folio_nr_pages(folio) - 1);
> +       r = ops->accessible(inode, folio, off, 1);
> +       if (r) {

This made me stumble at first. I know you say ops->accessible returning
0 means "this is accessible", but if I only look at this if-statement it
reads as "if the folio is accessible, send a SIGBUS", which is not
what's actually happening.
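
Maybe folding the call into the condition with a comment would read more
naturally, e.g. (a sketch of the same logic, not a functional change):

```
	/* ops->accessible() returns 0 iff the host may access this folio. */
	if (ops->accessible(inode, folio, off, 1)) {
		folio_unlock(folio);
		folio_put(folio);
		return VM_FAULT_SIGBUS;
	}
```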

> +               folio_unlock(folio);
> +               folio_put(folio);
> +               return VM_FAULT_SIGBUS;
> +       }
> +
> +       guest_memfd_folio_clear_private(folio);
> +
> +       vmf->page = folio_page(folio, off);
> +
> +       return VM_FAULT_LOCKED;
> +}
> +
> +static const struct vm_operations_struct gmem_vm_ops = {
> +       .fault = gmem_fault,
> +};
> +
> +static int gmem_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +       const struct guest_memfd_operations *ops = file_inode(file)->i_private;
> +
> +       if (!ops->accessible)
> +               return -EPERM;
> +
> +       /* No support for private mappings to avoid COW.  */
> +       if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
> +           (VM_SHARED | VM_MAYSHARE))
> +               return -EINVAL;
> +
> +       file_accessed(file);
> +       vma->vm_ops = &gmem_vm_ops;
> +       return 0;
> +}
> +
>  static long gmem_punch_hole(struct file *file, loff_t offset, loff_t len)
>  {
>         struct inode *inode = file_inode(file);
> @@ -220,6 +298,7 @@ static int gmem_release(struct inode *inode, struct file *file)
>  static struct file_operations gmem_fops = {
>         .open = generic_file_open,
>         .llseek = generic_file_llseek,
> +       .mmap = gmem_mmap,
>         .release = gmem_release,
>         .fallocate = gmem_fallocate,
>         .owner = THIS_MODULE,
> 
> --
> 2.34.1
>
Elliot Berman Aug. 8, 2024, 9:41 p.m. UTC | #3
On Wed, Aug 07, 2024 at 06:12:00PM +0200, David Hildenbrand wrote:
> On 06.08.24 19:14, Elliot Berman wrote:
> > On Tue, Aug 06, 2024 at 03:51:22PM +0200, David Hildenbrand wrote:
> > > > -	if (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
> > > > +	if (!ops->accessible && (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)) {
> > > >    		r = guest_memfd_folio_private(folio);
> > > >    		if (r)
> > > >    			goto out_err;
> > > > @@ -107,6 +109,82 @@ struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags
> > > >    }
> > > >    EXPORT_SYMBOL_GPL(guest_memfd_grab_folio);
> > > > +int guest_memfd_make_inaccessible(struct file *file, struct folio *folio)
> > > > +{
> > > > +	unsigned long gmem_flags = (unsigned long)file->private_data;
> > > > +	unsigned long i;
> > > > +	int r;
> > > > +
> > > > +	unmap_mapping_folio(folio);
> > > > +
> > > > +	/**
> > > > +	 * We can't use the refcount. It might be elevated due to
> > > > +	 * guest/vcpu trying to access same folio as another vcpu
> > > > +	 * or because userspace is trying to access folio for same reason
> > > 
> > > As discussed, that's insufficient. We really have to drive the refcount to 1
> > > -- the single reference we expect.
> > > 
> > > What is the exact problem you are running into here? Who can just grab a
> > > reference and maybe do nasty things with it?
> > > 
> > 
> > Right, I remember we had discussed it. The problem I faced was if 2
> > vcpus fault on same page, they would race to look up the folio in
> > filemap, increment refcount, then try to lock the folio. One of the
> > vcpus wins the lock, while the other waits. The vcpu that gets the
> > lock will see the elevated refcount.
> > 
> > I was in the middle of writing an explanation of why I think this is the best
> > approach and realized I think it should be possible to do
> > shared->private conversion and actually have single reference. There
> > would be some cost to walk through the allocated folios and convert them
> > to private before any vcpu runs. The approach I had gone with was to
> > do conversions as late as possible.
> 
> We certainly have to support conversion while the VCPUs are running.
> 
> The VCPUs might be able to avoid grabbing a folio reference for the
> conversion and only do the folio_lock(): as long as we have a guarantee that
> we will disallow freeing the folio in gmem, for example, by syncing against
> FALLOC_FL_PUNCH_HOLE.
> 
> So if we can rely on the "gmem" reference to the folio that cannot go away
> while we do what we do, we should be fine.
> 
> <random though>
> 
> Meanwhile, I was thinking if we would want to track the references we
> hand out to "safe" users differently.
> 
> Safe references would only be references that would survive a
> private<->shared conversion, like KVM MMU mappings maybe?
> 
> KVM would then have to be thought to return these gmem references
> differently.
> 
> The idea would be to track these "safe" references differently
> (page->private?) and only allow dropping *our* guest_memfd reference if all
> these "safe" references are gone. That is, FALLOC_FL_PUNCH_HOLE would also
> fail if there are any "safe" reference remaining.
> 
> <\random though>
> 

I didn't find a path in filemap where we can grab a folio without
increasing its refcount. I liked the idea of keeping track of a "safe"
refcount, but I believe there is a small window for a race when comparing
the main folio refcount and the "safe" refcount. A vcpu could have
incremented the main folio refcount and be on the way to incrementing the
safe refcount. Before that happens, another thread does the comparison
and sees a mismatch.

Thanks,
Elliot
Elliot Berman Aug. 8, 2024, 9:42 p.m. UTC | #4
On Thu, Aug 08, 2024 at 06:51:14PM +0000, Ackerley Tng wrote:
> Elliot Berman <quic_eberman@quicinc.com> writes:
> 
> > Confidential/protected guest virtual machines want to share some memory
> > back with the host Linux. For example, virtqueues allow host and
> > protected guest to exchange data. In MMU-only isolation of protected
> > guest virtual machines, the transition between "shared" and "private"
> > can be done in-place without a trusted hypervisor copying pages.
> >
> > Add support for this feature and allow Linux to mmap host-accessible
> > pages. When the owner provides an ->accessible() callback in the
> > struct guest_memfd_operations, guest_memfd allows folios to be mapped
> > when the ->accessible() callback returns 0.
> >
> > To safely make inaccessible:
> >
> > ```
> > folio = guest_memfd_grab_folio(inode, index, flags);
> > r = guest_memfd_make_inaccessible(inode, folio);
> > if (r)
> >         goto err;
> >
> > hypervisor_does_guest_mapping(folio);
> >
> > folio_unlock(folio);
> > ```
> >
> > hypervisor_does_s2_mapping(folio) should make it so
> > ops->accessible(...) on those folios fails.
> >
> > The folio lock ensures atomicity.
> 
> I am also working on determining faultability not based on the
> private-ness of the page but based on permission given by the
> guest. I'd like to learn from what you've discovered here.
> 
> Could you please elaborate on this? What races is the folio_lock
> intended to prevent, what operations are we ensuring atomicity of?

The contention I've been paying the most attention to is between racing
userspace and vcpu faults where the guest needs the page to be private.
There could also be multiple vcpus demanding the same page.

We had some chatter about doing the private->shared conversion via a
separate ioctl (mem attributes). I think the same race can happen with
userspace whether it's a vcpu fault or an ioctl making the folio "finally
guest-private".

Also, in non-CoCo KVM private guest_memfd, KVM or userspace could also
convert private->shared and need to make sure that all the tracking for
the current state is consistent.

> Is this why you did a guest_memfd_grab_folio() before checking
> ->accessible(), and then doing folio_unlock() if the page is
> inaccessible?
> 

Right, I want to guard against userspace being able to fault in a page
concurrently with that same page doing a shared->private conversion. The
folio_lock seems like the best fine-grained lock to grab.

If the shared->private converter wins the folio_lock first, then the
userspace fault waits and will then see ->accessible() fail, as desired.

If userspace fault wins the folio_lock first, it relinquishes the lock
only after installing the folio in page tables[*]. When the
shared->private converter finally gets the lock,
guest_memfd_make_inaccessible() will be able to unmap the folio from any
userspace page tables (and direct map, if applicable).

[*]: I'm not mm expert, but that was what I could find when I went
digging.

Thanks,
Elliot

> >
> > Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
> > ---
> >  include/linux/guest_memfd.h |  7 ++++
> >  mm/guest_memfd.c            | 81 ++++++++++++++++++++++++++++++++++++++++++++-
> >  2 files changed, 87 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/guest_memfd.h b/include/linux/guest_memfd.h
> > index f9e4a27aed67..edcb4ba60cb0 100644
> > --- a/include/linux/guest_memfd.h
> > +++ b/include/linux/guest_memfd.h
> > @@ -16,12 +16,18 @@
> >   * @invalidate_end: called after invalidate_begin returns success. Optional.
> >   * @prepare: called before a folio is mapped into the guest address space.
> >   *           Optional.
> > + * @accessible: called after prepare returns success and before it's mapped
> > + *              into the guest address space. Returns 0 if the folio can be
> > + *              accessed.
> > + *              Optional. If not present, assumes folios are never accessible.
> >   * @release: Called when releasing the guest_memfd file. Required.
> >   */
> >  struct guest_memfd_operations {
> >  	int (*invalidate_begin)(struct inode *inode, pgoff_t offset, unsigned long nr);
> >  	void (*invalidate_end)(struct inode *inode, pgoff_t offset, unsigned long nr);
> >  	int (*prepare)(struct inode *inode, pgoff_t offset, struct folio *folio);
> > +	int (*accessible)(struct inode *inode, struct folio *folio,
> > +			  pgoff_t offset, unsigned long nr);
> >  	int (*release)(struct inode *inode);
> >  };
> >  
> > @@ -48,5 +54,6 @@ struct file *guest_memfd_alloc(const char *name,
> >  			       const struct guest_memfd_operations *ops,
> >  			       loff_t size, unsigned long flags);
> >  bool is_guest_memfd(struct file *file, const struct guest_memfd_operations *ops);
> > +int guest_memfd_make_inaccessible(struct file *file, struct folio *folio);
> >  
> >  #endif
> > diff --git a/mm/guest_memfd.c b/mm/guest_memfd.c
> > index e9d8cab72b28..6b5609932ca5 100644
> > --- a/mm/guest_memfd.c
> > +++ b/mm/guest_memfd.c
> > @@ -9,6 +9,8 @@
> >  #include <linux/pagemap.h>
> >  #include <linux/set_memory.h>
> >  
> > +#include "internal.h"
> > +
> >  static inline int guest_memfd_folio_private(struct folio *folio)
> >  {
> >  	unsigned long nr_pages = folio_nr_pages(folio);
> > @@ -89,7 +91,7 @@ struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags
> >  			goto out_err;
> >  	}
> >  
> > -	if (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
> > +	if (!ops->accessible && (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)) {
> >  		r = guest_memfd_folio_private(folio);
> >  		if (r)
> >  			goto out_err;
> > @@ -107,6 +109,82 @@ struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags
> >  }
> >  EXPORT_SYMBOL_GPL(guest_memfd_grab_folio);
> >  
> > +int guest_memfd_make_inaccessible(struct file *file, struct folio *folio)
> > +{
> > +	unsigned long gmem_flags = (unsigned long)file->private_data;
> > +	unsigned long i;
> > +	int r;
> > +
> > +	unmap_mapping_folio(folio);
> > +
> > +	/**
> > +	 * We can't use the refcount. It might be elevated due to
> > +	 * guest/vcpu trying to access same folio as another vcpu
> > +	 * or because userspace is trying to access folio for same reason
> > +	 *
> > +	 * folio_lock serializes the transitions between (in)accessible
> > +	 */
> > +	if (folio_maybe_dma_pinned(folio))
> > +		return -EBUSY;
> > +
> > +	if (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
> > +		r = guest_memfd_folio_private(folio);
> > +		if (r)
> > +			return r;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static vm_fault_t gmem_fault(struct vm_fault *vmf)
> > +{
> > +	struct file *file = vmf->vma->vm_file;
> > +	struct inode *inode = file_inode(file);
> > +	const struct guest_memfd_operations *ops = inode->i_private;
> > +	struct folio *folio;
> > +	pgoff_t off;
> > +	int r;
> > +
> > +	folio = guest_memfd_grab_folio(file, vmf->pgoff, GUEST_MEMFD_GRAB_UPTODATE);
> 
> Could grabbing the folio with GUEST_MEMFD_GRAB_UPTODATE cause unintended
> zeroing of the page if the page turns out to be inaccessible?
> 

I assume that if the page is inaccessible, it would already have been
marked up to date and we wouldn't try to zero it.

I'm thinking that if hypervisor zeroes the page when making the page
private, it would not give the GUEST_MEMFD_GRAB_UPTODATE flag when
grabbing the folio. I believe the hypervisor should know when grabbing
the folio if it's about to donate to the guest.
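
Roughly, the flag usage I have in mind (a sketch of the assumed flow,
not code from this series):

```
static struct folio *grab_for_guest_donation(struct file *file, pgoff_t index)
{
	/* The hypervisor scrubs the page during donation: skip zeroing. */
	return guest_memfd_grab_folio(file, index, 0);
}

static struct folio *grab_for_host_access(struct file *file, pgoff_t index)
{
	/* Zero freshly allocated pages before the host can read them. */
	return guest_memfd_grab_folio(file, index, GUEST_MEMFD_GRAB_UPTODATE);
}
```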

Thanks,
Elliot
David Hildenbrand Aug. 8, 2024, 9:55 p.m. UTC | #5
On 08.08.24 23:41, Elliot Berman wrote:
> On Wed, Aug 07, 2024 at 06:12:00PM +0200, David Hildenbrand wrote:
>> On 06.08.24 19:14, Elliot Berman wrote:
>>> On Tue, Aug 06, 2024 at 03:51:22PM +0200, David Hildenbrand wrote:
>>>>> -	if (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
>>>>> +	if (!ops->accessible && (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)) {
>>>>>     		r = guest_memfd_folio_private(folio);
>>>>>     		if (r)
>>>>>     			goto out_err;
>>>>> @@ -107,6 +109,82 @@ struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags
>>>>>     }
>>>>>     EXPORT_SYMBOL_GPL(guest_memfd_grab_folio);
>>>>> +int guest_memfd_make_inaccessible(struct file *file, struct folio *folio)
>>>>> +{
>>>>> +	unsigned long gmem_flags = (unsigned long)file->private_data;
>>>>> +	unsigned long i;
>>>>> +	int r;
>>>>> +
>>>>> +	unmap_mapping_folio(folio);
>>>>> +
>>>>> +	/**
>>>>> +	 * We can't use the refcount. It might be elevated due to
>>>>> +	 * guest/vcpu trying to access same folio as another vcpu
>>>>> +	 * or because userspace is trying to access folio for same reason
>>>>
>>>> As discussed, that's insufficient. We really have to drive the refcount to 1
>>>> -- the single reference we expect.
>>>>
>>>> What is the exact problem you are running into here? Who can just grab a
>>>> reference and maybe do nasty things with it?
>>>>
>>>
>>> Right, I remember we had discussed it. The problem I faced was if 2
>>> vcpus fault on same page, they would race to look up the folio in
>>> filemap, increment refcount, then try to lock the folio. One of the
>>> vcpus wins the lock, while the other waits. The vcpu that gets the
>>> lock vcpu will see the elevated refcount.
>>>
>>> I was in middle of writing an explanation why I think this is best
>>> approach and realized I think it should be possible to do
>>> shared->private conversion and actually have single reference. There
>>> would be some cost to walk through the allocated folios and convert them
>>> to private before any vcpu runs. The approach I had gone with was to
>>> do conversions as late as possible.
>>
>> We certainly have to support conversion while the VCPUs are running.
>>
>> The VCPUs might be able to avoid grabbing a folio reference for the
>> conversion and only do the folio_lock(): as long as we have a guarantee that
>> we will disallow freeing the folio in gmem, for example, by syncing against
>> FALLOC_FL_PUNCH_HOLE.
>>
>> So if we can rely on the "gmem" reference to the folio that cannot go away
>> while we do what we do, we should be fine.
>>
>> <random though>
>>
>> Meanwhile, I was thinking if we would want to track the references we
>> hand out to "safe" users differently.
>>
>> Safe references would only be references that would survive a
>> private<->shared conversion, like KVM MMU mappings maybe?
>>
>> KVM would then have to be thought to return these gmem references
>> differently.
>>
>> The idea would be to track these "safe" references differently
>> (page->private?) and only allow dropping *our* guest_memfd reference if all
>> these "safe" references are gone. That is, FALLOC_FL_PUNCH_HOLE would also
>> fail if there are any "safe" reference remaining.
>>
>> <\random though>
>>
> 
> I didn't find a path in filemap where we can grab folio without
> increasing its refcount. I liked the idea of keeping track of a "safe"
> refcount, but I believe there is a small window to race comparing the
> main folio refcount and the "safe" refcount.

There are various possible models. To detect unexpected references, we 
could either use

folio_ref_count(folio) == gmem_folio_safe_ref_count(folio) + 1

[we increment both ref counter]

or

folio_ref_count(folio) == 1

[we only increment the safe refcount and let other magic handle it as 
described]

A vcpu could have
> incremented the main folio refcount and on the way to increment the safe
> refcount. Before that happens, another thread does the comparison and
> sees a mismatch.

Likely there won't be a way around coming up with code that is able to 
deal with such temporary, "speculative" folio references.

In the simplest case, these references will be obtained from our gmem 
code only, and we'll have to detect that it happened and retry (a 
seqcount would be a naive solution).

In the complex case, these references are temporarily obtained from 
other core-mm code -- using folio_try_get(). We can minimize some of 
them (speculative references from GUP or the pagecache), and try 
optimizing others (PFN walkers like page migration).

But likely we'll need some retry magic, at least initially.
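
To illustrate the naive seqcount variant (a sketch; struct my_gmem and
its spec_ref_seq are hypothetical, and every gmem-internal speculative
reference would have to be bracketed by write_seqcount_begin()/
write_seqcount_end()):

```
struct my_gmem {
	/* Hypothetical: guards gmem-internal speculative folio references. */
	seqcount_t spec_ref_seq;
};

static int gmem_expect_sole_ref(struct my_gmem *gmem, struct folio *folio)
{
	unsigned int seq;
	bool busy;

	do {
		/* Waits until no speculative reference is in flight... */
		seq = read_seqcount_begin(&gmem->spec_ref_seq);
		busy = folio_ref_count(folio) != 1;
		/* ...and re-checks if one was taken while we looked. */
	} while (read_seqcount_retry(&gmem->spec_ref_seq, seq));

	/* Any extra reference seen now is a real, external one. */
	return busy ? -EBUSY : 0;
}
```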
Elliot Berman Aug. 8, 2024, 10:26 p.m. UTC | #6
On Thu, Aug 08, 2024 at 11:55:15PM +0200, David Hildenbrand wrote:
> On 08.08.24 23:41, Elliot Berman wrote:
> > On Wed, Aug 07, 2024 at 06:12:00PM +0200, David Hildenbrand wrote:
> > > On 06.08.24 19:14, Elliot Berman wrote:
> > > > On Tue, Aug 06, 2024 at 03:51:22PM +0200, David Hildenbrand wrote:
> > > > > > -	if (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
> > > > > > +	if (!ops->accessible && (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)) {
> > > > > >     		r = guest_memfd_folio_private(folio);
> > > > > >     		if (r)
> > > > > >     			goto out_err;
> > > > > > @@ -107,6 +109,82 @@ struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags
> > > > > >     }
> > > > > >     EXPORT_SYMBOL_GPL(guest_memfd_grab_folio);
> > > > > > +int guest_memfd_make_inaccessible(struct file *file, struct folio *folio)
> > > > > > +{
> > > > > > +	unsigned long gmem_flags = (unsigned long)file->private_data;
> > > > > > +	unsigned long i;
> > > > > > +	int r;
> > > > > > +
> > > > > > +	unmap_mapping_folio(folio);
> > > > > > +
> > > > > > +	/**
> > > > > > +	 * We can't use the refcount. It might be elevated due to
> > > > > > +	 * guest/vcpu trying to access same folio as another vcpu
> > > > > > +	 * or because userspace is trying to access folio for same reason
> > > > > 
> > > > > As discussed, that's insufficient. We really have to drive the refcount to 1
> > > > > -- the single reference we expect.
> > > > > 
> > > > > What is the exact problem you are running into here? Who can just grab a
> > > > > reference and maybe do nasty things with it?
> > > > > 
> > > > 
> > > > Right, I remember we had discussed it. The problem I faced was if 2
> > > > vcpus fault on same page, they would race to look up the folio in
> > > > filemap, increment refcount, then try to lock the folio. One of the
> > > > vcpus wins the lock, while the other waits. The vcpu that gets the
> > > > lock vcpu will see the elevated refcount.
> > > > 
> > > > I was in middle of writing an explanation why I think this is best
> > > > approach and realized I think it should be possible to do
> > > > shared->private conversion and actually have single reference. There
> > > > would be some cost to walk through the allocated folios and convert them
> > > > to private before any vcpu runs. The approach I had gone with was to
> > > > do conversions as late as possible.
> > > 
> > > We certainly have to support conversion while the VCPUs are running.
> > > 
> > > The VCPUs might be able to avoid grabbing a folio reference for the
> > > conversion and only do the folio_lock(): as long as we have a guarantee that
> > > we will disallow freeing the folio in gmem, for example, by syncing against
> > > FALLOC_FL_PUNCH_HOLE.
> > > 
> > > So if we can rely on the "gmem" reference to the folio that cannot go away
> > > while we do what we do, we should be fine.
> > > 
> > > <random though>
> > > 
> > > Meanwhile, I was thinking if we would want to track the references we
> > > hand out to "safe" users differently.
> > > 
> > > Safe references would only be references that would survive a
> > > private<->shared conversion, like KVM MMU mappings maybe?
> > > 
> > > KVM would then have to be thought to return these gmem references
> > > differently.
> > > 
> > > The idea would be to track these "safe" references differently
> > > (page->private?) and only allow dropping *our* guest_memfd reference if all
> > > these "safe" references are gone. That is, FALLOC_FL_PUNCH_HOLE would also
> > > fail if there are any "safe" reference remaining.
> > > 
> > > <\random though>
> > > 
> > 
> > I didn't find a path in filemap where we can grab folio without
> > increasing its refcount. I liked the idea of keeping track of a "safe"
> > refcount, but I believe there is a small window to race comparing the
> > main folio refcount and the "safe" refcount.
> 
> There are various possible models. To detect unexpected references, we could
> either use
> 
> folio_ref_count(folio) == gmem_folio_safe_ref_count(folio) + 1
> 
> [we increment both ref counter]
> 
> or
> 
> folio_ref_count(folio) == 1
> 
> [we only increment the safe refcount and let other magic handle it as
> described]
> 
> A vcpu could have
> > incremented the main folio refcount and on the way to increment the safe
> > refcount. Before that happens, another thread does the comparison and
> > sees a mismatch.
> 
> Likely there won't be a way around coming up with code that is able to deal
> with such temporary, "speculative" folio references.
> 
> In the simplest case, these references will be obtained from our gmem code
> only, and we'll have to detect that it happened and retry (a seqcount would
> be a naive solution).
> 
> In the complex case, these references are temporarily obtained from other
> core-mm code -- using folio_try_get(). We can minimize some of them
> (speculative references from GUP or the pagecache), and try optimizing
> others (PFN walkers like page migration).
> 
> But likely we'll need some retry magic, at least initially.
> 

I thought retry magic would not fly. I'll try this out.

Thanks,
Elliot
David Hildenbrand Aug. 9, 2024, 7:16 a.m. UTC | #7
On 09.08.24 00:26, Elliot Berman wrote:
> On Thu, Aug 08, 2024 at 11:55:15PM +0200, David Hildenbrand wrote:
>> On 08.08.24 23:41, Elliot Berman wrote:
>>> On Wed, Aug 07, 2024 at 06:12:00PM +0200, David Hildenbrand wrote:
>>>> On 06.08.24 19:14, Elliot Berman wrote:
>>>>> On Tue, Aug 06, 2024 at 03:51:22PM +0200, David Hildenbrand wrote:
>>>>>>> -	if (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
>>>>>>> +	if (!ops->accessible && (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)) {
>>>>>>>      		r = guest_memfd_folio_private(folio);
>>>>>>>      		if (r)
>>>>>>>      			goto out_err;
>>>>>>> @@ -107,6 +109,82 @@ struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags
>>>>>>>      }
>>>>>>>      EXPORT_SYMBOL_GPL(guest_memfd_grab_folio);
>>>>>>> +int guest_memfd_make_inaccessible(struct file *file, struct folio *folio)
>>>>>>> +{
>>>>>>> +	unsigned long gmem_flags = (unsigned long)file->private_data;
>>>>>>> +	unsigned long i;
>>>>>>> +	int r;
>>>>>>> +
>>>>>>> +	unmap_mapping_folio(folio);
>>>>>>> +
>>>>>>> +	/**
>>>>>>> +	 * We can't use the refcount. It might be elevated due to
>>>>>>> +	 * guest/vcpu trying to access same folio as another vcpu
>>>>>>> +	 * or because userspace is trying to access folio for same reason
>>>>>>
>>>>>> As discussed, that's insufficient. We really have to drive the refcount to 1
>>>>>> -- the single reference we expect.
>>>>>>
>>>>>> What is the exact problem you are running into here? Who can just grab a
>>>>>> reference and maybe do nasty things with it?
>>>>>>
>>>>>
>>>>> Right, I remember we had discussed it. The problem I faced was if 2
>>>>> vcpus fault on same page, they would race to look up the folio in
>>>>> filemap, increment refcount, then try to lock the folio. One of the
>>>>> vcpus wins the lock, while the other waits. The vcpu that gets the
>>>>> lock vcpu will see the elevated refcount.
>>>>>
>>>>> I was in middle of writing an explanation why I think this is best
>>>>> approach and realized I think it should be possible to do
>>>>> shared->private conversion and actually have single reference. There
>>>>> would be some cost to walk through the allocated folios and convert them
>>>>> to private before any vcpu runs. The approach I had gone with was to
>>>>> do conversions as late as possible.
>>>>
>>>> We certainly have to support conversion while the VCPUs are running.
>>>>
>>>> The VCPUs might be able to avoid grabbing a folio reference for the
>>>> conversion and only do the folio_lock(): as long as we have a guarantee that
>>>> we will disallow freeing the folio in gmem, for example, by syncing against
>>>> FALLOC_FL_PUNCH_HOLE.
>>>>
>>>> So if we can rely on the "gmem" reference to the folio that cannot go away
>>>> while we do what we do, we should be fine.
>>>>
>>>> <random though>
>>>>
>>>> Meanwhile, I was thinking if we would want to track the references we
>>>> hand out to "safe" users differently.
>>>>
>>>> Safe references would only be references that would survive a
>>>> private<->shared conversion, like KVM MMU mappings maybe?
>>>>
>>>> KVM would then have to be thought to return these gmem references
>>>> differently.
>>>>
>>>> The idea would be to track these "safe" references differently
>>>> (page->private?) and only allow dropping *our* guest_memfd reference if all
>>>> these "safe" references are gone. That is, FALLOC_FL_PUNCH_HOLE would also
>>>> fail if there are any "safe" reference remaining.
>>>>
>>>> <\random though>
>>>>
>>>
>>> I didn't find a path in filemap where we can grab folio without
>>> increasing its refcount. I liked the idea of keeping track of a "safe"
>>> refcount, but I believe there is a small window to race comparing the
>>> main folio refcount and the "safe" refcount.
>>
>> There are various possible models. To detect unexpected references, we could
>> either use
>>
>> folio_ref_count(folio) == gmem_folio_safe_ref_count(folio) + 1
>>
>> [we increment both ref counter]
>>
>> or
>>
>> folio_ref_count(folio) == 1
>>
>> [we only increment the safe refcount and let other magic handle it as
>> described]
>>
>> A vcpu could have
>>> incremented the main folio refcount and on the way to increment the safe
>>> refcount. Before that happens, another thread does the comparison and
>>> sees a mismatch.
>>
>> Likely there won't be a way around coming up with code that is able to deal
>> with such temporary, "speculative" folio references.
>>
>> In the simplest case, these references will be obtained from our gmem code
>> only, and we'll have to detect that it happened and retry (a seqcount would
>> be a naive solution).
>>
>> In the complex case, these references are temporarily obtained from other
>> core-mm code -- using folio_try_get(). We can minimize some of them
>> (speculative references from GUP or the pagecache), and try optimizing
>> others (PFN walkers like page migration).
>>
>> But likely we'll need some retry magic, at least initially.
>>
> 
> I thought retry magic would not fly. I'll try this out.

Any details why? At least the "other gmem code is currently taking a 
speculative reference" should be handable, these speculative references 
all happen from gmem code and it should be under our control.

We can protect against some core-mm speculative references (GUP, 
page-cache): after we allocated pages for gmem, and a RCU grace period 
passed, these can no longer happen from old context that previously had 
these pages allocated before gmem allocated them.

Other folio_try_get() users like memory offlining or page migration are 
more problematic. In general, the assumption is that they will give up 
quickly, for example when realizing that a folio is neither LRU nor 
"non-lru-movable" -- which is the case for gmem-allocated pages.

Yes, no retry magic would be very much preferred, but as soon as we want 
to map these folios to user space and have GUP work on them (IOW, we 
have to make the folio refcount usable), we cannot easily block all 
speculative references from core-mm, for example, by freezing the 
refcount at 0. Long term, we might find ways to just block these 
speculative references more efficiently / differently.
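
For reference, freezing the refcount builds on an existing idiom
(a sketch; as said, this stops working once the refcount must remain
usable for user space mappings and GUP):

```
static int gmem_transition_frozen(struct folio *folio)
{
	/*
	 * Atomically set the refcount to 0 iff it currently equals the
	 * expected count (here: the single gmem reference). Concurrent
	 * folio_try_get() callers fail until we unfreeze.
	 */
	if (!folio_ref_freeze(folio, 1))
		return -EBUSY;	/* unexpected references exist */

	/* ... transition the folio: no new references can show up ... */

	folio_ref_unfreeze(folio, 1);
	return 0;
}
```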
Fuad Tabba Aug. 15, 2024, 7:24 a.m. UTC | #8
Hi David,

On Tue, 6 Aug 2024 at 14:51, David Hildenbrand <david@redhat.com> wrote:
>
> >
> > -     if (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
> > +     if (!ops->accessible && (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)) {
> >               r = guest_memfd_folio_private(folio);
> >               if (r)
> >                       goto out_err;
> > @@ -107,6 +109,82 @@ struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags
> >   }
> >   EXPORT_SYMBOL_GPL(guest_memfd_grab_folio);
> >
> > +int guest_memfd_make_inaccessible(struct file *file, struct folio *folio)
> > +{
> > +     unsigned long gmem_flags = (unsigned long)file->private_data;
> > +     unsigned long i;
> > +     int r;
> > +
> > +     unmap_mapping_folio(folio);
> > +
> > +     /**
> > +      * We can't use the refcount. It might be elevated due to
> > +      * guest/vcpu trying to access same folio as another vcpu
> > +      * or because userspace is trying to access folio for same reason
>
> As discussed, that's insufficient. We really have to drive the refcount
> to 1 -- the single reference we expect.
>
> What is the exact problem you are running into here? Who can just grab a
> reference and maybe do nasty things with it?

I was wondering, why do we need to check the refcount? Isn't it enough
to check for page_mapped() || page_maybe_dma_pinned(), while holding
the folio lock?

Thanks!
/fuad

> --
> Cheers,
>
> David / dhildenb
>
David Hildenbrand Aug. 16, 2024, 9:48 a.m. UTC | #9
On 15.08.24 09:24, Fuad Tabba wrote:
> Hi David,

Hi!

> 
> On Tue, 6 Aug 2024 at 14:51, David Hildenbrand <david@redhat.com> wrote:
>>
>>>
>>> -     if (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
>>> +     if (!ops->accessible && (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)) {
>>>                r = guest_memfd_folio_private(folio);
>>>                if (r)
>>>                        goto out_err;
>>> @@ -107,6 +109,82 @@ struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags
>>>    }
>>>    EXPORT_SYMBOL_GPL(guest_memfd_grab_folio);
>>>
>>> +int guest_memfd_make_inaccessible(struct file *file, struct folio *folio)
>>> +{
>>> +     unsigned long gmem_flags = (unsigned long)file->private_data;
>>> +     unsigned long i;
>>> +     int r;
>>> +
>>> +     unmap_mapping_folio(folio);
>>> +
>>> +     /**
>>> +      * We can't use the refcount. It might be elevated due to
>>> +      * guest/vcpu trying to access same folio as another vcpu
>>> +      * or because userspace is trying to access folio for same reason
>>
>> As discussed, that's insufficient. We really have to drive the refcount
>> to 1 -- the single reference we expect.
>>
>> What is the exact problem you are running into here? Who can just grab a
>> reference and maybe do nasty things with it?
> 
> I was wondering, why do we need to check the refcount? Isn't it enough
> to check for page_mapped() || page_maybe_dma_pinned(), while holding
> the folio lock?

(folio_mapped() + folio_maybe_dma_pinned())

Not everything goes through FOLL_PIN. vmsplice() is an example, or just 
some very simple read/write through /proc/pid/mem. Further, some 
O_DIRECT implementations still don't use FOLL_PIN.

So if you see an additional folio reference, as soon as you mapped that 
thing to user space, you have to assume that it could be someone 
reading/writing that memory in a possibly sane context. (vmsplice() 
should be using FOLL_PIN|FOLL_LONGTERM, but that's a longer discussion)

(noting that also folio_maybe_dma_pinned() can have false positives in 
some cases due to speculative references or *many* references).
Fuad Tabba Aug. 16, 2024, 11:19 a.m. UTC | #10
On Fri, 16 Aug 2024 at 10:48, David Hildenbrand <david@redhat.com> wrote:
>
> On 15.08.24 09:24, Fuad Tabba wrote:
> > Hi David,
>
> Hi!
>
> >
> > On Tue, 6 Aug 2024 at 14:51, David Hildenbrand <david@redhat.com> wrote:
> >>
> >>>
> >>> -     if (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
> >>> +     if (!ops->accessible && (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)) {
> >>>                r = guest_memfd_folio_private(folio);
> >>>                if (r)
> >>>                        goto out_err;
> >>> @@ -107,6 +109,82 @@ struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags
> >>>    }
> >>>    EXPORT_SYMBOL_GPL(guest_memfd_grab_folio);
> >>>
> >>> +int guest_memfd_make_inaccessible(struct file *file, struct folio *folio)
> >>> +{
> >>> +     unsigned long gmem_flags = (unsigned long)file->private_data;
> >>> +     unsigned long i;
> >>> +     int r;
> >>> +
> >>> +     unmap_mapping_folio(folio);
> >>> +
> >>> +     /**
> >>> +      * We can't use the refcount. It might be elevated due to
> >>> +      * guest/vcpu trying to access same folio as another vcpu
> >>> +      * or because userspace is trying to access folio for same reason
> >>
> >> As discussed, that's insufficient. We really have to drive the refcount
> >> to 1 -- the single reference we expect.
> >>
> >> What is the exact problem you are running into here? Who can just grab a
> >> reference and maybe do nasty things with it?
> >
> > I was wondering, why do we need to check the refcount? Isn't it enough
> > to check for page_mapped() || page_maybe_dma_pinned(), while holding
> > the folio lock?
>
> (folio_mapped() + folio_maybe_dma_pinned())
>
> Not everything goes trough FOLL_PIN. vmsplice() is an example, or just
> some very simple read/write through /proc/pid/mem. Further, some
> O_DIRECT implementations still don't use FOLL_PIN.
>
> So if you see an additional folio reference, as soon as you mapped that
> thing to user space, you have to assume that it could be someone
> reading/writing that memory in possibly sane context. (vmsplice() should
> be using FOLL_PIN|FOLL_LONGTERM, but that's a longer discussion)
>
> (noting that also folio_maybe_dma_pinned() can have false positives in
> some cases due to speculative references or *many* references).

Thanks for the clarification!
/fuad

> --
> Cheers,
>
> David / dhildenb
>
Ackerley Tng Aug. 16, 2024, 5:45 p.m. UTC | #11
David Hildenbrand <david@redhat.com> writes:

> On 15.08.24 09:24, Fuad Tabba wrote:
>> Hi David,
>
> Hi!
>
>> 
>> On Tue, 6 Aug 2024 at 14:51, David Hildenbrand <david@redhat.com> wrote:
>>>
>>>>
>>>> -     if (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
>>>> +     if (!ops->accessible && (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)) {
>>>>                r = guest_memfd_folio_private(folio);
>>>>                if (r)
>>>>                        goto out_err;
>>>> @@ -107,6 +109,82 @@ struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags
>>>>    }
>>>>    EXPORT_SYMBOL_GPL(guest_memfd_grab_folio);
>>>>
>>>> +int guest_memfd_make_inaccessible(struct file *file, struct folio *folio)
>>>> +{
>>>> +     unsigned long gmem_flags = (unsigned long)file->private_data;
>>>> +     unsigned long i;
>>>> +     int r;
>>>> +
>>>> +     unmap_mapping_folio(folio);
>>>> +
>>>> +     /**
>>>> +      * We can't use the refcount. It might be elevated due to
>>>> +      * guest/vcpu trying to access same folio as another vcpu
>>>> +      * or because userspace is trying to access folio for same reason
>>>
>>> As discussed, that's insufficient. We really have to drive the refcount
>>> to 1 -- the single reference we expect.
>>>
>>> What is the exact problem you are running into here? Who can just grab a
>>> reference and maybe do nasty things with it?
>> 
>> I was wondering, why do we need to check the refcount? Isn't it enough
>> to check for page_mapped() || page_maybe_dma_pinned(), while holding
>> the folio lock?

Thank you Fuad for asking!

>
> (folio_mapped() + folio_maybe_dma_pinned())
>
> Not everything goes trough FOLL_PIN. vmsplice() is an example, or just 
> some very simple read/write through /proc/pid/mem. Further, some 
> O_DIRECT implementations still don't use FOLL_PIN.
>
> So if you see an additional folio reference, as soon as you mapped that 
> thing to user space, you have to assume that it could be someone 
> reading/writing that memory in possibly sane context. (vmsplice() should 
> be using FOLL_PIN|FOLL_LONGTERM, but that's a longer discussion)
>

Thanks David for the clarification, this example is very helpful!

IIUC folio_lock() isn't a prerequisite for taking a refcount on the
folio.

Even if we are able to figure out a "safe" refcount, and check that the
current refcount == "safe" refcount before removing from direct map,
what's stopping some other part of the kernel from taking a refcount
just after the check happens and causing trouble with the folio's
removal from direct map?

> (noting that also folio_maybe_dma_pinned() can have false positives in 
> some cases due to speculative references or *many* references).

Are false positives (speculative references) okay since it's better to
be safe than remove from direct map prematurely?

>
> -- 
> Cheers,
>
> David / dhildenb
David Hildenbrand Aug. 16, 2024, 6:08 p.m. UTC | #12
On 16.08.24 19:45, Ackerley Tng wrote:
> 
> David Hildenbrand <david@redhat.com> writes:
> 
>> On 15.08.24 09:24, Fuad Tabba wrote:
>>> Hi David,
>>
>> Hi!
>>
>>>
>>> On Tue, 6 Aug 2024 at 14:51, David Hildenbrand <david@redhat.com> wrote:
>>>>
>>>>>
>>>>> -     if (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
>>>>> +     if (!ops->accessible && (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)) {
>>>>>                 r = guest_memfd_folio_private(folio);
>>>>>                 if (r)
>>>>>                         goto out_err;
>>>>> @@ -107,6 +109,82 @@ struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags
>>>>>     }
>>>>>     EXPORT_SYMBOL_GPL(guest_memfd_grab_folio);
>>>>>
>>>>> +int guest_memfd_make_inaccessible(struct file *file, struct folio *folio)
>>>>> +{
>>>>> +     unsigned long gmem_flags = (unsigned long)file->private_data;
>>>>> +     unsigned long i;
>>>>> +     int r;
>>>>> +
>>>>> +     unmap_mapping_folio(folio);
>>>>> +
>>>>> +     /**
>>>>> +      * We can't use the refcount. It might be elevated due to
>>>>> +      * guest/vcpu trying to access same folio as another vcpu
>>>>> +      * or because userspace is trying to access folio for same reason
>>>>
>>>> As discussed, that's insufficient. We really have to drive the refcount
>>>> to 1 -- the single reference we expect.
>>>>
>>>> What is the exact problem you are running into here? Who can just grab a
>>>> reference and maybe do nasty things with it?
>>>
>>> I was wondering, why do we need to check the refcount? Isn't it enough
>>> to check for page_mapped() || page_maybe_dma_pinned(), while holding
>>> the folio lock?
> 
> Thank you Fuad for asking!
> 
>>
>> (folio_mapped() + folio_maybe_dma_pinned())
>>
>> Not everything goes trough FOLL_PIN. vmsplice() is an example, or just
>> some very simple read/write through /proc/pid/mem. Further, some
>> O_DIRECT implementations still don't use FOLL_PIN.
>>
>> So if you see an additional folio reference, as soon as you mapped that
>> thing to user space, you have to assume that it could be someone
>> reading/writing that memory in possibly sane context. (vmsplice() should
>> be using FOLL_PIN|FOLL_LONGTERM, but that's a longer discussion)
>>
> 
> Thanks David for the clarification, this example is very helpful!
> 
> IIUC folio_lock() isn't a prerequisite for taking a refcount on the
> folio.

Right, to do folio_lock() you only have to guarantee that the folio 
cannot get freed concurrently. So you piggyback on another reference 
(you hold indirectly).

> 
> Even if we are able to figure out a "safe" refcount, and check that the
> current refcount == "safe" refcount before removing from direct map,
> what's stopping some other part of the kernel from taking a refcount
> just after the check happens and causing trouble with the folio's
> removal from direct map?

Once the page was unmapped from user space, and there were no additional 
references (e.g., GUP, whatever), any new references can only be 
(should, unless BUG :) ) temporary speculative references that should 
not try accessing page content, and that should back off if the folio is 
not deemed interesting or cannot be locked. (e.g., page 
migration/compaction/offlining).

Of course, there are some corner cases (kgdb, hibernation, /proc/kcore), 
but most of these can be dealt with in one way or the other (make these 
back off and not read/write page content, similar to how we handled it 
for secretmem).

These (kgdb, /proc/kcore) might not even take a folio reference, they 
just "access stuff" and we only have to teach them to "not access that".

> 
>> (noting that also folio_maybe_dma_pinned() can have false positives in
>> some cases due to speculative references or *many* references).
> 
> Are false positives (speculative references) okay since it's better to
> be safe than remove from direct map prematurely?

folio_maybe_dma_pinned() is primarily used in fork context. Copying more 
(if the folio may be pinned and, therefore, must not get COW-shared with 
other processes and must instead create a private page copy) is the 
"better safe than sorry" approach. So false positives (which happen 
rarely) are tolerable.

Regarding the direct map, it would -- just like with additional references 
-- detect that the page cannot currently be removed from the direct map. 
It's similarly "better safe than sorry", but here it means that we likely 
must retry if we cannot easily fall back to something else like in the 
fork+COW case.
Ackerley Tng Aug. 16, 2024, 9:52 p.m. UTC | #13
David Hildenbrand <david@redhat.com> writes:

> On 16.08.24 19:45, Ackerley Tng wrote:
>> 
>> <snip>
>> 
>> IIUC folio_lock() isn't a prerequisite for taking a refcount on the
>> folio.
>
> Right, to do folio_lock() you only have to guarantee that the folio 
> cannot get freed concurrently. So you piggyback on another reference 
> (you hold indirectly).
>
>> 
>> Even if we are able to figure out a "safe" refcount, and check that the
>> current refcount == "safe" refcount before removing from direct map,
>> what's stopping some other part of the kernel from taking a refcount
>> just after the check happens and causing trouble with the folio's
>> removal from direct map?
>
> Once the page was unmapped from user space, and there were no additional 
> references (e.g., GUP, whatever), any new references can only be 
> (should, unless BUG :) ) temporary speculative references that should 
> not try accessing page content, and that should back off if the folio is 
> not deemed interesting or cannot be locked. (e.g., page 
> migration/compaction/offlining).

I thought about it again - I think the vmsplice() cases are taken care
of once we check that the folios are not mapped into userspace, since
vmsplice() reads from a mapping.

splice() reads from the fd directly, but that's taken care of since
guest_memfd doesn't have a .splice_read() handler.

Reading /proc/pid/mem also requires the pages to first be mapped, IIUC,
otherwise the pages won't show up, so checking that there are no more
mappings to userspace takes care of this.

>
> Of course, there are some corner cases (kgdb, hibernation, /proc/kcore), 
> but most of these can be dealt with in one way or the other (make these 
> back off and not read/write page content, similar to how we handled it 
> for secretmem).

Does that really leave us with these corner cases? And so perhaps we
could get away with just taking the folio_lock() to keep away the
speculative references? So something like

  1. Check that the folio is not mapped and not pinned.
  2. folio_lock() all the folios about to be removed from direct map
  -- With the lock, all other accesses should be speculative --
  3. Check that the refcount == "safe" refcount
      3a. Unlock and return to userspace with -EAGAIN
  4. Remove from direct map
  5. folio_unlock() all those folios
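
In code, that might look roughly like the following (a sketch;
gmem_safe_refcount() is hypothetical):

```
static int gmem_make_folio_private(struct folio *folio)
{
	int r = -EAGAIN;	/* step 3a: ask userspace to retry */

	/* Step 1: no userspace mappings, no DMA pins. */
	if (folio_mapped(folio) || folio_maybe_dma_pinned(folio))
		return -EBUSY;

	/* Step 2: hold the lock so remaining accesses are speculative. */
	folio_lock(folio);

	/* Steps 3+4: only expected references remain -> unmap direct map. */
	if (folio_ref_count(folio) == gmem_safe_refcount(folio))
		r = guest_memfd_folio_private(folio);

	/* Step 5 */
	folio_unlock(folio);
	return r;
}
```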

Perhaps a very naive question: can the "safe" refcount be statically
determined by walking through the code and counting where refcount is
expected to be incremented?

Or perhaps the "safe" refcount may differ based on kernel config. Could
we perhaps have a single static variable safe_refcount, and whenever a
new guest_memfd folio is allocated, do

  safe_refcount = min(new_folio_refcount, safe_refcount)

>
> These (kgdb, /proc/kcore) might not even take a folio reference, they 
> just "access stuff" and we only have to teach them to "not access that".
>
>> 
>>> (noting that also folio_maybe_dma_pinned() can have false positives in
>>> some cases due to speculative references or *many* references).
>> 
>> Are false positives (speculative references) okay since it's better to
>> be safe than remove from direct map prematurely?
>
> folio_maybe_dma_pinned() is primarily used in fork context. Copying more 
> (if the folio maybe pinned and, therefore, must not get COW-shared with 
> other processes and must instead create a private page copy) is the 
> "better safe than sorry". So false positives (that happen rarely) are 
> tolerable.
>
> Regading the directmap, it would -- just like with additional references 
> -- detect that the page cannot currently be removed from the direct map. 
> It's similarly "better safe than sorry", but here means that we likely 
> must retry if we cannot easily fallback to something else like for the 
> fork+COW case.
>
> -- 
> Cheers,
>
> David / dhildenb
David Hildenbrand Aug. 16, 2024, 10:03 p.m. UTC | #14
On 16.08.24 23:52, Ackerley Tng wrote:
> David Hildenbrand <david@redhat.com> writes:
> 
>> On 16.08.24 19:45, Ackerley Tng wrote:
>>>
>>> <snip>
>>>
>>> IIUC folio_lock() isn't a prerequisite for taking a refcount on the
>>> folio.
>>
>> Right, to do folio_lock() you only have to guarantee that the folio
>> cannot get freed concurrently. So you piggyback on another reference
>> (you hold indirectly).
>>
>>>
>>> Even if we are able to figure out a "safe" refcount, and check that the
>>> current refcount == "safe" refcount before removing from direct map,
>>> what's stopping some other part of the kernel from taking a refcount
>>> just after the check happens and causing trouble with the folio's
>>> removal from direct map?
>>
>> Once the page was unmapped from user space, and there were no additional
>> references (e.g., GUP, whatever), any new references can only be
>> (should, unless BUG :) ) temporary speculative references that should
>> not try accessing page content, and that should back off if the folio is
>> not deemed interesting or cannot be locked. (e.g., page
>> migration/compaction/offlining).
> 
> I thought about it again - I think the vmsplice() cases are taken care
> of once we check that the folios are not mapped into userspace, since
> vmsplice() reads from a mapping.
> 
> splice() reads from the fd directly, but that's taken care since
> guest_memfd doesn't have a .splice_read() handler.
> 
> Reading /proc/pid/mem also requires the pages to first be mapped, IIUC,
> otherwise the pages won't show up, so checking that there are no more
> mappings to userspace takes care of this.

You have a misconception.

You can map pages to user space, GUP them, and then unmap them from user 
space. A GUP reference can outlive your user space mappings, easily.

So once there is a raised refcount, it could as well just be from 
vmsplice, or a pending reference from /proc/pid/mem, O_DIRECT, ...

> 
>>
>> Of course, there are some corner cases (kgdb, hibernation, /proc/kcore),
>> but most of these can be dealt with in one way or the other (make these
>> back off and not read/write page content, similar to how we handled it
>> for secretmem).
> 
> Does that really leave us with these corner cases? And so perhaps we
> could get away with just taking the folio_lock() to keep away the
> speculative references? So something like
> 
>    1. Check that the folio is not mapped and not pinned.

To do that, you have to lookup the folio first. That currently requires 
a refcount increment, even if only temporarily. Maybe we could avoid 
that, if we can guarantee that we are the only one modifying the 
pagecache here, and we sync against that ourselves.

>    2. folio_lock() all the folios about to be removed from direct map
>    -- With the lock, all other accesses should be speculative --
>    3. Check that the refcount == "safe" refcount
>        3a. Unlock and return to userspace with -EAGAIN
>    4. Remove from direct map
>    5. folio_unlock() all those folios
> 
> Perhaps a very naive question: can the "safe" refcount be statically
> determined by walking through the code and counting where refcount is
> expected to be incremented?


Depends on how we design it. But if you hand out "safe" references to 
KVM etc., you'd have to track that -- and how often -- somehow. At which 
point we are back at incrementing/decrementing a "safe" reference to track 
that for you.
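
Roughly, just to sketch the shape of it (illustrative only, not a
proposed API):

```
#include <linux/atomic.h>
#include <linux/mm.h>

/* Hypothetical: tracked "safe" references handed out to KVM etc. */
struct gmem_safe_refs {
	atomic_t count;
};

static void gmem_get_safe_ref(struct folio *folio, struct gmem_safe_refs *s)
{
	folio_get(folio);
	atomic_inc(&s->count);
}

static void gmem_put_safe_ref(struct folio *folio, struct gmem_safe_refs *s)
{
	atomic_dec(&s->count);
	folio_put(folio);
}

/* Direct-map removal would only proceed when every reference beyond the
 * single base (filemap) reference is a tracked "safe" one. */
static bool gmem_all_refs_safe(struct folio *folio, struct gmem_safe_refs *s)
{
	return folio_ref_count(folio) == 1 + atomic_read(&s->count);
}
```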
Elliot Berman Aug. 16, 2024, 11:52 p.m. UTC | #15
On Sat, Aug 17, 2024 at 12:03:50AM +0200, David Hildenbrand wrote:
> On 16.08.24 23:52, Ackerley Tng wrote:
> > David Hildenbrand <david@redhat.com> writes:
> > 
> > > On 16.08.24 19:45, Ackerley Tng wrote:
> > > > 
> > > > <snip>
> > > > 
> > > > IIUC folio_lock() isn't a prerequisite for taking a refcount on the
> > > > folio.
> > > 
> > > Right, to do folio_lock() you only have to guarantee that the folio
> > > cannot get freed concurrently. So you piggyback on another reference
> > > (you hold indirectly).
> > > 
> > > > 
> > > > Even if we are able to figure out a "safe" refcount, and check that the
> > > > current refcount == "safe" refcount before removing from direct map,
> > > > what's stopping some other part of the kernel from taking a refcount
> > > > just after the check happens and causing trouble with the folio's
> > > > removal from direct map?
> > > 
> > > Once the page was unmapped from user space, and there were no additional
> > > references (e.g., GUP, whatever), any new references can only be
> > > (should, unless BUG :) ) temporary speculative references that should
> > > not try accessing page content, and that should back off if the folio is
> > > not deemed interesting or cannot be locked. (e.g., page
> > > migration/compaction/offlining).
> > 
> > I thought about it again - I think the vmsplice() cases are taken care
> > of once we check that the folios are not mapped into userspace, since
> > vmsplice() reads from a mapping.
> > 
> > splice() reads from the fd directly, but that's taken care of since
> > guest_memfd doesn't have a .splice_read() handler.
> > 
> > Reading /proc/pid/mem also requires the pages to first be mapped, IIUC,
> > otherwise the pages won't show up, so checking that there are no more
> > mappings to userspace takes care of this.
> 
> You have a misconception.
> 
> You can map pages to user space, GUP them, and then unmap them from user
> space. A GUP reference can outlive your user space mappings, easily.
> 
> So once there is a raised refcount, it could as well just be from vmsplice,
> or a pending reference from /proc/pid/mem, O_DIRECT, ...
> 
> > 
> > > 
> > > Of course, there are some corner cases (kgdb, hibernation, /proc/kcore),
> > > but most of these can be dealt with in one way or the other (make these
> > > back off and not read/write page content, similar to how we handled it
> > > for secretmem).
> > 
> > Does that really leave us with these corner cases? And so perhaps we
> > could get away with just taking the folio_lock() to keep away the
> > speculative references? So something like
> > 
> >    1. Check that the folio is not mapped and not pinned.
> 
> To do that, you have to lookup the folio first. That currently requires a
> refcount increment, even if only temporarily. Maybe we could avoid that, if
> we can guarantee that we are the only one modifying the pagecache here, and
> we sync against that ourselves.
> 
> >    2. folio_lock() all the folios about to be removed from direct map
> >    -- With the lock, all other accesses should be speculative --
> >    3. Check that the refcount == "safe" refcount
> >        3a. Unlock and return to userspace with -EAGAIN
> >    4. Remove from direct map
> >    5. folio_unlock() all those folios
> > 
> > Perhaps a very naive question: can the "safe" refcount be statically
> > determined by walking through the code and counting where refcount is
> > expected to be incremented?
> 
> 
> Depends on how we design it. But if you hand out "safe" references to KVM
> etc., you'd have to track that -- and how often -- somehow. At which point we
> are back at incrementing/decrementing a "safe" reference to track that for you.
>

Just a status update: I've gotten the "safe" reference counter
implementation working for Gunyah now. It feels a bit flimsy because
we're juggling 3 reference counters*, but it seems like the right thing
to do after all the discussions here. It's passing all the Gunyah unit
tests I have, which have so far been pretty good at finding issues.

I need to clean up the patches now and I'm aiming to have it out for RFC
next week.

* folio refcount, "accessible" refcount, and "safe" refcount

Thanks,
Elliot
diff mbox series

Patch

diff --git a/include/linux/guest_memfd.h b/include/linux/guest_memfd.h
index f9e4a27aed67..edcb4ba60cb0 100644
--- a/include/linux/guest_memfd.h
+++ b/include/linux/guest_memfd.h
@@ -16,12 +16,18 @@ 
  * @invalidate_end: called after invalidate_begin returns success. Optional.
  * @prepare: called before a folio is mapped into the guest address space.
  *           Optional.
+ * @accessible: called after prepare returns success and before it's mapped
+ *              into the guest address space. Returns 0 if the folio can be
+ *              accessed.
+ *              Optional. If not present, assumes folios are never accessible.
  * @release: Called when releasing the guest_memfd file. Required.
  */
 struct guest_memfd_operations {
 	int (*invalidate_begin)(struct inode *inode, pgoff_t offset, unsigned long nr);
 	void (*invalidate_end)(struct inode *inode, pgoff_t offset, unsigned long nr);
 	int (*prepare)(struct inode *inode, pgoff_t offset, struct folio *folio);
+	int (*accessible)(struct inode *inode, struct folio *folio,
+			  pgoff_t offset, unsigned long nr);
 	int (*release)(struct inode *inode);
 };
 
@@ -48,5 +54,6 @@  struct file *guest_memfd_alloc(const char *name,
 			       const struct guest_memfd_operations *ops,
 			       loff_t size, unsigned long flags);
 bool is_guest_memfd(struct file *file, const struct guest_memfd_operations *ops);
+int guest_memfd_make_inaccessible(struct file *file, struct folio *folio);
 
 #endif
diff --git a/mm/guest_memfd.c b/mm/guest_memfd.c
index e9d8cab72b28..6b5609932ca5 100644
--- a/mm/guest_memfd.c
+++ b/mm/guest_memfd.c
@@ -9,6 +9,8 @@ 
 #include <linux/pagemap.h>
 #include <linux/set_memory.h>
 
+#include "internal.h"
+
 static inline int guest_memfd_folio_private(struct folio *folio)
 {
 	unsigned long nr_pages = folio_nr_pages(folio);
@@ -89,7 +91,7 @@  struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags
 			goto out_err;
 	}
 
-	if (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
+	if (!ops->accessible && (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)) {
 		r = guest_memfd_folio_private(folio);
 		if (r)
 			goto out_err;
@@ -107,6 +109,82 @@  struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags
 }
 EXPORT_SYMBOL_GPL(guest_memfd_grab_folio);
 
+int guest_memfd_make_inaccessible(struct file *file, struct folio *folio)
+{
+	unsigned long gmem_flags = (unsigned long)file->private_data;
+	unsigned long i;
+	int r;
+
+	unmap_mapping_folio(folio);
+
+	/**
+	 * We can't use the refcount. It might be elevated due to
+	 * guest/vcpu trying to access same folio as another vcpu
+	 * or because userspace is trying to access folio for same reason
+	 *
+	 * folio_lock serializes the transitions between (in)accessible
+	 */
+	if (folio_maybe_dma_pinned(folio))
+		return -EBUSY;
+
+	if (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
+		r = guest_memfd_folio_private(folio);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+static vm_fault_t gmem_fault(struct vm_fault *vmf)
+{
+	struct file *file = vmf->vma->vm_file;
+	struct inode *inode = file_inode(file);
+	const struct guest_memfd_operations *ops = inode->i_private;
+	struct folio *folio;
+	pgoff_t off;
+	int r;
+
+	folio = guest_memfd_grab_folio(file, vmf->pgoff, GUEST_MEMFD_GRAB_UPTODATE);
+	if (!folio)
+		return VM_FAULT_SIGBUS;
+
+	off = vmf->pgoff & (folio_nr_pages(folio) - 1);
+	r = ops->accessible(inode, folio, off, 1);
+	if (r) {
+		folio_unlock(folio);
+		folio_put(folio);
+		return VM_FAULT_SIGBUS;
+	}
+
+	guest_memfd_folio_clear_private(folio);
+
+	vmf->page = folio_page(folio, off);
+
+	return VM_FAULT_LOCKED;
+}
+
+static const struct vm_operations_struct gmem_vm_ops = {
+	.fault = gmem_fault,
+};
+
+static int gmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	const struct guest_memfd_operations *ops = file_inode(file)->i_private;
+
+	if (!ops->accessible)
+		return -EPERM;
+
+	/* No support for private mappings to avoid COW.  */
+	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
+	    (VM_SHARED | VM_MAYSHARE))
+		return -EINVAL;
+
+	file_accessed(file);
+	vma->vm_ops = &gmem_vm_ops;
+	return 0;
+}
+
 static long gmem_punch_hole(struct file *file, loff_t offset, loff_t len)
 {
 	struct inode *inode = file_inode(file);
@@ -220,6 +298,7 @@  static int gmem_release(struct inode *inode, struct file *file)
 static struct file_operations gmem_fops = {
 	.open = generic_file_open,
 	.llseek = generic_file_llseek,
+	.mmap = gmem_mmap,
 	.release = gmem_release,
 	.fallocate = gmem_fallocate,
 	.owner = THIS_MODULE,
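
For illustration only, an owner-side ->accessible() callback could look
like the sketch below; my_vm, my_vm_from_inode() and shared_bitmap are
made-up names and not part of this patch:

```
static int my_gmem_accessible(struct inode *inode, struct folio *folio,
			      pgoff_t offset, unsigned long nr)
{
	struct my_vm *vm = my_vm_from_inode(inode);	/* hypothetical lookup */
	pgoff_t index = folio->index + offset;
	unsigned long i;

	/* Accessible only while the owner marks every page as shared. */
	for (i = 0; i < nr; i++)
		if (!test_bit(index + i, vm->shared_bitmap))
			return -EPERM;

	return 0;
}
```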