diff mbox series

[10/10] fsnotify: generate pre-content permission event on page fault

Message ID 1bc2855779e7ba1d80592be7d6257b43f1a91886.1721931241.git.josef@toxicpanda.com (mailing list archive)
State New
Headers show
Series fanotify: add pre-content hooks | expand

Commit Message

Josef Bacik July 25, 2024, 6:19 p.m. UTC
FS_PRE_ACCESS or FS_PRE_MODIFY will be generated on page fault depending
on the faulting method.

This pre-content event is meant to be used by hierarchical storage
managers that want to fill in the file content on first read access.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/notify/fsnotify.c             | 13 +++++++++
 include/linux/fsnotify_backend.h | 14 +++++++++
 mm/filemap.c                     | 50 ++++++++++++++++++++++++++++----
 3 files changed, 71 insertions(+), 6 deletions(-)

Comments

Amir Goldstein July 25, 2024, 8:19 p.m. UTC | #1
On Thu, Jul 25, 2024 at 9:20 PM Josef Bacik <josef@toxicpanda.com> wrote:
>
> FS_PRE_ACCESS or FS_PRE_MODIFY will be generated on page fault depending
> on the faulting method.
>
> This pre-content event is meant to be used by hierarchical storage
> managers that want to fill in the file content on first read access.
>
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/notify/fsnotify.c             | 13 +++++++++
>  include/linux/fsnotify_backend.h | 14 +++++++++
>  mm/filemap.c                     | 50 ++++++++++++++++++++++++++++----
>  3 files changed, 71 insertions(+), 6 deletions(-)
>
> diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
> index 1ca4a8da7f29..435232d46b4f 100644
> --- a/fs/notify/fsnotify.c
> +++ b/fs/notify/fsnotify.c
> @@ -28,6 +28,19 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
>         fsnotify_clear_marks_by_mount(mnt);
>  }
>
> +bool fsnotify_file_has_content_watches(struct file *file)

nit: has_pre_content_watches...

> +{
> +       struct inode *inode = file_inode(file);
> +       struct super_block *sb = inode->i_sb;
> +       struct mount *mnt = real_mount(file->f_path.mnt);
> +       u32 mask = inode->i_fsnotify_mask;
> +
> +       mask |= mnt->mnt_fsnotify_mask;
> +       mask |= sb->s_fsnotify_mask;
> +
> +       return !!(mask & FSNOTIFY_PRE_CONTENT_EVENTS);

This can use the fsnotify_object_watched() helper, and it will need
the READ_ONCE() that are just being added to avoid data races.

> +}
> +
>  /**
>   * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
>   * @sb: superblock being unmounted.
> diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> index 36c3d18cc40a..6983fbf096b8 100644
> --- a/include/linux/fsnotify_backend.h
> +++ b/include/linux/fsnotify_backend.h
> @@ -900,6 +900,15 @@ static inline void fsnotify_init_event(struct fsnotify_event *event)
>         INIT_LIST_HEAD(&event->list);
>  }
>
> +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
> +bool fsnotify_file_has_content_watches(struct file *file);
> +#else
> +static inline bool fsnotify_file_has_content_watches(struct file *file)
> +{
> +       return false;
> +}
> +#endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */
> +
>  #else
>
>  static inline int fsnotify(__u32 mask, const void *data, int data_type,
> @@ -938,6 +947,11 @@ static inline u32 fsnotify_get_cookie(void)
>  static inline void fsnotify_unmount_inodes(struct super_block *sb)
>  {}
>
> +static inline bool fsnotify_file_has_content_watches(struct file *file)
> +{
> +       return false;
> +}
> +
>  #endif /* CONFIG_FSNOTIFY */
>
>  #endif /* __KERNEL __ */
> diff --git a/mm/filemap.c b/mm/filemap.c
> index ca8c8d889eef..cc9d7885bbe3 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -46,6 +46,7 @@
>  #include <linux/pipe_fs_i.h>
>  #include <linux/splice.h>
>  #include <linux/rcupdate_wait.h>
> +#include <linux/fsnotify.h>
>  #include <asm/pgalloc.h>
>  #include <asm/tlbflush.h>
>  #include "internal.h"
> @@ -3112,13 +3113,13 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
>   * that.  If we didn't pin a file then we return NULL.  The file that is
>   * returned needs to be fput()'ed when we're done with it.
>   */
> -static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
> +static struct file *do_sync_mmap_readahead(struct vm_fault *vmf,
> +                                          struct file *fpin)
>  {
>         struct file *file = vmf->vma->vm_file;
>         struct file_ra_state *ra = &file->f_ra;
>         struct address_space *mapping = file->f_mapping;
>         DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
> -       struct file *fpin = NULL;
>         unsigned long vm_flags = vmf->vma->vm_flags;
>         unsigned int mmap_miss;
>
> @@ -3182,12 +3183,12 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
>   * was pinned if we have to drop the mmap_lock in order to do IO.
>   */
>  static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
> -                                           struct folio *folio)
> +                                           struct folio *folio,
> +                                           struct file *fpin)
>  {

If I am reading correctly, iomap (i.e. xfs) write shared memory fault
does not reach this code?

Do we care about writable shared memory faults use case for HSM?
It does not sound very relevant to HSM, but we cannot just ignore it..

>         struct file *file = vmf->vma->vm_file;
>         struct file_ra_state *ra = &file->f_ra;
>         DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
> -       struct file *fpin = NULL;
>         unsigned int mmap_miss;
>
>         /* If we don't want any read-ahead, don't bother */
> @@ -3287,6 +3288,35 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
>         if (unlikely(index >= max_idx))
>                 return VM_FAULT_SIGBUS;
>
> +       /*
> +        * If we have pre-content watchers then we need to generate events on
> +        * page fault so that we can populate any data before the fault.
> +        *
> +        * We only do this on the first pass through, otherwise the populating
> +        * application could potentially deadlock on the mmap lock if it tries
> +        * to populate it with mmap.
> +        */
> +       if (fault_flag_allow_retry_first(vmf->flags) &&
> +           fsnotify_file_has_content_watches(file)) {
> +               int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_READ;
> +               loff_t pos = vmf->pgoff << PAGE_SHIFT;
> +
> +               fpin = maybe_unlock_mmap_for_io(vmf, fpin);
> +
> +               /*
> +                * We can only emit the event if we did actually release the
> +                * mmap lock.
> +                */
> +               if (fpin) {
> +                       error = fsnotify_file_area_perm(fpin, mask, &pos,
> +                                                       PAGE_SIZE);

This is going to also emit a FAN_ACCESS_PERM event.
Heritage of the fact that read() has to emit FAN_ACCESS_PERM
for backward compat and a design decision to emit both
FAN_ACCESS_PERM and FAN_PRE_ACCESS to avoid carrying the
API baggage of the legacy FAN_ACCESS_PERM to pre-content events.

Suggestion as workaround - use MAY_ACCESS instead of MAY_READ
here and emit FAN_PRE_ACCESS with MAY_READ | MAY_ACCESS.

Thank you for pushing my patches through!
Amir.
Josef Bacik July 29, 2024, 5:11 p.m. UTC | #2
On Thu, Jul 25, 2024 at 11:19:33PM +0300, Amir Goldstein wrote:
> On Thu, Jul 25, 2024 at 9:20 PM Josef Bacik <josef@toxicpanda.com> wrote:
> >
> > FS_PRE_ACCESS or FS_PRE_MODIFY will be generated on page fault depending
> > on the faulting method.
> >
> > This pre-content event is meant to be used by hierarchical storage
> > managers that want to fill in the file content on first read access.
> >
> > Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> > ---
> >  fs/notify/fsnotify.c             | 13 +++++++++
> >  include/linux/fsnotify_backend.h | 14 +++++++++
> >  mm/filemap.c                     | 50 ++++++++++++++++++++++++++++----
> >  3 files changed, 71 insertions(+), 6 deletions(-)
> >
> > diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
> > index 1ca4a8da7f29..435232d46b4f 100644
> > --- a/fs/notify/fsnotify.c
> > +++ b/fs/notify/fsnotify.c
> > @@ -28,6 +28,19 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
> >         fsnotify_clear_marks_by_mount(mnt);
> >  }
> >
> > +bool fsnotify_file_has_content_watches(struct file *file)
> 
> nit: has_pre_content_watches...
> 
> > +{
> > +       struct inode *inode = file_inode(file);
> > +       struct super_block *sb = inode->i_sb;
> > +       struct mount *mnt = real_mount(file->f_path.mnt);
> > +       u32 mask = inode->i_fsnotify_mask;
> > +
> > +       mask |= mnt->mnt_fsnotify_mask;
> > +       mask |= sb->s_fsnotify_mask;
> > +
> > +       return !!(mask & FSNOTIFY_PRE_CONTENT_EVENTS);
> 
> This can use the fsnotify_object_watched() helper, and it will need
> the READ_ONCE() that are just being added to avoid data races.
> 
> > +}
> > +
> >  /**
> >   * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
> >   * @sb: superblock being unmounted.
> > diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> > index 36c3d18cc40a..6983fbf096b8 100644
> > --- a/include/linux/fsnotify_backend.h
> > +++ b/include/linux/fsnotify_backend.h
> > @@ -900,6 +900,15 @@ static inline void fsnotify_init_event(struct fsnotify_event *event)
> >         INIT_LIST_HEAD(&event->list);
> >  }
> >
> > +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
> > +bool fsnotify_file_has_content_watches(struct file *file);
> > +#else
> > +static inline bool fsnotify_file_has_content_watches(struct file *file)
> > +{
> > +       return false;
> > +}
> > +#endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */
> > +
> >  #else
> >
> >  static inline int fsnotify(__u32 mask, const void *data, int data_type,
> > @@ -938,6 +947,11 @@ static inline u32 fsnotify_get_cookie(void)
> >  static inline void fsnotify_unmount_inodes(struct super_block *sb)
> >  {}
> >
> > +static inline bool fsnotify_file_has_content_watches(struct file *file)
> > +{
> > +       return false;
> > +}
> > +
> >  #endif /* CONFIG_FSNOTIFY */
> >
> >  #endif /* __KERNEL __ */
> > diff --git a/mm/filemap.c b/mm/filemap.c
> > index ca8c8d889eef..cc9d7885bbe3 100644
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> > @@ -46,6 +46,7 @@
> >  #include <linux/pipe_fs_i.h>
> >  #include <linux/splice.h>
> >  #include <linux/rcupdate_wait.h>
> > +#include <linux/fsnotify.h>
> >  #include <asm/pgalloc.h>
> >  #include <asm/tlbflush.h>
> >  #include "internal.h"
> > @@ -3112,13 +3113,13 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
> >   * that.  If we didn't pin a file then we return NULL.  The file that is
> >   * returned needs to be fput()'ed when we're done with it.
> >   */
> > -static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
> > +static struct file *do_sync_mmap_readahead(struct vm_fault *vmf,
> > +                                          struct file *fpin)
> >  {
> >         struct file *file = vmf->vma->vm_file;
> >         struct file_ra_state *ra = &file->f_ra;
> >         struct address_space *mapping = file->f_mapping;
> >         DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
> > -       struct file *fpin = NULL;
> >         unsigned long vm_flags = vmf->vma->vm_flags;
> >         unsigned int mmap_miss;
> >
> > @@ -3182,12 +3183,12 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
> >   * was pinned if we have to drop the mmap_lock in order to do IO.
> >   */
> >  static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
> > -                                           struct folio *folio)
> > +                                           struct folio *folio,
> > +                                           struct file *fpin)
> >  {
> 
> If I am reading correctly, iomap (i.e. xfs) write shared memory fault
> does not reach this code?
> 
> Do we care about writable shared memory faults use case for HSM?
> It does not sound very relevant to HSM, but we cannot just ignore it..
> 

Sorry I realized I went off to try and solve this problem and never responded to
you.  I'm addressing the other comments, but this one is a little tricky.

We're kind of stuck between a rock and a hard place with this.  I had originally
put this before the ->fault() callback, but purposefully moved it into
filemap_fault() because I want to be able to drop the mmap lock while we're
waiting for a response from the HSM.

The reason to do this is because there are things that take the mmap lock for
simple things outside of the process, like /proc/$PID/smaps and other related
things, and this can cause high priority tasks to block behind possibly low
priority IO, creating a priority inversion.

Now, I'm not sure how widespread of a problem this is anymore, I know there's
been work done to the kernel and tools to avoid this style of problem.  I'm ok
with a "try it and see" approach, but I don't love that.

However I think putting fsnotify hooks into XFS itself for this particular path
is a good choice either.  What do you think?  Just move it to before ->fault(),
leave the mmap lock in place, and be done with it?  Thanks,

Josef
Amir Goldstein July 29, 2024, 6:57 p.m. UTC | #3
On Mon, Jul 29, 2024 at 8:11 PM Josef Bacik <josef@toxicpanda.com> wrote:
>
> On Thu, Jul 25, 2024 at 11:19:33PM +0300, Amir Goldstein wrote:
> > On Thu, Jul 25, 2024 at 9:20 PM Josef Bacik <josef@toxicpanda.com> wrote:
> > >
> > > FS_PRE_ACCESS or FS_PRE_MODIFY will be generated on page fault depending
> > > on the faulting method.
> > >
> > > This pre-content event is meant to be used by hierarchical storage
> > > managers that want to fill in the file content on first read access.
> > >
> > > Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> > > ---
> > >  fs/notify/fsnotify.c             | 13 +++++++++
> > >  include/linux/fsnotify_backend.h | 14 +++++++++
> > >  mm/filemap.c                     | 50 ++++++++++++++++++++++++++++----
> > >  3 files changed, 71 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
> > > index 1ca4a8da7f29..435232d46b4f 100644
> > > --- a/fs/notify/fsnotify.c
> > > +++ b/fs/notify/fsnotify.c
> > > @@ -28,6 +28,19 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
> > >         fsnotify_clear_marks_by_mount(mnt);
> > >  }
> > >
> > > +bool fsnotify_file_has_content_watches(struct file *file)
> >
> > nit: has_pre_content_watches...
> >
> > > +{
> > > +       struct inode *inode = file_inode(file);
> > > +       struct super_block *sb = inode->i_sb;
> > > +       struct mount *mnt = real_mount(file->f_path.mnt);
> > > +       u32 mask = inode->i_fsnotify_mask;
> > > +
> > > +       mask |= mnt->mnt_fsnotify_mask;
> > > +       mask |= sb->s_fsnotify_mask;
> > > +
> > > +       return !!(mask & FSNOTIFY_PRE_CONTENT_EVENTS);
> >
> > This can use the fsnotify_object_watched() helper, and it will need
> > the READ_ONCE() that are just being added to avoid data races.
> >
> > > +}
> > > +
> > >  /**
> > >   * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
> > >   * @sb: superblock being unmounted.
> > > diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> > > index 36c3d18cc40a..6983fbf096b8 100644
> > > --- a/include/linux/fsnotify_backend.h
> > > +++ b/include/linux/fsnotify_backend.h
> > > @@ -900,6 +900,15 @@ static inline void fsnotify_init_event(struct fsnotify_event *event)
> > >         INIT_LIST_HEAD(&event->list);
> > >  }
> > >
> > > +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
> > > +bool fsnotify_file_has_content_watches(struct file *file);
> > > +#else
> > > +static inline bool fsnotify_file_has_content_watches(struct file *file)
> > > +{
> > > +       return false;
> > > +}
> > > +#endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */
> > > +
> > >  #else
> > >
> > >  static inline int fsnotify(__u32 mask, const void *data, int data_type,
> > > @@ -938,6 +947,11 @@ static inline u32 fsnotify_get_cookie(void)
> > >  static inline void fsnotify_unmount_inodes(struct super_block *sb)
> > >  {}
> > >
> > > +static inline bool fsnotify_file_has_content_watches(struct file *file)
> > > +{
> > > +       return false;
> > > +}
> > > +
> > >  #endif /* CONFIG_FSNOTIFY */
> > >
> > >  #endif /* __KERNEL __ */
> > > diff --git a/mm/filemap.c b/mm/filemap.c
> > > index ca8c8d889eef..cc9d7885bbe3 100644
> > > --- a/mm/filemap.c
> > > +++ b/mm/filemap.c
> > > @@ -46,6 +46,7 @@
> > >  #include <linux/pipe_fs_i.h>
> > >  #include <linux/splice.h>
> > >  #include <linux/rcupdate_wait.h>
> > > +#include <linux/fsnotify.h>
> > >  #include <asm/pgalloc.h>
> > >  #include <asm/tlbflush.h>
> > >  #include "internal.h"
> > > @@ -3112,13 +3113,13 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
> > >   * that.  If we didn't pin a file then we return NULL.  The file that is
> > >   * returned needs to be fput()'ed when we're done with it.
> > >   */
> > > -static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
> > > +static struct file *do_sync_mmap_readahead(struct vm_fault *vmf,
> > > +                                          struct file *fpin)
> > >  {
> > >         struct file *file = vmf->vma->vm_file;
> > >         struct file_ra_state *ra = &file->f_ra;
> > >         struct address_space *mapping = file->f_mapping;
> > >         DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
> > > -       struct file *fpin = NULL;
> > >         unsigned long vm_flags = vmf->vma->vm_flags;
> > >         unsigned int mmap_miss;
> > >
> > > @@ -3182,12 +3183,12 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
> > >   * was pinned if we have to drop the mmap_lock in order to do IO.
> > >   */
> > >  static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
> > > -                                           struct folio *folio)
> > > +                                           struct folio *folio,
> > > +                                           struct file *fpin)
> > >  {
> >
> > If I am reading correctly, iomap (i.e. xfs) write shared memory fault
> > does not reach this code?
> >
> > Do we care about writable shared memory faults use case for HSM?
> > It does not sound very relevant to HSM, but we cannot just ignore it..
> >
>
> Sorry I realized I went off to try and solve this problem and never responded to
> you.  I'm addressing the other comments, but this one is a little tricky.
>
> We're kind of stuck between a rock and a hard place with this.  I had originally
> put this before the ->fault() callback, but purposefully moved it into
> filemap_fault() because I want to be able to drop the mmap lock while we're
> waiting for a response from the HSM.
>
> The reason to do this is because there are things that take the mmap lock for
> simple things outside of the process, like /proc/$PID/smaps and other related
> things, and this can cause high priority tasks to block behind possibly low
> priority IO, creating a priority inversion.
>
> Now, I'm not sure how widespread of a problem this is anymore, I know there's
> been work done to the kernel and tools to avoid this style of problem.  I'm ok
> with a "try it and see" approach, but I don't love that.
>

I defer this question to Jan.

> However I think putting fsnotify hooks into XFS itself for this particular path
> is a good choice either.

I think you meant "not a good choice" and I agree -
it is not only xfs, but could be any fs that will be converted to iomap
Other fs have ->fault != filemap_fault, even if they do end up calling
filemap_fault, IOW, there is no API guarantee that they will.

> What do you think?  Just move it to before ->fault(),
> leave the mmap lock in place, and be done with it?

If Jan blesses the hook called with mmap lock, then yeh,
putting the hook in the most generic "vfs" code would be
the best choice for maintenance.

Thanks,
Amir.
Jan Kara July 30, 2024, 12:18 p.m. UTC | #4
On Mon 29-07-24 21:57:34, Amir Goldstein wrote:
> On Mon, Jul 29, 2024 at 8:11 PM Josef Bacik <josef@toxicpanda.com> wrote:
> > > If I am reading correctly, iomap (i.e. xfs) write shared memory fault
> > > does not reach this code?
> > >
> > > Do we care about writable shared memory faults use case for HSM?
> > > It does not sound very relevant to HSM, but we cannot just ignore it..
> > >
> >
> > Sorry I realized I went off to try and solve this problem and never responded to
> > you.  I'm addressing the other comments, but this one is a little tricky.
> >
> > We're kind of stuck between a rock and a hard place with this.  I had originally
> > put this before the ->fault() callback, but purposefully moved it into
> > filemap_fault() because I want to be able to drop the mmap lock while we're
> > waiting for a response from the HSM.
> >
> > The reason to do this is because there are things that take the mmap lock for
> > simple things outside of the process, like /proc/$PID/smaps and other related
> > things, and this can cause high priority tasks to block behind possibly low
> > priority IO, creating a priority inversion.
> >
> > Now, I'm not sure how widespread of a problem this is anymore, I know there's
> > been work done to the kernel and tools to avoid this style of problem.  I'm ok
> > with a "try it and see" approach, but I don't love that.
> >
> 
> I defer this question to Jan.
> 
> > However I think putting fsnotify hooks into XFS itself for this particular path
> > is a good choice either.
> 
> I think you meant "not a good choice" and I agree -
> it is not only xfs, but could be any fs that will be converted to iomap
> Other fs have ->fault != filemap_fault, even if they do end up calling
> filemap_fault, IOW, there is no API guarantee that they will.
> 
> > What do you think?  Just move it to before ->fault(),
> > leave the mmap lock in place, and be done with it?
> 
> If Jan blesses the hook called with mmap lock, then yeh,
> putting the hook in the most generic "vfs" code would be
> the best choice for maintenance.

Well, I agree with Josef's comment about a rock and a hard place. For once,
regardless whether the hook will happen from before ->fault or from inside
the ->fault handler there will be fault callers where we cannot drop
mmap_lock (not all archs support dropping mmap_lock inside a fault AFAIR -
but a quick grep seems to show that these days maybe they do, also some
callers - most notably through GUP - don't allow dropping of mmap_lock
inside fault). So we have to have a way to handle a fault without
FAULT_FLAG_ALLOW_RETRY flag.

Now of course waiting for userspace reply to fanotify event with mmap_lock
held is ... dangerous. For example consider application with two threads:

T1					T2
page fault on inode I			write to inode I
  lock mm->mmap_lock			  inode_lock(I)
    send fanotify event			  ...
					  fault_in_iov_iter_readable()
					    lock mm->mmap_lock -> blocks
					      behind T1

now the HSM handler needs to fill in contents of inode I requested by the
page fault:

  inode_lock(I) -> deadlock

So conceptually I think the flow could look like (in __do_fault):

	if (!(vmf->flags & FAULT_FLAG_TRIED) &&
	    fsnotify_may_send_pre_content_event()) {
		if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
			return VM_FAULT_RETRY;
		fpin = maybe_unlock_mmap_for_io(vmf, NULL);
		if (!fpin)
			return ???VM_FAULT_SIGSEGV???;
		err = fsnotify_fault(...);
		if (err)
			return VM_FAULT_SIGBUS | VM_FAULT_RETRY;
		/*
		 * We are fine with proceeding with the fault. Retry the fault
		 * to let the filesystem handle it.
		 */
		return VM_FAULT_RETRY;
	}

The downside is that if we enter the page fault without ability to drop
mmap_lock on a file needing HSM handling, we get SIGSEGV. I'm not sure it
matters in practice because these are not that common paths e.g. stuff like
putting a breakpoint / uprobe on a file but maybe there are some surprises.

								Honza
Josef Bacik July 30, 2024, 4:51 p.m. UTC | #5
On Tue, Jul 30, 2024 at 02:18:37PM +0200, Jan Kara wrote:
> On Mon 29-07-24 21:57:34, Amir Goldstein wrote:
> > On Mon, Jul 29, 2024 at 8:11 PM Josef Bacik <josef@toxicpanda.com> wrote:
> > > > If I am reading correctly, iomap (i.e. xfs) write shared memory fault
> > > > does not reach this code?
> > > >
> > > > Do we care about writable shared memory faults use case for HSM?
> > > > It does not sound very relevant to HSM, but we cannot just ignore it..
> > > >
> > >
> > > Sorry I realized I went off to try and solve this problem and never responded to
> > > you.  I'm addressing the other comments, but this one is a little tricky.
> > >
> > > We're kind of stuck between a rock and a hard place with this.  I had originally
> > > put this before the ->fault() callback, but purposefully moved it into
> > > filemap_fault() because I want to be able to drop the mmap lock while we're
> > > waiting for a response from the HSM.
> > >
> > > The reason to do this is because there are things that take the mmap lock for
> > > simple things outside of the process, like /proc/$PID/smaps and other related
> > > things, and this can cause high priority tasks to block behind possibly low
> > > priority IO, creating a priority inversion.
> > >
> > > Now, I'm not sure how widespread of a problem this is anymore, I know there's
> > > been work done to the kernel and tools to avoid this style of problem.  I'm ok
> > > with a "try it and see" approach, but I don't love that.
> > >
> > 
> > I defer this question to Jan.
> > 
> > > However I think putting fsnotify hooks into XFS itself for this particular path
> > > is a good choice either.
> > 
> > I think you meant "not a good choice" and I agree -
> > it is not only xfs, but could be any fs that will be converted to iomap
> > Other fs have ->fault != filemap_fault, even if they do end up calling
> > filemap_fault, IOW, there is no API guarantee that they will.
> > 
> > > What do you think?  Just move it to before ->fault(),
> > > leave the mmap lock in place, and be done with it?
> > 
> > If Jan blesses the hook called with mmap lock, then yeh,
> > putting the hook in the most generic "vfs" code would be
> > the best choice for maintenance.
> 
> Well, I agree with Josef's comment about a rock and a hard place. For once,
> regardless whether the hook will happen from before ->fault or from inside
> the ->fault handler there will be fault callers where we cannot drop
> mmap_lock (not all archs support dropping mmap_lock inside a fault AFAIR -
> but a quick grep seems to show that these days maybe they do, also some
> callers - most notably through GUP - don't allow dropping of mmap_lock
> inside fault). So we have to have a way to handle a fault without
> FAULT_FLAG_ALLOW_RETRY flag.
> 
> Now of course waiting for userspace reply to fanotify event with mmap_lock
> held is ... dangerous. For example consider application with two threads:
> 
> T1					T2
> page fault on inode I			write to inode I
>   lock mm->mmap_lock			  inode_lock(I)
>     send fanotify event			  ...
> 					  fault_in_iov_iter_readable()
> 					    lock mm->mmap_lock -> blocks
> 					      behind T1
> 
> now the HSM handler needs to fill in contents of inode I requested by the
> page fault:
> 
>   inode_lock(I) -> deadlock
> 
> So conceptually I think the flow could look like (in __do_fault):
> 
> 	if (!(vmf->flags & FAULT_FLAG_TRIED) &&
> 	    fsnotify_may_send_pre_content_event()) {
> 		if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
> 			return VM_FAULT_RETRY;
> 		fpin = maybe_unlock_mmap_for_io(vmf, NULL);
> 		if (!fpin)
> 			return ???VM_FAULT_SIGSEGV???;
> 		err = fsnotify_fault(...);
> 		if (err)
> 			return VM_FAULT_SIGBUS | VM_FAULT_RETRY;
> 		/*
> 		 * We are fine with proceeding with the fault. Retry the fault
> 		 * to let the filesystem handle it.
> 		 */
> 		return VM_FAULT_RETRY;
> 	}
> 
> The downside is that if we enter the page fault without ability to drop
> mmap_lock on a file needing HSM handling, we get SIGSEGV. I'm not sure it
> matters in practice because these are not that common paths e.g. stuff like
> putting a breakpoint / uprobe on a file but maybe there are some surprises.
> 

The only thing I don't like about this is that now the fault handler loses the
ability to drop the mmap sem.  I think in practice this doesn't really matter,
because we're trying to avoid doing IO while under the mmap sem, and presumably
this will have instantiated the pages to avoid the IO.

However if you use reflink this wouldn't be the case, and now we've
re-introduced the priority inversion possiblity.

I'm leaning more towards just putting the fsnotify hook in the xfs code for the
write case, and anybody else who implements their own ->fault without calling
the generic one will just have to do the same.

That being said I'm not going to die on any particular hill here, if you still
want to do the above before the ->fault() handler then I'll update my code to do
that and we can move on.  Thanks,

Josef
Jan Kara Aug. 1, 2024, 9:34 p.m. UTC | #6
On Tue 30-07-24 12:51:49, Josef Bacik wrote:
> On Tue, Jul 30, 2024 at 02:18:37PM +0200, Jan Kara wrote:
> > On Mon 29-07-24 21:57:34, Amir Goldstein wrote:
> > > On Mon, Jul 29, 2024 at 8:11 PM Josef Bacik <josef@toxicpanda.com> wrote:
> > > > > If I am reading correctly, iomap (i.e. xfs) write shared memory fault
> > > > > does not reach this code?
> > > > >
> > > > > Do we care about writable shared memory faults use case for HSM?
> > > > > It does not sound very relevant to HSM, but we cannot just ignore it..
> > > > >
> > > >
> > > > Sorry I realized I went off to try and solve this problem and never responded to
> > > > you.  I'm addressing the other comments, but this one is a little tricky.
> > > >
> > > > We're kind of stuck between a rock and a hard place with this.  I had originally
> > > > put this before the ->fault() callback, but purposefully moved it into
> > > > filemap_fault() because I want to be able to drop the mmap lock while we're
> > > > waiting for a response from the HSM.
> > > >
> > > > The reason to do this is because there are things that take the mmap lock for
> > > > simple things outside of the process, like /proc/$PID/smaps and other related
> > > > things, and this can cause high priority tasks to block behind possibly low
> > > > priority IO, creating a priority inversion.
> > > >
> > > > Now, I'm not sure how widespread of a problem this is anymore, I know there's
> > > > been work done to the kernel and tools to avoid this style of problem.  I'm ok
> > > > with a "try it and see" approach, but I don't love that.
> > > >
> > > 
> > > I defer this question to Jan.
> > > 
> > > > However I think putting fsnotify hooks into XFS itself for this particular path
> > > > is a good choice either.
> > > 
> > > I think you meant "not a good choice" and I agree -
> > > it is not only xfs, but could be any fs that will be converted to iomap
> > > Other fs have ->fault != filemap_fault, even if they do end up calling
> > > filemap_fault, IOW, there is no API guarantee that they will.
> > > 
> > > > What do you think?  Just move it to before ->fault(),
> > > > leave the mmap lock in place, and be done with it?
> > > 
> > > If Jan blesses the hook called with mmap lock, then yeh,
> > > putting the hook in the most generic "vfs" code would be
> > > the best choice for maintenance.
> > 
> > Well, I agree with Josef's comment about a rock and a hard place. For once,
> > regardless whether the hook will happen from before ->fault or from inside
> > the ->fault handler there will be fault callers where we cannot drop
> > mmap_lock (not all archs support dropping mmap_lock inside a fault AFAIR -
> > but a quick grep seems to show that these days maybe they do, also some
> > callers - most notably through GUP - don't allow dropping of mmap_lock
> > inside fault). So we have to have a way to handle a fault without
> > FAULT_FLAG_ALLOW_RETRY flag.
> > 
> > Now of course waiting for userspace reply to fanotify event with mmap_lock
> > held is ... dangerous. For example consider application with two threads:
> > 
> > T1					T2
> > page fault on inode I			write to inode I
> >   lock mm->mmap_lock			  inode_lock(I)
> >     send fanotify event			  ...
> > 					  fault_in_iov_iter_readable()
> > 					    lock mm->mmap_lock -> blocks
> > 					      behind T1
> > 
> > now the HSM handler needs to fill in contents of inode I requested by the
> > page fault:
> > 
> >   inode_lock(I) -> deadlock
> > 
> > So conceptually I think the flow could look like (in __do_fault):
> > 
> > 	if (!(vmf->flags & FAULT_FLAG_TRIED) &&
> > 	    fsnotify_may_send_pre_content_event()) {
> > 		if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
> > 			return VM_FAULT_RETRY;
> > 		fpin = maybe_unlock_mmap_for_io(vmf, NULL);
> > 		if (!fpin)
> > 			return ???VM_FAULT_SIGSEGV???;
> > 		err = fsnotify_fault(...);
> > 		if (err)
> > 			return VM_FAULT_SIGBUS | VM_FAULT_RETRY;
> > 		/*
> > 		 * We are fine with proceeding with the fault. Retry the fault
> > 		 * to let the filesystem handle it.
> > 		 */
> > 		return VM_FAULT_RETRY;
> > 	}
> > 
> > The downside is that if we enter the page fault without ability to drop
> > mmap_lock on a file needing HSM handling, we get SIGSEGV. I'm not sure it
> > matters in practice because these are not that common paths e.g. stuff like
> > putting a breakpoint / uprobe on a file but maybe there are some surprises.
> > 
> 
> The only thing I don't like about this is that now the fault handler
> loses the ability to drop the mmap sem.  I think in practice this doesn't
> really matter, because we're trying to avoid doing IO while under the
> mmap sem, and presumably this will have instantiated the pages to avoid
> the IO.
> 
> However if you use reflink this wouldn't be the case, and now we've
> re-introduced the priority inversion possiblity.

Hum, you're right. I was focused on handling the case when HSM needs to
pull in the page but the common case when the page is local, just not in
cache, is also very important.

> I'm leaning more towards just putting the fsnotify hook in the xfs code
> for the write case, and anybody else who implements their own ->fault
> without calling the generic one will just have to do the same.
> 
> That being said I'm not going to die on any particular hill here, if you
> still want to do the above before the ->fault() handler then I'll update
> my code to do that and we can move on.  Thanks,

I think for now issuing the pre content event from ->fault is OK. I'm not
thrilled about it (since it ultimately means duplication among page fault
handlers) but I don't see a cleaner solution that would perform reasonably
well.

								Honza
Jan Kara Aug. 1, 2024, 9:40 p.m. UTC | #7
On Thu 25-07-24 14:19:47, Josef Bacik wrote:
> FS_PRE_ACCESS or FS_PRE_MODIFY will be generated on page fault depending
> on the faulting method.
> 
> This pre-content event is meant to be used by hierarchical storage
> managers that want to fill in the file content on first read access.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
...
> @@ -3287,6 +3288,35 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
>  	if (unlikely(index >= max_idx))
>  		return VM_FAULT_SIGBUS;
>  
> +	/*
> +	 * If we have pre-content watchers then we need to generate events on
> +	 * page fault so that we can populate any data before the fault.
> +	 *
> +	 * We only do this on the first pass through, otherwise the populating
> +	 * application could potentially deadlock on the mmap lock if it tries
> +	 * to populate it with mmap.
> +	 */
> +	if (fault_flag_allow_retry_first(vmf->flags) &&
> +	    fsnotify_file_has_content_watches(file)) {

I'm somewhat nervous that if ALLOW_RETRY isn't set, we'd silently jump into
readpage code without ever sending pre-content event and thus we'd possibly
expose invalid content to userspace? I think we should fail the fault if
fsnotify_file_has_content_watches(file) && !(vmf->flags &
FAULT_FLAG_ALLOW_RETRY).

> +		int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_READ;
> +		loff_t pos = vmf->pgoff << PAGE_SHIFT;
> +
> +		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
> +
> +		/*
> +		 * We can only emit the event if we did actually release the
> +		 * mmap lock.
> +		 */
> +		if (fpin) {
> +			error = fsnotify_file_area_perm(fpin, mask, &pos,
> +							PAGE_SIZE);
> +			if (error) {
> +				fput(fpin);
> +				return VM_FAULT_ERROR;
> +			}
> +		}
> +	}
> +
>  	/*
>  	 * Do we have something in the page cache already?
>  	 */
...
> @@ -3612,6 +3643,13 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
>  	unsigned long rss = 0;
>  	unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type;
>  
> +	/*
> +	 * We are under RCU, we can't emit events here, we need to force a
> +	 * normal fault to make sure the events get sent.
> +	 */
> +	if (fsnotify_file_has_content_watches(file))
> +		return ret;
> +

I don't think we need to do anything for filemap_map_pages(). The call just
inserts page cache content into page tables and whatever is in the page
cache and has folio_uptodate() set should be already valid file content,
shouldn't it?

								Honza
Josef Bacik Aug. 2, 2024, 4:03 p.m. UTC | #8
On Thu, Aug 01, 2024 at 11:40:25PM +0200, Jan Kara wrote:
> On Thu 25-07-24 14:19:47, Josef Bacik wrote:
> > FS_PRE_ACCESS or FS_PRE_MODIFY will be generated on page fault depending
> > on the faulting method.
> > 
> > This pre-content event is meant to be used by hierarchical storage
> > managers that want to fill in the file content on first read access.
> > 
> > Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ...
> > @@ -3287,6 +3288,35 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
> >  	if (unlikely(index >= max_idx))
> >  		return VM_FAULT_SIGBUS;
> >  
> > +	/*
> > +	 * If we have pre-content watchers then we need to generate events on
> > +	 * page fault so that we can populate any data before the fault.
> > +	 *
> > +	 * We only do this on the first pass through, otherwise the populating
> > +	 * application could potentially deadlock on the mmap lock if it tries
> > +	 * to populate it with mmap.
> > +	 */
> > +	if (fault_flag_allow_retry_first(vmf->flags) &&
> > +	    fsnotify_file_has_content_watches(file)) {
> 
> I'm somewhat nervous that if ALLOW_RETRY isn't set, we'd silently jump into
> readpage code without ever sending pre-content event and thus we'd possibly
> expose invalid content to userspace? I think we should fail the fault if
> fsnotify_file_has_content_watches(file) && !(vmf->flags &
> FAULT_FLAG_ALLOW_RETRY).

I was worried about this too but it seems to not be the case that we'll not ever
have ALLOW_RETRY.  That being said I'm fine turning this into a sigbus.

> 
> > +		int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_READ;
> > +		loff_t pos = vmf->pgoff << PAGE_SHIFT;
> > +
> > +		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
> > +
> > +		/*
> > +		 * We can only emit the event if we did actually release the
> > +		 * mmap lock.
> > +		 */
> > +		if (fpin) {
> > +			error = fsnotify_file_area_perm(fpin, mask, &pos,
> > +							PAGE_SIZE);
> > +			if (error) {
> > +				fput(fpin);
> > +				return VM_FAULT_ERROR;
> > +			}
> > +		}
> > +	}
> > +
> >  	/*
> >  	 * Do we have something in the page cache already?
> >  	 */
> ...
> > @@ -3612,6 +3643,13 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
> >  	unsigned long rss = 0;
> >  	unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type;
> >  
> > +	/*
> > +	 * We are under RCU, we can't emit events here, we need to force a
> > +	 * normal fault to make sure the events get sent.
> > +	 */
> > +	if (fsnotify_file_has_content_watches(file))
> > +		return ret;
> > +
> 
> I don't think we need to do anything for filemap_map_pages(). The call just
> inserts page cache content into page tables and whatever is in the page
> cache and has folio_uptodate() set should be already valid file content,
> shouldn't it?

I'll make this comment more clear.  filemap_fault() will start readahead, but
we'll only emit the event for the page size that we're faulting.  I had looked
at putting this at the readahead place and figuring out the readahead size, but
literally anything could trigger readahead so it's better to just not allow
filemap_map_pages() to happen, otherwise we'll end up with empty pages (if the
content hasn't been populated yet) and never emit an event for those ranges.
Thanks,

Josef
Jan Kara Aug. 5, 2024, 12:13 p.m. UTC | #9
On Fri 02-08-24 12:03:57, Josef Bacik wrote:
> On Thu, Aug 01, 2024 at 11:40:25PM +0200, Jan Kara wrote:
> > On Thu 25-07-24 14:19:47, Josef Bacik wrote:
> > > FS_PRE_ACCESS or FS_PRE_MODIFY will be generated on page fault depending
> > > on the faulting method.
> > > 
> > > This pre-content event is meant to be used by hierarchical storage
> > > managers that want to fill in the file content on first read access.
> > > 
> > > Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> > ...
> > > @@ -3287,6 +3288,35 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
> > >  	if (unlikely(index >= max_idx))
> > >  		return VM_FAULT_SIGBUS;
> > >  
> > > +	/*
> > > +	 * If we have pre-content watchers then we need to generate events on
> > > +	 * page fault so that we can populate any data before the fault.
> > > +	 *
> > > +	 * We only do this on the first pass through, otherwise the populating
> > > +	 * application could potentially deadlock on the mmap lock if it tries
> > > +	 * to populate it with mmap.
> > > +	 */
> > > +	if (fault_flag_allow_retry_first(vmf->flags) &&
> > > +	    fsnotify_file_has_content_watches(file)) {
> > 
> > I'm somewhat nervous that if ALLOW_RETRY isn't set, we'd silently jump into
> > readpage code without ever sending pre-content event and thus we'd possibly
> > expose invalid content to userspace? I think we should fail the fault if
> > fsnotify_file_has_content_watches(file) && !(vmf->flags &
> > FAULT_FLAG_ALLOW_RETRY).
> 
> I was worried about this too but it seems to not be the case that we'll not ever
> have ALLOW_RETRY.  That being said I'm fine turning this into a sigbus.

Do you mean that with your workloads we always have ALLOW_RETRY set? As I
wrote, currently you'd have to try really hard to hit such paths but they
are there - for example if you place uprobe on an address in a VMA that is
not present, the page fault is going to happen without ALLOW_RETRY set.

> > > +		int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_READ;
> > > +		loff_t pos = vmf->pgoff << PAGE_SHIFT;
> > > +
> > > +		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
> > > +
> > > +		/*
> > > +		 * We can only emit the event if we did actually release the
> > > +		 * mmap lock.
> > > +		 */
> > > +		if (fpin) {
> > > +			error = fsnotify_file_area_perm(fpin, mask, &pos,
> > > +							PAGE_SIZE);
> > > +			if (error) {
> > > +				fput(fpin);
> > > +				return VM_FAULT_ERROR;
> > > +			}
> > > +		}
> > > +	}
> > > +
> > >  	/*
> > >  	 * Do we have something in the page cache already?
> > >  	 */
> > ...
> > > @@ -3612,6 +3643,13 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
> > >  	unsigned long rss = 0;
> > >  	unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type;
> > >  
> > > +	/*
> > > +	 * We are under RCU, we can't emit events here, we need to force a
> > > +	 * normal fault to make sure the events get sent.
> > > +	 */
> > > +	if (fsnotify_file_has_content_watches(file))
> > > +		return ret;
> > > +
> > 
> > I don't think we need to do anything for filemap_map_pages(). The call just
> > inserts page cache content into page tables and whatever is in the page
> > cache and has folio_uptodate() set should be already valid file content,
> > shouldn't it?
> 
> I'll make this comment more clear.  filemap_fault() will start readahead,
> but we'll only emit the event for the page size that we're faulting.  I
> had looked at putting this at the readahead place and figuring out the
> readahead size, but literally anything could trigger readahead so it's
> better to just not allow filemap_map_pages() to happen, otherwise we'll
> end up with empty pages (if the content hasn't been populated yet) and
> never emit an event for those ranges.

This seems like an interesting problem. Even ordinary read(2) will trigger
readahead and as you say, we would be instantiating folios with wrong
content (zeros) due to that. It seems as a fragile design to keep such
folios in the page cache and place checks in all the places that could
possibly make their content visible to the user. I'd rather make sure that
if we pull folios into page cache (and set folio_uptodate() bit), their
content is indeed valid.

What we could do is to turn off readahead on the inode if
fsnotify_file_has_content_watches() is true. Essentially the handler of the
precontent event can do a much better job of prefilling the page cache with
whatever content is needed in a range that makes sense. And then we could
leave filemap_map_pages() intact. What do you think guys?

								Honza


> Thanks,
> 
> Josef
>
Josef Bacik Aug. 7, 2024, 7:04 p.m. UTC | #10
On Mon, Aug 05, 2024 at 02:13:49PM +0200, Jan Kara wrote:
> On Fri 02-08-24 12:03:57, Josef Bacik wrote:
> > On Thu, Aug 01, 2024 at 11:40:25PM +0200, Jan Kara wrote:
> > > On Thu 25-07-24 14:19:47, Josef Bacik wrote:
> > > > FS_PRE_ACCESS or FS_PRE_MODIFY will be generated on page fault depending
> > > > on the faulting method.
> > > > 
> > > > This pre-content event is meant to be used by hierarchical storage
> > > > managers that want to fill in the file content on first read access.
> > > > 
> > > > Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> > > ...
> > > > @@ -3287,6 +3288,35 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
> > > >  	if (unlikely(index >= max_idx))
> > > >  		return VM_FAULT_SIGBUS;
> > > >  
> > > > +	/*
> > > > +	 * If we have pre-content watchers then we need to generate events on
> > > > +	 * page fault so that we can populate any data before the fault.
> > > > +	 *
> > > > +	 * We only do this on the first pass through, otherwise the populating
> > > > +	 * application could potentially deadlock on the mmap lock if it tries
> > > > +	 * to populate it with mmap.
> > > > +	 */
> > > > +	if (fault_flag_allow_retry_first(vmf->flags) &&
> > > > +	    fsnotify_file_has_content_watches(file)) {
> > > 
> > > I'm somewhat nervous that if ALLOW_RETRY isn't set, we'd silently jump into
> > > readpage code without ever sending pre-content event and thus we'd possibly
> > > expose invalid content to userspace? I think we should fail the fault if
> > > fsnotify_file_has_content_watches(file) && !(vmf->flags &
> > > FAULT_FLAG_ALLOW_RETRY).
> > 
> > I was worried about this too but it seems to not be the case that we'll not ever
> > have ALLOW_RETRY.  That being said I'm fine turning this into a sigbus.
> 
> Do you mean that with your workloads we always have ALLOW_RETRY set? As I
> wrote, currently you'd have to try really hard to hit such paths but they
> are there - for example if you place uprobe on an address in a VMA that is
> not present, the page fault is going to happen without ALLOW_RETRY set.

From what I can tell we almost always have FOLL_UNLOCKABLE set, which is what
translates into ALLOW_RETRY.  There's definitely some paths that can get there,
but as far as what happens in a normal environment we're going to almost always
have ALLOW_RETRY set.

This does leave a hole in some corner cases.  I'm content to say "don't do that"
if you want to use these hooks.

Optionally we could add a FAN_PRE_MMAP hook in vm_mmap() for the range that is
being mmap'ed to make sure we never miss any events, and then applications can
decide if they want to risk it with the pagefault hooks or enable the mmap hooks
for absolute certainty.

> 
> > > > +		int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_READ;
> > > > +		loff_t pos = vmf->pgoff << PAGE_SHIFT;
> > > > +
> > > > +		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
> > > > +
> > > > +		/*
> > > > +		 * We can only emit the event if we did actually release the
> > > > +		 * mmap lock.
> > > > +		 */
> > > > +		if (fpin) {
> > > > +			error = fsnotify_file_area_perm(fpin, mask, &pos,
> > > > +							PAGE_SIZE);
> > > > +			if (error) {
> > > > +				fput(fpin);
> > > > +				return VM_FAULT_ERROR;
> > > > +			}
> > > > +		}
> > > > +	}
> > > > +
> > > >  	/*
> > > >  	 * Do we have something in the page cache already?
> > > >  	 */
> > > ...
> > > > @@ -3612,6 +3643,13 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
> > > >  	unsigned long rss = 0;
> > > >  	unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type;
> > > >  
> > > > +	/*
> > > > +	 * We are under RCU, we can't emit events here, we need to force a
> > > > +	 * normal fault to make sure the events get sent.
> > > > +	 */
> > > > +	if (fsnotify_file_has_content_watches(file))
> > > > +		return ret;
> > > > +
> > > 
> > > I don't think we need to do anything for filemap_map_pages(). The call just
> > > inserts page cache content into page tables and whatever is in the page
> > > cache and has folio_uptodate() set should be already valid file content,
> > > shouldn't it?
> > 
> > I'll make this comment more clear.  filemap_fault() will start readahead,
> > but we'll only emit the event for the page size that we're faulting.  I
> > had looked at putting this at the readahead place and figuring out the
> > readahead size, but literally anything could trigger readahead so it's
> > better to just not allow filemap_map_pages() to happen, otherwise we'll
> > end up with empty pages (if the content hasn't been populated yet) and
> > never emit an event for those ranges.
> 
> This seems like an interesting problem. Even ordinary read(2) will trigger
> readahead and as you say, we would be instantiating folios with wrong
> content (zeros) due to that. It seems as a fragile design to keep such
> folios in the page cache and place checks in all the places that could
> possibly make their content visible to the user. I'd rather make sure that
> if we pull folios into page cache (and set folio_uptodate() bit), their
> content is indeed valid.

The hook exists before we go looking in pagecache, so we're fine with read(2),
the only problem is mmap (AFAICT, I am not very smart after all).

> 
> What we could do is to turn off readahead on the inode if
> fsnotify_file_has_content_watches() is true. Essentially the handler of the
> precontent event can do a much better job of prefilling the page cache with
> whatever content is needed in a range that makes sense. And then we could
> leave filemap_map_pages() intact. What do you think guys?

I had considered this but decided against it because it seemed like a big
hammer, but if you're cool with it then so am I.  Thanks,

Josef
diff mbox series

Patch

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 1ca4a8da7f29..435232d46b4f 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -28,6 +28,19 @@  void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
 	fsnotify_clear_marks_by_mount(mnt);
 }
 
+bool fsnotify_file_has_content_watches(struct file *file)
+{
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
+	struct mount *mnt = real_mount(file->f_path.mnt);
+	u32 mask = inode->i_fsnotify_mask;
+
+	mask |= mnt->mnt_fsnotify_mask;
+	mask |= sb->s_fsnotify_mask;
+
+	return !!(mask & FSNOTIFY_PRE_CONTENT_EVENTS);
+}
+
 /**
  * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
  * @sb: superblock being unmounted.
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 36c3d18cc40a..6983fbf096b8 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -900,6 +900,15 @@  static inline void fsnotify_init_event(struct fsnotify_event *event)
 	INIT_LIST_HEAD(&event->list);
 }
 
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+bool fsnotify_file_has_content_watches(struct file *file);
+#else
+static inline bool fsnotify_file_has_content_watches(struct file *file)
+{
+	return false;
+}
+#endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */
+
 #else
 
 static inline int fsnotify(__u32 mask, const void *data, int data_type,
@@ -938,6 +947,11 @@  static inline u32 fsnotify_get_cookie(void)
 static inline void fsnotify_unmount_inodes(struct super_block *sb)
 {}
 
+static inline bool fsnotify_file_has_content_watches(struct file *file)
+{
+	return false;
+}
+
 #endif	/* CONFIG_FSNOTIFY */
 
 #endif	/* __KERNEL __ */
diff --git a/mm/filemap.c b/mm/filemap.c
index ca8c8d889eef..cc9d7885bbe3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -46,6 +46,7 @@ 
 #include <linux/pipe_fs_i.h>
 #include <linux/splice.h>
 #include <linux/rcupdate_wait.h>
+#include <linux/fsnotify.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -3112,13 +3113,13 @@  static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
  * that.  If we didn't pin a file then we return NULL.  The file that is
  * returned needs to be fput()'ed when we're done with it.
  */
-static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
+static struct file *do_sync_mmap_readahead(struct vm_fault *vmf,
+					   struct file *fpin)
 {
 	struct file *file = vmf->vma->vm_file;
 	struct file_ra_state *ra = &file->f_ra;
 	struct address_space *mapping = file->f_mapping;
 	DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
-	struct file *fpin = NULL;
 	unsigned long vm_flags = vmf->vma->vm_flags;
 	unsigned int mmap_miss;
 
@@ -3182,12 +3183,12 @@  static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
  * was pinned if we have to drop the mmap_lock in order to do IO.
  */
 static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
-					    struct folio *folio)
+					    struct folio *folio,
+					    struct file *fpin)
 {
 	struct file *file = vmf->vma->vm_file;
 	struct file_ra_state *ra = &file->f_ra;
 	DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
-	struct file *fpin = NULL;
 	unsigned int mmap_miss;
 
 	/* If we don't want any read-ahead, don't bother */
@@ -3287,6 +3288,35 @@  vm_fault_t filemap_fault(struct vm_fault *vmf)
 	if (unlikely(index >= max_idx))
 		return VM_FAULT_SIGBUS;
 
+	/*
+	 * If we have pre-content watchers then we need to generate events on
+	 * page fault so that we can populate any data before the fault.
+	 *
+	 * We only do this on the first pass through, otherwise the populating
+	 * application could potentially deadlock on the mmap lock if it tries
+	 * to populate it with mmap.
+	 */
+	if (fault_flag_allow_retry_first(vmf->flags) &&
+	    fsnotify_file_has_content_watches(file)) {
+		int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_READ;
+		loff_t pos = vmf->pgoff << PAGE_SHIFT;
+
+		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+
+		/*
+		 * We can only emit the event if we did actually release the
+		 * mmap lock.
+		 */
+		if (fpin) {
+			error = fsnotify_file_area_perm(fpin, mask, &pos,
+							PAGE_SIZE);
+			if (error) {
+				fput(fpin);
+				return VM_FAULT_ERROR;
+			}
+		}
+	}
+
 	/*
 	 * Do we have something in the page cache already?
 	 */
@@ -3297,7 +3327,7 @@  vm_fault_t filemap_fault(struct vm_fault *vmf)
 		 * the lock.
 		 */
 		if (!(vmf->flags & FAULT_FLAG_TRIED))
-			fpin = do_async_mmap_readahead(vmf, folio);
+			fpin = do_async_mmap_readahead(vmf, folio, fpin);
 		if (unlikely(!folio_test_uptodate(folio))) {
 			filemap_invalidate_lock_shared(mapping);
 			mapping_locked = true;
@@ -3311,7 +3341,7 @@  vm_fault_t filemap_fault(struct vm_fault *vmf)
 		count_vm_event(PGMAJFAULT);
 		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
 		ret = VM_FAULT_MAJOR;
-		fpin = do_sync_mmap_readahead(vmf);
+		fpin = do_sync_mmap_readahead(vmf, fpin);
 retry_find:
 		/*
 		 * See comment in filemap_create_folio() why we need
@@ -3604,6 +3634,7 @@  vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 	struct vm_area_struct *vma = vmf->vma;
 	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
 	pgoff_t last_pgoff = start_pgoff;
 	unsigned long addr;
 	XA_STATE(xas, &mapping->i_pages, start_pgoff);
@@ -3612,6 +3643,13 @@  vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 	unsigned long rss = 0;
 	unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type;
 
+	/*
+	 * We are under RCU, we can't emit events here, we need to force a
+	 * normal fault to make sure the events get sent.
+	 */
+	if (fsnotify_file_has_content_watches(file))
+		return ret;
+
 	rcu_read_lock();
 	folio = next_uptodate_folio(&xas, mapping, end_pgoff);
 	if (!folio)