diff mbox series

[v5,03/13] mm/shmem: Support memfile_notifier

Message ID 20220310140911.50924-4-chao.p.peng@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series KVM: mm: fd-based approach for supporting KVM guest private memory | expand

Commit Message

Chao Peng March 10, 2022, 2:09 p.m. UTC
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>

Maintain a memfile_notifier list in the shmem_inode_info structure and
implement the memfile_pfn_ops callbacks defined by memfile_notifier.
Expose them to memfile_notifier by registering shmem_backing_store via
memfile_register_backing_store().

We use SGP_NOALLOC in shmem_get_lock_pfn since the pages should be
allocated by userspace for private memory. If no page is allocated at
the offset, an error should be returned so that KVM knows the memory
is not private memory.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
 include/linux/shmem_fs.h |  4 +++
 mm/shmem.c               | 76 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+)

Comments

Dave Chinner March 10, 2022, 11:08 p.m. UTC | #1
On Thu, Mar 10, 2022 at 10:09:01PM +0800, Chao Peng wrote:
> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> 
> It maintains a memfile_notifier list in shmem_inode_info structure and
> implements memfile_pfn_ops callbacks defined by memfile_notifier. It
> then exposes them to memfile_notifier via
> shmem_get_memfile_notifier_info.
> 
> We use SGP_NOALLOC in shmem_get_lock_pfn since the pages should be
> allocated by userspace for private memory. If there is no pages
> allocated at the offset then error should be returned so KVM knows that
> the memory is not private memory.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> ---
>  include/linux/shmem_fs.h |  4 +++
>  mm/shmem.c               | 76 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 80 insertions(+)
> 
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index 2dde843f28ef..7bb16f2d2825 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -9,6 +9,7 @@
>  #include <linux/percpu_counter.h>
>  #include <linux/xattr.h>
>  #include <linux/fs_parser.h>
> +#include <linux/memfile_notifier.h>
>  
>  /* inode in-kernel data */
>  
> @@ -28,6 +29,9 @@ struct shmem_inode_info {
>  	struct simple_xattrs	xattrs;		/* list of xattrs */
>  	atomic_t		stop_eviction;	/* hold when working on inode */
>  	unsigned int		xflags;		/* shmem extended flags */
> +#ifdef CONFIG_MEMFILE_NOTIFIER
> +	struct memfile_notifier_list memfile_notifiers;
> +#endif
>  	struct inode		vfs_inode;
>  };
>  
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 9b31a7056009..7b43e274c9a2 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -903,6 +903,28 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
>  	return page ? page_folio(page) : NULL;
>  }
>  
> +static void notify_fallocate(struct inode *inode, pgoff_t start, pgoff_t end)
> +{
> +#ifdef CONFIG_MEMFILE_NOTIFIER
> +	struct shmem_inode_info *info = SHMEM_I(inode);
> +
> +	memfile_notifier_fallocate(&info->memfile_notifiers, start, end);
> +#endif
> +}

*notify_populate(), not fallocate.  This is a notification that a
range has been populated, not that the fallocate() syscall was run
to populate the backing store of a file.

i.e.  fallocate is the name of a userspace filesystem API that can
be used to manipulate the backing store of a file in various ways.
It can both populate and punch away the backing store of a file, and
some operations that fallocate() can run will do both (e.g.
FALLOC_FL_ZERO_RANGE) and so could generate both
notify_invalidate() and a notify_populate() events.

Hence "fallocate" as an internal mm namespace or operation does not
belong anywhere in core MM infrastructure - it should never get used
anywhere other than the VFS/filesystem layers that implement the
fallocate() syscall or use it directly.

Cheers,

Dave.
Chao Peng March 11, 2022, 8:42 a.m. UTC | #2
On Fri, Mar 11, 2022 at 10:08:22AM +1100, Dave Chinner wrote:
> On Thu, Mar 10, 2022 at 10:09:01PM +0800, Chao Peng wrote:
> > From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> > 
> > It maintains a memfile_notifier list in shmem_inode_info structure and
> > implements memfile_pfn_ops callbacks defined by memfile_notifier. It
> > then exposes them to memfile_notifier via
> > shmem_get_memfile_notifier_info.
> > 
> > We use SGP_NOALLOC in shmem_get_lock_pfn since the pages should be
> > allocated by userspace for private memory. If there is no pages
> > allocated at the offset then error should be returned so KVM knows that
> > the memory is not private memory.
> > 
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> > ---
> >  include/linux/shmem_fs.h |  4 +++
> >  mm/shmem.c               | 76 ++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 80 insertions(+)
> > 
> > diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> > index 2dde843f28ef..7bb16f2d2825 100644
> > --- a/include/linux/shmem_fs.h
> > +++ b/include/linux/shmem_fs.h
> > @@ -9,6 +9,7 @@
> >  #include <linux/percpu_counter.h>
> >  #include <linux/xattr.h>
> >  #include <linux/fs_parser.h>
> > +#include <linux/memfile_notifier.h>
> >  
> >  /* inode in-kernel data */
> >  
> > @@ -28,6 +29,9 @@ struct shmem_inode_info {
> >  	struct simple_xattrs	xattrs;		/* list of xattrs */
> >  	atomic_t		stop_eviction;	/* hold when working on inode */
> >  	unsigned int		xflags;		/* shmem extended flags */
> > +#ifdef CONFIG_MEMFILE_NOTIFIER
> > +	struct memfile_notifier_list memfile_notifiers;
> > +#endif
> >  	struct inode		vfs_inode;
> >  };
> >  
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 9b31a7056009..7b43e274c9a2 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -903,6 +903,28 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
> >  	return page ? page_folio(page) : NULL;
> >  }
> >  
> > +static void notify_fallocate(struct inode *inode, pgoff_t start, pgoff_t end)
> > +{
> > +#ifdef CONFIG_MEMFILE_NOTIFIER
> > +	struct shmem_inode_info *info = SHMEM_I(inode);
> > +
> > +	memfile_notifier_fallocate(&info->memfile_notifiers, start, end);
> > +#endif
> > +}
> 
> *notify_populate(), not fallocate.  This is a notification that a
> range has been populated, not that the fallocate() syscall was run
> to populate the backing store of a file.
> 
> i.e.  fallocate is the name of a userspace filesystem API that can
> be used to manipulate the backing store of a file in various ways.
> It can both populate and punch away the backing store of a file, and
> some operations that fallocate() can run will do both (e.g.
> FALLOC_FL_ZERO_RANGE) and so could generate both
> notify_invalidate() and a notify_populate() events.

Yes, I fully agree: the fallocate syscall has both populating and
hole-punching semantics, so notify_fallocate is misleading since we
actually mean populate here.

> 
> Hence "fallocate" as an internal mm namespace or operation does not
> belong anywhere in core MM infrastructure - it should never get used
> anywhere other than the VFS/filesystem layers that implement the
> fallocate() syscall or use it directly.

Will apply your suggestion throughout the series where applicable.
Thanks for your suggestion.

Chao
> 
> Cheers,
> 
> Dave.
> 
> -- 
> Dave Chinner
> david@fromorbit.com
Kirill A. Shutemov April 11, 2022, 3:26 p.m. UTC | #3
On Thu, Mar 10, 2022 at 10:09:01PM +0800, Chao Peng wrote:
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 9b31a7056009..7b43e274c9a2 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -903,6 +903,28 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
>  	return page ? page_folio(page) : NULL;
>  }
>  
> +static void notify_fallocate(struct inode *inode, pgoff_t start, pgoff_t end)
> +{
> +#ifdef CONFIG_MEMFILE_NOTIFIER
> +	struct shmem_inode_info *info = SHMEM_I(inode);
> +
> +	memfile_notifier_fallocate(&info->memfile_notifiers, start, end);
> +#endif

All these #ifdefs look ugly. Could you provide dummy memfile_* stubs
for the !MEMFILE_NOTIFIER case?
Chao Peng April 12, 2022, 1:12 p.m. UTC | #4
On Mon, Apr 11, 2022 at 06:26:47PM +0300, Kirill A. Shutemov wrote:
> On Thu, Mar 10, 2022 at 10:09:01PM +0800, Chao Peng wrote:
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 9b31a7056009..7b43e274c9a2 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -903,6 +903,28 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
> >  	return page ? page_folio(page) : NULL;
> >  }
> >  
> > +static void notify_fallocate(struct inode *inode, pgoff_t start, pgoff_t end)
> > +{
> > +#ifdef CONFIG_MEMFILE_NOTIFIER
> > +	struct shmem_inode_info *info = SHMEM_I(inode);
> > +
> > +	memfile_notifier_fallocate(&info->memfile_notifiers, start, end);
> > +#endif
> 
> All these #ifdefs look ugly. Could you provide dummy memfile_* for
> !MEMFILE_NOTIFIER case?
Sure.

Chao
> 
> -- 
>  Kirill A. Shutemov
Vishal Annapurve April 19, 2022, 10:40 p.m. UTC | #5
On Thu, Mar 10, 2022 at 6:10 AM Chao Peng <chao.p.peng@linux.intel.com> wrote:
>
> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
>
> It maintains a memfile_notifier list in shmem_inode_info structure and
> implements memfile_pfn_ops callbacks defined by memfile_notifier. It
> then exposes them to memfile_notifier via
> shmem_get_memfile_notifier_info.
>
> We use SGP_NOALLOC in shmem_get_lock_pfn since the pages should be
> allocated by userspace for private memory. If there is no pages
> allocated at the offset then error should be returned so KVM knows that
> the memory is not private memory.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> ---
>  include/linux/shmem_fs.h |  4 +++
>  mm/shmem.c               | 76 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 80 insertions(+)
>
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index 2dde843f28ef..7bb16f2d2825 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -9,6 +9,7 @@
>  #include <linux/percpu_counter.h>
>  #include <linux/xattr.h>
>  #include <linux/fs_parser.h>
> +#include <linux/memfile_notifier.h>
>
>  /* inode in-kernel data */
>
> @@ -28,6 +29,9 @@ struct shmem_inode_info {
>         struct simple_xattrs    xattrs;         /* list of xattrs */
>         atomic_t                stop_eviction;  /* hold when working on inode */
>         unsigned int            xflags;         /* shmem extended flags */
> +#ifdef CONFIG_MEMFILE_NOTIFIER
> +       struct memfile_notifier_list memfile_notifiers;
> +#endif
>         struct inode            vfs_inode;
>  };
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 9b31a7056009..7b43e274c9a2 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -903,6 +903,28 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
>         return page ? page_folio(page) : NULL;
>  }
>
> +static void notify_fallocate(struct inode *inode, pgoff_t start, pgoff_t end)
> +{
> +#ifdef CONFIG_MEMFILE_NOTIFIER
> +       struct shmem_inode_info *info = SHMEM_I(inode);
> +
> +       memfile_notifier_fallocate(&info->memfile_notifiers, start, end);
> +#endif
> +}
> +
> +static void notify_invalidate_page(struct inode *inode, struct folio *folio,
> +                                  pgoff_t start, pgoff_t end)
> +{
> +#ifdef CONFIG_MEMFILE_NOTIFIER
> +       struct shmem_inode_info *info = SHMEM_I(inode);
> +
> +       start = max(start, folio->index);
> +       end = min(end, folio->index + folio_nr_pages(folio));
> +
> +       memfile_notifier_invalidate(&info->memfile_notifiers, start, end);
> +#endif
> +}
> +
>  /*
>   * Remove range of pages and swap entries from page cache, and free them.
>   * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
> @@ -946,6 +968,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
>                         }
>                         index += folio_nr_pages(folio) - 1;
>
> +                       notify_invalidate_page(inode, folio, start, end);
> +
>                         if (!unfalloc || !folio_test_uptodate(folio))
>                                 truncate_inode_folio(mapping, folio);
>                         folio_unlock(folio);
> @@ -1019,6 +1043,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
>                                         index--;
>                                         break;
>                                 }
> +
> +                               notify_invalidate_page(inode, folio, start, end);
> +

Should this be done in batches or done once for all of range [start, end)?

>                                 VM_BUG_ON_FOLIO(folio_test_writeback(folio),
>                                                 folio);
>                                 truncate_inode_folio(mapping, folio);
> @@ -2279,6 +2306,9 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
>                 info->flags = flags & VM_NORESERVE;
>                 INIT_LIST_HEAD(&info->shrinklist);
>                 INIT_LIST_HEAD(&info->swaplist);
> +#ifdef CONFIG_MEMFILE_NOTIFIER
> +               memfile_notifier_list_init(&info->memfile_notifiers);
> +#endif
>                 simple_xattrs_init(&info->xattrs);
>                 cache_no_acl(inode);
>                 mapping_set_large_folios(inode->i_mapping);
> @@ -2802,6 +2832,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
>         if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
>                 i_size_write(inode, offset + len);
>         inode->i_ctime = current_time(inode);
> +       notify_fallocate(inode, start, end);
>  undone:
>         spin_lock(&inode->i_lock);
>         inode->i_private = NULL;
> @@ -3909,6 +3940,47 @@ static struct file_system_type shmem_fs_type = {
>         .fs_flags       = FS_USERNS_MOUNT,
>  };
>
> +#ifdef CONFIG_MEMFILE_NOTIFIER
> +static long shmem_get_lock_pfn(struct inode *inode, pgoff_t offset, int *order)
> +{
> +       struct page *page;
> +       int ret;
> +
> +       ret = shmem_getpage(inode, offset, &page, SGP_NOALLOC);
> +       if (ret)
> +               return ret;
> +
> +       *order = thp_order(compound_head(page));
> +
> +       return page_to_pfn(page);
> +}
> +
> +static void shmem_put_unlock_pfn(unsigned long pfn)
> +{
> +       struct page *page = pfn_to_page(pfn);
> +
> +       VM_BUG_ON_PAGE(!PageLocked(page), page);
> +
> +       set_page_dirty(page);
> +       unlock_page(page);
> +       put_page(page);
> +}
> +
> +static struct memfile_notifier_list* shmem_get_notifier_list(struct inode *inode)
> +{
> +       if (!shmem_mapping(inode->i_mapping))
> +               return NULL;
> +
> +       return  &SHMEM_I(inode)->memfile_notifiers;
> +}
> +
> +static struct memfile_backing_store shmem_backing_store = {
> +       .pfn_ops.get_lock_pfn = shmem_get_lock_pfn,
> +       .pfn_ops.put_unlock_pfn = shmem_put_unlock_pfn,
> +       .get_notifier_list = shmem_get_notifier_list,
> +};
> +#endif /* CONFIG_MEMFILE_NOTIFIER */
> +
>  int __init shmem_init(void)
>  {
>         int error;
> @@ -3934,6 +4006,10 @@ int __init shmem_init(void)
>         else
>                 shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
>  #endif
> +
> +#ifdef CONFIG_MEMFILE_NOTIFIER
> +       memfile_register_backing_store(&shmem_backing_store);
> +#endif
>         return 0;
>
>  out1:
> --
> 2.17.1
>
Chao Peng April 20, 2022, 3:24 a.m. UTC | #6
On Tue, Apr 19, 2022 at 03:40:09PM -0700, Vishal Annapurve wrote:
> On Thu, Mar 10, 2022 at 6:10 AM Chao Peng <chao.p.peng@linux.intel.com> wrote:
> >
> > From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> >
> > It maintains a memfile_notifier list in shmem_inode_info structure and
> > implements memfile_pfn_ops callbacks defined by memfile_notifier. It
> > then exposes them to memfile_notifier via
> > shmem_get_memfile_notifier_info.
> >
> > We use SGP_NOALLOC in shmem_get_lock_pfn since the pages should be
> > allocated by userspace for private memory. If there is no pages
> > allocated at the offset then error should be returned so KVM knows that
> > the memory is not private memory.
> >
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> > ---
> >  include/linux/shmem_fs.h |  4 +++
> >  mm/shmem.c               | 76 ++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 80 insertions(+)
> >
> > diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> > index 2dde843f28ef..7bb16f2d2825 100644
> > --- a/include/linux/shmem_fs.h
> > +++ b/include/linux/shmem_fs.h
> > @@ -9,6 +9,7 @@
> >  #include <linux/percpu_counter.h>
> >  #include <linux/xattr.h>
> >  #include <linux/fs_parser.h>
> > +#include <linux/memfile_notifier.h>
> >
> >  /* inode in-kernel data */
> >
> > @@ -28,6 +29,9 @@ struct shmem_inode_info {
> >         struct simple_xattrs    xattrs;         /* list of xattrs */
> >         atomic_t                stop_eviction;  /* hold when working on inode */
> >         unsigned int            xflags;         /* shmem extended flags */
> > +#ifdef CONFIG_MEMFILE_NOTIFIER
> > +       struct memfile_notifier_list memfile_notifiers;
> > +#endif
> >         struct inode            vfs_inode;
> >  };
> >
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 9b31a7056009..7b43e274c9a2 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -903,6 +903,28 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
> >         return page ? page_folio(page) : NULL;
> >  }
> >
> > +static void notify_fallocate(struct inode *inode, pgoff_t start, pgoff_t end)
> > +{
> > +#ifdef CONFIG_MEMFILE_NOTIFIER
> > +       struct shmem_inode_info *info = SHMEM_I(inode);
> > +
> > +       memfile_notifier_fallocate(&info->memfile_notifiers, start, end);
> > +#endif
> > +}
> > +
> > +static void notify_invalidate_page(struct inode *inode, struct folio *folio,
> > +                                  pgoff_t start, pgoff_t end)
> > +{
> > +#ifdef CONFIG_MEMFILE_NOTIFIER
> > +       struct shmem_inode_info *info = SHMEM_I(inode);
> > +
> > +       start = max(start, folio->index);
> > +       end = min(end, folio->index + folio_nr_pages(folio));
> > +
> > +       memfile_notifier_invalidate(&info->memfile_notifiers, start, end);
> > +#endif
> > +}
> > +
> >  /*
> >   * Remove range of pages and swap entries from page cache, and free them.
> >   * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
> > @@ -946,6 +968,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
> >                         }
> >                         index += folio_nr_pages(folio) - 1;
> >
> > +                       notify_invalidate_page(inode, folio, start, end);
> > +
> >                         if (!unfalloc || !folio_test_uptodate(folio))
> >                                 truncate_inode_folio(mapping, folio);
> >                         folio_unlock(folio);
> > @@ -1019,6 +1043,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
> >                                         index--;
> >                                         break;
> >                                 }
> > +
> > +                               notify_invalidate_page(inode, folio, start, end);
> > +
> 
> Should this be done in batches or done once for all of range [start, end)?

Batching is definitely preferred. Will look at that.

Thanks,
Chao
> 
> >                                 VM_BUG_ON_FOLIO(folio_test_writeback(folio),
> >                                                 folio);
> >                                 truncate_inode_folio(mapping, folio);
> > @@ -2279,6 +2306,9 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
> >                 info->flags = flags & VM_NORESERVE;
> >                 INIT_LIST_HEAD(&info->shrinklist);
> >                 INIT_LIST_HEAD(&info->swaplist);
> > +#ifdef CONFIG_MEMFILE_NOTIFIER
> > +               memfile_notifier_list_init(&info->memfile_notifiers);
> > +#endif
> >                 simple_xattrs_init(&info->xattrs);
> >                 cache_no_acl(inode);
> >                 mapping_set_large_folios(inode->i_mapping);
> > @@ -2802,6 +2832,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
> >         if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
> >                 i_size_write(inode, offset + len);
> >         inode->i_ctime = current_time(inode);
> > +       notify_fallocate(inode, start, end);
> >  undone:
> >         spin_lock(&inode->i_lock);
> >         inode->i_private = NULL;
> > @@ -3909,6 +3940,47 @@ static struct file_system_type shmem_fs_type = {
> >         .fs_flags       = FS_USERNS_MOUNT,
> >  };
> >
> > +#ifdef CONFIG_MEMFILE_NOTIFIER
> > +static long shmem_get_lock_pfn(struct inode *inode, pgoff_t offset, int *order)
> > +{
> > +       struct page *page;
> > +       int ret;
> > +
> > +       ret = shmem_getpage(inode, offset, &page, SGP_NOALLOC);
> > +       if (ret)
> > +               return ret;
> > +
> > +       *order = thp_order(compound_head(page));
> > +
> > +       return page_to_pfn(page);
> > +}
> > +
> > +static void shmem_put_unlock_pfn(unsigned long pfn)
> > +{
> > +       struct page *page = pfn_to_page(pfn);
> > +
> > +       VM_BUG_ON_PAGE(!PageLocked(page), page);
> > +
> > +       set_page_dirty(page);
> > +       unlock_page(page);
> > +       put_page(page);
> > +}
> > +
> > +static struct memfile_notifier_list* shmem_get_notifier_list(struct inode *inode)
> > +{
> > +       if (!shmem_mapping(inode->i_mapping))
> > +               return NULL;
> > +
> > +       return  &SHMEM_I(inode)->memfile_notifiers;
> > +}
> > +
> > +static struct memfile_backing_store shmem_backing_store = {
> > +       .pfn_ops.get_lock_pfn = shmem_get_lock_pfn,
> > +       .pfn_ops.put_unlock_pfn = shmem_put_unlock_pfn,
> > +       .get_notifier_list = shmem_get_notifier_list,
> > +};
> > +#endif /* CONFIG_MEMFILE_NOTIFIER */
> > +
> >  int __init shmem_init(void)
> >  {
> >         int error;
> > @@ -3934,6 +4006,10 @@ int __init shmem_init(void)
> >         else
> >                 shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
> >  #endif
> > +
> > +#ifdef CONFIG_MEMFILE_NOTIFIER
> > +       memfile_register_backing_store(&shmem_backing_store);
> > +#endif
> >         return 0;
> >
> >  out1:
> > --
> > 2.17.1
> >
diff mbox series

Patch

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 2dde843f28ef..7bb16f2d2825 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -9,6 +9,7 @@ 
 #include <linux/percpu_counter.h>
 #include <linux/xattr.h>
 #include <linux/fs_parser.h>
+#include <linux/memfile_notifier.h>
 
 /* inode in-kernel data */
 
@@ -28,6 +29,9 @@  struct shmem_inode_info {
 	struct simple_xattrs	xattrs;		/* list of xattrs */
 	atomic_t		stop_eviction;	/* hold when working on inode */
 	unsigned int		xflags;		/* shmem extended flags */
+#ifdef CONFIG_MEMFILE_NOTIFIER
+	struct memfile_notifier_list memfile_notifiers;
+#endif
 	struct inode		vfs_inode;
 };
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 9b31a7056009..7b43e274c9a2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -903,6 +903,28 @@  static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
 	return page ? page_folio(page) : NULL;
 }
 
+static void notify_fallocate(struct inode *inode, pgoff_t start, pgoff_t end)
+{
+#ifdef CONFIG_MEMFILE_NOTIFIER
+	struct shmem_inode_info *info = SHMEM_I(inode);
+
+	memfile_notifier_fallocate(&info->memfile_notifiers, start, end);
+#endif
+}
+
+static void notify_invalidate_page(struct inode *inode, struct folio *folio,
+				   pgoff_t start, pgoff_t end)
+{
+#ifdef CONFIG_MEMFILE_NOTIFIER
+	struct shmem_inode_info *info = SHMEM_I(inode);
+
+	start = max(start, folio->index);
+	end = min(end, folio->index + folio_nr_pages(folio));
+
+	memfile_notifier_invalidate(&info->memfile_notifiers, start, end);
+#endif
+}
+
 /*
  * Remove range of pages and swap entries from page cache, and free them.
  * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
@@ -946,6 +968,8 @@  static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			}
 			index += folio_nr_pages(folio) - 1;
 
+			notify_invalidate_page(inode, folio, start, end);
+
 			if (!unfalloc || !folio_test_uptodate(folio))
 				truncate_inode_folio(mapping, folio);
 			folio_unlock(folio);
@@ -1019,6 +1043,9 @@  static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 					index--;
 					break;
 				}
+
+				notify_invalidate_page(inode, folio, start, end);
+
 				VM_BUG_ON_FOLIO(folio_test_writeback(folio),
 						folio);
 				truncate_inode_folio(mapping, folio);
@@ -2279,6 +2306,9 @@  static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 		info->flags = flags & VM_NORESERVE;
 		INIT_LIST_HEAD(&info->shrinklist);
 		INIT_LIST_HEAD(&info->swaplist);
+#ifdef CONFIG_MEMFILE_NOTIFIER
+		memfile_notifier_list_init(&info->memfile_notifiers);
+#endif
 		simple_xattrs_init(&info->xattrs);
 		cache_no_acl(inode);
 		mapping_set_large_folios(inode->i_mapping);
@@ -2802,6 +2832,7 @@  static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
 		i_size_write(inode, offset + len);
 	inode->i_ctime = current_time(inode);
+	notify_fallocate(inode, start, end);
 undone:
 	spin_lock(&inode->i_lock);
 	inode->i_private = NULL;
@@ -3909,6 +3940,47 @@  static struct file_system_type shmem_fs_type = {
 	.fs_flags	= FS_USERNS_MOUNT,
 };
 
+#ifdef CONFIG_MEMFILE_NOTIFIER
+static long shmem_get_lock_pfn(struct inode *inode, pgoff_t offset, int *order)
+{
+	struct page *page;
+	int ret;
+
+	ret = shmem_getpage(inode, offset, &page, SGP_NOALLOC);
+	if (ret)
+		return ret;
+
+	*order = thp_order(compound_head(page));
+
+	return page_to_pfn(page);
+}
+
+static void shmem_put_unlock_pfn(unsigned long pfn)
+{
+	struct page *page = pfn_to_page(pfn);
+
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+	set_page_dirty(page);
+	unlock_page(page);
+	put_page(page);
+}
+
+static struct memfile_notifier_list* shmem_get_notifier_list(struct inode *inode)
+{
+	if (!shmem_mapping(inode->i_mapping))
+		return NULL;
+
+	return  &SHMEM_I(inode)->memfile_notifiers;
+}
+
+static struct memfile_backing_store shmem_backing_store = {
+	.pfn_ops.get_lock_pfn = shmem_get_lock_pfn,
+	.pfn_ops.put_unlock_pfn = shmem_put_unlock_pfn,
+	.get_notifier_list = shmem_get_notifier_list,
+};
+#endif /* CONFIG_MEMFILE_NOTIFIER */
+
 int __init shmem_init(void)
 {
 	int error;
@@ -3934,6 +4006,10 @@  int __init shmem_init(void)
 	else
 		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
 #endif
+
+#ifdef CONFIG_MEMFILE_NOTIFIER
+	memfile_register_backing_store(&shmem_backing_store);
+#endif
 	return 0;
 
 out1: