
[v6,01/27] mm: Introduce struct folio

Message ID 20210331184728.1188084-2-willy@infradead.org
State New, archived
Series Memory Folios

Commit Message

Matthew Wilcox March 31, 2021, 6:47 p.m. UTC
A struct folio is a new abstraction to replace the venerable struct page.
A function which takes a struct folio argument declares that it will
operate on the entire (possibly compound) page, not just PAGE_SIZE bytes.
In return, the caller guarantees that the pointer it is passing does
not point to a tail page.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/mm.h       | 78 ++++++++++++++++++++++++++++++++++++++++
 include/linux/mm_types.h | 65 +++++++++++++++++++++++++++++++++
 mm/util.c                | 19 ++++++++++
 3 files changed, 162 insertions(+)
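
To make the calling convention concrete, a hypothetical caller might look
like this (an illustration, not code from this patch; folio_mark_dirty()
stands in for any function taking a struct folio, of the kind the full
series on git.infradead.org adds):

/* Hypothetical example.  The filesystem holds a struct page which may
 * be a tail page of a compound page. */
void fs_dirty_page(struct page *page)
{
	/* page_folio() (added below) always returns the head. */
	struct folio *folio = page_folio(page);

	/* The callee may assume the folio covers folio_size(folio)
	 * bytes and is never a tail page. */
	folio_mark_dirty(folio);
}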

Comments

Kirill A. Shutemov April 6, 2021, 12:29 p.m. UTC | #1
On Wed, Mar 31, 2021 at 07:47:02PM +0100, Matthew Wilcox (Oracle) wrote:
> +/**
> + * folio_next - Move to the next physical folio.
> + * @folio: The folio we're currently operating on.
> + *
> + * If you have physically contiguous memory which may span more than
> + * one folio (eg a &struct bio_vec), use this function to move from one
> + * folio to the next.  Do not use it if the memory is only virtually
> + * contiguous as the folios are almost certainly not adjacent to each
> + * other.  This is the folio equivalent to writing ``page++``.
> + *
> + * Context: We assume that the folios are refcounted and/or locked at a
> + * higher level and do not adjust the reference counts.
> + * Return: The next struct folio.
> + */
> +static inline struct folio *folio_next(struct folio *folio)
> +{
> +#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
> +	return (struct folio *)nth_page(&folio->page, folio_nr_pages(folio));
> +#else
> +	return folio + folio_nr_pages(folio);
> +#endif

Do we really need the #if here?

From a quick look at nth_page() and memory_model.h, the compiler should be
able to simplify the calculation for FLATMEM or SPARSEMEM_VMEMMAP to what
you do in the #else. No?
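
For reference, nth_page() at the time of this posting always goes through
the pfn, and with CONFIG_SPARSEMEM_VMEMMAP the definitions reduce roughly
to the sketch below (simplified, not verbatim kernel code; on x86-64,
vmemmap is based on the runtime variable vmemmap_base, visible in the
assembly in the next reply):

#define nth_page(page, n)	pfn_to_page(page_to_pfn((page)) + (n))

/* With CONFIG_SPARSEMEM_VMEMMAP, roughly: */
#define __page_to_pfn(page)	(unsigned long)((page) - vmemmap)
#define __pfn_to_page(pfn)	(vmemmap + (pfn))

The round trip converts the pointer to an integer pfn and back, which, as
the generated code in the next reply shows, the compiler does not fold
back into plain pointer arithmetic.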

> @@ -224,6 +224,71 @@ struct page {
>  #endif
>  } _struct_page_alignment;
>  
> +/**
> + * struct folio - Represents a contiguous set of bytes.
> + * @flags: Identical to the page flags.
> + * @lru: Least Recently Used list; tracks how recently this folio was used.
> + * @mapping: The file this page belongs to, or refers to the anon_vma for
> + *    anonymous pages.
> + * @index: Offset within the file, in units of pages.  For anonymous pages,
> + *    this is the index from the beginning of the mmap.
> + * @private: Filesystem per-folio data (see attach_folio_private()).
> + *    Used for swp_entry_t if FolioSwapCache().
> + * @_mapcount: How many times this folio is mapped to userspace.  Use
> + *    folio_mapcount() to access it.
> + * @_refcount: Number of references to this folio.  Use folio_ref_count()
> + *    to read it.
> + * @memcg_data: Memory Control Group data.
> + *
> + * A folio is a physically, virtually and logically contiguous set
> + * of bytes.  It is a power-of-two in size, and it is aligned to that
> + * same power-of-two.  It is at least as large as %PAGE_SIZE.  If it is
> + * in the page cache, it is at a file offset which is a multiple of that
> + * power-of-two.
> + */
> +struct folio {
> +	/* private: don't document the anon union */
> +	union {
> +		struct {
> +	/* public: */
> +			unsigned long flags;
> +			struct list_head lru;
> +			struct address_space *mapping;
> +			pgoff_t index;
> +			unsigned long private;
> +			atomic_t _mapcount;
> +			atomic_t _refcount;
> +#ifdef CONFIG_MEMCG
> +			unsigned long memcg_data;
> +#endif

Like Christoph, I'm not a fan of this :/

> +	/* private: the union with struct page is transitional */
> +		};
> +		struct page page;
> +	};
> +};
Matthew Wilcox April 6, 2021, 12:48 p.m. UTC | #2
On Tue, Apr 06, 2021 at 03:29:18PM +0300, Kirill A. Shutemov wrote:
> On Wed, Mar 31, 2021 at 07:47:02PM +0100, Matthew Wilcox (Oracle) wrote:
> > +/**
> > + * folio_next - Move to the next physical folio.
> > + * @folio: The folio we're currently operating on.
> > + *
> > + * If you have physically contiguous memory which may span more than
> > + * one folio (eg a &struct bio_vec), use this function to move from one
> > + * folio to the next.  Do not use it if the memory is only virtually
> > + * contiguous as the folios are almost certainly not adjacent to each
> > + * other.  This is the folio equivalent to writing ``page++``.
> > + *
> > + * Context: We assume that the folios are refcounted and/or locked at a
> > + * higher level and do not adjust the reference counts.
> > + * Return: The next struct folio.
> > + */
> > +static inline struct folio *folio_next(struct folio *folio)
> > +{
> > +#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
> > +	return (struct folio *)nth_page(&folio->page, folio_nr_pages(folio));
> > +#else
> > +	return folio + folio_nr_pages(folio);
> > +#endif
> 
> Do we really need the #if here?
> 
> From a quick look at nth_page() and memory_model.h, the compiler should be
> able to simplify the calculation for FLATMEM or SPARSEMEM_VMEMMAP to what
> you do in the #else. No?

No.

0000000000001180 <a>:
struct page *a(struct page *p, unsigned long n)
{
    1180:       e8 00 00 00 00          callq  1185 <a+0x5>
                        1181: R_X86_64_PLT32    __fentry__-0x4
    1185:       55                      push   %rbp
        return nth_page(p, n);
    1186:       48 2b 3d 00 00 00 00    sub    0x0(%rip),%rdi
                        1189: R_X86_64_PC32     vmemmap_base-0x4
    118d:       48 c1 ff 06             sar    $0x6,%rdi
    1191:       48 8d 04 37             lea    (%rdi,%rsi,1),%rax
    1195:       48 89 e5                mov    %rsp,%rbp
        return nth_page(p, n);
    1198:       48 c1 e0 06             shl    $0x6,%rax
    119c:       48 03 05 00 00 00 00    add    0x0(%rip),%rax
                        119f: R_X86_64_PC32     vmemmap_base-0x4
    11a3:       5d                      pop    %rbp
    11a4:       c3                      retq   

vs

00000000000011b0 <b>:

struct page *b(struct page *p, unsigned long n)
{
    11b0:       e8 00 00 00 00          callq  11b5 <b+0x5>
                        11b1: R_X86_64_PLT32    __fentry__-0x4
    11b5:       55                      push   %rbp
        return p + n;
    11b6:       48 c1 e6 06             shl    $0x6,%rsi
    11ba:       48 8d 04 37             lea    (%rdi,%rsi,1),%rax
    11be:       48 89 e5                mov    %rsp,%rbp
    11c1:       5d                      pop    %rbp
    11c2:       c3                      retq   

Now, maybe we should put this optimisation into the definition of nth_page?
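
Presumably something along these lines (a sketch of the suggestion, not
code from this patch), pushing the #if down into nth_page() so that
folio_next() can call it unconditionally:

#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define nth_page(page, n)	pfn_to_page(page_to_pfn((page)) + (n))
#else
#define nth_page(page, n)	((page) + (n))
#endif

/* folio_next() then loses its own #if: */
static inline struct folio *folio_next(struct folio *folio)
{
	return (struct folio *)nth_page(&folio->page, folio_nr_pages(folio));
}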

> > +struct folio {
> > +	/* private: don't document the anon union */
> > +	union {
> > +		struct {
> > +	/* public: */
> > +			unsigned long flags;
> > +			struct list_head lru;
> > +			struct address_space *mapping;
> > +			pgoff_t index;
> > +			unsigned long private;
> > +			atomic_t _mapcount;
> > +			atomic_t _refcount;
> > +#ifdef CONFIG_MEMCG
> > +			unsigned long memcg_data;
> > +#endif
> 
> Like Christoph, I'm not a fan of this :/

What would you prefer?
Kirill A. Shutemov April 6, 2021, 2:21 p.m. UTC | #3
On Tue, Apr 06, 2021 at 01:48:07PM +0100, Matthew Wilcox wrote:
> Now, maybe we should put this optimisation into the definition of nth_page?

Sounds like a good idea to me.

> > > +struct folio {
> > > +	/* private: don't document the anon union */
> > > +	union {
> > > +		struct {
> > > +	/* public: */
> > > +			unsigned long flags;
> > > +			struct list_head lru;
> > > +			struct address_space *mapping;
> > > +			pgoff_t index;
> > > +			unsigned long private;
> > > +			atomic_t _mapcount;
> > > +			atomic_t _refcount;
> > > +#ifdef CONFIG_MEMCG
> > > +			unsigned long memcg_data;
> > > +#endif
> > 
> > Like Christoph, I'm not a fan of this :/
> 
> What would you prefer?

I liked the earlier approach with only struct page here. Once we know a field
should never be referenced from a raw struct page, we can move it here.

But feel free to ignore my suggestion. It's not a show-stopper for me and
reverting it back isn't worth it.

I went through the patchset and it looks good. You can use my

  Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>

on all of them.

Thanks a lot for doing this.
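
For reference, the earlier approach Kirill prefers kept struct folio as a
bare wrapper, with every field access spelled out through ->page; roughly:

struct folio {
	struct page page;
};

/* i.e. folio->page.mapping rather than folio->mapping, until a field
 * is known to never be referenced from a raw struct page. */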
Christoph Hellwig April 6, 2021, 2:31 p.m. UTC | #4
On Tue, Apr 06, 2021 at 01:48:07PM +0100, Matthew Wilcox wrote:
> Now, maybe we should put this optimisation into the definition of nth_page?

That would be nice.

> > Like Christoph, I'm not a fan of this :/
> 
> What would you prefer?

Looking at your full folio series on git.infradead.org, there are a
total of 12 references to non-page members of struct folio, assuming
my crude grep that expects a folio to be named folio did not miss any.

Except for one that prints folio->flags in cachefiles code, and which
should go away, they are all in core MM code in mm/ or include/.  With
enough file system conversions I do see potential uses for ->mapping
and ->index outside of core code, but IMHO we can ignore those for now
and just switch them over if/when we actually change the struct folio
internals to split them from tail pages.

So my opinion is:  leave these fields out for now, and when the problem
that we'd have a lot of references out of core code arises, deal with it
once we know the scope.  Maybe we add wrappers for the few members that
are reasonably "public", maybe we then do the union trick you have here
because it is the least evil, or maybe we just do not do anything at all
until these fields move over to the folio entirely.
Matthew Wilcox April 6, 2021, 2:40 p.m. UTC | #5
On Tue, Apr 06, 2021 at 03:31:50PM +0100, Christoph Hellwig wrote:
> > > Like Christoph, I'm not a fan of this :/
> > 
> > What would you prefer?
> 
> Looking at your full folio series on git.infradead.org, there are a
> total of 12 references to non-page members of struct folio, assuming
> my crude grep that expects a folio to be named folio did not miss any.

Hmm ... I count more in the filesystems:

fs/afs/dir.c:   struct afs_vnode *dvnode = AFS_FS_I(folio->page.mapping->host);
fs/afs/dir.c:   _enter("{%lu},%zu,%zu", folio->page.index, offset, length);
fs/afs/file.c:  _enter("{%lu},%zu,%zu", folio->page.index, offset, length);
fs/afs/write.c:         folio->page.index);
fs/befs/linuxvfs.c:     struct inode *inode = folio->page.mapping->host;
fs/btrfs/disk-io.c:     tree = &BTRFS_I(folio->page.mapping->host)->io_tree;
fs/btrfs/disk-io.c:             btrfs_warn(BTRFS_I(folio->page.mapping->host)->root->fs_info,
fs/btrfs/extent_io.c:   struct btrfs_inode *inode = BTRFS_I(folios[0]->page.mapping->host);
fs/btrfs/file.c:                if (folio->page.mapping != inode->i_mapping) {
fs/btrfs/free-space-cache.c:                    if (folio->page.mapping != inode->i_mapping) {
fs/btrfs/inode.c:               if (folio->page.mapping != mapping) {
fs/btrfs/inode.c:       struct btrfs_inode *inode = BTRFS_I(folio->page.mapping->host);
fs/buffer.c:    spin_lock(&folio->page.mapping->private_lock);
fs/buffer.c:    spin_unlock(&folio->page.mapping->private_lock);
fs/buffer.c:    block_in_file = (sector_t)folio->page.index <<
fs/ceph/addr.c:              mapping->host, folio, folio->page.index);
fs/ceph/addr.c:      mapping->host, folio, folio->page.index,
fs/ceph/addr.c: folio->page.private = (unsigned long)snapc;
fs/ceph/addr.c: inode = folio->page.mapping->host;
fs/ceph/addr.c:              inode, folio, folio->page.index, offset, length);
fs/ceph/addr.c:      inode, folio, folio->page.index);
fs/cifs/file.c: struct cifsInodeInfo *cifsi = CIFS_I(folio->page.mapping->host);
fs/ext4/inode.c:        struct inode *inode = folio->page.mapping->host;
fs/f2fs/data.c: struct inode *inode = folio->page.mapping->host;
fs/fuse/dir.c:  int err = fuse_readlink_page(folio->page.mapping->host, &folio->page);
fs/gfs2/aops.c: struct gfs2_sbd *sdp = GFS2_SB(folio->page.mapping->host);
fs/iomap/buffered-io.c: unsigned int nr_blocks = i_blocks_per_folio(folio->page.mapping->host,
fs/iomap/buffered-io.c: struct inode *inode = folio->page.mapping->host;
fs/iomap/buffered-io.c: BUG_ON(folio->page.index);
fs/iomap/buffered-io.c:         gfp_t gfp = mapping_gfp_constraint(folio->page.mapping,
fs/iomap/buffered-io.c: struct inode *inode = folio->page.mapping->host;
fs/iomap/buffered-io.c: struct inode *inode = folio->page.mapping->host;
fs/iomap/buffered-io.c: trace_iomap_releasepage(folio->page.mapping->host, folio_offset(folio),
fs/iomap/buffered-io.c: trace_iomap_invalidatepage(folio->page.mapping->host, offset, len);
fs/jffs2/file.c:        struct inode *inode = folio->page.mapping->host;
fs/mpage.c:     struct inode *inode = folio->page.mapping->host;
fs/mpage.c:             gfp = readahead_gfp_mask(folio->page.mapping);
fs/mpage.c:             gfp = mapping_gfp_constraint(folio->page.mapping, GFP_KERNEL);
fs/mpage.c:     block_in_file = (sector_t)folio->page.index << (PAGE_SHIFT - blkbits);
fs/mpage.c:             prefetchw(&folio->page.flags);
fs/nfs/file.c:  nfs_fscache_invalidate_page(&folio->page, folio->page.mapping->host);
fs/nfs/fscache.c:                nfs_i_fscache(inode), folio, folio->page.index,
fs/nfs/fscache.c:                folio->page.flags, inode);
fs/reiserfs/inode.c:    struct inode *inode = folio->page.mapping->host;
fs/remap_range.c:       if (folio1->page.index > folio2->page.index)
fs/ubifs/file.c:        struct inode *inode = folio->page.mapping->host;
fs/xfs/xfs_aops.c:      struct inode            *inode = folio->page.mapping->host;

(I didn't go through my whole series and do the conversion from
folio->page.x to folio->x yet)
Christoph Hellwig April 6, 2021, 2:47 p.m. UTC | #6
On Tue, Apr 06, 2021 at 03:40:22PM +0100, Matthew Wilcox wrote:
> On Tue, Apr 06, 2021 at 03:31:50PM +0100, Christoph Hellwig wrote:
> > > > Like Christoph, I'm not a fan of this :/
> > > 
> > > What would you prefer?
> > 
> > Looking at your full folio series on git.infradead.org, there are a
> > total of 12 references to non-page members of struct folio, assuming
> > my crude grep that expects a folio to be named folio did not miss any.
> 
> Hmm ... I count more in the filesystems:

I only counted the ones that you actually did convert.

This adds about 80 more.  IMHO still not worth doing the union.  I'd
rather sort this out properly if/when the structures get properly split.
Matthew Wilcox April 6, 2021, 2:55 p.m. UTC | #7
On Tue, Apr 06, 2021 at 03:47:12PM +0100, Christoph Hellwig wrote:
> On Tue, Apr 06, 2021 at 03:40:22PM +0100, Matthew Wilcox wrote:
> > On Tue, Apr 06, 2021 at 03:31:50PM +0100, Christoph Hellwig wrote:
> > > > > Like Christoph, I'm not a fan of this :/
> > > > 
> > > > What would you prefer?
> > > 
> > > Looking at your full folio series on git.infradead.org, there are a
> > > total of 12 references to non-page members of struct folio, assuming
> > > my crude grep that expects a folio to be named folio did not miss any.
> > 
> > Hmm ... I count more in the filesystems:
> 
> I only counted the ones that you actually did convert.
> 
> This add about 80 more.  IMHO still not worth doing the union.  I'd
> rather sort this out properl if/when the structures get properly split.

Assuming we're getting rid of them all though, we have to include:

$ git grep 'page->mapping' fs |wc -l
358
$ git grep 'page->index' fs |wc -l
355
Christoph Hellwig April 6, 2021, 3:05 p.m. UTC | #8
On Tue, Apr 06, 2021 at 03:55:11PM +0100, Matthew Wilcox wrote:
> Assuming we're getting rid of them all though, we have to include:
> 
> $ git grep 'page->mapping' fs |wc -l
> 358
> $ git grep 'page->index' fs |wc -l
> 355

Are they all going to stay?  Or are we going to clean up some of that
mess?  A lot of ->index should be page_offset, and on the mapping side
the page_mapping and page_file_mapping mess is also waiting to be
sorted.
Matthew Wilcox April 6, 2021, 4:25 p.m. UTC | #9
On Tue, Apr 06, 2021 at 04:05:50PM +0100, Christoph Hellwig wrote:
> On Tue, Apr 06, 2021 at 03:55:11PM +0100, Matthew Wilcox wrote:
> > Assuming we're getting rid of them all though, we have to include:
> > 
> > $ git grep 'page->mapping' fs |wc -l
> > 358
> > $ git grep 'page->index' fs |wc -l
> > 355
> 
> Are they all going to stay?  Or are we going to clean up some of that
> mess?  A lot of ->index should be page_offset, and on the mapping side
> the page_mapping and page_file_mapping mess is also waiting to be
> sorted.

About a third of ->index can be folio_offset(), based on a crude:

$ git grep 'page->index.*PAGE_' |wc -l
101

and I absolutely don't mind cleaning that up as part of the folio work,
but that still leaves 200-250 instances that would need to be changed
later.

I don't want to change the page->mapping to calls to folio_mapping().
That's a lot of extra work for a page which the filesystem knows belongs
to it.  folio_mapping() only needs to be used for pages which might not
belong to a filesystem.

page_file_mapping() absolutely needs to go away.  The way to do that
is to change swap-over-nfs to use direct IO, and then NFS can use
folio->mapping like all other filesystems.  f2fs is just terminally
confused and shouldn't be using page_file_mapping at all.  I'll fix
that as part of the folio work.
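
The ->index conversion described above would look roughly like this
(a sketch; folio_offset() is the helper the series already uses in
fs/iomap above, presumably defined along these lines):

static inline loff_t folio_offset(struct folio *folio)
{
	return (loff_t)folio->index << PAGE_SHIFT;
}

/* Before: */
loff_t pos = (loff_t)page->index << PAGE_SHIFT;
/* After: */
loff_t pos = folio_offset(folio);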
Christoph Hellwig April 7, 2021, 6:09 a.m. UTC | #10
On Tue, Apr 06, 2021 at 05:25:30PM +0100, Matthew Wilcox wrote:
> About a third of ->index can be folio_offset(), based on a crude:
> 
> $ git grep 'page->index.*PAGE_' |wc -l
> 101
> 
> and I absolutely don't mind cleaning that up as part of the folio work,
> but that still leaves 200-250 instances that would need to be changed
> later.
> 
> I don't want to change the page->mapping to calls to folio_mapping().
> That's a lot of extra work for a page which the filesystem knows belongs
> to it.  folio_mapping() only needs to be used for pages which might not
> belong to a filesystem.
> 
> page_file_mapping() absolutely needs to go away.  The way to do that
> is to change swap-over-nfs to use direct IO, and then NFS can use
> folio->mapping like all other filesystems.  f2fs is just terminally
> confused and shouldn't be using page_file_mapping at all.  I'll fix
> that as part of the folio work.

Thanks.  So my opinion for now remains: preferably just don't add
the union, and dereference through the page.  But I'm not going to block
the series for it, as I think it is a huge and badly needed cleanup,
required to make further use of larger pages / large chunks of memory
in the pagecache and the file systems.
Rasmus Villemoes April 8, 2021, 9:01 a.m. UTC | #11
On 31/03/2021 20.47, Matthew Wilcox (Oracle) wrote:

> +static inline void folio_build_bug(void)
> +{
> +#define FOLIO_MATCH(pg, fl)						\
> +BUILD_BUG_ON(offsetof(struct page, pg) != offsetof(struct folio, fl));
> +
> +	FOLIO_MATCH(flags, flags);
> +	FOLIO_MATCH(lru, lru);
> +	FOLIO_MATCH(mapping, mapping);
> +	FOLIO_MATCH(index, index);
> +	FOLIO_MATCH(private, private);
> +	FOLIO_MATCH(_mapcount, _mapcount);
> +	FOLIO_MATCH(_refcount, _refcount);
> +#ifdef CONFIG_MEMCG
> +	FOLIO_MATCH(memcg_data, memcg_data);
> +#endif
> +#undef FOLIO_MATCH
> +	BUILD_BUG_ON(sizeof(struct page) != sizeof(struct folio));
> +}
> +

Perhaps do this next to the definition of struct folio instead of hiding
it in some arbitrary TU - hint, we have static_assert() that doesn't
need to be in function context. And consider amending FOLIO_MATCH with a
static_assert(__same_type(typeof_member(...), typeof_member(...))).

Rasmus
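
Concretely, that suggestion would move the checks to file scope next to
the definition of struct folio in mm_types.h, roughly as follows (a
sketch, including the type check Rasmus mentions; not part of this patch):

#define FOLIO_MATCH(pg, fl)						\
	static_assert(offsetof(struct page, pg) ==			\
		      offsetof(struct folio, fl));			\
	static_assert(__same_type(typeof_member(struct page, pg),	\
				  typeof_member(struct folio, fl)))
FOLIO_MATCH(flags, flags);
FOLIO_MATCH(lru, lru);
FOLIO_MATCH(mapping, mapping);
FOLIO_MATCH(index, index);
FOLIO_MATCH(private, private);
FOLIO_MATCH(_mapcount, _mapcount);
FOLIO_MATCH(_refcount, _refcount);
#ifdef CONFIG_MEMCG
FOLIO_MATCH(memcg_data, memcg_data);
#endif
#undef FOLIO_MATCH
static_assert(sizeof(struct page) == sizeof(struct folio));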

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3e4dc6678eb2..761063e733bf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -936,6 +936,20 @@  static inline unsigned int compound_order(struct page *page)
 	return page[1].compound_order;
 }
 
+/**
+ * folio_order - The allocation order of a folio.
+ * @folio: The folio.
+ *
+ * A folio is composed of 2^order pages.  See get_order() for the definition
+ * of order.
+ *
+ * Return: The order of the folio.
+ */
+static inline unsigned int folio_order(struct folio *folio)
+{
+	return compound_order(&folio->page);
+}
+
 static inline bool hpage_pincount_available(struct page *page)
 {
 	/*
@@ -1581,6 +1595,69 @@  static inline void set_page_links(struct page *page, enum zone_type zone,
 #endif
 }
 
+/**
+ * folio_nr_pages - The number of pages in the folio.
+ * @folio: The folio.
+ *
+ * Return: A number which is a power of two.
+ */
+static inline unsigned long folio_nr_pages(struct folio *folio)
+{
+	return compound_nr(&folio->page);
+}
+
+/**
+ * folio_next - Move to the next physical folio.
+ * @folio: The folio we're currently operating on.
+ *
+ * If you have physically contiguous memory which may span more than
+ * one folio (eg a &struct bio_vec), use this function to move from one
+ * folio to the next.  Do not use it if the memory is only virtually
+ * contiguous as the folios are almost certainly not adjacent to each
+ * other.  This is the folio equivalent to writing ``page++``.
+ *
+ * Context: We assume that the folios are refcounted and/or locked at a
+ * higher level and do not adjust the reference counts.
+ * Return: The next struct folio.
+ */
+static inline struct folio *folio_next(struct folio *folio)
+{
+#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+	return (struct folio *)nth_page(&folio->page, folio_nr_pages(folio));
+#else
+	return folio + folio_nr_pages(folio);
+#endif
+}
+
+/**
+ * folio_shift - The number of bits covered by this folio.
+ * @folio: The folio.
+ *
+ * A folio contains a number of bytes which is a power-of-two in size.
+ * This function tells you which power-of-two the folio is.
+ *
+ * Context: The caller should have a reference on the folio to prevent
+ * it from being split.  It is not necessary for the folio to be locked.
+ * Return: The base-2 logarithm of the size of this folio.
+ */
+static inline unsigned int folio_shift(struct folio *folio)
+{
+	return PAGE_SHIFT + folio_order(folio);
+}
+
+/**
+ * folio_size - The number of bytes in a folio.
+ * @folio: The folio.
+ *
+ * Context: The caller should have a reference on the folio to prevent
+ * it from being split.  It is not necessary for the folio to be locked.
+ * Return: The number of bytes in this folio.
+ */
+static inline size_t folio_size(struct folio *folio)
+{
+	return PAGE_SIZE << folio_order(folio);
+}
+
 /*
  * Some inline functions in vmstat.h depend on page_zone()
  */
@@ -1685,6 +1762,7 @@  extern void pagefault_out_of_memory(void);
 
 #define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)
 #define offset_in_thp(page, p)	((unsigned long)(p) & (thp_size(page) - 1))
+#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))
 
 /*
  * Flags passed to show_mem() and show_free_areas() to suppress output in
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6613b26a8894..a0c7894fad1d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -224,6 +224,71 @@  struct page {
 #endif
 } _struct_page_alignment;
 
+/**
+ * struct folio - Represents a contiguous set of bytes.
+ * @flags: Identical to the page flags.
+ * @lru: Least Recently Used list; tracks how recently this folio was used.
+ * @mapping: The file this page belongs to, or refers to the anon_vma for
+ *    anonymous pages.
+ * @index: Offset within the file, in units of pages.  For anonymous pages,
+ *    this is the index from the beginning of the mmap.
+ * @private: Filesystem per-folio data (see attach_folio_private()).
+ *    Used for swp_entry_t if FolioSwapCache().
+ * @_mapcount: How many times this folio is mapped to userspace.  Use
+ *    folio_mapcount() to access it.
+ * @_refcount: Number of references to this folio.  Use folio_ref_count()
+ *    to read it.
+ * @memcg_data: Memory Control Group data.
+ *
+ * A folio is a physically, virtually and logically contiguous set
+ * of bytes.  It is a power-of-two in size, and it is aligned to that
+ * same power-of-two.  It is at least as large as %PAGE_SIZE.  If it is
+ * in the page cache, it is at a file offset which is a multiple of that
+ * power-of-two.
+ */
+struct folio {
+	/* private: don't document the anon union */
+	union {
+		struct {
+	/* public: */
+			unsigned long flags;
+			struct list_head lru;
+			struct address_space *mapping;
+			pgoff_t index;
+			unsigned long private;
+			atomic_t _mapcount;
+			atomic_t _refcount;
+#ifdef CONFIG_MEMCG
+			unsigned long memcg_data;
+#endif
+	/* private: the union with struct page is transitional */
+		};
+		struct page page;
+	};
+};
+
+/**
+ * page_folio - Converts from page to folio.
+ * @page: The page.
+ *
+ * Every page is part of a folio.  This function cannot be called on a
+ * NULL pointer.
+ *
+ * Context: No reference, nor lock is required on @page.  If the caller
+ * does not hold a reference, this call may race with a folio split, so
+ * it should re-check the folio still contains this page after gaining
+ * a reference on the folio.
+ * Return: The folio which contains this page.
+ */
+static inline struct folio *page_folio(struct page *page)
+{
+	unsigned long head = READ_ONCE(page->compound_head);
+
+	if (unlikely(head & 1))
+		return (struct folio *)(head - 1);
+	return (struct folio *)page;
+}
+
 static inline atomic_t *compound_mapcount_ptr(struct page *page)
 {
 	return &page[1].compound_mapcount;
diff --git a/mm/util.c b/mm/util.c
index 0b6dd9d81da7..521a772f06eb 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -686,6 +686,25 @@  struct anon_vma *page_anon_vma(struct page *page)
 	return __page_rmapping(page);
 }
 
+static inline void folio_build_bug(void)
+{
+#define FOLIO_MATCH(pg, fl)						\
+BUILD_BUG_ON(offsetof(struct page, pg) != offsetof(struct folio, fl));
+
+	FOLIO_MATCH(flags, flags);
+	FOLIO_MATCH(lru, lru);
+	FOLIO_MATCH(mapping, mapping);
+	FOLIO_MATCH(index, index);
+	FOLIO_MATCH(private, private);
+	FOLIO_MATCH(_mapcount, _mapcount);
+	FOLIO_MATCH(_refcount, _refcount);
+#ifdef CONFIG_MEMCG
+	FOLIO_MATCH(memcg_data, memcg_data);
+#endif
+#undef FOLIO_MATCH
+	BUILD_BUG_ON(sizeof(struct page) != sizeof(struct folio));
+}
+
 struct address_space *page_mapping(struct page *page)
 {
 	struct address_space *mapping;
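
Taken together, the new helpers compose naturally.  A hypothetical user
(not part of this patch) walking physically contiguous memory one folio
at a time:

/* Total the bytes of n physically contiguous folios, e.g. the memory
 * described by a &struct bio_vec. */
static size_t folios_size(struct folio *folio, unsigned long n)
{
	size_t bytes = 0;

	while (n--) {
		bytes += folio_size(folio);	/* PAGE_SIZE << folio_order() */
		folio = folio_next(folio);	/* the folio analogue of page++ */
	}
	return bytes;
}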