[v4,01/25] mm: Introduce struct folio

Message ID 20210305041901.2396498-2-willy@infradead.org (mailing list archive)
State New, archived
Series Page folios

Commit Message

Matthew Wilcox March 5, 2021, 4:18 a.m. UTC
A struct folio refers to an entire (possibly compound) page.  A function
which takes a struct folio argument declares that it will operate on the
entire compound page, not just PAGE_SIZE bytes.  In return, the caller
guarantees that the pointer it is passing does not point to a tail page.
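
As a purely illustrative sketch of that contract (the helper below is
hypothetical and not part of this patch; page_address() assumes a lowmem
mapping for simplicity):

static void folio_zero_whole(struct folio *folio)
{
        /*
         * Free to touch all folio_size() bytes, not just PAGE_SIZE.
         * A caller holding a struct page converts it first, e.g.
         * folio_zero_whole(page_folio(page)), and must never pass a
         * tail page.
         */
        memset(page_address(&folio->page), 0, folio_size(folio));
}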

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/mm.h       | 30 ++++++++++++++++++++++++++++++
 include/linux/mm_types.h | 17 +++++++++++++++++
 2 files changed, 47 insertions(+)

Comments

Andrew Morton March 13, 2021, 8:37 p.m. UTC | #1
On Fri,  5 Mar 2021 04:18:37 +0000 "Matthew Wilcox (Oracle)" <willy@infradead.org> wrote:

> A struct folio refers to an entire (possibly compound) page.  A function
> which takes a struct folio argument declares that it will operate on the
> entire compound page, not just PAGE_SIZE bytes.  In return, the caller
> guarantees that the pointer it is passing does not point to a tail page.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  include/linux/mm.h       | 30 ++++++++++++++++++++++++++++++
>  include/linux/mm_types.h | 17 +++++++++++++++++

Perhaps a new folio.h would be neater.

> @@ -1518,6 +1523,30 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
>  #endif
>  }
>  
> +static inline unsigned long folio_nr_pages(struct folio *folio)
> +{
> +	return compound_nr(&folio->page);
> +}
> +
> +static inline struct folio *next_folio(struct folio *folio)
> +{
> +#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
> +	return (struct folio *)nth_page(&folio->page, folio_nr_pages(folio));
> +#else
> +	return folio + folio_nr_pages(folio);
> +#endif
> +}

It's a shame this isn't called folio_something(), like the rest of the API.

Unclear what this does.  Some comments would help.

> +static inline unsigned int folio_shift(struct folio *folio)
> +{
> +	return PAGE_SHIFT + folio_order(folio);
> +}
> +
> +static inline size_t folio_size(struct folio *folio)
> +{
> +	return PAGE_SIZE << folio_order(folio);
> +}

Why size_t?  That's pretty rare in this space and I'd have expected
unsigned long.


> @@ -1623,6 +1652,7 @@ extern void pagefault_out_of_memory(void);
>  
>  #define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)
>  #define offset_in_thp(page, p)	((unsigned long)(p) & (thp_size(page) - 1))
> +#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))
>  
>  /*
>   * Flags passed to show_mem() and show_free_areas() to suppress output in
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 0974ad501a47..a311cb48526f 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -223,6 +223,23 @@ struct page {
>  #endif
>  } _struct_page_alignment;
>  
> +/*
> + * A struct folio is either a base (order-0) page or the head page of
> + * a compound page.
> + */
> +struct folio {
> +	struct page page;
> +};
> +
> +static inline struct folio *page_folio(struct page *page)
> +{
> +	unsigned long head = READ_ONCE(page->compound_head);
> +
> +	if (unlikely(head & 1))
> +		return (struct folio *)(head - 1);
> +	return (struct folio *)page;
> +}

What purpose does the READ_ONCE() serve?
Matthew Wilcox March 14, 2021, 4:07 a.m. UTC | #2
On Sat, Mar 13, 2021 at 12:37:02PM -0800, Andrew Morton wrote:
> On Fri,  5 Mar 2021 04:18:37 +0000 "Matthew Wilcox (Oracle)" <willy@infradead.org> wrote:
> 
> > A struct folio refers to an entire (possibly compound) page.  A function
> > which takes a struct folio argument declares that it will operate on the
> > entire compound page, not just PAGE_SIZE bytes.  In return, the caller
> > guarantees that the pointer it is passing does not point to a tail page.
> > 
> > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > ---
> >  include/linux/mm.h       | 30 ++++++++++++++++++++++++++++++
> >  include/linux/mm_types.h | 17 +++++++++++++++++
> 
> Perhaps a new folio.h would be neater.

I understand that urge (I'm no fan of the size of mm.h ...), but folio.h
ends up pretty interwoven with mm.h.  For example:

static inline struct zone *folio_zone(const struct folio *folio)
{
        return page_zone(&folio->page);
}

so a folio.h would need both the struct folio definition and page_zone().
page_zone() is defined in mm.h, so we'd have folio.h including mm.h.
But then put_page() calls put_folio(), so we need folio.h included
in mm.h.
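
For illustration, the shape of that interdependency is roughly the
following (a sketch of how a later patch in this series wires it up,
not necessarily the exact code):

static inline void put_folio(struct folio *folio)
{
        if (put_page_testzero(&folio->page))
                __put_page(&folio->page);
}

static inline void put_page(struct page *page)
{
        /* needs page_folio() from mm_types.h and put_folio() from above */
        put_folio(page_folio(page));
}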

The patch series I have does a lot of movement of page cache functionality
from mm.h to filemap.h, so you may end up with a smaller mm.h at the
end of it.  Right now, it's 10 lines longer than it was.

> > +static inline struct folio *next_folio(struct folio *folio)
> > +{
> > +#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
> > +	return (struct folio *)nth_page(&folio->page, folio_nr_pages(folio));
> > +#else
> > +	return folio + folio_nr_pages(folio);
> > +#endif
> > +}
> 
> It's a shame this isn't called folio_something(), like the rest of the API.
> 
> Unclear what this does.  Some comments would help.

folio_next() it can be.  I'll add some commentary.
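
Something along these lines, perhaps (a hypothetical sketch of the
rename plus kernel-doc; the eventual wording may differ):

/**
 * folio_next - Move to the next physical folio.
 * @folio: The folio we're currently operating on.
 *
 * If the folio is part of a contiguous range of pages, this returns the
 * folio that starts immediately after it in memory.  The caller is
 * responsible for knowing that the next folio is valid to access.
 */
static inline struct folio *folio_next(struct folio *folio)
{
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
        /* struct pages may not be virtually contiguous in this config */
        return (struct folio *)nth_page(&folio->page, folio_nr_pages(folio));
#else
        return folio + folio_nr_pages(folio);
#endif
}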

> > +static inline unsigned int folio_shift(struct folio *folio)
> > +{
> > +	return PAGE_SHIFT + folio_order(folio);
> > +}
> > +
> > +static inline size_t folio_size(struct folio *folio)
> > +{
> > +	return PAGE_SIZE << folio_order(folio);
> > +}
> 
> Why size_t?  That's pretty rare in this space and I'd have expected
> unsigned long.

I like to use size_t for things which are the number of bytes represented
by an in-memory object, as opposed to all the other things which we use
unsigned long for.  Maybe that's more common on the filesystem side of
the house.

> > +static inline struct folio *page_folio(struct page *page)
> > +{
> > +	unsigned long head = READ_ONCE(page->compound_head);
> > +
> > +	if (unlikely(head & 1))
> > +		return (struct folio *)(head - 1);
> > +	return (struct folio *)page;
> > +}
> 
> What purpose does the READ_ONCE() serve?

Same purpose as it does in compound_head():

static inline struct page *compound_head(struct page *page)
{
        unsigned long head = READ_ONCE(page->compound_head);

        if (unlikely(head & 1))
                return (struct page *) (head - 1);
        return page;
}

I think Kirill would say that it's to defend against splitting if we
don't have a refcount on the page yet.  So if we do something like walk
the page tables, find a PTE, translate it to a struct page, then try to
get a reference on the head page, we don't end up with an incoherent
answer from 'compound_head()' if it's split in the middle of the call
and the page->compound_head field gets assigned some other value.

It might return the wrong page, so get_user_pages() still has to check
that the page is the right one after it has taken the reference, but at
least this way it's guaranteed to return something that was right at one
time.

There might be more to it, but that's my understanding of why the code
is currently written this way.
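
A simplified, hypothetical sketch of that pattern (the real code lives
in mm/gup.c and is considerably more careful):

static struct page *try_grab_pte_head(pte_t pte)
{
        struct page *page = pte_page(pte);
        struct page *head = compound_head(page);        /* may race with a split */

        if (!get_page_unless_zero(head))
                return NULL;

        /* A split may have changed which head this page belongs to. */
        if (unlikely(compound_head(page) != head)) {
                put_page(head);
                return NULL;
        }
        return page;
}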
Christoph Hellwig March 17, 2021, 5:14 p.m. UTC | #3
On Fri, Mar 05, 2021 at 04:18:37AM +0000, Matthew Wilcox (Oracle) wrote:
> +/*
> + * A struct folio is either a base (order-0) page or the head page of
> + * a compound page.
> + */

Hmm.  While that comment seems to be true, I'm not sure it is the
essence.  Maybe it should be framed more in terms of

"A folio represents a contiguously allocated chunk of memory..."

and then extended with the categories of state and operations performed
on the folio as those get added.  The above statement can still remain
as a low-level explanation, maybe moved to the page member instead of
the type itself.
Education Directorate March 18, 2021, 11:56 p.m. UTC | #4
On Fri, Mar 05, 2021 at 04:18:37AM +0000, Matthew Wilcox (Oracle) wrote:
> A struct folio refers to an entire (possibly compound) page.  A function
> which takes a struct folio argument declares that it will operate on the
> entire compound page, not just PAGE_SIZE bytes.  In return, the caller
> guarantees that the pointer it is passing does not point to a tail page.
>

Is this a part of a larger use case or general cleanup/refactor where
the split between page and folio simplifies programming?

Balbir Singh.
Matthew Wilcox March 19, 2021, 1:25 a.m. UTC | #5
On Fri, Mar 19, 2021 at 10:56:45AM +1100, Balbir Singh wrote:
> On Fri, Mar 05, 2021 at 04:18:37AM +0000, Matthew Wilcox (Oracle) wrote:
> > A struct folio refers to an entire (possibly compound) page.  A function
> > which takes a struct folio argument declares that it will operate on the
> > entire compound page, not just PAGE_SIZE bytes.  In return, the caller
> > guarantees that the pointer it is passing does not point to a tail page.
> >
> 
> Is this a part of a larger use case or general cleanup/refactor where
> the split between page and folio simplifies programming?

The goal here is to manage memory in larger chunks.  Pages are now too
small for just about every workload.  Even compiling the kernel sees a 7%
performance improvement just by doing readahead using relatively small
THPs (16k-256k).  You can see that work here:
https://git.infradead.org/users/willy/pagecache.git/shortlog/refs/heads/master

I think Kirill, Hugh and others have done a fantastic job stretching
the page struct to work in shmem, but we really need a different type
to avoid people writing code that _looks_ right but is actually buggy.
So I'm starting again, this time with the folio metaphor.
Education Directorate March 20, 2021, 2:09 a.m. UTC | #6
On Fri, Mar 19, 2021 at 01:25:27AM +0000, Matthew Wilcox wrote:
> On Fri, Mar 19, 2021 at 10:56:45AM +1100, Balbir Singh wrote:
> > On Fri, Mar 05, 2021 at 04:18:37AM +0000, Matthew Wilcox (Oracle) wrote:
> > > A struct folio refers to an entire (possibly compound) page.  A function
> > > which takes a struct folio argument declares that it will operate on the
> > > entire compound page, not just PAGE_SIZE bytes.  In return, the caller
> > > guarantees that the pointer it is passing does not point to a tail page.
> > >
> > 
> > Is this a part of a larger use case or general cleanup/refactor where
> > the split between page and folio simplifies programming?
> 
> The goal here is to manage memory in larger chunks.  Pages are now too
> small for just about every workload.  Even compiling the kernel sees a 7%
> performance improvement just by doing readahead using relatively small
> THPs (16k-256k).  You can see that work here:
> https://git.infradead.org/users/willy/pagecache.git/shortlog/refs/heads/master
> 
> I think Kirill, Hugh and others have done a fantastic job stretching
> the page struct to work in shmem, but we really need a different type
> to avoid people writing code that _looks_ right but is actually buggy.
> So I'm starting again, this time with the folio metaphor.

Thanks, makes sense, I'll take a look.

Balbir Singh.
Nicholas Piggin March 22, 2021, 2:52 a.m. UTC | #7
Excerpts from Matthew Wilcox's message of March 19, 2021 11:25 am:
> On Fri, Mar 19, 2021 at 10:56:45AM +1100, Balbir Singh wrote:
>> On Fri, Mar 05, 2021 at 04:18:37AM +0000, Matthew Wilcox (Oracle) wrote:
>> > A struct folio refers to an entire (possibly compound) page.  A function
>> > which takes a struct folio argument declares that it will operate on the
>> > entire compound page, not just PAGE_SIZE bytes.  In return, the caller
>> > guarantees that the pointer it is passing does not point to a tail page.
>> >
>> 
>> Is this a part of a larger use case or general cleanup/refactor where
>> the split between page and folio simplifies programming?
> 
> The goal here is to manage memory in larger chunks.  Pages are now too
> small for just about every workload.  Even compiling the kernel sees a 7%
> performance improvement just by doing readahead using relatively small
> THPs (16k-256k).  You can see that work here:
> https://git.infradead.org/users/willy/pagecache.git/shortlog/refs/heads/master

The 7% improvement comes from cache cold kbuild by improving IO
patterns?

Just wondering what kind of readahead is enabled by this that can't
be done with base page size.

Thanks,
Nick
Matthew Wilcox March 22, 2021, 3:54 a.m. UTC | #8
On Mon, Mar 22, 2021 at 12:52:40PM +1000, Nicholas Piggin wrote:
> Excerpts from Matthew Wilcox's message of March 19, 2021 11:25 am:
> > On Fri, Mar 19, 2021 at 10:56:45AM +1100, Balbir Singh wrote:
> >> On Fri, Mar 05, 2021 at 04:18:37AM +0000, Matthew Wilcox (Oracle) wrote:
> >> > A struct folio refers to an entire (possibly compound) page.  A function
> >> > which takes a struct folio argument declares that it will operate on the
> >> > entire compound page, not just PAGE_SIZE bytes.  In return, the caller
> >> > guarantees that the pointer it is passing does not point to a tail page.
> >> >
> >> 
> >> Is this a part of a larger use case or general cleanup/refactor where
> >> the split between page and folio simplifies programming?
> > 
> > The goal here is to manage memory in larger chunks.  Pages are now too
> > small for just about every workload.  Even compiling the kernel sees a 7%
> > performance improvement just by doing readahead using relatively small
> > THPs (16k-256k).  You can see that work here:
> > https://git.infradead.org/users/willy/pagecache.git/shortlog/refs/heads/master
> 
> The 7% improvement comes from cache cold kbuild by improving IO
> patterns?
> 
> Just wondering what kind of readahead is enabled by this that can't
> be done with base page size.

I see my explanation earlier was confusing.  What I meant to say
was that the only way in that patch set to create larger pages was
at readahead time.  Writes were incapable of creating larger pages.
Once pages were in the page cache, they got managed at that granularity
unless they got split by a truncate/holepunch/io-error/...

I don't have good perf runs of kernbench to say exactly where we got the
benefit.  My assumption is that because we're managing an entire, say,
256kB page as a single unit on the LRU list, we benefit from lower LRU
lock contention.  There's also the benefit of batching, e.g. allocating
a single 256kB page from the page allocator may well be more effective
than allocating 64 4kB pages.
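
(Purely illustrative of that batching point, with hypothetical names:
one order-6 compound allocation, i.e. 256kB with 4kB base pages, versus
64 separate trips through the page allocator.)

static void batching_example(void)
{
        struct page *big, *small[64];
        int i;

        big = alloc_pages(GFP_KERNEL | __GFP_COMP, 6);  /* one 256kB unit */

        for (i = 0; i < 64; i++)
                small[i] = alloc_page(GFP_KERNEL);      /* 64 separate 4kB units */
}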

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 77e64e3eac80..a46e5a4385b0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -927,6 +927,11 @@  static inline unsigned int compound_order(struct page *page)
 	return page[1].compound_order;
 }
 
+static inline unsigned int folio_order(struct folio *folio)
+{
+	return compound_order(&folio->page);
+}
+
 static inline bool hpage_pincount_available(struct page *page)
 {
 	/*
@@ -1518,6 +1523,30 @@  static inline void set_page_links(struct page *page, enum zone_type zone,
 #endif
 }
 
+static inline unsigned long folio_nr_pages(struct folio *folio)
+{
+	return compound_nr(&folio->page);
+}
+
+static inline struct folio *next_folio(struct folio *folio)
+{
+#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+	return (struct folio *)nth_page(&folio->page, folio_nr_pages(folio));
+#else
+	return folio + folio_nr_pages(folio);
+#endif
+}
+
+static inline unsigned int folio_shift(struct folio *folio)
+{
+	return PAGE_SHIFT + folio_order(folio);
+}
+
+static inline size_t folio_size(struct folio *folio)
+{
+	return PAGE_SIZE << folio_order(folio);
+}
+
 /*
  * Some inline functions in vmstat.h depend on page_zone()
  */
@@ -1623,6 +1652,7 @@  extern void pagefault_out_of_memory(void);
 
 #define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)
 #define offset_in_thp(page, p)	((unsigned long)(p) & (thp_size(page) - 1))
+#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))
 
 /*
  * Flags passed to show_mem() and show_free_areas() to suppress output in
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0974ad501a47..a311cb48526f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -223,6 +223,23 @@  struct page {
 #endif
 } _struct_page_alignment;
 
+/*
+ * A struct folio is either a base (order-0) page or the head page of
+ * a compound page.
+ */
+struct folio {
+	struct page page;
+};
+
+static inline struct folio *page_folio(struct page *page)
+{
+	unsigned long head = READ_ONCE(page->compound_head);
+
+	if (unlikely(head & 1))
+		return (struct folio *)(head - 1);
+	return (struct folio *)page;
+}
+
 static inline atomic_t *compound_mapcount_ptr(struct page *page)
 {
 	return &page[1].compound_mapcount;