
[RFC,01/14] mm: Make folios_put() the basis of release_pages()

Message ID 20230825135918.4164671-2-willy@infradead.org (mailing list archive)
State New
Series Rearrange batched folio freeing

Commit Message

Matthew Wilcox Aug. 25, 2023, 1:59 p.m. UTC
By making release_pages() call folios_put(), we can get rid of the calls
to compound_head() for the callers that already know they have folios.
We can also get rid of the lock_batch tracking as we know the size of
the batch is limited by folio_batch.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/mm.h | 19 ++---------
 mm/mlock.c         |  3 +-
 mm/swap.c          | 84 +++++++++++++++++++++++++++-------------------
 3 files changed, 52 insertions(+), 54 deletions(-)
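
A minimal caller sketch (illustrative only; nr_folios and my_folios are made-up
names, not from the patch): a caller that already has folios fills a folio_batch
and hands it to folios_put(), which leaves the batch empty, so there is no
folio_batch_reinit() step before reuse:

	struct folio_batch fbatch;
	unsigned int i;

	folio_batch_init(&fbatch);
	for (i = 0; i < nr_folios; i++) {
		/* folio_batch_add() returns the space left; 0 means the batch is full */
		if (!folio_batch_add(&fbatch, my_folios[i]))
			folios_put(&fbatch);	/* drop the refs; the batch comes back empty */
	}
	if (folio_batch_count(&fbatch))
		folios_put(&fbatch);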

Comments

Ryan Roberts Aug. 31, 2023, 2:21 p.m. UTC | #1
On 25/08/2023 14:59, Matthew Wilcox (Oracle) wrote:
> By making release_pages() call folios_put(), we can get rid of the calls
> to compound_head() for the callers that already know they have folios.
> We can also get rid of the lock_batch tracking as we know the size of
> the batch is limited by folio_batch.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  include/linux/mm.h | 19 ++---------
>  mm/mlock.c         |  3 +-
>  mm/swap.c          | 84 +++++++++++++++++++++++++++-------------------
>  3 files changed, 52 insertions(+), 54 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 2c6b54b5506a..7d1d96b75d11 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -36,6 +36,7 @@ struct anon_vma;
>  struct anon_vma_chain;
>  struct user_struct;
>  struct pt_regs;
> +struct folio_batch;
>  
>  extern int sysctl_page_lock_unfairness;
>  
> @@ -1521,23 +1522,7 @@ typedef union {
>  } release_pages_arg __attribute__ ((__transparent_union__));
>  
>  void release_pages(release_pages_arg, int nr);
> -
> -/**
> - * folios_put - Decrement the reference count on an array of folios.
> - * @folios: The folios.
> - * @nr: How many folios there are.
> - *
> - * Like folio_put(), but for an array of folios.  This is more efficient
> - * than writing the loop yourself as it will optimise the locks which
> - * need to be taken if the folios are freed.
> - *
> - * Context: May be called in process or interrupt context, but not in NMI
> - * context.  May be called while holding a spinlock.
> - */
> -static inline void folios_put(struct folio **folios, unsigned int nr)
> -{
> -	release_pages(folios, nr);
> -}
> +void folios_put(struct folio_batch *folios);
>  
>  static inline void put_page(struct page *page)
>  {
> diff --git a/mm/mlock.c b/mm/mlock.c
> index 06bdfab83b58..67bd74a6268a 100644
> --- a/mm/mlock.c
> +++ b/mm/mlock.c
> @@ -206,8 +206,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch)
>  
>  	if (lruvec)
>  		unlock_page_lruvec_irq(lruvec);
> -	folios_put(fbatch->folios, folio_batch_count(fbatch));
> -	folio_batch_reinit(fbatch);
> +	folios_put(fbatch);
>  }
>  
>  void mlock_drain_local(void)
> diff --git a/mm/swap.c b/mm/swap.c
> index cd8f0150ba3a..7bdc63b56859 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -89,7 +89,7 @@ static void __page_cache_release(struct folio *folio)
>  		__folio_clear_lru_flags(folio);
>  		unlock_page_lruvec_irqrestore(lruvec, flags);
>  	}
> -	/* See comment on folio_test_mlocked in release_pages() */
> +	/* See comment on folio_test_mlocked in folios_put() */
>  	if (unlikely(folio_test_mlocked(folio))) {
>  		long nr_pages = folio_nr_pages(folio);
>  
> @@ -175,7 +175,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio)
>  	 * while the LRU lock is held.
>  	 *
>  	 * (That is not true of __page_cache_release(), and not necessarily
> -	 * true of release_pages(): but those only clear the mlocked flag after
> +	 * true of folios_put(): but those only clear the mlocked flag after
>  	 * folio_put_testzero() has excluded any other users of the folio.)
>  	 */
>  	if (folio_evictable(folio)) {
> @@ -221,8 +221,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
>  
>  	if (lruvec)
>  		unlock_page_lruvec_irqrestore(lruvec, flags);
> -	folios_put(fbatch->folios, folio_batch_count(fbatch));
> -	folio_batch_reinit(fbatch);
> +	folios_put(fbatch);
>  }
>  
>  static void folio_batch_add_and_move(struct folio_batch *fbatch,
> @@ -946,41 +945,27 @@ void lru_cache_disable(void)
>  }
>  
>  /**
> - * release_pages - batched put_page()
> - * @arg: array of pages to release
> - * @nr: number of pages
> + * folios_put - Decrement the reference count on a batch of folios.
> + * @folios: The folios.
>   *
> - * Decrement the reference count on all the pages in @arg.  If it
> - * fell to zero, remove the page from the LRU and free it.
> + * Like folio_put(), but for a batch of folios.  This is more efficient
> + * than writing the loop yourself as it will optimise the locks which need
> + * to be taken if the folios are freed.  The folios batch is returned
> + * empty and ready to be reused for another batch; there is no need to
> + * reinitialise it.
>   *
> - * Note that the argument can be an array of pages, encoded pages,
> - * or folio pointers. We ignore any encoded bits, and turn any of
> - * them into just a folio that gets free'd.
> + * Context: May be called in process or interrupt context, but not in NMI
> + * context.  May be called while holding a spinlock.
>   */
> -void release_pages(release_pages_arg arg, int nr)
> +void folios_put(struct folio_batch *folios)
>  {
>  	int i;
> -	struct encoded_page **encoded = arg.encoded_pages;
>  	LIST_HEAD(pages_to_free);
>  	struct lruvec *lruvec = NULL;
>  	unsigned long flags = 0;
> -	unsigned int lock_batch;
>  
> -	for (i = 0; i < nr; i++) {
> -		struct folio *folio;
> -
> -		/* Turn any of the argument types into a folio */
> -		folio = page_folio(encoded_page_ptr(encoded[i]));
> -
> -		/*
> -		 * Make sure the IRQ-safe lock-holding time does not get
> -		 * excessive with a continuous string of pages from the
> -		 * same lruvec. The lock is held only if lruvec != NULL.
> -		 */
> -		if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {

SWAP_CLUSTER_MAX is 32. By using the folio_batch, I think you are limited to 15
in your batch, so I guess you could be taking/releasing the lock x2 as often? Is
there any perf implication?

> -			unlock_page_lruvec_irqrestore(lruvec, flags);
> -			lruvec = NULL;
> -		}
> +	for (i = 0; i < folios->nr; i++) {
> +		struct folio *folio = folios->folios[i];
>  
>  		if (is_huge_zero_page(&folio->page))
>  			continue;
> @@ -1010,13 +995,8 @@ void release_pages(release_pages_arg arg, int nr)
>  		}
>  
>  		if (folio_test_lru(folio)) {
> -			struct lruvec *prev_lruvec = lruvec;
> -
>  			lruvec = folio_lruvec_relock_irqsave(folio, lruvec,
>  									&flags);
> -			if (prev_lruvec != lruvec)
> -				lock_batch = 0;
> -
>  			lruvec_del_folio(lruvec, folio);
>  			__folio_clear_lru_flags(folio);
>  		}
> @@ -1040,6 +1020,40 @@ void release_pages(release_pages_arg arg, int nr)
>  
>  	mem_cgroup_uncharge_list(&pages_to_free);
>  	free_unref_page_list(&pages_to_free);
> +	folios->nr = 0;

folio_batch_reinit(folios) ?

> +}
> +EXPORT_SYMBOL(folios_put);
> +
> +/**
> + * release_pages - batched put_page()
> + * @arg: array of pages to release
> + * @nr: number of pages
> + *
> + * Decrement the reference count on all the pages in @arg.  If it
> + * fell to zero, remove the page from the LRU and free it.
> + *
> + * Note that the argument can be an array of pages, encoded pages,
> + * or folio pointers. We ignore any encoded bits, and turn any of
> + * them into just a folio that gets free'd.
> + */
> +void release_pages(release_pages_arg arg, int nr)
> +{
> +	struct folio_batch fbatch;
> +	struct encoded_page **encoded = arg.encoded_pages;
> +	int i;
> +
> +	folio_batch_init(&fbatch);
> +	for (i = 0; i < nr; i++) {
> +		/* Turn any of the argument types into a folio */
> +		struct folio *folio = page_folio(encoded_page_ptr(encoded[i]));
> +
> +		if (folio_batch_add(&fbatch, folio) > 0)
> +			continue;
> +		folios_put(&fbatch);
> +	}
> +
> +	if (fbatch.nr)

if (folio_batch_count(&fbatch)) ?

> +		folios_put(&fbatch);
>  }
>  EXPORT_SYMBOL(release_pages);
>
Matthew Wilcox Sept. 1, 2023, 3:58 a.m. UTC | #2
On Thu, Aug 31, 2023 at 03:21:53PM +0100, Ryan Roberts wrote:
> On 25/08/2023 14:59, Matthew Wilcox (Oracle) wrote:
> > By making release_pages() call folios_put(), we can get rid of the calls
> > to compound_head() for the callers that already know they have folios.
> > We can also get rid of the lock_batch tracking as we know the size of
> > the batch is limited by folio_batch.
> > -		/*
> > -		 * Make sure the IRQ-safe lock-holding time does not get
> > -		 * excessive with a continuous string of pages from the
> > -		 * same lruvec. The lock is held only if lruvec != NULL.
> > -		 */
> > -		if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
> 
> SWAP_CLUSTER_MAX is 32. By using the folio_batch, I think you are limited to 15
> in your batch, so I guess you could be taking/releasing the lock x2 as often? Is
> there any perf implication?

Yes, if the batch size is larger than 15, we'll take/release the lru lock
more often.  We could increase the size of the folio_batch if that becomes
a problem.  I'm not sure how often it's a problem; we already limit the
number of folios to process to 15 in, eg, folio_batch_add_and_move().
I'm not really sure why this code gets to be special and hold the lock
for twice as long as the callers of folio_batch_add_and_move().

> > @@ -1040,6 +1020,40 @@ void release_pages(release_pages_arg arg, int nr)
> >  
> >  	mem_cgroup_uncharge_list(&pages_to_free);
> >  	free_unref_page_list(&pages_to_free);
> > +	folios->nr = 0;
> 
> folio_batch_reinit(folios) ?

I don't really like the abstraction here.  Back to folio_batch_move_lru()
as an example:

        for (i = 0; i < folio_batch_count(fbatch); i++) {
                struct folio *folio = fbatch->folios[i];
...
	}
...
	folio_batch_reinit(fbatch);

vs what I'd rather write:

	for (i = 0; i < fbatch->nr; i++) {
		struct folio *folio = fbatch->folios[i];
...
	}
...
	fbatch->nr = 0;

OK, we've successfully abstracted away that there is a member of
folio_batch called 'nr', but we still have to go poking around inside
folio_batch to extract the folio itself.  So it's not like we've
managed to make folio_batch a completely opaque type.  And I don't
think that folio_batch_count() is really all that much more descriptive
than fbatch->nr.  Indeed, I think the second one is easier to read;
it's obviously a plain loop.

I suppose that folio_batch_count() / _reinit() are easier to grep for
than '>nr\>' but I don't think that's a particularly useful thing to do.
We could add abstractions to get the folio_batch_folio(fbatch, i), but
when we start to get into something like folio_batch_remove_exceptionals()
(and there's something similar happening in this patchset where we strip
out the hugetlb folios), you're messing with the internal structure of
the folio_batch so much that you may as well not bother with any kind
of abstraction.
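
(A folio_batch_folio() accessor, if we added it, would presumably be nothing
more than this sketch:

	static inline struct folio *folio_batch_folio(struct folio_batch *fbatch,
			unsigned int i)
	{
		return fbatch->folios[i];
	}

which doesn't buy much over indexing the array directly.)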

I'm really tempted to rip out folio_batch_count() and folio_batch_reinit().
They don't seem useful enough.
Ryan Roberts Sept. 1, 2023, 8:14 a.m. UTC | #3
On 01/09/2023 04:58, Matthew Wilcox wrote:
> On Thu, Aug 31, 2023 at 03:21:53PM +0100, Ryan Roberts wrote:
>> On 25/08/2023 14:59, Matthew Wilcox (Oracle) wrote:
>>> By making release_pages() call folios_put(), we can get rid of the calls
>>> to compound_head() for the callers that already know they have folios.
>>> We can also get rid of the lock_batch tracking as we know the size of
>>> the batch is limited by folio_batch.
>>> -		/*
>>> -		 * Make sure the IRQ-safe lock-holding time does not get
>>> -		 * excessive with a continuous string of pages from the
>>> -		 * same lruvec. The lock is held only if lruvec != NULL.
>>> -		 */
>>> -		if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
>>
>> SWAP_CLUSTER_MAX is 32. By using the folio_batch, I think you are limited to 15
>> in your batch, so I guess you could be taking/releasing the lock x2 as often? Is
>> there any perf implication?
> 
> Yes, if the batch size is larger than 15, we'll take/release the lru lock
> more often.  We could increase the size of the folio_batch if that becomes
> a problem.  I'm not sure how often it's a problem; we already limit the
> number of folios to process to 15 in, eg, folio_batch_add_and_move().
> I'm not really sure why this code gets to be special and hold the lock
> for twice as long as the callers of folio_batch_add_and_move().

mmu_gather stores page pointers in page-sized batches (minus a small header).
So for 4K pages on a 64-bit system, that's ~512 page pointers per batch. So I
would imagine you will notice 15 vs 32 for the munmap and exit_mmap cases.
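(As a rough back-of-envelope, assuming a worst-case run of ~512 pages all from
one lruvec: with SWAP_CLUSTER_MAX the lock is dropped and retaken about
512/32 = 16 times, versus roughly 512/15 ≈ 34 times with a 15-entry folio_batch.)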

As promised, I'm planning to do some benchmarking today, so I could play with
the size of the folio_batch.

Is there any special reason PAGEVEC_SIZE is 15? I can see you changed it from
14 a while back by packing the struct pagevec header better, so that the
structure stayed the same size as before. But 14 was set in pre-history. Are
things likely to break if we expand it to 31 (doubling the structure's size)?
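
For reference, the layout is roughly this (as it reads in include/linux/pagevec.h
today):

	/* 15 pointers + header keep the structure at a power-of-two size */
	#define PAGEVEC_SIZE	15

	struct folio_batch {
		unsigned char nr;
		bool percpu_pvec_drained;
		struct folio *folios[PAGEVEC_SIZE];
	};

so 15 keeps the whole thing at 128 bytes on 64-bit, and 31 would presumably take
it to 256.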

> 
>>> @@ -1040,6 +1020,40 @@ void release_pages(release_pages_arg arg, int nr)
>>>  
>>>  	mem_cgroup_uncharge_list(&pages_to_free);
>>>  	free_unref_page_list(&pages_to_free);
>>> +	folios->nr = 0;
>>
>> folio_batch_reinit(folios) ?
> 
> I don't really like the abstraction here.  Back to folio_batch_move_lru()
> as an example:
> 
>         for (i = 0; i < folio_batch_count(fbatch); i++) {
>                 struct folio *folio = fbatch->folios[i];
> ...
> 	}
> ...
> 	folio_batch_reinit(fbatch);
> 
> vs what I'd rather write:
> 
> 	for (i = 0; i < fbatch->nr; i++) {
> 		struct folio *folio = fbatch->folios[i];
> ...
> 	}
> ...
> 	fbatch->nr = 0;
> 
> OK, we've successfully abstracted away that there is a member of
> folio_batch called 'nr', but we still have to go poking around inside
> folio_batch to extract the folio itself.  So it's not like we've
> managed to make folio_batch a completely opaque type.  And I don't
> think that folio_batch_count() is really all that much more descriptive
> than fbatch->nr.  Indeed, I think the second one is easier to read;
> it's obviously a plain loop.
> 
> I suppose that folio_batch_count() / _reinit() are easier to grep for
> than '>nr\>' but I don't think that's a particularly useful thing to do.
> We could add abstractions to get the folio_batch_folio(fbatch, i), but
> when we start to get into something like folio_batch_remove_exceptionals()
> (and there's something similar happening in this patchset where we strip
> out the hugetlb folios), you're messing with the internal structure of
> the folio_batch so much that you may as well not bother with any kind
> of abstraction.
> 
> I'm really temped to rip out folio_batch_count() and folio_batch_reinit().
> They don't seem useful enough.

Looks like folio_batch_count() is used 71 times, and folio_batch_reinit() 0
times (after your changes), so ripping out the former will create some churn.
My opinion is simply that if there is an API for something, you should use it
rather than touching the internals directly. It's just a nit from my
perspective, so interpret it as you like ;-)

>

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2c6b54b5506a..7d1d96b75d11 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -36,6 +36,7 @@  struct anon_vma;
 struct anon_vma_chain;
 struct user_struct;
 struct pt_regs;
+struct folio_batch;
 
 extern int sysctl_page_lock_unfairness;
 
@@ -1521,23 +1522,7 @@  typedef union {
 } release_pages_arg __attribute__ ((__transparent_union__));
 
 void release_pages(release_pages_arg, int nr);
-
-/**
- * folios_put - Decrement the reference count on an array of folios.
- * @folios: The folios.
- * @nr: How many folios there are.
- *
- * Like folio_put(), but for an array of folios.  This is more efficient
- * than writing the loop yourself as it will optimise the locks which
- * need to be taken if the folios are freed.
- *
- * Context: May be called in process or interrupt context, but not in NMI
- * context.  May be called while holding a spinlock.
- */
-static inline void folios_put(struct folio **folios, unsigned int nr)
-{
-	release_pages(folios, nr);
-}
+void folios_put(struct folio_batch *folios);
 
 static inline void put_page(struct page *page)
 {
diff --git a/mm/mlock.c b/mm/mlock.c
index 06bdfab83b58..67bd74a6268a 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -206,8 +206,7 @@  static void mlock_folio_batch(struct folio_batch *fbatch)
 
 	if (lruvec)
 		unlock_page_lruvec_irq(lruvec);
-	folios_put(fbatch->folios, folio_batch_count(fbatch));
-	folio_batch_reinit(fbatch);
+	folios_put(fbatch);
 }
 
 void mlock_drain_local(void)
diff --git a/mm/swap.c b/mm/swap.c
index cd8f0150ba3a..7bdc63b56859 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -89,7 +89,7 @@  static void __page_cache_release(struct folio *folio)
 		__folio_clear_lru_flags(folio);
 		unlock_page_lruvec_irqrestore(lruvec, flags);
 	}
-	/* See comment on folio_test_mlocked in release_pages() */
+	/* See comment on folio_test_mlocked in folios_put() */
 	if (unlikely(folio_test_mlocked(folio))) {
 		long nr_pages = folio_nr_pages(folio);
 
@@ -175,7 +175,7 @@  static void lru_add_fn(struct lruvec *lruvec, struct folio *folio)
 	 * while the LRU lock is held.
 	 *
 	 * (That is not true of __page_cache_release(), and not necessarily
-	 * true of release_pages(): but those only clear the mlocked flag after
+	 * true of folios_put(): but those only clear the mlocked flag after
 	 * folio_put_testzero() has excluded any other users of the folio.)
 	 */
 	if (folio_evictable(folio)) {
@@ -221,8 +221,7 @@  static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
 
 	if (lruvec)
 		unlock_page_lruvec_irqrestore(lruvec, flags);
-	folios_put(fbatch->folios, folio_batch_count(fbatch));
-	folio_batch_reinit(fbatch);
+	folios_put(fbatch);
 }
 
 static void folio_batch_add_and_move(struct folio_batch *fbatch,
@@ -946,41 +945,27 @@  void lru_cache_disable(void)
 }
 
 /**
- * release_pages - batched put_page()
- * @arg: array of pages to release
- * @nr: number of pages
+ * folios_put - Decrement the reference count on a batch of folios.
+ * @folios: The folios.
  *
- * Decrement the reference count on all the pages in @arg.  If it
- * fell to zero, remove the page from the LRU and free it.
+ * Like folio_put(), but for a batch of folios.  This is more efficient
+ * than writing the loop yourself as it will optimise the locks which need
+ * to be taken if the folios are freed.  The folios batch is returned
+ * empty and ready to be reused for another batch; there is no need to
+ * reinitialise it.
  *
- * Note that the argument can be an array of pages, encoded pages,
- * or folio pointers. We ignore any encoded bits, and turn any of
- * them into just a folio that gets free'd.
+ * Context: May be called in process or interrupt context, but not in NMI
+ * context.  May be called while holding a spinlock.
  */
-void release_pages(release_pages_arg arg, int nr)
+void folios_put(struct folio_batch *folios)
 {
 	int i;
-	struct encoded_page **encoded = arg.encoded_pages;
 	LIST_HEAD(pages_to_free);
 	struct lruvec *lruvec = NULL;
 	unsigned long flags = 0;
-	unsigned int lock_batch;
 
-	for (i = 0; i < nr; i++) {
-		struct folio *folio;
-
-		/* Turn any of the argument types into a folio */
-		folio = page_folio(encoded_page_ptr(encoded[i]));
-
-		/*
-		 * Make sure the IRQ-safe lock-holding time does not get
-		 * excessive with a continuous string of pages from the
-		 * same lruvec. The lock is held only if lruvec != NULL.
-		 */
-		if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
-			unlock_page_lruvec_irqrestore(lruvec, flags);
-			lruvec = NULL;
-		}
+	for (i = 0; i < folios->nr; i++) {
+		struct folio *folio = folios->folios[i];
 
 		if (is_huge_zero_page(&folio->page))
 			continue;
@@ -1010,13 +995,8 @@  void release_pages(release_pages_arg arg, int nr)
 		}
 
 		if (folio_test_lru(folio)) {
-			struct lruvec *prev_lruvec = lruvec;
-
 			lruvec = folio_lruvec_relock_irqsave(folio, lruvec,
 									&flags);
-			if (prev_lruvec != lruvec)
-				lock_batch = 0;
-
 			lruvec_del_folio(lruvec, folio);
 			__folio_clear_lru_flags(folio);
 		}
@@ -1040,6 +1020,40 @@  void release_pages(release_pages_arg arg, int nr)
 
 	mem_cgroup_uncharge_list(&pages_to_free);
 	free_unref_page_list(&pages_to_free);
+	folios->nr = 0;
+}
+EXPORT_SYMBOL(folios_put);
+
+/**
+ * release_pages - batched put_page()
+ * @arg: array of pages to release
+ * @nr: number of pages
+ *
+ * Decrement the reference count on all the pages in @arg.  If it
+ * fell to zero, remove the page from the LRU and free it.
+ *
+ * Note that the argument can be an array of pages, encoded pages,
+ * or folio pointers. We ignore any encoded bits, and turn any of
+ * them into just a folio that gets free'd.
+ */
+void release_pages(release_pages_arg arg, int nr)
+{
+	struct folio_batch fbatch;
+	struct encoded_page **encoded = arg.encoded_pages;
+	int i;
+
+	folio_batch_init(&fbatch);
+	for (i = 0; i < nr; i++) {
+		/* Turn any of the argument types into a folio */
+		struct folio *folio = page_folio(encoded_page_ptr(encoded[i]));
+
+		if (folio_batch_add(&fbatch, folio) > 0)
+			continue;
+		folios_put(&fbatch);
+	}
+
+	if (fbatch.nr)
+		folios_put(&fbatch);
 }
 EXPORT_SYMBOL(release_pages);