diff mbox series

[v3,2/2] lru: allow large batched add large folio to lru list

Message ID 20230429082759.1600796-3-fengwei.yin@intel.com (mailing list archive)
State New
Headers show
Series Reduce lock contention related with large folio | expand

Commit Message

Yin, Fengwei April 29, 2023, 8:27 a.m. UTC
Currently, large folios are not added to the lru list in batches,
which causes high lru lock contention after large folios are
enabled for anonymous mappings.

Running page_fault1 of will-it-scale + order 2 folio with 96
processes on Ice Lake 48C/96T, the lru lock contention could
be around 64%:
-   64.31%     0.23%  page_fault1_pro  [kernel.kallsyms]           [k] folio_lruvec_lock_irqsave
   - 64.07% folio_lruvec_lock_irqsave
      + 64.01% _raw_spin_lock_irqsave

With this patch, the lru lock contention drops to 43% in the same
test:
-   42.67%     0.19%  page_fault1_pro  [kernel.kallsyms]           [k] folio_lruvec_lock_irqsave
   - 42.48% folio_lruvec_lock_irqsave
      + 42.42% _raw_spin_lock_irqsave

Reported-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
---
 include/linux/pagevec.h | 46 ++++++++++++++++++++++++++++++++++++++---
 mm/mlock.c              |  7 +++----
 mm/swap.c               |  3 +--
 3 files changed, 47 insertions(+), 9 deletions(-)

Comments

Matthew Wilcox April 29, 2023, 10:35 p.m. UTC | #1
On Sat, Apr 29, 2023 at 04:27:59PM +0800, Yin Fengwei wrote:
> @@ -22,6 +23,7 @@ struct address_space;
>  struct pagevec {
>  	unsigned char nr;
>  	bool percpu_pvec_drained;
> +	unsigned short nr_pages;

I still don't like storing nr_pages in the pagevec/folio_batch.
Yin, Fengwei May 1, 2023, 5:52 a.m. UTC | #2
Hi Matthew,

On 4/30/2023 6:35 AM, Matthew Wilcox wrote:
> On Sat, Apr 29, 2023 at 04:27:59PM +0800, Yin Fengwei wrote:
>> @@ -22,6 +23,7 @@ struct address_space;
>>  struct pagevec {
>>  	unsigned char nr;
>>  	bool percpu_pvec_drained;
>> +	unsigned short nr_pages;
> 
> I still don't like storing nr_pages in the pagevec/folio_batch.
OK. Let me see whether I can find another way that avoids storing
nr_pages. Thanks.


Regards
Yin, Fengwei

>
Yin, Fengwei May 5, 2023, 5:51 a.m. UTC | #3
Hi Matthew,

On 4/30/2023 6:35 AM, Matthew Wilcox wrote:
> On Sat, Apr 29, 2023 at 04:27:59PM +0800, Yin Fengwei wrote:
>> @@ -22,6 +23,7 @@ struct address_space;
>>  struct pagevec {
>>  	unsigned char nr;
>>  	bool percpu_pvec_drained;
>> +	unsigned short nr_pages;
> 
> I still don't like storing nr_pages in the pagevec/folio_batch.
> 

What about a change like the following:

diff --git a/mm/swap.c b/mm/swap.c
index 57cb01b042f6..5e7e9c0734ab 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -228,8 +228,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
 static void folio_batch_add_and_move(struct folio_batch *fbatch,
                struct folio *folio, move_fn_t move_fn)
 {
-       if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) &&
-           !lru_cache_disabled())
+       int nr_pages = folio_nr_pages(folio);
+
+       if (folio_batch_add(fbatch, folio) && !lru_cache_disabled() &&
+           (!folio_test_large(folio) || (nr_pages <= (PAGEVEC_SIZE + 1))))
                return;
        folio_batch_move_lru(fbatch, move_fn);
 }


I tested the lru lock contention with different folio sizes, using
will-it-scale with the deferred queue lock contention mitigated:
  - If large folio size is 16K (order 2), the lru lock takes 64.31% cpu runtime
  - If large folio size is 64K (order 4), the lru lock takes 24.24% cpu runtime
This matches our expectation: the larger the folio, the less lru lock
contention.

It's acceptable not to batch operations on folios that are large
enough. PAGEVEC_SIZE + 1 is chosen here for the following reasons:
  - acceptable max memory size per batch: 15 x 16 x 4096 = 983040 bytes
  - folios larger than that are not batched, but the lru lock
    contention for them is already low.


I collected lru lock contention data while running will-it-scale.page_fault1:

folio with order 2:
  Without the change:
  -   64.31%     0.23%  page_fault1_pro  [kernel.kallsyms]           [k] folio_lruvec_lock_irqsave
     + 64.07% folio_lruvec_lock_irqsave

  With the change:
  -   21.55%     0.21%  page_fault1_pro  [kernel.kallsyms]           [k] folio_lruvec_lock_irqsave
     + 21.34% folio_lruvec_lock_irqsave

folio with order 4:
  Without the change:
  -   24.24%     0.15%  page_fault1_pro  [kernel.kallsyms]           [k] folio_lruvec_lock_irqsave
     + 24.09% folio_lruvec_lock_irqsave

  With the change:
  -   2.20%     0.09%  page_fault1_pro  [kernel.kallsyms]            [k] folio_lruvec_lock_irqsave
     + 2.11% folio_lruvec_lock_irqsave

folio with order 5:
  -   8.21%     0.16%  page_fault1_pro  [kernel.kallsyms]  [k] folio_lruvec_lock_irqsave
     + 8.05% folio_lruvec_lock_irqsave


Regards
Yin, Fengwei
Yin, Fengwei May 15, 2023, 2:14 a.m. UTC | #4
Hi Matthew,

On 5/5/2023 1:51 PM, Yin, Fengwei wrote:
> Hi Matthew,
> 
> On 4/30/2023 6:35 AM, Matthew Wilcox wrote:
>> On Sat, Apr 29, 2023 at 04:27:59PM +0800, Yin Fengwei wrote:
>>> @@ -22,6 +23,7 @@ struct address_space;
>>>  struct pagevec {
>>>  	unsigned char nr;
>>>  	bool percpu_pvec_drained;
>>> +	unsigned short nr_pages;
>>
>> I still don't like storing nr_pages in the pagevec/folio_batch.
>>
> 
> What about the change like following:
Soft ping.


Regards
Yin, Fengwei

> 
> diff --git a/mm/swap.c b/mm/swap.c
> index 57cb01b042f6..5e7e9c0734ab 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -228,8 +228,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
>  static void folio_batch_add_and_move(struct folio_batch *fbatch,
>                 struct folio *folio, move_fn_t move_fn)
>  {
> -       if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) &&
> -           !lru_cache_disabled())
> +       int nr_pages = folio_nr_pages(folio);
> +
> +       if (folio_batch_add(fbatch, folio) && !lru_cache_disabled() &&
> +           (!folio_test_large(folio) || (nr_pages <= (PAGEVEC_SIZE + 1))))
>                 return;
>         folio_batch_move_lru(fbatch, move_fn);
>  }
> 
> 
> I did testing about the lru lock contention with different folio size
> with will-it-scale + deferred queue lock contention mitigated:
>   - If large folio size is 16K (order 2), the lru lock takes 64.31% cpu runtime
>   - If large folio size is 64K (order 4), the lru lock takes 24.24% cpu runtime
> This is as our expectation: The larger size of folio, the less lru lock
> contention.
> 
> It's acceptable to not batched operate on large folio which is large
> enough. PAGEVEC_SIZE + 1 is chosen here based on following reasons:
>   - acceptable max memory size per batch: 15 x 16 x 4096 = 983040 bytes
>   - the folios with size larger than it will not apply batched operation.
>     But the lru lock contention is not high already.
> 
> 
> I collected data with lru contention when run will-it-scale.page_fault1:
> 
> folio with order 2:
>   Without the change:
>   -   64.31%     0.23%  page_fault1_pro  [kernel.kallsyms]           [k] folio_lruvec_lock_irqsave
>      + 64.07% folio_lruvec_lock_irqsave
> 
>   With the change:
>   -   21.55%     0.21%  page_fault1_pro  [kernel.kallsyms]           [k] folio_lruvec_lock_irqsave
>      + 21.34% folio_lruvec_lock_irqsave
> 
> folio with order 4:
>   Without the change:
>   -   24.24%     0.15%  page_fault1_pro  [kernel.kallsyms]           [k] folio_lruvec_lock_irqsave
>      + 24.09% folio_lruvec_lock_irqsave
> 
>   With the change:
>   -   2.20%     0.09%  page_fault1_pro  [kernel.kallsyms]            [k] folio_lruvec_lock_irqsave
>      + 2.11% folio_lruvec_lock_irqsave
> 
> folio with order 5:
>   -   8.21%     0.16%  page_fault1_pro  [kernel.kallsyms]  [k] folio_lruvec_lock_irqsave
>      + 8.05% folio_lruvec_lock_irqsave
> 
> 
> Regards
> Yin, Fengwei
>
Matthew Wilcox June 20, 2023, 3:22 a.m. UTC | #5
On Sat, Apr 29, 2023 at 04:27:59PM +0800, Yin Fengwei wrote:
> diff --git a/mm/swap.c b/mm/swap.c
> index 57cb01b042f6..0f8554aeb338 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -228,8 +228,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
>  static void folio_batch_add_and_move(struct folio_batch *fbatch,
>  		struct folio *folio, move_fn_t move_fn)
>  {
> -	if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) &&
> -	    !lru_cache_disabled())
> +	if (folio_batch_add(fbatch, folio) && !lru_cache_disabled())
>  		return;
>  	folio_batch_move_lru(fbatch, move_fn);
>  }

What if all you do is:

-	if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) &&
-	    !lru_cache_disabled())
+	if (folio_batch_add(fbatch, folio) && !lru_cache_disabled())


How does that perform?
Yin, Fengwei June 20, 2023, 4:39 a.m. UTC | #6
On 6/20/23 11:22, Matthew Wilcox wrote:
> On Sat, Apr 29, 2023 at 04:27:59PM +0800, Yin Fengwei wrote:
>> diff --git a/mm/swap.c b/mm/swap.c
>> index 57cb01b042f6..0f8554aeb338 100644
>> --- a/mm/swap.c
>> +++ b/mm/swap.c
>> @@ -228,8 +228,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
>>  static void folio_batch_add_and_move(struct folio_batch *fbatch,
>>  		struct folio *folio, move_fn_t move_fn)
>>  {
>> -	if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) &&
>> -	    !lru_cache_disabled())
>> +	if (folio_batch_add(fbatch, folio) && !lru_cache_disabled())
>>  		return;
>>  	folio_batch_move_lru(fbatch, move_fn);
>>  }
> 
> What if all you do is:
> 
> -	if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) &&
> -	    !lru_cache_disabled())
> +	if (folio_batch_add(fbatch, folio) && !lru_cache_disabled())
> 
> 
> How does that perform?
I will give it a try. Thanks.


Regards
Yin, Fengwei
Yin, Fengwei June 20, 2023, 8:01 a.m. UTC | #7
On 6/20/23 11:22, Matthew Wilcox wrote:
> On Sat, Apr 29, 2023 at 04:27:59PM +0800, Yin Fengwei wrote:
>> diff --git a/mm/swap.c b/mm/swap.c
>> index 57cb01b042f6..0f8554aeb338 100644
>> --- a/mm/swap.c
>> +++ b/mm/swap.c
>> @@ -228,8 +228,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
>>  static void folio_batch_add_and_move(struct folio_batch *fbatch,
>>  		struct folio *folio, move_fn_t move_fn)
>>  {
>> -	if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) &&
>> -	    !lru_cache_disabled())
>> +	if (folio_batch_add(fbatch, folio) && !lru_cache_disabled())
>>  		return;
>>  	folio_batch_move_lru(fbatch, move_fn);
>>  }
> 
> What if all you do is:
> 
> -	if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) &&
> -	    !lru_cache_disabled())
> +	if (folio_batch_add(fbatch, folio) && !lru_cache_disabled())
> 
> 
> How does that perform?
With the same hardware (IceLake 48C/96T) and order 2 folios, the test results are as follows:

order2_without_the_patch:
  -   65.53%     0.22%  page_fault1_pro  [kernel.kallsyms]           [k] folio_lruvec_lock_irqsave
     - 65.30% folio_lruvec_lock_irqsave
        + 65.30% _raw_spin_lock_irqsave

order2_with_the_patch:
  -   19.94%     0.26%  page_fault1_pro  [kernel.vmlinux]            [k] folio_lruvec_lock_irqsave
     - 19.67% folio_lruvec_lock_irqsave
        + 19.67% _raw_spin_lock_irqsave


Regards
Yin, Fengwei
diff mbox series

Patch

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index f582f7213ea5..9479b7b50bc6 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -10,6 +10,7 @@ 
 #define _LINUX_PAGEVEC_H
 
 #include <linux/xarray.h>
+#include <linux/mm.h>
 
 /* 15 pointers + header align the pagevec structure to a power of two */
 #define PAGEVEC_SIZE	15
@@ -22,6 +23,7 @@  struct address_space;
 struct pagevec {
 	unsigned char nr;
 	bool percpu_pvec_drained;
+	unsigned short nr_pages;
 	struct page *pages[PAGEVEC_SIZE];
 };
 
@@ -30,12 +32,14 @@  void __pagevec_release(struct pagevec *pvec);
 static inline void pagevec_init(struct pagevec *pvec)
 {
 	pvec->nr = 0;
+	pvec->nr_pages = 0;
 	pvec->percpu_pvec_drained = false;
 }
 
 static inline void pagevec_reinit(struct pagevec *pvec)
 {
 	pvec->nr = 0;
+	pvec->nr_pages = 0;
 }
 
 static inline unsigned pagevec_count(struct pagevec *pvec)
@@ -54,7 +58,12 @@  static inline unsigned pagevec_space(struct pagevec *pvec)
 static inline unsigned pagevec_add(struct pagevec *pvec, struct page *page)
 {
 	pvec->pages[pvec->nr++] = page;
-	return pagevec_space(pvec);
+	pvec->nr_pages += compound_nr(page);
+
+	if (pvec->nr_pages > PAGEVEC_SIZE)
+		return 0;
+	else
+		return pagevec_space(pvec);
 }
 
 static inline void pagevec_release(struct pagevec *pvec)
@@ -75,6 +84,7 @@  static inline void pagevec_release(struct pagevec *pvec)
 struct folio_batch {
 	unsigned char nr;
 	bool percpu_pvec_drained;
+	unsigned short nr_pages;
 	struct folio *folios[PAGEVEC_SIZE];
 };
 
@@ -92,12 +102,14 @@  static_assert(offsetof(struct pagevec, pages) ==
 static inline void folio_batch_init(struct folio_batch *fbatch)
 {
 	fbatch->nr = 0;
+	fbatch->nr_pages = 0;
 	fbatch->percpu_pvec_drained = false;
 }
 
 static inline void folio_batch_reinit(struct folio_batch *fbatch)
 {
 	fbatch->nr = 0;
+	fbatch->nr_pages = 0;
 }
 
 static inline unsigned int folio_batch_count(struct folio_batch *fbatch)
@@ -110,6 +122,32 @@  static inline unsigned int fbatch_space(struct folio_batch *fbatch)
 	return PAGEVEC_SIZE - fbatch->nr;
 }
 
+/**
+ * folio_batch_add_nr_pages() - Add a folio to a batch.
+ * @fbatch: The folio batch.
+ * @folio: The folio to add.
+ * @nr_pages: The number of pages added to batch.
+ *
+ * The folio is added to the end of the batch.
+ * The batch must have previously been initialised using folio_batch_init().
+ *
+ * Return: The number of slots still available.
+ * Note: parameter folio may not be direct reference to folio and can't
+ *       use folio_nr_pages(folio).
+ *       Currently, this function is only called in mlock.c.
+ */
+static inline unsigned folio_batch_add_nr_pages(struct folio_batch *fbatch,
+		struct folio *folio, unsigned int nr_pages)
+{
+	fbatch->folios[fbatch->nr++] = folio;
+	fbatch->nr_pages += nr_pages;
+
+	if (fbatch->nr_pages > PAGEVEC_SIZE)
+		return 0;
+	else
+		return fbatch_space(fbatch);
+}
+
 /**
  * folio_batch_add() - Add a folio to a batch.
  * @fbatch: The folio batch.
@@ -123,8 +161,10 @@  static inline unsigned int fbatch_space(struct folio_batch *fbatch)
 static inline unsigned folio_batch_add(struct folio_batch *fbatch,
 		struct folio *folio)
 {
-	fbatch->folios[fbatch->nr++] = folio;
-	return fbatch_space(fbatch);
+	unsigned int nr_pages;
+
+	nr_pages = xa_is_value(folio) ? 1 : folio_nr_pages(folio);
+	return folio_batch_add_nr_pages(fbatch, folio, nr_pages);
 }
 
 static inline void folio_batch_release(struct folio_batch *fbatch)
diff --git a/mm/mlock.c b/mm/mlock.c
index 617469fce96d..6de3e6d4639f 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -243,19 +243,18 @@  bool need_mlock_drain(int cpu)
 void mlock_folio(struct folio *folio)
 {
 	struct folio_batch *fbatch;
+	unsigned int nr_pages = folio_nr_pages(folio);
 
 	local_lock(&mlock_fbatch.lock);
 	fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
 
 	if (!folio_test_set_mlocked(folio)) {
-		int nr_pages = folio_nr_pages(folio);
-
 		zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
 		__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
 	}
 
 	folio_get(folio);
-	if (!folio_batch_add(fbatch, mlock_lru(folio)) ||
+	if (!folio_batch_add_nr_pages(fbatch, mlock_lru(folio), nr_pages) ||
 	    folio_test_large(folio) || lru_cache_disabled())
 		mlock_folio_batch(fbatch);
 	local_unlock(&mlock_fbatch.lock);
@@ -278,7 +277,7 @@  void mlock_new_folio(struct folio *folio)
 	__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
 
 	folio_get(folio);
-	if (!folio_batch_add(fbatch, mlock_new(folio)) ||
+	if (!folio_batch_add_nr_pages(fbatch, mlock_new(folio), nr_pages) ||
 	    folio_test_large(folio) || lru_cache_disabled())
 		mlock_folio_batch(fbatch);
 	local_unlock(&mlock_fbatch.lock);
diff --git a/mm/swap.c b/mm/swap.c
index 57cb01b042f6..0f8554aeb338 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -228,8 +228,7 @@  static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
 static void folio_batch_add_and_move(struct folio_batch *fbatch,
 		struct folio *folio, move_fn_t move_fn)
 {
-	if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) &&
-	    !lru_cache_disabled())
+	if (folio_batch_add(fbatch, folio) && !lru_cache_disabled())
 		return;
 	folio_batch_move_lru(fbatch, move_fn);
 }