diff mbox series

[RFC] mm/vmscan.c: avoid possible long latency caused by too_many_isolated()

Message ID 20210416023536.168632-1-zhengjun.xing@linux.intel.com (mailing list archive)
State New
Headers show
Series [RFC] mm/vmscan.c: avoid possible long latency caused by too_many_isolated() | expand

Commit Message

Xing Zhengjun April 16, 2021, 2:35 a.m. UTC
From: Zhengjun Xing <zhengjun.xing@linux.intel.com>

In the system with very few file pages, it is easy to reproduce
"nr_isolated_file > nr_inactive_file",  then too_many_isolated return true,
shrink_inactive_list enter "msleep(100)", the long latency will happen.
The test case to reproduce it is very simple, allocate a lot of huge pages
(near the DRAM size), then do free, repeat the same operation many times.
There is a 3/10 chance to reproduce the issue. In the test, sc->gfp_mask
is 0x342cca ("__GFP_IO" and "__GFP_FS" are masked), so it is easier to enter
“inactive >>= 3”, and then “isolated > inactive” will easily be true.

So I have a proposal to set a threshold number for the total file pages,
to ignore systems with very few file pages and bypass the 100ms
sleep. It is hard to pick a perfect number for the threshold, so I
just give an example of "256" for it; more input on the value is needed.

Signed-off-by: Zhengjun Xing <zhengjun.xing@linux.intel.com>
---
 mm/vmscan.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

Comments

Xing Zhengjun April 22, 2021, 8:36 a.m. UTC | #1
Hi,

    In the system with very few file pages (nr_active_file + 
nr_inactive_file < 100), it is easy to reproduce "nr_isolated_file > 
nr_inactive_file",  then too_many_isolated return true, 
shrink_inactive_list enter "msleep(100)", the long latency will happen.

The test case to reproduce it is very simple: allocate many huge 
pages(near the DRAM size), then do free, repeat the same operation many 
times.
In the test case, on a system with very few file pages (nr_active_file + 
nr_inactive_file < 100), I have dumped the numbers of 
active/inactive/isolated file pages during the whole test (see the 
attachments). In shrink_inactive_list, "too_many_isolated" very easily 
returns true, and we then enter "msleep(100)". In "too_many_isolated", 
sc->gfp_mask is 0x342cca ("__GFP_IO" and "__GFP_FS" are masked), so it is 
also very easy to enter “inactive >>= 3”, and then “isolated > inactive” will 
be true.

So I  have a proposal to set a threshold number for the total file pages 
to ignore the system with very few file pages, and then bypass the 100ms 
sleep.
It is hard to set a perfect number for the threshold, so I just give an 
example of "256" for it.

I appreciate it if you can give me your suggestion/comments. Thanks.


On 4/16/2021 10:35 AM, zhengjun.xing@linux.intel.com wrote:
> From: Zhengjun Xing <zhengjun.xing@linux.intel.com>
> 
> In the system with very few file pages, it is easy to reproduce
> "nr_isolated_file > nr_inactive_file",  then too_many_isolated return true,
> shrink_inactive_list enter "msleep(100)", the long latency will happen.
> The test case to reproduce it is very simple, allocate a lot of huge pages
> (near the DRAM size), then do free, repeat the same operation many times.
> There is a 3/10 rate to reproduce the issue. In the test, sc-> gfp_mask
> is 0x342cca ("_GFP_IO" and "__GFP_FS" is masked),it is more easy to enter
> “inactive >>=3”, then “isolated > inactive” will easy to be true.
> 
> So I  have a proposal to set a threshold number for the total file pages
> to ignore the system with very few file pages, and then bypass the 100ms
> sleep. It is hard to set a perfect number for the threshold, so I
> just give an example of "256" for it, need more inputs for it.
> 
> Signed-off-by: Zhengjun Xing <zhengjun.xing@linux.intel.com>
> ---
>   mm/vmscan.c | 11 +++++++++--
>   1 file changed, 9 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 562e87cbd7a1..a1926463455c 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -168,6 +168,7 @@ struct scan_control {
>    * From 0 .. 200.  Higher means more swappy.
>    */
>   int vm_swappiness = 60;
> +int lru_list_threshold = SWAP_CLUSTER_MAX << 3;
>   
>   static void set_task_reclaim_state(struct task_struct *task,
>   				   struct reclaim_state *rs)
> @@ -1785,7 +1786,7 @@ int isolate_lru_page(struct page *page)
>   static int too_many_isolated(struct pglist_data *pgdat, int file,
>   		struct scan_control *sc)
>   {
> -	unsigned long inactive, isolated;
> +	unsigned long inactive, isolated, active, nr_lru_pages;
>   
>   	if (current_is_kswapd())
>   		return 0;
> @@ -1796,11 +1797,13 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
>   	if (file) {
>   		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
>   		isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
> +		active = node_page_state(pgdat, NR_ACTIVE_FILE);
>   	} else {
>   		inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
>   		isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
> +		active = node_page_state(pgdat, NR_ACTIVE_ANON);
>   	}
> -
> +	nr_lru_pages = inactive + active;
>   	/*
>   	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
>   	 * won't get blocked by normal direct-reclaimers, forming a circular
> @@ -1809,6 +1812,10 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
>   	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
>   		inactive >>= 3;
>   
> +	if (isolated > inactive)
> +		if (nr_lru_pages < lru_list_threshold)
> +			return 0;
> +
>   	return isolated > inactive;
>   }
>   
>
Hillf Danton April 22, 2021, 10:23 a.m. UTC | #2
Hi Zhengjun

On Thu, 22 Apr 2021 16:36:19 +0800 Zhengjun Xing wrote:
>     In the system with very few file pages (nr_active_file + 
> nr_inactive_file < 100), it is easy to reproduce "nr_isolated_file > 
> nr_inactive_file",  then too_many_isolated return true, 
> shrink_inactive_list enter "msleep(100)", the long latency will happen.

We should skip reclaiming page cache in this case.
> 
> The test case to reproduce it is very simple: allocate many huge 
> pages(near the DRAM size), then do free, repeat the same operation many 
> times.
> In the test case, the system with very few file pages (nr_active_file + 
> nr_inactive_file < 100), I have dumpped the numbers of 
> active/inactive/isolated file pages during the whole test(see in the 
> attachments) , in shrink_inactive_list "too_many_isolated" is very easy 
> to return true, then enter "msleep(100)",in "too_many_isolated" 
> sc->gfp_mask is 0x342cca ("_GFP_IO" and "__GFP_FS" is masked) , it is 
> also very easy to enter “inactive >>=3”, then “isolated > inactive” will 
> be true.
> 
> So I  have a proposal to set a threshold number for the total file pages 
> to ignore the system with very few file pages, and then bypass the 100ms 
> sleep.
> It is hard to set a perfect number for the threshold, so I just give an 
> example of "256" for it.

Another option seems like we take a nap at the second time of lru tmi
with some allocators in your case served without the 100ms delay.

+++ x/mm/vmscan.c
@@ -118,6 +118,9 @@ struct scan_control {
 	/* The file pages on the current node are dangerously low */
 	unsigned int file_is_tiny:1;
 
+	unsigned int file_tmi:1; /* too many isolated */
+	unsigned int anon_tmi:1;
+
 	/* Allocation order */
 	s8 order;
 
@@ -1905,6 +1908,21 @@ static int current_may_throttle(void)
 		bdi_write_congested(current->backing_dev_info);
 }
 
+static void update_sc_tmi(struct scan_control *sc, bool file, int set)
+{
+	if (file)
+		sc->file_tmi = set;
+	else
+		sc->anon_tmi = set;
+}
+static bool is_sc_tmi(struct scan_control *sc, bool file)
+{
+	if (file)
+		return sc->file_tmi != 0;
+	else
+		return sc->anon_tmi != 0;
+}
+
 /*
  * shrink_inactive_list() is a helper for shrink_node().  It returns the number
  * of reclaimed pages
@@ -1927,6 +1945,11 @@ shrink_inactive_list(unsigned long nr_to
 		if (stalled)
 			return 0;
 
+		if (!is_sc_tmi(sc, file)) {
+			update_sc_tmi(sc, file, 1);
+			return 0;
+		}
+
 		/* wait a bit for the reclaimer. */
 		msleep(100);
 		stalled = true;
@@ -1936,6 +1959,9 @@ shrink_inactive_list(unsigned long nr_to
 			return SWAP_CLUSTER_MAX;
 	}
 
+	if (is_sc_tmi(sc, file))
+		update_sc_tmi(sc, file, 0);
+
 	lru_add_drain();
 
 	spin_lock_irq(&lruvec->lru_lock);
Yu Zhao April 22, 2021, 5:13 p.m. UTC | #3
On Thu, Apr 22, 2021 at 04:36:19PM +0800, Xing Zhengjun wrote:
> Hi,
> 
>    In the system with very few file pages (nr_active_file + nr_inactive_file
> < 100), it is easy to reproduce "nr_isolated_file > nr_inactive_file",  then
> too_many_isolated return true, shrink_inactive_list enter "msleep(100)", the
> long latency will happen.
> 
> The test case to reproduce it is very simple: allocate many huge pages(near
> the DRAM size), then do free, repeat the same operation many times.
> In the test case, the system with very few file pages (nr_active_file +
> nr_inactive_file < 100), I have dumpped the numbers of
> active/inactive/isolated file pages during the whole test(see in the
> attachments) , in shrink_inactive_list "too_many_isolated" is very easy to
> return true, then enter "msleep(100)",in "too_many_isolated" sc->gfp_mask is
> 0x342cca ("_GFP_IO" and "__GFP_FS" is masked) , it is also very easy to
> enter “inactive >>=3”, then “isolated > inactive” will be true.
> 
> So I  have a proposal to set a threshold number for the total file pages to
> ignore the system with very few file pages, and then bypass the 100ms sleep.
> It is hard to set a perfect number for the threshold, so I just give an
> example of "256" for it.
> 
> I appreciate it if you can give me your suggestion/comments. Thanks.

Hi Zhengjun,

It seems to me using the number of isolated pages to keep a lid on
direct reclaimers is not a good solution. We shouldn't keep going
that direction if we really want to fix the problem because migration
can isolate many pages too, which in turn blocks page reclaim.

Here is something works a lot better. Please give it a try. Thanks.

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 507d216610bf2..9a09f7e76f6b8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -951,6 +951,8 @@ typedef struct pglist_data {
 
 	/* Fields commonly accessed by the page reclaim scanner */
 
+	atomic_t nr_reclaimers;
+
 	/*
 	 * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
 	 *
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1c080fafec396..f7278642290a6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1786,43 +1786,6 @@ int isolate_lru_page(struct page *page)
 	return ret;
 }
 
-/*
- * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
- * then get rescheduled. When there are massive number of tasks doing page
- * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
- * the LRU list will go small and be scanned faster than necessary, leading to
- * unnecessary swapping, thrashing and OOM.
- */
-static int too_many_isolated(struct pglist_data *pgdat, int file,
-		struct scan_control *sc)
-{
-	unsigned long inactive, isolated;
-
-	if (current_is_kswapd())
-		return 0;
-
-	if (!writeback_throttling_sane(sc))
-		return 0;
-
-	if (file) {
-		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
-		isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
-	} else {
-		inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
-		isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
-	}
-
-	/*
-	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
-	 * won't get blocked by normal direct-reclaimers, forming a circular
-	 * deadlock.
-	 */
-	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
-		inactive >>= 3;
-
-	return isolated > inactive;
-}
-
 /*
  * move_pages_to_lru() moves pages from private @list to appropriate LRU list.
  * On return, @list is reused as a list of pages to be freed by the caller.
@@ -1924,19 +1887,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	bool stalled = false;
 
-	while (unlikely(too_many_isolated(pgdat, file, sc))) {
-		if (stalled)
-			return 0;
-
-		/* wait a bit for the reclaimer. */
-		msleep(100);
-		stalled = true;
-
-		/* We are about to die and free our memory. Return now. */
-		if (fatal_signal_pending(current))
-			return SWAP_CLUSTER_MAX;
-	}
-
 	lru_add_drain();
 
 	spin_lock_irq(&lruvec->lru_lock);
@@ -3302,6 +3252,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
+	int nr_cpus;
 	unsigned long nr_reclaimed;
 	struct scan_control sc = {
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
@@ -3334,8 +3285,17 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 	set_task_reclaim_state(current, &sc.reclaim_state);
 	trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
 
+	nr_cpus = current_is_kswapd() ? 0 : num_online_cpus();
+	while (nr_cpus && !atomic_add_unless(&pgdat->nr_reclaimers, 1, nr_cpus)) {
+		if (schedule_timeout_killable(HZ / 10))
+			return SWAP_CLUSTER_MAX;
+	}
+
 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
+	if (nr_cpus)
+		atomic_dec(&pgdat->nr_reclaimers);
+
 	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
 	set_task_reclaim_state(current, NULL);
Shakeel Butt April 22, 2021, 6:51 p.m. UTC | #4
On Thu, Apr 22, 2021 at 10:13 AM Yu Zhao <yuzhao@google.com> wrote:
>
[...]
>         spin_lock_irq(&lruvec->lru_lock);
> @@ -3302,6 +3252,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
>  unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>                                 gfp_t gfp_mask, nodemask_t *nodemask)
>  {
> +       int nr_cpus;
>         unsigned long nr_reclaimed;
>         struct scan_control sc = {
>                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
> @@ -3334,8 +3285,17 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>         set_task_reclaim_state(current, &sc.reclaim_state);
>         trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
>
> +       nr_cpus = current_is_kswapd() ? 0 : num_online_cpus();

kswapd does not call this function (directly or indirectly).

> +       while (nr_cpus && !atomic_add_unless(&pgdat->nr_reclaimers, 1, nr_cpus)) {

At most nr_nodes * nr_cpus direct reclaimers are allowed?

> +               if (schedule_timeout_killable(HZ / 10))

trace_mm_vmscan_direct_reclaim_end() and set_task_reclaim_state(NULL)?

> +                       return SWAP_CLUSTER_MAX;
> +       }
> +
>         nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
>
> +       if (nr_cpus)
> +               atomic_dec(&pgdat->nr_reclaimers);
> +
>         trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
>         set_task_reclaim_state(current, NULL);

BTW I think this approach needs to be more sophisticated. What if a
direct reclaimer within the reclaim is scheduled away and is out of
CPU quota?
Yu Zhao April 22, 2021, 8:15 p.m. UTC | #5
On Thu, Apr 22, 2021 at 12:52 PM Shakeel Butt <shakeelb@google.com> wrote:
>
> On Thu, Apr 22, 2021 at 10:13 AM Yu Zhao <yuzhao@google.com> wrote:
> >
> [...]
> >         spin_lock_irq(&lruvec->lru_lock);
> > @@ -3302,6 +3252,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
> >  unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
> >                                 gfp_t gfp_mask, nodemask_t *nodemask)
> >  {
> > +       int nr_cpus;
> >         unsigned long nr_reclaimed;
> >         struct scan_control sc = {
> >                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
> > @@ -3334,8 +3285,17 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
> >         set_task_reclaim_state(current, &sc.reclaim_state);
> >         trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
> >
> > +       nr_cpus = current_is_kswapd() ? 0 : num_online_cpus();
>
> kswapd does not call this function (directly or indirectly).
>
> > +       while (nr_cpus && !atomic_add_unless(&pgdat->nr_reclaimers, 1, nr_cpus)) {
>
> At most nr_nodes * nr_cpus direct reclaimers are allowed?
>
> > +               if (schedule_timeout_killable(HZ / 10))
>
> trace_mm_vmscan_direct_reclaim_end() and set_task_reclaim_state(NULL)?
>
> > +                       return SWAP_CLUSTER_MAX;
> > +       }
> > +
> >         nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
> >
> > +       if (nr_cpus)
> > +               atomic_dec(&pgdat->nr_reclaimers);
> > +
> >         trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
> >         set_task_reclaim_state(current, NULL);
>
> BTW I think this approach needs to be more sophisticated. What if a
> direct reclaimer within the reclaim is scheduled away and is out of
> CPU quota?

More sophisticated to what end?

We wouldn't worry about similar scenarios where we run out of CPU quota
while holding resources like a mutex, so why is this one different,
especially given that we already allow many reclaimers to run
concurrently?
Tim Chen April 22, 2021, 8:17 p.m. UTC | #6
On 4/22/21 10:13 AM, Yu Zhao wrote:

> @@ -3302,6 +3252,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
>  unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>  				gfp_t gfp_mask, nodemask_t *nodemask)
>  {
> +	int nr_cpus;
>  	unsigned long nr_reclaimed;
>  	struct scan_control sc = {
>  		.nr_to_reclaim = SWAP_CLUSTER_MAX,
> @@ -3334,8 +3285,17 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>  	set_task_reclaim_state(current, &sc.reclaim_state);
>  	trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
>  
> +	nr_cpus = current_is_kswapd() ? 0 : num_online_cpus();
> +	while (nr_cpus && !atomic_add_unless(&pgdat->nr_reclaimers, 1, nr_cpus)) {
> +		if (schedule_timeout_killable(HZ / 10))

100 msec seems like a long time to wait.  The original code in shrink_inactive_list
chose a 100 msec sleep because the sleep happens only once in the while loop, and 100 msec was
used to check for stalling.  In this case the loop can go on for a while, and the
number of reclaimers can drop below the limit sooner than 100 msec. It seems like the condition
should be checked more often.

Tim

> +			return SWAP_CLUSTER_MAX;
> +	}
> +
>  	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
>  
> +	if (nr_cpus)
> +		atomic_dec(&pgdat->nr_reclaimers);
> +
>  	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
>  	set_task_reclaim_state(current, NULL);
>
Yu Zhao April 22, 2021, 8:30 p.m. UTC | #7
On Thu, Apr 22, 2021 at 2:17 PM Tim Chen <tim.c.chen@linux.intel.com> wrote:
>
>
>
> On 4/22/21 10:13 AM, Yu Zhao wrote:
>
> > @@ -3302,6 +3252,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
> >  unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
> >                               gfp_t gfp_mask, nodemask_t *nodemask)
> >  {
> > +     int nr_cpus;
> >       unsigned long nr_reclaimed;
> >       struct scan_control sc = {
> >               .nr_to_reclaim = SWAP_CLUSTER_MAX,
> > @@ -3334,8 +3285,17 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
> >       set_task_reclaim_state(current, &sc.reclaim_state);
> >       trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
> >
> > +     nr_cpus = current_is_kswapd() ? 0 : num_online_cpus();
> > +     while (nr_cpus && !atomic_add_unless(&pgdat->nr_reclaimers, 1, nr_cpus)) {
> > +             if (schedule_timeout_killable(HZ / 10))
>
> 100 msec seems like a long time to wait.  The original code in shrink_inactive_list
> choose 100 msec sleep because the sleep happens only once in the while loop and 100 msec was
> used to check for stalling.  In this case the loop can go on for a while and the
> #reclaimers can go down below the sooner than 100 msec. Seems like it should be checked
> more often.

You are not looking at the original code -- the original code sleeps
indefinitely. It was changed by commit db73ee0d46 to fix a problem
that doesn't apply to the code above.

HZ/10 is purely arbitrary but that's ok because we assume normally
nobody hits it. If you do often, we need to figure out why and how not
to hit it so often.
Tim Chen April 22, 2021, 8:38 p.m. UTC | #8
On 4/22/21 1:30 PM, Yu Zhao wrote:
> 
> HZ/10 is purely arbitrary but that's ok because we assume normally
> nobody hits it. If you do often, we need to figure out why and how not
> to hit it so often.
> 

Perhaps Zhengjun can test the proposed fix in his test case to see if the timeout value
makes any difference.

Tim
Yu Zhao April 22, 2021, 8:57 p.m. UTC | #9
On Thu, Apr 22, 2021 at 2:38 PM Tim Chen <tim.c.chen@linux.intel.com> wrote:
>
>
>
> On 4/22/21 1:30 PM, Yu Zhao wrote:
> >
> > HZ/10 is purely arbitrary but that's ok because we assume normally
> > nobody hits it. If you do often, we need to figure out why and how not
> > to hit it so often.
> >
>
> Perhaps Zhengjun can test the proposed fix in his test case to see if the timeout value
> makes any difference.

Shakeel has another test to stress page reclaim to a point that the
kernel can livelock for two hours because of a large number of
concurrent reclaimers stepping on each other. He might be able to
share that test with you in case you are interested.

Also it's Hugh who first noticed that migration can isolate many pages
and in turn block page reclaim. He might be able to help too, in case
you are interested in the interaction between migration and page
reclaim.

Thanks.
Tim Chen April 22, 2021, 9:02 p.m. UTC | #10
On 4/22/21 1:57 PM, Yu Zhao wrote:
> On Thu, Apr 22, 2021 at 2:38 PM Tim Chen <tim.c.chen@linux.intel.com> wrote:
>>
>>
>>
>> On 4/22/21 1:30 PM, Yu Zhao wrote:
>>>
>>> HZ/10 is purely arbitrary but that's ok because we assume normally
>>> nobody hits it. If you do often, we need to figure out why and how not
>>> to hit it so often.
>>>
>>
>> Perhaps Zhengjun can test the proposed fix in his test case to see if the timeout value
>> makes any difference.
> 
> Shakeel has another test to stress page reclaim to a point that the
> kernel can livelock for two hours because of a large number of
> concurrent reclaimers stepping on each other. He might be able to
> share that test with you in case you are interested.

That will be great.  Yes, we are interested to have the test.

Tim

> 
> Also it's Hugh who first noticed that migration can isolate many pages
> and in turn block page reclaim. He might be able to help too, in case
> you are interested in the interaction between migration and page
> reclaim.
> 
> Thanks.
>
Xing Zhengjun April 23, 2021, 6:55 a.m. UTC | #11
On 4/22/2021 6:23 PM, Hillf Danton wrote:
> Another option seems like we take a nap at the second time of lru tmi
> with some allocators in your case served without the 100ms delay.

Thanks, I will try it with my test cases.
Xing Zhengjun April 23, 2021, 6:57 a.m. UTC | #12
On 4/23/2021 1:13 AM, Yu Zhao wrote:
> On Thu, Apr 22, 2021 at 04:36:19PM +0800, Xing Zhengjun wrote:
>> Hi,
>>
>>     In the system with very few file pages (nr_active_file + nr_inactive_file
>> < 100), it is easy to reproduce "nr_isolated_file > nr_inactive_file",  then
>> too_many_isolated return true, shrink_inactive_list enter "msleep(100)", the
>> long latency will happen.
>>
>> The test case to reproduce it is very simple: allocate many huge pages(near
>> the DRAM size), then do free, repeat the same operation many times.
>> In the test case, the system with very few file pages (nr_active_file +
>> nr_inactive_file < 100), I have dumpped the numbers of
>> active/inactive/isolated file pages during the whole test(see in the
>> attachments) , in shrink_inactive_list "too_many_isolated" is very easy to
>> return true, then enter "msleep(100)",in "too_many_isolated" sc->gfp_mask is
>> 0x342cca ("_GFP_IO" and "__GFP_FS" is masked) , it is also very easy to
>> enter “inactive >>=3”, then “isolated > inactive” will be true.
>>
>> So I  have a proposal to set a threshold number for the total file pages to
>> ignore the system with very few file pages, and then bypass the 100ms sleep.
>> It is hard to set a perfect number for the threshold, so I just give an
>> example of "256" for it.
>>
>> I appreciate it if you can give me your suggestion/comments. Thanks.
> 
> Hi Zhengjun,
> 
> It seems to me using the number of isolated pages to keep a lid on
> direct reclaimers is not a good solution. We shouldn't keep going
> that direction if we really want to fix the problem because migration
> can isolate many pages too, which in turn blocks page reclaim.
> 
> Here is something works a lot better. Please give it a try. Thanks.

Thanks, I will try it with my test cases.

> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 507d216610bf2..9a09f7e76f6b8 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -951,6 +951,8 @@ typedef struct pglist_data {
>   
>   	/* Fields commonly accessed by the page reclaim scanner */
>   
> +	atomic_t nr_reclaimers;
> +
>   	/*
>   	 * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
>   	 *
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 1c080fafec396..f7278642290a6 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1786,43 +1786,6 @@ int isolate_lru_page(struct page *page)
>   	return ret;
>   }
>   
> -/*
> - * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
> - * then get rescheduled. When there are massive number of tasks doing page
> - * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
> - * the LRU list will go small and be scanned faster than necessary, leading to
> - * unnecessary swapping, thrashing and OOM.
> - */
> -static int too_many_isolated(struct pglist_data *pgdat, int file,
> -		struct scan_control *sc)
> -{
> -	unsigned long inactive, isolated;
> -
> -	if (current_is_kswapd())
> -		return 0;
> -
> -	if (!writeback_throttling_sane(sc))
> -		return 0;
> -
> -	if (file) {
> -		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
> -		isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
> -	} else {
> -		inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
> -		isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
> -	}
> -
> -	/*
> -	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
> -	 * won't get blocked by normal direct-reclaimers, forming a circular
> -	 * deadlock.
> -	 */
> -	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
> -		inactive >>= 3;
> -
> -	return isolated > inactive;
> -}
> -
>   /*
>    * move_pages_to_lru() moves pages from private @list to appropriate LRU list.
>    * On return, @list is reused as a list of pages to be freed by the caller.
> @@ -1924,19 +1887,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
>   	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>   	bool stalled = false;
>   
> -	while (unlikely(too_many_isolated(pgdat, file, sc))) {
> -		if (stalled)
> -			return 0;
> -
> -		/* wait a bit for the reclaimer. */
> -		msleep(100);
> -		stalled = true;
> -
> -		/* We are about to die and free our memory. Return now. */
> -		if (fatal_signal_pending(current))
> -			return SWAP_CLUSTER_MAX;
> -	}
> -
>   	lru_add_drain();
>   
>   	spin_lock_irq(&lruvec->lru_lock);
> @@ -3302,6 +3252,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
>   unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>   				gfp_t gfp_mask, nodemask_t *nodemask)
>   {
> +	int nr_cpus;
>   	unsigned long nr_reclaimed;
>   	struct scan_control sc = {
>   		.nr_to_reclaim = SWAP_CLUSTER_MAX,
> @@ -3334,8 +3285,17 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>   	set_task_reclaim_state(current, &sc.reclaim_state);
>   	trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
>   
> +	nr_cpus = current_is_kswapd() ? 0 : num_online_cpus();
> +	while (nr_cpus && !atomic_add_unless(&pgdat->nr_reclaimers, 1, nr_cpus)) {
> +		if (schedule_timeout_killable(HZ / 10))
> +			return SWAP_CLUSTER_MAX;
> +	}
> +
>   	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
>   
> +	if (nr_cpus)
> +		atomic_dec(&pgdat->nr_reclaimers);
> +
>   	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
>   	set_task_reclaim_state(current, NULL);
>
Yu Zhao April 23, 2021, 8:23 p.m. UTC | #13
On Fri, Apr 23, 2021 at 02:57:07PM +0800, Xing Zhengjun wrote:
> On 4/23/2021 1:13 AM, Yu Zhao wrote:
> > On Thu, Apr 22, 2021 at 04:36:19PM +0800, Xing Zhengjun wrote:
> > > Hi,
> > > 
> > >     In the system with very few file pages (nr_active_file + nr_inactive_file
> > > < 100), it is easy to reproduce "nr_isolated_file > nr_inactive_file",  then
> > > too_many_isolated return true, shrink_inactive_list enter "msleep(100)", the
> > > long latency will happen.
> > > 
> > > The test case to reproduce it is very simple: allocate many huge pages(near
> > > the DRAM size), then do free, repeat the same operation many times.
> > > In the test case, the system with very few file pages (nr_active_file +
> > > nr_inactive_file < 100), I have dumpped the numbers of
> > > active/inactive/isolated file pages during the whole test(see in the
> > > attachments) , in shrink_inactive_list "too_many_isolated" is very easy to
> > > return true, then enter "msleep(100)",in "too_many_isolated" sc->gfp_mask is
> > > 0x342cca ("_GFP_IO" and "__GFP_FS" is masked) , it is also very easy to
> > > enter “inactive >>=3”, then “isolated > inactive” will be true.
> > > 
> > > So I  have a proposal to set a threshold number for the total file pages to
> > > ignore the system with very few file pages, and then bypass the 100ms sleep.
> > > It is hard to set a perfect number for the threshold, so I just give an
> > > example of "256" for it.
> > > 
> > > I appreciate it if you can give me your suggestion/comments. Thanks.
> > 
> > Hi Zhengjun,
> > 
> > It seems to me using the number of isolated pages to keep a lid on
> > direct reclaimers is not a good solution. We shouldn't keep going
> > that direction if we really want to fix the problem because migration
> > can isolate many pages too, which in turn blocks page reclaim.
> > 
> > Here is something works a lot better. Please give it a try. Thanks.
> 
> Thanks, I will try it with my test cases.

Thanks. I took care my sloppiness from yesterday and tested the
following. It should apply cleanly and work well. Please let me know.

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 47946cec7584..48bb2b77389e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -832,6 +832,7 @@ typedef struct pglist_data {
 #endif
 
 	/* Fields commonly accessed by the page reclaim scanner */
+	atomic_t		nr_reclaimers;
 
 	/*
 	 * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 562e87cbd7a1..3fcdfbee89c7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1775,43 +1775,6 @@ int isolate_lru_page(struct page *page)
 	return ret;
 }
 
-/*
- * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
- * then get rescheduled. When there are massive number of tasks doing page
- * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
- * the LRU list will go small and be scanned faster than necessary, leading to
- * unnecessary swapping, thrashing and OOM.
- */
-static int too_many_isolated(struct pglist_data *pgdat, int file,
-		struct scan_control *sc)
-{
-	unsigned long inactive, isolated;
-
-	if (current_is_kswapd())
-		return 0;
-
-	if (!writeback_throttling_sane(sc))
-		return 0;
-
-	if (file) {
-		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
-		isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
-	} else {
-		inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
-		isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
-	}
-
-	/*
-	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
-	 * won't get blocked by normal direct-reclaimers, forming a circular
-	 * deadlock.
-	 */
-	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
-		inactive >>= 3;
-
-	return isolated > inactive;
-}
-
 /*
  * move_pages_to_lru() moves pages from private @list to appropriate LRU list.
  * On return, @list is reused as a list of pages to be freed by the caller.
@@ -1911,20 +1874,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	bool file = is_file_lru(lru);
 	enum vm_event_item item;
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-	bool stalled = false;
-
-	while (unlikely(too_many_isolated(pgdat, file, sc))) {
-		if (stalled)
-			return 0;
-
-		/* wait a bit for the reclaimer. */
-		msleep(100);
-		stalled = true;
-
-		/* We are about to die and free our memory. Return now. */
-		if (fatal_signal_pending(current))
-			return SWAP_CLUSTER_MAX;
-	}
 
 	lru_add_drain();
 
@@ -2903,6 +2852,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 	unsigned long nr_soft_scanned;
 	gfp_t orig_mask;
 	pg_data_t *last_pgdat = NULL;
+	bool should_retry = false;
+	int nr_cpus = num_online_cpus();
 
 	/*
 	 * If the number of buffer_heads in the machine exceeds the maximum
@@ -2914,9 +2865,18 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 		sc->gfp_mask |= __GFP_HIGHMEM;
 		sc->reclaim_idx = gfp_zone(sc->gfp_mask);
 	}
-
+retry:
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 					sc->reclaim_idx, sc->nodemask) {
+		/*
+		 * Shrink each node in the zonelist once. If the zonelist is
+		 * ordered by zone (not the default) then a node may be shrunk
+		 * multiple times but in that case the user prefers lower zones
+		 * being preserved.
+		 */
+		if (zone->zone_pgdat == last_pgdat)
+			continue;
+
 		/*
 		 * Take care memory controller reclaiming has small influence
 		 * to global LRU.
@@ -2941,16 +2901,28 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 				sc->compaction_ready = true;
 				continue;
 			}
+		}
 
-			/*
-			 * Shrink each node in the zonelist once. If the
-			 * zonelist is ordered by zone (not the default) then a
-			 * node may be shrunk multiple times but in that case
-			 * the user prefers lower zones being preserved.
-			 */
-			if (zone->zone_pgdat == last_pgdat)
-				continue;
+		/*
+		 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from
+		 * the LRU list and then get rescheduled. When there are massive
+		 * number of tasks doing page allocation, such sleeping direct
+		 * reclaimers may keep piling up on each CPU, the LRU list will
+		 * go small and be scanned faster than necessary, leading to
+		 * unnecessary swapping, thrashing and OOM.
+		 */
+		VM_BUG_ON(current_is_kswapd());
 
+		if (!atomic_add_unless(&zone->zone_pgdat->nr_reclaimers, 1, nr_cpus)) {
+			should_retry = true;
+			continue;
+		}
+
+		if (last_pgdat)
+			atomic_dec(&last_pgdat->nr_reclaimers);
+		last_pgdat = zone->zone_pgdat;
+
+		if (!cgroup_reclaim(sc)) {
 			/*
 			 * This steals pages from memory cgroups over softlimit
 			 * and returns the number of reclaimed pages and
@@ -2966,13 +2938,20 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 			/* need some check for avoid more shrink_zone() */
 		}
 
-		/* See comment about same check for global reclaim above */
-		if (zone->zone_pgdat == last_pgdat)
-			continue;
-		last_pgdat = zone->zone_pgdat;
 		shrink_node(zone->zone_pgdat, sc);
 	}
 
+	if (last_pgdat)
+		atomic_dec(&last_pgdat->nr_reclaimers);
+	else if (should_retry) {
+		/* wait a bit for the reclaimer. */
+		if (!schedule_timeout_killable(HZ / 10))
+			goto retry;
+
+		/* We are about to die and free our memory. Return now. */
+		sc->nr_reclaimed += SWAP_CLUSTER_MAX;
+	}
+
 	/*
 	 * Restore to original mask to avoid the impact on the caller if we
 	 * promoted it to __GFP_HIGHMEM.
@@ -4189,6 +4168,15 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 	set_task_reclaim_state(p, &sc.reclaim_state);
 
 	if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
+		int nr_cpus = num_online_cpus();
+
+		VM_BUG_ON(current_is_kswapd());
+
+		if (!atomic_add_unless(&pgdat->nr_reclaimers, 1, nr_cpus)) {
+			schedule_timeout_killable(HZ / 10);
+			goto out;
+		}
+
 		/*
 		 * Free memory by calling shrink node with increasing
 		 * priorities until we have enough memory freed.
@@ -4196,8 +4184,10 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 		do {
 			shrink_node(pgdat, &sc);
 		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
-	}
 
+		atomic_dec(&pgdat->nr_reclaimers);
+	}
+out:
 	set_task_reclaim_state(p, NULL);
 	current->flags &= ~PF_SWAPWRITE;
 	memalloc_noreclaim_restore(noreclaim_flag);
Huang, Ying April 25, 2021, 12:48 a.m. UTC | #14
Yu Zhao <yuzhao@google.com> writes:
[snip]

> @@ -2966,13 +2938,20 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
>  			/* need some check for avoid more shrink_zone() */
>  		}
>  
> -		/* See comment about same check for global reclaim above */
> -		if (zone->zone_pgdat == last_pgdat)
> -			continue;
> -		last_pgdat = zone->zone_pgdat;
>  		shrink_node(zone->zone_pgdat, sc);
>  	}
>  
> +	if (last_pgdat)
> +		atomic_dec(&last_pgdat->nr_reclaimers);
> +	else if (should_retry) {
> +		/* wait a bit for the reclaimer. */
> +		if (!schedule_timeout_killable(HZ / 10))

Once we reach here, even accidentally, the caller needs to sleep at
least 100ms.  How about using a semaphore for pgdat->nr_reclaimers?  Then
the sleeper can be woken up when the resource is considered sufficient.

Best Regards,
Huang, Ying

> +			goto retry;
> +
> +		/* We are about to die and free our memory. Return now. */
> +		sc->nr_reclaimed += SWAP_CLUSTER_MAX;
> +	}
> +
>  	/*
>  	 * Restore to original mask to avoid the impact on the caller if we
>  	 * promoted it to __GFP_HIGHMEM.

[snip]
Yu Zhao April 27, 2021, 9:53 p.m. UTC | #15
On Sat, Apr 24, 2021 at 6:48 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Yu Zhao <yuzhao@google.com> writes:
> [snip]
>
> > @@ -2966,13 +2938,20 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> >                       /* need some check for avoid more shrink_zone() */
> >               }
> >
> > -             /* See comment about same check for global reclaim above */
> > -             if (zone->zone_pgdat == last_pgdat)
> > -                     continue;
> > -             last_pgdat = zone->zone_pgdat;
> >               shrink_node(zone->zone_pgdat, sc);
> >       }
> >
> > +     if (last_pgdat)
> > +             atomic_dec(&last_pgdat->nr_reclaimers);
> > +     else if (should_retry) {
> > +             /* wait a bit for the reclaimer. */
> > +             if (!schedule_timeout_killable(HZ / 10))
>
> Once we reached here, even accidentally, the caller needs to sleep at
> least 100ms.  How about use a semaphore for pgdat->nr_reclaimers?  Then
> the sleeper can be waken up when the resource is considered enough.

Yeah, that sounds good to me. I guess we will have to wait and see the
test result from Zhengjun.

Thanks.
Michal Hocko April 28, 2021, 11:55 a.m. UTC | #16
[Cc Rik and Andrea]

On Thu 22-04-21 11:13:34, Yu Zhao wrote:
> On Thu, Apr 22, 2021 at 04:36:19PM +0800, Xing Zhengjun wrote:
> > Hi,
> > 
> >    In the system with very few file pages (nr_active_file + nr_inactive_file
> > < 100), it is easy to reproduce "nr_isolated_file > nr_inactive_file",  then
> > too_many_isolated return true, shrink_inactive_list enter "msleep(100)", the
> > long latency will happen.
> > 
> > The test case to reproduce it is very simple: allocate many huge pages(near
> > the DRAM size), then do free, repeat the same operation many times.
> > In the test case, the system with very few file pages (nr_active_file +
> > nr_inactive_file < 100), I have dumpped the numbers of
> > active/inactive/isolated file pages during the whole test(see in the
> > attachments) , in shrink_inactive_list "too_many_isolated" is very easy to
> > return true, then enter "msleep(100)",in "too_many_isolated" sc->gfp_mask is
> > 0x342cca ("_GFP_IO" and "__GFP_FS" is masked) , it is also very easy to
> > enter “inactive >>=3”, then “isolated > inactive” will be true.
> > 
> > So I  have a proposal to set a threshold number for the total file pages to
> > ignore the system with very few file pages, and then bypass the 100ms sleep.
> > It is hard to set a perfect number for the threshold, so I just give an
> > example of "256" for it.
> > 
> > I appreciate it if you can give me your suggestion/comments. Thanks.
> 
> Hi Zhengjun,
> 
> It seems to me using the number of isolated pages to keep a lid on
> direct reclaimers is not a good solution. We shouldn't keep going
> that direction if we really want to fix the problem because migration
> can isolate many pages too, which in turn blocks page reclaim.
> 
> Here is something works a lot better. Please give it a try. Thanks.

I do have a very vague recollection that the number of reclaimers used to be
a criterion in very old days and it has proven to be quite bad in the
end. I am sorry but I do not have a reference at hand and do not have
time to crawl git history. Maybe Rik/Andrea will remember the details.

The existing throttling mechanism is quite far from optimal but it aims
at handling close to OOM situations where effectivelly a large part of
the existing LRUs can be already isolated. We already have a retry
logic which is LRU aware in the page allocator
(should_reclaim_retry). The logic would have to be extended but that
sounds like a better fit for the back off to me.

> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 507d216610bf2..9a09f7e76f6b8 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -951,6 +951,8 @@ typedef struct pglist_data {
>  
>  	/* Fields commonly accessed by the page reclaim scanner */
>  
> +	atomic_t nr_reclaimers;
> +
>  	/*
>  	 * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
>  	 *
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 1c080fafec396..f7278642290a6 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1786,43 +1786,6 @@ int isolate_lru_page(struct page *page)
>  	return ret;
>  }
>  
> -/*
> - * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
> - * then get rescheduled. When there are massive number of tasks doing page
> - * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
> - * the LRU list will go small and be scanned faster than necessary, leading to
> - * unnecessary swapping, thrashing and OOM.
> - */
> -static int too_many_isolated(struct pglist_data *pgdat, int file,
> -		struct scan_control *sc)
> -{
> -	unsigned long inactive, isolated;
> -
> -	if (current_is_kswapd())
> -		return 0;
> -
> -	if (!writeback_throttling_sane(sc))
> -		return 0;
> -
> -	if (file) {
> -		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
> -		isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
> -	} else {
> -		inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
> -		isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
> -	}
> -
> -	/*
> -	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
> -	 * won't get blocked by normal direct-reclaimers, forming a circular
> -	 * deadlock.
> -	 */
> -	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
> -		inactive >>= 3;
> -
> -	return isolated > inactive;
> -}
> -
>  /*
>   * move_pages_to_lru() moves pages from private @list to appropriate LRU list.
>   * On return, @list is reused as a list of pages to be freed by the caller.
> @@ -1924,19 +1887,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
>  	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>  	bool stalled = false;
>  
> -	while (unlikely(too_many_isolated(pgdat, file, sc))) {
> -		if (stalled)
> -			return 0;
> -
> -		/* wait a bit for the reclaimer. */
> -		msleep(100);
> -		stalled = true;
> -
> -		/* We are about to die and free our memory. Return now. */
> -		if (fatal_signal_pending(current))
> -			return SWAP_CLUSTER_MAX;
> -	}
> -
>  	lru_add_drain();
>  
>  	spin_lock_irq(&lruvec->lru_lock);
> @@ -3302,6 +3252,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
>  unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>  				gfp_t gfp_mask, nodemask_t *nodemask)
>  {
> +	int nr_cpus;
>  	unsigned long nr_reclaimed;
>  	struct scan_control sc = {
>  		.nr_to_reclaim = SWAP_CLUSTER_MAX,
> @@ -3334,8 +3285,17 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>  	set_task_reclaim_state(current, &sc.reclaim_state);
>  	trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
>  
> +	nr_cpus = current_is_kswapd() ? 0 : num_online_cpus();
> +	while (nr_cpus && !atomic_add_unless(&pgdat->nr_reclaimers, 1, nr_cpus)) {
> +		if (schedule_timeout_killable(HZ / 10))
> +			return SWAP_CLUSTER_MAX;
> +	}
> +
>  	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
>  
> +	if (nr_cpus)
> +		atomic_dec(&pgdat->nr_reclaimers);
> +
>  	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
>  	set_task_reclaim_state(current, NULL);

This will surely break any memcg direct reclaim.
Yu Zhao April 28, 2021, 3:05 p.m. UTC | #17
On Wed, Apr 28, 2021 at 5:55 AM Michal Hocko <mhocko@suse.com> wrote:
>
> [Cc Rik and Andrea]
>
> On Thu 22-04-21 11:13:34, Yu Zhao wrote:
> > On Thu, Apr 22, 2021 at 04:36:19PM +0800, Xing Zhengjun wrote:
> > > Hi,
> > >
> > >    In the system with very few file pages (nr_active_file + nr_inactive_file
> > > < 100), it is easy to reproduce "nr_isolated_file > nr_inactive_file",  then
> > > too_many_isolated return true, shrink_inactive_list enter "msleep(100)", the
> > > long latency will happen.
> > >
> > > The test case to reproduce it is very simple: allocate many huge pages(near
> > > the DRAM size), then do free, repeat the same operation many times.
> > > In the test case, the system with very few file pages (nr_active_file +
> > > nr_inactive_file < 100), I have dumpped the numbers of
> > > active/inactive/isolated file pages during the whole test(see in the
> > > attachments) , in shrink_inactive_list "too_many_isolated" is very easy to
> > > return true, then enter "msleep(100)",in "too_many_isolated" sc->gfp_mask is
> > > 0x342cca ("_GFP_IO" and "__GFP_FS" is masked) , it is also very easy to
> > > enter “inactive >>=3”, then “isolated > inactive” will be true.
> > >
> > > So I  have a proposal to set a threshold number for the total file pages to
> > > ignore the system with very few file pages, and then bypass the 100ms sleep.
> > > It is hard to set a perfect number for the threshold, so I just give an
> > > example of "256" for it.
> > >
> > > I appreciate it if you can give me your suggestion/comments. Thanks.
> >
> > Hi Zhengjun,
> >
> > It seems to me using the number of isolated pages to keep a lid on
> > direct reclaimers is not a good solution. We shouldn't keep going
> > that direction if we really want to fix the problem because migration
> > can isolate many pages too, which in turn blocks page reclaim.
> >
> > Here is something works a lot better. Please give it a try. Thanks.
>
> O do have a very vague recollection that number of reclaimers used to be
> a criterion in very old days and it has proven to be quite bad in the
> end. I am sorry but I do not have an reference at hands and do not have
> time to crawl git history. Maybe Rik/Andrea will remember details.

Well, I found nothing.

> The existing throttling mechanism is quite far from optimal but it aims
> at handling close to OOM situations where effectivelly a large part of
> the existing LRUs can be already isolated. We already have a retry
> logic which is LRU aware in the page allocator
> (should_reclaim_retry). The logic would have to be extended but that
> sounds like a better fit for the back off to me.
>
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index 507d216610bf2..9a09f7e76f6b8 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -951,6 +951,8 @@ typedef struct pglist_data {
> >
> >       /* Fields commonly accessed by the page reclaim scanner */
> >
> > +     atomic_t nr_reclaimers;
> > +
> >       /*
> >        * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
> >        *
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 1c080fafec396..f7278642290a6 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -1786,43 +1786,6 @@ int isolate_lru_page(struct page *page)
> >       return ret;
> >  }
> >
> > -/*
> > - * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
> > - * then get rescheduled. When there are massive number of tasks doing page
> > - * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
> > - * the LRU list will go small and be scanned faster than necessary, leading to
> > - * unnecessary swapping, thrashing and OOM.
> > - */
> > -static int too_many_isolated(struct pglist_data *pgdat, int file,
> > -             struct scan_control *sc)
> > -{
> > -     unsigned long inactive, isolated;
> > -
> > -     if (current_is_kswapd())
> > -             return 0;
> > -
> > -     if (!writeback_throttling_sane(sc))
> > -             return 0;
> > -
> > -     if (file) {
> > -             inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
> > -             isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
> > -     } else {
> > -             inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
> > -             isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
> > -     }
> > -
> > -     /*
> > -      * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
> > -      * won't get blocked by normal direct-reclaimers, forming a circular
> > -      * deadlock.
> > -      */
> > -     if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
> > -             inactive >>= 3;
> > -
> > -     return isolated > inactive;
> > -}
> > -
> >  /*
> >   * move_pages_to_lru() moves pages from private @list to appropriate LRU list.
> >   * On return, @list is reused as a list of pages to be freed by the caller.
> > @@ -1924,19 +1887,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
> >       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> >       bool stalled = false;
> >
> > -     while (unlikely(too_many_isolated(pgdat, file, sc))) {
> > -             if (stalled)
> > -                     return 0;
> > -
> > -             /* wait a bit for the reclaimer. */
> > -             msleep(100);
> > -             stalled = true;
> > -
> > -             /* We are about to die and free our memory. Return now. */
> > -             if (fatal_signal_pending(current))
> > -                     return SWAP_CLUSTER_MAX;
> > -     }
> > -
> >       lru_add_drain();
> >
> >       spin_lock_irq(&lruvec->lru_lock);
> > @@ -3302,6 +3252,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
> >  unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
> >                               gfp_t gfp_mask, nodemask_t *nodemask)
> >  {
> > +     int nr_cpus;
> >       unsigned long nr_reclaimed;
> >       struct scan_control sc = {
> >               .nr_to_reclaim = SWAP_CLUSTER_MAX,
> > @@ -3334,8 +3285,17 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
> >       set_task_reclaim_state(current, &sc.reclaim_state);
> >       trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
> >
> > +     nr_cpus = current_is_kswapd() ? 0 : num_online_cpus();
> > +     while (nr_cpus && !atomic_add_unless(&pgdat->nr_reclaimers, 1, nr_cpus)) {
> > +             if (schedule_timeout_killable(HZ / 10))
> > +                     return SWAP_CLUSTER_MAX;
> > +     }
> > +
> >       nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
> >
> > +     if (nr_cpus)
> > +             atomic_dec(&pgdat->nr_reclaimers);
> > +
> >       trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
> >       set_task_reclaim_state(current, NULL);
>
> This will surely break any memcg direct reclaim.

Mind elaborating how it will "surely" break any memcg direct reclaim?
Michal Hocko April 29, 2021, 10 a.m. UTC | #18
On Wed 28-04-21 09:05:06, Yu Zhao wrote:
> On Wed, Apr 28, 2021 at 5:55 AM Michal Hocko <mhocko@suse.com> wrote:
[...]
> > > @@ -3334,8 +3285,17 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
> > >       set_task_reclaim_state(current, &sc.reclaim_state);
> > >       trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
> > >
> > > +     nr_cpus = current_is_kswapd() ? 0 : num_online_cpus();
> > > +     while (nr_cpus && !atomic_add_unless(&pgdat->nr_reclaimers, 1, nr_cpus)) {
> > > +             if (schedule_timeout_killable(HZ / 10))
> > > +                     return SWAP_CLUSTER_MAX;
> > > +     }
> > > +
> > >       nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
> > >
> > > +     if (nr_cpus)
> > > +             atomic_dec(&pgdat->nr_reclaimers);
> > > +
> > >       trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
> > >       set_task_reclaim_state(current, NULL);
> >
> > This will surely break any memcg direct reclaim.
> 
> Mind elaborating how it will "surely" break any memcg direct reclaim?

I was wrong here. I thought this was done in a common path for all direct
reclaimers (I likely mixed up try_to_free_pages with do_try_to_free_pages).
Sorry about the confusion.

Still, I do not think that the above heuristic will work properly.
Different reclaimers have a different reclaim target (e.g. lower zones
and/or numa node mask) and strength (e.g.  GFP_NOFS vs. GFP_KERNEL). A
simple count-based throttling would be prone to different sorts of
priority inversions.
Xing Zhengjun April 30, 2021, 5:33 a.m. UTC | #19
Hi Hillf,

On 4/22/2021 6:23 PM, Hillf Danton wrote:
> Hi Zhengjun
> 
> On Thu, 22 Apr 2021 16:36:19 +0800 Zhengjun Xing wrote:
>>      In the system with very few file pages (nr_active_file +
>> nr_inactive_file < 100), it is easy to reproduce "nr_isolated_file >
>> nr_inactive_file",  then too_many_isolated return true,
>> shrink_inactive_list enter "msleep(100)", the long latency will happen.
> 
> We should skip reclaiming page cache in this case.
>>
>> The test case to reproduce it is very simple: allocate many huge
>> pages(near the DRAM size), then do free, repeat the same operation many
>> times.
>> In the test case, the system with very few file pages (nr_active_file +
>> nr_inactive_file < 100), I have dumpped the numbers of
>> active/inactive/isolated file pages during the whole test(see in the
>> attachments) , in shrink_inactive_list "too_many_isolated" is very easy
>> to return true, then enter "msleep(100)",in "too_many_isolated"
>> sc->gfp_mask is 0x342cca ("_GFP_IO" and "__GFP_FS" is masked) , it is
>> also very easy to enter “inactive >>=3”, then “isolated > inactive” will
>> be true.
>>
>> So I  have a proposal to set a threshold number for the total file pages
>> to ignore the system with very few file pages, and then bypass the 100ms
>> sleep.
>> It is hard to set a perfect number for the threshold, so I just give an
>> example of "256" for it.
> 
> Another option seems like we take a nap at the second time of lru tmi
> with some allocators in your case served without the 100ms delay.
> 
> +++ x/mm/vmscan.c
> @@ -118,6 +118,9 @@ struct scan_control {
>   	/* The file pages on the current node are dangerously low */
>   	unsigned int file_is_tiny:1;
>   
> +	unsigned int file_tmi:1; /* too many isolated */
> +	unsigned int anon_tmi:1;
> +
>   	/* Allocation order */
>   	s8 order;
>   
> @@ -1905,6 +1908,21 @@ static int current_may_throttle(void)
>   		bdi_write_congested(current->backing_dev_info);
>   }
>   
> +static void update_sc_tmi(struct scan_control *sc, bool file, int set)
> +{
> +	if (file)
> +		sc->file_tmi = set;
> +	else
> +		sc->anon_tmi = set;
> +}
> +static bool is_sc_tmi(struct scan_control *sc, bool file)
> +{
> +	if (file)
> +		return sc->file_tmi != 0;
> +	else
> +		return sc->anon_tmi != 0;
> +}
> +
>   /*
>    * shrink_inactive_list() is a helper for shrink_node().  It returns the number
>    * of reclaimed pages
> @@ -1927,6 +1945,11 @@ shrink_inactive_list(unsigned long nr_to
>   		if (stalled)
>   			return 0;
>   
> +		if (!is_sc_tmi(sc, file)) {
> +			update_sc_tmi(sc, file, 1);
> +			return 0;
> +		}
> +
>   		/* wait a bit for the reclaimer. */
>   		msleep(100);
>   		stalled = true;
> @@ -1936,6 +1959,9 @@ shrink_inactive_list(unsigned long nr_to
>   			return SWAP_CLUSTER_MAX;
>   	}
>   
> +	if (is_sc_tmi(sc, file))
> +		update_sc_tmi(sc, file, 0);
> +
>   	lru_add_drain();
>   
>   	spin_lock_irq(&lruvec->lru_lock);
> 

I used my compaction test case to test it; the 100ms sleep can still be
reproduced at a 1/10 rate.

  60) @ 103942.6 us |      shrink_node();

  60) @ 103795.8 us |      shrink_node();
Xing Zhengjun April 30, 2021, 5:57 a.m. UTC | #20
Hi Yu,

On 4/24/2021 4:23 AM, Yu Zhao wrote:
> On Fri, Apr 23, 2021 at 02:57:07PM +0800, Xing Zhengjun wrote:
>> On 4/23/2021 1:13 AM, Yu Zhao wrote:
>>> On Thu, Apr 22, 2021 at 04:36:19PM +0800, Xing Zhengjun wrote:
>>>> Hi,
>>>>
>>>>      In the system with very few file pages (nr_active_file + nr_inactive_file
>>>> < 100), it is easy to reproduce "nr_isolated_file > nr_inactive_file",  then
>>>> too_many_isolated return true, shrink_inactive_list enter "msleep(100)", the
>>>> long latency will happen.
>>>>
>>>> The test case to reproduce it is very simple: allocate many huge pages(near
>>>> the DRAM size), then do free, repeat the same operation many times.
>>>> In the test case, the system with very few file pages (nr_active_file +
>>>> nr_inactive_file < 100), I have dumpped the numbers of
>>>> active/inactive/isolated file pages during the whole test(see in the
>>>> attachments) , in shrink_inactive_list "too_many_isolated" is very easy to
>>>> return true, then enter "msleep(100)",in "too_many_isolated" sc->gfp_mask is
>>>> 0x342cca ("_GFP_IO" and "__GFP_FS" is masked) , it is also very easy to
>>>> enter “inactive >>=3”, then “isolated > inactive” will be true.
>>>>
>>>> So I  have a proposal to set a threshold number for the total file pages to
>>>> ignore the system with very few file pages, and then bypass the 100ms sleep.
>>>> It is hard to set a perfect number for the threshold, so I just give an
>>>> example of "256" for it.
>>>>
>>>> I appreciate it if you can give me your suggestion/comments. Thanks.
>>>
>>> Hi Zhengjun,
>>>
>>> It seems to me using the number of isolated pages to keep a lid on
>>> direct reclaimers is not a good solution. We shouldn't keep going
>>> that direction if we really want to fix the problem because migration
>>> can isolate many pages too, which in turn blocks page reclaim.
>>>
>>> Here is something works a lot better. Please give it a try. Thanks.
>>
>> Thanks, I will try it with my test cases.
> 
> Thanks. I took care my sloppiness from yesterday and tested the
> following. It should apply cleanly and work well. Please let me know.
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 47946cec7584..48bb2b77389e 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -832,6 +832,7 @@ typedef struct pglist_data {
>   #endif
>   
>   	/* Fields commonly accessed by the page reclaim scanner */
> +	atomic_t		nr_reclaimers;
>   
>   	/*
>   	 * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 562e87cbd7a1..3fcdfbee89c7 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1775,43 +1775,6 @@ int isolate_lru_page(struct page *page)
>   	return ret;
>   }
>   
> -/*
> - * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
> - * then get rescheduled. When there are massive number of tasks doing page
> - * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
> - * the LRU list will go small and be scanned faster than necessary, leading to
> - * unnecessary swapping, thrashing and OOM.
> - */
> -static int too_many_isolated(struct pglist_data *pgdat, int file,
> -		struct scan_control *sc)
> -{
> -	unsigned long inactive, isolated;
> -
> -	if (current_is_kswapd())
> -		return 0;
> -
> -	if (!writeback_throttling_sane(sc))
> -		return 0;
> -
> -	if (file) {
> -		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
> -		isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
> -	} else {
> -		inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
> -		isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
> -	}
> -
> -	/*
> -	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
> -	 * won't get blocked by normal direct-reclaimers, forming a circular
> -	 * deadlock.
> -	 */
> -	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
> -		inactive >>= 3;
> -
> -	return isolated > inactive;
> -}
> -
>   /*
>    * move_pages_to_lru() moves pages from private @list to appropriate LRU list.
>    * On return, @list is reused as a list of pages to be freed by the caller.
> @@ -1911,20 +1874,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
>   	bool file = is_file_lru(lru);
>   	enum vm_event_item item;
>   	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> -	bool stalled = false;
> -
> -	while (unlikely(too_many_isolated(pgdat, file, sc))) {
> -		if (stalled)
> -			return 0;
> -
> -		/* wait a bit for the reclaimer. */
> -		msleep(100);
> -		stalled = true;
> -
> -		/* We are about to die and free our memory. Return now. */
> -		if (fatal_signal_pending(current))
> -			return SWAP_CLUSTER_MAX;
> -	}
>   
>   	lru_add_drain();
>   
> @@ -2903,6 +2852,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
>   	unsigned long nr_soft_scanned;
>   	gfp_t orig_mask;
>   	pg_data_t *last_pgdat = NULL;
> +	bool should_retry = false;
> +	int nr_cpus = num_online_cpus();
>   
>   	/*
>   	 * If the number of buffer_heads in the machine exceeds the maximum
> @@ -2914,9 +2865,18 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
>   		sc->gfp_mask |= __GFP_HIGHMEM;
>   		sc->reclaim_idx = gfp_zone(sc->gfp_mask);
>   	}
> -
> +retry:
>   	for_each_zone_zonelist_nodemask(zone, z, zonelist,
>   					sc->reclaim_idx, sc->nodemask) {
> +		/*
> +		 * Shrink each node in the zonelist once. If the zonelist is
> +		 * ordered by zone (not the default) then a node may be shrunk
> +		 * multiple times but in that case the user prefers lower zones
> +		 * being preserved.
> +		 */
> +		if (zone->zone_pgdat == last_pgdat)
> +			continue;
> +
>   		/*
>   		 * Take care memory controller reclaiming has small influence
>   		 * to global LRU.
> @@ -2941,16 +2901,28 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
>   				sc->compaction_ready = true;
>   				continue;
>   			}
> +		}
>   
> -			/*
> -			 * Shrink each node in the zonelist once. If the
> -			 * zonelist is ordered by zone (not the default) then a
> -			 * node may be shrunk multiple times but in that case
> -			 * the user prefers lower zones being preserved.
> -			 */
> -			if (zone->zone_pgdat == last_pgdat)
> -				continue;
> +		/*
> +		 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from
> +		 * the LRU list and then get rescheduled. When there are massive
> +		 * number of tasks doing page allocation, such sleeping direct
> +		 * reclaimers may keep piling up on each CPU, the LRU list will
> +		 * go small and be scanned faster than necessary, leading to
> +		 * unnecessary swapping, thrashing and OOM.
> +		 */
> +		VM_BUG_ON(current_is_kswapd());
>   
> +		if (!atomic_add_unless(&zone->zone_pgdat->nr_reclaimers, 1, nr_cpus)) {
> +			should_retry = true;
> +			continue;
> +		}
> +
> +		if (last_pgdat)
> +			atomic_dec(&last_pgdat->nr_reclaimers);
> +		last_pgdat = zone->zone_pgdat;
> +
> +		if (!cgroup_reclaim(sc)) {
>   			/*
>   			 * This steals pages from memory cgroups over softlimit
>   			 * and returns the number of reclaimed pages and
> @@ -2966,13 +2938,20 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
>   			/* need some check for avoid more shrink_zone() */
>   		}
>   
> -		/* See comment about same check for global reclaim above */
> -		if (zone->zone_pgdat == last_pgdat)
> -			continue;
> -		last_pgdat = zone->zone_pgdat;
>   		shrink_node(zone->zone_pgdat, sc);
>   	}
>   
> +	if (last_pgdat)
> +		atomic_dec(&last_pgdat->nr_reclaimers);
> +	else if (should_retry) {
> +		/* wait a bit for the reclaimer. */
> +		if (!schedule_timeout_killable(HZ / 10))
> +			goto retry;
> +
> +		/* We are about to die and free our memory. Return now. */
> +		sc->nr_reclaimed += SWAP_CLUSTER_MAX;
> +	}
> +
>   	/*
>   	 * Restore to original mask to avoid the impact on the caller if we
>   	 * promoted it to __GFP_HIGHMEM.
> @@ -4189,6 +4168,15 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
>   	set_task_reclaim_state(p, &sc.reclaim_state);
>   
>   	if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
> +		int nr_cpus = num_online_cpus();
> +
> +		VM_BUG_ON(current_is_kswapd());
> +
> +		if (!atomic_add_unless(&pgdat->nr_reclaimers, 1, nr_cpus)) {
> +			schedule_timeout_killable(HZ / 10);
> +			goto out;
> +		}
> +
>   		/*
>   		 * Free memory by calling shrink node with increasing
>   		 * priorities until we have enough memory freed.
> @@ -4196,8 +4184,10 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
>   		do {
>   			shrink_node(pgdat, &sc);
>   		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
> -	}
>   
> +		atomic_dec(&pgdat->nr_reclaimers);
> +	}
> +out:
>   	set_task_reclaim_state(p, NULL);
>   	current->flags &= ~PF_SWAPWRITE;
>   	memalloc_noreclaim_restore(noreclaim_flag);
> 

I used my compaction test case to test it; after running it more than 30
times, I could not reproduce the 100ms sleep. I find that after applying
the patch, the direct reclaim path latency is reduced considerably, but the
direct compaction path latency doubles compared with before.

  24)               |  __alloc_pages_direct_compact() {
  24)               |    try_to_compact_pages() {
  24)   0.131 us    |      __next_zones_zonelist();
  24) @ 184008.2 us |      compact_zone_order();
  24)   0.189 us    |      __next_zones_zonelist();
  24)   0.547 us    |      compact_zone_order();
  24)   0.225 us    |      __next_zones_zonelist();
  24)   0.592 us    |      compact_zone_order();
  24)   0.146 us    |      __next_zones_zonelist();
  24) @ 184012.3 us |    }
  24)               |    get_page_from_freelist() {
  24)   0.160 us    |      __zone_watermark_ok();
  24)   0.140 us    |      __next_zones_zonelist();
  24)   0.141 us    |      __zone_watermark_ok();
  24)   0.134 us    |      __next_zones_zonelist();
  24)   0.121 us    |      __zone_watermark_ok();
  24)   0.123 us    |      __next_zones_zonelist();
  24)   1.688 us    |    }
  24)   0.130 us    |    ___might_sleep();
  24)               |    __cond_resched() {
  24)   0.123 us    |      rcu_all_qs();
  24)   0.370 us    |    }
  24) @ 184015.2 us |  }
  24)               |  /* mm_page_alloc: page=0000000000000000 pfn=0 
order=9 migratetype=1 
gfp_flags=GFP_HIGHUSER_MOVABLE|__GFP_NOWARN|__GFP_COMP|__GFP_THISNODE */
  24)               |  /* memlatency: lat=184716 order=9 
gfp_flags=342cca 
(GFP_HIGHUSER_MOVABLE|__GFP_NOWARN|__GFP_COMP|__GFP_THISNODE|0x812a3c6000000000)migratetype=1 
*/
/*The memlatency measures the latency of "__alloc_pages_nodemask" */
Yu Zhao April 30, 2021, 6:24 a.m. UTC | #21
On Thu, Apr 29, 2021 at 11:57 PM Xing Zhengjun
<zhengjun.xing@linux.intel.com> wrote:
>
> Hi Yu,
>
> On 4/24/2021 4:23 AM, Yu Zhao wrote:
> > On Fri, Apr 23, 2021 at 02:57:07PM +0800, Xing Zhengjun wrote:
> >> On 4/23/2021 1:13 AM, Yu Zhao wrote:
> >>> On Thu, Apr 22, 2021 at 04:36:19PM +0800, Xing Zhengjun wrote:
> >>>> Hi,
> >>>>
> >>>>      In the system with very few file pages (nr_active_file + nr_inactive_file
> >>>> < 100), it is easy to reproduce "nr_isolated_file > nr_inactive_file",  then
> >>>> too_many_isolated return true, shrink_inactive_list enter "msleep(100)", the
> >>>> long latency will happen.
> >>>>
> >>>> The test case to reproduce it is very simple: allocate many huge pages(near
> >>>> the DRAM size), then do free, repeat the same operation many times.
> >>>> In the test case, the system with very few file pages (nr_active_file +
> >>>> nr_inactive_file < 100), I have dumpped the numbers of
> >>>> active/inactive/isolated file pages during the whole test(see in the
> >>>> attachments) , in shrink_inactive_list "too_many_isolated" is very easy to
> >>>> return true, then enter "msleep(100)",in "too_many_isolated" sc->gfp_mask is
> >>>> 0x342cca ("_GFP_IO" and "__GFP_FS" is masked) , it is also very easy to
> >>>> enter “inactive >>=3”, then “isolated > inactive” will be true.
> >>>>
> >>>> So I  have a proposal to set a threshold number for the total file pages to
> >>>> ignore the system with very few file pages, and then bypass the 100ms sleep.
> >>>> It is hard to set a perfect number for the threshold, so I just give an
> >>>> example of "256" for it.
> >>>>
> >>>> I appreciate it if you can give me your suggestion/comments. Thanks.
> >>>
> >>> Hi Zhengjun,
> >>>
> >>> It seems to me using the number of isolated pages to keep a lid on
> >>> direct reclaimers is not a good solution. We shouldn't keep going
> >>> that direction if we really want to fix the problem because migration
> >>> can isolate many pages too, which in turn blocks page reclaim.
> >>>
> >>> Here is something works a lot better. Please give it a try. Thanks.
> >>
> >> Thanks, I will try it with my test cases.
> >
> > Thanks. I took care my sloppiness from yesterday and tested the
> > following. It should apply cleanly and work well. Please let me know.
> >
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index 47946cec7584..48bb2b77389e 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -832,6 +832,7 @@ typedef struct pglist_data {
> >   #endif
> >
> >       /* Fields commonly accessed by the page reclaim scanner */
> > +     atomic_t                nr_reclaimers;
> >
> >       /*
> >        * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 562e87cbd7a1..3fcdfbee89c7 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -1775,43 +1775,6 @@ int isolate_lru_page(struct page *page)
> >       return ret;
> >   }
> >
> > -/*
> > - * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
> > - * then get rescheduled. When there are massive number of tasks doing page
> > - * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
> > - * the LRU list will go small and be scanned faster than necessary, leading to
> > - * unnecessary swapping, thrashing and OOM.
> > - */
> > -static int too_many_isolated(struct pglist_data *pgdat, int file,
> > -             struct scan_control *sc)
> > -{
> > -     unsigned long inactive, isolated;
> > -
> > -     if (current_is_kswapd())
> > -             return 0;
> > -
> > -     if (!writeback_throttling_sane(sc))
> > -             return 0;
> > -
> > -     if (file) {
> > -             inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
> > -             isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
> > -     } else {
> > -             inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
> > -             isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
> > -     }
> > -
> > -     /*
> > -      * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
> > -      * won't get blocked by normal direct-reclaimers, forming a circular
> > -      * deadlock.
> > -      */
> > -     if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
> > -             inactive >>= 3;
> > -
> > -     return isolated > inactive;
> > -}
> > -
> >   /*
> >    * move_pages_to_lru() moves pages from private @list to appropriate LRU list.
> >    * On return, @list is reused as a list of pages to be freed by the caller.
> > @@ -1911,20 +1874,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
> >       bool file = is_file_lru(lru);
> >       enum vm_event_item item;
> >       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> > -     bool stalled = false;
> > -
> > -     while (unlikely(too_many_isolated(pgdat, file, sc))) {
> > -             if (stalled)
> > -                     return 0;
> > -
> > -             /* wait a bit for the reclaimer. */
> > -             msleep(100);
> > -             stalled = true;
> > -
> > -             /* We are about to die and free our memory. Return now. */
> > -             if (fatal_signal_pending(current))
> > -                     return SWAP_CLUSTER_MAX;
> > -     }
> >
> >       lru_add_drain();
> >
> > @@ -2903,6 +2852,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> >       unsigned long nr_soft_scanned;
> >       gfp_t orig_mask;
> >       pg_data_t *last_pgdat = NULL;
> > +     bool should_retry = false;
> > +     int nr_cpus = num_online_cpus();
> >
> >       /*
> >        * If the number of buffer_heads in the machine exceeds the maximum
> > @@ -2914,9 +2865,18 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> >               sc->gfp_mask |= __GFP_HIGHMEM;
> >               sc->reclaim_idx = gfp_zone(sc->gfp_mask);
> >       }
> > -
> > +retry:
> >       for_each_zone_zonelist_nodemask(zone, z, zonelist,
> >                                       sc->reclaim_idx, sc->nodemask) {
> > +             /*
> > +              * Shrink each node in the zonelist once. If the zonelist is
> > +              * ordered by zone (not the default) then a node may be shrunk
> > +              * multiple times but in that case the user prefers lower zones
> > +              * being preserved.
> > +              */
> > +             if (zone->zone_pgdat == last_pgdat)
> > +                     continue;
> > +
> >               /*
> >                * Take care memory controller reclaiming has small influence
> >                * to global LRU.
> > @@ -2941,16 +2901,28 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> >                               sc->compaction_ready = true;
> >                               continue;
> >                       }
> > +             }
> >
> > -                     /*
> > -                      * Shrink each node in the zonelist once. If the
> > -                      * zonelist is ordered by zone (not the default) then a
> > -                      * node may be shrunk multiple times but in that case
> > -                      * the user prefers lower zones being preserved.
> > -                      */
> > -                     if (zone->zone_pgdat == last_pgdat)
> > -                             continue;
> > +             /*
> > +              * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from
> > +              * the LRU list and then get rescheduled. When there are massive
> > +              * number of tasks doing page allocation, such sleeping direct
> > +              * reclaimers may keep piling up on each CPU, the LRU list will
> > +              * go small and be scanned faster than necessary, leading to
> > +              * unnecessary swapping, thrashing and OOM.
> > +              */
> > +             VM_BUG_ON(current_is_kswapd());
> >
> > +             if (!atomic_add_unless(&zone->zone_pgdat->nr_reclaimers, 1, nr_cpus)) {
> > +                     should_retry = true;
> > +                     continue;
> > +             }
> > +
> > +             if (last_pgdat)
> > +                     atomic_dec(&last_pgdat->nr_reclaimers);
> > +             last_pgdat = zone->zone_pgdat;
> > +
> > +             if (!cgroup_reclaim(sc)) {
> >                       /*
> >                        * This steals pages from memory cgroups over softlimit
> >                        * and returns the number of reclaimed pages and
> > @@ -2966,13 +2938,20 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> >                       /* need some check for avoid more shrink_zone() */
> >               }
> >
> > -             /* See comment about same check for global reclaim above */
> > -             if (zone->zone_pgdat == last_pgdat)
> > -                     continue;
> > -             last_pgdat = zone->zone_pgdat;
> >               shrink_node(zone->zone_pgdat, sc);
> >       }
> >
> > +     if (last_pgdat)
> > +             atomic_dec(&last_pgdat->nr_reclaimers);
> > +     else if (should_retry) {
> > +             /* wait a bit for the reclaimer. */
> > +             if (!schedule_timeout_killable(HZ / 10))
> > +                     goto retry;
> > +
> > +             /* We are about to die and free our memory. Return now. */
> > +             sc->nr_reclaimed += SWAP_CLUSTER_MAX;
> > +     }
> > +
> >       /*
> >        * Restore to original mask to avoid the impact on the caller if we
> >        * promoted it to __GFP_HIGHMEM.
> > @@ -4189,6 +4168,15 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
> >       set_task_reclaim_state(p, &sc.reclaim_state);
> >
> >       if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
> > +             int nr_cpus = num_online_cpus();
> > +
> > +             VM_BUG_ON(current_is_kswapd());
> > +
> > +             if (!atomic_add_unless(&pgdat->nr_reclaimers, 1, nr_cpus)) {
> > +                     schedule_timeout_killable(HZ / 10);
> > +                     goto out;
> > +             }
> > +
> >               /*
> >                * Free memory by calling shrink node with increasing
> >                * priorities until we have enough memory freed.
> > @@ -4196,8 +4184,10 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
> >               do {
> >                       shrink_node(pgdat, &sc);
> >               } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
> > -     }
> >
> > +             atomic_dec(&pgdat->nr_reclaimers);
> > +     }
> > +out:
> >       set_task_reclaim_state(p, NULL);
> >       current->flags &= ~PF_SWAPWRITE;
> >       memalloc_noreclaim_restore(noreclaim_flag);
> >
>
> I use my compaction test case to test it, test more than 30 times can
> not reproduce the 100ms sleep. I find that applies the patch, direct
> reclaim path latency reduces much, but the direct compact path latency
> double compares with it before.

Thanks for the update. I think both the speedup and the slowdown are
good news. Why?

My theory is that this patch speeds up the reclaim path, which takes
the lru lock. This in turn induces the lock contention and causes the
slowdown in the compaction path. The compaction path also takes the
lru lock: compact_zone_order() -> isolate_migratepages() ->
isolate_migratepages_block() -> mem_cgroup_page_lruvec().

This theory can be proven by checking whether the slowdown is in the path
above. Also, did you compare the following stats with and without this
patch?

$ grep compact /proc/vmstat
compact_migrate_scanned 5114098
compact_free_scanned 93606243
compact_isolated 3629513
compact_stall 66
compact_fail 61
compact_success 5
compact_daemon_wake 390
compact_daemon_migrate_scanned 288911
compact_daemon_free_scanned 9835224

You might see a higher number of isolated pages.

>   24)               |  __alloc_pages_direct_compact() {
>   24)               |    try_to_compact_pages() {
>   24)   0.131 us    |      __next_zones_zonelist();
>   24) @ 184008.2 us |      compact_zone_order();
>   24)   0.189 us    |      __next_zones_zonelist();
>   24)   0.547 us    |      compact_zone_order();
>   24)   0.225 us    |      __next_zones_zonelist();
>   24)   0.592 us    |      compact_zone_order();
>   24)   0.146 us    |      __next_zones_zonelist();
>   24) @ 184012.3 us |    }
>   24)               |    get_page_from_freelist() {
>   24)   0.160 us    |      __zone_watermark_ok();
>   24)   0.140 us    |      __next_zones_zonelist();
>   24)   0.141 us    |      __zone_watermark_ok();
>   24)   0.134 us    |      __next_zones_zonelist();
>   24)   0.121 us    |      __zone_watermark_ok();
>   24)   0.123 us    |      __next_zones_zonelist();
>   24)   1.688 us    |    }
>   24)   0.130 us    |    ___might_sleep();
>   24)               |    __cond_resched() {
>   24)   0.123 us    |      rcu_all_qs();
>   24)   0.370 us    |    }
>   24) @ 184015.2 us |  }
>   24)               |  /* mm_page_alloc: page=0000000000000000 pfn=0
> order=9 migratetype=1
> gfp_flags=GFP_HIGHUSER_MOVABLE|__GFP_NOWARN|__GFP_COMP|__GFP_THISNODE */
>   24)               |  /* memlatency: lat=184716 order=9
> gfp_flags=342cca
> (GFP_HIGHUSER_MOVABLE|__GFP_NOWARN|__GFP_COMP|__GFP_THISNODE|0x812a3c6000000000^@)migratetype=1
> */
> /*The memlatency measures the latency of "__alloc_pages_nodemask" */
>
>
> --
> Zhengjun Xing
Hillf Danton April 30, 2021, 6:43 a.m. UTC | #22
On Fri, 30 Apr 2021 13:33:57 +0800 Xing Zhengjun wrote:
>
> I use my compaction test case to test it, 1/10 ratio can reproduce 100ms
> sleep.
>
>  60) @ 103942.6 us |      shrink_node();
>
>  60) @ 103795.8 us |      shrink_node();

Thanks for your test.

In a bid to cut the number of 100 ms sleepers further down, add another place
for them to nap by flushing the lru cache before falling asleep, instead of
mulling over whether 50 ms or 10 ms is more adequate.

Alternatively, and simpler IMHO, take a 5ms nap one time until !tmi.

--- y/mm/vmscan.c
+++ x/mm/vmscan.c
@@ -118,6 +118,9 @@ struct scan_control {
 	/* The file pages on the current node are dangerously low */
 	unsigned int file_is_tiny:1;
 
+	unsigned int file_tmi:1; /* too many isolated */
+	unsigned int anon_tmi:1;
+
 	/* Allocation order */
 	s8 order;
 
@@ -2092,6 +2095,22 @@ static int current_may_throttle(void)
 		bdi_write_congested(current->backing_dev_info);
 }
 
+static void set_sc_tmi(struct scan_control *sc, bool file, int tmi)
+{
+	if (file)
+		sc->file_tmi = tmi;
+	else
+		sc->anon_tmi = tmi;
+}
+
+static bool is_sc_tmi(struct scan_control *sc, bool file)
+{
+	if (file)
+		return sc->file_tmi != 0;
+	else
+		return sc->anon_tmi != 0;
+}
+
 /*
  * shrink_inactive_list() is a helper for shrink_node().  It returns the number
  * of reclaimed pages
@@ -2109,11 +2128,23 @@ shrink_inactive_list(unsigned long nr_to
 	enum vm_event_item item;
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	bool stalled = false;
+	bool drained = false;
 
 	while (unlikely(too_many_isolated(pgdat, file, sc))) {
 		if (stalled)
 			return 0;
 
+		if (!is_sc_tmi(sc, file)) {
+			set_sc_tmi(sc, file, 1);
+			return 0;
+		}
+
+		if (!drained) {
+			drained = true;
+			lru_add_drain_all();
+			continue;
+		}
+
 		/* wait a bit for the reclaimer. */
 		msleep(100);
 		stalled = true;
@@ -2123,6 +2154,9 @@ shrink_inactive_list(unsigned long nr_to
 			return SWAP_CLUSTER_MAX;
 	}
 
+	if (is_sc_tmi(sc, file))
+		set_sc_tmi(sc, file, 0);
+
 	lru_add_drain();
 
 	spin_lock_irq(&lruvec->lru_lock);
Yu Zhao April 30, 2021, 8:34 a.m. UTC | #23
On Thu, Apr 29, 2021 at 4:00 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Wed 28-04-21 09:05:06, Yu Zhao wrote:
> > On Wed, Apr 28, 2021 at 5:55 AM Michal Hocko <mhocko@suse.com> wrote:
> [...]
> > > > @@ -3334,8 +3285,17 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
> > > >       set_task_reclaim_state(current, &sc.reclaim_state);
> > > >       trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
> > > >
> > > > +     nr_cpus = current_is_kswapd() ? 0 : num_online_cpus();
> > > > +     while (nr_cpus && !atomic_add_unless(&pgdat->nr_reclaimers, 1, nr_cpus)) {
> > > > +             if (schedule_timeout_killable(HZ / 10))
> > > > +                     return SWAP_CLUSTER_MAX;
> > > > +     }
> > > > +
> > > >       nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
> > > >
> > > > +     if (nr_cpus)
> > > > +             atomic_dec(&pgdat->nr_reclaimers);
> > > > +
> > > >       trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
> > > >       set_task_reclaim_state(current, NULL);
> > >
> > > This will surely break any memcg direct reclaim.
> >
> > Mind elaborating how it will "surely" break any memcg direct reclaim?
>
> I was wrong here. I though this is done in a common path for all direct
> reclaimers (likely mixed up try_to_free_pages with do_try_free_pages).
> Sorry about the confusion.
>
> Still, I do not think that the above heuristic will work properly.
> Different reclaimers have a different reclaim target (e.g. lower zones
> and/or numa node mask) and strength (e.g.  GFP_NOFS vs. GFP_KERNEL). A
> simple count based throttling would be be prone to different sorts of
> priority inversions.

I see where your concern is coming from. Let's look at it from
multiple angles, and hopefully this will clear things up.

1, looking into this approach:
This approach limits the number of direct reclaimers without any bias.
It doesn't favor or disfavor anybody. IOW, everyone has an equal
chance to run, regardless of the reclaim parameters. So where does the
inversion come from?

2, comparing it with the existing code:
Both try to limit direct reclaims,: one by the number of isolated
pages and the other by the number of concurrent direct reclaimers.
Neither numbers are correlated with any parameters you mentioned above
except the following:

too_many_isolated()
{
...
        /*
         * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
         * won't get blocked by normal direct-reclaimers, forming a circular
         * deadlock.
         */
        if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
                inactive >>= 3;
...
}

Let's look at the commit that added the above:

commit 3cf23841b4b7 ("mm/vmscan.c: avoid possible deadlock caused by
too_many_isolated()"):
Date:   Tue Dec 18 14:23:31 2012 -0800

    Neil found that if too_many_isolated() returns true while performing
    direct reclaim we can end up waiting for other threads to complete their
    direct reclaim.  If those threads are allowed to enter the FS or IO to
    free memory, but this thread is not, then it is possible that those
    threads will be waiting on this thread and so we get a circular deadlock.

    some task enters direct reclaim with GFP_KERNEL
      => too_many_isolated() false
        => vmscan and run into dirty pages
          => pageout()
            => take some FS lock
              => fs/block code does GFP_NOIO allocation
                => enter direct reclaim again
                  => too_many_isolated() true
                    => waiting for others to progress, however the other
                       tasks may be circular waiting for the FS lock..

Hmm, how could reclaim be recursive nowadays?

__alloc_pages_slowpath()
{
...

        /* Avoid recursion of direct reclaim */
        if (current->flags & PF_MEMALLOC)
                goto nopage;

        /* Try direct reclaim and then allocating */
        page = __alloc_pages_direct_reclaim()
...
}

Let's assume it still could, do you remember the following commit?

commit db73ee0d4637 ("mm, vmscan: do not loop on too_many_isolated for ever")
Date:   Wed Sep 6 16:21:11 2017 -0700

If too_many_isolated() no longer loops forever, how could the above
deadlock happen? IOW, why would we need the first commit nowadays?

If you don't remember the second commit, let me jog your memory:

Author: Michal Hocko <mhocko@suse.com>

3, thinking abstractly
A problem hard to solve in one domain can become a walk in the park in
another domain. This problem is a perfect example: it's difficult to
solve based on the number of isolated pages; but it becomes a lot
easier based on the number of direct reclaimers.

But there is a caveat: when we transform to a new domain, we need to
preserve the "reclaim target and strength" you mentioned. Fortunately,
there is nothing to preserve, because the existing code has none,
given that the "__GFP_IO | __GFP_FS" check in too_many_isolated() is
obsolete.

Does it make sense?
Michal Hocko April 30, 2021, 9:17 a.m. UTC | #24
On Fri 30-04-21 02:34:28, Yu Zhao wrote:
> On Thu, Apr 29, 2021 at 4:00 AM Michal Hocko <mhocko@suse.com> wrote:
[...]
> > Still, I do not think that the above heuristic will work properly.
> > Different reclaimers have a different reclaim target (e.g. lower zones
> > and/or numa node mask) and strength (e.g.  GFP_NOFS vs. GFP_KERNEL). A
> > simple count based throttling would be be prone to different sorts of
> > priority inversions.
> 
> I see where your concern is coming from. Let's look at it from
> multiple angles, and hopefully this will clear things up.
> 
> 1, looking into this approach:
> This approach limits the number of direct reclaimers without any bias.
> It doesn't favor or disfavor anybody. IOW, everyone has an equal
> chance to run, regardless of the reclaim parameters. So where does the
> inversion come from?

Say you have a flood of GFP_KERNEL allocations contending with *MOVABLE
allocations. The former will not be able to reclaim for any non-kernel
zones. Similar effect would be contention of a heavy GFP_NOFS workload
condending with others but not being able to release filesystem
metadata.

> 2, comparing it with the existing code:
> Both try to limit direct reclaims,: one by the number of isolated
> pages and the other by the number of concurrent direct reclaimers.
> Neither numbers are correlated with any parameters you mentioned above
> except the following:
> 
> too_many_isolated()
> {
> ...
>         /*
>          * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
>          * won't get blocked by normal direct-reclaimers, forming a circular
>          * deadlock.
>          */
>         if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
>                 inactive >>= 3;
> ...
> }
> 
> Let's at the commit that added the above:
> 
> commit 3cf23841b4b7 ("mm/vmscan.c: avoid possible deadlock caused by
> too_many_isolated()"):
> Date:   Tue Dec 18 14:23:31 2012 -0800
> 
>     Neil found that if too_many_isolated() returns true while performing
>     direct reclaim we can end up waiting for other threads to complete their
>     direct reclaim.  If those threads are allowed to enter the FS or IO to
>     free memory, but this thread is not, then it is possible that those
>     threads will be waiting on this thread and so we get a circular deadlock.
> 
>     some task enters direct reclaim with GFP_KERNEL
>       => too_many_isolated() false
>         => vmscan and run into dirty pages
>           => pageout()
>             => take some FS lock
>               => fs/block code does GFP_NOIO allocation
>                 => enter direct reclaim again
>                   => too_many_isolated() true
>                     => waiting for others to progress, however the other
>                        tasks may be circular waiting for the FS lock..
> 
> Hmm, how could reclaim be recursive nowadays?

I do not think it is. And I doubt it was back then and I also think the
above is not really suggesting a recursion. It tries to avoid a situation
where the fs/block layer cannot make forward progress because it is being
blocked.
 
> __alloc_pages_slowpath()
> {
> ...
> 
>         /* Avoid recursion of direct reclaim */
>         if (current->flags & PF_MEMALLOC)
>                 goto nopage;
> 
>         /* Try direct reclaim and then allocating */
>         page = __alloc_pages_direct_reclaim()
> ...
> }
> 
> Let's assume it still could, do you remember the following commit?
> 
> commit db73ee0d4637 ("mm, vmscan: do not loop on too_many_isolated for ever")
> Date:   Wed Sep 6 16:21:11 2017 -0700
> 
> If too_many_isolated() does loop forever anymore, how could the above
> deadlock happen? IOW, why would we need the first commit nowadays?

Whether the Neil's commit is still needed would require a deeper
analysis. Even back then we didn't perform pageout for fs dirty pages
from the direct reclaim IIRC.

> If you don't remember the second commit, let me jog your memory:

Yes I do remember that one and that was handling a dependency between
kswapd (which is allowed to perform pageout on dirty fs data) which
is blocked and it prevents direct reclaimers to make a fwd progress e.g.
by declaring OOM. This was mostly a band aid rather than a systematic
solution. And it clearly shows limits of the existing approach. Please
note that I am not trying to defend what we have now. I am just pointing
out that strict count based approach will hit other problems.

> Author: Michal Hocko <mhocko@suse.com>
> 
> 3, thinking abstractly
> A problem hard to solve in one domain can become a walk in the park in
> another domain. This problem is a perfect example: it's difficult to
> solve based on the number of isolated pages; but it becomes a lot
> easier based on the number of direct reclaimers.

This would be really true if all those reclaimers were equal in their
capabilities. But they are not due to reclaim constrains if nothing
else.

IMHO the best way forward would be removing the throttling from the
reclaim path altogether. The reclaim should be only throttled by the
work it does. Both allocator and memcg charging path implement some sort
of retry logic and I believe this would be much better suited to
implement any backoff.
Yu Zhao April 30, 2021, 5:04 p.m. UTC | #25
On Fri, Apr 30, 2021 at 3:17 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Fri 30-04-21 02:34:28, Yu Zhao wrote:
> > On Thu, Apr 29, 2021 at 4:00 AM Michal Hocko <mhocko@suse.com> wrote:
> [...]
> > > Still, I do not think that the above heuristic will work properly.
> > > Different reclaimers have a different reclaim target (e.g. lower zones
> > > and/or numa node mask) and strength (e.g.  GFP_NOFS vs. GFP_KERNEL). A
> > > simple count based throttling would be be prone to different sorts of
> > > priority inversions.
> >
> > I see where your concern is coming from. Let's look at it from
> > multiple angles, and hopefully this will clear things up.
> >
> > 1, looking into this approach:
> > This approach limits the number of direct reclaimers without any bias.
> > It doesn't favor or disfavor anybody. IOW, everyone has an equal
> > chance to run, regardless of the reclaim parameters. So where does the
> > inversion come from?
>
> Say you have a flood of GFP_KERNEL allocations contending with *MOVABLE
> allocations. The former will not be able to reclaim for any non-kernel
> zones. Similar effect would be contention of a heavy GFP_NOFS workload
> condending with others but not being able to release filesystem
> metadata.
>
> > 2, comparing it with the existing code:
> > Both try to limit direct reclaims,: one by the number of isolated
> > pages and the other by the number of concurrent direct reclaimers.
> > Neither numbers are correlated with any parameters you mentioned above
> > except the following:
> >
> > too_many_isolated()
> > {
> > ...
> >         /*
> >          * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
> >          * won't get blocked by normal direct-reclaimers, forming a circular
> >          * deadlock.
> >          */
> >         if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
> >                 inactive >>= 3;
> > ...
> > }
> >
> > Let's at the commit that added the above:
> >
> > commit 3cf23841b4b7 ("mm/vmscan.c: avoid possible deadlock caused by
> > too_many_isolated()"):
> > Date:   Tue Dec 18 14:23:31 2012 -0800
> >
> >     Neil found that if too_many_isolated() returns true while performing
> >     direct reclaim we can end up waiting for other threads to complete their
> >     direct reclaim.  If those threads are allowed to enter the FS or IO to
> >     free memory, but this thread is not, then it is possible that those
> >     threads will be waiting on this thread and so we get a circular deadlock.
> >
> >     some task enters direct reclaim with GFP_KERNEL
> >       => too_many_isolated() false
> >         => vmscan and run into dirty pages
> >           => pageout()
> >             => take some FS lock
> >               => fs/block code does GFP_NOIO allocation
> >                 => enter direct reclaim again
> >                   => too_many_isolated() true
> >                     => waiting for others to progress, however the other
> >                        tasks may be circular waiting for the FS lock..
> >
> > Hmm, how could reclaim be recursive nowadays?
>
> I do not think it is. And I doubt it was back then and I also think the
> above is not suggesting a recursion really. It tries to avoid a situation
> when fs/block layer cannot make a fwd progress because it is being
> blocked.
>
> > __alloc_pages_slowpath()
> > {
> > ...
> >
> >         /* Avoid recursion of direct reclaim */
> >         if (current->flags & PF_MEMALLOC)
> >                 goto nopage;
> >
> >         /* Try direct reclaim and then allocating */
> >         page = __alloc_pages_direct_reclaim()
> > ...
> > }
> >
> > Let's assume it still could, do you remember the following commit?
> >
> > commit db73ee0d4637 ("mm, vmscan: do not loop on too_many_isolated for ever")
> > Date:   Wed Sep 6 16:21:11 2017 -0700
> >
> > If too_many_isolated() does not loop forever anymore, how could the above
> > deadlock happen? IOW, why would we need the first commit nowadays?
>
> Whether the Neil's commit is still needed would require a deeper
> analysis. Even back then we didn't perform pageout for fs dirty pages
> from the direct reclaim IIRC.
>
> > If you don't remember the second commit, let me jog your memory:
>
> Yes I do remember that one and that was handling a dependency between
> kswapd (which is allowed to perform pageout on dirty fs data) which
> is blocked and it prevents direct reclaimers to make a fwd progress e.g.
> by declaring OOM. This was mostly a band aid rather than a systematic
> solution. And it clearly shows limits of the existing approach. Please
> note that I am not trying to defend what we have now. I am just pointing
> out that strict count based approach will hit other problems.
>
> > Author: Michal Hocko <mhocko@suse.com>
> >
> > 3, thinking abstractly
> > A problem hard to solve in one domain can become a walk in the park in
> > another domain. This problem is a perfect example: it's difficult to
> > solve based on the number of isolated pages; but it becomes a lot
> > easier based on the number of direct reclaimers.
>
> This would be really true if all those reclaimers were equal in their
> capabilities. But they are not due to reclaim constrains if nothing
> else.

Thanks for the clarification above.

> IMHO the best way forward would be removing the throttling from the
> reclaim path altogether. The reclaim should be only throttled by the
> work it does. Both allocator and memcg charging path implement some sort
> of retry logic and I believe this would be much better suited to
> implement any backoff.

I completely agree with this.
Xing Zhengjun May 10, 2021, 8:03 a.m. UTC | #26
Hi Hillf,

On 4/30/2021 2:43 PM, Hillf Danton wrote:
> On Fri, 30 Apr 2021 13:33:57 +0800 Xing Zhengjun wrote:
>>
>> I use my compaction test case to test it, 1/10 ratio can reproduce 100ms
>> sleep.
>>
>>   60) @ 103942.6 us |      shrink_node();
>>
>>   60) @ 103795.8 us |      shrink_node();
> 
> Thanks for your test.
> 
> In a bid to cut the number of 100ms sleepers further down, add another place
> for them to nap by flushing lru cache before falling in sleep, instead of
> mulling why 50ms or 10ms is more adequate.
> 
> Alternatively, and simpler IMHO, take a 5ms nap one time until !tmi.
> 
> --- y/mm/vmscan.c
> +++ x/mm/vmscan.c
> @@ -118,6 +118,9 @@ struct scan_control {
>   	/* The file pages on the current node are dangerously low */
>   	unsigned int file_is_tiny:1;
>   
> +	unsigned int file_tmi:1; /* too many isolated */
> +	unsigned int anon_tmi:1;
> +
>   	/* Allocation order */
>   	s8 order;
>   
> @@ -2092,6 +2095,22 @@ static int current_may_throttle(void)
>   		bdi_write_congested(current->backing_dev_info);
>   }
>   
> +static void set_sc_tmi(struct scan_control *sc, bool file, int tmi)
> +{
> +	if (file)
> +		sc->file_tmi = tmi;
> +	else
> +		sc->anon_tmi = tmi;
> +}
> +
> +static bool is_sc_tmi(struct scan_control *sc, bool file)
> +{
> +	if (file)
> +		return sc->file_tmi != 0;
> +	else
> +		return sc->anon_tmi != 0;
> +}
> +
>   /*
>    * shrink_inactive_list() is a helper for shrink_node().  It returns the number
>    * of reclaimed pages
> @@ -2109,11 +2128,23 @@ shrink_inactive_list(unsigned long nr_to
>   	enum vm_event_item item;
>   	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>   	bool stalled = false;
> +	bool drained = false;
>   
>   	while (unlikely(too_many_isolated(pgdat, file, sc))) {
>   		if (stalled)
>   			return 0;
>   
> +		if (!is_sc_tmi(sc, file)) {
> +			set_sc_tmi(sc, file, 1);
> +			return 0;
> +		}
> +
> +		if (!drained) {
> +			drained = true;
> +			lru_add_drain_all();
> +			continue;
> +		}
> +
>   		/* wait a bit for the reclaimer. */
>   		msleep(100);
>   		stalled = true;
> @@ -2123,6 +2154,9 @@ shrink_inactive_list(unsigned long nr_to
>   			return SWAP_CLUSTER_MAX;
>   	}
>   
> +	if (is_sc_tmi(sc, file))
> +		set_sc_tmi(sc, file, 0);
> +
>   	lru_add_drain();
>   
>   	spin_lock_irq(&lruvec->lru_lock);
> 

I tried the patch, it still can reproduce the 100ms sleep.

52) @ 103829.8 us |      shrink_lruvec();
Hillf Danton May 10, 2021, 9:46 a.m. UTC | #27
On Mon, 10 May 2021 16:03:06  Xing Zhengjun wrote:
>On 4/30/2021 2:43 PM, Hillf Danton wrote:
>> On Fri, 30 Apr 2021 13:33:57 +0800 Xing Zhengjun wrote:
>>>
>>> I use my compaction test case to test it, 1/10 ratio can reproduce 100ms
>>> sleep.
>>>
>>>   60) @ 103942.6 us |      shrink_node();
>>>
>>>   60) @ 103795.8 us |      shrink_node();
>> 
>> Thanks for your test.
>> 
>> In a bid to cut the number of 100ms sleepers further down, add another place
>> for them to nap by flushing lru cache before falling in sleep, instead of
>> mulling why 50ms or 10ms is more adequate.
>> 
>> Alternatively, and simpler IMHO, take a 5ms nap one time until !tmi.
>> 
>> --- y/mm/vmscan.c
>> +++ x/mm/vmscan.c
>> @@ -118,6 +118,9 @@ struct scan_control {
>>   	/* The file pages on the current node are dangerously low */
>>   	unsigned int file_is_tiny:1;
>>   
>> +	unsigned int file_tmi:1; /* too many isolated */
>> +	unsigned int anon_tmi:1;
>> +
>>   	/* Allocation order */
>>   	s8 order;
>>   
>> @@ -2092,6 +2095,22 @@ static int current_may_throttle(void)
>>   		bdi_write_congested(current->backing_dev_info);
>>   }
>>   
>> +static void set_sc_tmi(struct scan_control *sc, bool file, int tmi)
>> +{
>> +	if (file)
>> +		sc->file_tmi = tmi;
>> +	else
>> +		sc->anon_tmi = tmi;
>> +}
>> +
>> +static bool is_sc_tmi(struct scan_control *sc, bool file)
>> +{
>> +	if (file)
>> +		return sc->file_tmi != 0;
>> +	else
>> +		return sc->anon_tmi != 0;
>> +}
>> +
>>   /*
>>    * shrink_inactive_list() is a helper for shrink_node().  It returns the number
>>    * of reclaimed pages
>> @@ -2109,11 +2128,23 @@ shrink_inactive_list(unsigned long nr_to
>>   	enum vm_event_item item;
>>   	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>>   	bool stalled = false;
>> +	bool drained = false;
>>   
>>   	while (unlikely(too_many_isolated(pgdat, file, sc))) {
>>   		if (stalled)
>>   			return 0;
>>   
>> +		if (!is_sc_tmi(sc, file)) {
>> +			set_sc_tmi(sc, file, 1);
>> +			return 0;
>> +		}
>> +
>> +		if (!drained) {
>> +			drained = true;
>> +			lru_add_drain_all();
>> +			continue;
>> +		}
>> +
>>   		/* wait a bit for the reclaimer. */
>>   		msleep(100);
>>   		stalled = true;
>> @@ -2123,6 +2154,9 @@ shrink_inactive_list(unsigned long nr_to
>>   			return SWAP_CLUSTER_MAX;
>>   	}
>>   
>> +	if (is_sc_tmi(sc, file))
>> +		set_sc_tmi(sc, file, 0);
>> +
>>   	lru_add_drain();
>>   
>>   	spin_lock_irq(&lruvec->lru_lock);
>> 
>
>I tried the patch, it still can reproduce the 100ms sleep.
>
>52) @ 103829.8 us |      shrink_lruvec();

Thanks for you data.

What we learn from it is that a 5ms nap at a time, and no longer than 100ms in
total, is an acceptable option.

Hillf
diff mbox series

Patch

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 562e87cbd7a1..a1926463455c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -168,6 +168,7 @@  struct scan_control {
  * From 0 .. 200.  Higher means more swappy.
  */
 int vm_swappiness = 60;
+int lru_list_threshold = SWAP_CLUSTER_MAX << 3;
 
 static void set_task_reclaim_state(struct task_struct *task,
 				   struct reclaim_state *rs)
@@ -1785,7 +1786,7 @@  int isolate_lru_page(struct page *page)
 static int too_many_isolated(struct pglist_data *pgdat, int file,
 		struct scan_control *sc)
 {
-	unsigned long inactive, isolated;
+	unsigned long inactive, isolated, active, nr_lru_pages;
 
 	if (current_is_kswapd())
 		return 0;
@@ -1796,11 +1797,13 @@  static int too_many_isolated(struct pglist_data *pgdat, int file,
 	if (file) {
 		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
 		isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
+		active = node_page_state(pgdat, NR_ACTIVE_FILE);
 	} else {
 		inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
 		isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
+		active = node_page_state(pgdat, NR_ACTIVE_ANON);
 	}
-
+	nr_lru_pages = inactive + active;
 	/*
 	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
 	 * won't get blocked by normal direct-reclaimers, forming a circular
@@ -1809,6 +1812,10 @@  static int too_many_isolated(struct pglist_data *pgdat, int file,
 	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
 		inactive >>= 3;
 
+	if (isolated > inactive)
+		if (nr_lru_pages < lru_list_threshold)
+			return 0;
+
 	return isolated > inactive;
 }