
[v2,RESEND,2/2] mm: ignore memory.min of abandoned memory cgroups

Message ID 20180502154710.18737-2-guro@fb.com (mailing list archive)
State New, archived

Commit Message

Roman Gushchin May 2, 2018, 3:47 p.m. UTC
If a cgroup has no associated tasks, invoking the OOM killer
on it cannot release any memory, so respecting its memory.min
can lead to an infinite OOM loop or a system stall.

Let's ignore memory.min of unpopulated cgroups.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
---
 mm/vmscan.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
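
For context, memory.min is the cgroup v2 hard-protection knob introduced
by this series: memory charged to a protected cgroup is skipped by
reclaim. A minimal userspace sketch of setting it, assuming cgroup v2 is
mounted at /sys/fs/cgroup and a child group "app" exists (the path and
the 64 MiB value are illustrative assumptions, not part of the patch):

    /* Hedged illustration only: set memory.min on an assumed cgroup. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            const char *path = "/sys/fs/cgroup/app/memory.min";
            int fd = open(path, O_WRONLY);

            if (fd < 0) {
                    perror("open memory.min");
                    return 1;
            }
            /* Protect up to 64 MiB of this group's memory from reclaim. */
            if (write(fd, "67108864", 8) != 8)
                    perror("write memory.min");
            close(fd);
            return 0;
    }

If every task in "app" exits while its memory stays charged, the group
becomes exactly the abandoned case this patch targets.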

Comments

kernel test robot May 2, 2018, 11:45 p.m. UTC | #1
Hi Roman,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on mmotm/master]
[also build test ERROR on next-20180502]
[cannot apply to v4.17-rc3]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Roman-Gushchin/mm-introduce-memory-min/20180503-064145
base:   git://git.cmpxchg.org/linux-mmotm.git master
config: x86_64-randconfig-x006-201817 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   mm/vmscan.c: In function 'shrink_node':
>> mm/vmscan.c:2555:34: error: dereferencing pointer to incomplete type 'struct mem_cgroup'
        if (cgroup_is_populated(memcg->css.cgroup))
                                     ^~

vim +2555 mm/vmscan.c

  2520	
  2521	static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
  2522	{
  2523		struct reclaim_state *reclaim_state = current->reclaim_state;
  2524		unsigned long nr_reclaimed, nr_scanned;
  2525		bool reclaimable = false;
  2526	
  2527		do {
  2528			struct mem_cgroup *root = sc->target_mem_cgroup;
  2529			struct mem_cgroup_reclaim_cookie reclaim = {
  2530				.pgdat = pgdat,
  2531				.priority = sc->priority,
  2532			};
  2533			unsigned long node_lru_pages = 0;
  2534			struct mem_cgroup *memcg;
  2535	
  2536			memset(&sc->nr, 0, sizeof(sc->nr));
  2537	
  2538			nr_reclaimed = sc->nr_reclaimed;
  2539			nr_scanned = sc->nr_scanned;
  2540	
  2541			memcg = mem_cgroup_iter(root, NULL, &reclaim);
  2542			do {
  2543				unsigned long lru_pages;
  2544				unsigned long reclaimed;
  2545				unsigned long scanned;
  2546	
  2547				switch (mem_cgroup_protected(root, memcg)) {
  2548				case MEMCG_PROT_MIN:
  2549					/*
  2550					 * Hard protection.
  2551					 * If there is no reclaimable memory, OOM.
  2552					 * Abandoned cgroups are loosing protection,
  2553					 * because OOM killer won't release any memory.
  2554					 */
> 2555					if (cgroup_is_populated(memcg->css.cgroup))
  2556						continue;
  2557				case MEMCG_PROT_LOW:
  2558					/*
  2559					 * Soft protection.
  2560					 * Respect the protection only as long as
  2561					 * there is an unprotected supply
  2562					 * of reclaimable memory from other cgroups.
  2563					 */
  2564					if (!sc->memcg_low_reclaim) {
  2565						sc->memcg_low_skipped = 1;
  2566						continue;
  2567					}
  2568					memcg_memory_event(memcg, MEMCG_LOW);
  2569					break;
  2570				case MEMCG_PROT_NONE:
  2571					break;
  2572				}
  2573	
  2574				reclaimed = sc->nr_reclaimed;
  2575				scanned = sc->nr_scanned;
  2576				shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
  2577				node_lru_pages += lru_pages;
  2578	
  2579				if (memcg)
  2580					shrink_slab(sc->gfp_mask, pgdat->node_id,
  2581						    memcg, sc->priority);
  2582	
  2583				/* Record the group's reclaim efficiency */
  2584				vmpressure(sc->gfp_mask, memcg, false,
  2585					   sc->nr_scanned - scanned,
  2586					   sc->nr_reclaimed - reclaimed);
  2587	
  2588				/*
  2589				 * Direct reclaim and kswapd have to scan all memory
  2590				 * cgroups to fulfill the overall scan target for the
  2591				 * node.
  2592				 *
  2593				 * Limit reclaim, on the other hand, only cares about
  2594				 * nr_to_reclaim pages to be reclaimed and it will
  2595				 * retry with decreasing priority if one round over the
  2596				 * whole hierarchy is not sufficient.
  2597				 */
  2598				if (!global_reclaim(sc) &&
  2599						sc->nr_reclaimed >= sc->nr_to_reclaim) {
  2600					mem_cgroup_iter_break(root, memcg);
  2601					break;
  2602				}
  2603			} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
  2604	
  2605			if (global_reclaim(sc))
  2606				shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
  2607					    sc->priority);
  2608	
  2609			if (reclaim_state) {
  2610				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
  2611				reclaim_state->reclaimed_slab = 0;
  2612			}
  2613	
  2614			/* Record the subtree's reclaim efficiency */
  2615			vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
  2616				   sc->nr_scanned - nr_scanned,
  2617				   sc->nr_reclaimed - nr_reclaimed);
  2618	
  2619			if (sc->nr_reclaimed - nr_reclaimed)
  2620				reclaimable = true;
  2621	
  2622			if (current_is_kswapd()) {
  2623				/*
  2624				 * If reclaim is isolating dirty pages under writeback,
  2625				 * it implies that the long-lived page allocation rate
  2626				 * is exceeding the page laundering rate. Either the
  2627				 * global limits are not being effective at throttling
  2628				 * processes due to the page distribution throughout
  2629				 * zones or there is heavy usage of a slow backing
  2630				 * device. The only option is to throttle from reclaim
  2631				 * context which is not ideal as there is no guarantee
  2632				 * the dirtying process is throttled in the same way
  2633				 * balance_dirty_pages() manages.
  2634				 *
  2635				 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
  2636				 * count the number of pages under pages flagged for
  2637				 * immediate reclaim and stall if any are encountered
  2638				 * in the nr_immediate check below.
  2639				 */
  2640				if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
  2641					set_bit(PGDAT_WRITEBACK, &pgdat->flags);
  2642	
  2643				/*
  2644				 * Tag a node as congested if all the dirty pages
  2645				 * scanned were backed by a congested BDI and
  2646				 * wait_iff_congested will stall.
  2647				 */
  2648				if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
  2649					set_bit(PGDAT_CONGESTED, &pgdat->flags);
  2650	
  2651				/* Allow kswapd to start writing pages during reclaim.*/
  2652				if (sc->nr.unqueued_dirty == sc->nr.file_taken)
  2653					set_bit(PGDAT_DIRTY, &pgdat->flags);
  2654	
  2655				/*
  2656				 * If kswapd scans pages marked marked for immediate
  2657				 * reclaim and under writeback (nr_immediate), it
  2658				 * implies that pages are cycling through the LRU
  2659				 * faster than they are written so also forcibly stall.
  2660				 */
  2661				if (sc->nr.immediate)
  2662					congestion_wait(BLK_RW_ASYNC, HZ/10);
  2663			}
  2664	
  2665			/*
  2666			 * Legacy memcg will stall in page writeback so avoid forcibly
  2667			 * stalling in wait_iff_congested().
  2668			 */
  2669			if (!global_reclaim(sc) && sane_reclaim(sc) &&
  2670			    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
  2671				set_memcg_congestion(pgdat, root, true);
  2672	
  2673			/*
  2674			 * Stall direct reclaim for IO completions if underlying BDIs
  2675			 * and node is congested. Allow kswapd to continue until it
  2676			 * starts encountering unqueued dirty pages or cycling through
  2677			 * the LRU too quickly.
  2678			 */
  2679			if (!sc->hibernation_mode && !current_is_kswapd() &&
  2680			   current_may_throttle() && pgdat_memcg_congested(pgdat, root))
  2681				wait_iff_congested(BLK_RW_ASYNC, HZ/10);
  2682	
  2683		} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
  2684						 sc->nr_scanned - nr_scanned, sc));
  2685	
  2686		/*
  2687		 * Kswapd gives up on balancing particular nodes after too
  2688		 * many failures to reclaim anything from them and goes to
  2689		 * sleep. On reclaim progress, reset the failure counter. A
  2690		 * successful direct reclaim run will revive a dormant kswapd.
  2691		 */
  2692		if (reclaimable)
  2693			pgdat->kswapd_failures = 0;
  2694	
  2695		return reclaimable;
  2696	}
  2697	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
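
The error above is a !CONFIG_MEMCG build: mm/vmscan.c now dereferences
struct mem_cgroup, whose full definition is only visible with
CONFIG_MEMCG=y. A minimal sketch of one possible fix, assuming a new
guarded helper in include/linux/memcontrol.h (the name
mem_cgroup_is_populated() is an assumption for illustration, not
necessarily the fix that landed in v3):

    /*
     * Hypothetical helper: keeps the struct mem_cgroup dereference and
     * the cgroup_is_populated() call behind CONFIG_MEMCG so that
     * !CONFIG_MEMCG builds of mm/vmscan.c still compile.
     */
    #ifdef CONFIG_MEMCG
    static inline bool mem_cgroup_is_populated(struct mem_cgroup *memcg)
    {
            return cgroup_is_populated(memcg->css.cgroup);
    }
    #else
    static inline bool mem_cgroup_is_populated(struct mem_cgroup *memcg)
    {
            return false;
    }
    #endif

The MEMCG_PROT_MIN branch in shrink_node() would then test
mem_cgroup_is_populated(memcg) instead of reaching into
memcg->css.cgroup directly.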
kernel test robot May 3, 2018, 1:08 a.m. UTC | #2
Hi Roman,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on mmotm/master]
[also build test ERROR on next-20180502]
[cannot apply to v4.17-rc3]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Roman-Gushchin/mm-introduce-memory-min/20180503-064145
base:   git://git.cmpxchg.org/linux-mmotm.git master
config: i386-tinyconfig (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   mm/vmscan.c: In function 'shrink_node':
>> mm/vmscan.c:2555:9: error: implicit declaration of function 'cgroup_is_populated'; did you mean 'cgroup_bpf_put'? [-Werror=implicit-function-declaration]
        if (cgroup_is_populated(memcg->css.cgroup))
            ^~~~~~~~~~~~~~~~~~~
            cgroup_bpf_put
   mm/vmscan.c:2555:34: error: dereferencing pointer to incomplete type 'struct mem_cgroup'
        if (cgroup_is_populated(memcg->css.cgroup))
                                     ^~
   cc1: some warnings being treated as errors
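
Both diagnostics here share one root cause: i386-tinyconfig appears to
build without cgroup support at all, so cgroup_is_populated() is not
even declared and struct mem_cgroup stays an incomplete type. A
CONFIG_MEMCG-guarded helper like the sketch after the first report
would cover this configuration as well.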

vim +2555 mm/vmscan.c

[The quoted shrink_node() context is identical, line for line, to the listing in the first report above and is elided here.]

Matthew Wilcox May 3, 2018, 2:31 a.m. UTC | #3
On Wed, May 02, 2018 at 04:47:10PM +0100, Roman Gushchin wrote:
> +				 * Abandoned cgroups are loosing protection,

"losing".
Roman Gushchin May 3, 2018, 11:45 a.m. UTC | #4
On Wed, May 02, 2018 at 07:31:43PM -0700, Matthew Wilcox wrote:
> On Wed, May 02, 2018 at 04:47:10PM +0100, Roman Gushchin wrote:
> > +				 * Abandoned cgroups are loosing protection,
> 
> "losing".
> 

Fixed in v3.

Thanks!

Patch

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 50055d72f294..709237feddc1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2549,8 +2549,11 @@  static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 				/*
 				 * Hard protection.
 				 * If there is no reclaimable memory, OOM.
+				 * Abandoned cgroups are loosing protection,
+				 * because OOM killer won't release any memory.
 				 */
-				continue;
+				if (cgroup_is_populated(memcg->css.cgroup))
+					continue;
 			case MEMCG_PROT_LOW:
 				/*
 				 * Soft protection.
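
Note that the MEMCG_PROT_MIN case as patched has no break: when the
cgroup is unpopulated, control falls through into the MEMCG_PROT_LOW
handling quoted above, so an abandoned cgroup's hard protection is
effectively demoted to soft protection rather than being honored
unconditionally.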