[RFC,22/26] mm: page_alloc: manage free memory in whole pageblocks

Message ID 20230418191313.268131-23-hannes@cmpxchg.org (mailing list archive)
State New
Series mm: reliable huge page allocator

Commit Message

Johannes Weiner April 18, 2023, 7:13 p.m. UTC
Right now, allocation requests only reclaim (and compact) for their
exact order. Since the majority of allocation requests are smaller
than a pageblock, this is likely to result in partial blocks being
freed, which are subsequently fragmented by fallbacks. This defeats
the allocator's efforts to group pageblocks by mobility.

Fix this mismatch between the allocator and reclaim/compaction: make
the pageblock the default unit for free memory by enforcing watermarks
against MIGRATE_FREE blocks, and have reclaim/compaction produce them.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/compaction.h |   1 -
 mm/compaction.c            |  65 ++++---------
 mm/internal.h              |   1 +
 mm/page_alloc.c            | 183 ++++++++++++++++++++++---------------
 mm/vmscan.c                |   6 +-
 5 files changed, 131 insertions(+), 125 deletions(-)
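
Editor's note: to make the accounting change in the commit message concrete, here is a minimal userspace model (not part of the patch) of the new watermark check. The struct and counter names are simplified stand-ins for the kernel's per-zone NR_FREE_* vmstat items, and the flag values are invented for illustration; the real __zone_watermark_ok() also applies lowmem reserves and per-order checks.

```c
/*
 * Illustrative model: watermarks are enforced against whole MIGRATE_FREE
 * blocks plus whatever reserves the request is entitled to, rather than
 * against all free pages. Counter and flag names are stand-ins only.
 */
#include <stdbool.h>
#include <stdio.h>

#define ALLOC_HARDER  0x1
#define ALLOC_OOM     0x2
#define ALLOC_CMA     0x4

struct zone_model {
	long free_free;        /* pages sitting in whole MIGRATE_FREE blocks */
	long free_highatomic;  /* highatomic reserve pages */
	long free_cma;         /* free pages in CMA blocks */
};

/* Count only the free pages this request may legitimately draw on. */
static long zone_free_pages_model(const struct zone_model *z, int alloc_flags)
{
	long free = z->free_free;

	if (alloc_flags & (ALLOC_HARDER | ALLOC_OOM))
		free += z->free_highatomic;
	if (alloc_flags & ALLOC_CMA)
		free += z->free_cma;
	return free;
}

/* Check the mark against the counted pages, minus what the request consumes. */
static bool watermark_ok_model(const struct zone_model *z, unsigned int order,
			       long mark, int alloc_flags)
{
	long free = zone_free_pages_model(z, alloc_flags) - ((1L << order) - 1);

	return free > mark;
}

int main(void)
{
	struct zone_model z = { .free_free = 2048, .free_highatomic = 512, .free_cma = 0 };

	/* A plain order-0 request sees only the whole-block pages. */
	printf("order-0, plain:  %d\n", watermark_ok_model(&z, 0, 1024, 0));
	/* A hardened request may additionally dip into the highatomic reserve. */
	printf("order-9, harder: %d\n", watermark_ok_model(&z, 9, 1024, ALLOC_HARDER));
	return 0;
}
```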
Patch

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 9e1b2c56df62..52b2487ef901 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -10,7 +10,6 @@  enum compact_priority {
 	COMPACT_PRIO_SYNC_FULL,
 	MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_FULL,
 	COMPACT_PRIO_SYNC_LIGHT,
-	MIN_COMPACT_COSTLY_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
 	DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
 	COMPACT_PRIO_ASYNC,
 	INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC
diff --git a/mm/compaction.c b/mm/compaction.c
index 8080c04e644a..e33c99eb34a8 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1784,15 +1784,6 @@  static unsigned long fast_find_migrateblock(struct compact_control *cc)
 	if (cc->order <= PAGE_ALLOC_COSTLY_ORDER)
 		return pfn;
 
-	/*
-	 * Only allow kcompactd and direct requests for movable pages to
-	 * quickly clear out a MOVABLE pageblock for allocation. This
-	 * reduces the risk that a large movable pageblock is freed for
-	 * an unmovable/reclaimable small allocation.
-	 */
-	if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE)
-		return pfn;
-
 	/*
 	 * When starting the migration scanner, pick any pageblock within the
 	 * first half of the search space. Otherwise try and pick a pageblock
@@ -2065,8 +2056,7 @@  static bool should_proactive_compact_node(pg_data_t *pgdat)
 
 static enum compact_result __compact_finished(struct compact_control *cc)
 {
-	unsigned int order;
-	const int migratetype = cc->migratetype;
+	unsigned long mark;
 	int ret;
 
 	/* Compaction run completes if the migrate and free scanner meet */
@@ -2120,39 +2110,16 @@  static enum compact_result __compact_finished(struct compact_control *cc)
 	if (!pageblock_aligned(cc->migrate_pfn))
 		return COMPACT_CONTINUE;
 
-	/* Direct compactor: Is a suitable page free? */
+	/* Done when watermarks are restored */
 	ret = COMPACT_NO_SUITABLE_PAGE;
-	for (order = cc->order; order < MAX_ORDER; order++) {
-		struct free_area *area = &cc->zone->free_area[order];
-		bool can_steal;
-
-		/* Job done if page is free of the right migratetype */
-		if (!free_area_empty(area, migratetype))
-			return COMPACT_SUCCESS;
-
-#ifdef CONFIG_CMA
-		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
-		if (migratetype == MIGRATE_MOVABLE &&
-			!free_area_empty(area, MIGRATE_CMA))
-			return COMPACT_SUCCESS;
-#endif
-		/*
-		 * Job done if allocation would steal freepages from
-		 * other migratetype buddy lists.
-		 */
-		if (find_suitable_fallback(area, order, migratetype,
-						true, &can_steal) != -1)
-			/*
-			 * Movable pages are OK in any pageblock. If we are
-			 * stealing for a non-movable allocation, make sure
-			 * we finish compacting the current pageblock first
-			 * (which is assured by the above migrate_pfn align
-			 * check) so it is as free as possible and we won't
-			 * have to steal another one soon.
-			 */
-			return COMPACT_SUCCESS;
-	}
-
+	if (cc->direct_compaction)
+		mark = wmark_pages(cc->zone,
+				   cc->alloc_flags & ALLOC_WMARK_MASK);
+	else
+		mark = high_wmark_pages(cc->zone);
+	if (zone_watermark_ok(cc->zone, cc->order, mark,
+			      cc->highest_zoneidx, cc->alloc_flags))
+		return COMPACT_SUCCESS;
 out:
 	if (cc->contended || fatal_signal_pending(current))
 		ret = COMPACT_CONTENDED;
@@ -2310,8 +2277,12 @@  compact_zone(struct compact_control *cc, struct capture_control *capc)
 		unsigned long watermark;
 
 		/* Allocation can already succeed, nothing to do */
-		watermark = wmark_pages(cc->zone,
-					cc->alloc_flags & ALLOC_WMARK_MASK);
+		if (cc->direct_compaction)
+			watermark = wmark_pages(cc->zone,
+						cc->alloc_flags &
+						ALLOC_WMARK_MASK);
+		else
+			watermark = high_wmark_pages(cc->zone);
 		if (zone_watermark_ok(cc->zone, cc->order, watermark,
 				      cc->highest_zoneidx, cc->alloc_flags))
 			return COMPACT_SUCCESS;
@@ -2800,7 +2771,7 @@  static bool kcompactd_node_suitable(pg_data_t *pgdat)
 
 		/* Allocation can succeed in any zone, done */
 		if (zone_watermark_ok(zone, pgdat->kcompactd_max_order,
-				      min_wmark_pages(zone),
+				      high_wmark_pages(zone),
 				      highest_zoneidx, 0))
 			return true;
 
@@ -2845,7 +2816,7 @@  static void kcompactd_do_work(pg_data_t *pgdat)
 
 		/* Allocation can already succeed, nothing to do */
 		if (zone_watermark_ok(zone, cc.order,
-				      min_wmark_pages(zone), zoneid, 0))
+				      high_wmark_pages(zone), zoneid, 0))
 			continue;
 
 		if (compaction_suitable(zone, cc.order,
diff --git a/mm/internal.h b/mm/internal.h
index 39f65a463631..5c76455f8042 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -432,6 +432,7 @@  struct compact_control {
  */
 struct capture_control {
 	struct compact_control *cc;
+	int order;
 	int migratetype;
 	struct page *page;
 };
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 18fa2bbba44b..6f0bfc226c36 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1075,7 +1075,7 @@  static inline bool
 compaction_capture(struct zone *zone, struct page *page, int order,
 		   int migratetype, struct capture_control *capc)
 {
-	if (!capc || order < capc->cc->order)
+	if (!capc || order < capc->order)
 		return false;
 
 	/* Do not accidentally pollute CMA or isolated regions*/
@@ -1097,8 +1097,8 @@  compaction_capture(struct zone *zone, struct page *page, int order,
 		return false;
 	}
 
-	if (order > capc->cc->order)
-		expand(zone, page, capc->cc->order, order, migratetype);
+	if (order > capc->order)
+		expand(zone, page, capc->order, order, migratetype);
 
 	capc->page = page;
 	return true;
@@ -3649,15 +3649,15 @@  int __isolate_free_page(struct page *page, unsigned int order)
 	int mt = get_pageblock_migratetype(page);
 
 	if (!is_migrate_isolate(mt)) {
+		long free_pages = zone_page_state(zone, NR_FREE_PAGES);
 		unsigned long watermark;
 		/*
-		 * Obey watermarks as if the page was being allocated. We can
-		 * emulate a high-order watermark check with a raised order-0
-		 * watermark, because we already know our high-order page
-		 * exists.
+		 * Keep a lid on concurrent compaction. MIGRATE_FREE
+		 * watermarks alone cannot be checked here, because
+		 * that's what the caller is trying to produce.
 		 */
 		watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
-		if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
+		if (!__zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA, free_pages))
 			return 0;
 	}
 
@@ -3976,27 +3976,59 @@  noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 }
 ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
 
-static inline long __zone_watermark_unusable_free(struct zone *z,
-				unsigned int order, unsigned int alloc_flags)
+static long page_state(struct zone *zone, enum zone_stat_item item, bool safe)
 {
-	const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
-	long unusable_free = (1 << order) - 1;
+	if (safe)
+		return zone_page_state_snapshot(zone, item);
+	else
+		return zone_page_state(zone, item);
+}
+
+static long __zone_free_pages(struct zone *zone, int alloc_flags, bool safe)
+{
+	long free_pages;
 
 	/*
-	 * If the caller does not have rights to ALLOC_HARDER then subtract
-	 * the high-atomic reserves. This will over-estimate the size of the
-	 * atomic reserve but it avoids a search.
+	 * Enforce watermarks against MIGRATE_FREE pages. This ensures
+	 * that there is always a reserve of higher-order pages
+	 * maintained for all migratetypes and allocation contexts.
+	 *
+	 * Allocations will still use up any compatible free pages
+	 * that may exist inside claimed blocks first. But the reserve
+	 * prevents smaller allocations from starving out higher-order
+	 * requests (which may not be able to sleep, e.g. highatomic).
+	 *
+	 * The additional memory requirements of this are minimal. If
+	 * internal free pages already exceed the compact_gap(), only
+	 * compaction is necessary to restore the watermarks.
 	 */
-	if (likely(!alloc_harder))
-		unusable_free += z->nr_reserved_highatomic;
+	free_pages = page_state(zone, NR_FREE_FREE, safe);
+	if (alloc_flags & (ALLOC_HARDER | ALLOC_OOM))
+		free_pages += page_state(zone, NR_FREE_HIGHATOMIC, safe);
+	if (IS_ENABLED(CONFIG_CMA) && (alloc_flags & ALLOC_CMA))
+		free_pages += page_state(zone, NR_FREE_CMA_PAGES, safe);
 
-#ifdef CONFIG_CMA
-	/* If allocation can't use CMA areas don't use free CMA pages */
-	if (!(alloc_flags & ALLOC_CMA))
-		unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
-#endif
+	if (!IS_ENABLED(CONFIG_COMPACTION)) {
+		/*
+		 * We can't reasonably defragment without compaction.
+		 * Consider everything and do best-effort grouping.
+		 */
+		free_pages += page_state(zone, NR_FREE_UNMOVABLE, safe);
+		free_pages += page_state(zone, NR_FREE_MOVABLE, safe);
+		free_pages += page_state(zone, NR_FREE_RECLAIMABLE, safe);
+	}
 
-	return unusable_free;
+	return free_pages;
+}
+
+static long zone_free_pages(struct zone *zone, int alloc_flags)
+{
+	return __zone_free_pages(zone, alloc_flags, false);
+}
+
+static long zone_free_pages_safe(struct zone *zone, int alloc_flags)
+{
+	return __zone_free_pages(zone, alloc_flags, true);
 }
 
 /*
@@ -4014,7 +4046,7 @@  bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 	const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
 
 	/* free_pages may go negative - that's OK */
-	free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
+	free_pages -= (1 << order) - 1;
 
 	if (alloc_flags & ALLOC_HIGH)
 		min -= min / 2;
@@ -4076,33 +4108,22 @@  bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 		      int highest_zoneidx, unsigned int alloc_flags)
 {
 	return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
-					zone_page_state(z, NR_FREE_PAGES));
+				   zone_free_pages(z, alloc_flags));
 }
 
 static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
 				unsigned long mark, int highest_zoneidx,
 				unsigned int alloc_flags, gfp_t gfp_mask)
 {
-	long free_pages;
-
-	free_pages = zone_page_state(z, NR_FREE_PAGES);
+	long free_pages = zone_free_pages(z, alloc_flags);
 
 	/*
 	 * Fast check for order-0 only. If this fails then the reserves
 	 * need to be calculated.
 	 */
-	if (!order) {
-		long usable_free;
-		long reserved;
-
-		usable_free = free_pages;
-		reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
-
-		/* reserved may over estimate high-atomic reserves. */
-		usable_free -= min(usable_free, reserved);
-		if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
-			return true;
-	}
+	if (!order && (free_pages - ((1 << order) - 1) >
+		       mark + z->lowmem_reserve[highest_zoneidx]))
+		return true;
 
 	if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
 					free_pages))
@@ -4126,13 +4147,8 @@  static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
 			unsigned long mark, int highest_zoneidx)
 {
-	long free_pages = zone_page_state(z, NR_FREE_PAGES);
-
-	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
-		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
-
 	return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
-								free_pages);
+				   zone_free_pages_safe(z, 0));
 }
 
 #ifdef CONFIG_NUMA
@@ -4524,12 +4540,14 @@  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	unsigned long pflags;
 	unsigned int noreclaim_flag;
 	struct capture_control capc = {
+		.order = order,
 		.migratetype = ac->migratetype,
 		.page = NULL,
 	};
+	int compact_order;
 
-	if (!order)
-		return NULL;
+	/* Use reclaim/compaction to produce neutral blocks */
+	compact_order = max_t(int, order, pageblock_order);
 
 	/*
 	 * Make sure the structs are really initialized before we expose the
@@ -4543,8 +4561,8 @@  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	delayacct_compact_start();
 	noreclaim_flag = memalloc_noreclaim_save();
 
-	*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
-					       prio, &capc);
+	*compact_result = try_to_compact_pages(gfp_mask, compact_order,
+					       alloc_flags, ac, prio, &capc);
 
 	memalloc_noreclaim_restore(noreclaim_flag);
 	psi_memstall_leave(&pflags);
@@ -4608,13 +4626,12 @@  should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
 		     enum compact_priority *compact_priority,
 		     int *compaction_retries)
 {
-	int min_priority;
 	bool ret = false;
 	int retries = *compaction_retries;
 	enum compact_priority priority = *compact_priority;
 
-	if (!order)
-		return false;
+	/* Use reclaim/compaction to produce neutral blocks */
+	order = max_t(int, order, pageblock_order);
 
 	if (fatal_signal_pending(current))
 		return false;
@@ -4624,20 +4641,6 @@  should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
 	 * failed, presumably due to a race. Retry a few times.
 	 */
 	if (compact_result == COMPACT_SUCCESS) {
-		int max_retries = MAX_COMPACT_RETRIES;
-
-		/*
-		 * !costly requests are much more important than
-		 * __GFP_RETRY_MAYFAIL costly ones because they are de
-		 * facto nofail and invoke OOM killer to move on while
-		 * costly can fail and users are ready to cope with
-		 * that. 1/4 retries is rather arbitrary but we would
-		 * need much more detailed feedback from compaction to
-		 * make a better decision.
-		 */
-		if (order > PAGE_ALLOC_COSTLY_ORDER)
-			max_retries /= 4;
-
 		ret = ++(*compaction_retries) <= MAX_COMPACT_RETRIES;
 		goto out;
 	}
@@ -4654,16 +4657,13 @@  should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
 	/*
 	 * Compaction failed. Retry with increasing priority.
 	 */
-	min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
-			MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
-
-	if (*compact_priority > min_priority) {
+	if (*compact_priority > MIN_COMPACT_PRIORITY) {
 		(*compact_priority)--;
 		*compaction_retries = 0;
 		ret = true;
 	}
 out:
-	trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
+	trace_compact_retry(order, priority, compact_result, retries, MAX_COMPACT_RETRIES, ret);
 	return ret;
 }
 #else
@@ -4822,9 +4822,16 @@  __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 	struct page *page = NULL;
 	unsigned long pflags;
 	bool drained = false;
+	int reclaim_order;
+
+	/* Use reclaim/compaction to produce neutral blocks */
+	if (IS_ENABLED(CONFIG_COMPACTION))
+		reclaim_order = max_t(int, order, pageblock_order);
+	else
+		reclaim_order = order;
 
 	psi_memstall_enter(&pflags);
-	*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
+	*did_some_progress = __perform_reclaim(gfp_mask, reclaim_order, ac);
 	if (unlikely(!(*did_some_progress)))
 		goto out;
 
@@ -4856,6 +4863,10 @@  static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
 	pg_data_t *last_pgdat = NULL;
 	enum zone_type highest_zoneidx = ac->highest_zoneidx;
 
+	/* Use reclaim/compaction to produce neutral blocks */
+	if (IS_ENABLED(CONFIG_COMPACTION))
+		order = max_t(unsigned int, order, pageblock_order);
+
 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
 					ac->nodemask) {
 		if (!managed_zone(zone))
@@ -4970,6 +4981,24 @@  should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 	struct zoneref *z;
 	bool ret = false;
 
+	/*
+	 * In the old world, order-0 pages only need reclaim, and
+	 * higher orders might be present but the order-0 watermarks
+	 * aren't met yet. These things can be fixed by reclaim alone.
+	 *
+	 * In the new world, though, watermark checks are against
+	 * MIGRATE_FREE blocks. That means if the watermarks aren't
+	 * met, reclaim isn't going to be the solution. Neither for
+	 * order-0 nor for anything else. Whether it makes sense to
+	 * retry depends fully on whether compaction should retry.
+	 *
+	 * should_compact_retry() already checks for COMPACT_SKIPPED
+	 * and compaction_zonelist_suitable() to test whether reclaim
+	 * is needed.
+	 */
+	if (IS_ENABLED(CONFIG_COMPACTION))
+		goto schedule;
+
 	/*
 	 * Costly allocations might have made a progress but this doesn't mean
 	 * their order will become available due to high fragmentation so
@@ -5019,6 +5048,7 @@  should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 		}
 	}
 
+schedule:
 	/*
 	 * Memory allocation/reclaim might be called from a WQ context and the
 	 * current implementation of the WQ concurrency control doesn't
@@ -8833,6 +8863,13 @@  static void __setup_per_zone_wmarks(void)
 			    mult_frac(zone_managed_pages(zone),
 				      watermark_scale_factor, 10000));
 
+		/*
+		 * Ensure the watermark delta is a multiple of the
+		 * neutral block that reclaim/compaction produces.
+		 */
+		if (IS_ENABLED(CONFIG_COMPACTION))
+			tmp = ALIGN(tmp, 1 << pageblock_order);
+
 		zone->watermark_boost = 0;
 		zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
 		zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 14d6116384cc..a7374cd6fe91 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -7438,8 +7438,7 @@  void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 
 	/* Hopeless node, leave it to direct reclaim if possible */
 	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
-	    (pgdat_balanced(pgdat, order, highest_zoneidx) &&
-	     !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
+	    pgdat_balanced(pgdat, order, highest_zoneidx)) {
 		/*
 		 * There may be plenty of free memory available, but it's too
 		 * fragmented for high-order allocations.  Wake up kcompactd
@@ -7447,8 +7446,7 @@  void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 		 * needed.  If it fails, it will defer subsequent attempts to
 		 * ratelimit its work.
 		 */
-		if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
-			wakeup_kcompactd(pgdat, order, highest_zoneidx);
+		wakeup_kcompactd(pgdat, order, highest_zoneidx);
 		return;
 	}
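
Editor's note: the recurring `max_t(..., order, pageblock_order)` pattern in the hunks above can be read as "reclaim and compaction are always asked for at least one whole pageblock, so any block they free comes out neutral rather than partially used." A minimal sketch of that rounding, not part of the patch and using a stand-in constant in place of the kernel's pageblock_order:

```c
/*
 * Illustrative sketch of the "produce neutral blocks" rounding used in
 * __alloc_pages_direct_reclaim(), __alloc_pages_direct_compact() and
 * wake_all_kswapds() above. PAGEBLOCK_ORDER is a stand-in constant; in
 * the kernel the rounding only applies when compaction is built in.
 */
#include <stdio.h>

#define PAGEBLOCK_ORDER 9   /* typical x86-64 value: 2MB blocks of 4KB pages */

static int reclaim_order(int request_order, int have_compaction)
{
	/*
	 * Without compaction there is no way to defragment whatever
	 * reclaim frees, so keep targeting the exact request order.
	 */
	if (!have_compaction)
		return request_order;

	/*
	 * Otherwise round every request up to a whole pageblock so that
	 * reclaim/compaction hand back complete MIGRATE_FREE blocks.
	 */
	return request_order > PAGEBLOCK_ORDER ? request_order : PAGEBLOCK_ORDER;
}

int main(void)
{
	printf("order-0 request  -> reclaim order %d\n", reclaim_order(0, 1));
	printf("order-3 request  -> reclaim order %d\n", reclaim_order(3, 1));
	printf("order-10 request -> reclaim order %d\n", reclaim_order(10, 1));
	return 0;
}
```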