
[v11,3/6] mm: make alloc_contig_range work at pageblock granularity

Message ID 20220425143118.2850746-4-zi.yan@sent.com (mailing list archive)
State New
Series Use pageblock_order for cma and alloc_contig_range alignment.

Commit Message

Zi Yan April 25, 2022, 2:31 p.m. UTC
From: Zi Yan <ziy@nvidia.com>

alloc_contig_range() worked at MAX_ORDER_NR_PAGES granularity to avoid
merging pageblocks with different migratetypes. It might unnecessarily
convert extra pageblocks at the beginning and at the end of the range.
Change alloc_contig_range() to work at pageblock granularity.

Special handling is needed for free pages and in-use pages across the
boundaries of the range specified by alloc_contig_range(), because these
partially isolated pages cause free page accounting issues. The free
pages will be split and freed into separate migratetype lists; the
in-use pages will be migrated and then the freed pages will be handled
in the aforementioned way.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 include/linux/page-isolation.h |   4 +-
 mm/internal.h                  |   6 ++
 mm/memory_hotplug.c            |   3 +-
 mm/page_alloc.c                |  54 ++++++++--
 mm/page_isolation.c            | 184 ++++++++++++++++++++++++++++++++-
 5 files changed, 233 insertions(+), 18 deletions(-)
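
For context, here is a minimal sketch of a hypothetical kernel-internal caller of the interface this series touches (example_grab_range() and its parameters are made up for illustration, assuming CONFIG_CONTIG_ALLOC and the existing alloc_contig_range()/free_contig_range() entry points). The point of the series is that the memory alloc_contig_range() has to isolate around the requested range shrinks from MAX_ORDER_NR_PAGES alignment toward pageblock alignment, so such a caller disturbs fewer surrounding pageblocks:

/*
 * Hypothetical caller sketch, for illustration only: grab nr_pages
 * physically contiguous pages starting at start_pfn, then give them
 * back.  [start_pfn, start_pfn + nr_pages) must lie within one zone.
 */
#include <linux/gfp.h>
#include <linux/mm.h>

static int example_grab_range(unsigned long start_pfn, unsigned long nr_pages,
			      gfp_t gfp_mask)
{
	unsigned long end_pfn = start_pfn + nr_pages;
	int ret;

	/* isolate, migrate away and allocate every page in the range */
	ret = alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, gfp_mask);
	if (ret)
		return ret;

	/* ... use pfn_to_page(start_pfn) .. pfn_to_page(end_pfn - 1) ... */

	free_contig_range(start_pfn, nr_pages);
	return 0;
}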

Comments

Zi Yan April 29, 2022, 1:54 p.m. UTC | #1
On 25 Apr 2022, at 10:31, Zi Yan wrote:

> From: Zi Yan <ziy@nvidia.com>
>
> alloc_contig_range() worked at MAX_ORDER_NR_PAGES granularity to avoid
> merging pageblocks with different migratetypes. It might unnecessarily
> convert extra pageblocks at the beginning and at the end of the range.
> Change alloc_contig_range() to work at pageblock granularity.
>
> Special handling is needed for free pages and in-use pages across the
> boundaries of the range specified by alloc_contig_range(), because these
> partially isolated pages cause free page accounting issues. The free
> pages will be split and freed into separate migratetype lists; the
> in-use pages will be migrated and then the freed pages will be handled
> in the aforementioned way.
>
> Reported-by: kernel test robot <lkp@intel.com>
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---
>  include/linux/page-isolation.h |   4 +-
>  mm/internal.h                  |   6 ++
>  mm/memory_hotplug.c            |   3 +-
>  mm/page_alloc.c                |  54 ++++++++--
>  mm/page_isolation.c            | 184 ++++++++++++++++++++++++++++++++-
>  5 files changed, 233 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
> index e14eddf6741a..5456b7be38ae 100644
> --- a/include/linux/page-isolation.h
> +++ b/include/linux/page-isolation.h
> @@ -42,7 +42,7 @@ int move_freepages_block(struct zone *zone, struct page *page,
>   */
>  int
>  start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
> -			 unsigned migratetype, int flags);
> +			 int migratetype, int flags, gfp_t gfp_flags);
>
>  /*
>   * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
> @@ -50,7 +50,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>   */
>  void
>  undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
> -			unsigned migratetype);
> +			int migratetype);
>
>  /*
>   * Test all pages in [start_pfn, end_pfn) are isolated or not.
> diff --git a/mm/internal.h b/mm/internal.h
> index 919fa07e1031..0667abd57634 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -359,6 +359,9 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
>  			  phys_addr_t min_addr,
>  			  int nid, bool exact_nid);
>
> +void split_free_page(struct page *free_page,
> +				int order, unsigned long split_pfn_offset);
> +
>  #if defined CONFIG_COMPACTION || defined CONFIG_CMA
>
>  /*
> @@ -422,6 +425,9 @@ isolate_freepages_range(struct compact_control *cc,
>  int
>  isolate_migratepages_range(struct compact_control *cc,
>  			   unsigned long low_pfn, unsigned long end_pfn);
> +
> +int __alloc_contig_migrate_range(struct compact_control *cc,
> +					unsigned long start, unsigned long end);
>  #endif
>  int find_suitable_fallback(struct free_area *area, unsigned int order,
>  			int migratetype, bool only_stealable, bool *can_steal);
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 4c6065e5d274..9f8ae4cb77ee 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1845,7 +1845,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>  	/* set above range as isolated */
>  	ret = start_isolate_page_range(start_pfn, end_pfn,
>  				       MIGRATE_MOVABLE,
> -				       MEMORY_OFFLINE | REPORT_FAILURE);
> +				       MEMORY_OFFLINE | REPORT_FAILURE,
> +				       GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
>  	if (ret) {
>  		reason = "failure to isolate range";
>  		goto failed_removal_pcplists_disabled;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index ce23ac8ad085..70ddd9a0bcf3 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1094,6 +1094,43 @@ static inline void __free_one_page(struct page *page,
>  		page_reporting_notify_free(order);
>  }
>
> +/**
> + * split_free_page() -- split a free page at split_pfn_offset
> + * @free_page:		the original free page
> + * @order:		the order of the page
> + * @split_pfn_offset:	split offset within the page
> + *
> + * It is used when the free page crosses two pageblocks with different migratetypes
> + * at split_pfn_offset within the page. The split free page will be put into
> + * separate migratetype lists afterwards. Otherwise, the function achieves
> + * nothing.
> + */
> +void split_free_page(struct page *free_page,
> +				int order, unsigned long split_pfn_offset)
> +{
> +	struct zone *zone = page_zone(free_page);
> +	unsigned long free_page_pfn = page_to_pfn(free_page);
> +	unsigned long pfn;
> +	unsigned long flags;
> +	int free_page_order;
> +
> +	spin_lock_irqsave(&zone->lock, flags);
> +	del_page_from_free_list(free_page, zone, order);
> +	for (pfn = free_page_pfn;
> +	     pfn < free_page_pfn + (1UL << order);) {
> +		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
> +
> +		free_page_order = ffs(split_pfn_offset) - 1;
> +		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
> +				mt, FPI_NONE);
> +		pfn += 1UL << free_page_order;
> +		split_pfn_offset -= (1UL << free_page_order);
> +		/* we have done the first part, now switch to second part */
> +		if (split_pfn_offset == 0)
> +			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
> +	}
> +	spin_unlock_irqrestore(&zone->lock, flags);
> +}
>  /*
>   * A bad page could be due to a number of fields. Instead of multiple branches,
>   * try and check multiple fields with one check. The caller must do a detailed
> @@ -8919,7 +8956,7 @@ static inline void alloc_contig_dump_pages(struct list_head *page_list)
>  #endif
>
>  /* [start, end) must belong to a single zone. */
> -static int __alloc_contig_migrate_range(struct compact_control *cc,
> +int __alloc_contig_migrate_range(struct compact_control *cc,
>  					unsigned long start, unsigned long end)
>  {
>  	/* This function is based on compact_zone() from compaction.c. */
> @@ -9002,7 +9039,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>  		       unsigned migratetype, gfp_t gfp_mask)
>  {
>  	unsigned long outer_start, outer_end;
> -	unsigned int order;
> +	int order;
>  	int ret = 0;
>
>  	struct compact_control cc = {
> @@ -9021,14 +9058,11 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>  	 * What we do here is we mark all pageblocks in range as
>  	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
>  	 * have different sizes, and due to the way page allocator
> -	 * work, we align the range to biggest of the two pages so
> -	 * that page allocator won't try to merge buddies from
> -	 * different pageblocks and change MIGRATE_ISOLATE to some
> -	 * other migration type.
> +	 * work, start_isolate_page_range() has special handlings for this.
>  	 *
>  	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
>  	 * migrate the pages from an unaligned range (ie. pages that
> -	 * we are interested in).  This will put all the pages in
> +	 * we are interested in). This will put all the pages in
>  	 * range back to page allocator as MIGRATE_ISOLATE.
>  	 *
>  	 * When this is done, we take the pages in range from page
> @@ -9042,9 +9076,9 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>  	 */
>
>  	ret = start_isolate_page_range(pfn_max_align_down(start),
> -				       pfn_max_align_up(end), migratetype, 0);
> +				pfn_max_align_up(end), migratetype, 0, gfp_mask);
>  	if (ret)
> -		return ret;
> +		goto done;
>
>  	drain_all_pages(cc.zone);
>
> @@ -9064,7 +9098,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>  	ret = 0;
>
>  	/*
> -	 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
> +	 * Pages from [start, end) are within a pageblock_nr_pages
>  	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
>  	 * more, all pages in [start, end) are free in page allocator.
>  	 * What we are going to do is to allocate all pages from
> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
> index c2f7a8bb634d..94b3467e5ba2 100644
> --- a/mm/page_isolation.c
> +++ b/mm/page_isolation.c
> @@ -203,7 +203,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
>  	return -EBUSY;
>  }
>
> -static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
> +static void unset_migratetype_isolate(struct page *page, int migratetype)
>  {
>  	struct zone *zone;
>  	unsigned long flags, nr_pages;
> @@ -279,6 +279,157 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>  	return NULL;
>  }
>
> +/**
> + * isolate_single_pageblock() -- tries to isolate a pageblock that might be
> + * within a free or in-use page.
> + * @boundary_pfn:		pageblock-aligned pfn that a page might cross
> + * @gfp_flags:			GFP flags used for migrating pages
> + * @isolate_before:	isolate the pageblock before the boundary_pfn
> + *
> + * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
> + * pageblock. When not all pageblocks within a page are isolated at the same
> + * time, free page accounting can go wrong. For example, in the case of
> + * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pageblocks.
> + * [         MAX_ORDER-1         ]
> + * [  pageblock0  |  pageblock1  ]
> + * When either pageblock is isolated, if it is a free page, the page is not
> + * split into separate migratetype lists, which is supposed to; if it is an
> + * in-use page and freed later, __free_one_page() does not split the free page
> + * either. The function handles this by splitting the free page or migrating
> + * the in-use page then splitting the free page.
> + */
> +static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
> +			bool isolate_before)
> +{
> +	unsigned char saved_mt;
> +	unsigned long start_pfn;
> +	unsigned long isolate_pageblock;
> +	unsigned long pfn;
> +	struct zone *zone;
> +
> +	VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages));
> +
> +	if (isolate_before)
> +		isolate_pageblock = boundary_pfn - pageblock_nr_pages;
> +	else
> +		isolate_pageblock = boundary_pfn;
> +
> +	/*
> +	 * scan at the beginning of MAX_ORDER_NR_PAGES aligned range to avoid
> +	 * only isolating a subset of pageblocks from a bigger than pageblock
> +	 * free or in-use page. Also make sure all to-be-isolated pageblocks
> +	 * are within the same zone.
> +	 */
> +	zone  = page_zone(pfn_to_page(isolate_pageblock));
> +	start_pfn  = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES),
> +				      zone->zone_start_pfn);
> +
> +	saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
> +	set_pageblock_migratetype(pfn_to_page(isolate_pageblock), MIGRATE_ISOLATE);
> +
> +	/*
> +	 * Bail out early when the to-be-isolated pageblock does not form
> +	 * a free or in-use page across boundary_pfn:
> +	 *
> +	 * 1. isolate before boundary_pfn: the page after is not online
> +	 * 2. isolate after boundary_pfn: the page before is not online
> +	 *
> +	 * This also ensures correctness. Without it, when isolate after
> +	 * boundary_pfn and [start_pfn, boundary_pfn) are not online,
> +	 * __first_valid_page() will return unexpected NULL in the for loop
> +	 * below.
> +	 */
> +	if (isolate_before) {
> +		if (!pfn_to_online_page(boundary_pfn))
> +			return 0;
> +	} else {
> +		if (!pfn_to_online_page(boundary_pfn - 1))
> +			return 0;
> +	}
> +
> +	for (pfn = start_pfn; pfn < boundary_pfn;) {
> +		struct page *page = __first_valid_page(pfn, boundary_pfn - pfn);
> +
> +		VM_BUG_ON(!page);
> +		pfn = page_to_pfn(page);
> +		/*
> +		 * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any
> +		 * free pages in [start_pfn, boundary_pfn), its head page will
> +		 * always be in the range.
> +		 */
> +		if (PageBuddy(page)) {
> +			int order = buddy_order(page);
> +
> +			if (pfn + (1UL << order) > boundary_pfn)
> +				split_free_page(page, order, boundary_pfn - pfn);
> +			pfn += (1UL << order);
> +			continue;
> +		}
> +		/*
> +		 * migrate compound pages then let the free page handling code
> +		 * above do the rest. If migration is not enabled, just fail.
> +		 */
> +		if (PageHuge(page) || PageTransCompound(page)) {
> +#if defined CONFIG_COMPACTION || defined CONFIG_CMA
> +			unsigned long nr_pages = compound_nr(page);
> +			int order = compound_order(page);
> +			struct page *head = compound_head(page);
> +			unsigned long head_pfn = page_to_pfn(head);
> +			int ret;
> +			struct compact_control cc = {
> +				.nr_migratepages = 0,
> +				.order = -1,
> +				.zone = page_zone(pfn_to_page(head_pfn)),
> +				.mode = MIGRATE_SYNC,
> +				.ignore_skip_hint = true,
> +				.no_set_skip_hint = true,
> +				.gfp_mask = gfp_flags,
> +				.alloc_contig = true,
> +			};
> +			INIT_LIST_HEAD(&cc.migratepages);
> +
> +			if (head_pfn + nr_pages < boundary_pfn) {
> +				pfn += nr_pages;
> +				continue;
> +			}
> +
> +			ret = __alloc_contig_migrate_range(&cc, head_pfn,
> +						head_pfn + nr_pages);
> +
> +			if (ret)
> +				goto failed;
> +			/*
> +			 * reset pfn, let the free page handling code above
> +			 * split the free page to the right migratetype list.
> +			 *
> +			 * head_pfn is not used here as a hugetlb page order
> +			 * can be bigger than MAX_ORDER-1, but after it is
> +			 * freed, the free page order is not. Use pfn within
> +			 * the range to find the head of the free page and
> +			 * reset order to 0 if a hugetlb page with
> +			 * >MAX_ORDER-1 order is encountered.
> +			 */
> +			if (order > MAX_ORDER-1)
> +				order = 0;
> +			while (!PageBuddy(pfn_to_page(pfn))) {
> +				order++;
> +				pfn &= ~0UL << order;
> +			}
> +			continue;
> +#else
> +			goto failed;
> +#endif
> +		}
> +
> +		pfn++;
> +	}
> +	return 0;
> +failed:
> +	/* restore the original migratetype */
> +	set_pageblock_migratetype(pfn_to_page(isolate_pageblock), saved_mt);
> +	return -EBUSY;
> +}
> +
>  /**
>   * start_isolate_page_range() - make page-allocation-type of range of pages to
>   * be MIGRATE_ISOLATE.
> @@ -293,6 +444,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>   *					 and PageOffline() pages.
>   *			REPORT_FAILURE - report details about the failure to
>   *			isolate the range
> + * @gfp_flags:		GFP flags used for migrating pages that sit across the
> + *			range boundaries.
>   *
>   * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
>   * the range will never be allocated. Any free pages and pages freed in the
> @@ -301,6 +454,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>   * pages in the range finally, the caller have to free all pages in the range.
>   * test_page_isolated() can be used for test it.
>   *
> + * The function first tries to isolate the pageblocks at the beginning and end
> + * of the range, since there might be pages across the range boundaries.
> + * Afterwards, it isolates the rest of the range.
> + *
>   * There is no high level synchronization mechanism that prevents two threads
>   * from trying to isolate overlapping ranges. If this happens, one thread
>   * will notice pageblocks in the overlapping range already set to isolate.
> @@ -321,21 +478,38 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>   * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
>   */
>  int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
> -			     unsigned migratetype, int flags)
> +			     int migratetype, int flags, gfp_t gfp_flags)
>  {
>  	unsigned long pfn;
>  	struct page *page;
> +	int ret;
>
>  	BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
>  	BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
>
> -	for (pfn = start_pfn;
> -	     pfn < end_pfn;
> +	/* isolate [start_pfn, start_pfn + pageblock_nr_pages) pageblock */
> +	ret = isolate_single_pageblock(start_pfn, gfp_flags, false);
> +	if (ret)
> +		return ret;
> +
> +	/* isolate [end_pfn - pageblock_nr_pages, end_pfn) pageblock */
> +	ret = isolate_single_pageblock(end_pfn, gfp_flags, true);
> +	if (ret) {
> +		unset_migratetype_isolate(pfn_to_page(start_pfn), migratetype);
> +		return ret;
> +	}
> +
> +	/* skip isolated pageblocks at the beginning and end */
> +	for (pfn = start_pfn + pageblock_nr_pages;
> +	     pfn < end_pfn - pageblock_nr_pages;
>  	     pfn += pageblock_nr_pages) {
>  		page = __first_valid_page(pfn, pageblock_nr_pages);
>  		if (page && set_migratetype_isolate(page, migratetype, flags,
>  					start_pfn, end_pfn)) {
>  			undo_isolate_page_range(start_pfn, pfn, migratetype);
> +			unset_migratetype_isolate(
> +				pfn_to_page(end_pfn - pageblock_nr_pages),
> +				migratetype);
>  			return -EBUSY;
>  		}
>  	}
> @@ -346,7 +520,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>   * Make isolated pages available again.
>   */
>  void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
> -			    unsigned migratetype)
> +			    int migratetype)
>  {
>  	unsigned long pfn;
>  	struct page *page;
> -- 
> 2.35.1

Qian Cai hit a bug caused by this series (https://lore.kernel.org/linux-mm/20220426201855.GA1014@qian/),
and the fix is:

diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 75e454f5cf45..b3f074d1682e 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -367,58 +367,67 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
 		}
 		/*
 		 * migrate compound pages then let the free page handling code
-		 * above do the rest. If migration is not enabled, just fail.
+		 * above do the rest. If migration is not possible, just fail.
 		 */
-		if (PageHuge(page) || PageTransCompound(page)) {
-#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+		if (PageCompound(page)) {
 			unsigned long nr_pages = compound_nr(page);
-			int order = compound_order(page);
 			struct page *head = compound_head(page);
 			unsigned long head_pfn = page_to_pfn(head);
-			int ret;
-			struct compact_control cc = {
-				.nr_migratepages = 0,
-				.order = -1,
-				.zone = page_zone(pfn_to_page(head_pfn)),
-				.mode = MIGRATE_SYNC,
-				.ignore_skip_hint = true,
-				.no_set_skip_hint = true,
-				.gfp_mask = gfp_flags,
-				.alloc_contig = true,
-			};
-			INIT_LIST_HEAD(&cc.migratepages);

 			if (head_pfn + nr_pages < boundary_pfn) {
-				pfn += nr_pages;
+				pfn = head_pfn + nr_pages;
 				continue;
 			}
-
-			ret = __alloc_contig_migrate_range(&cc, head_pfn,
-						head_pfn + nr_pages);
-
-			if (ret)
-				goto failed;
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
 			/*
-			 * reset pfn, let the free page handling code above
-			 * split the free page to the right migratetype list.
-			 *
-			 * head_pfn is not used here as a hugetlb page order
-			 * can be bigger than MAX_ORDER-1, but after it is
-			 * freed, the free page order is not. Use pfn within
-			 * the range to find the head of the free page and
-			 * reset order to 0 if a hugetlb page with
-			 * >MAX_ORDER-1 order is encountered.
+			 * hugetlb, lru compound (THP), and movable compound pages
+			 * can be migrated. Otherwise, fail the isolation.
 			 */
-			if (order > MAX_ORDER-1)
+			if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) {
+				int order;
+				unsigned long outer_pfn;
+				int ret;
+				struct compact_control cc = {
+					.nr_migratepages = 0,
+					.order = -1,
+					.zone = page_zone(pfn_to_page(head_pfn)),
+					.mode = MIGRATE_SYNC,
+					.ignore_skip_hint = true,
+					.no_set_skip_hint = true,
+					.gfp_mask = gfp_flags,
+					.alloc_contig = true,
+				};
+				INIT_LIST_HEAD(&cc.migratepages);
+
+				ret = __alloc_contig_migrate_range(&cc, head_pfn,
+							head_pfn + nr_pages);
+
+				if (ret)
+					goto failed;
+				/*
+				 * reset pfn to the head of the free page, so
+				 * that the free page handling code above can split
+				 * the free page to the right migratetype list.
+				 *
+				 * head_pfn is not used here as a hugetlb page order
+				 * can be bigger than MAX_ORDER-1, but after it is
+				 * freed, the free page order is not. Use pfn within
+				 * the range to find the head of the free page.
+				 */
 				order = 0;
-			while (!PageBuddy(pfn_to_page(pfn))) {
-				order++;
-				pfn &= ~0UL << order;
-			}
-			continue;
-#else
-			goto failed;
+				outer_pfn = pfn;
+				while (!PageBuddy(pfn_to_page(outer_pfn))) {
+					if (++order >= MAX_ORDER) {
+						outer_pfn = pfn;
+						break;
+					}
+					outer_pfn &= ~0UL << order;
+				}
+				pfn = outer_pfn;
+				continue;
+			} else
 #endif
+				goto failed;
 		}

 		pfn++;
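
As a toy illustration (userspace only, made-up numbers) of the first hunk above: when the scan lands on a tail page of a compound page, the old "pfn += nr_pages" jumps past the end of that compound page and leaves pfns that are never inspected, while "pfn = head_pfn + nr_pages" resumes exactly at the page following it.

#include <stdio.h>

int main(void)
{
	unsigned long head_pfn = 512;	/* head of an order-9 compound page */
	unsigned long nr_pages = 512;	/* compound_nr() of that page */
	unsigned long pfn = 600;	/* the scan landed on a tail page */

	printf("old: pfn += nr_pages           -> %lu (pfns %lu..%lu never scanned)\n",
	       pfn + nr_pages, head_pfn + nr_pages, pfn + nr_pages - 1);
	printf("new: pfn = head_pfn + nr_pages -> %lu\n", head_pfn + nr_pages);
	return 0;
}
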
Zi Yan May 24, 2022, 7 p.m. UTC | #2
>
> From fce466e89e50bcb0ebb56d7809db1b8bbea47628 Mon Sep 17 00:00:00 2001
> From: Zi Yan <ziy@nvidia.com>
> Date: Tue, 26 Apr 2022 23:00:33 -0400
> Subject: [PATCH] mm: make alloc_contig_range work at pageblock granularity
>
> alloc_contig_range() worked at MAX_ORDER_NR_PAGES granularity to avoid
> merging pageblocks with different migratetypes.  It might unnecessarily
> convert extra pageblocks at the beginning and at the end of the range.
> Change alloc_contig_range() to work at pageblock granularity.
>
> Special handling is needed for free pages and in-use pages across the
> boundaries of the range specified by alloc_contig_range(), because these
> partially isolated pages cause free page accounting issues.  The free
> pages will be split and freed into separate migratetype lists; the in-use
> pages will be migrated and then the freed pages will be handled in the
> aforementioned way.
>
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---
>  include/linux/page-isolation.h |   4 +-
>  mm/internal.h                  |   6 +
>  mm/memory_hotplug.c            |   3 +-
>  mm/page_alloc.c                |  54 +++++++--
>  mm/page_isolation.c            | 193 ++++++++++++++++++++++++++++++++-
>  5 files changed, 242 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
> index e14eddf6741a..5456b7be38ae 100644
> --- a/include/linux/page-isolation.h
> +++ b/include/linux/page-isolation.h
> @@ -42,7 +42,7 @@ int move_freepages_block(struct zone *zone, struct page *page,
>   */
>  int
>  start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
> -			 unsigned migratetype, int flags);
> +			 int migratetype, int flags, gfp_t gfp_flags);
>
>  /*
>   * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
> @@ -50,7 +50,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>   */
>  void
>  undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
> -			unsigned migratetype);
> +			int migratetype);
>
>  /*
>   * Test all pages in [start_pfn, end_pfn) are isolated or not.
> diff --git a/mm/internal.h b/mm/internal.h
> index 919fa07e1031..0667abd57634 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -359,6 +359,9 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
>  			  phys_addr_t min_addr,
>  			  int nid, bool exact_nid);
>
> +void split_free_page(struct page *free_page,
> +				int order, unsigned long split_pfn_offset);
> +
>  #if defined CONFIG_COMPACTION || defined CONFIG_CMA
>
>  /*
> @@ -422,6 +425,9 @@ isolate_freepages_range(struct compact_control *cc,
>  int
>  isolate_migratepages_range(struct compact_control *cc,
>  			   unsigned long low_pfn, unsigned long end_pfn);
> +
> +int __alloc_contig_migrate_range(struct compact_control *cc,
> +					unsigned long start, unsigned long end);
>  #endif
>  int find_suitable_fallback(struct free_area *area, unsigned int order,
>  			int migratetype, bool only_stealable, bool *can_steal);
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 4c6065e5d274..9f8ae4cb77ee 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1845,7 +1845,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>  	/* set above range as isolated */
>  	ret = start_isolate_page_range(start_pfn, end_pfn,
>  				       MIGRATE_MOVABLE,
> -				       MEMORY_OFFLINE | REPORT_FAILURE);
> +				       MEMORY_OFFLINE | REPORT_FAILURE,
> +				       GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
>  	if (ret) {
>  		reason = "failure to isolate range";
>  		goto failed_removal_pcplists_disabled;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 93dbe05a6029..6a0d1746c095 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1094,6 +1094,43 @@ static inline void __free_one_page(struct page *page,
>  		page_reporting_notify_free(order);
>  }
>
> +/**
> + * split_free_page() -- split a free page at split_pfn_offset
> + * @free_page:		the original free page
> + * @order:		the order of the page
> + * @split_pfn_offset:	split offset within the page
> + *
> + * It is used when the free page crosses two pageblocks with different migratetypes
> + * at split_pfn_offset within the page. The split free page will be put into
> + * separate migratetype lists afterwards. Otherwise, the function achieves
> + * nothing.
> + */
> +void split_free_page(struct page *free_page,
> +				int order, unsigned long split_pfn_offset)
> +{
> +	struct zone *zone = page_zone(free_page);
> +	unsigned long free_page_pfn = page_to_pfn(free_page);
> +	unsigned long pfn;
> +	unsigned long flags;
> +	int free_page_order;
> +
> +	spin_lock_irqsave(&zone->lock, flags);
> +	del_page_from_free_list(free_page, zone, order);
> +	for (pfn = free_page_pfn;
> +	     pfn < free_page_pfn + (1UL << order);) {
> +		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
> +
> +		free_page_order = ffs(split_pfn_offset) - 1;
> +		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
> +				mt, FPI_NONE);
> +		pfn += 1UL << free_page_order;
> +		split_pfn_offset -= (1UL << free_page_order);
> +		/* we have done the first part, now switch to second part */
> +		if (split_pfn_offset == 0)
> +			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
> +	}
> +	spin_unlock_irqrestore(&zone->lock, flags);
> +}
>  /*
>   * A bad page could be due to a number of fields. Instead of multiple branches,
>   * try and check multiple fields with one check. The caller must do a detailed
> @@ -8919,7 +8956,7 @@ static inline void alloc_contig_dump_pages(struct list_head *page_list)
>  #endif
>
>  /* [start, end) must belong to a single zone. */
> -static int __alloc_contig_migrate_range(struct compact_control *cc,
> +int __alloc_contig_migrate_range(struct compact_control *cc,
>  					unsigned long start, unsigned long end)
>  {
>  	/* This function is based on compact_zone() from compaction.c. */
> @@ -9002,7 +9039,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>  		       unsigned migratetype, gfp_t gfp_mask)
>  {
>  	unsigned long outer_start, outer_end;
> -	unsigned int order;
> +	int order;
>  	int ret = 0;
>
>  	struct compact_control cc = {
> @@ -9021,14 +9058,11 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>  	 * What we do here is we mark all pageblocks in range as
>  	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
>  	 * have different sizes, and due to the way page allocator
> -	 * work, we align the range to biggest of the two pages so
> -	 * that page allocator won't try to merge buddies from
> -	 * different pageblocks and change MIGRATE_ISOLATE to some
> -	 * other migration type.
> +	 * work, start_isolate_page_range() has special handlings for this.
>  	 *
>  	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
>  	 * migrate the pages from an unaligned range (ie. pages that
> -	 * we are interested in).  This will put all the pages in
> +	 * we are interested in). This will put all the pages in
>  	 * range back to page allocator as MIGRATE_ISOLATE.
>  	 *
>  	 * When this is done, we take the pages in range from page
> @@ -9042,9 +9076,9 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>  	 */
>
>  	ret = start_isolate_page_range(pfn_max_align_down(start),
> -				       pfn_max_align_up(end), migratetype, 0);
> +				pfn_max_align_up(end), migratetype, 0, gfp_mask);
>  	if (ret)
> -		return ret;
> +		goto done;
>
>  	drain_all_pages(cc.zone);
>
> @@ -9064,7 +9098,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>  	ret = 0;
>
>  	/*
> -	 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
> +	 * Pages from [start, end) are within a pageblock_nr_pages
>  	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
>  	 * more, all pages in [start, end) are free in page allocator.
>  	 * What we are going to do is to allocate all pages from
> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
> index c2f7a8bb634d..8a0f16d2e4c3 100644
> --- a/mm/page_isolation.c
> +++ b/mm/page_isolation.c
> @@ -203,7 +203,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
>  	return -EBUSY;
>  }
>
> -static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
> +static void unset_migratetype_isolate(struct page *page, int migratetype)
>  {
>  	struct zone *zone;
>  	unsigned long flags, nr_pages;
> @@ -279,6 +279,166 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>  	return NULL;
>  }
>
> +/**
> + * isolate_single_pageblock() -- tries to isolate a pageblock that might be
> + * within a free or in-use page.
> + * @boundary_pfn:		pageblock-aligned pfn that a page might cross
> + * @gfp_flags:			GFP flags used for migrating pages
> + * @isolate_before:	isolate the pageblock before the boundary_pfn
> + *
> + * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
> + * pageblock. When not all pageblocks within a page are isolated at the same
> + * time, free page accounting can go wrong. For example, in the case of
> + * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pageblocks.
> + * [         MAX_ORDER-1         ]
> + * [  pageblock0  |  pageblock1  ]
> + * When either pageblock is isolated, if it is a free page, the page is not
> + * split into separate migratetype lists, which is supposed to; if it is an
> + * in-use page and freed later, __free_one_page() does not split the free page
> + * either. The function handles this by splitting the free page or migrating
> + * the in-use page then splitting the free page.
> + */
> +static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
> +			bool isolate_before)
> +{
> +	unsigned char saved_mt;
> +	unsigned long start_pfn;
> +	unsigned long isolate_pageblock;
> +	unsigned long pfn;
> +	struct zone *zone;
> +
> +	VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages));
> +
> +	if (isolate_before)
> +		isolate_pageblock = boundary_pfn - pageblock_nr_pages;
> +	else
> +		isolate_pageblock = boundary_pfn;
> +
> +	/*
> +	 * scan at the beginning of MAX_ORDER_NR_PAGES aligned range to avoid
> +	 * only isolating a subset of pageblocks from a bigger than pageblock
> +	 * free or in-use page. Also make sure all to-be-isolated pageblocks
> +	 * are within the same zone.
> +	 */
> +	zone  = page_zone(pfn_to_page(isolate_pageblock));
> +	start_pfn  = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES),
> +				      zone->zone_start_pfn);
> +
> +	saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
> +	set_pageblock_migratetype(pfn_to_page(isolate_pageblock), MIGRATE_ISOLATE);
> +
> +	/*
> +	 * Bail out early when the to-be-isolated pageblock does not form
> +	 * a free or in-use page across boundary_pfn:
> +	 *
> +	 * 1. isolate before boundary_pfn: the page after is not online
> +	 * 2. isolate after boundary_pfn: the page before is not online
> +	 *
> +	 * This also ensures correctness. Without it, when isolate after
> +	 * boundary_pfn and [start_pfn, boundary_pfn) are not online,
> +	 * __first_valid_page() will return unexpected NULL in the for loop
> +	 * below.
> +	 */
> +	if (isolate_before) {
> +		if (!pfn_to_online_page(boundary_pfn))
> +			return 0;
> +	} else {
> +		if (!pfn_to_online_page(boundary_pfn - 1))
> +			return 0;
> +	}
> +
> +	for (pfn = start_pfn; pfn < boundary_pfn;) {
> +		struct page *page = __first_valid_page(pfn, boundary_pfn - pfn);
> +
> +		VM_BUG_ON(!page);
> +		pfn = page_to_pfn(page);
> +		/*
> +		 * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any
> +		 * free pages in [start_pfn, boundary_pfn), its head page will
> +		 * always be in the range.
> +		 */
> +		if (PageBuddy(page)) {
> +			int order = buddy_order(page);
> +
> +			if (pfn + (1UL << order) > boundary_pfn)
> +				split_free_page(page, order, boundary_pfn - pfn);
> +			pfn += (1UL << order);
> +			continue;
> +		}
> +		/*
> +		 * migrate compound pages then let the free page handling code
> +		 * above do the rest. If migration is not possible, just fail.
> +		 */
> +		if (PageCompound(page)) {
> +			unsigned long nr_pages = compound_nr(page);
> +			struct page *head = compound_head(page);
> +			unsigned long head_pfn = page_to_pfn(head);
> +
> +			if (head_pfn + nr_pages < boundary_pfn) {
> +				pfn = head_pfn + nr_pages;
> +				continue;
> +			}
> +#if defined CONFIG_COMPACTION || defined CONFIG_CMA
> +			/*
> +			 * hugetlb, lru compound (THP), and movable compound pages
> +			 * can be migrated. Otherwise, fail the isolation.
> +			 */
> +			if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) {
> +				int order;
> +				unsigned long outer_pfn;
> +				int ret;
> +				struct compact_control cc = {
> +					.nr_migratepages = 0,
> +					.order = -1,
> +					.zone = page_zone(pfn_to_page(head_pfn)),
> +					.mode = MIGRATE_SYNC,
> +					.ignore_skip_hint = true,
> +					.no_set_skip_hint = true,
> +					.gfp_mask = gfp_flags,
> +					.alloc_contig = true,
> +				};
> +				INIT_LIST_HEAD(&cc.migratepages);
> +
> +				ret = __alloc_contig_migrate_range(&cc, head_pfn,
> +							head_pfn + nr_pages);
> +
> +				if (ret)
> +					goto failed;
> +				/*
> +				 * reset pfn to the head of the free page, so
> +				 * that the free page handling code above can split
> +				 * the free page to the right migratetype list.
> +				 *
> +				 * head_pfn is not used here as a hugetlb page order
> +				 * can be bigger than MAX_ORDER-1, but after it is
> +				 * freed, the free page order is not. Use pfn within
> +				 * the range to find the head of the free page.
> +				 */
> +				order = 0;
> +				outer_pfn = pfn;
> +				while (!PageBuddy(pfn_to_page(outer_pfn))) {
> +					if (++order >= MAX_ORDER) {
> +						outer_pfn = pfn;
> +						break;
> +					}
> +					outer_pfn &= ~0UL << order;
> +				}
> +				pfn = outer_pfn;
> +				continue;
> +			} else
> +#endif
> +				goto failed;
> +		}
> +
> +		pfn++;
> +	}
> +	return 0;
> +failed:
> +	/* restore the original migratetype */
> +	set_pageblock_migratetype(pfn_to_page(isolate_pageblock), saved_mt);
> +	return -EBUSY;
> +}
> +
>  /**
>   * start_isolate_page_range() - make page-allocation-type of range of pages to
>   * be MIGRATE_ISOLATE.
> @@ -293,6 +453,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>   *					 and PageOffline() pages.
>   *			REPORT_FAILURE - report details about the failure to
>   *			isolate the range
> + * @gfp_flags:		GFP flags used for migrating pages that sit across the
> + *			range boundaries.
>   *
>   * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
>   * the range will never be allocated. Any free pages and pages freed in the
> @@ -301,6 +463,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>   * pages in the range finally, the caller have to free all pages in the range.
>   * test_page_isolated() can be used for test it.
>   *
> + * The function first tries to isolate the pageblocks at the beginning and end
> + * of the range, since there might be pages across the range boundaries.
> + * Afterwards, it isolates the rest of the range.
> + *
>   * There is no high level synchronization mechanism that prevents two threads
>   * from trying to isolate overlapping ranges. If this happens, one thread
>   * will notice pageblocks in the overlapping range already set to isolate.
> @@ -321,21 +487,38 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>   * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
>   */
>  int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
> -			     unsigned migratetype, int flags)
> +			     int migratetype, int flags, gfp_t gfp_flags)
>  {
>  	unsigned long pfn;
>  	struct page *page;
> +	int ret;
>
>  	BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
>  	BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
>
> -	for (pfn = start_pfn;
> -	     pfn < end_pfn;
> +	/* isolate [start_pfn, start_pfn + pageblock_nr_pages) pageblock */
> +	ret = isolate_single_pageblock(start_pfn, gfp_flags, false);
> +	if (ret)
> +		return ret;
> +
> +	/* isolate [end_pfn - pageblock_nr_pages, end_pfn) pageblock */
> +	ret = isolate_single_pageblock(end_pfn, gfp_flags, true);
> +	if (ret) {
> +		unset_migratetype_isolate(pfn_to_page(start_pfn), migratetype);
> +		return ret;
> +	}
> +
> +	/* skip isolated pageblocks at the beginning and end */
> +	for (pfn = start_pfn + pageblock_nr_pages;
> +	     pfn < end_pfn - pageblock_nr_pages;
>  	     pfn += pageblock_nr_pages) {
>  		page = __first_valid_page(pfn, pageblock_nr_pages);
>  		if (page && set_migratetype_isolate(page, migratetype, flags,
>  					start_pfn, end_pfn)) {
>  			undo_isolate_page_range(start_pfn, pfn, migratetype);
> +			unset_migratetype_isolate(
> +				pfn_to_page(end_pfn - pageblock_nr_pages),
> +				migratetype);
>  			return -EBUSY;
>  		}
>  	}
> @@ -346,7 +529,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>   * Make isolated pages available again.
>   */
>  void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
> -			    unsigned migratetype)
> +			    int migratetype)
>  {
>  	unsigned long pfn;
>  	struct page *page;
> -- 
> 2.35.1
>
> --
> Best Regards,
> Yan, Zi

To address the infinite loop issue reported by Qian Cai, the following fixup should be applied to the commit above; another fixup patch should be applied to Patch 4 in this series (I will reply to the Patch 4 email):


diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0c7252ed14a0..76551933bb1d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1114,13 +1114,16 @@ void split_free_page(struct page *free_page,
 	unsigned long flags;
 	int free_page_order;

+	if (split_pfn_offset == 0)
+		return;
+
 	spin_lock_irqsave(&zone->lock, flags);
 	del_page_from_free_list(free_page, zone, order);
 	for (pfn = free_page_pfn;
 	     pfn < free_page_pfn + (1UL << order);) {
 		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);

-		free_page_order = ffs(split_pfn_offset) - 1;
+		free_page_order = min(pfn ? __ffs(pfn) : order, __fls(split_pfn_offset));
 		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
 				mt, FPI_NONE);
 		pfn += 1UL << free_page_order;
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 8a0f16d2e4c3..7e45736d6451 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -283,6 +283,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  * isolate_single_pageblock() -- tries to isolate a pageblock that might be
  * within a free or in-use page.
  * @boundary_pfn:		pageblock-aligned pfn that a page might cross
+ * @flags:			isolation flags
  * @gfp_flags:			GFP flags used for migrating pages
  * @isolate_before:	isolate the pageblock before the boundary_pfn
  *
@@ -298,14 +299,15 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  * either. The function handles this by splitting the free page or migrating
  * the in-use page then splitting the free page.
  */
-static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
-			bool isolate_before)
+static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
+			gfp_t gfp_flags, bool isolate_before)
 {
 	unsigned char saved_mt;
 	unsigned long start_pfn;
 	unsigned long isolate_pageblock;
 	unsigned long pfn;
 	struct zone *zone;
+	int ret;

 	VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages));

@@ -325,7 +327,11 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
 				      zone->zone_start_pfn);

 	saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
-	set_pageblock_migratetype(pfn_to_page(isolate_pageblock), MIGRATE_ISOLATE);
+	ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
+			isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
+
+	if (ret)
+		return ret;

 	/*
 	 * Bail out early when the to-be-isolated pageblock does not form
@@ -374,7 +380,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
 			struct page *head = compound_head(page);
 			unsigned long head_pfn = page_to_pfn(head);

-			if (head_pfn + nr_pages < boundary_pfn) {
+			if (head_pfn + nr_pages <= boundary_pfn) {
 				pfn = head_pfn + nr_pages;
 				continue;
 			}
@@ -386,7 +392,8 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
 			if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) {
 				int order;
 				unsigned long outer_pfn;
-				int ret;
+				int page_mt = get_pageblock_migratetype(page);
+				bool isolate_page = !is_migrate_isolate_page(page);
 				struct compact_control cc = {
 					.nr_migratepages = 0,
 					.order = -1,
@@ -399,9 +406,31 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
 				};
 				INIT_LIST_HEAD(&cc.migratepages);

+				/*
+				 * XXX: mark the page as MIGRATE_ISOLATE so that
+				 * no one else can grab the freed page after migration.
+				 * Ideally, the page should be freed as two separate
+				 * pages to be added into separate migratetype free
+				 * lists.
+				 */
+				if (isolate_page) {
+					ret = set_migratetype_isolate(page, page_mt,
+						flags, head_pfn, head_pfn + nr_pages);
+					if (ret)
+						goto failed;
+				}
+
 				ret = __alloc_contig_migrate_range(&cc, head_pfn,
 							head_pfn + nr_pages);

+				/*
+				 * restore the page's migratetype so that it can
+				 * be split into separate migratetype free lists
+				 * later.
+				 */
+				if (isolate_page)
+					unset_migratetype_isolate(page, page_mt);
+
 				if (ret)
 					goto failed;
 				/*
@@ -417,10 +446,9 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
 				order = 0;
 				outer_pfn = pfn;
 				while (!PageBuddy(pfn_to_page(outer_pfn))) {
-					if (++order >= MAX_ORDER) {
-						outer_pfn = pfn;
-						break;
-					}
+					/* stop if we cannot find the free page */
+					if (++order >= MAX_ORDER)
+						goto failed;
 					outer_pfn &= ~0UL << order;
 				}
 				pfn = outer_pfn;
@@ -435,7 +463,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
 	return 0;
 failed:
 	/* restore the original migratetype */
-	set_pageblock_migratetype(pfn_to_page(isolate_pageblock), saved_mt);
+	unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
 	return -EBUSY;
 }

@@ -497,12 +525,12 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 	BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));

 	/* isolate [start_pfn, start_pfn + pageblock_nr_pages) pageblock */
-	ret = isolate_single_pageblock(start_pfn, gfp_flags, false);
+	ret = isolate_single_pageblock(start_pfn, flags, gfp_flags, false);
 	if (ret)
 		return ret;

 	/* isolate [end_pfn - pageblock_nr_pages, end_pfn) pageblock */
-	ret = isolate_single_pageblock(end_pfn, gfp_flags, true);
+	ret = isolate_single_pageblock(end_pfn, flags, gfp_flags, true);
 	if (ret) {
 		unset_migratetype_isolate(pfn_to_page(start_pfn), migratetype);
 		return ret;
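
As a worked, userspace-only illustration of the split_free_page() change above (all values made up): the old "ffs(split_pfn_offset) - 1" yields order -1 when the offset is 0 and can pick an order larger than the alignment of pfn allows, while the new "min(pfn ? __ffs(pfn) : order, __fls(split_pfn_offset))" keeps every freed chunk aligned to its own order and never crosses the split point. For an order-9 free page at pfn 512 split at offset 320 (i.e. at pfn 832), the sketch below frees buddy-aligned chunks of order 8, 6, 6 and 7, the first two before the split point and the last two after it:

#include <stdio.h>

#define __ffs(x)  ((unsigned long)__builtin_ctzl(x))
#define __fls(x)  ((unsigned long)(63 - __builtin_clzl(x)))
#define min(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	unsigned long free_page_pfn = 512, split_pfn_offset = 320;
	unsigned long pfn;
	int order = 9;

	if (split_pfn_offset == 0)	/* the added early return */
		return 0;

	for (pfn = free_page_pfn; pfn < free_page_pfn + (1UL << order);) {
		unsigned long free_page_order =
			min(pfn ? __ffs(pfn) : (unsigned long)order,
			    __fls(split_pfn_offset));

		printf("free pfn %lu, order %lu\n", pfn, free_page_order);
		pfn += 1UL << free_page_order;
		split_pfn_offset -= 1UL << free_page_order;
		/* first part done, switch to the remainder of the page */
		if (split_pfn_offset == 0)
			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
	}
	return 0;
}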



The complete commit with the fixup patch applied is:

From 71a4c830ce96d23aacb11ec715cc27d482acdd93 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Thu, 12 May 2022 20:22:58 -0700
Subject: [PATCH] mm: make alloc_contig_range work at pageblock granularity

alloc_contig_range() worked at MAX_ORDER_NR_PAGES granularity to avoid
merging pageblocks with different migratetypes.  It might unnecessarily
convert extra pageblocks at the beginning and at the end of the range.
Change alloc_contig_range() to work at pageblock granularity.

Special handling is needed for free pages and in-use pages across the
boundaries of the range specified by alloc_contig_range(), because these
partially isolated pages cause free page accounting issues.  The free
pages will be split and freed into separate migratetype lists; the in-use
pages will be migrated and then the freed pages will be handled in the
aforementioned way.

[ziy@nvidia.com: fix deadlock/crash]
  Link: https://lkml.kernel.org/r/23A7297E-6C84-4138-A9FE-3598234004E6@nvidia.com
Link: https://lkml.kernel.org/r/20220425143118.2850746-4-zi.yan@sent.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Reported-by: kernel test robot <lkp@intel.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: David Hildenbrand <david@redhat.com>
Cc: Eric Ren <renzhengeek@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-isolation.h |   4 +-
 mm/internal.h                  |   6 +
 mm/memory_hotplug.c            |   3 +-
 mm/page_alloc.c                |  57 +++++++--
 mm/page_isolation.c            | 221 ++++++++++++++++++++++++++++++++-
 5 files changed, 273 insertions(+), 18 deletions(-)

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index e14eddf6741a..5456b7be38ae 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -42,7 +42,7 @@ int move_freepages_block(struct zone *zone, struct page *page,
  */
 int
 start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-			 unsigned migratetype, int flags);
+			 int migratetype, int flags, gfp_t gfp_flags);

 /*
  * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
@@ -50,7 +50,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
  */
 void
 undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-			unsigned migratetype);
+			int migratetype);

 /*
  * Test all pages in [start_pfn, end_pfn) are isolated or not.
diff --git a/mm/internal.h b/mm/internal.h
index ddd09245a6db..a770029beb08 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -359,6 +359,9 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
 			  phys_addr_t min_addr,
 			  int nid, bool exact_nid);

+void split_free_page(struct page *free_page,
+				int order, unsigned long split_pfn_offset);
+
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA

 /*
@@ -422,6 +425,9 @@ isolate_freepages_range(struct compact_control *cc,
 int
 isolate_migratepages_range(struct compact_control *cc,
 			   unsigned long low_pfn, unsigned long end_pfn);
+
+int __alloc_contig_migrate_range(struct compact_control *cc,
+					unsigned long start, unsigned long end);
 #endif
 int find_suitable_fallback(struct free_area *area, unsigned int order,
 			int migratetype, bool only_stealable, bool *can_steal);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e99fd60548f5..945191708ef6 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1837,7 +1837,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	/* set above range as isolated */
 	ret = start_isolate_page_range(start_pfn, end_pfn,
 				       MIGRATE_MOVABLE,
-				       MEMORY_OFFLINE | REPORT_FAILURE);
+				       MEMORY_OFFLINE | REPORT_FAILURE,
+				       GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
 	if (ret) {
 		reason = "failure to isolate range";
 		goto failed_removal_pcplists_disabled;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0756f046b644..76551933bb1d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1094,6 +1094,46 @@ static inline void __free_one_page(struct page *page,
 		page_reporting_notify_free(order);
 }

+/**
+ * split_free_page() -- split a free page at split_pfn_offset
+ * @free_page:		the original free page
+ * @order:		the order of the page
+ * @split_pfn_offset:	split offset within the page
+ *
+ * It is used when the free page crosses two pageblocks with different migratetypes
+ * at split_pfn_offset within the page. The split free page will be put into
+ * separate migratetype lists afterwards. Otherwise, the function achieves
+ * nothing.
+ */
+void split_free_page(struct page *free_page,
+				int order, unsigned long split_pfn_offset)
+{
+	struct zone *zone = page_zone(free_page);
+	unsigned long free_page_pfn = page_to_pfn(free_page);
+	unsigned long pfn;
+	unsigned long flags;
+	int free_page_order;
+
+	if (split_pfn_offset == 0)
+		return;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	del_page_from_free_list(free_page, zone, order);
+	for (pfn = free_page_pfn;
+	     pfn < free_page_pfn + (1UL << order);) {
+		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
+
+		free_page_order = min(pfn ? __ffs(pfn) : order, __fls(split_pfn_offset));
+		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
+				mt, FPI_NONE);
+		pfn += 1UL << free_page_order;
+		split_pfn_offset -= (1UL << free_page_order);
+		/* we have done the first part, now switch to second part */
+		if (split_pfn_offset == 0)
+			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
 /*
  * A bad page could be due to a number of fields. Instead of multiple branches,
  * try and check multiple fields with one check. The caller must do a detailed
@@ -8951,7 +8991,7 @@ static inline void alloc_contig_dump_pages(struct list_head *page_list)
 #endif

 /* [start, end) must belong to a single zone. */
-static int __alloc_contig_migrate_range(struct compact_control *cc,
+int __alloc_contig_migrate_range(struct compact_control *cc,
 					unsigned long start, unsigned long end)
 {
 	/* This function is based on compact_zone() from compaction.c. */
@@ -9034,7 +9074,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 		       unsigned migratetype, gfp_t gfp_mask)
 {
 	unsigned long outer_start, outer_end;
-	unsigned int order;
+	int order;
 	int ret = 0;

 	struct compact_control cc = {
@@ -9053,14 +9093,11 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	 * What we do here is we mark all pageblocks in range as
 	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
 	 * have different sizes, and due to the way page allocator
-	 * work, we align the range to biggest of the two pages so
-	 * that page allocator won't try to merge buddies from
-	 * different pageblocks and change MIGRATE_ISOLATE to some
-	 * other migration type.
+	 * work, start_isolate_page_range() has special handlings for this.
 	 *
 	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
 	 * migrate the pages from an unaligned range (ie. pages that
-	 * we are interested in).  This will put all the pages in
+	 * we are interested in). This will put all the pages in
 	 * range back to page allocator as MIGRATE_ISOLATE.
 	 *
 	 * When this is done, we take the pages in range from page
@@ -9074,9 +9111,9 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	 */

 	ret = start_isolate_page_range(pfn_max_align_down(start),
-				       pfn_max_align_up(end), migratetype, 0);
+				pfn_max_align_up(end), migratetype, 0, gfp_mask);
 	if (ret)
-		return ret;
+		goto done;

 	drain_all_pages(cc.zone);

@@ -9096,7 +9133,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	ret = 0;

 	/*
-	 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
+	 * Pages from [start, end) are within a pageblock_nr_pages
 	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
 	 * more, all pages in [start, end) are free in page allocator.
 	 * What we are going to do is to allocate all pages from
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c2f7a8bb634d..6b47acaf51f3 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -203,7 +203,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 	return -EBUSY;
 }

-static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
+static void unset_migratetype_isolate(struct page *page, int migratetype)
 {
 	struct zone *zone;
 	unsigned long flags, nr_pages;
@@ -279,6 +279,194 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
 	return NULL;
 }

+/**
+ * isolate_single_pageblock() -- tries to isolate a pageblock that might be
+ * within a free or in-use page.
+ * @boundary_pfn:		pageblock-aligned pfn that a page might cross
+ * @flags:			isolation flags
+ * @gfp_flags:			GFP flags used for migrating pages
+ * @isolate_before:	isolate the pageblock before the boundary_pfn
+ *
+ * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
+ * pageblock. When not all pageblocks within a page are isolated at the same
+ * time, free page accounting can go wrong. For example, in the case of
 + * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pageblocks.
+ * [         MAX_ORDER-1         ]
+ * [  pageblock0  |  pageblock1  ]
+ * When either pageblock is isolated, if it is a free page, the page is not
+ * split into separate migratetype lists, which is supposed to; if it is an
+ * in-use page and freed later, __free_one_page() does not split the free page
+ * either. The function handles this by splitting the free page or migrating
+ * the in-use page then splitting the free page.
+ */
+static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
+			gfp_t gfp_flags, bool isolate_before)
+{
+	unsigned char saved_mt;
+	unsigned long start_pfn;
+	unsigned long isolate_pageblock;
+	unsigned long pfn;
+	struct zone *zone;
+	int ret;
+
+	VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages));
+
+	if (isolate_before)
+		isolate_pageblock = boundary_pfn - pageblock_nr_pages;
+	else
+		isolate_pageblock = boundary_pfn;
+
+	/*
+	 * scan at the beginning of MAX_ORDER_NR_PAGES aligned range to avoid
+	 * only isolating a subset of pageblocks from a bigger than pageblock
+	 * free or in-use page. Also make sure all to-be-isolated pageblocks
+	 * are within the same zone.
+	 */
+	zone  = page_zone(pfn_to_page(isolate_pageblock));
+	start_pfn  = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES),
+				      zone->zone_start_pfn);
+
+	saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
+	ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
+			isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
+
+	if (ret)
+		return ret;
+
+	/*
+	 * Bail out early when the to-be-isolated pageblock does not form
+	 * a free or in-use page across boundary_pfn:
+	 *
+	 * 1. isolate before boundary_pfn: the page after is not online
+	 * 2. isolate after boundary_pfn: the page before is not online
+	 *
+	 * This also ensures correctness. Without it, when isolate after
+	 * boundary_pfn and [start_pfn, boundary_pfn) are not online,
+	 * __first_valid_page() will return unexpected NULL in the for loop
+	 * below.
+	 */
+	if (isolate_before) {
+		if (!pfn_to_online_page(boundary_pfn))
+			return 0;
+	} else {
+		if (!pfn_to_online_page(boundary_pfn - 1))
+			return 0;
+	}
+
+	for (pfn = start_pfn; pfn < boundary_pfn;) {
+		struct page *page = __first_valid_page(pfn, boundary_pfn - pfn);
+
+		VM_BUG_ON(!page);
+		pfn = page_to_pfn(page);
+		/*
+		 * start_pfn is MAX_ORDER_NR_PAGES aligned. If there are any
+		 * free pages in [start_pfn, boundary_pfn), their head pages will
+		 * always be in the range.
+		 */
+		if (PageBuddy(page)) {
+			int order = buddy_order(page);
+
+			if (pfn + (1UL << order) > boundary_pfn)
+				split_free_page(page, order, boundary_pfn - pfn);
+			pfn += (1UL << order);
+			continue;
+		}
+		/*
+		 * migrate compound pages then let the free page handling code
+		 * above do the rest. If migration is not possible, just fail.
+		 */
+		if (PageCompound(page)) {
+			unsigned long nr_pages = compound_nr(page);
+			struct page *head = compound_head(page);
+			unsigned long head_pfn = page_to_pfn(head);
+
+			if (head_pfn + nr_pages <= boundary_pfn) {
+				pfn = head_pfn + nr_pages;
+				continue;
+			}
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+			/*
+			 * hugetlb, lru compound (THP), and movable compound pages
+			 * can be migrated. Otherwise, fail the isolation.
+			 */
+			if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) {
+				int order;
+				unsigned long outer_pfn;
+				int page_mt = get_pageblock_migratetype(page);
+				bool isolate_page = !is_migrate_isolate_page(page);
+				struct compact_control cc = {
+					.nr_migratepages = 0,
+					.order = -1,
+					.zone = page_zone(pfn_to_page(head_pfn)),
+					.mode = MIGRATE_SYNC,
+					.ignore_skip_hint = true,
+					.no_set_skip_hint = true,
+					.gfp_mask = gfp_flags,
+					.alloc_contig = true,
+				};
+				INIT_LIST_HEAD(&cc.migratepages);
+
+				/*
+				 * XXX: mark the page as MIGRATE_ISOLATE so that
+				 * no one else can grab the freed page after migration.
+				 * Ideally, the page should be freed as two separate
+				 * pages to be added into separate migratetype free
+				 * lists.
+				 */
+				if (isolate_page) {
+					ret = set_migratetype_isolate(page, page_mt,
+						flags, head_pfn, boundary_pfn - 1);
+					if (ret)
+						goto failed;
+				}
+
+				ret = __alloc_contig_migrate_range(&cc, head_pfn,
+							head_pfn + nr_pages);
+
+				/*
+				 * restore the page's migratetype so that it can
+				 * be split into separate migratetype free lists
+				 * later.
+				 */
+				if (isolate_page)
+					unset_migratetype_isolate(page, page_mt);
+
+				if (ret)
+					goto failed;
+				/*
+				 * reset pfn to the head of the free page, so
+				 * that the free page handling code above can split
+				 * the free page to the right migratetype list.
+				 *
+				 * head_pfn is not used here as a hugetlb page order
+				 * can be bigger than MAX_ORDER-1, but after it is
+				 * freed, the free page order is not. Use pfn within
+				 * the range to find the head of the free page.
+				 */
+				order = 0;
+				outer_pfn = pfn;
+				while (!PageBuddy(pfn_to_page(outer_pfn))) {
+					/* stop if we cannot find the free page */
+					if (++order >= MAX_ORDER)
+						goto failed;
+					outer_pfn &= ~0UL << order;
+				}
+				pfn = outer_pfn;
+				continue;
+			} else
+#endif
+				goto failed;
+		}
+
+		pfn++;
+	}
+	return 0;
+failed:
+	/* restore the original migratetype */
+	unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
+	return -EBUSY;
+}
+
 /**
  * start_isolate_page_range() - make page-allocation-type of range of pages to
  * be MIGRATE_ISOLATE.
@@ -293,6 +481,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  *					 and PageOffline() pages.
  *			REPORT_FAILURE - report details about the failure to
  *			isolate the range
+ * @gfp_flags:		GFP flags used for migrating pages that sit across the
+ *			range boundaries.
  *
  * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
  * the range will never be allocated. Any free pages and pages freed in the
@@ -301,6 +491,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  * pages in the range finally, the caller has to free all pages in the range.
  * test_pages_isolated() can be used to test it.
  *
+ * The function first tries to isolate the pageblocks at the beginning and end
+ * of the range, since there might be pages across the range boundaries.
+ * Afterwards, it isolates the rest of the range.
+ *
  * There is no high level synchronization mechanism that prevents two threads
  * from trying to isolate overlapping ranges. If this happens, one thread
  * will notice pageblocks in the overlapping range already set to isolate.
@@ -321,21 +515,38 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
  */
 int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-			     unsigned migratetype, int flags)
+			     int migratetype, int flags, gfp_t gfp_flags)
 {
 	unsigned long pfn;
 	struct page *page;
+	int ret;

 	BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
 	BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));

-	for (pfn = start_pfn;
-	     pfn < end_pfn;
+	/* isolate [start_pfn, start_pfn + pageblock_nr_pages) pageblock */
+	ret = isolate_single_pageblock(start_pfn, flags, gfp_flags, false);
+	if (ret)
+		return ret;
+
+	/* isolate [end_pfn - pageblock_nr_pages, end_pfn) pageblock */
+	ret = isolate_single_pageblock(end_pfn, flags, gfp_flags, true);
+	if (ret) {
+		unset_migratetype_isolate(pfn_to_page(start_pfn), migratetype);
+		return ret;
+	}
+
+	/* skip isolated pageblocks at the beginning and end */
+	for (pfn = start_pfn + pageblock_nr_pages;
+	     pfn < end_pfn - pageblock_nr_pages;
 	     pfn += pageblock_nr_pages) {
 		page = __first_valid_page(pfn, pageblock_nr_pages);
 		if (page && set_migratetype_isolate(page, migratetype, flags,
 					start_pfn, end_pfn)) {
 			undo_isolate_page_range(start_pfn, pfn, migratetype);
+			unset_migratetype_isolate(
+				pfn_to_page(end_pfn - pageblock_nr_pages),
+				migratetype);
 			return -EBUSY;
 		}
 	}
@@ -346,7 +557,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
  * Make isolated pages available again.
  */
 void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-			    unsigned migratetype)
+			    int migratetype)
 {
 	unsigned long pfn;
 	struct page *page;
Doug Berger May 25, 2022, 5:41 p.m. UTC | #3
I am seeing some free memory accounting problems with linux-next that I 
have bisected to this commit (i.e. b2c9e2fbba32 ("mm: make 
alloc_contig_range work at pageblock granularity")).

On an arm64 SMP platform with 4GB total memory and the default 16MB 
CMA pool, I am seeing the following after boot with a sysrq Show 
Memory (e.g. 'echo m > /proc/sysrq-trigger'):

[   16.015906] sysrq: Show Memory
[   16.019039] Mem-Info:
[   16.021348] active_anon:14604 inactive_anon:919 isolated_anon:0
[   16.021348]  active_file:0 inactive_file:0 isolated_file:0
[   16.021348]  unevictable:0 dirty:0 writeback:0
[   16.021348]  slab_reclaimable:3662 slab_unreclaimable:3333
[   16.021348]  mapped:928 shmem:15146 pagetables:63 bounce:0
[   16.021348]  kernel_misc_reclaimable:0
[   16.021348]  free:976766 free_pcp:991 free_cma:7017
[   16.056937] Node 0 active_anon:58416kB inactive_anon:3676kB 
active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB 
isolated(file):0kB mapped:3712kB dirty:0kB writeback:0kB shmem:60584kB 
writeback_tmp:0kB kernel_stack:1200kB pagetables:252kB all_unreclaimable? no
[   16.081526] DMA free:3041036kB boost:0kB min:6036kB low:9044kB 
high:12052kB reserved_highatomic:0KB active_anon:0kB inactive_anon:0kB 
active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB 
present:3145728kB managed:3029992kB mlocked:0kB bounce:0kB 
free_pcp:636kB local_pcp:0kB free_cma:28068kB
[   16.108650] lowmem_reserve[]: 0 0 944 944
[   16.112746] Normal free:866028kB boost:0kB min:1936kB low:2900kB 
high:3864kB reserved_highatomic:0KB active_anon:58416kB 
inactive_anon:3676kB active_file:0kB inactive_file:0kB unevictable:0kB 
writepending:0kB present:1048576kB managed:967352kB mlocked:0kB 
bounce:0kB free_pcp:3328kB local_pcp:864kB free_cma:0kB
[   16.140393] lowmem_reserve[]: 0 0 0 0
[   16.144133] DMA: 7*4kB (UMC) 4*8kB (M) 3*16kB (M) 3*32kB (MC) 5*64kB 
(M) 4*128kB (MC) 5*256kB (UMC) 7*512kB (UM) 5*1024kB (UM) 9*2048kB (UMC) 
732*4096kB (MC) = 3027724kB
[   16.159609] Normal: 149*4kB (UM) 95*8kB (UME) 26*16kB (UME) 8*32kB 
(ME) 2*64kB (UE) 1*128kB (M) 2*256kB (ME) 2*512kB (ME) 2*1024kB (UM) 
0*2048kB 210*4096kB (M) = 866028kB
[   16.175165] Node 0 hugepages_total=0 hugepages_free=0 
hugepages_surp=0 hugepages_size=1048576kB
[   16.183937] Node 0 hugepages_total=0 hugepages_free=0 
hugepages_surp=0 hugepages_size=32768kB
[   16.192533] Node 0 hugepages_total=0 hugepages_free=0 
hugepages_surp=0 hugepages_size=2048kB
[   16.201040] Node 0 hugepages_total=0 hugepages_free=0 
hugepages_surp=0 hugepages_size=64kB
[   16.209374] 15146 total pagecache pages
[   16.213246] 0 pages in swap cache
[   16.216595] Swap cache stats: add 0, delete 0, find 0/0
[   16.221867] Free swap  = 0kB
[   16.224780] Total swap = 0kB
[   16.227693] 1048576 pages RAM
[   16.230694] 0 pages HighMem/MovableOnly
[   16.234564] 49240 pages reserved
[   16.237825] 4096 pages cma reserved

Some anomalies in the above are:
free_cma:7017 with only 4096 pages cma reserved
DMA free:3041036kB with only managed:3029992kB
(i.e. free_cma amounts to 28068kB against a 16384kB CMA reservation, and 
DMA free exceeds managed by 11044kB, so roughly 11MB of free memory 
appears to be counted twice)

I'm not sure what is going on here, but I am suspicious of 
split_free_page(), since del_page_from_free_list() doesn't affect 
migratetype accounting but __free_one_page() can.
Also, PageBuddy(page) is being checked without zone->lock in 
isolate_single_pageblock().
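
To make the suspicion concrete, here is a rough, untested sketch of what 
split_free_page() might look like if it re-validated the page under 
zone->lock and adjusted the freepage counters itself; it is only an 
illustration of the suspected problem, not a proposed fix. It assumes 
__mod_zone_freepage_state(), del_page_from_free_list(), __free_one_page() 
and buddy_order() behave as they do in today's mm/page_alloc.c, keeps the 
split loop as in the patch, and leaves out error propagation to the caller:

void split_free_page(struct page *free_page,
				int order, unsigned long split_pfn_offset)
{
	struct zone *zone = page_zone(free_page);
	unsigned long free_page_pfn = page_to_pfn(free_page);
	unsigned long pfn;
	unsigned long flags;
	int free_page_order;
	int mt;

	spin_lock_irqsave(&zone->lock, flags);

	/* the page may have been allocated or merged since it was sampled */
	if (!PageBuddy(free_page) || buddy_order(free_page) != order)
		goto out;

	/* undo the freepage accounting before taking the page off the list */
	mt = get_pageblock_migratetype(free_page);
	if (likely(!is_migrate_isolate(mt)))
		__mod_zone_freepage_state(zone, -(1 << order), mt);

	del_page_from_free_list(free_page, zone, order);
	for (pfn = free_page_pfn;
	     pfn < free_page_pfn + (1UL << order);) {
		int split_mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);

		free_page_order = ffs(split_pfn_offset) - 1;
		/* __free_one_page() re-adds the counters for each piece */
		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
				split_mt, FPI_NONE);
		pfn += 1UL << free_page_order;
		split_pfn_offset -= (1UL << free_page_order);
		/* we have done the first part, now switch to second part */
		if (split_pfn_offset == 0)
			split_pfn_offset = (1UL << order) -
					   (pfn - free_page_pfn);
	}
out:
	spin_unlock_irqrestore(&zone->lock, flags);
}

Something along those lines would close the window where the unlocked 
PageBuddy() check can race with an allocation, and would keep the split 
pieces from being added to NR_FREE_PAGES/NR_FREE_CMA_PAGES a second time.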

Please investigate this as well.

Thanks!
     Doug

On 4/29/2022 6:54 AM, Zi Yan wrote:
> On 25 Apr 2022, at 10:31, Zi Yan wrote:
> 
>> From: Zi Yan <ziy@nvidia.com>
>>
>> alloc_contig_range() worked at MAX_ORDER_NR_PAGES granularity to avoid
>> merging pageblocks with different migratetypes. It might unnecessarily
>> convert extra pageblocks at the beginning and at the end of the range.
>> Change alloc_contig_range() to work at pageblock granularity.
>>
>> Special handling is needed for free pages and in-use pages across the
>> boundaries of the range specified by alloc_contig_range(). Because these
>> partially isolated pages causes free page accounting issues. The free
>> pages will be split and freed into separate migratetype lists; the
>> in-use pages will be migrated then the freed pages will be handled in
>> the aforementioned way.
>>
>> Reported-by: kernel test robot <lkp@intel.com>
>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>> ---
>>   include/linux/page-isolation.h |   4 +-
>>   mm/internal.h                  |   6 ++
>>   mm/memory_hotplug.c            |   3 +-
>>   mm/page_alloc.c                |  54 ++++++++--
>>   mm/page_isolation.c            | 184 ++++++++++++++++++++++++++++++++-
>>   5 files changed, 233 insertions(+), 18 deletions(-)
>>
>> diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
>> index e14eddf6741a..5456b7be38ae 100644
>> --- a/include/linux/page-isolation.h
>> +++ b/include/linux/page-isolation.h
>> @@ -42,7 +42,7 @@ int move_freepages_block(struct zone *zone, struct page *page,
>>    */
>>   int
>>   start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>> -			 unsigned migratetype, int flags);
>> +			 int migratetype, int flags, gfp_t gfp_flags);
>>
>>   /*
>>    * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
>> @@ -50,7 +50,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>>    */
>>   void
>>   undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>> -			unsigned migratetype);
>> +			int migratetype);
>>
>>   /*
>>    * Test all pages in [start_pfn, end_pfn) are isolated or not.
>> diff --git a/mm/internal.h b/mm/internal.h
>> index 919fa07e1031..0667abd57634 100644
>> --- a/mm/internal.h
>> +++ b/mm/internal.h
>> @@ -359,6 +359,9 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
>>   			  phys_addr_t min_addr,
>>   			  int nid, bool exact_nid);
>>
>> +void split_free_page(struct page *free_page,
>> +				int order, unsigned long split_pfn_offset);
>> +
>>   #if defined CONFIG_COMPACTION || defined CONFIG_CMA
>>
>>   /*
>> @@ -422,6 +425,9 @@ isolate_freepages_range(struct compact_control *cc,
>>   int
>>   isolate_migratepages_range(struct compact_control *cc,
>>   			   unsigned long low_pfn, unsigned long end_pfn);
>> +
>> +int __alloc_contig_migrate_range(struct compact_control *cc,
>> +					unsigned long start, unsigned long end);
>>   #endif
>>   int find_suitable_fallback(struct free_area *area, unsigned int order,
>>   			int migratetype, bool only_stealable, bool *can_steal);
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index 4c6065e5d274..9f8ae4cb77ee 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -1845,7 +1845,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>>   	/* set above range as isolated */
>>   	ret = start_isolate_page_range(start_pfn, end_pfn,
>>   				       MIGRATE_MOVABLE,
>> -				       MEMORY_OFFLINE | REPORT_FAILURE);
>> +				       MEMORY_OFFLINE | REPORT_FAILURE,
>> +				       GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
>>   	if (ret) {
>>   		reason = "failure to isolate range";
>>   		goto failed_removal_pcplists_disabled;
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index ce23ac8ad085..70ddd9a0bcf3 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -1094,6 +1094,43 @@ static inline void __free_one_page(struct page *page,
>>   		page_reporting_notify_free(order);
>>   }
>>
>> +/**
>> + * split_free_page() -- split a free page at split_pfn_offset
>> + * @free_page:		the original free page
>> + * @order:		the order of the page
>> + * @split_pfn_offset:	split offset within the page
>> + *
>> + * It is used when the free page crosses two pageblocks with different migratetypes
>> + * at split_pfn_offset within the page. The split free page will be put into
>> + * separate migratetype lists afterwards. Otherwise, the function achieves
>> + * nothing.
>> + */
>> +void split_free_page(struct page *free_page,
>> +				int order, unsigned long split_pfn_offset)
>> +{
>> +	struct zone *zone = page_zone(free_page);
>> +	unsigned long free_page_pfn = page_to_pfn(free_page);
>> +	unsigned long pfn;
>> +	unsigned long flags;
>> +	int free_page_order;
>> +
>> +	spin_lock_irqsave(&zone->lock, flags);
>> +	del_page_from_free_list(free_page, zone, order);
>> +	for (pfn = free_page_pfn;
>> +	     pfn < free_page_pfn + (1UL << order);) {
>> +		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
>> +
>> +		free_page_order = ffs(split_pfn_offset) - 1;
>> +		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
>> +				mt, FPI_NONE);
>> +		pfn += 1UL << free_page_order;
>> +		split_pfn_offset -= (1UL << free_page_order);
>> +		/* we have done the first part, now switch to second part */
>> +		if (split_pfn_offset == 0)
>> +			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
>> +	}
>> +	spin_unlock_irqrestore(&zone->lock, flags);
>> +}
>>   /*
>>    * A bad page could be due to a number of fields. Instead of multiple branches,
>>    * try and check multiple fields with one check. The caller must do a detailed
>> @@ -8919,7 +8956,7 @@ static inline void alloc_contig_dump_pages(struct list_head *page_list)
>>   #endif
>>
>>   /* [start, end) must belong to a single zone. */
>> -static int __alloc_contig_migrate_range(struct compact_control *cc,
>> +int __alloc_contig_migrate_range(struct compact_control *cc,
>>   					unsigned long start, unsigned long end)
>>   {
>>   	/* This function is based on compact_zone() from compaction.c. */
>> @@ -9002,7 +9039,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>>   		       unsigned migratetype, gfp_t gfp_mask)
>>   {
>>   	unsigned long outer_start, outer_end;
>> -	unsigned int order;
>> +	int order;
>>   	int ret = 0;
>>
>>   	struct compact_control cc = {
>> @@ -9021,14 +9058,11 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>>   	 * What we do here is we mark all pageblocks in range as
>>   	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
>>   	 * have different sizes, and due to the way page allocator
>> -	 * work, we align the range to biggest of the two pages so
>> -	 * that page allocator won't try to merge buddies from
>> -	 * different pageblocks and change MIGRATE_ISOLATE to some
>> -	 * other migration type.
>> +	 * work, start_isolate_page_range() has special handlings for this.
>>   	 *
>>   	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
>>   	 * migrate the pages from an unaligned range (ie. pages that
>> -	 * we are interested in).  This will put all the pages in
>> +	 * we are interested in). This will put all the pages in
>>   	 * range back to page allocator as MIGRATE_ISOLATE.
>>   	 *
>>   	 * When this is done, we take the pages in range from page
>> @@ -9042,9 +9076,9 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>>   	 */
>>
>>   	ret = start_isolate_page_range(pfn_max_align_down(start),
>> -				       pfn_max_align_up(end), migratetype, 0);
>> +				pfn_max_align_up(end), migratetype, 0, gfp_mask);
>>   	if (ret)
>> -		return ret;
>> +		goto done;
>>
>>   	drain_all_pages(cc.zone);
>>
>> @@ -9064,7 +9098,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>>   	ret = 0;
>>
>>   	/*
>> -	 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
>> +	 * Pages from [start, end) are within a pageblock_nr_pages
>>   	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
>>   	 * more, all pages in [start, end) are free in page allocator.
>>   	 * What we are going to do is to allocate all pages from
>> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
>> index c2f7a8bb634d..94b3467e5ba2 100644
>> --- a/mm/page_isolation.c
>> +++ b/mm/page_isolation.c
>> @@ -203,7 +203,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
>>   	return -EBUSY;
>>   }
>>
>> -static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
>> +static void unset_migratetype_isolate(struct page *page, int migratetype)
>>   {
>>   	struct zone *zone;
>>   	unsigned long flags, nr_pages;
>> @@ -279,6 +279,157 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>>   	return NULL;
>>   }
>>
>> +/**
>> + * isolate_single_pageblock() -- tries to isolate a pageblock that might be
>> + * within a free or in-use page.
>> + * @boundary_pfn:		pageblock-aligned pfn that a page might cross
>> + * @gfp_flags:			GFP flags used for migrating pages
>> + * @isolate_before:	isolate the pageblock before the boundary_pfn
>> + *
>> + * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
>> + * pageblock. When not all pageblocks within a page are isolated at the same
>> + * time, free page accounting can go wrong. For example, in the case of
>> + * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pageblocks.
>> + * [         MAX_ORDER-1         ]
>> + * [  pageblock0  |  pageblock1  ]
>> + * When either pageblock is isolated, if it is a free page, the page is not
>> + * split into separate migratetype lists, which is supposed to; if it is an
>> + * in-use page and freed later, __free_one_page() does not split the free page
>> + * either. The function handles this by splitting the free page or migrating
>> + * the in-use page then splitting the free page.
>> + */
>> +static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
>> +			bool isolate_before)
>> +{
>> +	unsigned char saved_mt;
>> +	unsigned long start_pfn;
>> +	unsigned long isolate_pageblock;
>> +	unsigned long pfn;
>> +	struct zone *zone;
>> +
>> +	VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages));
>> +
>> +	if (isolate_before)
>> +		isolate_pageblock = boundary_pfn - pageblock_nr_pages;
>> +	else
>> +		isolate_pageblock = boundary_pfn;
>> +
>> +	/*
>> +	 * scan at the beginning of MAX_ORDER_NR_PAGES aligned range to avoid
>> +	 * only isolating a subset of pageblocks from a bigger than pageblock
>> +	 * free or in-use page. Also make sure all to-be-isolated pageblocks
>> +	 * are within the same zone.
>> +	 */
>> +	zone  = page_zone(pfn_to_page(isolate_pageblock));
>> +	start_pfn  = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES),
>> +				      zone->zone_start_pfn);
>> +
>> +	saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
>> +	set_pageblock_migratetype(pfn_to_page(isolate_pageblock), MIGRATE_ISOLATE);
>> +
>> +	/*
>> +	 * Bail out early when the to-be-isolated pageblock does not form
>> +	 * a free or in-use page across boundary_pfn:
>> +	 *
>> +	 * 1. isolate before boundary_pfn: the page after is not online
>> +	 * 2. isolate after boundary_pfn: the page before is not online
>> +	 *
>> +	 * This also ensures correctness. Without it, when isolate after
>> +	 * boundary_pfn and [start_pfn, boundary_pfn) are not online,
>> +	 * __first_valid_page() will return unexpected NULL in the for loop
>> +	 * below.
>> +	 */
>> +	if (isolate_before) {
>> +		if (!pfn_to_online_page(boundary_pfn))
>> +			return 0;
>> +	} else {
>> +		if (!pfn_to_online_page(boundary_pfn - 1))
>> +			return 0;
>> +	}
>> +
>> +	for (pfn = start_pfn; pfn < boundary_pfn;) {
>> +		struct page *page = __first_valid_page(pfn, boundary_pfn - pfn);
>> +
>> +		VM_BUG_ON(!page);
>> +		pfn = page_to_pfn(page);
>> +		/*
>> +		 * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any
>> +		 * free pages in [start_pfn, boundary_pfn), its head page will
>> +		 * always be in the range.
>> +		 */
>> +		if (PageBuddy(page)) {
>> +			int order = buddy_order(page);
>> +
>> +			if (pfn + (1UL << order) > boundary_pfn)
>> +				split_free_page(page, order, boundary_pfn - pfn);
>> +			pfn += (1UL << order);
>> +			continue;
>> +		}
>> +		/*
>> +		 * migrate compound pages then let the free page handling code
>> +		 * above do the rest. If migration is not enabled, just fail.
>> +		 */
>> +		if (PageHuge(page) || PageTransCompound(page)) {
>> +#if defined CONFIG_COMPACTION || defined CONFIG_CMA
>> +			unsigned long nr_pages = compound_nr(page);
>> +			int order = compound_order(page);
>> +			struct page *head = compound_head(page);
>> +			unsigned long head_pfn = page_to_pfn(head);
>> +			int ret;
>> +			struct compact_control cc = {
>> +				.nr_migratepages = 0,
>> +				.order = -1,
>> +				.zone = page_zone(pfn_to_page(head_pfn)),
>> +				.mode = MIGRATE_SYNC,
>> +				.ignore_skip_hint = true,
>> +				.no_set_skip_hint = true,
>> +				.gfp_mask = gfp_flags,
>> +				.alloc_contig = true,
>> +			};
>> +			INIT_LIST_HEAD(&cc.migratepages);
>> +
>> +			if (head_pfn + nr_pages < boundary_pfn) {
>> +				pfn += nr_pages;
>> +				continue;
>> +			}
>> +
>> +			ret = __alloc_contig_migrate_range(&cc, head_pfn,
>> +						head_pfn + nr_pages);
>> +
>> +			if (ret)
>> +				goto failed;
>> +			/*
>> +			 * reset pfn, let the free page handling code above
>> +			 * split the free page to the right migratetype list.
>> +			 *
>> +			 * head_pfn is not used here as a hugetlb page order
>> +			 * can be bigger than MAX_ORDER-1, but after it is
>> +			 * freed, the free page order is not. Use pfn within
>> +			 * the range to find the head of the free page and
>> +			 * reset order to 0 if a hugetlb page with
>> +			 * >MAX_ORDER-1 order is encountered.
>> +			 */
>> +			if (order > MAX_ORDER-1)
>> +				order = 0;
>> +			while (!PageBuddy(pfn_to_page(pfn))) {
>> +				order++;
>> +				pfn &= ~0UL << order;
>> +			}
>> +			continue;
>> +#else
>> +			goto failed;
>> +#endif
>> +		}
>> +
>> +		pfn++;
>> +	}
>> +	return 0;
>> +failed:
>> +	/* restore the original migratetype */
>> +	set_pageblock_migratetype(pfn_to_page(isolate_pageblock), saved_mt);
>> +	return -EBUSY;
>> +}
>> +
>>   /**
>>    * start_isolate_page_range() - make page-allocation-type of range of pages to
>>    * be MIGRATE_ISOLATE.
>> @@ -293,6 +444,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>>    *					 and PageOffline() pages.
>>    *			REPORT_FAILURE - report details about the failure to
>>    *			isolate the range
>> + * @gfp_flags:		GFP flags used for migrating pages that sit across the
>> + *			range boundaries.
>>    *
>>    * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
>>    * the range will never be allocated. Any free pages and pages freed in the
>> @@ -301,6 +454,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>>    * pages in the range finally, the caller have to free all pages in the range.
>>    * test_page_isolated() can be used for test it.
>>    *
>> + * The function first tries to isolate the pageblocks at the beginning and end
>> + * of the range, since there might be pages across the range boundaries.
>> + * Afterwards, it isolates the rest of the range.
>> + *
>>    * There is no high level synchronization mechanism that prevents two threads
>>    * from trying to isolate overlapping ranges. If this happens, one thread
>>    * will notice pageblocks in the overlapping range already set to isolate.
>> @@ -321,21 +478,38 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>>    * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
>>    */
>>   int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>> -			     unsigned migratetype, int flags)
>> +			     int migratetype, int flags, gfp_t gfp_flags)
>>   {
>>   	unsigned long pfn;
>>   	struct page *page;
>> +	int ret;
>>
>>   	BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
>>   	BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
>>
>> -	for (pfn = start_pfn;
>> -	     pfn < end_pfn;
>> +	/* isolate [start_pfn, start_pfn + pageblock_nr_pages) pageblock */
>> +	ret = isolate_single_pageblock(start_pfn, gfp_flags, false);
>> +	if (ret)
>> +		return ret;
>> +
>> +	/* isolate [end_pfn - pageblock_nr_pages, end_pfn) pageblock */
>> +	ret = isolate_single_pageblock(end_pfn, gfp_flags, true);
>> +	if (ret) {
>> +		unset_migratetype_isolate(pfn_to_page(start_pfn), migratetype);
>> +		return ret;
>> +	}
>> +
>> +	/* skip isolated pageblocks at the beginning and end */
>> +	for (pfn = start_pfn + pageblock_nr_pages;
>> +	     pfn < end_pfn - pageblock_nr_pages;
>>   	     pfn += pageblock_nr_pages) {
>>   		page = __first_valid_page(pfn, pageblock_nr_pages);
>>   		if (page && set_migratetype_isolate(page, migratetype, flags,
>>   					start_pfn, end_pfn)) {
>>   			undo_isolate_page_range(start_pfn, pfn, migratetype);
>> +			unset_migratetype_isolate(
>> +				pfn_to_page(end_pfn - pageblock_nr_pages),
>> +				migratetype);
>>   			return -EBUSY;
>>   		}
>>   	}
>> @@ -346,7 +520,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>>    * Make isolated pages available again.
>>    */
>>   void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>> -			    unsigned migratetype)
>> +			    int migratetype)
>>   {
>>   	unsigned long pfn;
>>   	struct page *page;
>> -- 
>> 2.35.1
> 
> Qian hit a bug caused by this series https://lore.kernel.org/linux-mm/20220426201855.GA1014@qian/
> and the fix is:
> 
> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
> index 75e454f5cf45..b3f074d1682e 100644
> --- a/mm/page_isolation.c
> +++ b/mm/page_isolation.c
> @@ -367,58 +367,67 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
>   		}
>   		/*
>   		 * migrate compound pages then let the free page handling code
> -		 * above do the rest. If migration is not enabled, just fail.
> +		 * above do the rest. If migration is not possible, just fail.
>   		 */
> -		if (PageHuge(page) || PageTransCompound(page)) {
> -#if defined CONFIG_COMPACTION || defined CONFIG_CMA
> +		if (PageCompound(page)) {
>   			unsigned long nr_pages = compound_nr(page);
> -			int order = compound_order(page);
>   			struct page *head = compound_head(page);
>   			unsigned long head_pfn = page_to_pfn(head);
> -			int ret;
> -			struct compact_control cc = {
> -				.nr_migratepages = 0,
> -				.order = -1,
> -				.zone = page_zone(pfn_to_page(head_pfn)),
> -				.mode = MIGRATE_SYNC,
> -				.ignore_skip_hint = true,
> -				.no_set_skip_hint = true,
> -				.gfp_mask = gfp_flags,
> -				.alloc_contig = true,
> -			};
> -			INIT_LIST_HEAD(&cc.migratepages);
> 
>   			if (head_pfn + nr_pages < boundary_pfn) {
> -				pfn += nr_pages;
> +				pfn = head_pfn + nr_pages;
>   				continue;
>   			}
> -
> -			ret = __alloc_contig_migrate_range(&cc, head_pfn,
> -						head_pfn + nr_pages);
> -
> -			if (ret)
> -				goto failed;
> +#if defined CONFIG_COMPACTION || defined CONFIG_CMA
>   			/*
> -			 * reset pfn, let the free page handling code above
> -			 * split the free page to the right migratetype list.
> -			 *
> -			 * head_pfn is not used here as a hugetlb page order
> -			 * can be bigger than MAX_ORDER-1, but after it is
> -			 * freed, the free page order is not. Use pfn within
> -			 * the range to find the head of the free page and
> -			 * reset order to 0 if a hugetlb page with
> -			 * >MAX_ORDER-1 order is encountered.
> +			 * hugetlb, lru compound (THP), and movable compound pages
> +			 * can be migrated. Otherwise, fail the isolation.
>   			 */
> -			if (order > MAX_ORDER-1)
> +			if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) {
> +				int order;
> +				unsigned long outer_pfn;
> +				int ret;
> +				struct compact_control cc = {
> +					.nr_migratepages = 0,
> +					.order = -1,
> +					.zone = page_zone(pfn_to_page(head_pfn)),
> +					.mode = MIGRATE_SYNC,
> +					.ignore_skip_hint = true,
> +					.no_set_skip_hint = true,
> +					.gfp_mask = gfp_flags,
> +					.alloc_contig = true,
> +				};
> +				INIT_LIST_HEAD(&cc.migratepages);
> +
> +				ret = __alloc_contig_migrate_range(&cc, head_pfn,
> +							head_pfn + nr_pages);
> +
> +				if (ret)
> +					goto failed;
> +				/*
> +				 * reset pfn to the head of the free page, so
> +				 * that the free page handling code above can split
> +				 * the free page to the right migratetype list.
> +				 *
> +				 * head_pfn is not used here as a hugetlb page order
> +				 * can be bigger than MAX_ORDER-1, but after it is
> +				 * freed, the free page order is not. Use pfn within
> +				 * the range to find the head of the free page.
> +				 */
>   				order = 0;
> -			while (!PageBuddy(pfn_to_page(pfn))) {
> -				order++;
> -				pfn &= ~0UL << order;
> -			}
> -			continue;
> -#else
> -			goto failed;
> +				outer_pfn = pfn;
> +				while (!PageBuddy(pfn_to_page(outer_pfn))) {
> +					if (++order >= MAX_ORDER) {
> +						outer_pfn = pfn;
> +						break;
> +					}
> +					outer_pfn &= ~0UL << order;
> +				}
> +				pfn = outer_pfn;
> +				continue;
> +			} else
>   #endif
> +				goto failed;
>   		}
> 
>   		pfn++;
Zi Yan May 25, 2022, 5:53 p.m. UTC | #4
On 25 May 2022, at 13:41, Doug Berger wrote:

> I am seeing some free memory accounting problems with linux-next that I have bisected to this commit (i.e. b2c9e2fbba32 ("mm: make alloc_contig_range work at pageblock granularity").
>
> On an arm64 SMP platform with 4GB total memory and the default 16MB default CMA pool, I am seeing the following after boot with a sysrq Show Memory (e.g. 'echo m > /proc/sysrq-trigger'):
>
> [   16.015906] sysrq: Show Memory
> [   16.019039] Mem-Info:
> [   16.021348] active_anon:14604 inactive_anon:919 isolated_anon:0
> [   16.021348]  active_file:0 inactive_file:0 isolated_file:0
> [   16.021348]  unevictable:0 dirty:0 writeback:0
> [   16.021348]  slab_reclaimable:3662 slab_unreclaimable:3333
> [   16.021348]  mapped:928 shmem:15146 pagetables:63 bounce:0
> [   16.021348]  kernel_misc_reclaimable:0
> [   16.021348]  free:976766 free_pcp:991 free_cma:7017
> [   16.056937] Node 0 active_anon:58416kB inactive_anon:3676kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:3712kB dirty:0kB writeback:0kB shmem:60584kB writeback_tmp:0kB kernel_stack:1200kB pagetables:252kB all_unreclaimable? no
> [   16.081526] DMA free:3041036kB boost:0kB min:6036kB low:9044kB high:12052kB reserved_highatomic:0KB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:3145728kB managed:3029992kB mlocked:0kB bounce:0kB free_pcp:636kB local_pcp:0kB free_cma:28068kB
> [   16.108650] lowmem_reserve[]: 0 0 944 944
> [   16.112746] Normal free:866028kB boost:0kB min:1936kB low:2900kB high:3864kB reserved_highatomic:0KB active_anon:58416kB inactive_anon:3676kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:1048576kB managed:967352kB mlocked:0kB bounce:0kB free_pcp:3328kB local_pcp:864kB free_cma:0kB
> [   16.140393] lowmem_reserve[]: 0 0 0 0
> [   16.144133] DMA: 7*4kB (UMC) 4*8kB (M) 3*16kB (M) 3*32kB (MC) 5*64kB (M) 4*128kB (MC) 5*256kB (UMC) 7*512kB (UM) 5*1024kB (UM) 9*2048kB (UMC) 732*4096kB (MC) = 3027724kB
> [   16.159609] Normal: 149*4kB (UM) 95*8kB (UME) 26*16kB (UME) 8*32kB (ME) 2*64kB (UE) 1*128kB (M) 2*256kB (ME) 2*512kB (ME) 2*1024kB (UM) 0*2048kB 210*4096kB (M) = 866028kB
> [   16.175165] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
> [   16.183937] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=32768kB
> [   16.192533] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
> [   16.201040] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=64kB
> [   16.209374] 15146 total pagecache pages
> [   16.213246] 0 pages in swap cache
> [   16.216595] Swap cache stats: add 0, delete 0, find 0/0
> [   16.221867] Free swap  = 0kB
> [   16.224780] Total swap = 0kB
> [   16.227693] 1048576 pages RAM
> [   16.230694] 0 pages HighMem/MovableOnly
> [   16.234564] 49240 pages reserved
> [   16.237825] 4096 pages cma reserved
>
> Some anomalies in the above are:
> free_cma:7017 with only 4096 pages cma reserved
> DMA free:3041036kB with only managed:3029992kB
>
> I'm not sure what is going on here, but I am suspicious of split_free_page() since del_page_from_free_list doesn't affect migrate_type accounting, but __free_one_page() can.
> Also PageBuddy(page) is being checked without zone->lock in isolate_single_pageblock().
>
> Please investigate this as well.


Can you try this patch https://lore.kernel.org/linux-mm/20220524194756.1698351-1-zi.yan@sent.com/
and see if it fixes the issue?

Thanks.

>
> Thanks!
>     Doug
>
> On 4/29/2022 6:54 AM, Zi Yan wrote:
>> On 25 Apr 2022, at 10:31, Zi Yan wrote:
>>
>>> From: Zi Yan <ziy@nvidia.com>
>>>
>>> alloc_contig_range() worked at MAX_ORDER_NR_PAGES granularity to avoid
>>> merging pageblocks with different migratetypes. It might unnecessarily
>>> convert extra pageblocks at the beginning and at the end of the range.
>>> Change alloc_contig_range() to work at pageblock granularity.
>>>
>>> Special handling is needed for free pages and in-use pages across the
>>> boundaries of the range specified by alloc_contig_range(). Because these
>>> partially isolated pages causes free page accounting issues. The free
>>> pages will be split and freed into separate migratetype lists; the
>>> in-use pages will be migrated then the freed pages will be handled in
>>> the aforementioned way.
>>>
>>> Reported-by: kernel test robot <lkp@intel.com>
>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>> ---
>>>   include/linux/page-isolation.h |   4 +-
>>>   mm/internal.h                  |   6 ++
>>>   mm/memory_hotplug.c            |   3 +-
>>>   mm/page_alloc.c                |  54 ++++++++--
>>>   mm/page_isolation.c            | 184 ++++++++++++++++++++++++++++++++-
>>>   5 files changed, 233 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
>>> index e14eddf6741a..5456b7be38ae 100644
>>> --- a/include/linux/page-isolation.h
>>> +++ b/include/linux/page-isolation.h
>>> @@ -42,7 +42,7 @@ int move_freepages_block(struct zone *zone, struct page *page,
>>>    */
>>>   int
>>>   start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>>> -			 unsigned migratetype, int flags);
>>> +			 int migratetype, int flags, gfp_t gfp_flags);
>>>
>>>   /*
>>>    * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
>>> @@ -50,7 +50,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>>>    */
>>>   void
>>>   undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>>> -			unsigned migratetype);
>>> +			int migratetype);
>>>
>>>   /*
>>>    * Test all pages in [start_pfn, end_pfn) are isolated or not.
>>> diff --git a/mm/internal.h b/mm/internal.h
>>> index 919fa07e1031..0667abd57634 100644
>>> --- a/mm/internal.h
>>> +++ b/mm/internal.h
>>> @@ -359,6 +359,9 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
>>>   			  phys_addr_t min_addr,
>>>   			  int nid, bool exact_nid);
>>>
>>> +void split_free_page(struct page *free_page,
>>> +				int order, unsigned long split_pfn_offset);
>>> +
>>>   #if defined CONFIG_COMPACTION || defined CONFIG_CMA
>>>
>>>   /*
>>> @@ -422,6 +425,9 @@ isolate_freepages_range(struct compact_control *cc,
>>>   int
>>>   isolate_migratepages_range(struct compact_control *cc,
>>>   			   unsigned long low_pfn, unsigned long end_pfn);
>>> +
>>> +int __alloc_contig_migrate_range(struct compact_control *cc,
>>> +					unsigned long start, unsigned long end);
>>>   #endif
>>>   int find_suitable_fallback(struct free_area *area, unsigned int order,
>>>   			int migratetype, bool only_stealable, bool *can_steal);
>>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>>> index 4c6065e5d274..9f8ae4cb77ee 100644
>>> --- a/mm/memory_hotplug.c
>>> +++ b/mm/memory_hotplug.c
>>> @@ -1845,7 +1845,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>>>   	/* set above range as isolated */
>>>   	ret = start_isolate_page_range(start_pfn, end_pfn,
>>>   				       MIGRATE_MOVABLE,
>>> -				       MEMORY_OFFLINE | REPORT_FAILURE);
>>> +				       MEMORY_OFFLINE | REPORT_FAILURE,
>>> +				       GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
>>>   	if (ret) {
>>>   		reason = "failure to isolate range";
>>>   		goto failed_removal_pcplists_disabled;
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> index ce23ac8ad085..70ddd9a0bcf3 100644
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -1094,6 +1094,43 @@ static inline void __free_one_page(struct page *page,
>>>   		page_reporting_notify_free(order);
>>>   }
>>>
>>> +/**
>>> + * split_free_page() -- split a free page at split_pfn_offset
>>> + * @free_page:		the original free page
>>> + * @order:		the order of the page
>>> + * @split_pfn_offset:	split offset within the page
>>> + *
>>> + * It is used when the free page crosses two pageblocks with different migratetypes
>>> + * at split_pfn_offset within the page. The split free page will be put into
>>> + * separate migratetype lists afterwards. Otherwise, the function achieves
>>> + * nothing.
>>> + */
>>> +void split_free_page(struct page *free_page,
>>> +				int order, unsigned long split_pfn_offset)
>>> +{
>>> +	struct zone *zone = page_zone(free_page);
>>> +	unsigned long free_page_pfn = page_to_pfn(free_page);
>>> +	unsigned long pfn;
>>> +	unsigned long flags;
>>> +	int free_page_order;
>>> +
>>> +	spin_lock_irqsave(&zone->lock, flags);
>>> +	del_page_from_free_list(free_page, zone, order);
>>> +	for (pfn = free_page_pfn;
>>> +	     pfn < free_page_pfn + (1UL << order);) {
>>> +		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
>>> +
>>> +		free_page_order = ffs(split_pfn_offset) - 1;
>>> +		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
>>> +				mt, FPI_NONE);
>>> +		pfn += 1UL << free_page_order;
>>> +		split_pfn_offset -= (1UL << free_page_order);
>>> +		/* we have done the first part, now switch to second part */
>>> +		if (split_pfn_offset == 0)
>>> +			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
>>> +	}
>>> +	spin_unlock_irqrestore(&zone->lock, flags);
>>> +}
>>>   /*
>>>    * A bad page could be due to a number of fields. Instead of multiple branches,
>>>    * try and check multiple fields with one check. The caller must do a detailed
>>> @@ -8919,7 +8956,7 @@ static inline void alloc_contig_dump_pages(struct list_head *page_list)
>>>   #endif
>>>
>>>   /* [start, end) must belong to a single zone. */
>>> -static int __alloc_contig_migrate_range(struct compact_control *cc,
>>> +int __alloc_contig_migrate_range(struct compact_control *cc,
>>>   					unsigned long start, unsigned long end)
>>>   {
>>>   	/* This function is based on compact_zone() from compaction.c. */
>>> @@ -9002,7 +9039,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>>>   		       unsigned migratetype, gfp_t gfp_mask)
>>>   {
>>>   	unsigned long outer_start, outer_end;
>>> -	unsigned int order;
>>> +	int order;
>>>   	int ret = 0;
>>>
>>>   	struct compact_control cc = {
>>> @@ -9021,14 +9058,11 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>>>   	 * What we do here is we mark all pageblocks in range as
>>>   	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
>>>   	 * have different sizes, and due to the way page allocator
>>> -	 * work, we align the range to biggest of the two pages so
>>> -	 * that page allocator won't try to merge buddies from
>>> -	 * different pageblocks and change MIGRATE_ISOLATE to some
>>> -	 * other migration type.
>>> +	 * work, start_isolate_page_range() has special handlings for this.
>>>   	 *
>>>   	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
>>>   	 * migrate the pages from an unaligned range (ie. pages that
>>> -	 * we are interested in).  This will put all the pages in
>>> +	 * we are interested in). This will put all the pages in
>>>   	 * range back to page allocator as MIGRATE_ISOLATE.
>>>   	 *
>>>   	 * When this is done, we take the pages in range from page
>>> @@ -9042,9 +9076,9 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>>>   	 */
>>>
>>>   	ret = start_isolate_page_range(pfn_max_align_down(start),
>>> -				       pfn_max_align_up(end), migratetype, 0);
>>> +				pfn_max_align_up(end), migratetype, 0, gfp_mask);
>>>   	if (ret)
>>> -		return ret;
>>> +		goto done;
>>>
>>>   	drain_all_pages(cc.zone);
>>>
>>> @@ -9064,7 +9098,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>>>   	ret = 0;
>>>
>>>   	/*
>>> -	 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
>>> +	 * Pages from [start, end) are within a pageblock_nr_pages
>>>   	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
>>>   	 * more, all pages in [start, end) are free in page allocator.
>>>   	 * What we are going to do is to allocate all pages from
>>> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
>>> index c2f7a8bb634d..94b3467e5ba2 100644
>>> --- a/mm/page_isolation.c
>>> +++ b/mm/page_isolation.c
>>> @@ -203,7 +203,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
>>>   	return -EBUSY;
>>>   }
>>>
>>> -static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
>>> +static void unset_migratetype_isolate(struct page *page, int migratetype)
>>>   {
>>>   	struct zone *zone;
>>>   	unsigned long flags, nr_pages;
>>> @@ -279,6 +279,157 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>>>   	return NULL;
>>>   }
>>>
>>> +/**
>>> + * isolate_single_pageblock() -- tries to isolate a pageblock that might be
>>> + * within a free or in-use page.
>>> + * @boundary_pfn:		pageblock-aligned pfn that a page might cross
>>> + * @gfp_flags:			GFP flags used for migrating pages
>>> + * @isolate_before:	isolate the pageblock before the boundary_pfn
>>> + *
>>> + * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
>>> + * pageblock. When not all pageblocks within a page are isolated at the same
>>> + * time, free page accounting can go wrong. For example, in the case of
>>> + * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pageblocks.
>>> + * [         MAX_ORDER-1         ]
>>> + * [  pageblock0  |  pageblock1  ]
>>> + * When either pageblock is isolated, if it is a free page, the page is not
>>> + * split into separate migratetype lists, which is supposed to; if it is an
>>> + * in-use page and freed later, __free_one_page() does not split the free page
>>> + * either. The function handles this by splitting the free page or migrating
>>> + * the in-use page then splitting the free page.
>>> + */
>>> +static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
>>> +			bool isolate_before)
>>> +{
>>> +	unsigned char saved_mt;
>>> +	unsigned long start_pfn;
>>> +	unsigned long isolate_pageblock;
>>> +	unsigned long pfn;
>>> +	struct zone *zone;
>>> +
>>> +	VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages));
>>> +
>>> +	if (isolate_before)
>>> +		isolate_pageblock = boundary_pfn - pageblock_nr_pages;
>>> +	else
>>> +		isolate_pageblock = boundary_pfn;
>>> +
>>> +	/*
>>> +	 * scan at the beginning of MAX_ORDER_NR_PAGES aligned range to avoid
>>> +	 * only isolating a subset of pageblocks from a bigger than pageblock
>>> +	 * free or in-use page. Also make sure all to-be-isolated pageblocks
>>> +	 * are within the same zone.
>>> +	 */
>>> +	zone  = page_zone(pfn_to_page(isolate_pageblock));
>>> +	start_pfn  = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES),
>>> +				      zone->zone_start_pfn);
>>> +
>>> +	saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
>>> +	set_pageblock_migratetype(pfn_to_page(isolate_pageblock), MIGRATE_ISOLATE);
>>> +
>>> +	/*
>>> +	 * Bail out early when the to-be-isolated pageblock does not form
>>> +	 * a free or in-use page across boundary_pfn:
>>> +	 *
>>> +	 * 1. isolate before boundary_pfn: the page after is not online
>>> +	 * 2. isolate after boundary_pfn: the page before is not online
>>> +	 *
>>> +	 * This also ensures correctness. Without it, when isolate after
>>> +	 * boundary_pfn and [start_pfn, boundary_pfn) are not online,
>>> +	 * __first_valid_page() will return unexpected NULL in the for loop
>>> +	 * below.
>>> +	 */
>>> +	if (isolate_before) {
>>> +		if (!pfn_to_online_page(boundary_pfn))
>>> +			return 0;
>>> +	} else {
>>> +		if (!pfn_to_online_page(boundary_pfn - 1))
>>> +			return 0;
>>> +	}
>>> +
>>> +	for (pfn = start_pfn; pfn < boundary_pfn;) {
>>> +		struct page *page = __first_valid_page(pfn, boundary_pfn - pfn);
>>> +
>>> +		VM_BUG_ON(!page);
>>> +		pfn = page_to_pfn(page);
>>> +		/*
>>> +		 * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any
>>> +		 * free pages in [start_pfn, boundary_pfn), its head page will
>>> +		 * always be in the range.
>>> +		 */
>>> +		if (PageBuddy(page)) {
>>> +			int order = buddy_order(page);
>>> +
>>> +			if (pfn + (1UL << order) > boundary_pfn)
>>> +				split_free_page(page, order, boundary_pfn - pfn);
>>> +			pfn += (1UL << order);
>>> +			continue;
>>> +		}
>>> +		/*
>>> +		 * migrate compound pages then let the free page handling code
>>> +		 * above do the rest. If migration is not enabled, just fail.
>>> +		 */
>>> +		if (PageHuge(page) || PageTransCompound(page)) {
>>> +#if defined CONFIG_COMPACTION || defined CONFIG_CMA
>>> +			unsigned long nr_pages = compound_nr(page);
>>> +			int order = compound_order(page);
>>> +			struct page *head = compound_head(page);
>>> +			unsigned long head_pfn = page_to_pfn(head);
>>> +			int ret;
>>> +			struct compact_control cc = {
>>> +				.nr_migratepages = 0,
>>> +				.order = -1,
>>> +				.zone = page_zone(pfn_to_page(head_pfn)),
>>> +				.mode = MIGRATE_SYNC,
>>> +				.ignore_skip_hint = true,
>>> +				.no_set_skip_hint = true,
>>> +				.gfp_mask = gfp_flags,
>>> +				.alloc_contig = true,
>>> +			};
>>> +			INIT_LIST_HEAD(&cc.migratepages);
>>> +
>>> +			if (head_pfn + nr_pages < boundary_pfn) {
>>> +				pfn += nr_pages;
>>> +				continue;
>>> +			}
>>> +
>>> +			ret = __alloc_contig_migrate_range(&cc, head_pfn,
>>> +						head_pfn + nr_pages);
>>> +
>>> +			if (ret)
>>> +				goto failed;
>>> +			/*
>>> +			 * reset pfn, let the free page handling code above
>>> +			 * split the free page to the right migratetype list.
>>> +			 *
>>> +			 * head_pfn is not used here as a hugetlb page order
>>> +			 * can be bigger than MAX_ORDER-1, but after it is
>>> +			 * freed, the free page order is not. Use pfn within
>>> +			 * the range to find the head of the free page and
>>> +			 * reset order to 0 if a hugetlb page with
>>> +			 * >MAX_ORDER-1 order is encountered.
>>> +			 */
>>> +			if (order > MAX_ORDER-1)
>>> +				order = 0;
>>> +			while (!PageBuddy(pfn_to_page(pfn))) {
>>> +				order++;
>>> +				pfn &= ~0UL << order;
>>> +			}
>>> +			continue;
>>> +#else
>>> +			goto failed;
>>> +#endif
>>> +		}
>>> +
>>> +		pfn++;
>>> +	}
>>> +	return 0;
>>> +failed:
>>> +	/* restore the original migratetype */
>>> +	set_pageblock_migratetype(pfn_to_page(isolate_pageblock), saved_mt);
>>> +	return -EBUSY;
>>> +}
>>> +
>>>   /**
>>>    * start_isolate_page_range() - make page-allocation-type of range of pages to
>>>    * be MIGRATE_ISOLATE.
>>> @@ -293,6 +444,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>>>    *					 and PageOffline() pages.
>>>    *			REPORT_FAILURE - report details about the failure to
>>>    *			isolate the range
>>> + * @gfp_flags:		GFP flags used for migrating pages that sit across the
>>> + *			range boundaries.
>>>    *
>>>    * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
>>>    * the range will never be allocated. Any free pages and pages freed in the
>>> @@ -301,6 +454,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>>>    * pages in the range finally, the caller have to free all pages in the range.
>>>    * test_page_isolated() can be used for test it.
>>>    *
>>> + * The function first tries to isolate the pageblocks at the beginning and end
>>> + * of the range, since there might be pages across the range boundaries.
>>> + * Afterwards, it isolates the rest of the range.
>>> + *
>>>    * There is no high level synchronization mechanism that prevents two threads
>>>    * from trying to isolate overlapping ranges. If this happens, one thread
>>>    * will notice pageblocks in the overlapping range already set to isolate.
>>> @@ -321,21 +478,38 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>>>    * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
>>>    */
>>>   int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>>> -			     unsigned migratetype, int flags)
>>> +			     int migratetype, int flags, gfp_t gfp_flags)
>>>   {
>>>   	unsigned long pfn;
>>>   	struct page *page;
>>> +	int ret;
>>>
>>>   	BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
>>>   	BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
>>>
>>> -	for (pfn = start_pfn;
>>> -	     pfn < end_pfn;
>>> +	/* isolate [start_pfn, start_pfn + pageblock_nr_pages) pageblock */
>>> +	ret = isolate_single_pageblock(start_pfn, gfp_flags, false);
>>> +	if (ret)
>>> +		return ret;
>>> +
>>> +	/* isolate [end_pfn - pageblock_nr_pages, end_pfn) pageblock */
>>> +	ret = isolate_single_pageblock(end_pfn, gfp_flags, true);
>>> +	if (ret) {
>>> +		unset_migratetype_isolate(pfn_to_page(start_pfn), migratetype);
>>> +		return ret;
>>> +	}
>>> +
>>> +	/* skip isolated pageblocks at the beginning and end */
>>> +	for (pfn = start_pfn + pageblock_nr_pages;
>>> +	     pfn < end_pfn - pageblock_nr_pages;
>>>   	     pfn += pageblock_nr_pages) {
>>>   		page = __first_valid_page(pfn, pageblock_nr_pages);
>>>   		if (page && set_migratetype_isolate(page, migratetype, flags,
>>>   					start_pfn, end_pfn)) {
>>>   			undo_isolate_page_range(start_pfn, pfn, migratetype);
>>> +			unset_migratetype_isolate(
>>> +				pfn_to_page(end_pfn - pageblock_nr_pages),
>>> +				migratetype);
>>>   			return -EBUSY;
>>>   		}
>>>   	}
>>> @@ -346,7 +520,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>>>    * Make isolated pages available again.
>>>    */
>>>   void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>>> -			    unsigned migratetype)
>>> +			    int migratetype)
>>>   {
>>>   	unsigned long pfn;
>>>   	struct page *page;
>>> -- 
>>> 2.35.1
>>
>> Qian hit a bug caused by this series https://lore.kernel.org/linux-mm/20220426201855.GA1014@qian/
>> and the fix is:
>>
>> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
>> index 75e454f5cf45..b3f074d1682e 100644
>> --- a/mm/page_isolation.c
>> +++ b/mm/page_isolation.c
>> @@ -367,58 +367,67 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
>>   		}
>>   		/*
>>   		 * migrate compound pages then let the free page handling code
>> -		 * above do the rest. If migration is not enabled, just fail.
>> +		 * above do the rest. If migration is not possible, just fail.
>>   		 */
>> -		if (PageHuge(page) || PageTransCompound(page)) {
>> -#if defined CONFIG_COMPACTION || defined CONFIG_CMA
>> +		if (PageCompound(page)) {
>>   			unsigned long nr_pages = compound_nr(page);
>> -			int order = compound_order(page);
>>   			struct page *head = compound_head(page);
>>   			unsigned long head_pfn = page_to_pfn(head);
>> -			int ret;
>> -			struct compact_control cc = {
>> -				.nr_migratepages = 0,
>> -				.order = -1,
>> -				.zone = page_zone(pfn_to_page(head_pfn)),
>> -				.mode = MIGRATE_SYNC,
>> -				.ignore_skip_hint = true,
>> -				.no_set_skip_hint = true,
>> -				.gfp_mask = gfp_flags,
>> -				.alloc_contig = true,
>> -			};
>> -			INIT_LIST_HEAD(&cc.migratepages);
>>
>>   			if (head_pfn + nr_pages < boundary_pfn) {
>> -				pfn += nr_pages;
>> +				pfn = head_pfn + nr_pages;
>>   				continue;
>>   			}
>> -
>> -			ret = __alloc_contig_migrate_range(&cc, head_pfn,
>> -						head_pfn + nr_pages);
>> -
>> -			if (ret)
>> -				goto failed;
>> +#if defined CONFIG_COMPACTION || defined CONFIG_CMA
>>   			/*
>> -			 * reset pfn, let the free page handling code above
>> -			 * split the free page to the right migratetype list.
>> -			 *
>> -			 * head_pfn is not used here as a hugetlb page order
>> -			 * can be bigger than MAX_ORDER-1, but after it is
>> -			 * freed, the free page order is not. Use pfn within
>> -			 * the range to find the head of the free page and
>> -			 * reset order to 0 if a hugetlb page with
>> -			 * >MAX_ORDER-1 order is encountered.
>> +			 * hugetlb, lru compound (THP), and movable compound pages
>> +			 * can be migrated. Otherwise, fail the isolation.
>>   			 */
>> -			if (order > MAX_ORDER-1)
>> +			if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) {
>> +				int order;
>> +				unsigned long outer_pfn;
>> +				int ret;
>> +				struct compact_control cc = {
>> +					.nr_migratepages = 0,
>> +					.order = -1,
>> +					.zone = page_zone(pfn_to_page(head_pfn)),
>> +					.mode = MIGRATE_SYNC,
>> +					.ignore_skip_hint = true,
>> +					.no_set_skip_hint = true,
>> +					.gfp_mask = gfp_flags,
>> +					.alloc_contig = true,
>> +				};
>> +				INIT_LIST_HEAD(&cc.migratepages);
>> +
>> +				ret = __alloc_contig_migrate_range(&cc, head_pfn,
>> +							head_pfn + nr_pages);
>> +
>> +				if (ret)
>> +					goto failed;
>> +				/*
>> +				 * reset pfn to the head of the free page, so
>> +				 * that the free page handling code above can split
>> +				 * the free page to the right migratetype list.
>> +				 *
>> +				 * head_pfn is not used here as a hugetlb page order
>> +				 * can be bigger than MAX_ORDER-1, but after it is
>> +				 * freed, the free page order is not. Use pfn within
>> +				 * the range to find the head of the free page.
>> +				 */
>>   				order = 0;
>> -			while (!PageBuddy(pfn_to_page(pfn))) {
>> -				order++;
>> -				pfn &= ~0UL << order;
>> -			}
>> -			continue;
>> -#else
>> -			goto failed;
>> +				outer_pfn = pfn;
>> +				while (!PageBuddy(pfn_to_page(outer_pfn))) {
>> +					if (++order >= MAX_ORDER) {
>> +						outer_pfn = pfn;
>> +						break;
>> +					}
>> +					outer_pfn &= ~0UL << order;
>> +				}
>> +				pfn = outer_pfn;
>> +				continue;
>> +			} else
>>   #endif
>> +				goto failed;
>>   		}
>>
>>   		pfn++;

--
Best Regards,
Yan, Zi
Doug Berger May 25, 2022, 9:03 p.m. UTC | #5
On 5/25/2022 10:53 AM, Zi Yan wrote:
> On 25 May 2022, at 13:41, Doug Berger wrote:
> 
>> I am seeing some free memory accounting problems with linux-next that I have bisected to this commit (i.e. b2c9e2fbba32 ("mm: make alloc_contig_range work at pageblock granularity").
>>
>> On an arm64 SMP platform with 4GB total memory and the default 16MB default CMA pool, I am seeing the following after boot with a sysrq Show Memory (e.g. 'echo m > /proc/sysrq-trigger'):
>>
>> [   16.015906] sysrq: Show Memory
>> [   16.019039] Mem-Info:
>> [   16.021348] active_anon:14604 inactive_anon:919 isolated_anon:0
>> [   16.021348]  active_file:0 inactive_file:0 isolated_file:0
>> [   16.021348]  unevictable:0 dirty:0 writeback:0
>> [   16.021348]  slab_reclaimable:3662 slab_unreclaimable:3333
>> [   16.021348]  mapped:928 shmem:15146 pagetables:63 bounce:0
>> [   16.021348]  kernel_misc_reclaimable:0
>> [   16.021348]  free:976766 free_pcp:991 free_cma:7017
>> [   16.056937] Node 0 active_anon:58416kB inactive_anon:3676kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:3712kB dirty:0kB writeback:0kB shmem:60584kB writeback_tmp:0kB kernel_stack:1200kB pagetables:252kB all_unreclaimable? no
>> [   16.081526] DMA free:3041036kB boost:0kB min:6036kB low:9044kB high:12052kB reserved_highatomic:0KB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:3145728kB managed:3029992kB mlocked:0kB bounce:0kB free_pcp:636kB local_pcp:0kB free_cma:28068kB
>> [   16.108650] lowmem_reserve[]: 0 0 944 944
>> [   16.112746] Normal free:866028kB boost:0kB min:1936kB low:2900kB high:3864kB reserved_highatomic:0KB active_anon:58416kB inactive_anon:3676kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:1048576kB managed:967352kB mlocked:0kB bounce:0kB free_pcp:3328kB local_pcp:864kB free_cma:0kB
>> [   16.140393] lowmem_reserve[]: 0 0 0 0
>> [   16.144133] DMA: 7*4kB (UMC) 4*8kB (M) 3*16kB (M) 3*32kB (MC) 5*64kB (M) 4*128kB (MC) 5*256kB (UMC) 7*512kB (UM) 5*1024kB (UM) 9*2048kB (UMC) 732*4096kB (MC) = 3027724kB
>> [   16.159609] Normal: 149*4kB (UM) 95*8kB (UME) 26*16kB (UME) 8*32kB (ME) 2*64kB (UE) 1*128kB (M) 2*256kB (ME) 2*512kB (ME) 2*1024kB (UM) 0*2048kB 210*4096kB (M) = 866028kB
>> [   16.175165] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
>> [   16.183937] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=32768kB
>> [   16.192533] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
>> [   16.201040] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=64kB
>> [   16.209374] 15146 total pagecache pages
>> [   16.213246] 0 pages in swap cache
>> [   16.216595] Swap cache stats: add 0, delete 0, find 0/0
>> [   16.221867] Free swap  = 0kB
>> [   16.224780] Total swap = 0kB
>> [   16.227693] 1048576 pages RAM
>> [   16.230694] 0 pages HighMem/MovableOnly
>> [   16.234564] 49240 pages reserved
>> [   16.237825] 4096 pages cma reserved
>>
>> Some anomalies in the above are:
>> free_cma:7017 with only 4096 pages cma reserved
>> DMA free:3041036kB with only managed:3029992kB
>>
>> I'm not sure what is going on here, but I am suspicious of split_free_page() since del_page_from_free_list doesn't affect migrate_type accounting, but __free_one_page() can.
>> Also PageBuddy(page) is being checked without zone->lock in isolate_single_pageblock().
>>
>> Please investigate this as well.
> 
> 
> Can you try this patch https://lore.kernel.org/linux-mm/20220524194756.1698351-1-zi.yan@sent.com/
> and see if it fixes the issue?
> 
> Thanks.
> 
The last hunk didn't apply directly to this commit, but I was able to 
apply the patch to linux-next/master with no improvement to the free 
memory accounting (actually anecdotally worse):

[    6.236828] sysrq: Show Memory
[    6.239973] Mem-Info:
[    6.242290] active_anon:14594 inactive_anon:924 isolated_anon:0
[    6.242290]  active_file:0 inactive_file:0 isolated_file:0
[    6.242290]  unevictable:0 dirty:0 writeback:0
[    6.242290]  slab_reclaimable:3671 slab_unreclaimable:3575
[    6.242290]  mapped:935 shmem:15147 pagetables:63 bounce:0
[    6.242290]  kernel_misc_reclaimable:0
[    6.242290]  free:1059009 free_pcp:1067 free_cma:90112
[    6.278048] Node 0 active_anon:58376kB inactive_anon:3844kB 
active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB 
isolated(file):0kB mapped:3740kB dirty:0kB writeback:0kB shmem:60588kB 
writeback_tmp:0kB kernel_stack:1216kB pagetables:252kB all_unreclaimable? no
[    6.279422] arm-scmi brcm_scmi@0: timed out in resp(caller: 
scmi_perf_level_set+0xe0/0x110)
[    6.302501] DMA free:3372200kB boost:0kB min:6032kB low:9040kB 
high:12048kB reserved_highatomic:0KB active_anon:0kB inactive_anon:0kB 
active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB 
present:3145728kB managed:3029800kB mlocked:0kB bounce:0kB 
free_pcp:636kB local_pcp:0kB free_cma:360448kB
[    6.302515] lowmem_reserve[]: 0 0 944
[    6.310894] cpufreq: __target_index: Failed to change cpu frequency: -110
[    6.337920]  944
[    6.337925] Normal free:863584kB boost:0kB min:1940kB low:2904kB 
high:3868kB reserved_highatomic:0KB active_anon:58376kB 
inactive_anon:3896kB active_file:0kB inactive_file:0kB unevictable:0kB 
writepending:0kB present:1048576kB managed:967352kB mlocked:0kB 
bounce:0kB free_pcp:3492kB local_pcp:828kB free_cma:0kB
[    6.377782] lowmem_reserve[]: 0 0 0 0
[    6.381461] DMA: 4*4kB (UM) 5*8kB (M) 3*16kB (M) 2*32kB (M) 6*64kB 
(M) 5*128kB (M) 6*256kB (UM) 5*512kB (UM) 4*1024kB (M) 10*2048kB (UMC) 
732*4096kB (MC) = 3028136kB
[    6.396324] Normal: 84*4kB (U) 94*8kB (UM) 260*16kB (UME) 149*32kB 
(UM) 99*64kB (UME) 39*128kB (UM) 12*256kB (U) 3*512kB (UME) 2*1024kB 
(UM) 0*2048kB 204*4096kB (M) = 863584kB
[    6.412054] Node 0 hugepages_total=0 hugepages_free=0 
hugepages_surp=0 hugepages_size=1048576kB
[    6.420770] Node 0 hugepages_total=0 hugepages_free=0 
hugepages_surp=0 hugepages_size=32768kB
[    6.429312] Node 0 hugepages_total=0 hugepages_free=0 
hugepages_surp=0 hugepages_size=2048kB
[    6.437767] Node 0 hugepages_total=0 hugepages_free=0 
hugepages_surp=0 hugepages_size=64kB
[    6.446047] 15147 total pagecache pages
[    6.449890] 0 pages in swap cache
[    6.453210] Swap cache stats: add 0, delete 0, find 0/0
[    6.458445] Free swap  = 0kB
[    6.461331] Total swap = 0kB
[    6.464217] 1048576 pages RAM
[    6.467190] 0 pages HighMem/MovableOnly
[    6.471032] 49288 pages reserved
[    6.474267] 4096 pages cma reserved
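
As a quick sanity check of the dumps (assuming 4 KiB pages, which matches the
kB figures printed above), the free_cma page counts convert to kilobytes as in
this stand-alone snippet; the variable names are made up, only the numbers come
from the dumps:

#include <stdio.h>

int main(void)
{
	const long kib_per_page = 4;	/* 4 KiB pages, as in the dumps      */
	const long cma_pool = 4096;	/* "4096 pages cma reserved"         */
	const long free_cma_v1 = 7017;	/* free_cma in the first dump        */
	const long free_cma_v2 = 90112;	/* free_cma in the dump above        */

	printf("CMA pool:            %ld kB\n", cma_pool * kib_per_page);
	printf("free_cma (1st dump): %ld kB\n", free_cma_v1 * kib_per_page);
	printf("free_cma (2nd dump): %ld kB\n", free_cma_v2 * kib_per_page);
	/*
	 * Prints 16384, 28068 and 360448 kB: both free_cma readings exceed
	 * the 16 MB pool, matching the DMA zone lines above and showing the
	 * counter has drifted rather than merely been misattributed.
	 */
	return 0;
}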

Regards,
     Doug
Zi Yan May 25, 2022, 9:11 p.m. UTC | #6
On 25 May 2022, at 17:03, Doug Berger wrote:

> On 5/25/2022 10:53 AM, Zi Yan wrote:
>> On 25 May 2022, at 13:41, Doug Berger wrote:
>>
>>> I am seeing some free memory accounting problems with linux-next that I have bisected to this commit (i.e. b2c9e2fbba32 ("mm: make alloc_contig_range work at pageblock granularity").
>>>
>>> On an arm64 SMP platform with 4GB total memory and the default 16MB default CMA pool, I am seeing the following after boot with a sysrq Show Memory (e.g. 'echo m > /proc/sysrq-trigger'):
>>>
>>> [   16.015906] sysrq: Show Memory
>>> [   16.019039] Mem-Info:
>>> [   16.021348] active_anon:14604 inactive_anon:919 isolated_anon:0
>>> [   16.021348]  active_file:0 inactive_file:0 isolated_file:0
>>> [   16.021348]  unevictable:0 dirty:0 writeback:0
>>> [   16.021348]  slab_reclaimable:3662 slab_unreclaimable:3333
>>> [   16.021348]  mapped:928 shmem:15146 pagetables:63 bounce:0
>>> [   16.021348]  kernel_misc_reclaimable:0
>>> [   16.021348]  free:976766 free_pcp:991 free_cma:7017
>>> [   16.056937] Node 0 active_anon:58416kB inactive_anon:3676kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:3712kB dirty:0kB writeback:0kB shmem:60584kB writeback_tmp:0kB kernel_stack:1200kB pagetables:252kB all_unreclaimable? no
>>> [   16.081526] DMA free:3041036kB boost:0kB min:6036kB low:9044kB high:12052kB reserved_highatomic:0KB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:3145728kB managed:3029992kB mlocked:0kB bounce:0kB free_pcp:636kB local_pcp:0kB free_cma:28068kB
>>> [   16.108650] lowmem_reserve[]: 0 0 944 944
>>> [   16.112746] Normal free:866028kB boost:0kB min:1936kB low:2900kB high:3864kB reserved_highatomic:0KB active_anon:58416kB inactive_anon:3676kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:1048576kB managed:967352kB mlocked:0kB bounce:0kB free_pcp:3328kB local_pcp:864kB free_cma:0kB
>>> [   16.140393] lowmem_reserve[]: 0 0 0 0
>>> [   16.144133] DMA: 7*4kB (UMC) 4*8kB (M) 3*16kB (M) 3*32kB (MC) 5*64kB (M) 4*128kB (MC) 5*256kB (UMC) 7*512kB (UM) 5*1024kB (UM) 9*2048kB (UMC) 732*4096kB (MC) = 3027724kB
>>> [   16.159609] Normal: 149*4kB (UM) 95*8kB (UME) 26*16kB (UME) 8*32kB (ME) 2*64kB (UE) 1*128kB (M) 2*256kB (ME) 2*512kB (ME) 2*1024kB (UM) 0*2048kB 210*4096kB (M) = 866028kB
>>> [   16.175165] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
>>> [   16.183937] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=32768kB
>>> [   16.192533] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
>>> [   16.201040] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=64kB
>>> [   16.209374] 15146 total pagecache pages
>>> [   16.213246] 0 pages in swap cache
>>> [   16.216595] Swap cache stats: add 0, delete 0, find 0/0
>>> [   16.221867] Free swap  = 0kB
>>> [   16.224780] Total swap = 0kB
>>> [   16.227693] 1048576 pages RAM
>>> [   16.230694] 0 pages HighMem/MovableOnly
>>> [   16.234564] 49240 pages reserved
>>> [   16.237825] 4096 pages cma reserved
>>>
>>> Some anomalies in the above are:
>>> free_cma:7017 with only 4096 pages cma reserved
>>> DMA free:3041036kB with only managed:3029992kB
>>>
>>> I'm not sure what is going on here, but I am suspicious of split_free_page() since del_page_from_free_list doesn't affect migrate_type accounting, but __free_one_page() can.
>>> Also PageBuddy(page) is being checked without zone->lock in isolate_single_pageblock().
>>>
>>> Please investigate this as well.
>>
>>
>> Can you try this patch https://lore.kernel.org/linux-mm/20220524194756.1698351-1-zi.yan@sent.com/
>> and see if it fixes the issue?
>>
>> Thanks.
>>
> The last hunk didn't apply directly to this commit, but I was able to apply the patch to linux-next/master with no improvement to the free memory accounting (actually anecdotally worse):
>
> [    6.236828] sysrq: Show Memory
> [    6.239973] Mem-Info:
> [    6.242290] active_anon:14594 inactive_anon:924 isolated_anon:0
> [    6.242290]  active_file:0 inactive_file:0 isolated_file:0
> [    6.242290]  unevictable:0 dirty:0 writeback:0
> [    6.242290]  slab_reclaimable:3671 slab_unreclaimable:3575
> [    6.242290]  mapped:935 shmem:15147 pagetables:63 bounce:0
> [    6.242290]  kernel_misc_reclaimable:0
> [    6.242290]  free:1059009 free_pcp:1067 free_cma:90112
> [    6.278048] Node 0 active_anon:58376kB inactive_anon:3844kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:3740kB dirty:0kB writeback:0kB shmem:60588kB writeback_tmp:0kB kernel_stack:1216kB pagetables:252kB all_unreclaimable? no
> [    6.279422] arm-scmi brcm_scmi@0: timed out in resp(caller: scmi_perf_level_set+0xe0/0x110)
> [    6.302501] DMA free:3372200kB boost:0kB min:6032kB low:9040kB high:12048kB reserved_highatomic:0KB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:3145728kB managed:3029800kB mlocked:0kB bounce:0kB free_pcp:636kB local_pcp:0kB free_cma:360448kB
> [    6.302515] lowmem_reserve[]: 0 0 944
> [    6.310894] cpufreq: __target_index: Failed to change cpu frequency: -110
> [    6.337920]  944
> [    6.337925] Normal free:863584kB boost:0kB min:1940kB low:2904kB high:3868kB reserved_highatomic:0KB active_anon:58376kB inactive_anon:3896kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:1048576kB managed:967352kB mlocked:0kB bounce:0kB free_pcp:3492kB local_pcp:828kB free_cma:0kB
> [    6.377782] lowmem_reserve[]: 0 0 0 0
> [    6.381461] DMA: 4*4kB (UM) 5*8kB (M) 3*16kB (M) 2*32kB (M) 6*64kB (M) 5*128kB (M) 6*256kB (UM) 5*512kB (UM) 4*1024kB (M) 10*2048kB (UMC) 732*4096kB (MC) = 3028136kB
> [    6.396324] Normal: 84*4kB (U) 94*8kB (UM) 260*16kB (UME) 149*32kB (UM) 99*64kB (UME) 39*128kB (UM) 12*256kB (U) 3*512kB (UME) 2*1024kB (UM) 0*2048kB 204*4096kB (M) = 863584kB
> [    6.412054] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
> [    6.420770] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=32768kB
> [    6.429312] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
> [    6.437767] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=64kB
> [    6.446047] 15147 total pagecache pages
> [    6.449890] 0 pages in swap cache
> [    6.453210] Swap cache stats: add 0, delete 0, find 0/0
> [    6.458445] Free swap  = 0kB
> [    6.461331] Total swap = 0kB
> [    6.464217] 1048576 pages RAM
> [    6.467190] 0 pages HighMem/MovableOnly
> [    6.471032] 49288 pages reserved
> [    6.474267] 4096 pages cma reserved
>
> Regards,
>     Doug

I will look into it. Thanks for reporting it.

--
Best Regards,
Yan, Zi
Zi Yan May 26, 2022, 5:34 p.m. UTC | #7
On 25 May 2022, at 17:11, Zi Yan wrote:

> On 25 May 2022, at 17:03, Doug Berger wrote:
>
>> On 5/25/2022 10:53 AM, Zi Yan wrote:
>>> On 25 May 2022, at 13:41, Doug Berger wrote:
>>>
>>>> I am seeing some free memory accounting problems with linux-next that I have bisected to this commit (i.e. b2c9e2fbba32 ("mm: make alloc_contig_range work at pageblock granularity").
>>>>
>>>> On an arm64 SMP platform with 4GB total memory and the default 16MB default CMA pool, I am seeing the following after boot with a sysrq Show Memory (e.g. 'echo m > /proc/sysrq-trigger'):
>>>>
>>>> [   16.015906] sysrq: Show Memory
>>>> [   16.019039] Mem-Info:
>>>> [   16.021348] active_anon:14604 inactive_anon:919 isolated_anon:0
>>>> [   16.021348]  active_file:0 inactive_file:0 isolated_file:0
>>>> [   16.021348]  unevictable:0 dirty:0 writeback:0
>>>> [   16.021348]  slab_reclaimable:3662 slab_unreclaimable:3333
>>>> [   16.021348]  mapped:928 shmem:15146 pagetables:63 bounce:0
>>>> [   16.021348]  kernel_misc_reclaimable:0
>>>> [   16.021348]  free:976766 free_pcp:991 free_cma:7017
>>>> [   16.056937] Node 0 active_anon:58416kB inactive_anon:3676kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:3712kB dirty:0kB writeback:0kB shmem:60584kB writeback_tmp:0kB kernel_stack:1200kB pagetables:252kB all_unreclaimable? no
>>>> [   16.081526] DMA free:3041036kB boost:0kB min:6036kB low:9044kB high:12052kB reserved_highatomic:0KB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:3145728kB managed:3029992kB mlocked:0kB bounce:0kB free_pcp:636kB local_pcp:0kB free_cma:28068kB
>>>> [   16.108650] lowmem_reserve[]: 0 0 944 944
>>>> [   16.112746] Normal free:866028kB boost:0kB min:1936kB low:2900kB high:3864kB reserved_highatomic:0KB active_anon:58416kB inactive_anon:3676kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:1048576kB managed:967352kB mlocked:0kB bounce:0kB free_pcp:3328kB local_pcp:864kB free_cma:0kB
>>>> [   16.140393] lowmem_reserve[]: 0 0 0 0
>>>> [   16.144133] DMA: 7*4kB (UMC) 4*8kB (M) 3*16kB (M) 3*32kB (MC) 5*64kB (M) 4*128kB (MC) 5*256kB (UMC) 7*512kB (UM) 5*1024kB (UM) 9*2048kB (UMC) 732*4096kB (MC) = 3027724kB
>>>> [   16.159609] Normal: 149*4kB (UM) 95*8kB (UME) 26*16kB (UME) 8*32kB (ME) 2*64kB (UE) 1*128kB (M) 2*256kB (ME) 2*512kB (ME) 2*1024kB (UM) 0*2048kB 210*4096kB (M) = 866028kB
>>>> [   16.175165] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
>>>> [   16.183937] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=32768kB
>>>> [   16.192533] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
>>>> [   16.201040] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=64kB
>>>> [   16.209374] 15146 total pagecache pages
>>>> [   16.213246] 0 pages in swap cache
>>>> [   16.216595] Swap cache stats: add 0, delete 0, find 0/0
>>>> [   16.221867] Free swap  = 0kB
>>>> [   16.224780] Total swap = 0kB
>>>> [   16.227693] 1048576 pages RAM
>>>> [   16.230694] 0 pages HighMem/MovableOnly
>>>> [   16.234564] 49240 pages reserved
>>>> [   16.237825] 4096 pages cma reserved
>>>>
>>>> Some anomalies in the above are:
>>>> free_cma:7017 with only 4096 pages cma reserved
>>>> DMA free:3041036kB with only managed:3029992kB
>>>>
>>>> I'm not sure what is going on here, but I am suspicious of split_free_page() since del_page_from_free_list doesn't affect migrate_type accounting, but __free_one_page() can.
>>>> Also PageBuddy(page) is being checked without zone->lock in isolate_single_pageblock().
>>>>
>>>> Please investigate this as well.
>>>
>>>
>>> Can you try this patch https://lore.kernel.org/linux-mm/20220524194756.1698351-1-zi.yan@sent.com/
>>> and see if it fixes the issue?
>>>
>>> Thanks.
>>>
>> The last hunk didn't apply directly to this commit, but I was able to apply the patch to linux-next/master with no improvement to the free memory accounting (actually anecdotaly worse):
>>
>> [    6.236828] sysrq: Show Memory
>> [    6.239973] Mem-Info:
>> [    6.242290] active_anon:14594 inactive_anon:924 isolated_anon:0
>> [    6.242290]  active_file:0 inactive_file:0 isolated_file:0
>> [    6.242290]  unevictable:0 dirty:0 writeback:0
>> [    6.242290]  slab_reclaimable:3671 slab_unreclaimable:3575
>> [    6.242290]  mapped:935 shmem:15147 pagetables:63 bounce:0
>> [    6.242290]  kernel_misc_reclaimable:0
>> [    6.242290]  free:1059009 free_pcp:1067 free_cma:90112
>> [    6.278048] Node 0 active_anon:58376kB inactive_anon:3844kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:3740kB dirty:0kB writeback:0kB shmem:60588kB writeback_tmp:0kB kernel_stack:1216kB pagetables:252kB all_unreclaimable? no
>> [    6.279422] arm-scmi brcm_scmi@0: timed out in resp(caller: scmi_perf_level_set+0xe0/0x110)
>> [    6.302501] DMA free:3372200kB boost:0kB min:6032kB low:9040kB high:12048kB reserved_highatomic:0KB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:3145728kB managed:3029800kB mlocked:0kB bounce:0kB free_pcp:636kB local_pcp:0kB free_cma:360448kB
>> [    6.302515] lowmem_reserve[]: 0 0 944
>> [    6.310894] cpufreq: __target_index: Failed to change cpu frequency: -110
>> [    6.337920]  944
>> [    6.337925] Normal free:863584kB boost:0kB min:1940kB low:2904kB high:3868kB reserved_highatomic:0KB active_anon:58376kB inactive_anon:3896kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:1048576kB managed:967352kB mlocked:0kB bounce:0kB free_pcp:3492kB local_pcp:828kB free_cma:0kB
>> [    6.377782] lowmem_reserve[]: 0 0 0 0
>> [    6.381461] DMA: 4*4kB (UM) 5*8kB (M) 3*16kB (M) 2*32kB (M) 6*64kB (M) 5*128kB (M) 6*256kB (UM) 5*512kB (UM) 4*1024kB (M) 10*2048kB (UMC) 732*4096kB (MC) = 3028136kB
>> [    6.396324] Normal: 84*4kB (U) 94*8kB (UM) 260*16kB (UME) 149*32kB (UM) 99*64kB (UME) 39*128kB (UM) 12*256kB (U) 3*512kB (UME) 2*1024kB (UM) 0*2048kB 204*4096kB (M) = 863584kB
>> [    6.412054] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
>> [    6.420770] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=32768kB
>> [    6.429312] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
>> [    6.437767] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=64kB
>> [    6.446047] 15147 total pagecache pages
>> [    6.449890] 0 pages in swap cache
>> [    6.453210] Swap cache stats: add 0, delete 0, find 0/0
>> [    6.458445] Free swap  = 0kB
>> [    6.461331] Total swap = 0kB
>> [    6.464217] 1048576 pages RAM
>> [    6.467190] 0 pages HighMem/MovableOnly
>> [    6.471032] 49288 pages reserved
>> [    6.474267] 4096 pages cma reserved
>>
>> Regards,
>>     Doug
>
> I will look into it. Thanks for reporting it.

Hi Doug,

Can you try the patch below? It takes out free pages under zone lock now
and modifies page stats properly. Thanks.


diff --git a/mm/internal.h b/mm/internal.h
index 64e61b032dac..c0f8fbe0445b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -374,8 +374,8 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
 			  phys_addr_t min_addr,
 			  int nid, bool exact_nid);

-void split_free_page(struct page *free_page,
-				int order, unsigned long split_pfn_offset);
+int split_free_page(struct page *free_page,
+			unsigned int order, unsigned long split_pfn_offset);

 #if defined CONFIG_COMPACTION || defined CONFIG_CMA

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bc93a82e51e6..6f6e4649ac21 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1100,30 +1100,44 @@ static inline void __free_one_page(struct page *page,
  * @order:		the order of the page
  * @split_pfn_offset:	split offset within the page
  *
+ * Return -ENOENT if the free page is changed, otherwise 0
+ *
  * It is used when the free page crosses two pageblocks with different migratetypes
  * at split_pfn_offset within the page. The split free page will be put into
  * separate migratetype lists afterwards. Otherwise, the function achieves
  * nothing.
  */
-void split_free_page(struct page *free_page,
-				int order, unsigned long split_pfn_offset)
+int split_free_page(struct page *free_page,
+			unsigned int order, unsigned long split_pfn_offset)
 {
 	struct zone *zone = page_zone(free_page);
 	unsigned long free_page_pfn = page_to_pfn(free_page);
 	unsigned long pfn;
 	unsigned long flags;
 	int free_page_order;
+	int mt;
+	int ret = 0;

 	if (split_pfn_offset == 0)
-		return;
+		return ret;

 	spin_lock_irqsave(&zone->lock, flags);
+
+	if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	mt = get_pageblock_migratetype(free_page);
+	if (likely(!is_migrate_isolate(mt)))
+		__mod_zone_freepage_state(zone, -(1UL << order), mt);
+
 	del_page_from_free_list(free_page, zone, order);
 	for (pfn = free_page_pfn;
 	     pfn < free_page_pfn + (1UL << order);) {
 		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);

-		free_page_order = min_t(int,
+		free_page_order = min_t(unsigned int,
 					pfn ? __ffs(pfn) : order,
 					__fls(split_pfn_offset));
 		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
@@ -1134,7 +1148,9 @@ void split_free_page(struct page *free_page,
 		if (split_pfn_offset == 0)
 			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
 	}
+out:
 	spin_unlock_irqrestore(&zone->lock, flags);
+	return ret;
 }
 /*
  * A bad page could be due to a number of fields. Instead of multiple branches,
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c643c8420809..f539ccf7fb44 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -300,7 +300,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  * the in-use page then splitting the free page.
  */
 static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
-			gfp_t gfp_flags, bool isolate_before)
+			gfp_t gfp_flags, bool isolate_before, bool skip_isolation)
 {
 	unsigned char saved_mt;
 	unsigned long start_pfn;
@@ -327,11 +327,16 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
 				      zone->zone_start_pfn);

 	saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
-	ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
-			isolate_pageblock, isolate_pageblock + pageblock_nr_pages);

-	if (ret)
-		return ret;
+	if (skip_isolation)
+		VM_BUG_ON(!is_migrate_isolate(saved_mt));
+	else {
+		ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
+				isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
+
+		if (ret)
+			return ret;
+	}

 	/*
 	 * Bail out early when the to-be-isolated pageblock does not form
@@ -367,8 +372,11 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
 			int order = buddy_order(page);

 			if (pfn + (1UL << order) > boundary_pfn)
-				split_free_page(page, order, boundary_pfn - pfn);
-			pfn += (1UL << order);
+				/* free page changed before split, check it again */
+				if (split_free_page(page, order, boundary_pfn - pfn))
+				    continue;
+
+			pfn += 1UL << order;
 			continue;
 		}
 		/*
@@ -463,7 +471,8 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
 	return 0;
 failed:
 	/* restore the original migratetype */
-	unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
+	if (!skip_isolation)
+		unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
 	return -EBUSY;
 }

@@ -522,14 +531,18 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 	unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
 	unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
 	int ret;
+	bool skip_isolation = false;

 	/* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
-	ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false);
+	ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, skip_isolation);
 	if (ret)
 		return ret;

+	if (isolate_start == isolate_end - pageblock_nr_pages)
+		skip_isolation = true;
+
 	/* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
-	ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true);
+	ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, skip_isolation);
 	if (ret) {
 		unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
 		return ret;


--
Best Regards,
Yan, Zi
Doug Berger May 26, 2022, 7:46 p.m. UTC | #8
On 5/26/2022 10:34 AM, Zi Yan wrote:
> On 25 May 2022, at 17:11, Zi Yan wrote:
> 
>> On 25 May 2022, at 17:03, Doug Berger wrote:
>>
>>> On 5/25/2022 10:53 AM, Zi Yan wrote:
>>>> On 25 May 2022, at 13:41, Doug Berger wrote:
>>>>
>>>>> I am seeing some free memory accounting problems with linux-next that I have bisected to this commit (i.e. b2c9e2fbba32 ("mm: make alloc_contig_range work at pageblock granularity").
>>>>>
>>>>> On an arm64 SMP platform with 4GB total memory and the default 16MB default CMA pool, I am seeing the following after boot with a sysrq Show Memory (e.g. 'echo m > /proc/sysrq-trigger'):
>>>>>
>>>>> [   16.015906] sysrq: Show Memory
>>>>> [   16.019039] Mem-Info:
>>>>> [   16.021348] active_anon:14604 inactive_anon:919 isolated_anon:0
>>>>> [   16.021348]  active_file:0 inactive_file:0 isolated_file:0
>>>>> [   16.021348]  unevictable:0 dirty:0 writeback:0
>>>>> [   16.021348]  slab_reclaimable:3662 slab_unreclaimable:3333
>>>>> [   16.021348]  mapped:928 shmem:15146 pagetables:63 bounce:0
>>>>> [   16.021348]  kernel_misc_reclaimable:0
>>>>> [   16.021348]  free:976766 free_pcp:991 free_cma:7017
>>>>> [   16.056937] Node 0 active_anon:58416kB inactive_anon:3676kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:3712kB dirty:0kB writeback:0kB shmem:60584kB writeback_tmp:0kB kernel_stack:1200kB pagetables:252kB all_unreclaimable? no
>>>>> [   16.081526] DMA free:3041036kB boost:0kB min:6036kB low:9044kB high:12052kB reserved_highatomic:0KB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:3145728kB managed:3029992kB mlocked:0kB bounce:0kB free_pcp:636kB local_pcp:0kB free_cma:28068kB
>>>>> [   16.108650] lowmem_reserve[]: 0 0 944 944
>>>>> [   16.112746] Normal free:866028kB boost:0kB min:1936kB low:2900kB high:3864kB reserved_highatomic:0KB active_anon:58416kB inactive_anon:3676kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:1048576kB managed:967352kB mlocked:0kB bounce:0kB free_pcp:3328kB local_pcp:864kB free_cma:0kB
>>>>> [   16.140393] lowmem_reserve[]: 0 0 0 0
>>>>> [   16.144133] DMA: 7*4kB (UMC) 4*8kB (M) 3*16kB (M) 3*32kB (MC) 5*64kB (M) 4*128kB (MC) 5*256kB (UMC) 7*512kB (UM) 5*1024kB (UM) 9*2048kB (UMC) 732*4096kB (MC) = 3027724kB
>>>>> [   16.159609] Normal: 149*4kB (UM) 95*8kB (UME) 26*16kB (UME) 8*32kB (ME) 2*64kB (UE) 1*128kB (M) 2*256kB (ME) 2*512kB (ME) 2*1024kB (UM) 0*2048kB 210*4096kB (M) = 866028kB
>>>>> [   16.175165] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
>>>>> [   16.183937] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=32768kB
>>>>> [   16.192533] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
>>>>> [   16.201040] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=64kB
>>>>> [   16.209374] 15146 total pagecache pages
>>>>> [   16.213246] 0 pages in swap cache
>>>>> [   16.216595] Swap cache stats: add 0, delete 0, find 0/0
>>>>> [   16.221867] Free swap  = 0kB
>>>>> [   16.224780] Total swap = 0kB
>>>>> [   16.227693] 1048576 pages RAM
>>>>> [   16.230694] 0 pages HighMem/MovableOnly
>>>>> [   16.234564] 49240 pages reserved
>>>>> [   16.237825] 4096 pages cma reserved
>>>>>
>>>>> Some anomalies in the above are:
>>>>> free_cma:7017 with only 4096 pages cma reserved
>>>>> DMA free:3041036kB with only managed:3029992kB
>>>>>
>>>>> I'm not sure what is going on here, but I am suspicious of split_free_page() since del_page_from_free_list doesn't affect migrate_type accounting, but __free_one_page() can.
>>>>> Also PageBuddy(page) is being checked without zone->lock in isolate_single_pageblock().
>>>>>
>>>>> Please investigate this as well.
>>>>
>>>>
>>>> Can you try this patch https://lore.kernel.org/linux-mm/20220524194756.1698351-1-zi.yan@sent.com/
>>>> and see if it fixes the issue?
>>>>
>>>> Thanks.
>>>>
>>> The last hunk didn't apply directly to this commit, but I was able to apply the patch to linux-next/master with no improvement to the free memory accounting (actually anecdotally worse):
>>>
>>> [    6.236828] sysrq: Show Memory
>>> [    6.239973] Mem-Info:
>>> [    6.242290] active_anon:14594 inactive_anon:924 isolated_anon:0
>>> [    6.242290]  active_file:0 inactive_file:0 isolated_file:0
>>> [    6.242290]  unevictable:0 dirty:0 writeback:0
>>> [    6.242290]  slab_reclaimable:3671 slab_unreclaimable:3575
>>> [    6.242290]  mapped:935 shmem:15147 pagetables:63 bounce:0
>>> [    6.242290]  kernel_misc_reclaimable:0
>>> [    6.242290]  free:1059009 free_pcp:1067 free_cma:90112
>>> [    6.278048] Node 0 active_anon:58376kB inactive_anon:3844kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:3740kB dirty:0kB writeback:0kB shmem:60588kB writeback_tmp:0kB kernel_stack:1216kB pagetables:252kB all_unreclaimable? no
>>> [    6.279422] arm-scmi brcm_scmi@0: timed out in resp(caller: scmi_perf_level_set+0xe0/0x110)
>>> [    6.302501] DMA free:3372200kB boost:0kB min:6032kB low:9040kB high:12048kB reserved_highatomic:0KB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:3145728kB managed:3029800kB mlocked:0kB bounce:0kB free_pcp:636kB local_pcp:0kB free_cma:360448kB
>>> [    6.302515] lowmem_reserve[]: 0 0 944
>>> [    6.310894] cpufreq: __target_index: Failed to change cpu frequency: -110
>>> [    6.337920]  944
>>> [    6.337925] Normal free:863584kB boost:0kB min:1940kB low:2904kB high:3868kB reserved_highatomic:0KB active_anon:58376kB inactive_anon:3896kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:1048576kB managed:967352kB mlocked:0kB bounce:0kB free_pcp:3492kB local_pcp:828kB free_cma:0kB
>>> [    6.377782] lowmem_reserve[]: 0 0 0 0
>>> [    6.381461] DMA: 4*4kB (UM) 5*8kB (M) 3*16kB (M) 2*32kB (M) 6*64kB (M) 5*128kB (M) 6*256kB (UM) 5*512kB (UM) 4*1024kB (M) 10*2048kB (UMC) 732*4096kB (MC) = 3028136kB
>>> [    6.396324] Normal: 84*4kB (U) 94*8kB (UM) 260*16kB (UME) 149*32kB (UM) 99*64kB (UME) 39*128kB (UM) 12*256kB (U) 3*512kB (UME) 2*1024kB (UM) 0*2048kB 204*4096kB (M) = 863584kB
>>> [    6.412054] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
>>> [    6.420770] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=32768kB
>>> [    6.429312] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
>>> [    6.437767] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=64kB
>>> [    6.446047] 15147 total pagecache pages
>>> [    6.449890] 0 pages in swap cache
>>> [    6.453210] Swap cache stats: add 0, delete 0, find 0/0
>>> [    6.458445] Free swap  = 0kB
>>> [    6.461331] Total swap = 0kB
>>> [    6.464217] 1048576 pages RAM
>>> [    6.467190] 0 pages HighMem/MovableOnly
>>> [    6.471032] 49288 pages reserved
>>> [    6.474267] 4096 pages cma reserved
>>>
>>> Regards,
>>>      Doug
>>
>> I will look into it. Thanks for reporting it.
> 
> Hi Doug,
> 
> Can you try the patch below? It takes out free pages under zone lock now
> and modifies page stats properly. Thanks.
> 
> 
> diff --git a/mm/internal.h b/mm/internal.h
> index 64e61b032dac..c0f8fbe0445b 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -374,8 +374,8 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
>   			  phys_addr_t min_addr,
>   			  int nid, bool exact_nid);
> 
> -void split_free_page(struct page *free_page,
> -				int order, unsigned long split_pfn_offset);
> +int split_free_page(struct page *free_page,
> +			unsigned int order, unsigned long split_pfn_offset);
> 
>   #if defined CONFIG_COMPACTION || defined CONFIG_CMA
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index bc93a82e51e6..6f6e4649ac21 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1100,30 +1100,44 @@ static inline void __free_one_page(struct page *page,
>    * @order:		the order of the page
>    * @split_pfn_offset:	split offset within the page
>    *
> + * Return -ENOENT if the free page is changed, otherwise 0
> + *
>    * It is used when the free page crosses two pageblocks with different migratetypes
>    * at split_pfn_offset within the page. The split free page will be put into
>    * separate migratetype lists afterwards. Otherwise, the function achieves
>    * nothing.
>    */
> -void split_free_page(struct page *free_page,
> -				int order, unsigned long split_pfn_offset)
> +int split_free_page(struct page *free_page,
> +			unsigned int order, unsigned long split_pfn_offset)
>   {
>   	struct zone *zone = page_zone(free_page);
>   	unsigned long free_page_pfn = page_to_pfn(free_page);
>   	unsigned long pfn;
>   	unsigned long flags;
>   	int free_page_order;
> +	int mt;
> +	int ret = 0;
> 
>   	if (split_pfn_offset == 0)
> -		return;
> +		return ret;
> 
>   	spin_lock_irqsave(&zone->lock, flags);
> +
> +	if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
> +		ret = -ENOENT;
> +		goto out;
> +	}
> +
> +	mt = get_pageblock_migratetype(free_page);
> +	if (likely(!is_migrate_isolate(mt)))
> +		__mod_zone_freepage_state(zone, -(1UL << order), mt);
> +
>   	del_page_from_free_list(free_page, zone, order);
>   	for (pfn = free_page_pfn;
>   	     pfn < free_page_pfn + (1UL << order);) {
>   		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
> 
> -		free_page_order = min_t(int,
> +		free_page_order = min_t(unsigned int,
>   					pfn ? __ffs(pfn) : order,
>   					__fls(split_pfn_offset));
This part of the patch doesn't agree with any version of page_alloc.c I 
have, but I was able to manually apply the change.


>   		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
> @@ -1134,7 +1148,9 @@ void split_free_page(struct page *free_page,
>   		if (split_pfn_offset == 0)
>   			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
>   	}
> +out:
>   	spin_unlock_irqrestore(&zone->lock, flags);
> +	return ret;
>   }
>   /*
>    * A bad page could be due to a number of fields. Instead of multiple branches,
> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
> index c643c8420809..f539ccf7fb44 100644
> --- a/mm/page_isolation.c
> +++ b/mm/page_isolation.c
> @@ -300,7 +300,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>    * the in-use page then splitting the free page.
>    */
>   static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
> -			gfp_t gfp_flags, bool isolate_before)
> +			gfp_t gfp_flags, bool isolate_before, bool skip_isolation)
>   {
>   	unsigned char saved_mt;
>   	unsigned long start_pfn;
> @@ -327,11 +327,16 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
>   				      zone->zone_start_pfn);
> 
>   	saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
> -	ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
> -			isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
> 
> -	if (ret)
> -		return ret;
> +	if (skip_isolation)
> +		VM_BUG_ON(!is_migrate_isolate(saved_mt));
> +	else {
> +		ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
> +				isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
> +
> +		if (ret)
> +			return ret;
> +	}
> 
>   	/*
>   	 * Bail out early when the to-be-isolated pageblock does not form
> @@ -367,8 +372,11 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
>   			int order = buddy_order(page);
> 
>   			if (pfn + (1UL << order) > boundary_pfn)
> -				split_free_page(page, order, boundary_pfn - pfn);
> -			pfn += (1UL << order);
> +				/* free page changed before split, check it again */
> +				if (split_free_page(page, order, boundary_pfn - pfn))
> +				    continue;
> +
> +			pfn += 1UL << order;
>   			continue;
>   		}
>   		/*
> @@ -463,7 +471,8 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
>   	return 0;
>   failed:
>   	/* restore the original migratetype */
> -	unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
> +	if (!skip_isolation)
> +		unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
>   	return -EBUSY;
>   }
> 
> @@ -522,14 +531,18 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>   	unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
>   	unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
>   	int ret;
> +	bool skip_isolation = false;
> 
>   	/* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
> -	ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false);
> +	ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, skip_isolation);
>   	if (ret)
>   		return ret;
> 
> +	if (isolate_start == isolate_end - pageblock_nr_pages)
> +		skip_isolation = true;
> +
>   	/* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
> -	ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true);
> +	ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, skip_isolation);
>   	if (ret) {
>   		unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
>   		return ret;
> 
> 
> --
> Best Regards,
> Yan, Zi

This patch does appear to fix the problem I observed. I'll poke it a 
little more, but so far it looks good.

Thanks!,
     Doug
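
The accounting drift discussed in this thread boils down to the asymmetry
between taking a page off the free list and re-freeing its pieces. The toy
model below is only an illustration (counter names and sizes are made up, and
none of it is kernel code): the buggy split re-frees pages it never
subtracted, while the fixed split subtracts first, as the follow-up patch does
under zone->lock before deleting the page from the free list.

#include <stdio.h>

/* Toy stand-ins for the zone free-page counters (illustration only). */
static long nr_free = 512;
static long nr_free_cma = 512;

/* Analogue of __free_one_page(): freeing always increments the counters. */
static void free_pages_toy(long n)
{
	nr_free += n;
	nr_free_cma += n;
}

/* Buggy split: the page leaves the free list with no counter adjustment,
 * then both pieces are freed again, so the counters grow by the page size. */
static void split_buggy(long pages, long offset)
{
	free_pages_toy(offset);
	free_pages_toy(pages - offset);
}

/* Fixed split: subtract the whole page first, then free the pieces, so the
 * counters end up unchanged. */
static void split_fixed(long pages, long offset)
{
	nr_free -= pages;
	nr_free_cma -= pages;
	free_pages_toy(offset);
	free_pages_toy(pages - offset);
}

int main(void)
{
	split_buggy(512, 256);
	printf("buggy: free=%ld free_cma=%ld (inflated)\n", nr_free, nr_free_cma);

	nr_free = nr_free_cma = 512;
	split_fixed(512, 256);
	printf("fixed: free=%ld free_cma=%ld (unchanged)\n", nr_free, nr_free_cma);
	return 0;
}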
diff mbox series

Patch

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index e14eddf6741a..5456b7be38ae 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -42,7 +42,7 @@  int move_freepages_block(struct zone *zone, struct page *page,
  */
 int
 start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-			 unsigned migratetype, int flags);
+			 int migratetype, int flags, gfp_t gfp_flags);
 
 /*
  * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
@@ -50,7 +50,7 @@  start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
  */
 void
 undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-			unsigned migratetype);
+			int migratetype);
 
 /*
  * Test all pages in [start_pfn, end_pfn) are isolated or not.
diff --git a/mm/internal.h b/mm/internal.h
index 919fa07e1031..0667abd57634 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -359,6 +359,9 @@  extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
 			  phys_addr_t min_addr,
 			  int nid, bool exact_nid);
 
+void split_free_page(struct page *free_page,
+				int order, unsigned long split_pfn_offset);
+
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
 
 /*
@@ -422,6 +425,9 @@  isolate_freepages_range(struct compact_control *cc,
 int
 isolate_migratepages_range(struct compact_control *cc,
 			   unsigned long low_pfn, unsigned long end_pfn);
+
+int __alloc_contig_migrate_range(struct compact_control *cc,
+					unsigned long start, unsigned long end);
 #endif
 int find_suitable_fallback(struct free_area *area, unsigned int order,
 			int migratetype, bool only_stealable, bool *can_steal);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 4c6065e5d274..9f8ae4cb77ee 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1845,7 +1845,8 @@  int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	/* set above range as isolated */
 	ret = start_isolate_page_range(start_pfn, end_pfn,
 				       MIGRATE_MOVABLE,
-				       MEMORY_OFFLINE | REPORT_FAILURE);
+				       MEMORY_OFFLINE | REPORT_FAILURE,
+				       GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
 	if (ret) {
 		reason = "failure to isolate range";
 		goto failed_removal_pcplists_disabled;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ce23ac8ad085..70ddd9a0bcf3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1094,6 +1094,43 @@  static inline void __free_one_page(struct page *page,
 		page_reporting_notify_free(order);
 }
 
+/**
+ * split_free_page() -- split a free page at split_pfn_offset
+ * @free_page:		the original free page
+ * @order:		the order of the page
+ * @split_pfn_offset:	split offset within the page
+ *
+ * It is used when the free page crosses two pageblocks with different migratetypes
+ * at split_pfn_offset within the page. The split free page will be put into
+ * separate migratetype lists afterwards. Otherwise, the function achieves
+ * nothing.
+ */
+void split_free_page(struct page *free_page,
+				int order, unsigned long split_pfn_offset)
+{
+	struct zone *zone = page_zone(free_page);
+	unsigned long free_page_pfn = page_to_pfn(free_page);
+	unsigned long pfn;
+	unsigned long flags;
+	int free_page_order;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	del_page_from_free_list(free_page, zone, order);
+	for (pfn = free_page_pfn;
+	     pfn < free_page_pfn + (1UL << order);) {
+		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
+
+		free_page_order = ffs(split_pfn_offset) - 1;
+		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
+				mt, FPI_NONE);
+		pfn += 1UL << free_page_order;
+		split_pfn_offset -= (1UL << free_page_order);
+		/* we have done the first part, now switch to second part */
+		if (split_pfn_offset == 0)
+			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
 /*
  * A bad page could be due to a number of fields. Instead of multiple branches,
  * try and check multiple fields with one check. The caller must do a detailed
@@ -8919,7 +8956,7 @@  static inline void alloc_contig_dump_pages(struct list_head *page_list)
 #endif
 
 /* [start, end) must belong to a single zone. */
-static int __alloc_contig_migrate_range(struct compact_control *cc,
+int __alloc_contig_migrate_range(struct compact_control *cc,
 					unsigned long start, unsigned long end)
 {
 	/* This function is based on compact_zone() from compaction.c. */
@@ -9002,7 +9039,7 @@  int alloc_contig_range(unsigned long start, unsigned long end,
 		       unsigned migratetype, gfp_t gfp_mask)
 {
 	unsigned long outer_start, outer_end;
-	unsigned int order;
+	int order;
 	int ret = 0;
 
 	struct compact_control cc = {
@@ -9021,14 +9058,11 @@  int alloc_contig_range(unsigned long start, unsigned long end,
 	 * What we do here is we mark all pageblocks in range as
 	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
 	 * have different sizes, and due to the way page allocator
-	 * work, we align the range to biggest of the two pages so
-	 * that page allocator won't try to merge buddies from
-	 * different pageblocks and change MIGRATE_ISOLATE to some
-	 * other migration type.
+	 * work, start_isolate_page_range() has special handlings for this.
 	 *
 	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
 	 * migrate the pages from an unaligned range (ie. pages that
-	 * we are interested in).  This will put all the pages in
+	 * we are interested in). This will put all the pages in
 	 * range back to page allocator as MIGRATE_ISOLATE.
 	 *
 	 * When this is done, we take the pages in range from page
@@ -9042,9 +9076,9 @@  int alloc_contig_range(unsigned long start, unsigned long end,
 	 */
 
 	ret = start_isolate_page_range(pfn_max_align_down(start),
-				       pfn_max_align_up(end), migratetype, 0);
+				pfn_max_align_up(end), migratetype, 0, gfp_mask);
 	if (ret)
-		return ret;
+		goto done;
 
 	drain_all_pages(cc.zone);
 
@@ -9064,7 +9098,7 @@  int alloc_contig_range(unsigned long start, unsigned long end,
 	ret = 0;
 
 	/*
-	 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
+	 * Pages from [start, end) are within a pageblock_nr_pages
 	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
 	 * more, all pages in [start, end) are free in page allocator.
 	 * What we are going to do is to allocate all pages from
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c2f7a8bb634d..94b3467e5ba2 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -203,7 +203,7 @@  static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 	return -EBUSY;
 }
 
-static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
+static void unset_migratetype_isolate(struct page *page, int migratetype)
 {
 	struct zone *zone;
 	unsigned long flags, nr_pages;
@@ -279,6 +279,157 @@  __first_valid_page(unsigned long pfn, unsigned long nr_pages)
 	return NULL;
 }
 
+/**
+ * isolate_single_pageblock() -- tries to isolate a pageblock that might be
+ * within a free or in-use page.
+ * @boundary_pfn:		pageblock-aligned pfn that a page might cross
+ * @gfp_flags:			GFP flags used for migrating pages
+ * @isolate_before:	isolate the pageblock before the boundary_pfn
+ *
+ * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
+ * pageblock. When not all pageblocks within a page are isolated at the same
+ * time, free page accounting can go wrong. For example, in the case of
+ * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pageblocks.
+ * [         MAX_ORDER-1         ]
+ * [  pageblock0  |  pageblock1  ]
+ * When either pageblock is isolated, if it is a free page, the page is not
+ * split into separate migratetype lists as it is supposed to be; if it is an
+ * in-use page and freed later, __free_one_page() does not split the free page
+ * either. The function handles this by splitting the free page or migrating
+ * the in-use page then splitting the free page.
+ */
+static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
+			bool isolate_before)
+{
+	unsigned char saved_mt;
+	unsigned long start_pfn;
+	unsigned long isolate_pageblock;
+	unsigned long pfn;
+	struct zone *zone;
+
+	VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages));
+
+	if (isolate_before)
+		isolate_pageblock = boundary_pfn - pageblock_nr_pages;
+	else
+		isolate_pageblock = boundary_pfn;
+
+	/*
+	 * scan at the beginning of MAX_ORDER_NR_PAGES aligned range to avoid
+	 * only isolating a subset of pageblocks from a bigger than pageblock
+	 * free or in-use page. Also make sure all to-be-isolated pageblocks
+	 * are within the same zone.
+	 */
+	zone  = page_zone(pfn_to_page(isolate_pageblock));
+	start_pfn  = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES),
+				      zone->zone_start_pfn);
+
+	saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
+	set_pageblock_migratetype(pfn_to_page(isolate_pageblock), MIGRATE_ISOLATE);
+
+	/*
+	 * Bail out early when the to-be-isolated pageblock does not form
+	 * a free or in-use page across boundary_pfn:
+	 *
+	 * 1. isolate before boundary_pfn: the page after is not online
+	 * 2. isolate after boundary_pfn: the page before is not online
+	 *
+	 * This also ensures correctness. Without it, when isolate after
+	 * boundary_pfn and [start_pfn, boundary_pfn) are not online,
+	 * __first_valid_page() will return unexpected NULL in the for loop
+	 * below.
+	 */
+	if (isolate_before) {
+		if (!pfn_to_online_page(boundary_pfn))
+			return 0;
+	} else {
+		if (!pfn_to_online_page(boundary_pfn - 1))
+			return 0;
+	}
+
+	for (pfn = start_pfn; pfn < boundary_pfn;) {
+		struct page *page = __first_valid_page(pfn, boundary_pfn - pfn);
+
+		VM_BUG_ON(!page);
+		pfn = page_to_pfn(page);
+		/*
+		 * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any
+		 * free pages in [start_pfn, boundary_pfn), its head page will
+		 * always be in the range.
+		 */
+		if (PageBuddy(page)) {
+			int order = buddy_order(page);
+
+			if (pfn + (1UL << order) > boundary_pfn)
+				split_free_page(page, order, boundary_pfn - pfn);
+			pfn += (1UL << order);
+			continue;
+		}
+		/*
+		 * migrate compound pages then let the free page handling code
+		 * above do the rest. If migration is not enabled, just fail.
+		 */
+		if (PageHuge(page) || PageTransCompound(page)) {
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+			unsigned long nr_pages = compound_nr(page);
+			int order = compound_order(page);
+			struct page *head = compound_head(page);
+			unsigned long head_pfn = page_to_pfn(head);
+			int ret;
+			struct compact_control cc = {
+				.nr_migratepages = 0,
+				.order = -1,
+				.zone = page_zone(pfn_to_page(head_pfn)),
+				.mode = MIGRATE_SYNC,
+				.ignore_skip_hint = true,
+				.no_set_skip_hint = true,
+				.gfp_mask = gfp_flags,
+				.alloc_contig = true,
+			};
+			INIT_LIST_HEAD(&cc.migratepages);
+
+			if (head_pfn + nr_pages < boundary_pfn) {
+				pfn += nr_pages;
+				continue;
+			}
+
+			ret = __alloc_contig_migrate_range(&cc, head_pfn,
+						head_pfn + nr_pages);
+
+			if (ret)
+				goto failed;
+			/*
+			 * reset pfn, let the free page handling code above
+			 * split the free page to the right migratetype list.
+			 *
+			 * head_pfn is not used here as a hugetlb page order
+			 * can be bigger than MAX_ORDER-1, but after it is
+			 * freed, the free page order is not. Use pfn within
+			 * the range to find the head of the free page and
+			 * reset order to 0 if a hugetlb page with
+			 * >MAX_ORDER-1 order is encountered.
+			 */
+			if (order > MAX_ORDER-1)
+				order = 0;
+			while (!PageBuddy(pfn_to_page(pfn))) {
+				order++;
+				pfn &= ~0UL << order;
+			}
+			continue;
+#else
+			goto failed;
+#endif
+		}
+
+		pfn++;
+	}
+	return 0;
+failed:
+	/* restore the original migratetype */
+	set_pageblock_migratetype(pfn_to_page(isolate_pageblock), saved_mt);
+	return -EBUSY;
+}
+
 /**
  * start_isolate_page_range() - make page-allocation-type of range of pages to
  * be MIGRATE_ISOLATE.
@@ -293,6 +444,8 @@  __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  *					 and PageOffline() pages.
  *			REPORT_FAILURE - report details about the failure to
  *			isolate the range
+ * @gfp_flags:		GFP flags used for migrating pages that sit across the
+ *			range boundaries.
  *
  * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
  * the range will never be allocated. Any free pages and pages freed in the
@@ -301,6 +454,10 @@  __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  * pages in the range finally, the caller have to free all pages in the range.
  * test_page_isolated() can be used for test it.
  *
+ * The function first tries to isolate the pageblocks at the beginning and end
+ * of the range, since there might be pages across the range boundaries.
+ * Afterwards, it isolates the rest of the range.
+ *
  * There is no high level synchronization mechanism that prevents two threads
  * from trying to isolate overlapping ranges. If this happens, one thread
  * will notice pageblocks in the overlapping range already set to isolate.
@@ -321,21 +478,38 @@  __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
  */
 int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-			     unsigned migratetype, int flags)
+			     int migratetype, int flags, gfp_t gfp_flags)
 {
 	unsigned long pfn;
 	struct page *page;
+	int ret;
 
 	BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
 	BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
 
-	for (pfn = start_pfn;
-	     pfn < end_pfn;
+	/* isolate [start_pfn, start_pfn + pageblock_nr_pages) pageblock */
+	ret = isolate_single_pageblock(start_pfn, gfp_flags, false);
+	if (ret)
+		return ret;
+
+	/* isolate [end_pfn - pageblock_nr_pages, end_pfn) pageblock */
+	ret = isolate_single_pageblock(end_pfn, gfp_flags, true);
+	if (ret) {
+		unset_migratetype_isolate(pfn_to_page(start_pfn), migratetype);
+		return ret;
+	}
+
+	/* skip isolated pageblocks at the beginning and end */
+	for (pfn = start_pfn + pageblock_nr_pages;
+	     pfn < end_pfn - pageblock_nr_pages;
 	     pfn += pageblock_nr_pages) {
 		page = __first_valid_page(pfn, pageblock_nr_pages);
 		if (page && set_migratetype_isolate(page, migratetype, flags,
 					start_pfn, end_pfn)) {
 			undo_isolate_page_range(start_pfn, pfn, migratetype);
+			unset_migratetype_isolate(
+				pfn_to_page(end_pfn - pageblock_nr_pages),
+				migratetype);
 			return -EBUSY;
 		}
 	}
@@ -346,7 +520,7 @@  int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
  * Make isolated pages available again.
  */
 void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-			    unsigned migratetype)
+			    int migratetype)
 {
 	unsigned long pfn;
 	struct page *page;