
[3/6] mm/page_alloc: Explicitly record high-order atomic allocations in alloc_flags

Message ID: 20221129151701.23261-4-mgorman@techsingularity.net
State: New
Series: Discard __GFP_ATOMIC

Commit Message

Mel Gorman Nov. 29, 2022, 3:16 p.m. UTC
A high-order ALLOC_HARDER allocation is assumed to be atomic. While that
is accurate, it changes later in the series. In preparation, explicitly
record high-order atomic allocations in gfp_to_alloc_flags().

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 mm/internal.h   |  1 +
 mm/page_alloc.c | 19 +++++++++++++------
 2 files changed, 14 insertions(+), 6 deletions(-)
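
For orientation (not part of the commit message), the following is a
simplified sketch of the classification this patch introduces; the real
hunks are in the diff below and the surrounding context in
gfp_to_alloc_flags() is paraphrased here:

	/*
	 * Sketch only: inside the atomic-allocation path of
	 * gfp_to_alloc_flags(), which now also takes the order.
	 */
	if (!(gfp_mask & __GFP_NOMEMALLOC)) {
		alloc_flags |= ALLOC_HARDER;

		/* A high-order atomic allocation is now marked explicitly */
		if (order > 0)
			alloc_flags |= ALLOC_HIGHATOMIC;
	}

rmqueue_buddy(), __zone_watermark_ok() and get_page_from_freelist() then
test ALLOC_HIGHATOMIC instead of "order > 0 && ALLOC_HARDER" before
touching the MIGRATE_HIGHATOMIC reserve.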

Comments

NeilBrown Dec. 5, 2022, 5:17 a.m. UTC | #1
Hi Mel,
 thanks a lot for doing this!
 I tried reviewing it but "HIGHATOMIC" is new to me and I quickly got
 lost :-(
 Maybe one day I'll work it out - now that several names are more
 meaningful, it will likely be easier.

Thanks,
NeilBrown

On Wed, 30 Nov 2022, Mel Gorman wrote:
> A high-order ALLOC_HARDER allocation is assumed to be atomic. While that
> is accurate, it changes later in the series. In preparation, explicitly
> record high-order atomic allocations in gfp_to_alloc_flags().
> 
> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> ---
>  mm/internal.h   |  1 +
>  mm/page_alloc.c | 19 +++++++++++++------
>  2 files changed, 14 insertions(+), 6 deletions(-)
> 
> diff --git a/mm/internal.h b/mm/internal.h
> index d503e57a57a1..9a9d9b5ee87f 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -754,6 +754,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
>  #else
>  #define ALLOC_NOFRAGMENT	  0x0
>  #endif
> +#define ALLOC_HIGHATOMIC	0x200 /* Allows access to MIGRATE_HIGHATOMIC */
>  #define ALLOC_KSWAPD		0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
>  
>  enum ttu_flags;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index da746e9eb2cf..e2b65767dda0 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3710,7 +3710,7 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
>  		 * reserved for high-order atomic allocation, so order-0
>  		 * request should skip it.
>  		 */
> -		if (order > 0 && alloc_flags & ALLOC_HARDER)
> +		if (alloc_flags & ALLOC_HIGHATOMIC)
>  			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
>  		if (!page) {
>  			page = __rmqueue(zone, order, migratetype, alloc_flags);
> @@ -4028,8 +4028,10 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
>  			return true;
>  		}
>  #endif
> -		if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
> +		if ((alloc_flags & ALLOC_HIGHATOMIC) &&
> +		    !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
>  			return true;
> +		}
>  	}
>  	return false;
>  }
> @@ -4291,7 +4293,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
>  			 * If this is a high-order atomic allocation then check
>  			 * if the pageblock should be reserved for the future
>  			 */
> -			if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
> +			if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
>  				reserve_highatomic_pageblock(page, zone, order);
>  
>  			return page;
> @@ -4818,7 +4820,7 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
>  }
>  
>  static inline unsigned int
> -gfp_to_alloc_flags(gfp_t gfp_mask)
> +gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
>  {
>  	unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
>  
> @@ -4844,8 +4846,13 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
>  		 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
>  		 * if it can't schedule.
>  		 */
> -		if (!(gfp_mask & __GFP_NOMEMALLOC))
> +		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
>  			alloc_flags |= ALLOC_HARDER;
> +
> +			if (order > 0)
> +				alloc_flags |= ALLOC_HIGHATOMIC;
> +		}
> +
>  		/*
>  		 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
>  		 * comment for __cpuset_node_allowed().
> @@ -5053,7 +5060,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
>  	 * kswapd needs to be woken up, and to avoid the cost of setting up
>  	 * alloc_flags precisely. So we do that now.
>  	 */
> -	alloc_flags = gfp_to_alloc_flags(gfp_mask);
> +	alloc_flags = gfp_to_alloc_flags(gfp_mask, order);
>  
>  	/*
>  	 * We need to recalculate the starting point for the zonelist iterator
> -- 
> 2.35.3
> 
>
Mel Gorman Dec. 5, 2022, 10:27 a.m. UTC | #2
On Mon, Dec 05, 2022 at 04:17:12PM +1100, NeilBrown wrote:
> 
> Hi Mel,
>  thanks a lot for doing this!

My pleasure.

>  I tried reviewing it but "HIGHATOMIC" is new to me and I quickly got
>  lost :-(

That's ok, HIGHATOMIC reserves are obscure and internal to the
allocator. They are almost as obscure as granting access to reserves
for RT tasks, the only difference being a lack of data on what sort of
RT tasks needed extra privileges back in the early 2000's, but I
happened to remember why highatomic reserves were introduced. They were
introduced when fragmentation avoidance started triggering high-order
atomic allocation failures that had "happened to work" by accident
before fragmentation avoidance existed (details in 0aaa29a56e4f). IIRC,
there was a storm of bugs, mostly on small embedded platforms where
devices without an IOMMU relied on high-order atomics to work, so a
small reserve was created. I was concerned your patch would trigger
this class of bug again, even though it might take a few years to show
up as embedded platforms tend to stay on old kernels for ages.

The main point of this patch is identifying these high-order atomic
allocations without relying on __GFP_ATOMIC, but it should not be
necessary to understand how high-order atomic reserves work. They still
work the same way; access is just granted differently.
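
As an illustration only (the caller here is hypothetical, not from the
series), a request would be classified like this once the patch is
applied:

	struct page *page;

	/*
	 * Order-2 atomic request: if it falls back to the slowpath,
	 * gfp_to_alloc_flags() now sets ALLOC_HIGHATOMIC, so it can
	 * dip into MIGRATE_HIGHATOMIC and, on success, may trigger
	 * reserve_highatomic_pageblock().
	 */
	page = alloc_pages(GFP_ATOMIC, 2);

	/*
	 * Order-0 atomic request: ALLOC_HARDER only, the highatomic
	 * reserve is skipped as before.
	 */
	page = alloc_pages(GFP_ATOMIC, 0);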

>  Maybe one day I'll work it out - now that several names are more
>  meaningful, it will likely be easier.
> 
> > @@ -4818,7 +4820,7 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
> >  }
> >  
> >  static inline unsigned int
> > -gfp_to_alloc_flags(gfp_t gfp_mask)
> > +gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
> >  {
> >  	unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
> >  
> > @@ -4844,8 +4846,13 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
> >  		 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
> >  		 * if it can't schedule.
> >  		 */
> > -		if (!(gfp_mask & __GFP_NOMEMALLOC))
> > +		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
> >  			alloc_flags |= ALLOC_HARDER;
> > +
> > +			if (order > 0)
> > +				alloc_flags |= ALLOC_HIGHATOMIC;
> > +		}
> > +
> >  		/*
> >  		 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
> >  		 * comment for __cpuset_node_allowed().

These are the two most crucial hunks of the patch. If the series is
merged and we start seeing high-order atomic allocation failures, the
key will be to look at the gfp_flags and determine whether these hunks
are enough to accurately detect high-order atomic allocations. If the
GFP flags look ok, then a tracing debug patch will be created to dump
gfp_flags every time access is granted to high-atomic reserves, to
determine whether access is granted incorrectly and under what
circumstances.
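
For what it's worth, a minimal sketch of one way such a debugging hunk
could look (hypothetical, not part of this series) in
gfp_to_alloc_flags():

	/*
	 * Hypothetical debugging aid: dump the gfp flags whenever
	 * ALLOC_HIGHATOMIC is granted so unexpected users of the
	 * highatomic reserve can be spotted. %pGg is the printk
	 * format for gfp_t flags.
	 */
	if (order > 0) {
		alloc_flags |= ALLOC_HIGHATOMIC;
		trace_printk("highatomic: order=%u gfp=%pGg\n",
			     order, &gfp_mask);
	}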

The main concern I had with your original patch was that it made it too
easy to grant access to high-atomic reserves for requests that were not
high-order atomic requests, which might exhaust the reserves. The rest
of the series tries to improve the naming of the ALLOC flags and
clarify what they mean. The actual changes to your patch are minimal.
I could have started with your patch and fixed it up, but I preferred
this ordering to reduce use of __GFP_ATOMIC first and then delete it.
It should be bisection safe.
Vlastimil Babka Dec. 8, 2022, 4:51 p.m. UTC | #3
On 11/29/22 16:16, Mel Gorman wrote:
> A high-order ALLOC_HARDER allocation is assumed to be atomic. While that
> is accurate, it changes later in the series. In preparation, explicitly
> record high-order atomic allocations in gfp_to_alloc_flags().
> 
> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> ---
>  mm/internal.h   |  1 +
>  mm/page_alloc.c | 19 +++++++++++++------
>  2 files changed, 14 insertions(+), 6 deletions(-)
> 
> diff --git a/mm/internal.h b/mm/internal.h
> index d503e57a57a1..9a9d9b5ee87f 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -754,6 +754,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
>  #else
>  #define ALLOC_NOFRAGMENT	  0x0
>  #endif
> +#define ALLOC_HIGHATOMIC	0x200 /* Allows access to MIGRATE_HIGHATOMIC */
>  #define ALLOC_KSWAPD		0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
>  
>  enum ttu_flags;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index da746e9eb2cf..e2b65767dda0 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3710,7 +3710,7 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
>  		 * reserved for high-order atomic allocation, so order-0
>  		 * request should skip it.
>  		 */
> -		if (order > 0 && alloc_flags & ALLOC_HARDER)
> +		if (alloc_flags & ALLOC_HIGHATOMIC)
>  			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
>  		if (!page) {
>  			page = __rmqueue(zone, order, migratetype, alloc_flags);
> @@ -4028,8 +4028,10 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
>  			return true;
>  		}
>  #endif
> -		if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
> +		if ((alloc_flags & ALLOC_HIGHATOMIC) &&
> +		    !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
>  			return true;

alloc_harder is defined as
	(alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
AFAICS this means we no longer allow ALLOC_OOM to use the highatomic
reserve. Isn't that a risk?

> +		}
>  	}
>  	return false;
>  }
> @@ -4291,7 +4293,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
>  			 * If this is a high-order atomic allocation then check
>  			 * if the pageblock should be reserved for the future
>  			 */
> -			if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
> +			if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
>  				reserve_highatomic_pageblock(page, zone, order);
>  
>  			return page;
> @@ -4818,7 +4820,7 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
>  }
>  
>  static inline unsigned int
> -gfp_to_alloc_flags(gfp_t gfp_mask)
> +gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
>  {
>  	unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
>  
> @@ -4844,8 +4846,13 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
>  		 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
>  		 * if it can't schedule.
>  		 */
> -		if (!(gfp_mask & __GFP_NOMEMALLOC))
> +		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
>  			alloc_flags |= ALLOC_HARDER;
> +
> +			if (order > 0)
> +				alloc_flags |= ALLOC_HIGHATOMIC;
> +		}
> +
>  		/*
>  		 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
>  		 * comment for __cpuset_node_allowed().
> @@ -5053,7 +5060,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
>  	 * kswapd needs to be woken up, and to avoid the cost of setting up
>  	 * alloc_flags precisely. So we do that now.
>  	 */
> -	alloc_flags = gfp_to_alloc_flags(gfp_mask);
> +	alloc_flags = gfp_to_alloc_flags(gfp_mask, order);
>  
>  	/*
>  	 * We need to recalculate the starting point for the zonelist iterator
Mel Gorman Jan. 4, 2023, 11:45 a.m. UTC | #4
First off, sorry for the long delay getting back to you. I was sick for
a few weeks and still catching up. I'm still not 100%.

On Thu, Dec 08, 2022 at 05:51:11PM +0100, Vlastimil Babka wrote:
> On 11/29/22 16:16, Mel Gorman wrote:
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index da746e9eb2cf..e2b65767dda0 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -3710,7 +3710,7 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
> >  		 * reserved for high-order atomic allocation, so order-0
> >  		 * request should skip it.
> >  		 */
> > -		if (order > 0 && alloc_flags & ALLOC_HARDER)
> > +		if (alloc_flags & ALLOC_HIGHATOMIC)
> >  			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
> >  		if (!page) {
> >  			page = __rmqueue(zone, order, migratetype, alloc_flags);
> > @@ -4028,8 +4028,10 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
> >  			return true;
> >  		}
> >  #endif
> > -		if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
> > +		if ((alloc_flags & ALLOC_HIGHATOMIC) &&
> > +		    !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
> >  			return true;
> 
> alloc_harder is defined as
> 	(alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
> AFAICS this means we no longer allow ALLOC_OOM to use the highatomic
> reserve. Isn't that a risk?
> 

Yes, it is. I intend to apply the patch below on top. I didn't alter the
first check for ALLOC_HIGHATOMIC as I wanted OOM handling to only use the
high-order reserves if there was no other option. While this is a change
in behaviour, it should be a harmless one. I'll add a note in the changelog.

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 50fc1e7cb154..0ef4f3236a5a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3710,6 +3710,16 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
 			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
 		if (!page) {
 			page = __rmqueue(zone, order, migratetype, alloc_flags);
+
+			/*
+			 * If the allocation fails, allow OOM handling access
+			 * to HIGHATOMIC reserves as failing now is worse than
+			 * failing a high-order atomic allocation in the
+			 * future.
+			 */
+			if (!page && (alloc_flags & ALLOC_OOM))
+				page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+
 			if (!page) {
 				spin_unlock_irqrestore(&zone->lock, flags);
 				return NULL;
@@ -4023,7 +4033,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 			return true;
 		}
 #endif
-		if ((alloc_flags & ALLOC_HIGHATOMIC) &&
+		if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) &&
 		    !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
 			return true;
 		}

Patch

diff --git a/mm/internal.h b/mm/internal.h
index d503e57a57a1..9a9d9b5ee87f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -754,6 +754,7 @@  unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 #else
 #define ALLOC_NOFRAGMENT	  0x0
 #endif
+#define ALLOC_HIGHATOMIC	0x200 /* Allows access to MIGRATE_HIGHATOMIC */
 #define ALLOC_KSWAPD		0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
 
 enum ttu_flags;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index da746e9eb2cf..e2b65767dda0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3710,7 +3710,7 @@  struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
 		 * reserved for high-order atomic allocation, so order-0
 		 * request should skip it.
 		 */
-		if (order > 0 && alloc_flags & ALLOC_HARDER)
+		if (alloc_flags & ALLOC_HIGHATOMIC)
 			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
 		if (!page) {
 			page = __rmqueue(zone, order, migratetype, alloc_flags);
@@ -4028,8 +4028,10 @@  bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 			return true;
 		}
 #endif
-		if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
+		if ((alloc_flags & ALLOC_HIGHATOMIC) &&
+		    !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
 			return true;
+		}
 	}
 	return false;
 }
@@ -4291,7 +4293,7 @@  get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 			 * If this is a high-order atomic allocation then check
 			 * if the pageblock should be reserved for the future
 			 */
-			if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
+			if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
 				reserve_highatomic_pageblock(page, zone, order);
 
 			return page;
@@ -4818,7 +4820,7 @@  static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
 }
 
 static inline unsigned int
-gfp_to_alloc_flags(gfp_t gfp_mask)
+gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
 {
 	unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
 
@@ -4844,8 +4846,13 @@  gfp_to_alloc_flags(gfp_t gfp_mask)
 		 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
 		 * if it can't schedule.
 		 */
-		if (!(gfp_mask & __GFP_NOMEMALLOC))
+		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 			alloc_flags |= ALLOC_HARDER;
+
+			if (order > 0)
+				alloc_flags |= ALLOC_HIGHATOMIC;
+		}
+
 		/*
 		 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
 		 * comment for __cpuset_node_allowed().
@@ -5053,7 +5060,7 @@  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * kswapd needs to be woken up, and to avoid the cost of setting up
 	 * alloc_flags precisely. So we do that now.
 	 */
-	alloc_flags = gfp_to_alloc_flags(gfp_mask);
+	alloc_flags = gfp_to_alloc_flags(gfp_mask, order);
 
 	/*
 	 * We need to recalculate the starting point for the zonelist iterator