diff mbox series

[RFC,5/5] mm, page_alloc: Introduce ZONELIST_FALLBACK_SAME_TYPE fallback list

Message ID 1556155295-77723-6-git-send-email-fan.du@intel.com (mailing list archive)
State New, archived
Headers show
Series New fallback workflow for heterogeneous memory system | expand

Commit Message

Du, Fan April 25, 2019, 1:21 a.m. UTC
On system with heterogeneous memory, reasonable fall back lists would be:
a. No fall back, stick to current running node.
b. Fall back to other nodes of the same type or different type
   e.g. DRAM node 0 -> DRAM node 1 -> PMEM node 2 -> PMEM node 3
c. Fall back to other nodes of the same type only.
   e.g. DRAM node 0 -> DRAM node 1

a. is already in place; the previous patch implements b., providing a way to
satisfy memory requests as best effort by default. This patch builds c.,
which falls back only to nodes of the same type, used when the caller
specifies GFP_SAME_NODE_TYPE.

Signed-off-by: Fan Du <fan.du@intel.com>
---
 include/linux/gfp.h    |  7 +++++++
 include/linux/mmzone.h |  1 +
 mm/page_alloc.c        | 15 +++++++++++++++
 3 files changed, 23 insertions(+)

Comments

Xishi Qiu April 25, 2019, 3:26 a.m. UTC | #1
Hi Fan Du,

I think we should change the print in mminit_verify_zonelist too.

This patch changes the order of ZONELIST_FALLBACK, so the default numa policy can
alloc DRAM first, then PMEM, right?

Thanks,
Xishi Qiu
>     On system with heterogeneous memory, reasonable fall back lists woul be:
>     a. No fall back, stick to current running node.
>     b. Fall back to other nodes of the same type or different type
>        e.g. DRAM node 0 -> DRAM node 1 -> PMEM node 2 -> PMEM node 3
>     c. Fall back to other nodes of the same type only.
>        e.g. DRAM node 0 -> DRAM node 1
> 
>     a. is already in place, previous patch implement b. providing way to
>     satisfy memory request as best effort by default. And this patch of
>     writing build c. to fallback to the same node type when user specify
>     GFP_SAME_NODE_TYPE only.
> 
>     Signed-off-by: Fan Du <fan.du@intel.com>
>     ---
>      include/linux/gfp.h    |  7 +++++++
>      include/linux/mmzone.h |  1 +
>      mm/page_alloc.c        | 15 +++++++++++++++
>      3 files changed, 23 insertions(+)
> 
>     diff --git a/include/linux/gfp.h b/include/linux/gfp.h
>     index fdab7de..ca5fdfc 100644
>     --- a/include/linux/gfp.h
>     +++ b/include/linux/gfp.h
>     @@ -44,6 +44,8 @@
>      #else
>      #define ___GFP_NOLOCKDEP 0
>      #endif
>     +#define ___GFP_SAME_NODE_TYPE 0x1000000u
>     +
>      /* If the above are modified, __GFP_BITS_SHIFT may need updating */
>      
>      /*
>     @@ -215,6 +217,7 @@
>      
>      /* Disable lockdep for GFP context tracking */
>      #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
>     +#define __GFP_SAME_NODE_TYPE ((__force gfp_t)___GFP_SAME_NODE_TYPE)
>      
>      /* Room for N __GFP_FOO bits */
>      #define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP))
>     @@ -301,6 +304,8 @@
>          __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
>      #define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
>      
>     +#define GFP_SAME_NODE_TYPE (__GFP_SAME_NODE_TYPE)
>     +
>      /* Convert GFP flags to their corresponding migrate type */
>      #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
>      #define GFP_MOVABLE_SHIFT 3
>     @@ -438,6 +443,8 @@ static inline int gfp_zonelist(gfp_t flags)
>      #ifdef CONFIG_NUMA
>       if (unlikely(flags & __GFP_THISNODE))
>        return ZONELIST_NOFALLBACK;
>     + if (unlikely(flags & __GFP_SAME_NODE_TYPE))
>     +  return ZONELIST_FALLBACK_SAME_TYPE;
>      #endif
>       return ZONELIST_FALLBACK;
>      }
>     diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>     index 8c37e1c..2f8603e 100644
>     --- a/include/linux/mmzone.h
>     +++ b/include/linux/mmzone.h
>     @@ -583,6 +583,7 @@ static inline bool zone_intersects(struct zone *zone,
>      
>      enum {
>       ZONELIST_FALLBACK, /* zonelist with fallback */
>     + ZONELIST_FALLBACK_SAME_TYPE, /* zonelist with fallback to the same type node */
>      #ifdef CONFIG_NUMA
>       /*
>        * The NUMA zonelists are doubled because we need zonelists that
>     diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>     index a408a91..de797921 100644
>     --- a/mm/page_alloc.c
>     +++ b/mm/page_alloc.c
>     @@ -5448,6 +5448,21 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
>       }
>       zonerefs->zone = NULL;
>       zonerefs->zone_idx = 0;
>     +
>     + zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK_SAME_TYPE]._zonerefs;
>     +
>     + for (i = 0; i < nr_nodes; i++) {
>     +  int nr_zones;
>     +
>     +  pg_data_t *node = NODE_DATA(node_order[i]);
>     +
>     +  if (!is_node_same_type(node->node_id, pgdat->node_id))
>     +   continue;
>     +  nr_zones = build_zonerefs_node(node, zonerefs);
>     +  zonerefs += nr_zones;
>     + }
>     + zonerefs->zone = NULL;
>     + zonerefs->zone_idx = 0;
>      }
>      
>      /*
>     -- 
>     1.8.3.1
> 
>
Michal Hocko April 25, 2019, 6:38 a.m. UTC | #2
On Thu 25-04-19 09:21:35, Fan Du wrote:
> On system with heterogeneous memory, reasonable fall back lists woul be:
> a. No fall back, stick to current running node.
> b. Fall back to other nodes of the same type or different type
>    e.g. DRAM node 0 -> DRAM node 1 -> PMEM node 2 -> PMEM node 3
> c. Fall back to other nodes of the same type only.
>    e.g. DRAM node 0 -> DRAM node 1
> 
> a. is already in place, previous patch implement b. providing way to
> satisfy memory request as best effort by default. And this patch of
> writing build c. to fallback to the same node type when user specify
> GFP_SAME_NODE_TYPE only.

So an immediate question which should be answered by this changelog. Who
is going to use the new gfp flag? Why cannot all allocations without an
explicit numa policy fallback to all existing nodes?
 
> Signed-off-by: Fan Du <fan.du@intel.com>
> ---
>  include/linux/gfp.h    |  7 +++++++
>  include/linux/mmzone.h |  1 +
>  mm/page_alloc.c        | 15 +++++++++++++++
>  3 files changed, 23 insertions(+)
> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index fdab7de..ca5fdfc 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -44,6 +44,8 @@
>  #else
>  #define ___GFP_NOLOCKDEP	0
>  #endif
> +#define ___GFP_SAME_NODE_TYPE	0x1000000u
> +
>  /* If the above are modified, __GFP_BITS_SHIFT may need updating */
>  
>  /*
> @@ -215,6 +217,7 @@
>  
>  /* Disable lockdep for GFP context tracking */
>  #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
> +#define __GFP_SAME_NODE_TYPE ((__force gfp_t)___GFP_SAME_NODE_TYPE)
>  
>  /* Room for N __GFP_FOO bits */
>  #define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP))
> @@ -301,6 +304,8 @@
>  			 __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
>  #define GFP_TRANSHUGE	(GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
>  
> +#define GFP_SAME_NODE_TYPE (__GFP_SAME_NODE_TYPE)
> +
>  /* Convert GFP flags to their corresponding migrate type */
>  #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
>  #define GFP_MOVABLE_SHIFT 3
> @@ -438,6 +443,8 @@ static inline int gfp_zonelist(gfp_t flags)
>  #ifdef CONFIG_NUMA
>  	if (unlikely(flags & __GFP_THISNODE))
>  		return ZONELIST_NOFALLBACK;
> +	if (unlikely(flags & __GFP_SAME_NODE_TYPE))
> +		return ZONELIST_FALLBACK_SAME_TYPE;
>  #endif
>  	return ZONELIST_FALLBACK;
>  }
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 8c37e1c..2f8603e 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -583,6 +583,7 @@ static inline bool zone_intersects(struct zone *zone,
>  
>  enum {
>  	ZONELIST_FALLBACK,	/* zonelist with fallback */
> +	ZONELIST_FALLBACK_SAME_TYPE,	/* zonelist with fallback to the same type node */
>  #ifdef CONFIG_NUMA
>  	/*
>  	 * The NUMA zonelists are doubled because we need zonelists that
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index a408a91..de797921 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5448,6 +5448,21 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
>  	}
>  	zonerefs->zone = NULL;
>  	zonerefs->zone_idx = 0;
> +
> +	zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK_SAME_TYPE]._zonerefs;
> +
> +	for (i = 0; i < nr_nodes; i++) {
> +		int nr_zones;
> +
> +		pg_data_t *node = NODE_DATA(node_order[i]);
> +
> +		if (!is_node_same_type(node->node_id, pgdat->node_id))
> +			continue;
> +		nr_zones = build_zonerefs_node(node, zonerefs);
> +		zonerefs += nr_zones;
> +	}
> +	zonerefs->zone = NULL;
> +	zonerefs->zone_idx = 0;
>  }
>  
>  /*
> -- 
> 1.8.3.1
>
Du, Fan April 25, 2019, 7:43 a.m. UTC | #3
>-----Original Message-----
>From: Michal Hocko [mailto:mhocko@kernel.org]
>Sent: Thursday, April 25, 2019 2:38 PM
>To: Du, Fan <fan.du@intel.com>
>Cc: akpm@linux-foundation.org; Wu, Fengguang <fengguang.wu@intel.com>;
>Williams, Dan J <dan.j.williams@intel.com>; Hansen, Dave
><dave.hansen@intel.com>; xishi.qiuxishi@alibaba-inc.com; Huang, Ying
><ying.huang@intel.com>; linux-mm@kvack.org; linux-kernel@vger.kernel.org
>Subject: Re: [RFC PATCH 5/5] mm, page_alloc: Introduce
>ZONELIST_FALLBACK_SAME_TYPE fallback list
>
>On Thu 25-04-19 09:21:35, Fan Du wrote:
>> On system with heterogeneous memory, reasonable fall back lists woul be:
>> a. No fall back, stick to current running node.
>> b. Fall back to other nodes of the same type or different type
>>    e.g. DRAM node 0 -> DRAM node 1 -> PMEM node 2 -> PMEM node 3
>> c. Fall back to other nodes of the same type only.
>>    e.g. DRAM node 0 -> DRAM node 1
>>
>> a. is already in place, previous patch implement b. providing way to
>> satisfy memory request as best effort by default. And this patch of
>> writing build c. to fallback to the same node type when user specify
>> GFP_SAME_NODE_TYPE only.
>
>So an immediate question which should be answered by this changelog. Who
>is going to use the new gfp flag? Why cannot all allocations without an
>explicit numa policy fallback to all existing nodes?

PMEM is good for frequently read accessed page, e.g. page cache(implicit page
request), or user space data base (explicit page request)

For now this patch create GFP_SAME_NODE_TYPE for such cases, additional
Implementation will be followed up.

For example:
a. Open file
b. Populate pagecache with PMEM page if user set O_RDONLY
c. Migrate frequently read accessed page to PMEM from DRAM,
  for cases w/o O_RDONLY.


>> Signed-off-by: Fan Du <fan.du@intel.com>
>> ---
>>  include/linux/gfp.h    |  7 +++++++
>>  include/linux/mmzone.h |  1 +
>>  mm/page_alloc.c        | 15 +++++++++++++++
>>  3 files changed, 23 insertions(+)
>>
>> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
>> index fdab7de..ca5fdfc 100644
>> --- a/include/linux/gfp.h
>> +++ b/include/linux/gfp.h
>> @@ -44,6 +44,8 @@
>>  #else
>>  #define ___GFP_NOLOCKDEP	0
>>  #endif
>> +#define ___GFP_SAME_NODE_TYPE	0x1000000u
>> +
>>  /* If the above are modified, __GFP_BITS_SHIFT may need updating */
>>
>>  /*
>> @@ -215,6 +217,7 @@
>>
>>  /* Disable lockdep for GFP context tracking */
>>  #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
>> +#define __GFP_SAME_NODE_TYPE ((__force
>gfp_t)___GFP_SAME_NODE_TYPE)
>>
>>  /* Room for N __GFP_FOO bits */
>>  #define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP))
>> @@ -301,6 +304,8 @@
>>  			 __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
>>  #define GFP_TRANSHUGE	(GFP_TRANSHUGE_LIGHT |
>__GFP_DIRECT_RECLAIM)
>>
>> +#define GFP_SAME_NODE_TYPE (__GFP_SAME_NODE_TYPE)
>> +
>>  /* Convert GFP flags to their corresponding migrate type */
>>  #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
>>  #define GFP_MOVABLE_SHIFT 3
>> @@ -438,6 +443,8 @@ static inline int gfp_zonelist(gfp_t flags)
>>  #ifdef CONFIG_NUMA
>>  	if (unlikely(flags & __GFP_THISNODE))
>>  		return ZONELIST_NOFALLBACK;
>> +	if (unlikely(flags & __GFP_SAME_NODE_TYPE))
>> +		return ZONELIST_FALLBACK_SAME_TYPE;
>>  #endif
>>  	return ZONELIST_FALLBACK;
>>  }
>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>> index 8c37e1c..2f8603e 100644
>> --- a/include/linux/mmzone.h
>> +++ b/include/linux/mmzone.h
>> @@ -583,6 +583,7 @@ static inline bool zone_intersects(struct zone *zone,
>>
>>  enum {
>>  	ZONELIST_FALLBACK,	/* zonelist with fallback */
>> +	ZONELIST_FALLBACK_SAME_TYPE,	/* zonelist with fallback to the
>same type node */
>>  #ifdef CONFIG_NUMA
>>  	/*
>>  	 * The NUMA zonelists are doubled because we need zonelists that
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index a408a91..de797921 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -5448,6 +5448,21 @@ static void
>build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
>>  	}
>>  	zonerefs->zone = NULL;
>>  	zonerefs->zone_idx = 0;
>> +
>> +	zonerefs =
>pgdat->node_zonelists[ZONELIST_FALLBACK_SAME_TYPE]._zonerefs;
>> +
>> +	for (i = 0; i < nr_nodes; i++) {
>> +		int nr_zones;
>> +
>> +		pg_data_t *node = NODE_DATA(node_order[i]);
>> +
>> +		if (!is_node_same_type(node->node_id, pgdat->node_id))
>> +			continue;
>> +		nr_zones = build_zonerefs_node(node, zonerefs);
>> +		zonerefs += nr_zones;
>> +	}
>> +	zonerefs->zone = NULL;
>> +	zonerefs->zone_idx = 0;
>>  }
>>
>>  /*
>> --
>> 1.8.3.1
>>
>
>--
>Michal Hocko
>SUSE Labs
Du, Fan April 25, 2019, 7:45 a.m. UTC | #4
>-----Original Message-----
>From: owner-linux-mm@kvack.org [mailto:owner-linux-mm@kvack.org] On
>Behalf Of Xishi Qiu
>Sent: Thursday, April 25, 2019 11:26 AM
>To: Wu, Fengguang <fengguang.wu@intel.com>; Du, Fan <fan.du@intel.com>
>Cc: akpm@linux-foundation.org; Michal Hocko <mhocko@suse.com>;
>Williams, Dan J <dan.j.williams@intel.com>; Hansen, Dave
><dave.hansen@intel.com>; Huang, Ying <ying.huang@intel.com>;
>linux-mm@kvack.org; Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
>Subject: Re: [RFC PATCH 5/5] mm, page_alloc: Introduce
>ZONELIST_FALLBACK_SAME_TYPE fallback list
>
>Hi Fan Du,
>
>I think we should change the print in mminit_verify_zonelist too.
>
>This patch changes the order of ZONELIST_FALLBACK, so the default numa
>policy can
>alloc DRAM first, then PMEM, right?

Yes, you are right. :)

>Thanks,
>Xishi Qiu
>>     On system with heterogeneous memory, reasonable fall back lists woul be:
>>     a. No fall back, stick to current running node.
>>     b. Fall back to other nodes of the same type or different type
>>        e.g. DRAM node 0 -> DRAM node 1 -> PMEM node 2 -> PMEM node 3
>>     c. Fall back to other nodes of the same type only.
>>        e.g. DRAM node 0 -> DRAM node 1
>> 
>>     a. is already in place, previous patch implement b. providing way to
>>     satisfy memory request as best effort by default. And this patch of
>>     writing build c. to fallback to the same node type when user specify
>>     GFP_SAME_NODE_TYPE only.
>>
>>     Signed-off-by: Fan Du <fan.du@intel.com>
>>     ---
>>      include/linux/gfp.h    |  7 +++++++
>>      include/linux/mmzone.h |  1 +
>>      mm/page_alloc.c        | 15 +++++++++++++++
>>      3 files changed, 23 insertions(+)
>>
>>     diff --git a/include/linux/gfp.h b/include/linux/gfp.h
>>     index fdab7de..ca5fdfc 100644
>>     --- a/include/linux/gfp.h
>>     +++ b/include/linux/gfp.h
>>     @@ -44,6 +44,8 @@
>>      #else
>>      #define ___GFP_NOLOCKDEP 0
>>      #endif
>>     +#define ___GFP_SAME_NODE_TYPE 0x1000000u
>>     +
>>      /* If the above are modified, __GFP_BITS_SHIFT may need updating */
>>
>>      /*
>>     @@ -215,6 +217,7 @@
>>
>>      /* Disable lockdep for GFP context tracking */
>>      #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
>>     +#define __GFP_SAME_NODE_TYPE ((__force gfp_t)___GFP_SAME_NODE_TYPE)
>>
>>      /* Room for N __GFP_FOO bits */
>>      #define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP))
>>     @@ -301,6 +304,8 @@
>>          __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
>>      #define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
>>
>>     +#define GFP_SAME_NODE_TYPE (__GFP_SAME_NODE_TYPE)
>>     +
>>      /* Convert GFP flags to their corresponding migrate type */
>>      #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVA
>BLE)
>>      #define GFP_MOVABLE_SHIFT 3
>>     @@ -438,6 +443,8 @@ static inline int gfp_zonelist(gfp_t flags)
>>      #ifdef CONFIG_NUMA
>>       if (unlikely(flags & __GFP_THISNODE))
>>        return ZONELIST_NOFALLBACK;
>>     + if (unlikely(flags & __GFP_SAME_NODE_TYPE))
>>     +  return ZONELIST_FALLBACK_SAME_TYPE;
>>      #endif
>>       return ZONELIST_FALLBACK;
>>      }
>>     diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>     index 8c37e1c..2f8603e 100644
>>     --- a/include/linux/mmzone.h
>>     +++ b/include/linux/mmzone.h
>>     @@ -583,6 +583,7 @@ static inline bool zone_intersects(struct zone *zone,
>>
>>      enum {
>>       ZONELIST_FALLBACK, /* zonelist with fallback */
>>     + ZONELIST_FALLBACK_SAME_TYPE, /* zonelist with fallback to the same type node */
>>      #ifdef CONFIG_NUMA
>>       /*
>>        * The NUMA zonelists are doubled because we need zonelists that
>>     diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>     index a408a91..de797921 100644
>>     --- a/mm/page_alloc.c
>>     +++ b/mm/page_alloc.c
>>     @@ -5448,6 +5448,21 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
>>       }
>>       zonerefs->zone = NULL;
>>       zonerefs->zone_idx = 0;
>>     +
>>     + zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK_SAME_TYPE]._zonerefs;
>>     +
>>     + for (i = 0; i < nr_nodes; i++) {
>>     +  int nr_zones;
>>     +
>>     +  pg_data_t *node = NODE_DATA(node_order[i]);
>>     +
>>     +  if (!is_node_same_type(node->node_id, pgdat->node_id))
>>     +   continue;
>>     +  nr_zones = build_zonerefs_node(node, zonerefs);
>>     +  zonerefs += nr_zones;
>>     + }
>>     + zonerefs->zone = NULL;
>>     + zonerefs->zone_idx = 0;
>>      }
>>
>>      /*
>>     --
>>     1.8.3.1
>>
>>
Michal Hocko April 25, 2019, 7:48 a.m. UTC | #5
On Thu 25-04-19 07:43:09, Du, Fan wrote:
> 
> 
> >-----Original Message-----
> >From: Michal Hocko [mailto:mhocko@kernel.org]
> >Sent: Thursday, April 25, 2019 2:38 PM
> >To: Du, Fan <fan.du@intel.com>
> >Cc: akpm@linux-foundation.org; Wu, Fengguang <fengguang.wu@intel.com>;
> >Williams, Dan J <dan.j.williams@intel.com>; Hansen, Dave
> ><dave.hansen@intel.com>; xishi.qiuxishi@alibaba-inc.com; Huang, Ying
> ><ying.huang@intel.com>; linux-mm@kvack.org; linux-kernel@vger.kernel.org
> >Subject: Re: [RFC PATCH 5/5] mm, page_alloc: Introduce
> >ZONELIST_FALLBACK_SAME_TYPE fallback list
> >
> >On Thu 25-04-19 09:21:35, Fan Du wrote:
> >> On system with heterogeneous memory, reasonable fall back lists woul be:
> >> a. No fall back, stick to current running node.
> >> b. Fall back to other nodes of the same type or different type
> >>    e.g. DRAM node 0 -> DRAM node 1 -> PMEM node 2 -> PMEM node 3
> >> c. Fall back to other nodes of the same type only.
> >>    e.g. DRAM node 0 -> DRAM node 1
> >>
> >> a. is already in place, previous patch implement b. providing way to
> >> satisfy memory request as best effort by default. And this patch of
> >> writing build c. to fallback to the same node type when user specify
> >> GFP_SAME_NODE_TYPE only.
> >
> >So an immediate question which should be answered by this changelog. Who
> >is going to use the new gfp flag? Why cannot all allocations without an
> >explicit numa policy fallback to all existing nodes?
> 
> PMEM is good for frequently read accessed page, e.g. page cache(implicit page
> request), or user space data base (explicit page request)
> For now this patch create GFP_SAME_NODE_TYPE for such cases, additional
> Implementation will be followed up.

Then simply configure that NUMA node as movable and you get these
allocations for any movable allocation. I am not really convinced a new
gfp flag is really justified.
Du, Fan April 25, 2019, 7:55 a.m. UTC | #6
>-----Original Message-----
>From: owner-linux-mm@kvack.org [mailto:owner-linux-mm@kvack.org] On
>Behalf Of Michal Hocko
>Sent: Thursday, April 25, 2019 3:49 PM
>To: Du, Fan <fan.du@intel.com>
>Cc: akpm@linux-foundation.org; Wu, Fengguang <fengguang.wu@intel.com>;
>Williams, Dan J <dan.j.williams@intel.com>; Hansen, Dave
><dave.hansen@intel.com>; xishi.qiuxishi@alibaba-inc.com; Huang, Ying
><ying.huang@intel.com>; linux-mm@kvack.org; linux-kernel@vger.kernel.org
>Subject: Re: [RFC PATCH 5/5] mm, page_alloc: Introduce
>ZONELIST_FALLBACK_SAME_TYPE fallback list
>
>On Thu 25-04-19 07:43:09, Du, Fan wrote:
>>
>>
>> >-----Original Message-----
>> >From: Michal Hocko [mailto:mhocko@kernel.org]
>> >Sent: Thursday, April 25, 2019 2:38 PM
>> >To: Du, Fan <fan.du@intel.com>
>> >Cc: akpm@linux-foundation.org; Wu, Fengguang
><fengguang.wu@intel.com>;
>> >Williams, Dan J <dan.j.williams@intel.com>; Hansen, Dave
>> ><dave.hansen@intel.com>; xishi.qiuxishi@alibaba-inc.com; Huang, Ying
>> ><ying.huang@intel.com>; linux-mm@kvack.org;
>linux-kernel@vger.kernel.org
>> >Subject: Re: [RFC PATCH 5/5] mm, page_alloc: Introduce
>> >ZONELIST_FALLBACK_SAME_TYPE fallback list
>> >
>> >On Thu 25-04-19 09:21:35, Fan Du wrote:
>> >> On system with heterogeneous memory, reasonable fall back lists woul
>be:
>> >> a. No fall back, stick to current running node.
>> >> b. Fall back to other nodes of the same type or different type
>> >>    e.g. DRAM node 0 -> DRAM node 1 -> PMEM node 2 -> PMEM node
>3
>> >> c. Fall back to other nodes of the same type only.
>> >>    e.g. DRAM node 0 -> DRAM node 1
>> >>
>> >> a. is already in place, previous patch implement b. providing way to
>> >> satisfy memory request as best effort by default. And this patch of
>> >> writing build c. to fallback to the same node type when user specify
>> >> GFP_SAME_NODE_TYPE only.
>> >
>> >So an immediate question which should be answered by this changelog.
>Who
>> >is going to use the new gfp flag? Why cannot all allocations without an
>> >explicit numa policy fallback to all existing nodes?
>>
>> PMEM is good for frequently read accessed page, e.g. page cache(implicit
>page
>> request), or user space data base (explicit page request)
>> For now this patch create GFP_SAME_NODE_TYPE for such cases, additional
>> Implementation will be followed up.
>
>Then simply configure that NUMA node as movable and you get these
>allocations for any movable allocation. I am not really convinced a new
>gfp flag is really justified.

Case 1: frequently write and/or read accessed page deserved to DRAM
Case 2: frequently read accessed page deserved to PMEM

We need something like a new gfp flag to sort above two cases out
From each other.

>--
>Michal Hocko
>SUSE Labs
Michal Hocko April 25, 2019, 8:09 a.m. UTC | #7
On Thu 25-04-19 07:55:58, Du, Fan wrote:
> >> PMEM is good for frequently read accessed page, e.g. page cache(implicit
> >> page
> >> request), or user space data base (explicit page request)
> >> For now this patch create GFP_SAME_NODE_TYPE for such cases, additional
> >> Implementation will be followed up.
> >
> >Then simply configure that NUMA node as movable and you get these
> >allocations for any movable allocation. I am not really convinced a new
> >gfp flag is really justified.
> 
> Case 1: frequently write and/or read accessed page deserved to DRAM

NUMA balancing

> Case 2: frequently read accessed page deserved to PMEM

memory reclaim to move those pages to a more distant node (e.g. a PMEM).

Btw. none of the above is a static thing you would easily know at the
allocation time.

Please spare some time reading surrounding discussions - e.g.
http://lkml.kernel.org/r/1554955019-29472-1-git-send-email-yang.shi@linux.alibaba.com
Du, Fan April 25, 2019, 8:20 a.m. UTC | #8
>-----Original Message-----
>From: owner-linux-mm@kvack.org [mailto:owner-linux-mm@kvack.org] On
>Behalf Of Michal Hocko
>Sent: Thursday, April 25, 2019 4:10 PM
>To: Du, Fan <fan.du@intel.com>
>Cc: akpm@linux-foundation.org; Wu, Fengguang <fengguang.wu@intel.com>;
>Williams, Dan J <dan.j.williams@intel.com>; Hansen, Dave
><dave.hansen@intel.com>; xishi.qiuxishi@alibaba-inc.com; Huang, Ying
><ying.huang@intel.com>; linux-mm@kvack.org; linux-kernel@vger.kernel.org
>Subject: Re: [RFC PATCH 5/5] mm, page_alloc: Introduce
>ZONELIST_FALLBACK_SAME_TYPE fallback list
>
>On Thu 25-04-19 07:55:58, Du, Fan wrote:
>> >> PMEM is good for frequently read accessed page, e.g. page cache(implicit
>> >> page
>> >> request), or user space data base (explicit page request)
>> >> For now this patch create GFP_SAME_NODE_TYPE for such cases,
>additional
>> >> Implementation will be followed up.
>> >
>> >Then simply configure that NUMA node as movable and you get these
>> >allocations for any movable allocation. I am not really convinced a new
>> >gfp flag is really justified.
>>
>> Case 1: frequently write and/or read accessed page deserved to DRAM
>
>NUMA balancing

Sorry, I mean page cache case here.
Numa balancing works for pages mapped in pagetable style.

>> Case 2: frequently read accessed page deserved to PMEM
>
>memory reclaim to move those pages to a more distant node (e.g. a PMEM).
>
>Btw. none of the above is a static thing you would easily know at the
>allocation time.
>
>Please spare some time reading surrounding discussions - e.g.
>http://lkml.kernel.org/r/1554955019-29472-1-git-send-email-yang.shi@linux.a
>libaba.com

Thanks for the point.

>Michal Hocko
>SUSE Labs
Michal Hocko April 25, 2019, 8:43 a.m. UTC | #9
On Thu 25-04-19 08:20:28, Du, Fan wrote:
> 
> 
> >-----Original Message-----
> >From: owner-linux-mm@kvack.org [mailto:owner-linux-mm@kvack.org] On
> >Behalf Of Michal Hocko
> >Sent: Thursday, April 25, 2019 4:10 PM
> >To: Du, Fan <fan.du@intel.com>
> >Cc: akpm@linux-foundation.org; Wu, Fengguang <fengguang.wu@intel.com>;
> >Williams, Dan J <dan.j.williams@intel.com>; Hansen, Dave
> ><dave.hansen@intel.com>; xishi.qiuxishi@alibaba-inc.com; Huang, Ying
> ><ying.huang@intel.com>; linux-mm@kvack.org; linux-kernel@vger.kernel.org
> >Subject: Re: [RFC PATCH 5/5] mm, page_alloc: Introduce
> >ZONELIST_FALLBACK_SAME_TYPE fallback list
> >
> >On Thu 25-04-19 07:55:58, Du, Fan wrote:
> >> >> PMEM is good for frequently read accessed page, e.g. page cache(implicit
> >> >> page
> >> >> request), or user space data base (explicit page request)
> >> >> For now this patch create GFP_SAME_NODE_TYPE for such cases,
> >additional
> >> >> Implementation will be followed up.
> >> >
> >> >Then simply configure that NUMA node as movable and you get these
> >> >allocations for any movable allocation. I am not really convinced a new
> >> >gfp flag is really justified.
> >>
> >> Case 1: frequently write and/or read accessed page deserved to DRAM
> >
> >NUMA balancing
> 
> Sorry, I mean page cache case here.
> Numa balancing works for pages mapped in pagetable style.

I would still expect that a remote PMEM node access latency is
smaller/comparable to the real storage so a promoting part is not that
important for the unmapped pagecache. Maybe I am wrong here but that
really begs for some experiments before we start adding special casing.
Du, Fan April 25, 2019, 9:18 a.m. UTC | #10
>-----Original Message-----
>From: owner-linux-mm@kvack.org [mailto:owner-linux-mm@kvack.org] On
>Behalf Of Michal Hocko
>Sent: Thursday, April 25, 2019 4:43 PM
>To: Du, Fan <fan.du@intel.com>
>Cc: akpm@linux-foundation.org; Wu, Fengguang <fengguang.wu@intel.com>;
>Williams, Dan J <dan.j.williams@intel.com>; Hansen, Dave
><dave.hansen@intel.com>; xishi.qiuxishi@alibaba-inc.com; Huang, Ying
><ying.huang@intel.com>; linux-mm@kvack.org; linux-kernel@vger.kernel.org
>Subject: Re: [RFC PATCH 5/5] mm, page_alloc: Introduce
>ZONELIST_FALLBACK_SAME_TYPE fallback list
>
>On Thu 25-04-19 08:20:28, Du, Fan wrote:
>>
>>
>> >-----Original Message-----
>> >From: owner-linux-mm@kvack.org [mailto:owner-linux-mm@kvack.org] On
>> >Behalf Of Michal Hocko
>> >Sent: Thursday, April 25, 2019 4:10 PM
>> >To: Du, Fan <fan.du@intel.com>
>> >Cc: akpm@linux-foundation.org; Wu, Fengguang
><fengguang.wu@intel.com>;
>> >Williams, Dan J <dan.j.williams@intel.com>; Hansen, Dave
>> ><dave.hansen@intel.com>; xishi.qiuxishi@alibaba-inc.com; Huang, Ying
>> ><ying.huang@intel.com>; linux-mm@kvack.org;
>linux-kernel@vger.kernel.org
>> >Subject: Re: [RFC PATCH 5/5] mm, page_alloc: Introduce
>> >ZONELIST_FALLBACK_SAME_TYPE fallback list
>> >
>> >On Thu 25-04-19 07:55:58, Du, Fan wrote:
>> >> >> PMEM is good for frequently read accessed page, e.g. page
>cache(implicit
>> >> >> page
>> >> >> request), or user space data base (explicit page request)
>> >> >> For now this patch create GFP_SAME_NODE_TYPE for such cases,
>> >additional
>> >> >> Implementation will be followed up.
>> >> >
>> >> >Then simply configure that NUMA node as movable and you get these
>> >> >allocations for any movable allocation. I am not really convinced a new
>> >> >gfp flag is really justified.
>> >>
>> >> Case 1: frequently write and/or read accessed page deserved to DRAM
>> >
>> >NUMA balancing
>>
>> Sorry, I mean page cache case here.
>> Numa balancing works for pages mapped in pagetable style.
>
>I would still expect that a remote PMEM node access latency is
>smaller/comparable to the real storage so a promoting part is not that
>important for the unmapped pagecache. Maybe I am wrong here but that
>really begs for some experiments before we start adding special casing.

I understand your concern :), please refer to following summary from 3rd party.
https://arxiv.org/pdf/1903.05714.pdf


>--
>Michal Hocko
>SUSE Labs
diff mbox series

Patch

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index fdab7de..ca5fdfc 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -44,6 +44,8 @@ 
 #else
 #define ___GFP_NOLOCKDEP	0
 #endif
+#define ___GFP_SAME_NODE_TYPE	0x1000000u
+
 /* If the above are modified, __GFP_BITS_SHIFT may need updating */
 
 /*
@@ -215,6 +217,7 @@ 
 
 /* Disable lockdep for GFP context tracking */
 #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
+#define __GFP_SAME_NODE_TYPE ((__force gfp_t)___GFP_SAME_NODE_TYPE)
 
 /* Room for N __GFP_FOO bits */
 #define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP))
@@ -301,6 +304,8 @@ 
 			 __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
 #define GFP_TRANSHUGE	(GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
 
+#define GFP_SAME_NODE_TYPE (__GFP_SAME_NODE_TYPE)
+
 /* Convert GFP flags to their corresponding migrate type */
 #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
 #define GFP_MOVABLE_SHIFT 3
@@ -438,6 +443,8 @@  static inline int gfp_zonelist(gfp_t flags)
 #ifdef CONFIG_NUMA
 	if (unlikely(flags & __GFP_THISNODE))
 		return ZONELIST_NOFALLBACK;
+	if (unlikely(flags & __GFP_SAME_NODE_TYPE))
+		return ZONELIST_FALLBACK_SAME_TYPE;
 #endif
 	return ZONELIST_FALLBACK;
 }
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8c37e1c..2f8603e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -583,6 +583,7 @@  static inline bool zone_intersects(struct zone *zone,
 
 enum {
 	ZONELIST_FALLBACK,	/* zonelist with fallback */
+	ZONELIST_FALLBACK_SAME_TYPE,	/* zonelist with fallback to the same type node */
 #ifdef CONFIG_NUMA
 	/*
 	 * The NUMA zonelists are doubled because we need zonelists that
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a408a91..de797921 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5448,6 +5448,21 @@  static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
 	}
 	zonerefs->zone = NULL;
 	zonerefs->zone_idx = 0;
+
+	zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK_SAME_TYPE]._zonerefs;
+
+	for (i = 0; i < nr_nodes; i++) {
+		int nr_zones;
+
+		pg_data_t *node = NODE_DATA(node_order[i]);
+
+		if (!is_node_same_type(node->node_id, pgdat->node_id))
+			continue;
+		nr_zones = build_zonerefs_node(node, zonerefs);
+		zonerefs += nr_zones;
+	}
+	zonerefs->zone = NULL;
+	zonerefs->zone_idx = 0;
 }
 
 /*