diff mbox series

[RFC] mm/mempolicy: add MPOL_PREFERRED_STRICT memory policy

Message ID 20211013094539.962357-1-aneesh.kumar@linux.ibm.com (mailing list archive)
State New
Headers show
Series [RFC] mm/mempolicy: add MPOL_PREFERRED_STRICT memory policy | expand

Commit Message

Aneesh Kumar K.V Oct. 13, 2021, 9:45 a.m. UTC
This mempolicy mode can be used with either the set_mempolicy(2)
or mbind(2) interfaces.  Like the MPOL_PREFERRED interface, it
allows an application to set a preference node from which the kernel
will fulfill memory allocation requests. Unlike the MPOL_PREFERRED mode,
it takes a set of nodes. The nodes in the nodemask are used as fallback
allocation nodes if memory is not available on the preferred node.
Unlike MPOL_PREFERRED_MANY, it will not fall back memory allocations
to all nodes in the system. Like the MPOL_BIND interface, it works over a
set of nodes and will cause a SIGSEGV or invoke the OOM killer if
memory is not available on those preferred nodes.

This patch helps applications to hint a memory allocation preference node
and fallback to _only_ a set of nodes if the memory is not available
on the preferred node.  Fallback allocation is attempted from the node which is
nearest to the preferred node.

This new memory policy helps applications to have explicit control on slow
memory allocation and avoids default fallback to slow memory NUMA nodes.
The difference with MPOL_BIND is the ability to specify a preferred node
which is the first node in the nodemask argument passed.

Cc: Ben Widawsky <ben.widawsky@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Huang Ying <ying.huang@intel.com>

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 .../admin-guide/mm/numa_memory_policy.rst     |  7 +++
 include/uapi/linux/mempolicy.h                |  1 +
 mm/mempolicy.c                                | 43 +++++++++++++++++--
 3 files changed, 48 insertions(+), 3 deletions(-)

Comments

Michal Hocko Oct. 13, 2021, 10:42 a.m. UTC | #1
[Cc linux-api]

On Wed 13-10-21 15:15:39, Aneesh Kumar K.V wrote:
> This mempolicy mode can be used with either the set_mempolicy(2)
> or mbind(2) interfaces.  Like the MPOL_PREFERRED interface, it
> allows an application to set a preference node from which the kernel
> will fulfill memory allocation requests. Unlike the MPOL_PREFERRED mode,
> it takes a set of nodes. The nodes in the nodemask are used as fallback
> allocation nodes if memory is not available on the preferred node.
> Unlike MPOL_PREFERRED_MANY, it will not fall back memory allocations
> to all nodes in the system. Like the MPOL_BIND interface, it works over a
> set of nodes and will cause a SIGSEGV or invoke the OOM killer if
> memory is not available on those preferred nodes.
> 
> This patch helps applications to hint a memory allocation preference node
> and fallback to _only_ a set of nodes if the memory is not available
> on the preferred node.  Fallback allocation is attempted from the node which is
> nearest to the preferred node.
> 
> This new memory policy helps applications to have explicit control on slow
> memory allocation and avoids default fallback to slow memory NUMA nodes.
> The difference with MPOL_BIND is the ability to specify a preferred node
> which is the first node in the nodemask argument passed.
> 
> Cc: Ben Widawsky <ben.widawsky@intel.com>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: Feng Tang <feng.tang@intel.com>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Mel Gorman <mgorman@techsingularity.net>
> Cc: Mike Kravetz <mike.kravetz@oracle.com>
> Cc: Randy Dunlap <rdunlap@infradead.org>
> Cc: Vlastimil Babka <vbabka@suse.cz>
> Cc: Andi Kleen <ak@linux.intel.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Huang Ying <ying.huang@intel.com>b
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  .../admin-guide/mm/numa_memory_policy.rst     |  7 +++
>  include/uapi/linux/mempolicy.h                |  1 +
>  mm/mempolicy.c                                | 43 +++++++++++++++++--
>  3 files changed, 48 insertions(+), 3 deletions(-)
> 
> diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
> index 64fd0ba0d057..4dfdcbd22d67 100644
> --- a/Documentation/admin-guide/mm/numa_memory_policy.rst
> +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
> @@ -252,6 +252,13 @@ MPOL_PREFERRED_MANY
>  	can fall back to all existing numa nodes. This is effectively
>  	MPOL_PREFERRED allowed for a mask rather than a single node.
>  
> +MPOL_PREFERRED_STRICT
> +	This mode specifies that the allocation should be attempted
> +	from the first node specified in the nodemask of the policy.
> +	If that allocation fails, the kernel will search other nodes
> +	in the nodemask, in order of increasing distance from the
> +	preferred node based on information provided by the platform firmware.
> +
>  NUMA memory policy supports the following optional mode flags:
>  
>  MPOL_F_STATIC_NODES
> diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
> index 046d0ccba4cd..8aa1d1963235 100644
> --- a/include/uapi/linux/mempolicy.h
> +++ b/include/uapi/linux/mempolicy.h
> @@ -23,6 +23,7 @@ enum {
>  	MPOL_INTERLEAVE,
>  	MPOL_LOCAL,
>  	MPOL_PREFERRED_MANY,
> +	MPOL_PREFERRED_STRICT,
>  	MPOL_MAX,	/* always last member of enum */
>  };
>  
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 1592b081c58e..59080dd1ea69 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -407,6 +407,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
>  		.create = mpol_new_nodemask,
>  		.rebind = mpol_rebind_preferred,
>  	},
> +	[MPOL_PREFERRED_STRICT] = {
> +		.create = mpol_new_nodemask,
> +		.rebind = mpol_rebind_preferred,
> +	},
>  };
>  
>  static int migrate_page_add(struct page *page, struct list_head *pagelist,
> @@ -900,6 +904,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
>  	case MPOL_INTERLEAVE:
>  	case MPOL_PREFERRED:
>  	case MPOL_PREFERRED_MANY:
> +	case MPOL_PREFERRED_STRICT:
>  		*nodes = p->nodes;
>  		break;
>  	case MPOL_LOCAL:
> @@ -1781,7 +1786,7 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
>  		cpuset_nodemask_valid_mems_allowed(&policy->nodes))
>  		return &policy->nodes;
>  
> -	if (mode == MPOL_PREFERRED_MANY)
> +	if (mode == MPOL_PREFERRED_MANY || mode == MPOL_PREFERRED_STRICT)
>  		return &policy->nodes;
>  
>  	return NULL;
> @@ -1796,7 +1801,7 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
>   */
>  static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
>  {
> -	if (policy->mode == MPOL_PREFERRED) {
> +	if (policy->mode == MPOL_PREFERRED || policy->mode == MPOL_PREFERRED_STRICT) {
>  		nd = first_node(policy->nodes);
>  	} else {
>  		/*
> @@ -1840,6 +1845,7 @@ unsigned int mempolicy_slab_node(void)
>  
>  	switch (policy->mode) {
>  	case MPOL_PREFERRED:
> +	case MPOL_PREFERRED_STRICT:
>  		return first_node(policy->nodes);
>  
>  	case MPOL_INTERLEAVE:
> @@ -1952,7 +1958,8 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
>  					huge_page_shift(hstate_vma(vma)));
>  	} else {
>  		nid = policy_node(gfp_flags, *mpol, numa_node_id());
> -		if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
> +		if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY ||
> +			mode == MPOL_PREFERRED_STRICT)
>  			*nodemask = &(*mpol)->nodes;
>  	}
>  	return nid;
> @@ -1986,6 +1993,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
>  	switch (mempolicy->mode) {
>  	case MPOL_PREFERRED:
>  	case MPOL_PREFERRED_MANY:
> +	case MPOL_PREFERRED_STRICT:
>  	case MPOL_BIND:
>  	case MPOL_INTERLEAVE:
>  		*mask = mempolicy->nodes;
> @@ -2072,6 +2080,23 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
>  	return page;
>  }
>  
> +static struct page *alloc_pages_preferred_strict(gfp_t gfp, unsigned int order,
> +						 struct mempolicy *pol)
> +{
> +	int nid;
> +	gfp_t preferred_gfp;
> +
> +	/*
> +	 * With MPOL_PREFERRED_STRICT first node in the policy nodemask
> +	 * is picked as the preferred node id and the fallback allocation
> +	 * is still restricted to the preferred nodes in the nodemask.
> +	 */
> +	preferred_gfp = gfp | __GFP_NOWARN;
> +	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
> +	nid = first_node(pol->nodes);
> +	return __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
> +}
> +
>  /**
>   * alloc_pages_vma - Allocate a page for a VMA.
>   * @gfp: GFP flags.
> @@ -2113,6 +2138,12 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
>  		goto out;
>  	}
>  
> +	if (pol->mode == MPOL_PREFERRED_STRICT) {
> +		page = alloc_pages_preferred_strict(gfp, order, pol);
> +		mpol_cond_put(pol);
> +		goto out;
> +	}
> +
>  	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
>  		int hpage_node = node;
>  
> @@ -2193,6 +2224,8 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
>  	else if (pol->mode == MPOL_PREFERRED_MANY)
>  		page = alloc_pages_preferred_many(gfp, order,
>  				numa_node_id(), pol);
> +	else if (pol->mode == MPOL_PREFERRED_STRICT)
> +		page = alloc_pages_preferred_strict(gfp, order, pol);
>  	else
>  		page = __alloc_pages(gfp, order,
>  				policy_node(gfp, pol, numa_node_id()),
> @@ -2265,6 +2298,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
>  	case MPOL_INTERLEAVE:
>  	case MPOL_PREFERRED:
>  	case MPOL_PREFERRED_MANY:
> +	case MPOL_PREFERRED_STRICT:
>  		return !!nodes_equal(a->nodes, b->nodes);
>  	case MPOL_LOCAL:
>  		return true;
> @@ -2405,6 +2439,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
>  		break;
>  
>  	case MPOL_PREFERRED:
> +	case MPOL_PREFERRED_STRICT:
>  		if (node_isset(curnid, pol->nodes))
>  			goto out;
>  		polnid = first_node(pol->nodes);
> @@ -2866,6 +2901,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
>  			err = 0;
>  		goto out;
>  	case MPOL_PREFERRED_MANY:
> +	case MPOL_PREFERRED_STRICT:
>  	case MPOL_BIND:
>  		/*
>  		 * Insist on a nodelist
> @@ -2953,6 +2989,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
>  		break;
>  	case MPOL_PREFERRED:
>  	case MPOL_PREFERRED_MANY:
> +	case MPOL_PREFERRED_STRICT:
>  	case MPOL_BIND:
>  	case MPOL_INTERLEAVE:
>  		nodes = pol->nodes;
> -- 
> 2.31.1
Michal Hocko Oct. 13, 2021, 10:48 a.m. UTC | #2
On Wed 13-10-21 12:42:34, Michal Hocko wrote:
> [Cc linux-api]
> 
> On Wed 13-10-21 15:15:39, Aneesh Kumar K.V wrote:
> > This mempolicy mode can be used with either the set_mempolicy(2)
> > or mbind(2) interfaces.  Like the MPOL_PREFERRED interface, it
> > allows an application to set a preference node from which the kernel
> > will fulfill memory allocation requests. Unlike the MPOL_PREFERRED mode,
> > it takes a set of nodes. The nodes in the nodemask are used as fallback
> > allocation nodes if memory is not available on the preferred node.
> > Unlike MPOL_PREFERRED_MANY, it will not fall back memory allocations
> > to all nodes in the system. Like the MPOL_BIND interface, it works over a
> > set of nodes and will cause a SIGSEGV or invoke the OOM killer if
> > memory is not available on those preferred nodes.
> > 
> > This patch helps applications to hint a memory allocation preference node
> > and fallback to _only_ a set of nodes if the memory is not available
> > on the preferred node.  Fallback allocation is attempted from the node which is
> > nearest to the preferred node.
> > 
> > This new memory policy helps applications to have explicit control on slow
> > memory allocation and avoids default fallback to slow memory NUMA nodes.
> > The difference with MPOL_BIND is the ability to specify a preferred node
> > which is the first node in the nodemask argument passed.

I am sorry but I do not understand the semantic difference from
MPOL_BIND. Could you be more specific please?
Aneesh Kumar K.V Oct. 13, 2021, 12:35 p.m. UTC | #3
On 10/13/21 16:18, Michal Hocko wrote:
> On Wed 13-10-21 12:42:34, Michal Hocko wrote:
>> [Cc linux-api]
>>
>> On Wed 13-10-21 15:15:39, Aneesh Kumar K.V wrote:
>>> This mempolicy mode can be used with either the set_mempolicy(2)
>>> or mbind(2) interfaces.  Like the MPOL_PREFERRED interface, it
>>> allows an application to set a preference node from which the kernel
>>> will fulfill memory allocation requests. Unlike the MPOL_PREFERRED mode,
>>> it takes a set of nodes. The nodes in the nodemask are used as fallback
>>> allocation nodes if memory is not available on the preferred node.
>>> Unlike MPOL_PREFERRED_MANY, it will not fall back memory allocations
>>> to all nodes in the system. Like the MPOL_BIND interface, it works over a
>>> set of nodes and will cause a SIGSEGV or invoke the OOM killer if
>>> memory is not available on those preferred nodes.
>>>
>>> This patch helps applications to hint a memory allocation preference node
>>> and fallback to _only_ a set of nodes if the memory is not available
>>> on the preferred node.  Fallback allocation is attempted from the node which is
>>> nearest to the preferred node.
>>>
>>> This new memory policy helps applications to have explicit control on slow
>>> memory allocation and avoids default fallback to slow memory NUMA nodes.
>>> The difference with MPOL_BIND is the ability to specify a preferred node
>>> which is the first node in the nodemask argument passed.
> 
> I am sorry but I do not understand the semantic diffrence from
> MPOL_BIND. Could you be more specific please?
> 



MPOL_BIND
	This mode specifies that memory must come from the set of
	nodes specified by the policy.  Memory will be allocated from
	the node in the set with sufficient free memory that is
	closest to the node where the allocation takes place.


MPOL_PREFERRED_STRICT
	This mode specifies that the allocation should be attempted
	from the first node specified in the nodemask of the policy.
	If that allocation fails, the kernel will search other nodes
	in the nodemask, in order of increasing distance from the
	preferred node based on information provided by the platform firmware.

The difference is the ability to specify the preferred node as the first
node in the nodemask, and all fallback allocations are based on the
distance from the preferred node. With MPOL_BIND they are based on the
node where the allocation takes place.

-aneesh
Michal Hocko Oct. 13, 2021, 12:50 p.m. UTC | #4
On Wed 13-10-21 18:05:49, Aneesh Kumar K.V wrote:
> On 10/13/21 16:18, Michal Hocko wrote:
> > On Wed 13-10-21 12:42:34, Michal Hocko wrote:
> > > [Cc linux-api]
> > > 
> > > On Wed 13-10-21 15:15:39, Aneesh Kumar K.V wrote:
> > > > This mempolicy mode can be used with either the set_mempolicy(2)
> > > > or mbind(2) interfaces.  Like the MPOL_PREFERRED interface, it
> > > > allows an application to set a preference node from which the kernel
> > > > will fulfill memory allocation requests. Unlike the MPOL_PREFERRED mode,
> > > > it takes a set of nodes. The nodes in the nodemask are used as fallback
> > > > allocation nodes if memory is not available on the preferred node.
> > > > Unlike MPOL_PREFERRED_MANY, it will not fall back memory allocations
> > > > to all nodes in the system. Like the MPOL_BIND interface, it works over a
> > > > set of nodes and will cause a SIGSEGV or invoke the OOM killer if
> > > > memory is not available on those preferred nodes.
> > > > 
> > > > This patch helps applications to hint a memory allocation preference node
> > > > and fallback to _only_ a set of nodes if the memory is not available
> > > > on the preferred node.  Fallback allocation is attempted from the node which is
> > > > nearest to the preferred node.
> > > > 
> > > > This new memory policy helps applications to have explicit control on slow
> > > > memory allocation and avoids default fallback to slow memory NUMA nodes.
> > > > The difference with MPOL_BIND is the ability to specify a preferred node
> > > > which is the first node in the nodemask argument passed.
> > 
> > I am sorry but I do not understand the semantic diffrence from
> > MPOL_BIND. Could you be more specific please?
> > 
> 
> 
> 
> MPOL_BIND
> 	This mode specifies that memory must come from the set of
> 	nodes specified by the policy.  Memory will be allocated from
> 	the node in the set with sufficient free memory that is
> 	closest to the node where the allocation takes place.
> 
> 
> MPOL_PREFERRED_STRICT
> 	This mode specifies that the allocation should be attempted
> 	from the first node specified in the nodemask of the policy.
> 	If that allocation fails, the kernel will search other nodes
> 	in the nodemask, in order of increasing distance from the
> 	preferred node based on information provided by the platform   firmware.
> 
> The difference is the ability to specify the preferred node as the first
> node in the nodemask and all fallback allocations are based on the distance
> from the preferred node. With MPOL_BIND they base based on the node where
> the allocation takes place.

OK, this makes it more clear. Thanks! 

I am still not sure the semantic makes sense though. Why should
the lowest node in the nodemask have any special meaning? What if it is
a node with a higher number that somebody prefers to start with?
Aneesh Kumar K.V Oct. 13, 2021, 12:58 p.m. UTC | #5
On 10/13/21 18:20, Michal Hocko wrote:
> On Wed 13-10-21 18:05:49, Aneesh Kumar K.V wrote:
>> On 10/13/21 16:18, Michal Hocko wrote:
>>> On Wed 13-10-21 12:42:34, Michal Hocko wrote:
>>>> [Cc linux-api]
>>>>
>>>> On Wed 13-10-21 15:15:39, Aneesh Kumar K.V wrote:
>>>>> This mempolicy mode can be used with either the set_mempolicy(2)
>>>>> or mbind(2) interfaces.  Like the MPOL_PREFERRED interface, it
>>>>> allows an application to set a preference node from which the kernel
>>>>> will fulfill memory allocation requests. Unlike the MPOL_PREFERRED mode,
>>>>> it takes a set of nodes. The nodes in the nodemask are used as fallback
>>>>> allocation nodes if memory is not available on the preferred node.
>>>>> Unlike MPOL_PREFERRED_MANY, it will not fall back memory allocations
>>>>> to all nodes in the system. Like the MPOL_BIND interface, it works over a
>>>>> set of nodes and will cause a SIGSEGV or invoke the OOM killer if
>>>>> memory is not available on those preferred nodes.
>>>>>
>>>>> This patch helps applications to hint a memory allocation preference node
>>>>> and fallback to _only_ a set of nodes if the memory is not available
>>>>> on the preferred node.  Fallback allocation is attempted from the node which is
>>>>> nearest to the preferred node.
>>>>>
>>>>> This new memory policy helps applications to have explicit control on slow
>>>>> memory allocation and avoids default fallback to slow memory NUMA nodes.
>>>>> The difference with MPOL_BIND is the ability to specify a preferred node
>>>>> which is the first node in the nodemask argument passed.
>>>
>>> I am sorry but I do not understand the semantic diffrence from
>>> MPOL_BIND. Could you be more specific please?
>>>
>>
>>
>>
>> MPOL_BIND
>> 	This mode specifies that memory must come from the set of
>> 	nodes specified by the policy.  Memory will be allocated from
>> 	the node in the set with sufficient free memory that is
>> 	closest to the node where the allocation takes place.
>>
>>
>> MPOL_PREFERRED_STRICT
>> 	This mode specifies that the allocation should be attempted
>> 	from the first node specified in the nodemask of the policy.
>> 	If that allocation fails, the kernel will search other nodes
>> 	in the nodemask, in order of increasing distance from the
>> 	preferred node based on information provided by the platform   firmware.
>>
>> The difference is the ability to specify the preferred node as the first
>> node in the nodemask and all fallback allocations are based on the distance
>> from the preferred node. With MPOL_BIND they base based on the node where
>> the allocation takes place.
> 
> OK, this makes it more clear. Thanks!
> 
> I am still not sure the semantic makes sense though. Why should
> the lowest node in the nodemask have any special meaning? What if it is
> a node with a higher number that somebody preferes to start with?
> 

That is true. I haven't been able to find an easy way to specify the 
preferred node other than expressing it as first node in the node mask. 
Yes, it limits the usage of the policy. Any alternate suggestion?

We could do
set_mempolicy(MPOL_PREFERRED, nodemask(nodeX))
set_mempolicy(MPOL_PREFERRED_EXTEND, nodemask(fallback nodemask for
above PREFERRED policy))

But that really complicates the interface?

-aneesh
Michal Hocko Oct. 13, 2021, 1:07 p.m. UTC | #6
On Wed 13-10-21 18:28:40, Aneesh Kumar K.V wrote:
> On 10/13/21 18:20, Michal Hocko wrote:
[...]
> > I am still not sure the semantic makes sense though. Why should
> > the lowest node in the nodemask have any special meaning? What if it is
> > a node with a higher number that somebody preferes to start with?
> > 
> 
> That is true. I haven't been able to find an easy way to specify the
> preferred node other than expressing it as first node in the node mask. Yes,
> it limits the usage of the policy. Any alternate suggestion?

set_mempolicy is indeed not very suitable for something you are looking
for. Could you be more specific why the initial node is so important?
Is this because you want to allocate from a cpu less node first before
falling back to others?
Aneesh Kumar K.V Oct. 13, 2021, 1:10 p.m. UTC | #7
On 10/13/21 18:37, Michal Hocko wrote:
> On Wed 13-10-21 18:28:40, Aneesh Kumar K.V wrote:
>> On 10/13/21 18:20, Michal Hocko wrote:
> [...]
>>> I am still not sure the semantic makes sense though. Why should
>>> the lowest node in the nodemask have any special meaning? What if it is
>>> a node with a higher number that somebody preferes to start with?
>>>
>>
>> That is true. I haven't been able to find an easy way to specify the
>> preferred node other than expressing it as first node in the node mask. Yes,
>> it limits the usage of the policy. Any alternate suggestion?
> 
> set_mempolicy is indeed not very suitable for something you are looking
> for. Could you be more specific why the initial node is so important?
> Is this because you want to allocate from a cpu less node first before
> falling back to others?
> 

One of the reason is that the thread that is faulting in pages first is 
not the one that is going to operate on this page long term. Application 
wants to hint the allocation node for the same reason they use 
MPOL_PREFERRED now.

-aneesh
Andi Kleen Oct. 13, 2021, 1:16 p.m. UTC | #8
> The difference with MPOL_BIND is the ability to specify a preferred node
> which is the first node in the nodemask argument passed.

That's always the one with the lowest number. Isn't that quite limiting 
in practice?

It seems if you really want to do that you would need another argument.

-Andi
Aneesh Kumar K.V Oct. 13, 2021, 1:23 p.m. UTC | #9
On 10/13/21 18:46, Andi Kleen wrote:
> 
>> The difference with MPOL_BIND is the ability to specify a preferred node
>> which is the first node in the nodemask argument passed.
> 
> That's always the one with the lowest number. Isn't that quite limiting 
> in practice?
> 
> It seems if you really want to do that you would need another argument.
> 
Yes. But that would make it a new syscall. Should we do that?

-aneesh
Aneesh Kumar K.V Oct. 13, 2021, 1:57 p.m. UTC | #10
On 10/13/21 18:28, Aneesh Kumar K.V wrote:
> On 10/13/21 18:20, Michal Hocko wrote:
>> On Wed 13-10-21 18:05:49, Aneesh Kumar K.V wrote:
>>> On 10/13/21 16:18, Michal Hocko wrote:
>>>> On Wed 13-10-21 12:42:34, Michal Hocko wrote:
>>>>> [Cc linux-api]
>>>>>
>>>>> On Wed 13-10-21 15:15:39, Aneesh Kumar K.V wrote:
>>>>>> This mempolicy mode can be used with either the set_mempolicy(2)
>>>>>> or mbind(2) interfaces.  Like the MPOL_PREFERRED interface, it
>>>>>> allows an application to set a preference node from which the kernel
>>>>>> will fulfill memory allocation requests. Unlike the MPOL_PREFERRED 
>>>>>> mode,
>>>>>> it takes a set of nodes. The nodes in the nodemask are used as 
>>>>>> fallback
>>>>>> allocation nodes if memory is not available on the preferred node.
>>>>>> Unlike MPOL_PREFERRED_MANY, it will not fall back memory allocations
>>>>>> to all nodes in the system. Like the MPOL_BIND interface, it works 
>>>>>> over a
>>>>>> set of nodes and will cause a SIGSEGV or invoke the OOM killer if
>>>>>> memory is not available on those preferred nodes.
>>>>>>
>>>>>> This patch helps applications to hint a memory allocation 
>>>>>> preference node
>>>>>> and fallback to _only_ a set of nodes if the memory is not available
>>>>>> on the preferred node.  Fallback allocation is attempted from the 
>>>>>> node which is
>>>>>> nearest to the preferred node.
>>>>>>
>>>>>> This new memory policy helps applications to have explicit control 
>>>>>> on slow
>>>>>> memory allocation and avoids default fallback to slow memory NUMA 
>>>>>> nodes.
>>>>>> The difference with MPOL_BIND is the ability to specify a 
>>>>>> preferred node
>>>>>> which is the first node in the nodemask argument passed.
>>>>
>>>> I am sorry but I do not understand the semantic diffrence from
>>>> MPOL_BIND. Could you be more specific please?
>>>>
>>>
>>>
>>>
>>> MPOL_BIND
>>>     This mode specifies that memory must come from the set of
>>>     nodes specified by the policy.  Memory will be allocated from
>>>     the node in the set with sufficient free memory that is
>>>     closest to the node where the allocation takes place.
>>>
>>>
>>> MPOL_PREFERRED_STRICT
>>>     This mode specifies that the allocation should be attempted
>>>     from the first node specified in the nodemask of the policy.
>>>     If that allocation fails, the kernel will search other nodes
>>>     in the nodemask, in order of increasing distance from the
>>>     preferred node based on information provided by the platform   
>>> firmware.
>>>
>>> The difference is the ability to specify the preferred node as the first
>>> node in the nodemask and all fallback allocations are based on the 
>>> distance
>>> from the preferred node. With MPOL_BIND they base based on the node 
>>> where
>>> the allocation takes place.
>>
>> OK, this makes it more clear. Thanks!
>>
>> I am still not sure the semantic makes sense though. Why should
>> the lowest node in the nodemask have any special meaning? What if it is
>> a node with a higher number that somebody preferes to start with?
>>
> 
> That is true. I haven't been able to find an easy way to specify the 
> preferred node other than expressing it as first node in the node mask. 
> Yes, it limits the usage of the policy. Any alternate suggestion?
> 
> We could do
> set_mempolicy(MPOLD_PREFERRED, nodemask(nodeX)))
> set_mempolicy(MPOLD_PREFFERED_EXTEND, nodemask(fallback nodemask for 
> above PREFERRED policy))
> 
> But that really complicates the interface?
> 
>

Another option is to keep this mbind(2) specific and overload flags to 
be the preferred nodeid.

mbind(va, len, MPOL_PREFERRED_STRICT, nodemask, max_node, preferred_node);

  -aneesh
Michal Hocko Oct. 13, 2021, 2:21 p.m. UTC | #11
On Wed 13-10-21 18:53:55, Aneesh Kumar K.V wrote:
> On 10/13/21 18:46, Andi Kleen wrote:
> > 
> > > The difference with MPOL_BIND is the ability to specify a preferred node
> > > which is the first node in the nodemask argument passed.
> > 
> > That's always the one with the lowest number. Isn't that quite limiting
> > in practice?
> > 
> > It seems if you really want to do that you would need another argument.
> > 
> Yes. But that would make it a new syscall. Should we do that?

Yes, I do not see any reasonable way to cram this into the existing syscall.
I am not yet sure what the syscall should look like though. I can see
two use cases: one of them is a very specific node allocation fallback
order requirement, and another one is a preference for a CPU-less node
over other nodes. Both are slightly different.
Michal Hocko Oct. 13, 2021, 2:22 p.m. UTC | #12
On Wed 13-10-21 18:40:26, Aneesh Kumar K.V wrote:
> On 10/13/21 18:37, Michal Hocko wrote:
> > On Wed 13-10-21 18:28:40, Aneesh Kumar K.V wrote:
> > > On 10/13/21 18:20, Michal Hocko wrote:
> > [...]
> > > > I am still not sure the semantic makes sense though. Why should
> > > > the lowest node in the nodemask have any special meaning? What if it is
> > > > a node with a higher number that somebody preferes to start with?
> > > > 
> > > 
> > > That is true. I haven't been able to find an easy way to specify the
> > > preferred node other than expressing it as first node in the node mask. Yes,
> > > it limits the usage of the policy. Any alternate suggestion?
> > 
> > set_mempolicy is indeed not very suitable for something you are looking
> > for. Could you be more specific why the initial node is so important?
> > Is this because you want to allocate from a cpu less node first before
> > falling back to others?
> > 
> 
> One of the reason is that the thread that is faulting in pages first is not
> the one that is going to operate on this page long term. Application wants
> to hint the allocation node for the same reason they use MPOL_PREFERRED now.

Why cannot you move the faulting thread to a numa node of the preference
during the faulting and then move it out if that is really necessary?
Michal Hocko Oct. 13, 2021, 2:26 p.m. UTC | #13
On Wed 13-10-21 19:27:03, Aneesh Kumar K.V wrote:
[...]
> Another option is to keep this mbind(2) specific and overload flags to be
> the preferred nodeid.
> 
> mbind(va, len, MPOL_PREFERRED_STRICT, nodemask, max_node, preferred_node);

First of all I do not think you really want to create a new memory
policy for this. Not to mention that PREFERRED_STRICT is kinda weird in
the first place but one could argue that a preference of the first node
to try is not really specific to BIND/PREFERRED_MANY. Overloading flags
is a nogo.
Aneesh Kumar K.V Oct. 14, 2021, 9:30 a.m. UTC | #14
Michal Hocko <mhocko@suse.com> writes:

> On Wed 13-10-21 18:53:55, Aneesh Kumar K.V wrote:
>> On 10/13/21 18:46, Andi Kleen wrote:
>> > 
>> > > The difference with MPOL_BIND is the ability to specify a preferred node
>> > > which is the first node in the nodemask argument passed.
>> > 
>> > That's always the one with the lowest number. Isn't that quite limiting
>> > in practice?
>> > 
>> > It seems if you really want to do that you would need another argument.
>> > 
>> Yes. But that would make it a new syscall. Should we do that?
>
> Yes, I do not see any reasonable to cram this into the existing syscall.
> I am not yet sure what the syscall should look like though. I can see
> two usecases, one of the is a very specific node allocation fallback
> order requirement and another one is preferrence for a cpu less node
> over other nodes. Both are slightly different.

How about

SYSCALL_DEFINE5(preferred_mbind, unsigned long, start, unsigned long, len,
		unsigned long, preferred_node, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	return kernel_mbind(start, len, MPOL_PREFERRED_STRICT, preferred_node,
			    nmask, maxnode, 0);
}
Michal Hocko Oct. 14, 2021, 9:38 a.m. UTC | #15
On Thu 14-10-21 15:00:22, Aneesh Kumar K.V wrote:
> Michal Hocko <mhocko@suse.com> writes:
> 
> > On Wed 13-10-21 18:53:55, Aneesh Kumar K.V wrote:
> >> On 10/13/21 18:46, Andi Kleen wrote:
> >> > 
> >> > > The difference with MPOL_BIND is the ability to specify a preferred node
> >> > > which is the first node in the nodemask argument passed.
> >> > 
> >> > That's always the one with the lowest number. Isn't that quite limiting
> >> > in practice?
> >> > 
> >> > It seems if you really want to do that you would need another argument.
> >> > 
> >> Yes. But that would make it a new syscall. Should we do that?
> >
> > Yes, I do not see any reasonable to cram this into the existing syscall.
> > I am not yet sure what the syscall should look like though. I can see
> > two usecases, one of the is a very specific node allocation fallback
> > order requirement and another one is preferrence for a cpu less node
> > over other nodes. Both are slightly different.
> 
> How about
> 
> SYSCALL_DEFINE5(preferred_mbind, unsigned long, start, unsigned long, len,
> 		unsigned long, preferred_node, const unsigned long __user *, nmask,
> 		unsigned long, maxnode)
> {
> 	return kernel_mbind(start, len, MPOL_PREFERRED_STRICT, preferred_node,
> 			    nmask, maxnode, 0);
> }

Semantics? How does it interact with MPOL_PREFERRED_MANY, MPOL_BIND and
the other policies?

Besides that it would be really great to finish the discussion about the
usecase before suggesting a new userspace API.
Aneesh Kumar K.V Oct. 14, 2021, 10:28 a.m. UTC | #16
On 10/14/21 15:08, Michal Hocko wrote:
> On Thu 14-10-21 15:00:22, Aneesh Kumar K.V wrote:
>> Michal Hocko <mhocko@suse.com> writes:
>>
>>> On Wed 13-10-21 18:53:55, Aneesh Kumar K.V wrote:
>>>> On 10/13/21 18:46, Andi Kleen wrote:
>>>>>
>>>>>> The difference with MPOL_BIND is the ability to specify a preferred node
>>>>>> which is the first node in the nodemask argument passed.
>>>>>
>>>>> That's always the one with the lowest number. Isn't that quite limiting
>>>>> in practice?
>>>>>
>>>>> It seems if you really want to do that you would need another argument.
>>>>>
>>>> Yes. But that would make it a new syscall. Should we do that?
>>>
>>> Yes, I do not see any reasonable to cram this into the existing syscall.
>>> I am not yet sure what the syscall should look like though. I can see
>>> two usecases, one of the is a very specific node allocation fallback
>>> order requirement and another one is preferrence for a cpu less node
>>> over other nodes. Both are slightly different.
>>
>> How about
>>
>> SYSCALL_DEFINE5(preferred_mbind, unsigned long, start, unsigned long, len,
>> 		unsigned long, preferred_node, const unsigned long __user *, nmask,
>> 		unsigned long, maxnode)
>> {
>> 	return kernel_mbind(start, len, MPOL_PREFERRED_STRICT, preferred_node,
>> 			    nmask, maxnode, 0);
>> }
> 
> Semantic? How does it interact with MPOL_PREFERRED_MANY, MPOL_BIND and
> other others?
> 

This allows specifying a new memory policy for the va range. We are 
forced to use a new syscall because of the limitations of the current 
mbind(2) syscall. We could make a generic sys_mbind2(), but I was not 
sure whether we need to make it that complex. mbind() is already a 
6-argument syscall.

> Besides that it would be really great to finish the discussion about the
> usecase before suggesting a new userspace API.
> 

The application would like to hint a preferred node for allocating memory 
backing a va range and at the same time wants to avoid falling back to 
some set of nodes (in the use case I am interested in, don't fall back 
to slow memory nodes).


-aneesh
Michal Hocko Oct. 14, 2021, 11:41 a.m. UTC | #17
On Thu 14-10-21 15:58:29, Aneesh Kumar K.V wrote:
> On 10/14/21 15:08, Michal Hocko wrote:
[...]
> > Besides that it would be really great to finish the discussion about the
> > usecase before suggesting a new userspace API.
> > 
> 
> Application would like to hint a preferred node for allocating memory
> backing a va range and at the same time wants to avoid fallback to some set
> of nodes (in the use case I am interested don't fall back to slow memory
> nodes).

We do have means for that, right? You can set your memory policy and
then set the cpu affinity to the node you want to allocate from
initially. You can migrate to a different cpu/node if this is not the
preferred affinity. Why is that not usable?

Also think about extensibility. Say I want to allocate from a set of
nodes first before falling back to the rest of the nodemask? If you want
to add a new API then think of other potential usecases.
Aneesh Kumar K.V Oct. 14, 2021, 1:29 p.m. UTC | #18
On 10/14/21 17:11, Michal Hocko wrote:
> On Thu 14-10-21 15:58:29, Aneesh Kumar K.V wrote:
>> On 10/14/21 15:08, Michal Hocko wrote:
> [...]
>>> Besides that it would be really great to finish the discussion about the
>>> usecase before suggesting a new userspace API.
>>>
>>
>> Application would like to hint a preferred node for allocating memory
>> backing a va range and at the same time wants to avoid fallback to some set
>> of nodes (in the use case I am interested don't fall back to slow memory
>> nodes).
> 
> We do have means for that, right? You can set your memory policy and
> then set the cpu afffinity to the node you want to allocate from
> initially. You can migrate to a different cpu/node if this is not the
> preferred affinity. Why is that not usable?

For the same reason you mentioned earlier, these nodes can be cpu less 
nodes.

> 
> Also think about extensibility. Say I want to allocate from a set of
> nodes first before falling back to the rest of the nodemask? If you want
> to add a new API then think of other potential usecases.
> 

Describing the specific allocation details becomes hard with the 
preferred node being a nodemask. With the below interface

SYSCALL_DEFINE5(preferred_mbind, unsigned long, start, unsigned long, len,
		const unsigned long __user *, preferred_nmask, const unsigned long 
__user *, fallback_nmask,
		unsigned long, maxnode)
{


1. The preferred node is the first node in the preferred node mask
2. Then we try to allocate from nodes present in the preferred node mask 
which is closer to the first node in the preferred node mask
3. If the above fails, we try to allocate from nodes in the fallback 
node mask which is closer to the first node in the preferred nodemask.

Isn't that too complicated? Do we have a real usecase for that?

-aneesh
Michal Hocko Oct. 14, 2021, 2:56 p.m. UTC | #19
On Thu 14-10-21 18:59:14, Aneesh Kumar K.V wrote:
> On 10/14/21 17:11, Michal Hocko wrote:
> > On Thu 14-10-21 15:58:29, Aneesh Kumar K.V wrote:
> > > On 10/14/21 15:08, Michal Hocko wrote:
> > [...]
> > > > Besides that it would be really great to finish the discussion about the
> > > > usecase before suggesting a new userspace API.
> > > > 
> > > 
> > > Application would like to hint a preferred node for allocating memory
> > > backing a va range and at the same time wants to avoid fallback to some set
> > > of nodes (in the use case I am interested don't fall back to slow memory
> > > nodes).
> > 
> > We do have means for that, right? You can set your memory policy and
> > then set the cpu afffinity to the node you want to allocate from
> > initially. You can migrate to a different cpu/node if this is not the
> > preferred affinity. Why is that not usable?
> 
> For the same reason you mentioned earlier, these nodes can be cpu less
> nodes.

It would have been easier if you were explicit about the usecase rather
than letting others guess.

> > Also think about extensibility. Say I want to allocate from a set of
> > nodes first before falling back to the rest of the nodemask? If you want
> > to add a new API then think of other potential usecases.
> > 
> 
> Describing the specific allocation details become hard with preferred node
> being a nodemask. With the below interface
> 
> SYSCALL_DEFINE5(preferred_mbind, unsigned long, start, unsigned long, len,
> 		const unsigned long __user *, preferred_nmask, const unsigned long __user
> *, fallback_nmask,
> 		unsigned long, maxnode)
> {
> 
> 
> 1. The preferred node is the first node in the preferred node mask
> 2. Then we try to allocate from nodes present in the preferred node mask
> which is closer to the first node in the preferred node mask
> 3. If the above fails, we try to allocate from nodes in the fallback node
> mask which is closer to the first node in the preferred nodemask.
> 
> Isn't that too complicated? Do we have a real usecase for that?

No, I think this is a suboptimal interface. AFAIU you really want to
define a "home" node(s) rather than any policy. Home node would
effectively override the default local node whatever policy you have as
it makes sense whether you have MPOL_PREFERRED_MANY or MPOL_BIND.

Another potential interface would be set_nodeorder which would
explicitly set the allocation fallback ordering. Again agnostic of the
underlying memory policy. This would be more generic but the question is
whether this is not too generic and whether there are usecases for that.

Makes sense?
Aneesh Kumar K.V Oct. 14, 2021, 3:50 p.m. UTC | #20
On 10/14/21 20:26, Michal Hocko wrote:
> On Thu 14-10-21 18:59:14, Aneesh Kumar K.V wrote:
>> On 10/14/21 17:11, Michal Hocko wrote:
>>> On Thu 14-10-21 15:58:29, Aneesh Kumar K.V wrote:
>>>> On 10/14/21 15:08, Michal Hocko wrote:
>>> [...]
>>>>> Besides that it would be really great to finish the discussion about the
>>>>> usecase before suggesting a new userspace API.
>>>>>
>>>>
>>>> Application would like to hint a preferred node for allocating memory
>>>> backing a va range and at the same time wants to avoid fallback to some set
>>>> of nodes (in the use case I am interested don't fall back to slow memory
>>>> nodes).
>>>
>>> We do have means for that, right? You can set your memory policy and
>>> then set the cpu afffinity to the node you want to allocate from
>>> initially. You can migrate to a different cpu/node if this is not the
>>> preferred affinity. Why is that not usable?
>>
>> For the same reason you mentioned earlier, these nodes can be cpu less
>> nodes.
> 
> It would have been easier if you were explicit about the usecase rather
> than let other guess.
> 
>>> Also think about extensibility. Say I want to allocate from a set of
>>> nodes first before falling back to the rest of the nodemask? If you want
>>> to add a new API then think of other potential usecases.
>>>
>>
>> Describing the specific allocation details become hard with preferred node
>> being a nodemask. With the below interface
>>
>> SYSCALL_DEFINE5(preferred_mbind, unsigned long, start, unsigned long, len,
>> 		const unsigned long __user *, preferred_nmask, const unsigned long __user
>> *, fallback_nmask,
>> 		unsigned long, maxnode)
>> {
>>
>>
>> 1. The preferred node is the first node in the preferred node mask
>> 2. Then we try to allocate from nodes present in the preferred node mask
>> which is closer to the first node in the preferred node mask
>> 3. If the above fails, we try to allocate from nodes in the fallback node
>> mask which is closer to the first node in the preferred nodemask.
>>
>> Isn't that too complicated? Do we have a real usecase for that?
> 
> No, I think this is a suboptimal interface. AFAIU you really want to
> define a "home" node(s) rather than any policy. Home node would
> effectively override the default local node whatever policy you have as
> it makes sense whether you have MPOL_PREFERRED_MANY or MPOL_BIND.
> 


yes. I did describe it as below in an earlier email

"We could do
set_mempolicy(MPOLD_PREFERRED, nodemask(nodeX)))
set_mempolicy(MPOLD_PREFFERED_EXTEND, nodemask(fallback nodemask for 
above PREFERRED policy)) "

But I agree that restricting this to a virtual address range is much 
better. Now I am wondering whether a nodemask is any better than a 
nodeid. The concept of multiple home nodes is confusing when compared 
to a single home node. What would be the meaning of multiple nodes in 
a home-nodes concept?

Should we do

SYSCALL_DEFINE4(home_node_mbind, unsigned long, start, unsigned long, len,
		unsigned long, home_node, unsigned long, flags)


the flags is kept for future extension if any.


I guess this home node will only apply w.r.t. MPOL_BIND and 
MPOL_PREFERRED_MANY policies for now?

> Another potential interface would be set_nodeorder which would
> explicitly set the allocation fallback ordering. Again agnostic of the
> underlying memory policy. This would be more generic but the question is
> whether this is not too generic and whether there are usecases for that.
> 

I would suggest we wait for applications really wanting a fallback order 
other than the distance-based one before adding this. A distance-based 
fallback order from a preferred node is well understood from the 
application's point of view.

-aneesh
Michal Hocko Oct. 19, 2021, 9:38 a.m. UTC | #21
On Thu 14-10-21 21:20:51, Aneesh Kumar K.V wrote:
> On 10/14/21 20:26, Michal Hocko wrote:
> > On Thu 14-10-21 18:59:14, Aneesh Kumar K.V wrote:
> > > On 10/14/21 17:11, Michal Hocko wrote:
> > > > On Thu 14-10-21 15:58:29, Aneesh Kumar K.V wrote:
> > > > > On 10/14/21 15:08, Michal Hocko wrote:
> > > > [...]
> > > > > > Besides that it would be really great to finish the discussion about the
> > > > > > usecase before suggesting a new userspace API.
> > > > > > 
> > > > > 
> > > > > Application would like to hint a preferred node for allocating memory
> > > > > backing a va range and at the same time wants to avoid fallback to some set
> > > > > of nodes (in the use case I am interested don't fall back to slow memory
> > > > > nodes).
> > > > 
> > > > We do have means for that, right? You can set your memory policy and
> > > > then set the cpu afffinity to the node you want to allocate from
> > > > initially. You can migrate to a different cpu/node if this is not the
> > > > preferred affinity. Why is that not usable?
> > > 
> > > For the same reason you mentioned earlier, these nodes can be cpu less
> > > nodes.
> > 
> > It would have been easier if you were explicit about the usecase rather
> > than let other guess.
> > 
> > > > Also think about extensibility. Say I want to allocate from a set of
> > > > nodes first before falling back to the rest of the nodemask? If you want
> > > > to add a new API then think of other potential usecases.
> > > > 
> > > 
> > > Describing the specific allocation details become hard with preferred node
> > > being a nodemask. With the below interface
> > > 
> > > SYSCALL_DEFINE5(preferred_mbind, unsigned long, start, unsigned long, len,
> > > 		const unsigned long __user *, preferred_nmask, const unsigned long __user
> > > *, fallback_nmask,
> > > 		unsigned long, maxnode)
> > > {
> > > 
> > > 
> > > 1. The preferred node is the first node in the preferred node mask
> > > 2. Then we try to allocate from nodes present in the preferred node mask
> > > which is closer to the first node in the preferred node mask
> > > 3. If the above fails, we try to allocate from nodes in the fallback node
> > > mask which is closer to the first node in the preferred nodemask.
> > > 
> > > Isn't that too complicated? Do we have a real usecase for that?
> > 
> > No, I think this is a suboptimal interface. AFAIU you really want to
> > define a "home" node(s) rather than any policy. Home node would
> > effectively override the default local node whatever policy you have as
> > it makes sense whether you have MPOL_PREFERRED_MANY or MPOL_BIND.
> > 
> 
> 
> yes. I did describe it as below in an earlier email
> 
> "We could do
> set_mempolicy(MPOLD_PREFERRED, nodemask(nodeX)))
> set_mempolicy(MPOLD_PREFFERED_EXTEND, nodemask(fallback nodemask for above
> PREFERRED policy)) "
> 
> But I agree that restricting this to virtual address range is much better.
> Now I am wondering whether a nodemask is any better than a nodeid. The
> concept of home nodes is confusing when compared to home node.
> What would be the meaning of multiple nodes in a home nodes concept?

If you go with a nodemask then I expect we will hit an ordering
requirement very quickly. A single home node for a range makes some
sense to me for the cpu-less nodes. I do not see why somebody might
require them to be the first one to consider, but I can imagine there
might be some (semi)reasonable usecases out there. In any case,
implementation-wise this shouldn't really be restricted to any specific
memory policy and should only override the local node where we use it
currently.

> Should we do
> 
> SYSCALL_DEFINE4(home_node_mbind, unsigned long, start, unsigned long, len,
> 		unsigned long, home_node, unsigned long, flags)
> 
> 
> the flags is kept for future extension if any.
> 
> 
> I guess this home node will only apply w.r.t MPOL_BIND and
> MPOL_PREFFERED_MANY policy for now?

Why constrain that artificially? Interleaving has to start somewhere
as well, right? Not that it matters much in practice as only the first
allocation would be affected.
 
> > Another potential interface would be set_nodeorder which would
> > explicitly set the allocation fallback ordering. Again agnostic of the
> > underlying memory policy. This would be more generic but the question is
> > whether this is not too generic and whether there are usecases for that.
> > 
> 
> I would suggest we wait for applications really wanting a fallback order
> other than distance based one before adding this. Distance based fallback
> order from a preferred node is well understood from application point of
> view.

Right, I am not pushing into that direction. The idea was that unlike
home node this has more potential extensibility as a single home node
cannot capture preferences for more nodes for example.
diff mbox series

Patch

diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
index 64fd0ba0d057..4dfdcbd22d67 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -252,6 +252,13 @@  MPOL_PREFERRED_MANY
 	can fall back to all existing numa nodes. This is effectively
 	MPOL_PREFERRED allowed for a mask rather than a single node.
 
+MPOL_PREFERRED_STRICT
+	This mode specifies that the allocation should be attempted
+	from the first node specified in the nodemask of the policy.
+	If that allocation fails, the kernel will search other nodes
+	in the nodemask, in order of increasing distance from the
+	preferred node based on information provided by the platform firmware.
+
 NUMA memory policy supports the following optional mode flags:
 
 MPOL_F_STATIC_NODES
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 046d0ccba4cd..8aa1d1963235 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -23,6 +23,7 @@  enum {
 	MPOL_INTERLEAVE,
 	MPOL_LOCAL,
 	MPOL_PREFERRED_MANY,
+	MPOL_PREFERRED_STRICT,
 	MPOL_MAX,	/* always last member of enum */
 };
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1592b081c58e..59080dd1ea69 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -407,6 +407,10 @@  static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 		.create = mpol_new_nodemask,
 		.rebind = mpol_rebind_preferred,
 	},
+	[MPOL_PREFERRED_STRICT] = {
+		.create = mpol_new_nodemask,
+		.rebind = mpol_rebind_preferred,
+	},
 };
 
 static int migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -900,6 +904,7 @@  static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 	case MPOL_INTERLEAVE:
 	case MPOL_PREFERRED:
 	case MPOL_PREFERRED_MANY:
+	case MPOL_PREFERRED_STRICT:
 		*nodes = p->nodes;
 		break;
 	case MPOL_LOCAL:
@@ -1781,7 +1786,7 @@  nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 		cpuset_nodemask_valid_mems_allowed(&policy->nodes))
 		return &policy->nodes;
 
-	if (mode == MPOL_PREFERRED_MANY)
+	if (mode == MPOL_PREFERRED_MANY || mode == MPOL_PREFERRED_STRICT)
 		return &policy->nodes;
 
 	return NULL;
@@ -1796,7 +1801,7 @@  nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
  */
 static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 {
-	if (policy->mode == MPOL_PREFERRED) {
+	if (policy->mode == MPOL_PREFERRED || policy->mode == MPOL_PREFERRED_STRICT) {
 		nd = first_node(policy->nodes);
 	} else {
 		/*
@@ -1840,6 +1845,7 @@  unsigned int mempolicy_slab_node(void)
 
 	switch (policy->mode) {
 	case MPOL_PREFERRED:
+	case MPOL_PREFERRED_STRICT:
 		return first_node(policy->nodes);
 
 	case MPOL_INTERLEAVE:
@@ -1952,7 +1958,8 @@  int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
 					huge_page_shift(hstate_vma(vma)));
 	} else {
 		nid = policy_node(gfp_flags, *mpol, numa_node_id());
-		if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
+		if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY ||
+			mode == MPOL_PREFERRED_STRICT)
 			*nodemask = &(*mpol)->nodes;
 	}
 	return nid;
@@ -1986,6 +1993,7 @@  bool init_nodemask_of_mempolicy(nodemask_t *mask)
 	switch (mempolicy->mode) {
 	case MPOL_PREFERRED:
 	case MPOL_PREFERRED_MANY:
+	case MPOL_PREFERRED_STRICT:
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
 		*mask = mempolicy->nodes;
@@ -2072,6 +2080,23 @@  static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
 	return page;
 }
 
+static struct page *alloc_pages_preferred_strict(gfp_t gfp, unsigned int order,
+						 struct mempolicy *pol)
+{
+	int nid;
+	gfp_t preferred_gfp;
+
+	/*
+	 * With MPOL_PREFERRED_STRICT first node in the policy nodemask
+	 * is picked as the preferred node id and the fallback allocation
+	 * is still restricted to the preferred nodes in the nodemask.
+	 */
+	preferred_gfp = gfp | __GFP_NOWARN;
+	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
+	nid = first_node(pol->nodes);
+	return __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
+}
+
 /**
  * alloc_pages_vma - Allocate a page for a VMA.
  * @gfp: GFP flags.
@@ -2113,6 +2138,12 @@  struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		goto out;
 	}
 
+	if (pol->mode == MPOL_PREFERRED_STRICT) {
+		page = alloc_pages_preferred_strict(gfp, order, pol);
+		mpol_cond_put(pol);
+		goto out;
+	}
+
 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
 		int hpage_node = node;
 
@@ -2193,6 +2224,8 @@  struct page *alloc_pages(gfp_t gfp, unsigned order)
 	else if (pol->mode == MPOL_PREFERRED_MANY)
 		page = alloc_pages_preferred_many(gfp, order,
 				numa_node_id(), pol);
+	else if (pol->mode == MPOL_PREFERRED_STRICT)
+		page = alloc_pages_preferred_strict(gfp, order, pol);
 	else
 		page = __alloc_pages(gfp, order,
 				policy_node(gfp, pol, numa_node_id()),
@@ -2265,6 +2298,7 @@  bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	case MPOL_INTERLEAVE:
 	case MPOL_PREFERRED:
 	case MPOL_PREFERRED_MANY:
+	case MPOL_PREFERRED_STRICT:
 		return !!nodes_equal(a->nodes, b->nodes);
 	case MPOL_LOCAL:
 		return true;
@@ -2405,6 +2439,7 @@  int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		break;
 
 	case MPOL_PREFERRED:
+	case MPOL_PREFERRED_STRICT:
 		if (node_isset(curnid, pol->nodes))
 			goto out;
 		polnid = first_node(pol->nodes);
@@ -2866,6 +2901,7 @@  int mpol_parse_str(char *str, struct mempolicy **mpol)
 			err = 0;
 		goto out;
 	case MPOL_PREFERRED_MANY:
+	case MPOL_PREFERRED_STRICT:
 	case MPOL_BIND:
 		/*
 		 * Insist on a nodelist
@@ -2953,6 +2989,7 @@  void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 		break;
 	case MPOL_PREFERRED:
 	case MPOL_PREFERRED_MANY:
+	case MPOL_PREFERRED_STRICT:
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
 		nodes = pol->nodes;