diff mbox series

[v4,03/13] mm/mempolicy: Add MPOL_PREFERRED_MANY for multiple preferred nodes

Message ID 1615952410-36895-4-git-send-email-feng.tang@intel.com (mailing list archive)
State New, archived
Headers show
Series Introduced multi-preference mempolicy | expand

Commit Message

Feng Tang March 17, 2021, 3:40 a.m. UTC
From: Dave Hansen <dave.hansen@linux.intel.com>

MPOL_PREFERRED honors only a single node set in the nodemask.  Add the
bare define for a new mode which will allow more than one.

The patch does all the plumbing without actually adding the new policy
type.

v2:
Plumb most MPOL_PREFERRED_MANY without exposing UAPI (Ben)
Fixes for checkpatch (Ben)

Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com
Co-developed-by: Ben Widawsky <ben.widawsky@intel.com>
Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Feng Tang <feng.tang@intel.com>
---
 mm/mempolicy.c | 46 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 6 deletions(-)

Comments

Michal Hocko April 14, 2021, 12:50 p.m. UTC | #1
On Wed 17-03-21 11:40:00, Feng Tang wrote:
> From: Dave Hansen <dave.hansen@linux.intel.com>
> 
> MPOL_PREFERRED honors only a single node set in the nodemask.  Add the
> bare define for a new mode which will allow more than one.
> 
> The patch does all the plumbing without actually adding the new policy
> type.
> 
> v2:
> Plumb most MPOL_PREFERRED_MANY without exposing UAPI (Ben)
> Fixes for checkpatch (Ben)
> 
> Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com
> Co-developed-by: Ben Widawsky <ben.widawsky@intel.com>
> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
> Signed-off-by: Feng Tang <feng.tang@intel.com>
> ---
>  mm/mempolicy.c | 46 ++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 40 insertions(+), 6 deletions(-)
> 
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 2b1e0e4..1228d8e 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -31,6 +31,9 @@
>   *                but useful to set in a VMA when you have a non default
>   *                process policy.
>   *
> + * preferred many Try a set of nodes first before normal fallback. This is
> + *                similar to preferred without the special case.
> + *
>   * default        Allocate on the local node first, or when on a VMA
>   *                use the process policy. This is what Linux always did
>   *		  in a NUMA aware kernel and still does by, ahem, default.
> @@ -105,6 +108,8 @@
>  
>  #include "internal.h"
>  
> +#define MPOL_PREFERRED_MANY MPOL_MAX
> +
>  /* Internal flags */
>  #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
>  #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
> @@ -175,7 +180,7 @@ struct mempolicy *get_task_policy(struct task_struct *p)
>  static const struct mempolicy_operations {
>  	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
>  	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
> -} mpol_ops[MPOL_MAX];
> +} mpol_ops[MPOL_MAX + 1];
>  
>  static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
>  {
> @@ -415,7 +420,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
>  	mmap_write_unlock(mm);
>  }
>  
> -static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
> +static const struct mempolicy_operations mpol_ops[MPOL_MAX + 1] = {
>  	[MPOL_DEFAULT] = {
>  		.rebind = mpol_rebind_default,
>  	},
> @@ -432,6 +437,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
>  		.rebind = mpol_rebind_nodemask,
>  	},
>  	/* [MPOL_LOCAL] - see mpol_new() */
> +	[MPOL_PREFERRED_MANY] = {
> +		.create = NULL,
> +		.rebind = NULL,
> +	},
>  };

I do get that you wanted to keep MPOL_PREFERRED_MANY inaccessible to
userspace, but wouldn't it be much easier to simply check in the two
syscall entries rather than playing these MAX+1 games, which make the
review more complicated than necessary?

>  
>  static int migrate_page_add(struct page *page, struct list_head *pagelist,
> @@ -924,6 +933,9 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
>  	case MPOL_INTERLEAVE:
>  		*nodes = p->v.nodes;
>  		break;
> +	case MPOL_PREFERRED_MANY:
> +		*nodes = p->v.preferred_nodes;
> +		break;
>  	case MPOL_PREFERRED:
>  		if (!(p->flags & MPOL_F_LOCAL))
>  			*nodes = p->v.preferred_nodes;

Why do those two do slightly different things? Is this because unlike
MPOL_PREFERRED it can never have MPOL_F_LOCAL cleared? If that is the
case I would still stick the two together and use the same code for
both to make the code easier to follow. Now that both use the same
nodemask it should really be just about syscall input sanitization and
keeping the original behavior for MPOL_PREFERRED.

[...]
> @@ -2072,6 +2087,9 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
>  	task_lock(current);
>  	mempolicy = current->mempolicy;
>  	switch (mempolicy->mode) {
> +	case MPOL_PREFERRED_MANY:
> +		*mask = mempolicy->v.preferred_nodes;
> +		break;
>  	case MPOL_PREFERRED:
>  		if (mempolicy->flags & MPOL_F_LOCAL)
>  			nid = numa_node_id();

Same here

> @@ -2126,6 +2144,9 @@ bool mempolicy_nodemask_intersects(struct task_struct *tsk,
>  		 * nodes in mask.
>  		 */
>  		break;
> +	case MPOL_PREFERRED_MANY:
> +		ret = nodes_intersects(mempolicy->v.preferred_nodes, *mask);
> +		break;

I do not think this is the correct behavior. A preferred policy, whether it
is a single node or a nodemask, is a hint, not a requirement. So we
should always treat it as intersecting. I do understand that the naming
can be confusing because an intersect operation should indeed check
nodemasks, but this is yet another trap of the mempolicy code. It is
only used for the OOM selection.

Btw. the code is wrong for INTERLEAVE as well because it uses the
interleaving node as a hint as well. It is not bound by the interleave
nodemask. Sigh...

>  	case MPOL_BIND:
>  	case MPOL_INTERLEAVE:
>  		ret = nodes_intersects(mempolicy->v.nodes, *mask);
[...]

> @@ -2349,6 +2373,9 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
>  	case MPOL_BIND:
>  	case MPOL_INTERLEAVE:
>  		return !!nodes_equal(a->v.nodes, b->v.nodes);
> +	case MPOL_PREFERRED_MANY:
> +		return !!nodes_equal(a->v.preferred_nodes,
> +				     b->v.preferred_nodes);

Again different from MPOL_PREFERRED...

>  	case MPOL_PREFERRED:
>  		/* a's ->flags is the same as b's */
>  		if (a->flags & MPOL_F_LOCAL)
> @@ -2523,6 +2550,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
>  		polnid = zone_to_nid(z->zone);
>  		break;
>  
> +		/* case MPOL_PREFERRED_MANY: */
> +

I hope a follow-up patch will make this not panic, but as you are already
plumbing everything in, it should really be as simple as a node_isset
check.

>  	default:
>  		BUG();

Besides that, this should really go!

> @@ -3035,6 +3066,9 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
>  	switch (mode) {
>  	case MPOL_DEFAULT:
>  		break;
> +	case MPOL_PREFERRED_MANY:
> +		WARN_ON(flags & MPOL_F_LOCAL);

Why WARN_ON here?

> +		fallthrough;
>  	case MPOL_PREFERRED:
>  		if (flags & MPOL_F_LOCAL)
>  			mode = MPOL_LOCAL;
> -- 
> 2.7.4
Feng Tang April 20, 2021, 7:16 a.m. UTC | #2
On Wed, Apr 14, 2021 at 02:50:53PM +0200, Michal Hocko wrote:
> On Wed 17-03-21 11:40:00, Feng Tang wrote:
> > From: Dave Hansen <dave.hansen@linux.intel.com>
> > 
> > MPOL_PREFERRED honors only a single node set in the nodemask.  Add the
> > bare define for a new mode which will allow more than one.
> > 
> > The patch does all the plumbing without actually adding the new policy
> > type.
> > 
> > v2:
> > Plumb most MPOL_PREFERRED_MANY without exposing UAPI (Ben)
> > Fixes for checkpatch (Ben)
> > 
> > Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com
> > Co-developed-by: Ben Widawsky <ben.widawsky@intel.com>
> > Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
> > Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
> > Signed-off-by: Feng Tang <feng.tang@intel.com>
> > ---
> >  mm/mempolicy.c | 46 ++++++++++++++++++++++++++++++++++++++++------
> >  1 file changed, 40 insertions(+), 6 deletions(-)
> > 
> > diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> > index 2b1e0e4..1228d8e 100644
> > --- a/mm/mempolicy.c
> > +++ b/mm/mempolicy.c
> > @@ -31,6 +31,9 @@
> >   *                but useful to set in a VMA when you have a non default
> >   *                process policy.
> >   *
> > + * preferred many Try a set of nodes first before normal fallback. This is
> > + *                similar to preferred without the special case.
> > + *
> >   * default        Allocate on the local node first, or when on a VMA
> >   *                use the process policy. This is what Linux always did
> >   *		  in a NUMA aware kernel and still does by, ahem, default.
> > @@ -105,6 +108,8 @@
> >  
> >  #include "internal.h"
> >  
> > +#define MPOL_PREFERRED_MANY MPOL_MAX
> > +
> >  /* Internal flags */
> >  #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
> >  #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
> > @@ -175,7 +180,7 @@ struct mempolicy *get_task_policy(struct task_struct *p)
> >  static const struct mempolicy_operations {
> >  	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
> >  	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
> > -} mpol_ops[MPOL_MAX];
> > +} mpol_ops[MPOL_MAX + 1];
> >  
> >  static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
> >  {
> > @@ -415,7 +420,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
> >  	mmap_write_unlock(mm);
> >  }
> >  
> > -static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
> > +static const struct mempolicy_operations mpol_ops[MPOL_MAX + 1] = {
> >  	[MPOL_DEFAULT] = {
> >  		.rebind = mpol_rebind_default,
> >  	},
> > @@ -432,6 +437,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
> >  		.rebind = mpol_rebind_nodemask,
> >  	},
> >  	/* [MPOL_LOCAL] - see mpol_new() */
> > +	[MPOL_PREFERRED_MANY] = {
> > +		.create = NULL,
> > +		.rebind = NULL,
> > +	},
> >  };
> 
> I do get that you wanted to keep MPOL_PREFERRED_MANY inaccessible to
> userspace, but wouldn't it be much easier to simply check in the two
> syscall entries rather than playing these MAX+1 games, which make the
> review more complicated than necessary?

I will look into this approach; currently the user input parameter
handling is quite complex.

Also the sanity checks in kernel_mbind() and kernel_set_mempolicy()
are almost identical, so they can be unified.

> >  
> >  static int migrate_page_add(struct page *page, struct list_head *pagelist,
> > @@ -924,6 +933,9 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
> >  	case MPOL_INTERLEAVE:
> >  		*nodes = p->v.nodes;
> >  		break;
> > +	case MPOL_PREFERRED_MANY:
> > +		*nodes = p->v.preferred_nodes;
> > +		break;
> >  	case MPOL_PREFERRED:
> >  		if (!(p->flags & MPOL_F_LOCAL))
> >  			*nodes = p->v.preferred_nodes;
> 
> Why those two do a slightly different thing? Is this because unlike
> MPOL_PREFERRED it can never have MPOL_F_LOCAL cleared? If that is the
> case I would still stick the two together and use the same code for
> both to make the code easier to follow. Now that both use the same
> nodemask it should really be just about syscall inputs sanitization and
> to keep the original behavior for MPOL_PREFERRED.
> 
> [...]

Our intention is to make MPOL_PREFERRED_MANY similar to
MPOL_PREFERRED, except that it prefers multiple nodes. So we will try to
achieve this in the following version.

Also for MPOL_LOCAL and MPOL_PREFERRED, the current code logic
turns 'MPOL_LOCAL' into 'MPOL_PREFERRED' with MPOL_F_LOCAL set.
I don't understand why it is not done the other way around, i.e.
turning MPOL_PREFERRED with an empty nodemask into MPOL_LOCAL, which
looks more logical.

Thanks,
Feng

> > @@ -2072,6 +2087,9 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
> >  	task_lock(current);
> >  	mempolicy = current->mempolicy;
> >  	switch (mempolicy->mode) {
> > +	case MPOL_PREFERRED_MANY:
> > +		*mask = mempolicy->v.preferred_nodes;
> > +		break;
> >  	case MPOL_PREFERRED:
> >  		if (mempolicy->flags & MPOL_F_LOCAL)
> >  			nid = numa_node_id();
> 
> Same here
Feng Tang May 13, 2021, 7:23 a.m. UTC | #3
mempolicy: don't handle MPOL_LOCAL as a fake MPOL_PREFERRED policy

MPOL_LOCAL policy has been set up as a real policy, but it is still
handled as a fake MPOL_PREFERRED policy with one internal
MPOL_F_LOCAL flag bit set, and there are many places that have to
distinguish the real 'prefer' policy from the 'local' policy, which is
quite confusing.

In the current code, there are four cases in which MPOL_LOCAL is used:
* user specifies 'local' policy
* user specifies 'prefer' policy, but with an empty nodemask
* system 'default' policy is used
* 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES
  flag set, and when it is 'rebind' to a nodemask which doesn't
  contain the 'preferred' node, it will add the MPOL_F_LOCAL bit
  and perform as 'local' policy. In the future, if it is 'rebind' again
  with a valid nodemask, the policy will be restored back to 'prefer'

So for the first three cases, we make 'local' a real policy
instead of a fake 'prefer' one; this will reduce confusion and
make it easier to integrate our new 'prefer-many' policy.

The next (optional) patch will kill the 'MPOL_F_LOCAL' bit.

Signed-off-by: Feng Tang <feng.tang@intel.com>
---
 mm/mempolicy.c | 60 ++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 33 insertions(+), 27 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d79fa29..2f20f079 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -121,8 +121,7 @@ enum zone_type policy_zone = 0;
  */
 static struct mempolicy default_policy = {
 	.refcnt = ATOMIC_INIT(1), /* never free it */
-	.mode = MPOL_PREFERRED,
-	.flags = MPOL_F_LOCAL,
+	.mode = MPOL_LOCAL,
 };
 
 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
@@ -200,12 +199,9 @@ static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 
 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 {
-	if (!nodes)
-		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
-	else if (nodes_empty(*nodes))
-		return -EINVAL;			/*  no allowed nodes */
-	else
-		pol->v.preferred_node = first_node(*nodes);
+	if (nodes_empty(*nodes))
+		return -EINVAL;
+	pol->v.preferred_node = first_node(*nodes);
 	return 0;
 }
 
@@ -239,25 +235,19 @@ static int mpol_set_nodemask(struct mempolicy *pol,
 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 
 	VM_BUG_ON(!nodes);
-	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
-		nodes = NULL;	/* explicit local allocation */
-	else {
-		if (pol->flags & MPOL_F_RELATIVE_NODES)
-			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
-		else
-			nodes_and(nsc->mask2, *nodes, nsc->mask1);
 
-		if (mpol_store_user_nodemask(pol))
-			pol->w.user_nodemask = *nodes;
-		else
-			pol->w.cpuset_mems_allowed =
-						cpuset_current_mems_allowed;
-	}
+	if (pol->flags & MPOL_F_RELATIVE_NODES)
+		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
+	else
+		nodes_and(nsc->mask2, *nodes, nsc->mask1);
 
-	if (nodes)
-		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
+	if (mpol_store_user_nodemask(pol))
+		pol->w.user_nodemask = *nodes;
 	else
-		ret = mpol_ops[pol->mode].create(pol, NULL);
+		pol->w.cpuset_mems_allowed =
+					cpuset_current_mems_allowed;
+
+	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 	return ret;
 }
 
@@ -290,13 +280,14 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 			if (((flags & MPOL_F_STATIC_NODES) ||
 			     (flags & MPOL_F_RELATIVE_NODES)))
 				return ERR_PTR(-EINVAL);
+
+			mode = MPOL_LOCAL;
 		}
 	} else if (mode == MPOL_LOCAL) {
 		if (!nodes_empty(*nodes) ||
 		    (flags & MPOL_F_STATIC_NODES) ||
 		    (flags & MPOL_F_RELATIVE_NODES))
 			return ERR_PTR(-EINVAL);
-		mode = MPOL_PREFERRED;
 	} else if (nodes_empty(*nodes))
 		return ERR_PTR(-EINVAL);
 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -427,6 +418,9 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 		.create = mpol_new_bind,
 		.rebind = mpol_rebind_nodemask,
 	},
+	[MPOL_LOCAL] = {
+		.rebind = mpol_rebind_default,
+	},
 };
 
 static int migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -1960,6 +1954,8 @@ unsigned int mempolicy_slab_node(void)
 							&policy->v.nodes);
 		return z->zone ? zone_to_nid(z->zone) : node;
 	}
+	case MPOL_LOCAL:
+		return node;
 
 	default:
 		BUG();
@@ -2084,6 +2080,11 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
 		*mask =  mempolicy->v.nodes;
 		break;
 
+	case MPOL_LOCAL:
+		nid = numa_node_id();
+		init_nodemask_of_node(mask, nid);
+		break;
+
 	default:
 		BUG();
 	}
@@ -2344,6 +2345,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 		if (a->flags & MPOL_F_LOCAL)
 			return true;
 		return a->v.preferred_node == b->v.preferred_node;
+	case MPOL_LOCAL:
+		return true;
 	default:
 		BUG();
 		return false;
@@ -2487,6 +2490,10 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 			polnid = pol->v.preferred_node;
 		break;
 
+	case MPOL_LOCAL:
+		polnid = numa_node_id();
+		break;
+
 	case MPOL_BIND:
 		/* Optimize placement among multiple nodes via NUMA balancing */
 		if (pol->flags & MPOL_F_MORON) {
@@ -2931,7 +2938,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
 		 */
 		if (nodelist)
 			goto out;
-		mode = MPOL_PREFERRED;
 		break;
 	case MPOL_DEFAULT:
 		/*
@@ -2975,7 +2981,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
 	else if (nodelist)
 		new->v.preferred_node = first_node(nodes);
 	else
-		new->flags |= MPOL_F_LOCAL;
+		new->mode = MPOL_LOCAL;
 
 	/*
 	 * Save nodes for contextualization: this will be used to "clone"
diff mbox series

Patch

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2b1e0e4..1228d8e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -31,6 +31,9 @@ 
  *                but useful to set in a VMA when you have a non default
  *                process policy.
  *
+ * preferred many Try a set of nodes first before normal fallback. This is
+ *                similar to preferred without the special case.
+ *
  * default        Allocate on the local node first, or when on a VMA
  *                use the process policy. This is what Linux always did
  *		  in a NUMA aware kernel and still does by, ahem, default.
@@ -105,6 +108,8 @@ 
 
 #include "internal.h"
 
+#define MPOL_PREFERRED_MANY MPOL_MAX
+
 /* Internal flags */
 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
@@ -175,7 +180,7 @@  struct mempolicy *get_task_policy(struct task_struct *p)
 static const struct mempolicy_operations {
 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
-} mpol_ops[MPOL_MAX];
+} mpol_ops[MPOL_MAX + 1];
 
 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 {
@@ -415,7 +420,7 @@  void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 	mmap_write_unlock(mm);
 }
 
-static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
+static const struct mempolicy_operations mpol_ops[MPOL_MAX + 1] = {
 	[MPOL_DEFAULT] = {
 		.rebind = mpol_rebind_default,
 	},
@@ -432,6 +437,10 @@  static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 		.rebind = mpol_rebind_nodemask,
 	},
 	/* [MPOL_LOCAL] - see mpol_new() */
+	[MPOL_PREFERRED_MANY] = {
+		.create = NULL,
+		.rebind = NULL,
+	},
 };
 
 static int migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -924,6 +933,9 @@  static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 	case MPOL_INTERLEAVE:
 		*nodes = p->v.nodes;
 		break;
+	case MPOL_PREFERRED_MANY:
+		*nodes = p->v.preferred_nodes;
+		break;
 	case MPOL_PREFERRED:
 		if (!(p->flags & MPOL_F_LOCAL))
 			*nodes = p->v.preferred_nodes;
@@ -1895,7 +1907,9 @@  nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 /* Return the node id preferred by the given mempolicy, or the given id */
 static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 {
-	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) {
+	if ((policy->mode == MPOL_PREFERRED ||
+	     policy->mode == MPOL_PREFERRED_MANY) &&
+	    !(policy->flags & MPOL_F_LOCAL)) {
 		nd = first_node(policy->v.preferred_nodes);
 	} else {
 		/*
@@ -1938,6 +1952,7 @@  unsigned int mempolicy_slab_node(void)
 		return node;
 
 	switch (policy->mode) {
+	case MPOL_PREFERRED_MANY:
 	case MPOL_PREFERRED:
 		/*
 		 * handled MPOL_F_LOCAL above
@@ -2072,6 +2087,9 @@  bool init_nodemask_of_mempolicy(nodemask_t *mask)
 	task_lock(current);
 	mempolicy = current->mempolicy;
 	switch (mempolicy->mode) {
+	case MPOL_PREFERRED_MANY:
+		*mask = mempolicy->v.preferred_nodes;
+		break;
 	case MPOL_PREFERRED:
 		if (mempolicy->flags & MPOL_F_LOCAL)
 			nid = numa_node_id();
@@ -2126,6 +2144,9 @@  bool mempolicy_nodemask_intersects(struct task_struct *tsk,
 		 * nodes in mask.
 		 */
 		break;
+	case MPOL_PREFERRED_MANY:
+		ret = nodes_intersects(mempolicy->v.preferred_nodes, *mask);
+		break;
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
@@ -2210,10 +2231,13 @@  alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		 * node and don't fall back to other nodes, as the cost of
 		 * remote accesses would likely offset THP benefits.
 		 *
-		 * If the policy is interleave, or does not allow the current
-		 * node in its nodemask, we allocate the standard way.
+		 * If the policy is interleave or multiple preferred nodes, or
+		 * does not allow the current node in its nodemask, we allocate
+		 * the standard way.
 		 */
-		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
+		if ((pol->mode == MPOL_PREFERRED ||
+		     pol->mode == MPOL_PREFERRED_MANY) &&
+		    !(pol->flags & MPOL_F_LOCAL))
 			hpage_node = first_node(pol->v.preferred_nodes);
 
 		nmask = policy_nodemask(gfp, pol);
@@ -2349,6 +2373,9 @@  bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
 		return !!nodes_equal(a->v.nodes, b->v.nodes);
+	case MPOL_PREFERRED_MANY:
+		return !!nodes_equal(a->v.preferred_nodes,
+				     b->v.preferred_nodes);
 	case MPOL_PREFERRED:
 		/* a's ->flags is the same as b's */
 		if (a->flags & MPOL_F_LOCAL)
@@ -2523,6 +2550,8 @@  int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		polnid = zone_to_nid(z->zone);
 		break;
 
+		/* case MPOL_PREFERRED_MANY: */
+
 	default:
 		BUG();
 	}
@@ -2874,6 +2903,7 @@  static const char * const policy_modes[] =
 	[MPOL_BIND]       = "bind",
 	[MPOL_INTERLEAVE] = "interleave",
 	[MPOL_LOCAL]      = "local",
+	[MPOL_PREFERRED_MANY]  = "prefer (many)",
 };
 
 
@@ -2953,6 +2983,7 @@  int mpol_parse_str(char *str, struct mempolicy **mpol)
 		if (!nodelist)
 			err = 0;
 		goto out;
+	case MPOL_PREFERRED_MANY:
 	case MPOL_BIND:
 		/*
 		 * Insist on a nodelist
@@ -3035,6 +3066,9 @@  void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	switch (mode) {
 	case MPOL_DEFAULT:
 		break;
+	case MPOL_PREFERRED_MANY:
+		WARN_ON(flags & MPOL_F_LOCAL);
+		fallthrough;
 	case MPOL_PREFERRED:
 		if (flags & MPOL_F_LOCAL)
 			mode = MPOL_LOCAL;