@@ -22,6 +22,7 @@ enum {
MPOL_BIND,
MPOL_INTERLEAVE,
MPOL_LOCAL,
+ MPOL_PREFERRED_MANY,
MPOL_MAX, /* always last member of enum */
};
@@ -31,6 +31,9 @@
* but useful to set in a VMA when you have a non default
* process policy.
*
+ * preferred many Try a set of nodes first before normal fallback. This is
+ * similar to preferred without the special case.
+ *
* default Allocate on the local node first, or when on a VMA
* use the process policy. This is what Linux always did
* in a NUMA aware kernel and still does by, ahem, default.
@@ -207,6 +210,14 @@ static int mpol_new_preferred(struct mem
return 0;
}
+static int mpol_new_preferred_many(struct mempolicy *pol, const nodemask_t *nodes)
+{
+ if (nodes_empty(*nodes))
+ return -EINVAL;
+ pol->nodes = *nodes;
+ return 0;
+}
+
static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
if (nodes_empty(*nodes))
@@ -408,6 +419,10 @@ static const struct mempolicy_operations
[MPOL_LOCAL] = {
.rebind = mpol_rebind_default,
},
+ [MPOL_PREFERRED_MANY] = {
+ .create = mpol_new_preferred_many,
+ .rebind = mpol_rebind_preferred,
+ },
};
static int migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -900,6 +915,7 @@ static void get_policy_nodemask(struct m
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
+ case MPOL_PREFERRED_MANY:
*nodes = p->nodes;
break;
case MPOL_LOCAL:
@@ -1446,7 +1462,13 @@ static inline int sanitize_mpol_flags(in
{
*flags = *mode & MPOL_MODE_FLAGS;
*mode &= ~MPOL_MODE_FLAGS;
- if ((unsigned int)(*mode) >= MPOL_MAX)
+
+ /*
+ * The check should be 'mode >= MPOL_MAX', but as 'prefer_many'
+ * is not fully implemented, don't permit it to be used for now,
+ * and the logic will be restored in following patch
+ */
+ if ((unsigned int)(*mode) >= MPOL_PREFERRED_MANY)
return -EINVAL;
if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
return -EINVAL;
@@ -1875,16 +1897,27 @@ static int apply_policy_zone(struct memp
*/
nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
+ int mode = policy->mode;
+
/* Lower zones don't get a nodemask applied for MPOL_BIND */
- if (unlikely(policy->mode == MPOL_BIND) &&
- apply_policy_zone(policy, gfp_zone(gfp)) &&
- cpuset_nodemask_valid_mems_allowed(&policy->nodes))
+ if (unlikely(mode == MPOL_BIND) &&
+ apply_policy_zone(policy, gfp_zone(gfp)) &&
+ cpuset_nodemask_valid_mems_allowed(&policy->nodes))
+ return &policy->nodes;
+
+ if (mode == MPOL_PREFERRED_MANY)
return &policy->nodes;
return NULL;
}
-/* Return the node id preferred by the given mempolicy, or the given id */
+/*
+ * Return the preferred node id for 'prefer' mempolicy, and return
+ * the given id for all other policies.
+ *
+ * policy_node() is always coupled with policy_nodemask(), which
+ * secures the nodemask limit for 'bind' and 'prefer-many' policy.
+ */
static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
{
if (policy->mode == MPOL_PREFERRED) {
@@ -1936,7 +1969,9 @@ unsigned int mempolicy_slab_node(void)
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
- case MPOL_BIND: {
+ case MPOL_BIND:
+ case MPOL_PREFERRED_MANY:
+ {
struct zoneref *z;
/*
@@ -2008,12 +2043,12 @@ static inline unsigned interleave_nid(st
* @addr: address in @vma for shared policy lookup and interleave policy
* @gfp_flags: for requested zone
* @mpol: pointer to mempolicy pointer for reference counted mempolicy
- * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
+ * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
*
* Returns a nid suitable for a huge page allocation and a pointer
* to the struct mempolicy for conditional unref after allocation.
- * If the effective policy is 'BIND, returns a pointer to the mempolicy's
- * @nodemask for filtering the zonelist.
+ * If the effective policy is 'bind' or 'prefer-many', returns a pointer
+ * to the mempolicy's @nodemask for filtering the zonelist.
*
* Must be protected by read_mems_allowed_begin()
*/
@@ -2021,16 +2056,18 @@ int huge_node(struct vm_area_struct *vma
struct mempolicy **mpol, nodemask_t **nodemask)
{
int nid;
+ int mode;
*mpol = get_vma_policy(vma, addr);
- *nodemask = NULL; /* assume !MPOL_BIND */
+ *nodemask = NULL;
+ mode = (*mpol)->mode;
- if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
+ if (unlikely(mode == MPOL_INTERLEAVE)) {
nid = interleave_nid(*mpol, vma, addr,
huge_page_shift(hstate_vma(vma)));
} else {
nid = policy_node(gfp_flags, *mpol, numa_node_id());
- if ((*mpol)->mode == MPOL_BIND)
+ if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
*nodemask = &(*mpol)->nodes;
}
return nid;
@@ -2063,6 +2100,7 @@ bool init_nodemask_of_mempolicy(nodemask
mempolicy = current->mempolicy;
switch (mempolicy->mode) {
case MPOL_PREFERRED:
+ case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
*mask = mempolicy->nodes;
@@ -2173,7 +2211,7 @@ struct page *alloc_pages_vma(gfp_t gfp,
* node and don't fall back to other nodes, as the cost of
* remote accesses would likely offset THP benefits.
*
- * If the policy is interleave, or does not allow the current
+ * If the policy is interleave or does not allow the current
* node in its nodemask, we allocate the standard way.
*/
if (pol->mode == MPOL_PREFERRED)
@@ -2311,6 +2349,7 @@ bool __mpol_equal(struct mempolicy *a, s
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
+ case MPOL_PREFERRED_MANY:
return !!nodes_equal(a->nodes, b->nodes);
case MPOL_LOCAL:
return true;
@@ -2451,6 +2490,8 @@ int mpol_misplaced(struct page *page, st
break;
case MPOL_PREFERRED:
+ if (node_isset(curnid, pol->nodes))
+ goto out;
polnid = first_node(pol->nodes);
break;
@@ -2465,9 +2506,10 @@ int mpol_misplaced(struct page *page, st
break;
goto out;
}
+ fallthrough;
+ case MPOL_PREFERRED_MANY:
/*
- * allows binding to multiple nodes.
* use current page if in policy nodemask,
* else select nearest allowed node, if any.
* If no allowed nodes, use current [!misplaced].
@@ -2829,6 +2871,7 @@ static const char * const policy_modes[]
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
[MPOL_LOCAL] = "local",
+ [MPOL_PREFERRED_MANY] = "prefer (many)",
};
@@ -2907,6 +2950,7 @@ int mpol_parse_str(char *str, struct mem
if (!nodelist)
err = 0;
goto out;
+ case MPOL_PREFERRED_MANY:
case MPOL_BIND:
/*
* Insist on a nodelist
@@ -2993,6 +3037,7 @@ void mpol_to_str(char *buffer, int maxle
case MPOL_LOCAL:
break;
case MPOL_PREFERRED:
+ case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
nodes = pol->nodes;