@@ -250,6 +250,17 @@ MPOL_PREFERRED_MANY
can fall back to all existing numa nodes. This is effectively
MPOL_PREFERRED allowed for a mask rather than a single node.
+MPOL_WEIGHTED_INTERLEAVE
+	This mode operates the same as MPOL_INTERLEAVE, except that
+	page allocations are interleaved according to the per-node
+	weights set in /sys/kernel/mm/mempolicy/weighted_interleave/
+
+	Weighted interleave allocates pages on nodes according to
+	their weight. For example, if nodes [0,1] are weighted [5,2]
+ respectively, 5 pages will be allocated on node0 for every
+ 2 pages allocated on node1. This can better distribute data
+ according to bandwidth on heterogeneous memory systems.
+
NUMA memory policy supports the following optional mode flags:
MPOL_F_STATIC_NODES
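
As an illustration (a hypothetical usage sketch, not part of this patch), a
task could opt into the new mode with set_mempolicy(2) roughly as follows.
The MPOL_WEIGHTED_INTERLEAVE constant is defined by hand on the assumption
that installed uapi headers predate this series; the value 6 matches the
mode's position in the enum extended below, and the per-node weight files
under /sys/kernel/mm/mempolicy/weighted_interleave/ are assumed to be
provided elsewhere in the series.

    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef MPOL_WEIGHTED_INTERLEAVE
    #define MPOL_WEIGHTED_INTERLEAVE 6	/* assumed: follows MPOL_PREFERRED_MANY */
    #endif

    int main(void)
    {
    	unsigned long nodemask = 0x3;	/* interleave across nodes 0 and 1 */

    	/* maxnode must exceed the highest node bit we care about */
    	if (syscall(SYS_set_mempolicy, MPOL_WEIGHTED_INTERLEAVE,
    		    &nodemask, 8 * sizeof(nodemask) + 1) == -1) {
    		perror("set_mempolicy");
    		return EXIT_FAILURE;
    	}

    	/*
    	 * Pages faulted in from here on are distributed according to
    	 * the sysfs weights (e.g. 5:2 between node0 and node1 for the
    	 * example in the documentation above).
    	 */
    	return EXIT_SUCCESS;
    }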
@@ -54,6 +54,11 @@ struct mempolicy {
nodemask_t cpuset_mems_allowed; /* relative to these nodes */
nodemask_t user_nodemask; /* nodemask passed by user */
} w;
+
+ /* Weighted interleave settings */
+ struct {
+		/* pages still owed to next_node_in(il_prev, nodes) */
+		unsigned char cur_weight;
+ } wil;
};
/*
@@ -23,6 +23,7 @@ enum {
MPOL_INTERLEAVE,
MPOL_LOCAL,
MPOL_PREFERRED_MANY,
+ MPOL_WEIGHTED_INTERLEAVE,
MPOL_MAX, /* always last member of enum */
};
@@ -305,6 +305,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
policy->mode = mode;
policy->flags = flags;
policy->home_node = NUMA_NO_NODE;
+ policy->wil.cur_weight = 0;
return policy;
}
@@ -417,6 +418,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.create = mpol_new_nodemask,
.rebind = mpol_rebind_preferred,
},
+ [MPOL_WEIGHTED_INTERLEAVE] = {
+ .create = mpol_new_nodemask,
+ .rebind = mpol_rebind_nodemask,
+ },
};
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
@@ -838,7 +843,8 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
old = current->mempolicy;
current->mempolicy = new;
- if (new && new->mode == MPOL_INTERLEAVE)
+ if (new && (new->mode == MPOL_INTERLEAVE ||
+ new->mode == MPOL_WEIGHTED_INTERLEAVE))
current->il_prev = MAX_NUMNODES-1;
task_unlock(current);
mpol_put(old);
@@ -864,6 +870,7 @@ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
+ case MPOL_WEIGHTED_INTERLEAVE:
*nodes = pol->nodes;
break;
case MPOL_LOCAL:
@@ -948,6 +955,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
} else if (pol == current->mempolicy &&
pol->mode == MPOL_INTERLEAVE) {
*policy = next_node_in(current->il_prev, pol->nodes);
+		} else if (pol == current->mempolicy &&
+			   pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
+			/*
+			 * A nonzero cur_weight is always owed to
+			 * next_node_in(il_prev), so the next interleave
+			 * target is the same whether or not a weight is
+			 * still being drained.
+			 */
+			*policy = next_node_in(current->il_prev,
+					       pol->nodes);
} else {
err = -EINVAL;
goto out;
@@ -1777,7 +1791,8 @@ struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
pol = __get_vma_policy(vma, addr, ilx);
if (!pol)
pol = get_task_policy(current);
- if (pol->mode == MPOL_INTERLEAVE) {
+ if (pol->mode == MPOL_INTERLEAVE ||
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
*ilx += vma->vm_pgoff >> order;
*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
}
@@ -1827,6 +1842,24 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
return zone >= dynamic_policy_zone;
}
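+/*
+ * Do dynamic "weighted" interleaving for a process: hand out
+ * iw_table[node] consecutive pages on a node before advancing to the
+ * next one.  While a weight is being drained, me->il_prev remains the
+ * *previous* node and wil.cur_weight holds the pages still owed to
+ * next_node_in(il_prev).  E.g. nodes [0,1] weighted [3,1] yield the
+ * sequence 0,0,0,1,0,0,0,1,...
+ */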
+static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
+{
+ unsigned int next;
+ struct task_struct *me = current;
+
+ next = next_node_in(me->il_prev, policy->nodes);
+ if (next == MAX_NUMNODES)
+ return next;
+
+ if (!policy->wil.cur_weight)
+ policy->wil.cur_weight = iw_table[next];
+
+ policy->wil.cur_weight--;
+ if (!policy->wil.cur_weight)
+ me->il_prev = next;
+ return next;
+}
+
/* Do dynamic interleaving for a process */
static unsigned int interleave_nodes(struct mempolicy *policy)
{
@@ -1861,6 +1894,9 @@ unsigned int mempolicy_slab_node(void)
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
+ case MPOL_WEIGHTED_INTERLEAVE:
+ return weighted_interleave_nodes(policy);
+
case MPOL_BIND:
case MPOL_PREFERRED_MANY:
{
@@ -1885,6 +1921,41 @@ unsigned int mempolicy_slab_node(void)
}
}
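+/*
+ * Do static weighted interleaving for weighted interleave index @ilx.
+ * The weights form a repeating pattern of weight_total slots; return
+ * the node that owns slot (ilx % weight_total).  E.g. nodes [0,1]
+ * weighted [3,1] map ilx 0-2 to node 0 and ilx 3 to node 1, after
+ * which the pattern repeats.
+ */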
+static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
+{
+ nodemask_t nodemask = pol->nodes;
+ unsigned int target, weight_total = 0;
+ int nid;
+ unsigned char weights[MAX_NUMNODES];
+ unsigned char weight;
+
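+	/* Stabilize the nodemask on the stack */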
+ barrier();
+
+	/* First, ensure we have a valid nodemask */
+ nid = first_node(nodemask);
+ if (nid == MAX_NUMNODES)
+ return nid;
+
+ /* Then collect weights on stack and calculate totals */
+ for_each_node_mask(nid, nodemask) {
+ weight = iw_table[nid];
+ weight_total += weight;
+ weights[nid] = weight;
+ }
+
+ /* Finally, calculate the node offset based on totals */
+ target = (unsigned int)ilx % weight_total;
+ nid = first_node(nodemask);
+ while (target) {
+ weight = weights[nid];
+ if (target < weight)
+ break;
+ target -= weight;
+ nid = next_node_in(nid, nodemask);
+ }
+ return nid;
+}
+
/*
* Do static interleaving for interleave index @ilx. Returns the ilx'th
* node in pol->nodes (starting from ilx=0), wrapping around if ilx
@@ -1953,6 +2024,11 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
*nid = (ilx == NO_INTERLEAVE_INDEX) ?
interleave_nodes(pol) : interleave_nid(pol, ilx);
break;
+	case MPOL_WEIGHTED_INTERLEAVE:
+		/* Cursor-based for task context, stateless for a VMA index */
+		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
+			weighted_interleave_nodes(pol) :
+			weighted_interleave_nid(pol, ilx);
+		break;
}
return nodemask;
@@ -2014,6 +2090,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
*mask = mempolicy->nodes;
break;
@@ -2113,7 +2190,8 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
* If the policy is interleave or does not allow the current
* node in its nodemask, we allocate the standard way.
*/
- if (pol->mode != MPOL_INTERLEAVE &&
+ if ((pol->mode != MPOL_INTERLEAVE &&
+ pol->mode != MPOL_WEIGHTED_INTERLEAVE) &&
(!nodemask || node_isset(nid, *nodemask))) {
/*
* First, try to allocate THP only on local node, but
@@ -2249,6 +2327,106 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
return total_allocated;
}
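+/*
+ * Bulk-allocate in weighted-interleave order: first drain any weight
+ * left in progress by a previous allocation, then hand out whole
+ * rounds of weight_total pages plus a partial last round (delta),
+ * and finally record where the next allocation must resume.  E.g.
+ * nodes [0,1] weighted [3,1] with nr_pages = 10: rounds = 10 / 4 = 2
+ * and delta = 10 % 4 = 2, so node 0 gets 3 * 2 + 2 = 8 pages and
+ * node 1 gets 1 * 2 = 2 pages, leaving node 0 owed 1 more page.
+ */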
+static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
+ struct mempolicy *pol, unsigned long nr_pages,
+ struct page **page_array)
+{
+ struct task_struct *me = current;
+ unsigned long total_allocated = 0;
+ unsigned long nr_allocated;
+ unsigned long rounds;
+ unsigned long node_pages, delta;
+ unsigned char weight;
+ unsigned char weights[MAX_NUMNODES];
+ unsigned int weight_total = 0;
+ unsigned long rem_pages = nr_pages;
+ nodemask_t nodes = pol->nodes;
+	int nnodes, node, prev_node = me->il_prev;
+	int resume_node;
+	unsigned char resume_weight;
+	int i;
+
+ /* Stabilize the nodemask on the stack */
+ barrier();
+
+ nnodes = nodes_weight(nodes);
+
+ /* Collect weights and save them on stack so they don't change */
+ for_each_node_mask(node, nodes) {
+ weight = iw_table[node];
+ weight_total += weight;
+ weights[node] = weight;
+ }
+
+	/* Continue allocating from the most recent node and adjust rem_pages */
+ if (pol->wil.cur_weight) {
+ node = next_node_in(me->il_prev, nodes);
+ node_pages = pol->wil.cur_weight;
+ if (node_pages > rem_pages)
+ node_pages = rem_pages;
+ nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+ NULL, page_array);
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+		/* if that's all the pages, no need to interleave */
+		if (rem_pages <= pol->wil.cur_weight) {
+			pol->wil.cur_weight -= rem_pages;
+			/* if the node's weight is drained, advance the cursor */
+			if (!pol->wil.cur_weight)
+				me->il_prev = node;
+			return total_allocated;
+		}
+		/* Otherwise adjust rem_pages down and continue from there */
+		rem_pages -= pol->wil.cur_weight;
+		pol->wil.cur_weight = 0;
+		prev_node = node;
+ }
+
+	/* Now we can continue allocating as if from 0 instead of an offset */
+	rounds = rem_pages / weight_total;
+	delta = rem_pages % weight_total;
+	/* Unless delta lands mid-node below, resume cleanly after prev_node */
+	resume_node = prev_node;
+	resume_weight = 0;
+ for (i = 0; i < nnodes; i++) {
+ node = next_node_in(prev_node, nodes);
+ weight = weights[node];
+ node_pages = weight * rounds;
+		if (delta) {
+			if (delta > weight) {
+				node_pages += weight;
+				delta -= weight;
+			} else if (delta == weight) {
+				/* this node finishes its weight exactly */
+				node_pages += delta;
+				resume_node = node;
+				resume_weight = 0;
+				delta = 0;
+			} else {
+				/*
+				 * delta lands mid-weight: the next
+				 * allocation resumes on this node, i.e.
+				 * next_node_in(resume_node), with the
+				 * remainder of its weight.
+				 */
+				node_pages += delta;
+				resume_node = prev_node;
+				resume_weight = weight - delta;
+				delta = 0;
+			}
+		}
+ /* We may not make it all the way around */
+ if (!node_pages)
+ break;
+ /* If an over-allocation would occur, floor it */
+ if (node_pages + total_allocated > nr_pages) {
+ node_pages = nr_pages - total_allocated;
+ delta = 0;
+ }
+ nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+ NULL, page_array);
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+ prev_node = node;
+ }
+
+	/*
+	 * Finally, record where the next allocation must resume: a nonzero
+	 * resume_weight is owed to next_node_in(resume_node), matching the
+	 * invariant weighted_interleave_nodes() maintains.
+	 */
+	me->il_prev = resume_node;
+	pol->wil.cur_weight = resume_weight;
+
+ return total_allocated;
+}
+
static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
@@ -2289,6 +2467,11 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
return alloc_pages_bulk_array_interleave(gfp, pol,
nr_pages, page_array);
+ if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+ return alloc_pages_bulk_array_weighted_interleave(gfp, pol,
+ nr_pages,
+ page_array);
+
if (pol->mode == MPOL_PREFERRED_MANY)
return alloc_pages_bulk_array_preferred_many(gfp,
numa_node_id(), pol, nr_pages, page_array);
@@ -2364,6 +2547,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
+ case MPOL_WEIGHTED_INTERLEAVE:
return !!nodes_equal(a->nodes, b->nodes);
case MPOL_LOCAL:
return true;
@@ -2500,6 +2684,10 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
polnid = interleave_nid(pol, ilx);
break;
+ case MPOL_WEIGHTED_INTERLEAVE:
+ polnid = weighted_interleave_nid(pol, ilx);
+ break;
+
case MPOL_PREFERRED:
if (node_isset(curnid, pol->nodes))
goto out;
@@ -2874,6 +3062,7 @@ static const char * const policy_modes[] =
[MPOL_PREFERRED] = "prefer",
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
+ [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
[MPOL_LOCAL] = "local",
[MPOL_PREFERRED_MANY] = "prefer (many)",
};
@@ -2933,6 +3122,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
}
break;
case MPOL_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
/*
* Default to online nodes with memory if no nodelist
*/
@@ -3043,6 +3233,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
nodes = pol->nodes;
break;
default: