| Message ID | 20231031003810.4532-5-gregory.price@memverge.com |
| --- | --- |
| State | New, archived |
| Series | Node Weights and Weighted Interleave |
On 10/31/2023 6:08 AM, Gregory Price wrote:
>
> The node subsystem implements interleave weighting for the purpose
> of bandwidth optimization. Each node may have different weights in
> relation to each compute node ("access node").
>
> The mempolicy MPOL_INTERLEAVE utilizes the node weights to implement
> weighted interleave. By default, since all nodes default to a weight
> of 1, the original interleave behavior is retained.
>
> Examples
>
> Weight settings:
> echo 4 > node0/access0/il_weight
> echo 1 > node0/access1/il_weight
>
> echo 3 > node1/access0/il_weight
> echo 2 > node1/access1/il_weight
>
> Results:
>
> Task A:
>   cpunode:  0
>   nodemask: [0,1]
>   weights:  [4,3]
>   allocation result: [0,0,0,0,1,1,1 repeat]
>
> Task B:
>   cpunode:  1
>   nodemask: [0,1]
>   weights:  [1,2]
>   allocation result: [0,1,1 repeat]
>
> Weights are relative to access node
>
> Signed-off-by: Gregory Price <gregory.price@memverge.com>

Thank you Gregory for the collaboration.

Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>

> ---
>  include/linux/mempolicy.h |   4 ++
>  mm/mempolicy.c            | 138 +++++++++++++++++++++++++++++---------
>  2 files changed, 112 insertions(+), 30 deletions(-)
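Before diving into the diff below, the weighted round-robin that the reworked interleave_nodes() performs can be modelled in isolation. The following is a minimal user-space sketch, not kernel code: the two-element il_weights array, the cur_weight counter, and il_prev are illustrative stand-ins for pol->il_weights, pol->cur_weight, task->il_prev, and node_get_il_weight(). Under the Task A settings quoted above (weights [4,3] over nodes [0,1]) it reproduces the advertised [0,0,0,0,1,1,1] pattern.

```c
#include <stdio.h>

/*
 * User-space model of the weighted interleave in interleave_nodes().
 * All names here are illustrative stand-ins, not kernel symbols:
 * il_weights[] plays the role of pol->il_weights / node_get_il_weight(),
 * cur_weight plays pol->cur_weight, and il_prev plays task->il_prev.
 */
static unsigned char il_weights[2] = { 4, 3 };  /* Task A: node0=4, node1=3 */
static unsigned char cur_weight;
static int il_prev = 1;                 /* "before node 0", like MAX_NUMNODES-1 */

static int pick_node(void)
{
        int next = (il_prev + 1) % 2;   /* next_node_in() over nodes {0,1} */

        if (!cur_weight) {
                unsigned char w = il_weights[next];
                cur_weight = w ? w : 1; /* a set node gets at least one page */
        }

        cur_weight--;
        if (!cur_weight)                /* weight spent: advance to next node */
                il_prev = next;

        return next;
}

int main(void)
{
        /* Prints 0 0 0 0 1 1 1 0 0 0 0 1 1 1, matching the changelog. */
        for (int i = 0; i < 14; i++)
                printf("%d ", pick_node());
        putchar('\n');
        return 0;
}
```

Compiling and running this prints the repeating 0 0 0 0 1 1 1 sequence; setting both weights to 1 recovers the classic round-robin behaviour.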
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index d232de7cdc56..240468b669fd 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -48,6 +48,10 @@ struct mempolicy {
 	nodemask_t nodes;	/* interleave/bind/perfer */
 	int home_node;		/* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
 
+	/* weighted interleave settings */
+	unsigned char cur_weight;
+	unsigned char il_weights[MAX_NUMNODES];
+
 	union {
 		nodemask_t cpuset_mems_allowed;	/* relative to these nodes */
 		nodemask_t user_nodemask;	/* nodemask passed by user */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 29ebf1e7898c..d62e942a13bd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -102,6 +102,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/printk.h>
 #include <linux/swapops.h>
+#include <linux/memory-tiers.h>
 
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
@@ -300,6 +301,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	policy->mode = mode;
 	policy->flags = flags;
 	policy->home_node = NUMA_NO_NODE;
+	policy->cur_weight = 0;
 
 	return policy;
 }
@@ -334,6 +336,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 		tmp = *nodes;
 
 	pol->nodes = tmp;
+	pol->cur_weight = 0;
 }
 
 static void mpol_rebind_preferred(struct mempolicy *pol,
@@ -881,8 +884,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 
 	old = current->mempolicy;
 	current->mempolicy = new;
-	if (new && new->mode == MPOL_INTERLEAVE)
+	if (new && new->mode == MPOL_INTERLEAVE) {
 		current->il_prev = MAX_NUMNODES-1;
+		new->cur_weight = 0;
+	}
+
 	task_unlock(current);
 	mpol_put(old);
 	ret = 0;
@@ -1903,12 +1909,21 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 /* Do dynamic interleaving for a process */
 static unsigned interleave_nodes(struct mempolicy *policy)
 {
-	unsigned next;
+	unsigned int next;
+	unsigned char next_weight;
 	struct task_struct *me = current;
 
 	next = next_node_in(me->il_prev, policy->nodes);
-	if (next < MAX_NUMNODES)
+	if (!policy->cur_weight) {
+		/* If the node is set, at least 1 allocation is required */
+		next_weight = node_get_il_weight(next, numa_node_id());
+		policy->cur_weight = next_weight ? next_weight : 1;
+	}
+
+	policy->cur_weight--;
+	if (next < MAX_NUMNODES && !policy->cur_weight)
 		me->il_prev = next;
+
 	return next;
 }
 
@@ -1967,25 +1982,37 @@ unsigned int mempolicy_slab_node(void)
 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 {
 	nodemask_t nodemask = pol->nodes;
-	unsigned int target, nnodes;
-	int i;
+	unsigned int target, nnodes, il_weight;
+	unsigned char weight;
 	int nid;
+	int cur_node = numa_node_id();
+
 	/*
 	 * The barrier will stabilize the nodemask in a register or on
 	 * the stack so that it will stop changing under the code.
 	 *
 	 * Between first_node() and next_node(), pol->nodes could be changed
 	 * by other threads. So we put pol->nodes in a local stack.
+	 *
+	 * Additionally, place the cur_node on the stack in case of a migration
 	 */
 	barrier();
 
 	nnodes = nodes_weight(nodemask);
 	if (!nnodes)
-		return numa_node_id();
-	target = (unsigned int)n % nnodes;
+		return cur_node;
+
+	il_weight = nodes_get_il_weights(cur_node, &nodemask, pol->il_weights);
+	target = (unsigned int)n % il_weight;
 	nid = first_node(nodemask);
-	for (i = 0; i < target; i++)
-		nid = next_node(nid, nodemask);
+	while (target) {
+		weight = pol->il_weights[nid];
+		if (target < weight)
+			break;
+		target -= weight;
+		nid = next_node_in(nid, nodemask);
+	}
+
 	return nid;
 }
 
@@ -2319,32 +2346,83 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
 		struct mempolicy *pol, unsigned long nr_pages,
 		struct page **page_array)
 {
-	int nodes;
-	unsigned long nr_pages_per_node;
-	int delta;
-	int i;
-	unsigned long nr_allocated;
+	struct task_struct *me = current;
 	unsigned long total_allocated = 0;
+	unsigned long nr_allocated;
+	unsigned long rounds;
+	unsigned long node_pages, delta;
+	unsigned char weight;
+	unsigned long il_weight;
+	unsigned long req_pages = nr_pages;
+	int nnodes, node, prev_node;
+	int cur_node = numa_node_id();
+	int i;
 
-	nodes = nodes_weight(pol->nodes);
-	nr_pages_per_node = nr_pages / nodes;
-	delta = nr_pages - nodes * nr_pages_per_node;
-
-	for (i = 0; i < nodes; i++) {
-		if (delta) {
-			nr_allocated = __alloc_pages_bulk(gfp,
-					interleave_nodes(pol), NULL,
-					nr_pages_per_node + 1, NULL,
-					page_array);
-			delta--;
-		} else {
-			nr_allocated = __alloc_pages_bulk(gfp,
-					interleave_nodes(pol), NULL,
-					nr_pages_per_node, NULL, page_array);
+	prev_node = me->il_prev;
+	nnodes = nodes_weight(pol->nodes);
+	/* Continue allocating from most recent node */
+	if (pol->cur_weight) {
+		node = next_node_in(prev_node, pol->nodes);
+		node_pages = pol->cur_weight;
+		if (node_pages > nr_pages)
+			node_pages = nr_pages;
+		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+						  NULL, page_array);
+		page_array += nr_allocated;
+		total_allocated += nr_allocated;
+		/* if that's all the pages, no need to interleave */
+		if (req_pages <= pol->cur_weight) {
+			pol->cur_weight -= req_pages;
+			return total_allocated;
 		}
-
+		/* Otherwise we adjust req_pages down, and continue from there */
+		req_pages -= pol->cur_weight;
+		pol->cur_weight = 0;
+		prev_node = node;
+	}
+
+	il_weight = nodes_get_il_weights(cur_node, &pol->nodes,
+					 pol->il_weights);
+	rounds = req_pages / il_weight;
+	delta = req_pages % il_weight;
+	for (i = 0; i < nnodes; i++) {
+		node = next_node_in(prev_node, pol->nodes);
+		weight = pol->il_weights[node];
+		node_pages = weight * rounds;
+		if (delta > weight) {
+			node_pages += weight;
+			delta -= weight;
+		} else if (delta) {
+			node_pages += delta;
+			delta = 0;
+		}
+		/* The number of requested pages may not hit every node */
+		if (!node_pages)
+			break;
+		/* If an over-allocation would occur, floor it */
+		if (node_pages + total_allocated > nr_pages) {
+			node_pages = nr_pages - total_allocated;
+			delta = 0;
+		}
+		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+						  NULL, page_array);
 		page_array += nr_allocated;
 		total_allocated += nr_allocated;
+		prev_node = node;
+	}
+
+	/*
+	 * Finally, we need to update me->il_prev and pol->cur_weight
+	 * If the last node allocated on has un-used weight, apply
+	 * the remainder as the cur_weight, otherwise proceed to next node
+	 */
+	if (node_pages) {
+		me->il_prev = prev_node;
+		node_pages %= weight;
+		pol->cur_weight = weight - node_pages;
+	} else {
+		me->il_prev = node;
+		pol->cur_weight = 0;
 	}
 
 	return total_allocated;
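The trickiest part of the hunk above is the rounds/delta arithmetic in alloc_pages_bulk_array_interleave(): whole multiples of the summed weight go to every node, and the remainder is handed out in node order, capped at each node's weight. The sketch below reproduces only that arithmetic in user space; split_pages() and its plain weight array are hypothetical stand-ins (no nodemask iteration, no __alloc_pages_bulk(), no cur_weight carry-over) meant to show where each requested page would land.

```c
#include <stdio.h>

/*
 * Illustrative split of req_pages across nodes by weight, mirroring the
 * rounds/delta logic in the bulk interleave path. "weights" is a plain
 * array indexed in nodemask order; the kernel instead walks pol->nodes
 * with next_node_in() and reads pol->il_weights[node].
 */
static void split_pages(unsigned long req_pages,
                        const unsigned char *weights, int nnodes,
                        unsigned long *node_pages)
{
        unsigned long il_weight = 0;
        unsigned long rounds, delta;
        int i;

        for (i = 0; i < nnodes; i++)
                il_weight += weights[i];        /* nodes_get_il_weights() analogue */

        rounds = req_pages / il_weight;         /* full passes over all nodes */
        delta  = req_pages % il_weight;         /* leftover pages */

        for (i = 0; i < nnodes; i++) {
                node_pages[i] = weights[i] * rounds;
                if (delta > weights[i]) {       /* node absorbs a full extra weight */
                        node_pages[i] += weights[i];
                        delta -= weights[i];
                } else if (delta) {             /* last node takes the tail */
                        node_pages[i] += delta;
                        delta = 0;
                }
        }
}

int main(void)
{
        unsigned char weights[2] = { 4, 3 };    /* Task A weights from the changelog */
        unsigned long pages[2];

        split_pages(100, weights, 2, pages);
        /* 100 = 14 * (4 + 3) + 2 -> node0 gets 58 pages, node1 gets 42. */
        printf("node0: %lu  node1: %lu\n", pages[0], pages[1]);
        return 0;
}
```

The kernel additionally caps node_pages so the loop never allocates past nr_pages and carries any unused weight of the last node into pol->cur_weight; the sketch omits both for brevity.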
The node subsystem implements interleave weighting for the purpose
of bandwidth optimization. Each node may have different weights in
relation to each compute node ("access node").

The mempolicy MPOL_INTERLEAVE utilizes the node weights to implement
weighted interleave. By default, since all nodes default to a weight
of 1, the original interleave behavior is retained.

Examples

Weight settings:
  echo 4 > node0/access0/il_weight
  echo 1 > node0/access1/il_weight

  echo 3 > node1/access0/il_weight
  echo 2 > node1/access1/il_weight

Results:

Task A:
  cpunode:  0
  nodemask: [0,1]
  weights:  [4,3]
  allocation result: [0,0,0,0,1,1,1 repeat]

Task B:
  cpunode:  1
  nodemask: [0,1]
  weights:  [1,2]
  allocation result: [0,1,1 repeat]

Weights are relative to access node

Signed-off-by: Gregory Price <gregory.price@memverge.com>
---
 include/linux/mempolicy.h |   4 ++
 mm/mempolicy.c            | 138 +++++++++++++++++++++++++++++---------
 2 files changed, 112 insertions(+), 30 deletions(-)
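For completeness, this is roughly how a task would consume the feature from user space once the weights are configured: write the il_weight files via sysfs, then select MPOL_INTERLEAVE as before. The snippet is a hedged sketch, assuming the files live under /sys/devices/system/node/nodeN/accessM/ (the changelog only shows relative paths) and that the libnuma numaif.h wrapper for set_mempolicy(2) is available (link with -lnuma).

```c
#include <numaif.h>     /* set_mempolicy(), MPOL_INTERLEAVE (libnuma wrapper) */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        /*
         * Weights are configured out of band via sysfs, e.g. (full paths
         * assumed from the relative ones shown in the changelog):
         *   echo 4 > /sys/devices/system/node/node0/access0/il_weight
         *   echo 3 > /sys/devices/system/node/node1/access0/il_weight
         * With those settings, a task on CPU node 0 interleaving over
         * nodes 0 and 1 should see pages placed in a 4:3 ratio.
         */
        unsigned long nodemask = (1UL << 0) | (1UL << 1);   /* nodes 0 and 1 */

        if (set_mempolicy(MPOL_INTERLEAVE, &nodemask,
                          8 * sizeof(nodemask)) != 0) {
                perror("set_mempolicy");
                return EXIT_FAILURE;
        }

        /* Pages backing this region are distributed on first touch. */
        char *buf = malloc(64UL << 20);         /* 64 MiB */
        if (!buf)
                return EXIT_FAILURE;
        for (size_t i = 0; i < (64UL << 20); i += 4096)
                buf[i] = 1;                     /* fault the pages in */

        free(buf);
        return EXIT_SUCCESS;
}
```

Existing binaries keep the familiar 1:1 interleave automatically, since every node's weight defaults to 1.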