diff mbox series

[RFC,v3,4/4] mm/mempolicy: modify interleave mempolicy to use node weights

Message ID 20231031003810.4532-5-gregory.price@memverge.com
State New, archived
Headers show
Series Node Weights and Weighted Interleave | expand

Commit Message

Gregory Price Oct. 31, 2023, 12:38 a.m. UTC
The node subsystem implements interleave weighting for the purpose
of bandwidth optimization.  Each memory node may be given a different
weight for each compute node ("access node") that allocates from it.

The MPOL_INTERLEAVE mempolicy uses these node weights to implement
weighted interleave.  Because every node weight defaults to 1, the
original interleave behavior is retained unless weights are changed.

Examples

Weight settings:
echo 4 > node0/access0/il_weight
echo 1 > node0/access1/il_weight

echo 3 > node1/access0/il_weight
echo 2 > node1/access1/il_weight

Results:

Task A:
   cpunode:  0
   nodemask: [0,1]
   weights:  [4,3]
   allocation result: [0,0,0,0,1,1,1 repeat]

Task B:
   cpunode:  1
   nodemask: [0,1]
   weights:  [1,2]
   allocation result: [0,1,1 repeat]
   Note: weights are relative to the task's access node (cpunode)

Signed-off-by: Gregory Price <gregory.price@memverge.com>
---
 include/linux/mempolicy.h |   4 ++
 mm/mempolicy.c            | 138 +++++++++++++++++++++++++++++---------
 2 files changed, 112 insertions(+), 30 deletions(-)

Comments

Srinivasulu Opensrc Oct. 31, 2023, 6:23 p.m. UTC | #1
On 10/31/2023 6:08 AM, Gregory Price wrote:
>
>
> The node subsystem implements interleave weighting for the purpose
> of bandwidth optimization.  Each node may have different weights in
> relation to each compute node ("access node").
>
> The mempolicy MPOL_INTERLEAVE utilizes the node weights to implement
> weighted interleave.  By default, since all nodes default to a weight
> of 1, the original interleave behavior is retained.
>
> Examples
>
> Weight settings:
> echo 4 > node0/access0/il_weight
> echo 1 > node0/access1/il_weight
>
> echo 3 > node1/access0/il_weight
> echo 2 > node1/access1/il_weight
>
> Results:
>
> Task A:
>     cpunode:  0
>     nodemask: [0,1]
>     weights:  [4,3]
>     allocation result: [0,0,0,0,1,1,1 repeat]
>
> Task B:
>     cpunode:  1
>     nodemask: [0,1]
>     weights:  [1,2]
>     allocation result: [0,1,1 repeat]
>     Weights are relative to access node
>
> Signed-off-by: Gregory Price <gregory.price@memverge.com>
Thank you Gregory for the collaboration.
Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>
> ---
>   include/linux/mempolicy.h |   4 ++
>   mm/mempolicy.c            | 138 +++++++++++++++++++++++++++++---------
>   2 files changed, 112 insertions(+), 30 deletions(-)
>
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> index d232de7cdc56..240468b669fd 100644
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -48,6 +48,10 @@ struct mempolicy {
>          nodemask_t nodes;       /* interleave/bind/perfer */
>          int home_node;          /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
>
> +       /* weighted interleave settings */
> +       unsigned char cur_weight;
> +       unsigned char il_weights[MAX_NUMNODES];
> +
>          union {
>                  nodemask_t cpuset_mems_allowed; /* relative to these nodes */
>                  nodemask_t user_nodemask;       /* nodemask passed by user */
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 29ebf1e7898c..d62e942a13bd 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -102,6 +102,7 @@
>   #include <linux/mmu_notifier.h>
>   #include <linux/printk.h>
>   #include <linux/swapops.h>
> +#include <linux/memory-tiers.h>
>
>   #include <asm/tlbflush.h>
>   #include <asm/tlb.h>
> @@ -300,6 +301,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
>          policy->mode = mode;
>          policy->flags = flags;
>          policy->home_node = NUMA_NO_NODE;
> +       policy->cur_weight = 0;
>
>          return policy;
>   }
> @@ -334,6 +336,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
>                  tmp = *nodes;
>
>          pol->nodes = tmp;
> +       pol->cur_weight = 0;
>   }
>
>   static void mpol_rebind_preferred(struct mempolicy *pol,
> @@ -881,8 +884,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
>
>          old = current->mempolicy;
>          current->mempolicy = new;
> -       if (new && new->mode == MPOL_INTERLEAVE)
> +       if (new && new->mode == MPOL_INTERLEAVE) {
>                  current->il_prev = MAX_NUMNODES-1;
> +               new->cur_weight = 0;
> +       }
> +
>          task_unlock(current);
>          mpol_put(old);
>          ret = 0;
> @@ -1903,12 +1909,21 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
>   /* Do dynamic interleaving for a process */
>   static unsigned interleave_nodes(struct mempolicy *policy)
>   {
> -       unsigned next;
> +       unsigned int next;
> +       unsigned char next_weight;
>          struct task_struct *me = current;
>
>          next = next_node_in(me->il_prev, policy->nodes);
> -       if (next < MAX_NUMNODES)
> +       if (!policy->cur_weight) {
> +               /* If the node is set, at least 1 allocation is required */
> +               next_weight = node_get_il_weight(next, numa_node_id());
> +               policy->cur_weight = next_weight ? next_weight : 1;
> +       }
> +
> +       policy->cur_weight--;
> +       if (next < MAX_NUMNODES && !policy->cur_weight)
>                  me->il_prev = next;
> +
>          return next;
>   }
>
> @@ -1967,25 +1982,37 @@ unsigned int mempolicy_slab_node(void)
>   static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
>   {
>          nodemask_t nodemask = pol->nodes;
> -       unsigned int target, nnodes;
> -       int i;
> +       unsigned int target, nnodes, il_weight;
> +       unsigned char weight;
>          int nid;
> +       int cur_node = numa_node_id();
> +
>          /*
>           * The barrier will stabilize the nodemask in a register or on
>           * the stack so that it will stop changing under the code.
>           *
>           * Between first_node() and next_node(), pol->nodes could be changed
>           * by other threads. So we put pol->nodes in a local stack.
> +        *
> +        * Additionally, place the cur_node on the stack in case of a migration
>           */
>          barrier();
>
>          nnodes = nodes_weight(nodemask);
>          if (!nnodes)
> -               return numa_node_id();
> -       target = (unsigned int)n % nnodes;
> +               return cur_node;
> +
> +       il_weight = nodes_get_il_weights(cur_node, &nodemask, pol->il_weights);
> +       target = (unsigned int)n % il_weight;
>          nid = first_node(nodemask);
> -       for (i = 0; i < target; i++)
> -               nid = next_node(nid, nodemask);
> +       while (target) {
> +               weight = pol->il_weights[nid];
> +               if (target < weight)
> +                       break;
> +               target -= weight;
> +               nid = next_node_in(nid, nodemask);
> +       }
> +
>          return nid;
>   }
>
> @@ -2319,32 +2346,83 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
>                  struct mempolicy *pol, unsigned long nr_pages,
>                  struct page **page_array)
>   {
> -       int nodes;
> -       unsigned long nr_pages_per_node;
> -       int delta;
> -       int i;
> -       unsigned long nr_allocated;
> +       struct task_struct *me = current;
>          unsigned long total_allocated = 0;
> +       unsigned long nr_allocated;
> +       unsigned long rounds;
> +       unsigned long node_pages, delta;
> +       unsigned char weight;
> +       unsigned long il_weight;
> +       unsigned long req_pages = nr_pages;
> +       int nnodes, node, prev_node;
> +       int cur_node = numa_node_id();
> +       int i;
>
> -       nodes = nodes_weight(pol->nodes);
> -       nr_pages_per_node = nr_pages / nodes;
> -       delta = nr_pages - nodes * nr_pages_per_node;
> -
> -       for (i = 0; i < nodes; i++) {
> -               if (delta) {
> -                       nr_allocated = __alloc_pages_bulk(gfp,
> -                                       interleave_nodes(pol), NULL,
> -                                       nr_pages_per_node + 1, NULL,
> -                                       page_array);
> -                       delta--;
> -               } else {
> -                       nr_allocated = __alloc_pages_bulk(gfp,
> -                                       interleave_nodes(pol), NULL,
> -                                       nr_pages_per_node, NULL, page_array);
> +       prev_node = me->il_prev;
> +       nnodes = nodes_weight(pol->nodes);
> +       /* Continue allocating from most recent node */
> +       if (pol->cur_weight) {
> +               node = next_node_in(prev_node, pol->nodes);
> +               node_pages = pol->cur_weight;
> +               if (node_pages > nr_pages)
> +                       node_pages = nr_pages;
> +               nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
> +                                                 NULL, page_array);
> +               page_array += nr_allocated;
> +               total_allocated += nr_allocated;
> +               /* if that's all the pages, no need to interleave */
> +               if (req_pages <= pol->cur_weight) {
> +                       pol->cur_weight -= req_pages;
> +                       return total_allocated;
>                  }
> -
> +               /* Otherwise we adjust req_pages down, and continue from there */
> +               req_pages -= pol->cur_weight;
> +               pol->cur_weight = 0;
> +               prev_node = node;
> +       }
> +
> +       il_weight = nodes_get_il_weights(cur_node, &pol->nodes,
> +                                        pol->il_weights);
> +       rounds = req_pages / il_weight;
> +       delta = req_pages % il_weight;
> +       for (i = 0; i < nnodes; i++) {
> +               node = next_node_in(prev_node, pol->nodes);
> +               weight = pol->il_weights[node];
> +               node_pages = weight * rounds;
> +               if (delta > weight) {
> +                       node_pages += weight;
> +                       delta -= weight;
> +               } else if (delta) {
> +                       node_pages += delta;
> +                       delta = 0;
> +               }
> +               /* The number of requested pages may not hit every node */
> +               if (!node_pages)
> +                       break;
> +               /* If an over-allocation would occur, floor it */
> +               if (node_pages + total_allocated > nr_pages) {
> +                       node_pages = nr_pages - total_allocated;
> +                       delta = 0;
> +               }
> +               nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
> +                                                 NULL, page_array);
>                  page_array += nr_allocated;
>                  total_allocated += nr_allocated;
> +               prev_node = node;
> +       }
> +
> +       /*
> +        * Finally, we need to update me->il_prev and pol->cur_weight
> +        * If the last node allocated on has un-used weight, apply
> +        * the remainder as the cur_weight, otherwise proceed to next node
> +        */
> +       if (node_pages) {
> +               me->il_prev = prev_node;
> +               node_pages %= weight;
> +               pol->cur_weight = weight - node_pages;
> +       } else {
> +               me->il_prev = node;
> +               pol->cur_weight = 0;
>          }
>
>          return total_allocated;
> --
> 2.39.1
>
>
diff mbox series

Patch

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index d232de7cdc56..240468b669fd 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -48,6 +48,10 @@  struct mempolicy {
 	nodemask_t nodes;	/* interleave/bind/perfer */
 	int home_node;		/* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
 
+	/* weighted interleave settings */
+	unsigned char cur_weight;
+	unsigned char il_weights[MAX_NUMNODES];
+
 	union {
 		nodemask_t cpuset_mems_allowed;	/* relative to these nodes */
 		nodemask_t user_nodemask;	/* nodemask passed by user */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 29ebf1e7898c..d62e942a13bd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -102,6 +102,7 @@ 
 #include <linux/mmu_notifier.h>
 #include <linux/printk.h>
 #include <linux/swapops.h>
+#include <linux/memory-tiers.h>
 
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
@@ -300,6 +301,7 @@  static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	policy->mode = mode;
 	policy->flags = flags;
 	policy->home_node = NUMA_NO_NODE;
+	policy->cur_weight = 0;
 
 	return policy;
 }
@@ -334,6 +336,7 @@  static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 		tmp = *nodes;
 
 	pol->nodes = tmp;
+	pol->cur_weight = 0;
 }
 
 static void mpol_rebind_preferred(struct mempolicy *pol,
@@ -881,8 +884,11 @@  static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 
 	old = current->mempolicy;
 	current->mempolicy = new;
-	if (new && new->mode == MPOL_INTERLEAVE)
+	if (new && new->mode == MPOL_INTERLEAVE) {
 		current->il_prev = MAX_NUMNODES-1;
+		new->cur_weight = 0;
+	}
+
 	task_unlock(current);
 	mpol_put(old);
 	ret = 0;
@@ -1903,12 +1909,21 @@  static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 /* Do dynamic interleaving for a process */
 static unsigned interleave_nodes(struct mempolicy *policy)
 {
-	unsigned next;
+	unsigned int next;
+	unsigned char next_weight;
 	struct task_struct *me = current;
 
 	next = next_node_in(me->il_prev, policy->nodes);
-	if (next < MAX_NUMNODES)
+	if (!policy->cur_weight) {
+		/* If the node is set, at least 1 allocation is required */
+		next_weight = node_get_il_weight(next, numa_node_id());
+		policy->cur_weight = next_weight ? next_weight : 1;
+	}
+
+	policy->cur_weight--;
+	if (next < MAX_NUMNODES && !policy->cur_weight)
 		me->il_prev = next;
+
 	return next;
 }
 
@@ -1967,25 +1982,37 @@  unsigned int mempolicy_slab_node(void)
 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 {
 	nodemask_t nodemask = pol->nodes;
-	unsigned int target, nnodes;
-	int i;
+	unsigned int target, nnodes, il_weight;
+	unsigned char weight;
 	int nid;
+	int cur_node = numa_node_id();
+
 	/*
 	 * The barrier will stabilize the nodemask in a register or on
 	 * the stack so that it will stop changing under the code.
 	 *
 	 * Between first_node() and next_node(), pol->nodes could be changed
 	 * by other threads. So we put pol->nodes in a local stack.
+	 *
+	 * Additionally, place the cur_node on the stack in case of a migration
 	 */
 	barrier();
 
 	nnodes = nodes_weight(nodemask);
 	if (!nnodes)
-		return numa_node_id();
-	target = (unsigned int)n % nnodes;
+		return cur_node;
+
+	il_weight = nodes_get_il_weights(cur_node, &nodemask, pol->il_weights);
+	target = (unsigned int)n % il_weight;
 	nid = first_node(nodemask);
-	for (i = 0; i < target; i++)
-		nid = next_node(nid, nodemask);
+	while (target) {
+		weight = pol->il_weights[nid];
+		if (target < weight)
+			break;
+		target -= weight;
+		nid = next_node_in(nid, nodemask);
+	}
+
 	return nid;
 }
 
@@ -2319,32 +2346,83 @@  static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
 		struct mempolicy *pol, unsigned long nr_pages,
 		struct page **page_array)
 {
-	int nodes;
-	unsigned long nr_pages_per_node;
-	int delta;
-	int i;
-	unsigned long nr_allocated;
+	struct task_struct *me = current;
 	unsigned long total_allocated = 0;
+	unsigned long nr_allocated;
+	unsigned long rounds;
+	unsigned long node_pages, delta;
+	unsigned char weight;
+	unsigned long il_weight;
+	unsigned long req_pages = nr_pages;
+	int nnodes, node, prev_node;
+	int cur_node = numa_node_id();
+	int i;
 
-	nodes = nodes_weight(pol->nodes);
-	nr_pages_per_node = nr_pages / nodes;
-	delta = nr_pages - nodes * nr_pages_per_node;
-
-	for (i = 0; i < nodes; i++) {
-		if (delta) {
-			nr_allocated = __alloc_pages_bulk(gfp,
-					interleave_nodes(pol), NULL,
-					nr_pages_per_node + 1, NULL,
-					page_array);
-			delta--;
-		} else {
-			nr_allocated = __alloc_pages_bulk(gfp,
-					interleave_nodes(pol), NULL,
-					nr_pages_per_node, NULL, page_array);
+	prev_node = me->il_prev;
+	nnodes = nodes_weight(pol->nodes);
+	/* Continue allocating from most recent node */
+	if (pol->cur_weight) {
+		node = next_node_in(prev_node, pol->nodes);
+		node_pages = pol->cur_weight;
+		if (node_pages > nr_pages)
+			node_pages = nr_pages;
+		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+						  NULL, page_array);
+		page_array += nr_allocated;
+		total_allocated += nr_allocated;
+		/* if that's all the pages, no need to interleave */
+		if (req_pages <= pol->cur_weight) {
+			pol->cur_weight -= req_pages;
+			return total_allocated;
 		}
-
+		/* Otherwise we adjust req_pages down, and continue from there */
+		req_pages -= pol->cur_weight;
+		pol->cur_weight = 0;
+		prev_node = node;
+	}
+
+	il_weight = nodes_get_il_weights(cur_node, &pol->nodes,
+					 pol->il_weights);
+	rounds = req_pages / il_weight;
+	delta = req_pages % il_weight;
+	for (i = 0; i < nnodes; i++) {
+		node = next_node_in(prev_node, pol->nodes);
+		weight = pol->il_weights[node];
+		node_pages = weight * rounds;
+		if (delta > weight) {
+			node_pages += weight;
+			delta -= weight;
+		} else if (delta) {
+			node_pages += delta;
+			delta = 0;
+		}
+		/* The number of requested pages may not hit every node */
+		if (!node_pages)
+			break;
+		/* If an over-allocation would occur, floor it */
+		if (node_pages + total_allocated > nr_pages) {
+			node_pages = nr_pages - total_allocated;
+			delta = 0;
+		}
+		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+						  NULL, page_array);
 		page_array += nr_allocated;
 		total_allocated += nr_allocated;
+		prev_node = node;
+	}
+
+	/*
+	 * Finally, we need to update me->il_prev and pol->cur_weight
+	 * If the last node allocated on has un-used weight, apply
+	 * the remainder as the cur_weight, otherwise proceed to next node
+	 */
+	if (node_pages) {
+		me->il_prev = prev_node;
+		node_pages %= weight;
+		pol->cur_weight = weight - node_pages;
+	} else {
+		me->il_prev = node;
+		pol->cur_weight = 0;
 	}
 
 	return total_allocated;