
[RFC,3/3] mm/mempolicy: implement a partial-interleave mempolicy

Message ID 20230914235457.482710-4-gregory.price@memverge.com
State New, archived
Series mm/mempolicy: set/get_mempolicy2

Commit Message

Gregory Price Sept. 14, 2023, 11:54 p.m. UTC
The partial-interleave mempolicy implements interleave on an
allocation interval. The default node is the local node, for
which N pages will be allocated before an interleave pass occurs.

For example:
  nodes=0,1,2
  interval=3
  cpunode=0

Over 10 consecutive allocations, the following nodes will be selected:
[0,0,0,1,2,0,0,0,1,2]

In this example, there is a 60%/20%/20% distribution of memory.

Using this mechanism, it becomes possible to define an approximate
distribution percentage of memory across a set of nodes:

local_node% : interval/(interval+(nr_nodes-1))
other_node% : (1-local_node%)/(nr_nodes-1)
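
As an illustration only (not part of the patch), the split can be
sanity-checked with a trivial userspace calculation; print_split() is
just an ad-hoc helper and assumes nr_nodes >= 2:

#include <stdio.h>

/*
 * Illustrative userspace helper, not kernel code: compute the
 * approximate split for a given interval and node count (>= 2).
 */
static void print_split(unsigned int interval, unsigned int nr_nodes)
{
	double cycle = interval + (nr_nodes - 1);
	double local = (double)interval / cycle;
	double other = (1.0 - local) / (nr_nodes - 1);

	printf("interval=%u nodes=%u -> local=%.0f%%, others=%.0f%% each\n",
	       interval, nr_nodes, local * 100.0, other * 100.0);
}

int main(void)
{
	print_split(3, 3);	/* 60%/20%/20%, matching the example above */
	return 0;
}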

Signed-off-by: Gregory Price <gregory.price@memverge.com>
---
 include/linux/mempolicy.h      |   8 ++
 include/uapi/linux/mempolicy.h |   5 +
 mm/mempolicy.c                 | 161 +++++++++++++++++++++++++++++++--
 3 files changed, 166 insertions(+), 8 deletions(-)

Comments

Jonathan Cameron Oct. 2, 2023, 1:40 p.m. UTC | #1
On Thu, 14 Sep 2023 19:54:57 -0400
Gregory Price <gourry.memverge@gmail.com> wrote:

> The partial-interleave mempolicy implements interleave on an

I'm not sure 'partial' really conveys what is going on here.
Weighted, or uneven-interleave maybe?

> allocation interval. The default node is the local node, for
> which N pages will be allocated before an interleave pass occurs.
> 
> For example:
>   nodes=0,1,2
>   interval=3
>   cpunode=0
> 
> Over 10 consecutive allocations, the following nodes will be selected:
> [0,0,0,1,2,0,0,0,1,2]
> 
> In this example, there is a 60%/20%/20% distribution of memory.
> 
> Using this mechanism, it becomes possible to define an approximate
> distribution percentage of memory across a set of nodes:
> 
> local_node% : interval/(interval+(nr_nodes-1))
> other_node% : (1-local_node%)/(nr_nodes-1)

I'd like to see more discussion here of why you would do this...


A few trivial bits inline,

Jonathan

...

> +static unsigned long alloc_pages_bulk_array_partial_interleave(gfp_t gfp,
> +		struct mempolicy *pol, unsigned long nr_pages,
> +		struct page **page_array)
> +{
> +	nodemask_t nodemask = pol->nodes;
> +	unsigned long nr_pages_main;
> +	unsigned long nr_pages_other;
> +	unsigned long total_cycle;
> +	unsigned long delta;
> +	unsigned long interval;
> +	int allocated = 0;
> +	int start_nid;
> +	int nnodes;
> +	int prev, next;
> +	int i;
> +
> +	/* This stabilizes nodes on the stack in case pol->nodes changes */
> +	barrier();
> +
> +	nnodes = nodes_weight(nodemask);
> +	start_nid = numa_node_id();
> +
> +	if (!node_isset(start_nid, nodemask))
> +		start_nid = first_node(nodemask);
> +
> +	if (nnodes == 1) {
> +		allocated = __alloc_pages_bulk(gfp, start_nid,
> +					       NULL, nr_pages,
> +					       NULL, page_array);
> +		return allocated;
		return __alloc_pages_bulk(...)

> +	}
> +	/* We don't want to double-count the main node in calculations */
> +	nnodes--;
> +
> +	interval = pol->part_int.interval;
> +	total_cycle = (interval + nnodes);

excess brackets. Same in various other places.


> +	/* Number of pages on main node: (cycles*interval + up to interval) */
> +	nr_pages_main = ((nr_pages / total_cycle) * interval);
> +	nr_pages_main += (nr_pages % total_cycle % (interval + 1));


> +	/* Number of pages on others: (remaining/nodes) + 1 page if delta  */
> +	nr_pages_other = (nr_pages - nr_pages_main) / nnodes;
> +	/* Delta is number of pages beyond interval up to full cycle */
> +	delta = nr_pages - (nr_pages_main + (nr_pages_other * nnodes));
> +
> +	/* start by allocating for the main node, then interleave rest */
> +	prev = start_nid;
> +	allocated = __alloc_pages_bulk(gfp, start_nid, NULL, nr_pages_main,
> +				       NULL, page_array);
> +	for (i = 0; i < nnodes; i++) {
> +		int pages = nr_pages_other + (i < delta ? 1 : 0);
> +
> +		next = next_node_in(prev, nodemask);
> +		if (next < MAX_NUMNODES)
> +			prev = next;
> +		allocated += __alloc_pages_bulk(gfp, next, NULL, pages,
> +						NULL, page_array);
> +	}
> +
> +	return allocated;
> +}
> +
Gregory Price Oct. 2, 2023, 4:10 p.m. UTC | #2
On Mon, Oct 02, 2023 at 02:40:35PM +0100, Jonathan Cameron wrote:
> On Thu, 14 Sep 2023 19:54:57 -0400
> Gregory Price <gourry.memverge@gmail.com> wrote:
> 
> > The partial-interleave mempolicy implements interleave on an
> 
> I'm not sure 'partial' really conveys what is going on here.
> Weighted, or uneven-interleave maybe?
>
> > local_node% : interval/(interval+(nr_nodes-1))
> > other_node% : (1-local_node%)/(nr_nodes-1)
> 
> I'd like to see more discussion here of why you would do this...
> 

TL;DR: "Partial" in the sense that it's a simplified version of
weighted interleave.  I honestly struggled with the name, but I'm not
tied to it if there's something better.

I also considered "Preferred Interleave", where the local node is
preferred for some weight, and the remaining nodes are interleaved.
Maybe that's a more intuitive name.

For now I'll start calling it "preferred interleave" instead.
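
For intuition, here is a userspace-only simulation (not kernel code) of
the placement order this policy produces for the example in the patch
description (nodes=0,1,2, interval=3, local node 0):

#include <stdio.h>

/*
 * Simulate "preferred interleave": 'interval' pages go to the local
 * node, then one page goes to each remaining node, then repeat.
 */
int main(void)
{
	const int nodes[] = { 0, 1, 2 };
	const int nr_nodes = 3, interval = 3, local = 0;
	int page;

	for (page = 0; page < 10; page++) {
		int pos = page % (interval + nr_nodes - 1);
		int nid;

		if (pos < interval)
			nid = nodes[local];
		else
			nid = nodes[(local + (pos - interval) + 1) % nr_nodes];
		printf("%d ", nid);
	}
	printf("\n");	/* prints: 0 0 0 1 2 0 0 0 1 2 */
	return 0;
}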



More generally:

This was a first pass at weighted interleave without adding the full
weights[MAX_NUMNODES] field to the mempolicy structure.

I've since added full weighted interleave and that'll be in v2 of the
RFC (hopefully pushing up today after addressing your notes).



I'll keep these notes for discussion in the RFC v2
---
I can see advantages of both full-weighted and preferred-interleave.

Something to consider: task migration and cpuset/memcg.

With "full-weighted" interleave, consider the scenario where the user
initially runs on Node 0/Socket 0, and sets the following weights

[0:10,1:3,2:5,3:1]
Where the nodes are as follows:
0 - socket 0 DRAM
1 - socket 1 DRAM
2 - socket 0 CXL
3 - socket 1 CXL

If that task gets migrated to socket 1... that's not going to be a good
weighting plan.  This is the same reason a single set of weighted tiers
that abstract nodes is not a good idea - because Nodes 2 and 3 in this
scenario have "similar attributes" but only relative to their local
sockets (0-2 and 1-3).

Worse - if Nodes 2 and 3 *don't* have similar attributes and we implement
an "auto-rebalance" mechanism, a lot of assumptions would have to be
made, and the rebalance would have to run any time a migration between
nodes is detected.

Even worse - I attempted to expose the weights per-task via procfs, and
realized the entire mempolicy subsystem is very unfriendly to outside
tasks twiddling bits (i.e. mempolicy is very 'current'-centric).

There are *tons* of race conditions that have to be handled, and it's
really rather nasty in my opinion.

Consider this code:

2446 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
2447 {
... snip ...
2458         nodemask = pol->nodes;
2459
2460         /*
2461          * The barrier will stabilize the nodemask in a register or on
2462          * the stack so that it will stop changing under the code.
2463          *
2464          * Between first_node() and next_node(), pol->nodes could be changed
2465          * by other threads. So we put pol->nodes in a local stack.
2466          */
2467         barrier();

big oof, you wouldn't be able to depend on this for weights, so you need
an algorithm that can allow some slop as weights are being replaced.
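
As a rough sketch of what I mean (the function and the il_weights field
below are hypothetical, not part of this series): snapshot both the
nodemask and the weights onto the stack, so a concurrent update can at
worst skew a single pass rather than corrupt the walk:

static unsigned int weighted_il_node(struct mempolicy *pol, unsigned long n)
{
	nodemask_t nodemask = pol->nodes;
	unsigned char weights[MAX_NUMNODES];
	unsigned int total = 0, target;
	int nid;

	/* Stabilize the nodemask on the stack, as offset_il_node() does */
	barrier();

	/* Hypothetical field; treat weight 0 as 1 to tolerate races */
	memcpy(weights, pol->il_weights, sizeof(weights));

	for_each_node_mask(nid, nodemask)
		total += weights[nid] ? weights[nid] : 1;
	if (!total)
		return numa_node_id();

	target = (unsigned int)n % total;
	for_each_node_mask(nid, nodemask) {
		unsigned int w = weights[nid] ? weights[nid] : 1;

		if (target < w)
			return nid;
		target -= w;
	}
	return numa_node_id();	/* unreachable when total > 0 */
}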

So unless we rewrite mempolicy.c to be more robust in this sense, I
would argue a fully-weighted scenario is most useful if you are very
confident that your task is not going to be migrated.  Otherwise there
will be very high costs associated with recalculating weights.



With preferred-interleave, if a task migrates, the rebalance happens
automatically based on the nodemask:  The new local node becomes the
heavily weighted node, and the rest interleave evenly.

(If the local node is for some reason not in the nodemask, use the first
node in the mask, though this could be changed to use a manually defined
node.)

So basically, if you expect your task to be migratable, something like
"preferred interleave" gets you post-migration behavior more closely
aligned with what you originally wanted.  Similarly, if your interleave
ratios are
simple, then this strategy is the simplest way to get to the desired
outcome.
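
A minimal sketch of the selection I mean, mirroring the start_nid logic
already in the bulk allocator below - the heavy node is re-derived from
numa_node_id() at allocation time, so the extra weight follows the task
after a migration:

/* Sketch only: pick the heavily weighted node for preferred interleave */
static int preferred_interleave_heavy_node(struct mempolicy *pol)
{
	int nid = numa_node_id();

	/* Fall back to the first node in the mask, as the patch does */
	if (!node_isset(nid, pol->nodes))
		nid = first_node(pol->nodes);
	return nid;
}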


Is it the *best* strategy? TBD. The behavior is more predictable, though.

I will have a weighted interleave patch added to my next RFC.  I need to
test it first.


Thanks
Gregory

Patch

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index d232de7cdc56..41a6de9ff556 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -48,6 +48,14 @@  struct mempolicy {
 	nodemask_t nodes;	/* interleave/bind/perfer */
 	int home_node;		/* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
 
+	union {
+		/* Partial Interleave: Allocate local count, then interleave */
+		struct {
+			int interval;
+			int count;
+		} part_int;
+	};
+
 	union {
 		nodemask_t cpuset_mems_allowed;	/* relative to these nodes */
 		nodemask_t user_nodemask;	/* nodemask passed by user */
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 53650f69db2b..1af344344459 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -24,6 +24,7 @@  enum {
 	MPOL_LOCAL,
 	MPOL_PREFERRED_MANY,
 	MPOL_LEGACY,	/* set_mempolicy limited to above modes */
+	MPOL_PARTIAL_INTERLEAVE,
 	MPOL_MAX,	/* always last member of enum */
 };
 
@@ -55,6 +56,10 @@  struct mempolicy_args {
 		struct {
 			unsigned long next_node; /* get only */
 		} interleave;
+		struct {
+			unsigned long interval;  /* get and set */
+			unsigned long next_node; /* get only */
+		} part_int;
 	};
 };
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1cf7709400f1..a2ee45ac2ab6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -399,6 +399,10 @@  static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 		.create = mpol_new_nodemask,
 		.rebind = mpol_rebind_nodemask,
 	},
+	[MPOL_PARTIAL_INTERLEAVE] = {
+		.create = mpol_new_nodemask,
+		.rebind = mpol_rebind_nodemask,
+	},
 	[MPOL_PREFERRED] = {
 		.create = mpol_new_preferred,
 		.rebind = mpol_rebind_preferred,
@@ -875,7 +879,8 @@  static long swap_mempolicy(struct mempolicy *new,
 
 	old = current->mempolicy;
 	current->mempolicy = new;
-	if (new && new->mode == MPOL_INTERLEAVE)
+	if (new && (new->mode == MPOL_INTERLEAVE ||
+		    new->mode == MPOL_PARTIAL_INTERLEAVE))
 		current->il_prev = MAX_NUMNODES-1;
 out:
 	task_unlock(current);
@@ -920,6 +925,7 @@  static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 	switch (p->mode) {
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
+	case MPOL_PARTIAL_INTERLEAVE:
 	case MPOL_PREFERRED:
 	case MPOL_PREFERRED_MANY:
 		*nodes = p->nodes;
@@ -1614,6 +1620,23 @@  SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
 	return kernel_set_mempolicy(mode, nmask, maxnode);
 }
 
+static long do_set_partial_interleave(struct mempolicy_args *args,
+				      struct mempolicy *new,
+				      nodemask_t *nodes)
+{
+	/* Preferred interleave cannot be done with no nodemask */
+	if (nodes_empty(*nodes))
+		return -EINVAL;
+
+	/* Preferred interleave interval cannot be <= 0 */
+	if (args->part_int.interval <= 0)
+		return -EINVAL;
+
+	new->part_int.interval = args->part_int.interval;
+	new->part_int.count = 0;
+	return 0;
+}
+
 static long do_set_mempolicy2(struct mempolicy_args *args)
 {
 	struct mempolicy *new = NULL;
@@ -1637,6 +1660,9 @@  static long do_set_mempolicy2(struct mempolicy_args *args)
 	}
 
 	switch (args->mode) {
+	case MPOL_PARTIAL_INTERLEAVE:
+		err = do_set_partial_interleave(args, new, &nodes);
+		break;
 	default:
 		BUG();
 	}
@@ -1791,6 +1817,11 @@  static long do_get_mempolicy2(struct mempolicy_args *kargs)
 		kargs->interleave.next_node = next_node_in(current->il_prev,
 							   pol->nodes);
 		break;
+	case MPOL_PARTIAL_INTERLEAVE:
+		kargs->part_int.next_node = next_node_in(current->il_prev,
+							 pol->nodes);
+		kargs->part_int.interval = pol->part_int.interval;
+		break;
 	default:
 		break;
 	}
@@ -2133,8 +2164,19 @@  static unsigned interleave_nodes(struct mempolicy *policy)
 	struct task_struct *me = current;
 
 	next = next_node_in(me->il_prev, policy->nodes);
-	if (next < MAX_NUMNODES)
+
+	if (policy->mode == MPOL_PARTIAL_INTERLEAVE) {
+		if (next == numa_node_id()) {
+			if (++policy->part_int.count >= policy->part_int.interval) {
+				policy->part_int.count = 0;
+				me->il_prev = next;
+			}
+		} else if (next < MAX_NUMNODES) {
+			me->il_prev = next;
+		}
+	} else if (next < MAX_NUMNODES)
 		me->il_prev = next;
+
 	return next;
 }
 
@@ -2159,6 +2201,7 @@  unsigned int mempolicy_slab_node(void)
 		return first_node(policy->nodes);
 
 	case MPOL_INTERLEAVE:
+	case MPOL_PARTIAL_INTERLEAVE:
 		return interleave_nodes(policy);
 
 	case MPOL_BIND:
@@ -2195,7 +2238,7 @@  static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 	nodemask_t nodemask = pol->nodes;
 	unsigned int target, nnodes;
 	int i;
-	int nid;
+	int nid = MAX_NUMNODES;
 	/*
 	 * The barrier will stabilize the nodemask in a register or on
 	 * the stack so that it will stop changing under the code.
@@ -2208,8 +2251,35 @@  static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 	nnodes = nodes_weight(nodemask);
 	if (!nnodes)
 		return numa_node_id();
-	target = (unsigned int)n % nnodes;
-	nid = first_node(nodemask);
+
+	if (pol->mode == MPOL_PARTIAL_INTERLEAVE) {
+		int interval = pol->part_int.interval;
+		/*
+		 * Mode or interval can change so default to basic interleave
+		 * if the interval has become invalid.  Basic interleave is
+		 * equivalent to interval=1. Don't double-count the base node
+		 */
+		if (interval == 0)
+			interval = 1;
+		interval -= 1;
+
+		/* If target <= the interval, no need to call next_node */
+		target = ((unsigned int)n % (nnodes + interval));
+		target -= (target > interval) ? interval : target;
+		target %= MAX_NUMNODES;
+
+		/* If the local node ID is no longer set, do interleave */
+		nid = numa_node_id();
+		if (!node_isset(nid, nodemask))
+			nid = MAX_NUMNODES;
+	}
+
+	/* If partial interleave generated an invalid nid, do interleave */
+	if (nid == MAX_NUMNODES) {
+		target = (unsigned int)n % nnodes;
+		nid = first_node(nodemask);
+	}
+
 	for (i = 0; i < target; i++)
 		nid = next_node(nid, nodemask);
 	return nid;
@@ -2263,7 +2333,8 @@  int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
 	*nodemask = NULL;
 	mode = (*mpol)->mode;
 
-	if (unlikely(mode == MPOL_INTERLEAVE)) {
+	if (unlikely(mode == MPOL_INTERLEAVE) ||
+	    unlikely(mode == MPOL_PARTIAL_INTERLEAVE)) {
 		nid = interleave_nid(*mpol, vma, addr,
 					huge_page_shift(hstate_vma(vma)));
 	} else {
@@ -2304,6 +2375,7 @@  bool init_nodemask_of_mempolicy(nodemask_t *mask)
 	case MPOL_PREFERRED_MANY:
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
+	case MPOL_PARTIAL_INTERLEAVE:
 		*mask = mempolicy->nodes;
 		break;
 
@@ -2414,7 +2486,8 @@  struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
 
 	pol = get_vma_policy(vma, addr);
 
-	if (pol->mode == MPOL_INTERLEAVE) {
+	if (pol->mode == MPOL_INTERLEAVE ||
+	    pol->mode == MPOL_PARTIAL_INTERLEAVE) {
 		struct page *page;
 		unsigned nid;
 
@@ -2516,7 +2589,8 @@  struct page *alloc_pages(gfp_t gfp, unsigned order)
 	 * No reference counting needed for current->mempolicy
 	 * nor system default_policy
 	 */
-	if (pol->mode == MPOL_INTERLEAVE)
+	if (pol->mode == MPOL_INTERLEAVE ||
+	    pol->mode == MPOL_PARTIAL_INTERLEAVE)
 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
 	else if (pol->mode == MPOL_PREFERRED_MANY)
 		page = alloc_pages_preferred_many(gfp, order,
@@ -2576,6 +2650,68 @@  static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
 	return total_allocated;
 }
 
+static unsigned long alloc_pages_bulk_array_partial_interleave(gfp_t gfp,
+		struct mempolicy *pol, unsigned long nr_pages,
+		struct page **page_array)
+{
+	nodemask_t nodemask = pol->nodes;
+	unsigned long nr_pages_main;
+	unsigned long nr_pages_other;
+	unsigned long total_cycle;
+	unsigned long delta;
+	unsigned long interval;
+	int allocated = 0;
+	int start_nid;
+	int nnodes;
+	int prev, next;
+	int i;
+
+	/* This stabilizes nodes on the stack in case pol->nodes changes */
+	barrier();
+
+	nnodes = nodes_weight(nodemask);
+	start_nid = numa_node_id();
+
+	if (!node_isset(start_nid, nodemask))
+		start_nid = first_node(nodemask);
+
+	if (nnodes == 1) {
+		allocated = __alloc_pages_bulk(gfp, start_nid,
+					       NULL, nr_pages,
+					       NULL, page_array);
+		return allocated;
+	}
+	/* We don't want to double-count the main node in calculations */
+	nnodes--;
+
+	interval = pol->part_int.interval;
+	total_cycle = (interval + nnodes);
+	/* Number of pages on main node: (cycles*interval + up to interval) */
+	nr_pages_main = ((nr_pages / total_cycle) * interval);
+	nr_pages_main += (nr_pages % total_cycle % (interval + 1));
+	/* Number of pages on others: (remaining/nodes) + 1 page if delta  */
+	nr_pages_other = (nr_pages - nr_pages_main) / nnodes;
+	/* Delta is number of pages beyond interval up to full cycle */
+	delta = nr_pages - (nr_pages_main + (nr_pages_other * nnodes));
+
+	/* start by allocating for the main node, then interleave rest */
+	prev = start_nid;
+	allocated = __alloc_pages_bulk(gfp, start_nid, NULL, nr_pages_main,
+				       NULL, page_array);
+	for (i = 0; i < nnodes; i++) {
+		int pages = nr_pages_other + (i < delta ? 1 : 0);
+
+		next = next_node_in(prev, nodemask);
+		if (next < MAX_NUMNODES)
+			prev = next;
+		allocated += __alloc_pages_bulk(gfp, next, NULL, pages,
+						NULL, page_array);
+	}
+
+	return allocated;
+}
+
 static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
 		struct mempolicy *pol, unsigned long nr_pages,
 		struct page **page_array)
@@ -2614,6 +2750,11 @@  unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
 		return alloc_pages_bulk_array_interleave(gfp, pol,
 							 nr_pages, page_array);
 
+	if (pol->mode == MPOL_PARTIAL_INTERLEAVE)
+		return alloc_pages_bulk_array_partial_interleave(gfp, pol,
+								 nr_pages,
+								 page_array);
+
 	if (pol->mode == MPOL_PREFERRED_MANY)
 		return alloc_pages_bulk_array_preferred_many(gfp,
 				numa_node_id(), pol, nr_pages, page_array);
@@ -2686,6 +2827,7 @@  bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	switch (a->mode) {
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
+	case MPOL_PARTIAL_INTERLEAVE:
 	case MPOL_PREFERRED:
 	case MPOL_PREFERRED_MANY:
 		return !!nodes_equal(a->nodes, b->nodes);
@@ -2822,6 +2964,7 @@  int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 
 	switch (pol->mode) {
 	case MPOL_INTERLEAVE:
+	case MPOL_PARTIAL_INTERLEAVE:
 		pgoff = vma->vm_pgoff;
 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
 		polnid = offset_il_node(pol, pgoff);
@@ -3209,6 +3352,7 @@  static const char * const policy_modes[] =
 	[MPOL_PREFERRED]  = "prefer",
 	[MPOL_BIND]       = "bind",
 	[MPOL_INTERLEAVE] = "interleave",
+	[MPOL_PARTIAL_INTERLEAVE] = "partial interleave",
 	[MPOL_LOCAL]      = "local",
 	[MPOL_PREFERRED_MANY]  = "prefer (many)",
 };
@@ -3379,6 +3523,7 @@  void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	case MPOL_PREFERRED_MANY:
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
+	case MPOL_PARTIAL_INTERLEAVE:
 		nodes = pol->nodes;
 		break;
 	default: