diff mbox series

[1/5] sched/topology: Introduce for_each_numa_node() iterator

Message ID 20250206202109.384179-2-arighi@nvidia.com (mailing list archive)
State Not Applicable
Headers show
Series [1/5] sched/topology: Introduce for_each_numa_node() iterator | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Andrea Righi Feb. 6, 2025, 8:15 p.m. UTC
Introduce for_each_numa_node() and sched_numa_node() helpers to iterate
over node IDs in order of increasing NUMA distance from a given starting
node.

These iterator functions are similar to for_each_numa_hop_mask() and
sched_numa_hop_mask(), but instead of providing a cpumask at each
iteration, they provide a node ID.

Example usage:

  nodemask_t visited = NODE_MASK_NONE;
  int start = cpu_to_node(smp_processor_id());

  for_each_numa_node(node, start, visited, N_ONLINE)
  	pr_info("node (%d, %d) -> %d\n",
  		 start, node, node_distance(start, node));

On a system with equidistant nodes:

 $ numactl -H
 ...
 node distances:
 node     0    1    2    3
    0:   10   20   20   20
    1:   20   10   20   20
    2:   20   20   10   20
    3:   20   20   20   10

Output of the example above (on node 0):

[    7.367022] node (0, 0) -> 10
[    7.367151] node (0, 1) -> 20
[    7.367186] node (0, 2) -> 20
[    7.367247] node (0, 3) -> 20

On a system with non-equidistant nodes (simulated using virtme-ng):

 $ numactl -H
 ...
 node distances:
 node     0    1    2    3
    0:   10   51   31   41
    1:   51   10   21   61
    2:   31   21   10   11
    3:   41   61   11   10

Output of the example above (on node 0):

 [    8.953644] node (0, 0) -> 10
 [    8.953712] node (0, 2) -> 31
 [    8.953764] node (0, 3) -> 41
 [    8.953817] node (0, 1) -> 51

Cc: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 include/linux/topology.h | 31 ++++++++++++++++++++++++++++-
 kernel/sched/topology.c  | 42 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 1 deletion(-)

Comments

Yury Norov Feb. 7, 2025, 3:57 a.m. UTC | #1
On Thu, Feb 06, 2025 at 09:15:31PM +0100, Andrea Righi wrote:
> Introduce for_each_numa_node() and sched_numa_node() helpers to iterate
> over node IDs in order of increasing NUMA distance from a given starting
> node.
> 
> These iterator functions are similar to for_each_numa_hop_mask() and
> sched_numa_hop_mask(), but instead of providing a cpumask at each
> iteration, they provide a node ID.
> 
> Example usage:
> 
>   nodemask_t visited = NODE_MASK_NONE;
>   int start = cpu_to_node(smp_processor_id());
> 
>   for_each_numa_node(node, start, visited, N_ONLINE)
>   	pr_info("node (%d, %d) -> %d\n",
>   		 start, node, node_distance(start, node));
> 
> On a system with equidistant nodes:
> 
>  $ numactl -H
>  ...
>  node distances:
>  node     0    1    2    3
>     0:   10   20   20   20
>     1:   20   10   20   20
>     2:   20   20   10   20
>     3:   20   20   20   10
> 
> Output of the example above (on node 0):
> 
> [    7.367022] node (0, 0) -> 10
> [    7.367151] node (0, 1) -> 20
> [    7.367186] node (0, 2) -> 20
> [    7.367247] node (0, 3) -> 20
> 
> On a system with non-equidistant nodes (simulated using virtme-ng):
> 
>  $ numactl -H
>  ...
>  node distances:
>  node     0    1    2    3
>     0:   10   51   31   41
>     1:   51   10   21   61
>     2:   31   21   10   11
>     3:   41   61   11   10
> 
> Output of the example above (on node 0):
> 
>  [    8.953644] node (0, 0) -> 10
>  [    8.953712] node (0, 2) -> 31
>  [    8.953764] node (0, 3) -> 41
>  [    8.953817] node (0, 1) -> 51
> 
> Cc: Yury Norov <yury.norov@gmail.com>
> Signed-off-by: Andrea Righi <arighi@nvidia.com>

Please keep me posted for the whole series.

> ---
>  include/linux/topology.h | 31 ++++++++++++++++++++++++++++-
>  kernel/sched/topology.c  | 42 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 72 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/topology.h b/include/linux/topology.h
> index 52f5850730b3e..0c82b913a8814 100644
> --- a/include/linux/topology.h
> +++ b/include/linux/topology.h
> @@ -248,12 +248,18 @@ static inline const struct cpumask *cpu_cpu_mask(int cpu)
>  #ifdef CONFIG_NUMA
>  int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node);
>  extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops);
> -#else
> +extern int sched_numa_node(nodemask_t *visited, int start, unsigned int state);
> +#else /* !CONFIG_NUMA */
>  static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
>  {
>  	return cpumask_nth_and(cpu, cpus, cpu_online_mask);
>  }
>  
> +static inline int sched_numa_node(nodemask_t *visited, int start, unsigned int state)
> +{
> +	return MAX_NUMNODES;
> +}
> +
>  static inline const struct cpumask *
>  sched_numa_hop_mask(unsigned int node, unsigned int hops)
>  {
> @@ -261,6 +267,29 @@ sched_numa_hop_mask(unsigned int node, unsigned int hops)
>  }
>  #endif	/* CONFIG_NUMA */
>  
> +/**
> + * for_each_numa_node - iterate over NUMA nodes at increasing hop distances
> + *                      from a given starting node.
> + * @node: the iteration variable, representing the current NUMA node.
> + * @start: the NUMA node to start the iteration from.
> + * @visited: a nodemask_t to track the visited nodes.

nit: s/nodemask_t/nodemask

> + * @state: state of NUMA nodes to iterate.
> + *
> + * This macro iterates over NUMA nodes in increasing distance from
> + * @start_node and yields MAX_NUMNODES when all the nodes have been
> + * visited.
> + *
> + * The difference between for_each_node() and for_each_numa_node() is that
> + * the former allows to iterate over nodes in no particular order, whereas
> + * the latter iterates over nodes in increasing order of distance.

for_each_node iterates them in numerical order. 

> + *
> + * Requires rcu_lock to be held.
> + */

Please mention complexity here, which is O(N^2). 

> +#define for_each_numa_node(node, start, visited, state)				\
> +	for (node = start;							\
> +	     node != MAX_NUMNODES;						\
> +	     node = sched_numa_node(&(visited), start, state))

Please braces around parameters.

'node < MAX_NUMNODES' is better. It will cost you nothing but will give
some guarantee that if user passes broken start value, we don't call
sched_numa_node().

What about:
        start = -EINVAL;
        foe_each_numa_node(node, start, visited, N_ONLINE)
                do_something(node);

Whatever garbage user puts in 'start', at the very first iteration,
do_something() will be executed against it. Offline node, -EINVAL or
NUMA_NO_NODE are the examples.

> +
>  /**
>   * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
>   *                          from a given node.
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index da33ec9e94ab2..e1d0a33415fb5 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -2183,6 +2183,48 @@ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
>  }
>  EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
>  
> +/**
> + * sched_numa_node - Find the NUMA node at the closest distance from
> + *		     node @start.
> + *
> + * @visited: a pointer to a nodemask_t representing the visited nodes.
> + * @start: the node to start the search from.

Maybe just 'node' The 'start' is only meaningful in the caller. Here
we don't start, we just look for a node that is the most nearest to a
given one.

What if NOMA_NO_NODE is passed?

> + * @state: the node state to filter nodes by.
> + *
> + * This function iterates over all nodes in the given state and calculates
> + * the distance to the starting node. It returns the node that is the
> + * closest in terms of distance that has not already been considered (not
> + * set in @visited and not the starting node). If the node is found, it is
> + * marked as visited in the @visited node mask.
> + *
> + * Returns the node ID closest in terms of hop distance from the @start
> + * node, or MAX_NUMNODES if no node is found (or all nodes have been
> + * visited).
> + */
> +int sched_numa_node(nodemask_t *visited, int start, unsigned int state)

The name is somewhat confusing. Sched_numa_node what? If you're looking
for a closest node, call your function find_closest_node().

We already have numa_nearest_node(). The difference between this one
and what you're implementing here is that you add an additional mask.
So, I'd suggest something like

 int numa_nearest_node_andnot(int node, unsigned int state, nodemask_t *mask)

Also, there's about scheduler her, so I'd suggest to place it next to
numa_nearest_node() in mm/mempolicy.c.

> +{
> +	int dist, n, min_node, min_dist;
> +
> +	min_node = MAX_NUMNODES;
> +	min_dist = INT_MAX;
> +
> +	/* Find the nearest unvisted node */

If you name it properly, you don't need to explain your intentions in
the code. Also, at this level of abstraction, the 'visited' name may
be too specific. Let's refer to it as just a mask containing nodes
that user wants to skip for whatever reason. 


> +	for_each_node_state(n, state) {
> +		if (n == start || node_isset(n, *visited))
> +			continue;

Nah.

1. Skipping start node is wrong. The very first call should return
'start' exactly. Then it will be masked out, so the traversing will
move forward. 
2. This should be for_each_node_state_andnot(n, state, mask).

This way you'll be able to drop the above condition entirely.

> +		dist = node_distance(start, n);
> +		if (dist < min_dist) {
> +			min_dist = dist;
> +			min_node = n;
> +		}
> +	}
> +	if (min_node != MAX_NUMNODES)
> +		node_set(min_node, *visited);

Is it possible to set the 'visited' bit in caller code? This way we'll
make the function pure, like other bitmap search functions.

Would something like this work? 

int numa_nearest_node_andnot(int node, unsigned int state, const nodemask_t *mask);
#define for_each_numa_node(node, visited, state)			                      \
	for (int start = (node), n = numa_nearest_node_andnot(start, (state), &(visited));    \
	     n < MAX_NUMNODES;					                      \
             node_set(n, (visited)), n = numa_nearest_node_andnot(start, (state), &(visited)))

One other option to think about is that we can introduce numa_nearest_nodemask()
and use it like this

  nodemask_t nodes;

  nodes_complement(nodes, node_states[N_ONLINE];
  for_each_numa_node(node, unvisited)
        do_something();

Where:

int numa_nearest_nodemask(int node, const nodemask_t *mask);
#define for_each_numa_node(node, unvisited)			                      \
	for (int start = (node), n = numa_nearest_nodemask(start, &(unvisited));    \
	     n < MAX_NUMNODES;					                      \
             node_clear(n, (visited)), n = numa_nearest_nodemask(start, &(visited)))

Thanks,
Yury

> +
> +	return min_node;
> +}
> +EXPORT_SYMBOL_GPL(sched_numa_node);
> +
>  /**
>   * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from
>   *                         @node
> -- 
> 2.48.1
Andrea Righi Feb. 7, 2025, 3:44 p.m. UTC | #2
Hi Yury,

On Thu, Feb 06, 2025 at 10:57:19PM -0500, Yury Norov wrote:
> On Thu, Feb 06, 2025 at 09:15:31PM +0100, Andrea Righi wrote:
...
> > @@ -261,6 +267,29 @@ sched_numa_hop_mask(unsigned int node, unsigned int hops)
> >  }
> >  #endif	/* CONFIG_NUMA */
> >  
> > +/**
> > + * for_each_numa_node - iterate over NUMA nodes at increasing hop distances
> > + *                      from a given starting node.
> > + * @node: the iteration variable, representing the current NUMA node.
> > + * @start: the NUMA node to start the iteration from.
> > + * @visited: a nodemask_t to track the visited nodes.
> 
> nit: s/nodemask_t/nodemask

The type is actually nodemask_t, do you think it's better to mention
nodemask instead?

> 
> > + * @state: state of NUMA nodes to iterate.
> > + *
> > + * This macro iterates over NUMA nodes in increasing distance from
> > + * @start_node and yields MAX_NUMNODES when all the nodes have been
> > + * visited.
> > + *
> > + * The difference between for_each_node() and for_each_numa_node() is that
> > + * the former allows to iterate over nodes in no particular order, whereas
> > + * the latter iterates over nodes in increasing order of distance.
> 
> for_each_node iterates them in numerical order. 

Oh yes, much better. :)

> 
> > + *
> > + * Requires rcu_lock to be held.
> > + */
> 
> Please mention complexity here, which is O(N^2). 

Ok. Will also add a comment to describe why it's O(N^2).

> 
> > +#define for_each_numa_node(node, start, visited, state)				\
> > +	for (node = start;							\
> > +	     node != MAX_NUMNODES;						\
> > +	     node = sched_numa_node(&(visited), start, state))
> 
> Please braces around parameters.

Ok.

> 
> 'node < MAX_NUMNODES' is better. It will cost you nothing but will give
> some guarantee that if user passes broken start value, we don't call
> sched_numa_node().

Good point.

> 
> What about:
>         start = -EINVAL;
>         foe_each_numa_node(node, start, visited, N_ONLINE)
>                 do_something(node);
> 
> Whatever garbage user puts in 'start', at the very first iteration,
> do_something() will be executed against it. Offline node, -EINVAL or
> NUMA_NO_NODE are the examples.

So, my idea was actually to use start == NUMA_NO_NODE for all the cases
where the starting node doesn't matter, for example when calling
scx_bpf_pick_idle_cpu(), that doesn't have a prev_cpu context.

Should we implicitly fallback to for_each_node() when
start == NUMA_NO_NODE? And if it's complete garbage maybe just break and
never execute do_something(node)?

> 
> > +
> >  /**
> >   * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
> >   *                          from a given node.
> > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> > index da33ec9e94ab2..e1d0a33415fb5 100644
> > --- a/kernel/sched/topology.c
> > +++ b/kernel/sched/topology.c
> > @@ -2183,6 +2183,48 @@ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
> >  }
> >  EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
> >  
> > +/**
> > + * sched_numa_node - Find the NUMA node at the closest distance from
> > + *		     node @start.
> > + *
> > + * @visited: a pointer to a nodemask_t representing the visited nodes.
> > + * @start: the node to start the search from.
> 
> Maybe just 'node' The 'start' is only meaningful in the caller. Here
> we don't start, we just look for a node that is the most nearest to a
> given one.

Ok.

> 
> What if NOMA_NO_NODE is passed?

We could return the first non-visited node in numerical order. And still
fallback to for_each_node() from for_each_numa_node() when
start == NUMA_NO_NODE.

> 
> > + * @state: the node state to filter nodes by.
> > + *
> > + * This function iterates over all nodes in the given state and calculates
> > + * the distance to the starting node. It returns the node that is the
> > + * closest in terms of distance that has not already been considered (not
> > + * set in @visited and not the starting node). If the node is found, it is
> > + * marked as visited in the @visited node mask.
> > + *
> > + * Returns the node ID closest in terms of hop distance from the @start
> > + * node, or MAX_NUMNODES if no node is found (or all nodes have been
> > + * visited).
> > + */
> > +int sched_numa_node(nodemask_t *visited, int start, unsigned int state)
> 
> The name is somewhat confusing. Sched_numa_node what? If you're looking
> for a closest node, call your function find_closest_node().
> 
> We already have numa_nearest_node(). The difference between this one
> and what you're implementing here is that you add an additional mask.
> So, I'd suggest something like
> 
>  int numa_nearest_node_andnot(int node, unsigned int state, nodemask_t *mask)
> 
> Also, there's about scheduler her, so I'd suggest to place it next to
> numa_nearest_node() in mm/mempolicy.c.

Makes sense, and I agree that mm/mempolicy.c is a better place for this.

> 
> > +{
> > +	int dist, n, min_node, min_dist;
> > +
> > +	min_node = MAX_NUMNODES;
> > +	min_dist = INT_MAX;
> > +
> > +	/* Find the nearest unvisted node */
> 
> If you name it properly, you don't need to explain your intentions in
> the code. Also, at this level of abstraction, the 'visited' name may
> be too specific. Let's refer to it as just a mask containing nodes
> that user wants to skip for whatever reason. 

Ok.

> 
> 
> > +	for_each_node_state(n, state) {
> > +		if (n == start || node_isset(n, *visited))
> > +			continue;
> 
> Nah.
> 
> 1. Skipping start node is wrong. The very first call should return
> 'start' exactly. Then it will be masked out, so the traversing will
> move forward. 
> 2. This should be for_each_node_state_andnot(n, state, mask).
> 
> This way you'll be able to drop the above condition entirely.

Yeah, I agree, I'll revisit this, also considering the comments above.

> 
> > +		dist = node_distance(start, n);
> > +		if (dist < min_dist) {
> > +			min_dist = dist;
> > +			min_node = n;
> > +		}
> > +	}
> > +	if (min_node != MAX_NUMNODES)
> > +		node_set(min_node, *visited);
> 
> Is it possible to set the 'visited' bit in caller code? This way we'll
> make the function pure, like other bitmap search functions.
> 
> Would something like this work? 
> 
> int numa_nearest_node_andnot(int node, unsigned int state, const nodemask_t *mask);
> #define for_each_numa_node(node, visited, state)			                      \
> 	for (int start = (node), n = numa_nearest_node_andnot(start, (state), &(visited));    \
> 	     n < MAX_NUMNODES;					                      \
>              node_set(n, (visited)), n = numa_nearest_node_andnot(start, (state), &(visited)))
> 
> One other option to think about is that we can introduce numa_nearest_nodemask()
> and use it like this
> 
>   nodemask_t nodes;
> 
>   nodes_complement(nodes, node_states[N_ONLINE];
>   for_each_numa_node(node, unvisited)
>         do_something();
> 
> Where:
> 
> int numa_nearest_nodemask(int node, const nodemask_t *mask);
> #define for_each_numa_node(node, unvisited)			                      \
> 	for (int start = (node), n = numa_nearest_nodemask(start, &(unvisited));    \
> 	     n < MAX_NUMNODES;					                      \
>              node_clear(n, (visited)), n = numa_nearest_nodemask(start, &(visited)))

I like the numa_nearest_nodemask() idea, I'll do some experiemnts with it.

Thanks!
-Andrea
Yury Norov Feb. 7, 2025, 5:07 p.m. UTC | #3
On Fri, Feb 07, 2025 at 04:44:07PM +0100, Andrea Righi wrote:
> Hi Yury,
> 
> On Thu, Feb 06, 2025 at 10:57:19PM -0500, Yury Norov wrote:
> > On Thu, Feb 06, 2025 at 09:15:31PM +0100, Andrea Righi wrote:
> ...
> > > @@ -261,6 +267,29 @@ sched_numa_hop_mask(unsigned int node, unsigned int hops)
> > >  }
> > >  #endif	/* CONFIG_NUMA */
> > >  
> > > +/**
> > > + * for_each_numa_node - iterate over NUMA nodes at increasing hop distances
> > > + *                      from a given starting node.
> > > + * @node: the iteration variable, representing the current NUMA node.
> > > + * @start: the NUMA node to start the iteration from.
> > > + * @visited: a nodemask_t to track the visited nodes.
> > 
> > nit: s/nodemask_t/nodemask
> 
> The type is actually nodemask_t, do you think it's better to mention
> nodemask instead?

We just don't put types in variables descriptions. Refer the comment
on top of memcpy():

 /**
  * memcpy - Copy one area of memory to another
  * @dest: Where to copy to
  * @src: Where to copy from
  * @count: The size of the area.
  *
  * You should not use this function to access IO space, use memcpy_toio()
  * or memcpy_fromio() instead.
  */
 void *memcpy(void *dest, const void *src, size_t count)

We don't say like
  * @count: The size_t of the area.

Right?
 
> > int numa_nearest_nodemask(int node, const nodemask_t *mask);
> > #define for_each_numa_node(node, unvisited)			                      \
> > 	for (int start = (node), n = numa_nearest_nodemask(start, &(unvisited));    \
> > 	     n < MAX_NUMNODES;					                      \
> >              node_clear(n, (visited)), n = numa_nearest_nodemask(start, &(visited)))
> 
> I like the numa_nearest_nodemask() idea, I'll do some experiemnts with it.

Yeah, this one looks like the most generic API I can figure out, and
still it fits your needs just as well.

Please in the comment refer that the 'unvisited' will be touched by
the macro.
diff mbox series

Patch

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 52f5850730b3e..0c82b913a8814 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -248,12 +248,18 @@  static inline const struct cpumask *cpu_cpu_mask(int cpu)
 #ifdef CONFIG_NUMA
 int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node);
 extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops);
-#else
+extern int sched_numa_node(nodemask_t *visited, int start, unsigned int state);
+#else /* !CONFIG_NUMA */
 static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
 {
 	return cpumask_nth_and(cpu, cpus, cpu_online_mask);
 }
 
+static inline int sched_numa_node(nodemask_t *visited, int start, unsigned int state)
+{
+	return MAX_NUMNODES;
+}
+
 static inline const struct cpumask *
 sched_numa_hop_mask(unsigned int node, unsigned int hops)
 {
@@ -261,6 +267,29 @@  sched_numa_hop_mask(unsigned int node, unsigned int hops)
 }
 #endif	/* CONFIG_NUMA */
 
+/**
+ * for_each_numa_node - iterate over NUMA nodes at increasing hop distances
+ *                      from a given starting node.
+ * @node: the iteration variable, representing the current NUMA node.
+ * @start: the NUMA node to start the iteration from.
+ * @visited: a nodemask_t to track the visited nodes.
+ * @state: state of NUMA nodes to iterate.
+ *
+ * This macro iterates over NUMA nodes in increasing distance from
+ * @start_node and yields MAX_NUMNODES when all the nodes have been
+ * visited.
+ *
+ * The difference between for_each_node() and for_each_numa_node() is that
+ * the former allows to iterate over nodes in no particular order, whereas
+ * the latter iterates over nodes in increasing order of distance.
+ *
+ * Requires rcu_lock to be held.
+ */
+#define for_each_numa_node(node, start, visited, state)				\
+	for (node = start;							\
+	     node != MAX_NUMNODES;						\
+	     node = sched_numa_node(&(visited), start, state))
+
 /**
  * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
  *                          from a given node.
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index da33ec9e94ab2..e1d0a33415fb5 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2183,6 +2183,48 @@  int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
 }
 EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
 
+/**
+ * sched_numa_node - Find the NUMA node at the closest distance from
+ *		     node @start.
+ *
+ * @visited: a pointer to a nodemask_t representing the visited nodes.
+ * @start: the node to start the search from.
+ * @state: the node state to filter nodes by.
+ *
+ * This function iterates over all nodes in the given state and calculates
+ * the distance to the starting node. It returns the node that is the
+ * closest in terms of distance that has not already been considered (not
+ * set in @visited and not the starting node). If the node is found, it is
+ * marked as visited in the @visited node mask.
+ *
+ * Returns the node ID closest in terms of hop distance from the @start
+ * node, or MAX_NUMNODES if no node is found (or all nodes have been
+ * visited).
+ */
+int sched_numa_node(nodemask_t *visited, int start, unsigned int state)
+{
+	int dist, n, min_node, min_dist;
+
+	min_node = MAX_NUMNODES;
+	min_dist = INT_MAX;
+
+	/* Find the nearest unvisted node */
+	for_each_node_state(n, state) {
+		if (n == start || node_isset(n, *visited))
+			continue;
+		dist = node_distance(start, n);
+		if (dist < min_dist) {
+			min_dist = dist;
+			min_node = n;
+		}
+	}
+	if (min_node != MAX_NUMNODES)
+		node_set(min_node, *visited);
+
+	return min_node;
+}
+EXPORT_SYMBOL_GPL(sched_numa_node);
+
 /**
  * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from
  *                         @node