diff mbox series

[3/4] sched: add sched_numa_find_nth_cpu()

Message ID 20221111040027.621646-4-yury.norov@gmail.com (mailing list archive)
State Not Applicable
Delegated to: Herbert Xu
Headers show
Series cpumask: improve on cpumask_local_spread() locality | expand

Commit Message

Yury Norov Nov. 11, 2022, 4 a.m. UTC
The function finds Nth set CPU in a given cpumask starting from a given
node.

Leveraging the fact that each hop in sched_domains_numa_masks includes the
same or greater number of CPUs than the previous one, we can use binary
search on hops instead of linear walk, which makes the overall complexity
of O(log n) in terms of number of cpumask_weight() calls.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/topology.h |  8 ++++++++
 kernel/sched/topology.c  | 42 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)

Comments

Yury Norov Nov. 11, 2022, 4:11 a.m. UTC | #1
On Thu, Nov 10, 2022 at 08:00:26PM -0800, Yury Norov wrote:
> The function finds Nth set CPU in a given cpumask starting from a given
> node.
> 
> Leveraging the fact that each hop in sched_domains_numa_masks includes the
> same or greater number of CPUs than the previous one, we can use binary
> search on hops instead of linear walk, which makes the overall complexity
> of O(log n) in terms of number of cpumask_weight() calls.
> 
> Signed-off-by: Yury Norov <yury.norov@gmail.com>
> ---
>  include/linux/topology.h |  8 ++++++++
>  kernel/sched/topology.c  | 42 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 50 insertions(+)
> 
> diff --git a/include/linux/topology.h b/include/linux/topology.h
> index 4564faafd0e1..63048ac3207c 100644
> --- a/include/linux/topology.h
> +++ b/include/linux/topology.h
> @@ -245,5 +245,13 @@ static inline const struct cpumask *cpu_cpu_mask(int cpu)
>  	return cpumask_of_node(cpu_to_node(cpu));
>  }
>  
> +#ifdef CONFIG_NUMA
> +int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node);
> +#else
> +int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)

Ah, this should be static of course.

> +{
> +	return cpumask_nth(cpu, cpus);
> +}
> +#endif	/* CONFIG_NUMA */
>  
>  #endif /* _LINUX_TOPOLOGY_H */
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 8739c2a5a54e..c8f56287de46 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -2067,6 +2067,48 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
>  	return found;
>  }
>  
> +/*
> + * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth next cpu
> + *                             closest to @cpu from @cpumask.
> + * cpumask: cpumask to find a cpu from
> + * cpu: Nth cpu to find
> + *
> + * returns: cpu, or >= nr_cpu_ids when nothing found.
> + */
> +int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
> +{
> +	unsigned int first = 0, mid, last = sched_domains_numa_levels;
> +	struct cpumask ***masks;
> +	int w, ret = nr_cpu_ids;
> +
> +	rcu_read_lock();
> +	masks = rcu_dereference(sched_domains_numa_masks);
> +	if (!masks)
> +		goto out;
> +
> +	while (last >= first) {
> +		mid = (last + first) / 2;
> +
> +		if (cpumask_weight_and(cpus, masks[mid][node]) <= cpu) {
> +			first = mid + 1;
> +			continue;
> +		}
> +
> +		w = (mid == 0) ? 0 : cpumask_weight_and(cpus, masks[mid - 1][node]);
> +		if (w <= cpu)
> +			break;
> +
> +		last = mid - 1;
> +	}
> +
> +	ret = (mid == 0) ?
> +		cpumask_nth_and(cpu - w, cpus, masks[mid][node]) :
> +		cpumask_nth_and_andnot(cpu - w, cpus, masks[mid][node], masks[mid - 1][node]);
> +out:
> +	rcu_read_unlock();
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
>  #endif /* CONFIG_NUMA */
>  
>  static int __sdt_alloc(const struct cpumask *cpu_map)
> -- 
> 2.34.1
Andy Shevchenko Nov. 11, 2022, 11:42 a.m. UTC | #2
On Thu, Nov 10, 2022 at 08:00:26PM -0800, Yury Norov wrote:
> The function finds Nth set CPU in a given cpumask starting from a given
> node.
> 
> Leveraging the fact that each hop in sched_domains_numa_masks includes the
> same or greater number of CPUs than the previous one, we can use binary
> search on hops instead of linear walk, which makes the overall complexity
> of O(log n) in terms of number of cpumask_weight() calls.

...

> +int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
> +{
> +	unsigned int first = 0, mid, last = sched_domains_numa_levels;
> +	struct cpumask ***masks;

*** ?
Hmm... Do we really need such deep indirection?

> +	int w, ret = nr_cpu_ids;
> +
> +	rcu_read_lock();
> +	masks = rcu_dereference(sched_domains_numa_masks);
> +	if (!masks)
> +		goto out;
> +
> +	while (last >= first) {
> +		mid = (last + first) / 2;
> +
> +		if (cpumask_weight_and(cpus, masks[mid][node]) <= cpu) {
> +			first = mid + 1;
> +			continue;
> +		}
> +
> +		w = (mid == 0) ? 0 : cpumask_weight_and(cpus, masks[mid - 1][node]);

See below.

> +		if (w <= cpu)
> +			break;
> +
> +		last = mid - 1;
> +	}

We have lib/bsearch.h. I haven't really looked deeply into the above, but my
gut feelings that that might be useful here. Can you check that?

> +	ret = (mid == 0) ?
> +		cpumask_nth_and(cpu - w, cpus, masks[mid][node]) :
> +		cpumask_nth_and_andnot(cpu - w, cpus, masks[mid][node], masks[mid - 1][node]);

You can also shorten this by inversing the conditional:

	ret = mid ? ...not 0... : ...for 0...;

> +out:

out_unlock: ?

> +	rcu_read_unlock();
> +	return ret;
> +}
Yury Norov Nov. 11, 2022, 5:07 p.m. UTC | #3
On Fri, Nov 11, 2022 at 01:42:29PM +0200, Andy Shevchenko wrote:
> On Thu, Nov 10, 2022 at 08:00:26PM -0800, Yury Norov wrote:
> > The function finds Nth set CPU in a given cpumask starting from a given
> > node.
> > 
> > Leveraging the fact that each hop in sched_domains_numa_masks includes the
> > same or greater number of CPUs than the previous one, we can use binary
> > search on hops instead of linear walk, which makes the overall complexity
> > of O(log n) in terms of number of cpumask_weight() calls.
> 
> ...
> 
> > +int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
> > +{
> > +	unsigned int first = 0, mid, last = sched_domains_numa_levels;
> > +	struct cpumask ***masks;
> 
> *** ?
> Hmm... Do we really need such deep indirection?

It's 2d array of pointers, so - yes.
 
> > +	int w, ret = nr_cpu_ids;
> > +
> > +	rcu_read_lock();
> > +	masks = rcu_dereference(sched_domains_numa_masks);
> > +	if (!masks)
> > +		goto out;
> > +
> > +	while (last >= first) {
> > +		mid = (last + first) / 2;
> > +
> > +		if (cpumask_weight_and(cpus, masks[mid][node]) <= cpu) {
> > +			first = mid + 1;
> > +			continue;
> > +		}
> > +
> > +		w = (mid == 0) ? 0 : cpumask_weight_and(cpus, masks[mid - 1][node]);
> 
> See below.
> 
> > +		if (w <= cpu)
> > +			break;
> > +
> > +		last = mid - 1;
> > +	}
> 
> We have lib/bsearch.h. I haven't really looked deeply into the above, but my
> gut feelings that that might be useful here. Can you check that?

Yes we do. I tried it, and it didn't work because nodes arrays are
allocated dynamically, and distance between different pairs of hops
for a given node is not a constant, which is a requirement for
bsearch.

However, distance between hops pointers in 1st level array should be
constant, and we can try feeding bsearch with it. I'll experiment with
bsearch for more.

> > +	ret = (mid == 0) ?
> > +		cpumask_nth_and(cpu - w, cpus, masks[mid][node]) :
> > +		cpumask_nth_and_andnot(cpu - w, cpus, masks[mid][node], masks[mid - 1][node]);
> 
> You can also shorten this by inversing the conditional:
> 
> 	ret = mid ? ...not 0... : ...for 0...;

Yep, why not.

> > +out:
> 
> out_unlock: ?

Do you think it's better?

> > +	rcu_read_unlock();
> > +	return ret;
> > +}
> 
> -- 
> With Best Regards,
> Andy Shevchenko
>
Andy Shevchenko Nov. 11, 2022, 6:14 p.m. UTC | #4
On Fri, Nov 11, 2022 at 09:07:15AM -0800, Yury Norov wrote:
> On Fri, Nov 11, 2022 at 01:42:29PM +0200, Andy Shevchenko wrote:
> > On Thu, Nov 10, 2022 at 08:00:26PM -0800, Yury Norov wrote:

...

> > > +out:
> > 
> > out_unlock: ?
> 
> Do you think it's better?

Yes. It shows what will happen at goto.

So when one reads the "goto out;" it's something like "return ret;".
But "goto out_unlock;" immediately pictures "unlock; return ret;".

P.S. That's basically the way how we name labels.

> > > +	rcu_read_unlock();
> > > +	return ret;
Yury Norov Nov. 12, 2022, 6:14 p.m. UTC | #5
On Fri, Nov 11, 2022 at 09:07:17AM -0800, Yury Norov wrote:
> On Fri, Nov 11, 2022 at 01:42:29PM +0200, Andy Shevchenko wrote:
> > On Thu, Nov 10, 2022 at 08:00:26PM -0800, Yury Norov wrote:
> > > +	int w, ret = nr_cpu_ids;
> > > +
> > > +	rcu_read_lock();
> > > +	masks = rcu_dereference(sched_domains_numa_masks);
> > > +	if (!masks)
> > > +		goto out;
> > > +
> > > +	while (last >= first) {
> > > +		mid = (last + first) / 2;
> > > +
> > > +		if (cpumask_weight_and(cpus, masks[mid][node]) <= cpu) {
> > > +			first = mid + 1;
> > > +			continue;
> > > +		}
> > > +
> > > +		w = (mid == 0) ? 0 : cpumask_weight_and(cpus, masks[mid - 1][node]);
> > 
> > See below.
> > 
> > > +		if (w <= cpu)
> > > +			break;
> > > +
> > > +		last = mid - 1;
> > > +	}
> > 
> > We have lib/bsearch.h. I haven't really looked deeply into the above, but my
> > gut feelings that that might be useful here. Can you check that?
> 
> Yes we do. I tried it, and it didn't work because nodes arrays are
> allocated dynamically, and distance between different pairs of hops
> for a given node is not a constant, which is a requirement for
> bsearch.
> 
> However, distance between hops pointers in 1st level array should be
> constant, and we can try feeding bsearch with it. I'll experiment with
> bsearch for more.

OK, I tried bsearch on array of hops, and it works. But it requires
adding some black pointers magic. I'll send v2 based on bsearch soon.

Thanks,
Yury
diff mbox series

Patch

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 4564faafd0e1..63048ac3207c 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -245,5 +245,13 @@  static inline const struct cpumask *cpu_cpu_mask(int cpu)
 	return cpumask_of_node(cpu_to_node(cpu));
 }
 
+#ifdef CONFIG_NUMA
+int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node);
+#else
+int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
+{
+	return cpumask_nth(cpu, cpus);
+}
+#endif	/* CONFIG_NUMA */
 
 #endif /* _LINUX_TOPOLOGY_H */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8739c2a5a54e..c8f56287de46 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2067,6 +2067,48 @@  int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
 	return found;
 }
 
+/*
+ * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth next cpu
+ *                             closest to @cpu from @cpumask.
+ * cpumask: cpumask to find a cpu from
+ * cpu: Nth cpu to find
+ *
+ * returns: cpu, or >= nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
+{
+	unsigned int first = 0, mid, last = sched_domains_numa_levels;
+	struct cpumask ***masks;
+	int w, ret = nr_cpu_ids;
+
+	rcu_read_lock();
+	masks = rcu_dereference(sched_domains_numa_masks);
+	if (!masks)
+		goto out;
+
+	while (last >= first) {
+		mid = (last + first) / 2;
+
+		if (cpumask_weight_and(cpus, masks[mid][node]) <= cpu) {
+			first = mid + 1;
+			continue;
+		}
+
+		w = (mid == 0) ? 0 : cpumask_weight_and(cpus, masks[mid - 1][node]);
+		if (w <= cpu)
+			break;
+
+		last = mid - 1;
+	}
+
+	ret = (mid == 0) ?
+		cpumask_nth_and(cpu - w, cpus, masks[mid][node]) :
+		cpumask_nth_and_andnot(cpu - w, cpus, masks[mid][node], masks[mid - 1][node]);
+out:
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
 #endif /* CONFIG_NUMA */
 
 static int __sdt_alloc(const struct cpumask *cpu_map)