diff mbox

arm64: topology: add MPIDR-based detection

Message ID 20140603173103.GA18004@red-moon (mailing list archive)
State New, archived
Headers show

Commit Message

Lorenzo Pieralisi June 3, 2014, 5:31 p.m. UTC
Mark,

On Sun, Jun 01, 2014 at 06:37:29PM +0100, Mark Brown wrote:
> From: Zi Shen Lim <zlim@broadcom.com>
> 
> Create cpu topology based on MPIDR. When hardware sets MPIDR to sane
> values, this method will always work. Therefore it should also work well
> as the fallback method. [1]
> 
> When we have multiple processing elements in the system, we create
> the cpu topology by mapping each affinity level (from lowest to highest)
> to threads (if they exist), cores, and clusters.
> 
> We combine data from all higher affinity levels into cluster_id
> so we don't lose any information from MPIDR. [2]

I refactored the patch (UP code path) and deliberately removed the
code that packs affinity levels into the cluster id. I do not
like packing the affinity levels and on second thoughts packing the
unused affinity levels into cluster_id is as correct as packing
the unused affinity levels into core_id (ie it is arbitrary), so I do
not think we should do it, that's the reason why we defined DT bindings
to add a proper topology semantics and we should use them when the MPIDR
values deviate from the "recommendations".

Patch attached, my ack included, should be ready to go, unless you
object to that.

Lorenzo

> [1] http://www.spinics.net/lists/arm-kernel/msg317445.html
> [2] https://lkml.org/lkml/2014/4/23/703
> 
> Signed-off-by: Zi Shen Lim <zlim@broadcom.com>
> Signed-off-by: Mark Brown <broonie@linaro.org>
> ---
>  arch/arm64/include/asm/cputype.h |  2 ++
>  arch/arm64/kernel/topology.c     | 51 +++++++++++++++++++++++++++++-----------
>  2 files changed, 39 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
> index 27f54a7cc81b..ed48a3a7836a 100644
> --- a/arch/arm64/include/asm/cputype.h
> +++ b/arch/arm64/include/asm/cputype.h
> @@ -18,6 +18,8 @@
>  
>  #define INVALID_HWID		ULONG_MAX
>  
> +#define MPIDR_UP_BITMASK	(0x1 << 30)
> +#define MPIDR_MT_BITMASK	(0x1 << 24)
>  #define MPIDR_HWID_BITMASK	0xff00ffffff
>  
>  #define MPIDR_LEVEL_BITS_SHIFT	3
> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
> index 43514f905916..9a0160068503 100644
> --- a/arch/arm64/kernel/topology.c
> +++ b/arch/arm64/kernel/topology.c
> @@ -20,6 +20,7 @@
>  #include <linux/of.h>
>  #include <linux/sched.h>
>  
> +#include <asm/cputype.h>
>  #include <asm/topology.h>
>  
>  static int __init get_cpu_for_node(struct device_node *node)
> @@ -188,13 +189,9 @@ static int __init parse_dt_topology(void)
>  	 * Check that all cores are in the topology; the SMP code will
>  	 * only mark cores described in the DT as possible.
>  	 */
> -	for_each_possible_cpu(cpu) {
> -		if (cpu_topology[cpu].cluster_id == -1) {
> -			pr_err("CPU%d: No topology information specified\n",
> -			       cpu);
> +	for_each_possible_cpu(cpu)
> +		if (cpu_topology[cpu].cluster_id == -1)
>  			ret = -EINVAL;
> -		}
> -	}
>  
>  out_map:
>  	of_node_put(map);
> @@ -219,14 +216,6 @@ static void update_siblings_masks(unsigned int cpuid)
>  	struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
>  	int cpu;
>  
> -	if (cpuid_topo->cluster_id == -1) {
> -		/*
> -		 * DT does not contain topology information for this cpu.
> -		 */
> -		pr_debug("CPU%u: No topology information configured\n", cpuid);
> -		return;
> -	}
> -
>  	/* update core and thread sibling masks */
>  	for_each_possible_cpu(cpu) {
>  		cpu_topo = &cpu_topology[cpu];
> @@ -249,6 +238,40 @@ static void update_siblings_masks(unsigned int cpuid)
>  
>  void store_cpu_topology(unsigned int cpuid)
>  {
> +	struct cpu_topology *cpuid_topo = &cpu_topology[cpuid];
> +	u64 mpidr;
> +
> +	if (cpuid_topo->cluster_id != -1)
> +		goto topology_populated;
> +
> +	mpidr = read_cpuid_mpidr();
> +
> +	/* Create cpu topology mapping based on MPIDR. */
> +	if (mpidr & MPIDR_UP_BITMASK) {
> +		/* Uniprocessor system */
> +		cpuid_topo->thread_id  = -1;
> +		cpuid_topo->core_id    = MPIDR_AFFINITY_LEVEL(mpidr, 0);
> +		cpuid_topo->cluster_id = 0;
> +	} else if (mpidr & MPIDR_MT_BITMASK) {
> +		/* Multiprocessor system : Multi-threads per core */
> +		cpuid_topo->thread_id  = MPIDR_AFFINITY_LEVEL(mpidr, 0);
> +		cpuid_topo->core_id    = MPIDR_AFFINITY_LEVEL(mpidr, 1);
> +		cpuid_topo->cluster_id = MPIDR_AFFINITY_LEVEL(mpidr, 2) |
> +					 MPIDR_AFFINITY_LEVEL(mpidr, 3) << 8;
> +	} else {
> +		/* Multiprocessor system : Single-thread per core */
> +		cpuid_topo->thread_id  = -1;
> +		cpuid_topo->core_id    = MPIDR_AFFINITY_LEVEL(mpidr, 0);
> +		cpuid_topo->cluster_id = MPIDR_AFFINITY_LEVEL(mpidr, 1) |
> +					 MPIDR_AFFINITY_LEVEL(mpidr, 2) << 8 |
> +					 MPIDR_AFFINITY_LEVEL(mpidr, 3) << 16;
> +	}
> +
> +	pr_debug("CPU%u: cluster %d core %d thread %d mpidr %llx\n",
> +		 cpuid, cpuid_topo->cluster_id, cpuid_topo->core_id,
> +		 cpuid_topo->thread_id, mpidr);
> +
> +topology_populated:
>  	update_siblings_masks(cpuid);
>  }
>  
> -- 
> 2.0.0.rc4
> 
>

Comments

Mark Brown June 3, 2014, 9:04 p.m. UTC | #1
On Tue, Jun 03, 2014 at 06:31:03PM +0100, Lorenzo Pieralisi wrote:

> I refactored the patch (UP code path) and deliberately removed the
> code that packs affinity levels into the cluster id. I do not
> like packing the affinity levels and on second thoughts packing the
> unused affinity levels into cluster_id is as correct as packing
> the unused affinity levels into core_id (ie it is arbitrary), so I do

I'm having a really hard time seeing this as anything other than a step
back.  Your change causes us to discard the higher affinity levels which
seems like something we actively know to be misleading and means that we
will be handing the scheduler two different identically numbered cores
(all the affinity levels we do pay attention to will be identical) if we
ever encounter such a system which is actively broken.

> not think we should do it, that's the reason why we defined DT bindings
> to add a proper topology semantics and we should use them when the MPIDR
> values deviate from the "recommendations".

I'd agree with not going to great lengths unless someone defines
semantics for the higher affinity levels in the future however it does
seem like completely ignoring them when it's so easy to take some
account of them is insufficiently defensive - it's similar to things
like putting the of_node_put() calls in even though they don't do
anything at the minute.

> Patch attached, my ack included, should be ready to go, unless you
> object to that.

Well, I certainly don't object to getting something merged here.
Lorenzo Pieralisi June 4, 2014, 9:34 a.m. UTC | #2
On Tue, Jun 03, 2014 at 10:04:24PM +0100, Mark Brown wrote:
> On Tue, Jun 03, 2014 at 06:31:03PM +0100, Lorenzo Pieralisi wrote:
> 
> > I refactored the patch (UP code path) and deliberately removed the
> > code that packs affinity levels into the cluster id. I do not
> > like packing the affinity levels and on second thoughts packing the
> > unused affinity levels into cluster_id is as correct as packing
> > the unused affinity levels into core_id (ie it is arbitrary), so I do
> 
> I'm having a really hard time seeing this as anything other than a step
> back.  Your change causes us to discard the higher affinity levels which
> seems like something we actively know to be misleading and means that we
> will be handing the scheduler two different identically numbered cores
> (all the affinity levels we do pay attention to will be identical) if we
> ever encounter such a system which is actively broken.

If we ever encounter such systems we will deal with them when time
comes, DT can handle them and patching this code is trivial.

All I am saying is, let's wait and see, there is no compelling need to
use aff3 (and aff2 on non-SMT systems) on current systems for the
topology.

> > not think we should do it, that's the reason why we defined DT bindings
> > to add a proper topology semantics and we should use them when the MPIDR
> > values deviate from the "recommendations".
> 
> I'd agree with not going to great lengths unless someone defines
> semantics for the higher affinity levels in the future however it does
> seem like completely ignoring them when it's so easy to take some
> account of them is insufficiently defensive - it's similar to things
> like putting the of_node_put() calls in even though they don't do
> anything at the minute.

So why don't you remove of_node_put() all over the kernel then ?

As for the MPIDR and the affinity levels, see above.

> > Patch attached, my ack included, should be ready to go, unless you
> > object to that.
> 
> Well, I certainly don't object to getting something merged here.

Neither do I, let's get this code in.

Thanks,
Lorenzo
Mark Brown June 4, 2014, 11:57 a.m. UTC | #3
On Wed, Jun 04, 2014 at 10:34:53AM +0100, Lorenzo Pieralisi wrote:
> On Tue, Jun 03, 2014 at 10:04:24PM +0100, Mark Brown wrote:

> > I'm having a really hard time seeing this as anything other than a step
> > back.  Your change causes us to discard the higher affinity levels which
> > seems like something we actively know to be misleading and means that we
> > will be handing the scheduler two different identically numbered cores
> > (all the affinity levels we do pay attention to will be identical) if we
> > ever encounter such a system which is actively broken.

> If we ever encounter such systems we will deal with them when time
> comes, DT can handle them and patching this code is trivial.

> All I am saying is, let's wait and see, there is no compelling need to
> use aff3 (and aff2 on non-SMT systems) on current systems for the
> topology.

That's still a kernel patch or having to write DT to get things working
which for the sake of avoiding a couple of shift and or statements just
seems unhelpful.  If it was just going to give poor performance that'd
be one thing but it's actively broken.

> > > not think we should do it, that's the reason why we defined DT bindings
> > > to add a proper topology semantics and we should use them when the MPIDR
> > > values deviate from the "recommendations".

> > I'd agree with not going to great lengths unless someone defines
> > semantics for the higher affinity levels in the future however it does
> > seem like completely ignoring them when it's so easy to take some
> > account of them is insufficiently defensive - it's similar to things
> > like putting the of_node_put() calls in even though they don't do
> > anything at the minute.

> So why don't you remove of_node_put() all over the kernel then ?

Well, for code that might run on SPARC, PowerPC or (IIRC) x86 where
there are genuine OF implementations as opposed to just FDT and it does
in fact do something so in generic code it's useful.  For code that'll
only be run with FDT personally I'd not bother.
Lorenzo Pieralisi June 4, 2014, 1:01 p.m. UTC | #4
On Wed, Jun 04, 2014 at 12:57:51PM +0100, Mark Brown wrote:
> On Wed, Jun 04, 2014 at 10:34:53AM +0100, Lorenzo Pieralisi wrote:
> > On Tue, Jun 03, 2014 at 10:04:24PM +0100, Mark Brown wrote:
> 
> > > I'm having a really hard time seeing this as anything other than a step
> > > back.  Your change causes us to discard the higher affinity levels which
> > > seems like something we actively know to be misleading and means that we
> > > will be handing the scheduler two different identically numbered cores
> > > (all the affinity levels we do pay attention to will be identical) if we
> > > ever encounter such a system which is actively broken.
> 
> > If we ever encounter such systems we will deal with them when time
> > comes, DT can handle them and patching this code is trivial.
> 
> > All I am saying is, let's wait and see, there is no compelling need to
> > use aff3 (and aff2 on non-SMT systems) on current systems for the
> > topology.
> 
> That's still a kernel patch or having to write DT to get things working
> which for the sake of avoiding a couple of shift and or statements just
> seems unhelpful.  If it was just going to give poor performance that'd
> be one thing but it's actively broken.

What's broken ? Please provide me with an example and I will update the
patch.

Lorenzo
Mark Brown June 4, 2014, 1:54 p.m. UTC | #5
On Wed, Jun 04, 2014 at 02:01:14PM +0100, Lorenzo Pieralisi wrote:
> On Wed, Jun 04, 2014 at 12:57:51PM +0100, Mark Brown wrote:

> > > All I am saying is, let's wait and see, there is no compelling need to
> > > use aff3 (and aff2 on non-SMT systems) on current systems for the

> > That's still a kernel patch or having to write DT to get things working
> > which for the sake of avoiding a couple of shift and or statements just
> > seems unhelpful.  If it was just going to give poor performance that'd
> > be one thing but it's actively broken.

> What's broken ? Please provide me with an example and I will update the
> patch.

If some system we encounter in the future is for example a SMT system
which does use aff3 then until someone actively goes and fixes it by
writing the DT/ACPI or updating the kernel if there's two threads
0,0,0,0 and 0,0,0,1 we'll tell the scheduler they're both 0,0,0 which
we're not supposed to do.
Lorenzo Pieralisi June 4, 2014, 3:51 p.m. UTC | #6
On Wed, Jun 04, 2014 at 02:54:31PM +0100, Mark Brown wrote:
> On Wed, Jun 04, 2014 at 02:01:14PM +0100, Lorenzo Pieralisi wrote:
> > On Wed, Jun 04, 2014 at 12:57:51PM +0100, Mark Brown wrote:
> 
> > > > All I am saying is, let's wait and see, there is no compelling need to
> > > > use aff3 (and aff2 on non-SMT systems) on current systems for the
> 
> > > That's still a kernel patch or having to write DT to get things working
> > > which for the sake of avoiding a couple of shift and or statements just
> > > seems unhelpful.  If it was just going to give poor performance that'd
> > > be one thing but it's actively broken.
> 
> > What's broken ? Please provide me with an example and I will update the
> > patch.
> 
> If some system we encounter in the future is for example a SMT system
> which does use aff3 then until someone actively goes and fixes it by
> writing the DT/ACPI or updating the kernel if there's two threads
> 0,0,0,0 and 0,0,0,1 we'll tell the scheduler they're both 0,0,0 which
> we're not supposed to do.

Ok, so nothing to worry about for now (and as I mentioned that's a "bug"
in arm32 code too, less likely to be triggered, granted).

My question is: is it better to pack affinity levels and "guess" what aff3
(and aff2 on non-SMT) means or add an additional level of hierarchy in the
arm64 topology code (eg book_id - implemented only for s390 to the best
of my knowledge) ?

I personally prefer the latter approach but I think it boils down to
understanding what do we want to provide the scheduler with if we have
a hierarchy that extends beyond "cluster" level.

I will be glad to help you implement it when time comes (and this will also
fix the clusters of clusters DT issue we are facing - ie how to treat them).

Now, I do not think it is a major problem at the moment, merging the
patch I sent will give us more time to discuss how to define the
topology for clusters of clusters, because that's what we are talking
about.

Does it make sense ?

Thanks !
Lorenzo
Mark Brown June 4, 2014, 4:34 p.m. UTC | #7
On Wed, Jun 04, 2014 at 04:51:29PM +0100, Lorenzo Pieralisi wrote:

> My question is: is it better to pack affinity levels and "guess" what aff3
> (and aff2 on non-SMT) means or add an additional level of hierarchy in the
> arm64 topology code (eg book_id - implemented only for s390 to the best
> of my knowledge) ?

Shoving them in there would address the issue as well, yes (though we'd
still have to combine aff2 and aff3 for the non-SMT case).  I don't know
if having books enabled has some overhead we don't want though.

> I personally prefer the latter approach but I think it boils down to
> understanding what do we want to provide the scheduler with if we have
> a hierarchy that extends beyond "cluster" level.

> I will be glad to help you implement it when time comes (and this will also
> fix the clusters of clusters DT issue we are facing - ie how to treat them).

> Now, I do not think it is a major problem at the moment, merging the
> patch I sent will give us more time to discuss how to define the
> topology for clusters of clusters, because that's what we are talking
> about.

In so far as you're saying that we don't really need to worry about
exactly how we handle multi-level clusters properly at the minute I
agree with you - until we have some idea what they physically look like
and can consider how well that maps onto the scheduler and whatnot it
doesn't really matter and we can just ignore it.  Given that I'm not
concerned about just reporting everything as flat like we do with DT at
the minute and don't see a real need to theorise about it, it'll just be
a performance problem and not a correctness problem when it is
encountered.  That feels like a better position to leave things in as it
will be less stress for whoever is bringing up such a fancy new system,
they can stand a reasonable chance of getting things at least running
with minimal effort.

> Does it make sense ?

Like I say I do think that merging your current code is better than
nothing.
Lorenzo Pieralisi June 4, 2014, 5:10 p.m. UTC | #8
On Wed, Jun 04, 2014 at 05:34:00PM +0100, Mark Brown wrote:
> On Wed, Jun 04, 2014 at 04:51:29PM +0100, Lorenzo Pieralisi wrote:
> 
> > My question is: is it better to pack affinity levels and "guess" what aff3
> > (and aff2 on non-SMT) means or add an additional level of hierarchy in the
> > arm64 topology code (eg book_id - implemented only for s390 to the best
> > of my knowledge) ?
> 
> Shoving them in there would address the issue as well, yes (though we'd
> still have to combine aff2 and aff3 for the non-SMT case).  I don't know
> if having books enabled has some overhead we don't want though.
> 
> > I personally prefer the latter approach but I think it boils down to
> > understanding what do we want to provide the scheduler with if we have
> > a hierarchy that extends beyond "cluster" level.
> 
> > I will be glad to help you implement it when time comes (and this will also
> > fix the clusters of clusters DT issue we are facing - ie how to treat them).
> 
> > Now, I do not think it is a major problem at the moment, merging the
> > patch I sent will give us more time to discuss how to define the
> > topology for clusters of clusters, because that's what we are talking
> > about.
> 
> In so far as you're saying that we don't really need to worry about
> exactly how we handle multi-level clusters properly at the minute I
> agree with you - until we have some idea what they physically look like
> and can consider how well that maps onto the scheduler and whatnot it
> doesn't really matter and we can just ignore it.  Given that I'm not
> concerned about just reporting everything as flat like we do with DT at
> the minute and don't see a real need to theorise about it, it'll just be
> a performance problem and not a correctness problem when it is
> encountered.  That feels like a better position to leave things in as it
> will be less stress for whoever is bringing up such a fancy new system,
> they can stand a reasonable chance of getting things at least running
> with minimal effort.

Ok, I think we have an agreement, let's merge the patch I sent and
discuss the way forward to cater for systems with clusters of clusters
when we reasonably expect them to hit production, the scheduler expected
topology might well change by that time and now we are well positioned
to cope with future extensions (and actually packing affinity levels
might well be the final solution if the scheduler expects a "flat"
topology at the higher topology level).

> > Does it make sense ?
> 
> Like I say I do think that merging your current code is better than
> nothing.

Great, thanks for bearing with me.

Thanks !
Lorenzo
Ganapatrao Kulkarni Aug. 18, 2014, 7:39 a.m. UTC | #9
How we map non SMT (MT bit24=0) cores of dual/multi socket system with
the topology which is using only aff0 and aff1?
can we use aff2  ( or concatenating aff2 and aff3)  to represent socket-id?

thanks
Ganapat


On Wed, Jun 4, 2014 at 10:40 PM, Lorenzo Pieralisi
<lorenzo.pieralisi@arm.com> wrote:
> On Wed, Jun 04, 2014 at 05:34:00PM +0100, Mark Brown wrote:
>> On Wed, Jun 04, 2014 at 04:51:29PM +0100, Lorenzo Pieralisi wrote:
>>
>> > My question is: is it better to pack affinity levels and "guess" what aff3
>> > (and aff2 on non-SMT) means or add an additional level of hierarchy in the
>> > arm64 topology code (eg book_id - implemented only for s390 to the best
>> > of my knowledge) ?
>>
>> Shoving them in there would address the issue as well, yes (though we'd
>> still have to combine aff2 and aff3 for the non-SMT case).  I don't know
>> if having books enabled has some overhead we don't want though.
>>
>> > I personally prefer the latter approach but I think it boils down to
>> > understanding what do we want to provide the scheduler with if we have
>> > a hierarchy that extends beyond "cluster" level.
>>
>> > I will be glad to help you implement it when time comes (and this will also
>> > fix the clusters of clusters DT issue we are facing - ie how to treat them).
>>
>> > Now, I do not think it is a major problem at the moment, merging the
>> > patch I sent will give us more time to discuss how to define the
>> > topology for clusters of clusters, because that's what we are talking
>> > about.
>>
>> In so far as you're saying that we don't really need to worry about
>> exactly how we handle multi-level clusters properly at the minute I
>> agree with you - until we have some idea what they physically look like
>> and can consider how well that maps onto the scheduler and whatnot it
>> doesn't really matter and we can just ignore it.  Given that I'm not
>> concerned about just reporting everything as flat like we do with DT at
>> the minute and don't see a real need to theorise about it, it'll just be
>> a performance problem and not a correctness problem when it is
>> encountered.  That feels like a better position to leave things in as it
>> will be less stress for whoever is bringing up such a fancy new system,
>> they can stand a reasonable chance of getting things at least running
>> with minimal effort.
>
> Ok, I think we have an agreement, let's merge the patch I sent and
> discuss the way forward to cater for systems with clusters of clusters
> when we reasonably expect them to hit production, the scheduler expected
> topology might well change by that time and now we are well positioned
> to cope with future extensions (and actually packing affinity levels
> might well be the final solution if the scheduler expects a "flat"
> topology at the higher topology level).
>
>> > Does it make sense ?
>>
>> Like I say I do think that merging your current code is better than
>> nothing.
>
> Great, thanks for bearing with me.
>
> Thanks !
> Lorenzo
>
>
> _______________________________________________
> linaro-kernel mailing list
> linaro-kernel@lists.linaro.org
> http://lists.linaro.org/mailman/listinfo/linaro-kernel
Lorenzo Pieralisi Aug. 18, 2014, 10:36 p.m. UTC | #10
On Mon, Aug 18, 2014 at 08:39:48AM +0100, Ganapatrao Kulkarni wrote:
> How we map non SMT (MT bit24=0) cores of dual/multi socket system with
> the topology which is using only aff0 and aff1?
> can we use aff2  ( or concatenating aff2 and aff3)  to represent socket-id?

Can you provide us with a description of the topology and the MPIDR_EL1
list please ?

I think that DT squashes the levels above cluster so that's how it could
be implemented but first I would like to see what s the CPUs layout
of the system.

Thanks,
Lorenzo

> 
> thanks
> Ganapat
> 
> 
> On Wed, Jun 4, 2014 at 10:40 PM, Lorenzo Pieralisi
> <lorenzo.pieralisi@arm.com> wrote:
> > On Wed, Jun 04, 2014 at 05:34:00PM +0100, Mark Brown wrote:
> >> On Wed, Jun 04, 2014 at 04:51:29PM +0100, Lorenzo Pieralisi wrote:
> >>
> >> > My question is: is it better to pack affinity levels and "guess" what aff3
> >> > (and aff2 on non-SMT) means or add an additional level of hierarchy in the
> >> > arm64 topology code (eg book_id - implemented only for s390 to the best
> >> > of my knowledge) ?
> >>
> >> Shoving them in there would address the issue as well, yes (though we'd
> >> still have to combine aff2 and aff3 for the non-SMT case).  I don't know
> >> if having books enabled has some overhead we don't want though.
> >>
> >> > I personally prefer the latter approach but I think it boils down to
> >> > understanding what do we want to provide the scheduler with if we have
> >> > a hierarchy that extends beyond "cluster" level.
> >>
> >> > I will be glad to help you implement it when time comes (and this will also
> >> > fix the clusters of clusters DT issue we are facing - ie how to treat them).
> >>
> >> > Now, I do not think it is a major problem at the moment, merging the
> >> > patch I sent will give us more time to discuss how to define the
> >> > topology for clusters of clusters, because that's what we are talking
> >> > about.
> >>
> >> In so far as you're saying that we don't really need to worry about
> >> exactly how we handle multi-level clusters properly at the minute I
> >> agree with you - until we have some idea what they physically look like
> >> and can consider how well that maps onto the scheduler and whatnot it
> >> doesn't really matter and we can just ignore it.  Given that I'm not
> >> concerned about just reporting everything as flat like we do with DT at
> >> the minute and don't see a real need to theorise about it, it'll just be
> >> a performance problem and not a correctness problem when it is
> >> encountered.  That feels like a better position to leave things in as it
> >> will be less stress for whoever is bringing up such a fancy new system,
> >> they can stand a reasonable chance of getting things at least running
> >> with minimal effort.
> >
> > Ok, I think we have an agreement, let's merge the patch I sent and
> > discuss the way forward to cater for systems with clusters of clusters
> > when we reasonably expect them to hit production, the scheduler expected
> > topology might well change by that time and now we are well positioned
> > to cope with future extensions (and actually packing affinity levels
> > might well be the final solution if the scheduler expects a "flat"
> > topology at the higher topology level).
> >
> >> > Does it make sense ?
> >>
> >> Like I say I do think that merging your current code is better than
> >> nothing.
> >
> > Great, thanks for bearing with me.
> >
> > Thanks !
> > Lorenzo
> >
> >
> > _______________________________________________
> > linaro-kernel mailing list
> > linaro-kernel@lists.linaro.org
> > http://lists.linaro.org/mailman/listinfo/linaro-kernel
>
Ganapatrao Kulkarni Aug. 20, 2014, 4:08 a.m. UTC | #11
Hi Lorenzo,

On Tue, Aug 19, 2014 at 4:06 AM, Lorenzo Pieralisi
<lorenzo.pieralisi@arm.com> wrote:
> On Mon, Aug 18, 2014 at 08:39:48AM +0100, Ganapatrao Kulkarni wrote:
>> How we map non SMT (MT bit24=0) cores of dual/multi socket system with
>> the topology which is using only aff0 and aff1?
>> can we use aff2  ( or concatenating aff2 and aff3)  to represent socket-id?
>
> Can you provide us with a description of the topology and the MPIDR_EL1
> list please ?
>
> I think that DT squashes the levels above cluster so that's how it could
> be implemented but first I would like to see what s the CPUs layout
> of the system.

Our Soc has 48 core(Cavium's  thunderX) design. 48 cores are
sub-grouped in to 3 clusters.
In turn each cluster is having 16 cores.
We do have support for connecting 2 SoCs and making single system of 96 cores.

the MPIDR mapping for this topology is,

Aff0 is mapped to 16 cores within a cluster. Valid range is 0 to 0xf
Aff1 is mapped to cluster number, valid values are 0,1 and 2.
Aff2 is mapped to Socket-id or SoC number. Valid values are 0 and 1.

note : our definition of cluster and number of cores in a cluster is
in-line with the GICv3 spec.

>
> Thanks,
> Lorenzo
>
>>
>> thanks
>> Ganapat
>>
>>
>> On Wed, Jun 4, 2014 at 10:40 PM, Lorenzo Pieralisi
>> <lorenzo.pieralisi@arm.com> wrote:
>> > On Wed, Jun 04, 2014 at 05:34:00PM +0100, Mark Brown wrote:
>> >> On Wed, Jun 04, 2014 at 04:51:29PM +0100, Lorenzo Pieralisi wrote:
>> >>
>> >> > My question is: is it better to pack affinity levels and "guess" what aff3
>> >> > (and aff2 on non-SMT) means or add an additional level of hierarchy in the
>> >> > arm64 topology code (eg book_id - implemented only for s390 to the best
>> >> > of my knowledge) ?
>> >>
>> >> Shoving them in there would address the issue as well, yes (though we'd
>> >> still have to combine aff2 and aff3 for the non-SMT case).  I don't know
>> >> if having books enabled has some overhead we don't want though.
>> >>
>> >> > I personally prefer the latter approach but I think it boils down to
>> >> > understanding what do we want to provide the scheduler with if we have
>> >> > a hierarchy that extends beyond "cluster" level.
>> >>
>> >> > I will be glad to help you implement it when time comes (and this will also
>> >> > fix the clusters of clusters DT issue we are facing - ie how to treat them).
>> >>
>> >> > Now, I do not think it is a major problem at the moment, merging the
>> >> > patch I sent will give us more time to discuss how to define the
>> >> > topology for clusters of clusters, because that's what we are talking
>> >> > about.
>> >>
>> >> In so far as you're saying that we don't really need to worry about
>> >> exactly how we handle multi-level clusters properly at the minute I
>> >> agree with you - until we have some idea what they physically look like
>> >> and can consider how well that maps onto the scheduler and whatnot it
>> >> doesn't really matter and we can just ignore it.  Given that I'm not
>> >> concerned about just reporting everything as flat like we do with DT at
>> >> the minute and don't see a real need to theorise about it, it'll just be
>> >> a performance problem and not a correctness problem when it is
>> >> encountered.  That feels like a better position to leave things in as it
>> >> will be less stress for whoever is bringing up such a fancy new system,
>> >> they can stand a reasonable chance of getting things at least running
>> >> with minimal effort.
>> >
>> > Ok, I think we have an agreement, let's merge the patch I sent and
>> > discuss the way forward to cater for systems with clusters of clusters
>> > when we reasonably expect them to hit production, the scheduler expected
>> > topology might well change by that time and now we are well positioned
>> > to cope with future extensions (and actually packing affinity levels
>> > might well be the final solution if the scheduler expects a "flat"
>> > topology at the higher topology level).
>> >
>> >> > Does it make sense ?
>> >>
>> >> Like I say I do think that merging your current code is better than
>> >> nothing.
>> >
>> > Great, thanks for bearing with me.
>> >
>> > Thanks !
>> > Lorenzo
>> >
>> >
>> > _______________________________________________
>> > linaro-kernel mailing list
>> > linaro-kernel@lists.linaro.org
>> > http://lists.linaro.org/mailman/listinfo/linaro-kernel
>>
>
Zi Shen Lim Aug. 21, 2014, 5:15 a.m. UTC | #12
Hi Ganapatrao,

On Tue, Aug 19, 2014 at 11:08 PM, Ganapatrao Kulkarni
<gpkulkarni@gmail.com> wrote:
> Hi Lorenzo,
>
> On Tue, Aug 19, 2014 at 4:06 AM, Lorenzo Pieralisi
> <lorenzo.pieralisi@arm.com> wrote:
>> On Mon, Aug 18, 2014 at 08:39:48AM +0100, Ganapatrao Kulkarni wrote:
>>> How we map non SMT (MT bit24=0) cores of dual/multi socket system with
>>> the topology which is using only aff0 and aff1?
>>> can we use aff2  ( or concatenating aff2 and aff3)  to represent socket-id?
>>
>> Can you provide us with a description of the topology and the MPIDR_EL1
>> list please ?
>>
>> I think that DT squashes the levels above cluster so that's how it could
>> be implemented but first I would like to see what s the CPUs layout
>> of the system.
>
> Our Soc has 48 core(Cavium's  thunderX) design. 48 cores are
> sub-grouped in to 3 clusters.
> In turn each cluster is having 16 cores.
> We do have support for connecting 2 SoCs and making single system of 96 cores.
>
> the MPIDR mapping for this topology is,
>
> Aff0 is mapped to 16 cores within a cluster. Valid range is 0 to 0xf
> Aff1 is mapped to cluster number, valid values are 0,1 and 2.
> Aff2 is mapped to Socket-id or SoC number. Valid values are 0 and 1.

So, we went back and forth a few times and finally settled on what's
merged into 3.17, due to lack of visibility into hardware that'll be
shipped.

At one point, we had something approximating behavior of DT-based
topology, along the lines of:

        if (mpidr & MPIDR_MT_BITMASK) {
                /* Multiprocessor system : Multi-threads per core */
                cpuid_topo->thread_id  = MPIDR_AFFINITY_LEVEL(mpidr, 0);
                cpuid_topo->core_id    = MPIDR_AFFINITY_LEVEL(mpidr, 1);
                cpuid_topo->cluster_id = MPIDR_AFFINITY_LEVEL(mpidr, 2) |
                                         MPIDR_AFFINITY_LEVEL(mpidr, 3) << 8;
        } else {
                /* Multiprocessor system : Single-thread per core */
                cpuid_topo->thread_id  = -1;
                cpuid_topo->core_id    = MPIDR_AFFINITY_LEVEL(mpidr, 0);
                cpuid_topo->cluster_id = MPIDR_AFFINITY_LEVEL(mpidr, 1) |
                                         MPIDR_AFFINITY_LEVEL(mpidr, 2) << 8 |
                                         MPIDR_AFFINITY_LEVEL(mpidr, 3) << 16;
        }

BTW, Mark Brown just posted "[PATCH] arm64: topology: Fix handling of
multi-level cluster MPIDR-based detection" which reintroduces the
above, albeit only for MT case.

I suspect you're looking for something above for the non-MT case?
As a workaround, have you considered defining your topology in DT?

Lorenzo,

Are you open to the above "workaround" given that Thunder is facing
the current deficiency for non-MT case?
Or is it time to introduce socket_id or something equivalent?
Lorenzo Pieralisi Aug. 23, 2014, 10:43 a.m. UTC | #13
On Thu, Aug 21, 2014 at 06:15:40AM +0100, Zi Shen Lim wrote:
> Hi Ganapatrao,
> 
> On Tue, Aug 19, 2014 at 11:08 PM, Ganapatrao Kulkarni
> <gpkulkarni@gmail.com> wrote:
> > Hi Lorenzo,
> >
> > On Tue, Aug 19, 2014 at 4:06 AM, Lorenzo Pieralisi
> > <lorenzo.pieralisi@arm.com> wrote:
> >> On Mon, Aug 18, 2014 at 08:39:48AM +0100, Ganapatrao Kulkarni wrote:
> >>> How we map non SMT (MT bit24=0) cores of dual/multi socket system with
> >>> the topology which is using only aff0 and aff1?
> >>> can we use aff2  ( or concatenating aff2 and aff3)  to represent socket-id?
> >>
> >> Can you provide us with a description of the topology and the MPIDR_EL1
> >> list please ?
> >>
> >> I think that DT squashes the levels above cluster so that's how it could
> >> be implemented but first I would like to see what s the CPUs layout
> >> of the system.
> >
> > Our Soc has 48 core(Cavium's  thunderX) design. 48 cores are
> > sub-grouped in to 3 clusters.
> > In turn each cluster is having 16 cores.
> > We do have support for connecting 2 SoCs and making single system of 96 cores.
> >
> > the MPIDR mapping for this topology is,
> >
> > Aff0 is mapped to 16 cores within a cluster. Valid range is 0 to 0xf
> > Aff1 is mapped to cluster number, valid values are 0,1 and 2.
> > Aff2 is mapped to Socket-id or SoC number. Valid values are 0 and 1.
> 
> So, we went back and forth a few times and finally settled on what's
> merged into 3.17, due to lack of visibility into hardware that'll be
> shipped.
> 
> At one point, we had something approximating behavior of DT-based
> topology, along the lines of:
> 
>         if (mpidr & MPIDR_MT_BITMASK) {
>                 /* Multiprocessor system : Multi-threads per core */
>                 cpuid_topo->thread_id  = MPIDR_AFFINITY_LEVEL(mpidr, 0);
>                 cpuid_topo->core_id    = MPIDR_AFFINITY_LEVEL(mpidr, 1);
>                 cpuid_topo->cluster_id = MPIDR_AFFINITY_LEVEL(mpidr, 2) |
>                                          MPIDR_AFFINITY_LEVEL(mpidr, 3) << 8;
>         } else {
>                 /* Multiprocessor system : Single-thread per core */
>                 cpuid_topo->thread_id  = -1;
>                 cpuid_topo->core_id    = MPIDR_AFFINITY_LEVEL(mpidr, 0);
>                 cpuid_topo->cluster_id = MPIDR_AFFINITY_LEVEL(mpidr, 1) |
>                                          MPIDR_AFFINITY_LEVEL(mpidr, 2) << 8 |
>                                          MPIDR_AFFINITY_LEVEL(mpidr, 3) << 16;
>         }
> 
> BTW, Mark Brown just posted "[PATCH] arm64: topology: Fix handling of
> multi-level cluster MPIDR-based detection" which reintroduces the
> above, albeit only for MT case.
> 
> I suspect you're looking for something above for the non-MT case?
> As a workaround, have you considered defining your topology in DT?
> 
> Lorenzo,
> 
> Are you open to the above "workaround" given that Thunder is facing
> the current deficiency for non-MT case?
> Or is it time to introduce socket_id or something equivalent?

Well, we have to do something for certain, I think M. Brown's patch
seems a reasonable solution to this problem, I will give it more thought
and come back to you beginning of September (I am currently on holiday).

Thank you,
Lorenzo
Ganapatrao Kulkarni Aug. 28, 2014, 6:49 a.m. UTC | #14
On Sat, Aug 23, 2014 at 4:13 PM, Lorenzo Pieralisi
<lorenzo.pieralisi@arm.com> wrote:
> On Thu, Aug 21, 2014 at 06:15:40AM +0100, Zi Shen Lim wrote:
>> Hi Ganapatrao,
>>
>> On Tue, Aug 19, 2014 at 11:08 PM, Ganapatrao Kulkarni
>> <gpkulkarni@gmail.com> wrote:
>> > Hi Lorenzo,
>> >
>> > On Tue, Aug 19, 2014 at 4:06 AM, Lorenzo Pieralisi
>> > <lorenzo.pieralisi@arm.com> wrote:
>> >> On Mon, Aug 18, 2014 at 08:39:48AM +0100, Ganapatrao Kulkarni wrote:
>> >>> How we map non SMT (MT bit24=0) cores of dual/multi socket system with
>> >>> the topology which is using only aff0 and aff1?
>> >>> can we use aff2  ( or concatenating aff2 and aff3)  to represent socket-id?
>> >>
>> >> Can you provide us with a description of the topology and the MPIDR_EL1
>> >> list please ?
>> >>
>> >> I think that DT squashes the levels above cluster so that's how it could
>> >> be implemented but first I would like to see what s the CPUs layout
>> >> of the system.
>> >
>> > Our Soc has 48 core(Cavium's  thunderX) design. 48 cores are
>> > sub-grouped in to 3 clusters.
>> > In turn each cluster is having 16 cores.
>> > We do have support for connecting 2 SoCs and making single system of 96 cores.
>> >
>> > the MPIDR mapping for this topology is,
>> >
>> > Aff0 is mapped to 16 cores within a cluster. Valid range is 0 to 0xf
>> > Aff1 is mapped to cluster number, valid values are 0,1 and 2.
>> > Aff2 is mapped to Socket-id or SoC number. Valid values are 0 and 1.
>>
>> So, we went back and forth a few times and finally settled on what's
>> merged into 3.17, due to lack of visibility into hardware that'll be
>> shipped.
>>
>> At one point, we had something approximating behavior of DT-based
>> topology, along the lines of:
>>
>>         if (mpidr & MPIDR_MT_BITMASK) {
>>                 /* Multiprocessor system : Multi-threads per core */
>>                 cpuid_topo->thread_id  = MPIDR_AFFINITY_LEVEL(mpidr, 0);
>>                 cpuid_topo->core_id    = MPIDR_AFFINITY_LEVEL(mpidr, 1);
>>                 cpuid_topo->cluster_id = MPIDR_AFFINITY_LEVEL(mpidr, 2) |
>>                                          MPIDR_AFFINITY_LEVEL(mpidr, 3) << 8;
>>         } else {
>>                 /* Multiprocessor system : Single-thread per core */
>>                 cpuid_topo->thread_id  = -1;
>>                 cpuid_topo->core_id    = MPIDR_AFFINITY_LEVEL(mpidr, 0);
>>                 cpuid_topo->cluster_id = MPIDR_AFFINITY_LEVEL(mpidr, 1) |
>>                                          MPIDR_AFFINITY_LEVEL(mpidr, 2) << 8 |
>>                                          MPIDR_AFFINITY_LEVEL(mpidr, 3) << 16;
>>         }
>>
>> BTW, Mark Brown just posted "[PATCH] arm64: topology: Fix handling of
>> multi-level cluster MPIDR-based detection" which reintroduces the
>> above, albeit only for MT case.
>>
>> I suspect you're looking for something above for the non-MT case?
>> As a workaround, have you considered defining your topology in DT?
>>
>> Lorenzo,
>>
>> Are you open to the above "workaround" given that Thunder is facing
>> the current deficiency for non-MT case?
>> Or is it time to introduce socket_id or something equivalent?
>
> Well, we have to do something for certain, I think M. Brown's patch
> seems a reasonable solution to this problem, I will give it more thought
> and come back to you beginning of September (I am currently on holiday).
>
yes, please.
is it better to align/define cluster as group of  up-to 16 cores?(in
line to gicv3 target-list).
IMHO, it is better to introduce socket instead of calling cluster of cluster.
any plans to have DT bindings for NUMA, is any one working?

> Thank you,
> Lorenzo
>
thanks
Ganapat
diff mbox

Patch

From 1f77b5e2e0b7f3deb11618d37f4ec7d03a45b95d Mon Sep 17 00:00:00 2001
From: Zi Shen Lim <zlim@broadcom.com>
Date: Sun, 1 Jun 2014 18:37:29 +0100
Subject: [PATCH] arm64: topology: add MPIDR-based detection

Create cpu topology based on MPIDR. When hardware sets MPIDR to sane
values, this method will always work. Therefore it should also work well
as the fallback method. [1]

When we have multiple processing elements in the system, we create
the cpu topology by mapping each affinity level (from lowest to highest)
to threads (if they exist), cores, and clusters.

[1] http://www.spinics.net/lists/arm-kernel/msg317445.html

Acked-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Zi Shen Lim <zlim@broadcom.com>
Signed-off-by: Mark Brown <broonie@linaro.org>
---
 arch/arm64/include/asm/cputype.h |  2 ++
 arch/arm64/kernel/topology.c     | 47 ++++++++++++++++++++++++++++------------
 2 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index c404fb0..7639e8b 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -18,6 +18,8 @@ 
 
 #define INVALID_HWID		ULONG_MAX
 
+#define MPIDR_UP_BITMASK	(0x1 << 30)
+#define MPIDR_MT_BITMASK	(0x1 << 24)
 #define MPIDR_HWID_BITMASK	0xff00ffffff
 
 #define MPIDR_LEVEL_BITS_SHIFT	3
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 43514f9..b6ee26b 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -20,6 +20,7 @@ 
 #include <linux/of.h>
 #include <linux/sched.h>
 
+#include <asm/cputype.h>
 #include <asm/topology.h>
 
 static int __init get_cpu_for_node(struct device_node *node)
@@ -188,13 +189,9 @@  static int __init parse_dt_topology(void)
 	 * Check that all cores are in the topology; the SMP code will
 	 * only mark cores described in the DT as possible.
 	 */
-	for_each_possible_cpu(cpu) {
-		if (cpu_topology[cpu].cluster_id == -1) {
-			pr_err("CPU%d: No topology information specified\n",
-			       cpu);
+	for_each_possible_cpu(cpu)
+		if (cpu_topology[cpu].cluster_id == -1)
 			ret = -EINVAL;
-		}
-	}
 
 out_map:
 	of_node_put(map);
@@ -219,14 +216,6 @@  static void update_siblings_masks(unsigned int cpuid)
 	struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
 	int cpu;
 
-	if (cpuid_topo->cluster_id == -1) {
-		/*
-		 * DT does not contain topology information for this cpu.
-		 */
-		pr_debug("CPU%u: No topology information configured\n", cpuid);
-		return;
-	}
-
 	/* update core and thread sibling masks */
 	for_each_possible_cpu(cpu) {
 		cpu_topo = &cpu_topology[cpu];
@@ -249,6 +238,36 @@  static void update_siblings_masks(unsigned int cpuid)
 
 void store_cpu_topology(unsigned int cpuid)
 {
+	struct cpu_topology *cpuid_topo = &cpu_topology[cpuid];
+	u64 mpidr;
+
+	if (cpuid_topo->cluster_id != -1)
+		goto topology_populated;
+
+	mpidr = read_cpuid_mpidr();
+
+	/* Uniprocessor systems can rely on default topology values */
+	if (mpidr & MPIDR_UP_BITMASK)
+		return;
+
+	/* Create cpu topology mapping based on MPIDR. */
+	if (mpidr & MPIDR_MT_BITMASK) {
+		/* Multiprocessor system : Multi-threads per core */
+		cpuid_topo->thread_id  = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+		cpuid_topo->core_id    = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+		cpuid_topo->cluster_id = MPIDR_AFFINITY_LEVEL(mpidr, 2);
+	} else {
+		/* Multiprocessor system : Single-thread per core */
+		cpuid_topo->thread_id  = -1;
+		cpuid_topo->core_id    = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+		cpuid_topo->cluster_id = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+	}
+
+	pr_debug("CPU%u: cluster %d core %d thread %d mpidr %#016llx\n",
+		 cpuid, cpuid_topo->cluster_id, cpuid_topo->core_id,
+		 cpuid_topo->thread_id, mpidr);
+
+topology_populated:
 	update_siblings_masks(cpuid);
 }
 
-- 
1.8.4