diff mbox series

[RFCv1,4/4] perf: arm_spe: Dynamically switch PID tracing to contextidr

Message ID 20211021134530.206216-5-leo.yan@linaro.org (mailing list archive)
State New, archived
Headers show
Series arm64: Use static key for PID in CONTEXTIDR | expand

Commit Message

Leo Yan Oct. 21, 2021, 1:45 p.m. UTC
Now Arm64 provides API for enabling and disable PID tracing, Arm SPE
driver invokes these functions to dynamically enable it during
profiling when the program runs in root PID name space, and disable PID
tracing when the perf event is stopped.

Device drivers should not depend on CONFIG_PID_IN_CONTEXTIDR for PID
tracing, so this patch uses the consistent condition for setting bit
EL1_CX for PMSCR.

Signed-off-by: Leo Yan <leo.yan@linaro.org>
---
 drivers/perf/arm_spe_pmu.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

Comments

Kees Cook Oct. 21, 2021, 3:49 p.m. UTC | #1
On Thu, Oct 21, 2021 at 09:45:30PM +0800, Leo Yan wrote:
> Now Arm64 provides API for enabling and disable PID tracing, Arm SPE
> driver invokes these functions to dynamically enable it during
> profiling when the program runs in root PID name space, and disable PID
> tracing when the perf event is stopped.
> 
> Device drivers should not depend on CONFIG_PID_IN_CONTEXTIDR for PID
> tracing, so this patch uses the consistent condition for setting bit
> EL1_CX for PMSCR.

My own preference here would be to not bother with the new
enable/disable helpers, but just open code it right here. (Save a patch
and is the only user.) But I defer to the taste of arm64 maintainers. :)

-Kees

> 
> Signed-off-by: Leo Yan <leo.yan@linaro.org>
> ---
>  drivers/perf/arm_spe_pmu.c | 13 ++++++++++++-
>  1 file changed, 12 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c
> index d44bcc29d99c..935343cdcb39 100644
> --- a/drivers/perf/arm_spe_pmu.c
> +++ b/drivers/perf/arm_spe_pmu.c
> @@ -23,6 +23,7 @@
>  #include <linux/irq.h>
>  #include <linux/kernel.h>
>  #include <linux/list.h>
> +#include <linux/mmu_context.h>
>  #include <linux/module.h>
>  #include <linux/of_address.h>
>  #include <linux/of_device.h>
> @@ -272,7 +273,7 @@ static u64 arm_spe_event_to_pmscr(struct perf_event *event)
>  	if (!attr->exclude_kernel)
>  		reg |= BIT(SYS_PMSCR_EL1_E1SPE_SHIFT);
>  
> -	if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR) && perfmon_capable())
> +	if (perfmon_capable() && (task_active_pid_ns(current) == &init_pid_ns))
>  		reg |= BIT(SYS_PMSCR_EL1_CX_SHIFT);
>  
>  	return reg;
> @@ -731,6 +732,13 @@ static void arm_spe_pmu_start(struct perf_event *event, int flags)
>  	if (hwc->state)
>  		return;
>  
> +	/*
> +	 * Enable tracing PID to contextidr if profiling program runs in
> +	 * root PID namespace.
> +	 */
> +	if (perfmon_capable() && (task_active_pid_ns(current) == &init_pid_ns))
> +		contextidr_enable();
> +
>  	reg = arm_spe_event_to_pmsfcr(event);
>  	write_sysreg_s(reg, SYS_PMSFCR_EL1);
>  
> @@ -792,6 +800,9 @@ static void arm_spe_pmu_stop(struct perf_event *event, int flags)
>  	}
>  
>  	hwc->state |= PERF_HES_STOPPED;
> +
> +	if (perfmon_capable() && (task_active_pid_ns(current) == &init_pid_ns))
> +		contextidr_disable();
>  }
>  
>  static int arm_spe_pmu_add(struct perf_event *event, int flags)
> -- 
> 2.25.1
>
Leo Yan Oct. 22, 2021, 2:09 a.m. UTC | #2
Hi Kees,

On Thu, Oct 21, 2021 at 08:49:46AM -0700, Kees Cook wrote:
> On Thu, Oct 21, 2021 at 09:45:30PM +0800, Leo Yan wrote:
> > Now Arm64 provides API for enabling and disable PID tracing, Arm SPE
> > driver invokes these functions to dynamically enable it during
> > profiling when the program runs in root PID name space, and disable PID
> > tracing when the perf event is stopped.
> > 
> > Device drivers should not depend on CONFIG_PID_IN_CONTEXTIDR for PID
> > tracing, so this patch uses the consistent condition for setting bit
> > EL1_CX for PMSCR.
> 
> My own preference here would be to not bother with the new
> enable/disable helpers, but just open code it right here. (Save a patch
> and is the only user.) But I defer to the taste of arm64 maintainers. :)

Yes, with your reminding I recognize that we can avoid the new helpers.
Just remind, tracing PID in contextidr is not only used by Arm SPE
driver, it will be used in Arm CoreSight driver as well.  I plan to use
a separate patch set to address Arm CoreSight (CoreSight driver misses
to checking root PID namespace so need firstly fix that issue).

Just give more info, so you and arm64 maintainers could judge we
should use helpers or directly access static key.

Thanks for your review!

Leo
James Clark Oct. 22, 2021, 3:36 p.m. UTC | #3
On 21/10/2021 14:45, Leo Yan wrote:
> Now Arm64 provides API for enabling and disable PID tracing, Arm SPE
> driver invokes these functions to dynamically enable it during
> profiling when the program runs in root PID name space, and disable PID
> tracing when the perf event is stopped.
> 
> Device drivers should not depend on CONFIG_PID_IN_CONTEXTIDR for PID
> tracing, so this patch uses the consistent condition for setting bit
> EL1_CX for PMSCR.

Hi Leo,

I've been testing this change, but I'm seeing something strange. Not sure
if it's a problem on my side or not yet. With this command:

 sudo ./perf record -vvv -e arm_spe//u -- taskset --cpu-list 1 bash -c ls

I'm only seeing 0 values for context:

 sudo ./perf report -D | grep CONTEXT

.  00038dce:  65 00 00 00 00                                  CONTEXT 0x0 el2
.  00038e0e:  65 00 00 00 00                                  CONTEXT 0x0 el2

I added a printk to the function, and I see it print non zero values, although
there are some zero ones mixed in there too:

 diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 0c1669db19a1..8f0fb43a5fac 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -33,7 +33,8 @@ static inline void contextidr_thread_switch(struct task_struct *next)
        if (!static_branch_unlikely(&contextidr_in_use))
                return;
 
-       write_sysreg(task_pid_nr(next), contextidr_el1);
+       printk("Set %d\n", task_pid_nr(next));
+       write_sysreg(task_pid_nr(next), contextidr_el2);
        isb();
 }
 

Results in this:

[   53.257905] Set 77
[   53.257909] Set 0
[   53.258180] Set 77
[   53.258183] Set 0
[   53.258385] Set 309
[   53.258385] Set 172
[   53.258425] Set 77
[   53.258443] Set 990
[   53.258449] Set 77
[   53.258455] Set 990
[   53.258467] Set 310
[   53.258719] Set 7
[   53.258728] Set 77
[   53.258731] Set 0
[   53.258733] Set 0
[   53.258738] Set 7


Without your patchset I don't get 0 values in the SPE trace anymore:

.  0000050e:  65 b1 01 00 00                                  CONTEXT 0x1b1 el2
.  0000054e:  65 b1 01 00 00                                  CONTEXT 0x1b1 el2
.  0000058e:  65 ac 01 00 00                                  CONTEXT 0x1ac el2
.  000005ce:  65 ac 01 00 00                                  CONTEXT 0x1ac el2
James Clark Oct. 22, 2021, 3:40 p.m. UTC | #4
On 22/10/2021 16:36, James Clark wrote:
> 
> 
> On 21/10/2021 14:45, Leo Yan wrote:
>> Now Arm64 provides API for enabling and disable PID tracing, Arm SPE
>> driver invokes these functions to dynamically enable it during
>> profiling when the program runs in root PID name space, and disable PID
>> tracing when the perf event is stopped.
>>
>> Device drivers should not depend on CONFIG_PID_IN_CONTEXTIDR for PID
>> tracing, so this patch uses the consistent condition for setting bit
>> EL1_CX for PMSCR.
> 
> Hi Leo,
> 
> I've been testing this change, but I'm seeing something strange. Not sure
> if it's a problem on my side or not yet. With this command:
> 
>  sudo ./perf record -vvv -e arm_spe//u -- taskset --cpu-list 1 bash -c ls
> 
> I'm only seeing 0 values for context:
> 
>  sudo ./perf report -D | grep CONTEXT
> 
> .  00038dce:  65 00 00 00 00                                  CONTEXT 0x0 el2
> .  00038e0e:  65 00 00 00 00                                  CONTEXT 0x0 el2
> 
> I added a printk to the function, and I see it print non zero values, although
> there are some zero ones mixed in there too:
> 
>  diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
> index 0c1669db19a1..8f0fb43a5fac 100644
> --- a/arch/arm64/include/asm/mmu_context.h
> +++ b/arch/arm64/include/asm/mmu_context.h
> @@ -33,7 +33,8 @@ static inline void contextidr_thread_switch(struct task_struct *next)
>         if (!static_branch_unlikely(&contextidr_in_use))
>                 return;
>  
> -       write_sysreg(task_pid_nr(next), contextidr_el1);
> +       printk("Set %d\n", task_pid_nr(next));
> +       write_sysreg(task_pid_nr(next), contextidr_el2);

Ignore this second line change, that doesn't even compile. It's just the printk that I added.
James Clark Oct. 22, 2021, 4:23 p.m. UTC | #5
On 22/10/2021 16:36, James Clark wrote:
> 
> 
> On 21/10/2021 14:45, Leo Yan wrote:
>> Now Arm64 provides API for enabling and disable PID tracing, Arm SPE
>> driver invokes these functions to dynamically enable it during
>> profiling when the program runs in root PID name space, and disable PID
>> tracing when the perf event is stopped.
>>
>> Device drivers should not depend on CONFIG_PID_IN_CONTEXTIDR for PID
>> tracing, so this patch uses the consistent condition for setting bit
>> EL1_CX for PMSCR.
> 
> Hi Leo,
> 
> I've been testing this change, but I'm seeing something strange. Not sure
> if it's a problem on my side or not yet. With this command:
> 
>  sudo ./perf record -vvv -e arm_spe//u -- taskset --cpu-list 1 bash -c ls
> 
> I'm only seeing 0 values for context:
> 
>  sudo ./perf report -D | grep CONTEXT
> 
> .  00038dce:  65 00 00 00 00                                  CONTEXT 0x0 el2
> .  00038e0e:  65 00 00 00 00                                  CONTEXT 0x0 el2
> 
> I added a printk to the function, and I see it print non zero values, although
> there are some zero ones mixed in there too:
> 
>  diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
> index 0c1669db19a1..8f0fb43a5fac 100644
> --- a/arch/arm64/include/asm/mmu_context.h
> +++ b/arch/arm64/include/asm/mmu_context.h
> @@ -33,7 +33,8 @@ static inline void contextidr_thread_switch(struct task_struct *next)
>         if (!static_branch_unlikely(&contextidr_in_use))
>                 return;
>  
> -       write_sysreg(task_pid_nr(next), contextidr_el1);
> +       printk("Set %d\n", task_pid_nr(next));
> +       write_sysreg(task_pid_nr(next), contextidr_el2);
>         isb();
>  }
>  
> 
> Results in this:
> 
> [   53.257905] Set 77
> [   53.257909] Set 0
> [   53.258180] Set 77
> [   53.258183] Set 0
> [   53.258385] Set 309
> [   53.258385] Set 172
> [   53.258425] Set 77
> [   53.258443] Set 990
> [   53.258449] Set 77
> [   53.258455] Set 990
> [   53.258467] Set 310
> [   53.258719] Set 7
> [   53.258728] Set 77
> [   53.258731] Set 0
> [   53.258733] Set 0
> [   53.258738] Set 7
> 
> 
> Without your patchset I don't get 0 values in the SPE trace anymore:
> 
> .  0000050e:  65 b1 01 00 00                                  CONTEXT 0x1b1 el2
> .  0000054e:  65 b1 01 00 00                                  CONTEXT 0x1b1 el2
> .  0000058e:  65 ac 01 00 00                                  CONTEXT 0x1ac el2
> .  000005ce:  65 ac 01 00 00                                  CONTEXT 0x1ac el2
> 

Is it an issue with building with CONTEXTIDR disabled? Seems like this change results
in context packets set to 0 when it's disabled rather than having the packets disabled
like they used to be:

zcat /proc/config.gz | grep CONTEXTIDR
# CONFIG_PID_IN_CONTEXTIDR is not set

sudo ./perf report -D | grep CONTEXT
.  00045b4e:  65 00 00 00 00                                  CONTEXT 0x0 el2

When I build with CONFIG_PID_IN_CONTEXTIDR=y the contexts are non zero so it seems to
be working that way. But ./perf record -e arm_spe//u -a does have context IDs even
when CONFIG_PID_IN_CONTEXTIDR=n. So I'm still a bit confused.
Leo Yan Oct. 24, 2021, 10:25 a.m. UTC | #6
Hi James,

On Fri, Oct 22, 2021 at 05:23:23PM +0100, James Clark wrote:
> On 22/10/2021 16:36, James Clark wrote:
> > 
> > 
> > On 21/10/2021 14:45, Leo Yan wrote:
> >> Now Arm64 provides API for enabling and disable PID tracing, Arm SPE
> >> driver invokes these functions to dynamically enable it during
> >> profiling when the program runs in root PID name space, and disable PID
> >> tracing when the perf event is stopped.
> >>
> >> Device drivers should not depend on CONFIG_PID_IN_CONTEXTIDR for PID
> >> tracing, so this patch uses the consistent condition for setting bit
> >> EL1_CX for PMSCR.
> > 
> > Hi Leo,
> > 
> > I've been testing this change, but I'm seeing something strange. Not sure
> > if it's a problem on my side or not yet. With this command:
> > 
> >  sudo ./perf record -vvv -e arm_spe//u -- taskset --cpu-list 1 bash -c ls
> > 
> > I'm only seeing 0 values for context:
> > 
> >  sudo ./perf report -D | grep CONTEXT
> > 
> > .  00038dce:  65 00 00 00 00                                  CONTEXT 0x0 el2
> > .  00038e0e:  65 00 00 00 00                                  CONTEXT 0x0 el2

Good catch!  I reproduced this issue at my side and looked into the
flow, the root cause is relevant with timing.

When perf launches the program 'taskset --cpu-list 1 bash -c ls', it
forks a new process and 'ls' program is scheduled in, then function
arm_spe_pmu_start() invokes contextidr_enable() to enable the PID
tracing in contextidr.  Since 'ls' program executes very short and it
simply runs to the end (so in the middle of 'ls' there have no any
context switching on the CPU), there have no any new PID is written
into contextidr and CPU's contextidr keeps zero.  This is the reason
we see the context packets contain zeros for PID.

To fix this issue, we should enable PID tracing when setup AUX ring
buffer, at this phase, the profiled program has not been started yet.
So when the profiled program is scheduled in at the first time, PID
traing is getting ready and we can see the expected context packet in
Arm SPE trace data.   So this patch should be updated as below, I will
apply it in next spin if no objection.

diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c
index c21cf1385cc0..85aa2eab0c2e 100644
--- a/drivers/perf/arm_spe_pmu.c
+++ b/drivers/perf/arm_spe_pmu.c
@@ -876,6 +867,13 @@ static void *arm_spe_pmu_setup_aux(struct perf_event *event, void **pages,
        buf->nr_pages   = nr_pages;
        buf->snapshot   = snapshot;

+       /*
+        * Enable tracing PID to contextidr if profiling program runs in
+        * root PID namespace.
+        */
+       if (perfmon_capable() && (task_active_pid_ns(current) == &init_pid_ns))
+               contextidr_enable();
+
        kfree(pglist);
        return buf;

@@ -890,6 +888,9 @@ static void arm_spe_pmu_free_aux(void *aux)
 {
        struct arm_spe_pmu_buf *buf = aux;

+       if (perfmon_capable() && (task_active_pid_ns(current) == &init_pid_ns))
+               contextidr_disable();
+
        vunmap(buf->base);
        kfree(buf);
 }

Thanks a lot for detailed testing!

Leo
Leo Yan Nov. 1, 2021, 3:28 p.m. UTC | #7
Hi Catalin, Will,

On Thu, Oct 21, 2021 at 08:49:46AM -0700, Kees Cook wrote:
> On Thu, Oct 21, 2021 at 09:45:30PM +0800, Leo Yan wrote:
> > Now Arm64 provides API for enabling and disable PID tracing, Arm SPE
> > driver invokes these functions to dynamically enable it during
> > profiling when the program runs in root PID name space, and disable PID
> > tracing when the perf event is stopped.
> > 
> > Device drivers should not depend on CONFIG_PID_IN_CONTEXTIDR for PID
> > tracing, so this patch uses the consistent condition for setting bit
> > EL1_CX for PMSCR.
> 
> My own preference here would be to not bother with the new
> enable/disable helpers, but just open code it right here. (Save a patch
> and is the only user.) But I defer to the taste of arm64 maintainers. :)

Before I send out a new version for this patch set (for support
dynamic PID tracing on Arm64), I'd like to get your opinions for two
things:

- Firstly, as Kees suggested to directly use variable
  'contextidr_in_use' in drivers, which is exported as GPL symbol,
  it's not necessarily to add two helpers contextidr_{enable|disable}().
  What's your preference for this?

- Secondly, now this patch set only support dynamic PID tracing for
  Arm64; and there would be two customers to use dynamic PID tracing:
  Arm SPE and Coresight ETMv4.x.  So this patch set doesn't support
  dynamic PID tracing for Arm32 (under arch/arm).

  Do you accept this patch set for enabling PID tracing on Arm64 and we
  can defer to support Arm32 when really need PID tracing on Arm32?
  Or we should enable PID dynamic tracing for Arm64 and Arm32 in one
  go?


Thanks,
Leo
Catalin Marinas Dec. 3, 2021, 4:22 p.m. UTC | #8
On Mon, Nov 01, 2021 at 11:28:35PM +0800, Leo Yan wrote:
> On Thu, Oct 21, 2021 at 08:49:46AM -0700, Kees Cook wrote:
> > On Thu, Oct 21, 2021 at 09:45:30PM +0800, Leo Yan wrote:
> > > Now Arm64 provides API for enabling and disable PID tracing, Arm SPE
> > > driver invokes these functions to dynamically enable it during
> > > profiling when the program runs in root PID name space, and disable PID
> > > tracing when the perf event is stopped.
> > > 
> > > Device drivers should not depend on CONFIG_PID_IN_CONTEXTIDR for PID
> > > tracing, so this patch uses the consistent condition for setting bit
> > > EL1_CX for PMSCR.
> > 
> > My own preference here would be to not bother with the new
> > enable/disable helpers, but just open code it right here. (Save a patch
> > and is the only user.) But I defer to the taste of arm64 maintainers. :)
> 
> Before I send out a new version for this patch set (for support
> dynamic PID tracing on Arm64), I'd like to get your opinions for two
> things:
> 
> - Firstly, as Kees suggested to directly use variable
>   'contextidr_in_use' in drivers, which is exported as GPL symbol,
>   it's not necessarily to add two helpers contextidr_{enable|disable}().
>   What's your preference for this?

My preference would be to keep the helpers.

> - Secondly, now this patch set only support dynamic PID tracing for
>   Arm64; and there would be two customers to use dynamic PID tracing:
>   Arm SPE and Coresight ETMv4.x.  So this patch set doesn't support
>   dynamic PID tracing for Arm32 (under arch/arm).
> 
>   Do you accept this patch set for enabling PID tracing on Arm64 and we
>   can defer to support Arm32 when really need PID tracing on Arm32?
>   Or we should enable PID dynamic tracing for Arm64 and Arm32 in one
>   go?

If it doesn't break arm32, it's fine by me.

What's the cost of always enabling CONFIG_PID_IN_CONTEXTIDR? If it's
negligible, I'd not bother at all with any of the enabling/disabling.

Another question: can you run multiple instances of SPE for different
threads on different CPUs? What happens to the global contextidr_in_use
key when one of them stops?
Leo Yan Dec. 5, 2021, 1:51 p.m. UTC | #9
Hi Catalin,

On Fri, Dec 03, 2021 at 04:22:42PM +0000, Catalin Marinas wrote:
> On Mon, Nov 01, 2021 at 11:28:35PM +0800, Leo Yan wrote:
> > On Thu, Oct 21, 2021 at 08:49:46AM -0700, Kees Cook wrote:
> > > On Thu, Oct 21, 2021 at 09:45:30PM +0800, Leo Yan wrote:
> > > > Now Arm64 provides API for enabling and disable PID tracing, Arm SPE
> > > > driver invokes these functions to dynamically enable it during
> > > > profiling when the program runs in root PID name space, and disable PID
> > > > tracing when the perf event is stopped.
> > > > 
> > > > Device drivers should not depend on CONFIG_PID_IN_CONTEXTIDR for PID
> > > > tracing, so this patch uses the consistent condition for setting bit
> > > > EL1_CX for PMSCR.
> > > 
> > > My own preference here would be to not bother with the new
> > > enable/disable helpers, but just open code it right here. (Save a patch
> > > and is the only user.) But I defer to the taste of arm64 maintainers. :)
> > 
> > Before I send out a new version for this patch set (for support
> > dynamic PID tracing on Arm64), I'd like to get your opinions for two
> > things:
> > 
> > - Firstly, as Kees suggested to directly use variable
> >   'contextidr_in_use' in drivers, which is exported as GPL symbol,
> >   it's not necessarily to add two helpers contextidr_{enable|disable}().
> >   What's your preference for this?
> 
> My preference would be to keep the helpers.

Okay, will keep the helpers.

> > - Secondly, now this patch set only support dynamic PID tracing for
> >   Arm64; and there would be two customers to use dynamic PID tracing:
> >   Arm SPE and Coresight ETMv4.x.  So this patch set doesn't support
> >   dynamic PID tracing for Arm32 (under arch/arm).
> > 
> >   Do you accept this patch set for enabling PID tracing on Arm64 and we
> >   can defer to support Arm32 when really need PID tracing on Arm32?
> >   Or we should enable PID dynamic tracing for Arm64 and Arm32 in one
> >   go?
> 
> If it doesn't break arm32, it's fine by me.

In next spin, I will introduce a patch for new Arm32 helpers.  Since
now we have no requirement for dynamic PID tracing, the Arm32 helpers
will be nop operations and can be used for kernel compilation.

> What's the cost of always enabling CONFIG_PID_IN_CONTEXTIDR? If it's
> negligible, I'd not bother at all with any of the enabling/disabling.

Yes, I compared performance for PID tracing with always enabling and
disabling CONFIG_PID_IN_CONTEXTIDR, and also compared with using
static key for enabling/disabling PID tracing.  The result shows the
cost is negligible based on the benchmark 'perf bench sched'.

Please see the detailed data in below link (note the testing results
came from my Juno board):
https://lore.kernel.org/lkml/20211021134530.206216-1-leo.yan@linaro.org/

> Another question: can you run multiple instances of SPE for different
> threads on different CPUs? What happens to the global contextidr_in_use
> key when one of them stops?

No, I only can launch one instance for Arm SPE event via perf tool; when
I tried to launch a second instance, perf tool reports failure:

The sys_perf_event_open() syscall returned with 16 (Device or resource busy) for event (arm_spe_0/load_filter=1,store_filter=1/u).

Alternatively, I'd like give several examples for contextidr_in_use key
values when run different perf modes.

Firstly, I added two kprobe points to monitor contextidr_in_use key
values with below commands, the first probe point is to monitor static
key's increment, and second probe point is to monitor key's
descrement:

  # perf probe --add 'arm_spe_pmu_setup_aux:45 contextidr_in_use=contextidr_in_use.key.enabled.counter:u32'
  # perf probe --add 'arm_spe_pmu_free_aux:7 contextidr_in_use=contextidr_in_use.key.enabled.counter:u32'

Case 1: run perf tool with 'per-thread' mode:

  # perf record -e arm_spe_0/load_filter=1,store_filter=1/u --per-thread -- uname

Trace log shows contextidr_in_use is increment to 1 when setup AUX
buffer and it descrement to 0 when free AUX buffer (before close SPE
event).

  perf-2393    [077] d..1.   427.161612: arm_spe_pmu_setup_aux_L45: (arm_spe_pmu_setup_aux+0x130/0x200) contextidr_in_use=1
  perf-2393    [077] d..1.   427.477993: arm_spe_pmu_free_aux_L7: (arm_spe_pmu_free_aux+0x44/0xa0) contextidr_in_use=0

Case 2: perf tool runs with system-wide mode:

  # perf record -e arm_spe_0/load_filter=1,store_filter=1/u -a -- uname

The system has 128 cores, so every CPU has its own AUX buffer, thus
the static key will increase from 0 to 128; reversely, the static key
will decrease from 128 to 0 when perf tool exists:

  perf-2395    [077] d..1.   435.647270: arm_spe_pmu_setup_aux_L45: (arm_spe_pmu_setup_aux+0x130/0x200) contextidr_in_use=1
  perf-2395    [077] d..1.   435.647912: arm_spe_pmu_setup_aux_L45: (arm_spe_pmu_setup_aux+0x130/0x200) contextidr_in_use=2
  ...
  perf-2395    [077] d..1.   435.709717: arm_spe_pmu_setup_aux_L45: (arm_spe_pmu_setup_aux+0x130/0x200) contextidr_in_use=128
  perf-2395    [127] d..1.   436.734142: arm_spe_pmu_free_aux_L7: (arm_spe_pmu_free_aux+0x44/0xa0) contextidr_in_use=127
  perf-2395    [127] d..1.   436.734438: arm_spe_pmu_free_aux_L7: (arm_spe_pmu_free_aux+0x44/0xa0) contextidr_in_use=126
  perf-2395    [127] d..1.   436.734682: arm_spe_pmu_free_aux_L7: (arm_spe_pmu_free_aux+0x44/0xa0) contextidr_in_use=125
  ...
  perf-2395    [127] d..1.   436.763856: arm_spe_pmu_free_aux_L7: (arm_spe_pmu_free_aux+0x44/0xa0) contextidr_in_use=0

Case 3: perf tool runs for CPU-wise mode:

  # perf record -e arm_spe_0/load_filter=1,store_filter=1/u -C 1,3,4  -- uname

The option '-C 1,3,4' specifies to only enable Arm SPE tracing on
three CPUs (CPU1/3/4), so the contextidr_in_use key will increment for
3 times when open perf event, and decrement the static key for 3 times
when close SPE event:

  perf-2404    [077] d..1.   450.564087: arm_spe_pmu_setup_aux_L45: (arm_spe_pmu_setup_aux+0x130/0x200) contextidr_in_use=1
  perf-2404    [077] d..1.   450.564744: arm_spe_pmu_setup_aux_L45: (arm_spe_pmu_setup_aux+0x130/0x200) contextidr_in_use=2
  perf-2404    [077] d..1.   450.565369: arm_spe_pmu_setup_aux_L45: (arm_spe_pmu_setup_aux+0x130/0x200) contextidr_in_use=3
  perf-2404    [004] d..1.   451.567532: arm_spe_pmu_free_aux_L7: (arm_spe_pmu_free_aux+0x44/0xa0) contextidr_in_use=2
  perf-2404    [004] d..1.   451.567823: arm_spe_pmu_free_aux_L7: (arm_spe_pmu_free_aux+0x44/0xa0) contextidr_in_use=1
  perf-2404    [004] d..1.   451.568120: arm_spe_pmu_free_aux_L7: (arm_spe_pmu_free_aux+0x44/0xa0) contextidr_in_use=0

Hope these three cases can demonstrate the usage for contextidr_in_use
static key.

Thanks,
Leo
Catalin Marinas Dec. 7, 2021, 11:48 a.m. UTC | #10
On Sun, Dec 05, 2021 at 09:51:03PM +0800, Leo Yan wrote:
> On Fri, Dec 03, 2021 at 04:22:42PM +0000, Catalin Marinas wrote:
> > What's the cost of always enabling CONFIG_PID_IN_CONTEXTIDR? If it's
> > negligible, I'd not bother at all with any of the enabling/disabling.
> 
> Yes, I compared performance for PID tracing with always enabling and
> disabling CONFIG_PID_IN_CONTEXTIDR, and also compared with using
> static key for enabling/disabling PID tracing.  The result shows the
> cost is negligible based on the benchmark 'perf bench sched'.
> 
> Please see the detailed data in below link (note the testing results
> came from my Juno board):
> https://lore.kernel.org/lkml/20211021134530.206216-1-leo.yan@linaro.org/

The table wasn't entirely clear to me. So the dis/enb benchmarks are
without this patchset applied. There seems to be a minor drop but it's
probably noise. Anyway, do we need this patchset or we just make
CONFIG_PID_IN_CONTEXTIDR default to y?

> > Another question: can you run multiple instances of SPE for different
> > threads on different CPUs? What happens to the global contextidr_in_use
> > key when one of them stops?
> 
> No, I only can launch one instance for Arm SPE event via perf tool; when
> I tried to launch a second instance, perf tool reports failure:
> 
> The sys_perf_event_open() syscall returned with 16 (Device or resource
> busy) for event (arm_spe_0/load_filter=1,store_filter=1/u).
[...]
> Alternatively, I'd like give several examples for contextidr_in_use key
> values when run different perf modes.
[...]
> Hope these three cases can demonstrate the usage for contextidr_in_use
> static key.

OK, so we can have multiple uses of PID in CONTEXTIDR. Since
static_branch_inc() is refcounted, we get away with this but the
downside is that a CPU won't notice until its next thread switch.
Leo Yan Dec. 7, 2021, 12:31 p.m. UTC | #11
On Tue, Dec 07, 2021 at 11:48:00AM +0000, Catalin Marinas wrote:
> On Sun, Dec 05, 2021 at 09:51:03PM +0800, Leo Yan wrote:
> > On Fri, Dec 03, 2021 at 04:22:42PM +0000, Catalin Marinas wrote:
> > > What's the cost of always enabling CONFIG_PID_IN_CONTEXTIDR? If it's
> > > negligible, I'd not bother at all with any of the enabling/disabling.
> > 
> > Yes, I compared performance for PID tracing with always enabling and
> > disabling CONFIG_PID_IN_CONTEXTIDR, and also compared with using
> > static key for enabling/disabling PID tracing.  The result shows the
> > cost is negligible based on the benchmark 'perf bench sched'.
> > 
> > Please see the detailed data in below link (note the testing results
> > came from my Juno board):
> > https://lore.kernel.org/lkml/20211021134530.206216-1-leo.yan@linaro.org/
> 
> The table wasn't entirely clear to me. So the dis/enb benchmarks are
> without this patchset applied.

Yes, dis/enb metrics don't apply this patchset.

> There seems to be a minor drop but it's
> probably noise. Anyway, do we need this patchset or we just make
> CONFIG_PID_IN_CONTEXTIDR default to y?

Good point.  I remembered before we had discussed for making
CONFIG_PID_IN_CONTEXTIDR to 'y', but this approach is not always valid,
especially when the profiling process runs in non-root PID namespace,
in this case, hardware tracing data (e.g. Arm SPE or CoreSight) cannot
trust the PID values from tracing since the PID conflicts between
different PID namespaces.

So this patchset is to add the fundamental mechanism for dynamically
enabling and disable PID tracing into CONTEXTIDR.  Based on it, we can
use helpers to dynamically enable PID tracing _only_ when process runs
in root PID namespace.

> > > Another question: can you run multiple instances of SPE for different
> > > threads on different CPUs? What happens to the global contextidr_in_use
> > > key when one of them stops?
> > 
> > No, I only can launch one instance for Arm SPE event via perf tool; when
> > I tried to launch a second instance, perf tool reports failure:
> > 
> > The sys_perf_event_open() syscall returned with 16 (Device or resource
> > busy) for event (arm_spe_0/load_filter=1,store_filter=1/u).
> [...]
> > Alternatively, I'd like give several examples for contextidr_in_use key
> > values when run different perf modes.
> [...]
> > Hope these three cases can demonstrate the usage for contextidr_in_use
> > static key.
> 
> OK, so we can have multiple uses of PID in CONTEXTIDR. Since
> static_branch_inc() is refcounted, we get away with this but the
> downside is that a CPU won't notice until its next thread switch.

Yes, it's true that after static key is decreased to 0, any CPUs in
the system will notice for the '0' value until the next context
switch.  But I think this will not introduce issue, please see the
flow:

  Step 1: Perf tool: enable event (Arm SPE or CoreSight event)
  Step 2: Perf tool: enable PID tracing when setup AUX buf
  Step 3: Profiling, run specific program or for all CPUs;
          it can have many context switch and all relevant PIDs
          are stored into CONTEXTIDR.
  Step 4: Perf tool: disable PID tracing when free AUX buf
  Step 5: Perf tool: disable event

The latency you mentioned is introduced between step 4 and step 5,
but it doesn't impact any PID tracing for step 3.

And when the CPU will switch context, it will detect the static key is
decreased to zero so it can skip PID tracing.  Thus it also will not
introduce any redundant PID tracing.

Do I miss any potential issue?

Thanks,
Leo
> 
> -- 
> Catalin
Catalin Marinas Dec. 8, 2021, 5:29 p.m. UTC | #12
On Tue, Dec 07, 2021 at 08:31:18PM +0800, Leo Yan wrote:
> On Tue, Dec 07, 2021 at 11:48:00AM +0000, Catalin Marinas wrote:
> > On Sun, Dec 05, 2021 at 09:51:03PM +0800, Leo Yan wrote:
> > > On Fri, Dec 03, 2021 at 04:22:42PM +0000, Catalin Marinas wrote:
> > > > What's the cost of always enabling CONFIG_PID_IN_CONTEXTIDR? If it's
> > > > negligible, I'd not bother at all with any of the enabling/disabling.
> > > 
> > > Yes, I compared performance for PID tracing with always enabling and
> > > disabling CONFIG_PID_IN_CONTEXTIDR, and also compared with using
> > > static key for enabling/disabling PID tracing.  The result shows the
> > > cost is negligible based on the benchmark 'perf bench sched'.
> > > 
> > > Please see the detailed data in below link (note the testing results
> > > came from my Juno board):
> > > https://lore.kernel.org/lkml/20211021134530.206216-1-leo.yan@linaro.org/
> > 
> > The table wasn't entirely clear to me. So the dis/enb benchmarks are
> > without this patchset applied.
> 
> Yes, dis/enb metrics don't apply this patchset.
> 
> > There seems to be a minor drop but it's
> > probably noise. Anyway, do we need this patchset or we just make
> > CONFIG_PID_IN_CONTEXTIDR default to y?
> 
> Good point.  I remembered before we had discussed for making
> CONFIG_PID_IN_CONTEXTIDR to 'y', but this approach is not always valid,
> especially when the profiling process runs in non-root PID namespace,
> in this case, hardware tracing data (e.g. Arm SPE or CoreSight) cannot
> trust the PID values from tracing since the PID conflicts between
> different PID namespaces.
> 
> So this patchset is to add the fundamental mechanism for dynamically
> enabling and disable PID tracing into CONTEXTIDR.  Based on it, we can
> use helpers to dynamically enable PID tracing _only_ when process runs
> in root PID namespace.

I don't think your approach fully works. Let's say you are tracing two
processes, one in the root PID namespace, the other not. Since the
former enables PID in CONTEXTIDR, you automatically get some PID in
CONTEXTIDR for the latter whether you requested it explicitly or not.

I wonder whether it makes more sense to turn this on per thread. You set
some TIF flag and set the PID in contextidr_thread_switch() only if the
flag is set. You could also check there if the PID is in the root
namespace and avoid setting CONTEXTIDR (or write 0).
Leo Yan Dec. 10, 2021, 7:59 a.m. UTC | #13
On Wed, Dec 08, 2021 at 05:29:41PM +0000, Catalin Marinas wrote:
> On Tue, Dec 07, 2021 at 08:31:18PM +0800, Leo Yan wrote:
> > On Tue, Dec 07, 2021 at 11:48:00AM +0000, Catalin Marinas wrote:
> > > On Sun, Dec 05, 2021 at 09:51:03PM +0800, Leo Yan wrote:
> > > > On Fri, Dec 03, 2021 at 04:22:42PM +0000, Catalin Marinas wrote:
> > > > > What's the cost of always enabling CONFIG_PID_IN_CONTEXTIDR? If it's
> > > > > negligible, I'd not bother at all with any of the enabling/disabling.
> > > > 
> > > > Yes, I compared performance for PID tracing with always enabling and
> > > > disabling CONFIG_PID_IN_CONTEXTIDR, and also compared with using
> > > > static key for enabling/disabling PID tracing.  The result shows the
> > > > cost is negligible based on the benchmark 'perf bench sched'.
> > > > 
> > > > Please see the detailed data in below link (note the testing results
> > > > came from my Juno board):
> > > > https://lore.kernel.org/lkml/20211021134530.206216-1-leo.yan@linaro.org/
> > > 
> > > The table wasn't entirely clear to me. So the dis/enb benchmarks are
> > > without this patchset applied.
> > 
> > Yes, dis/enb metrics don't apply this patchset.
> > 
> > > There seems to be a minor drop but it's
> > > probably noise. Anyway, do we need this patchset or we just make
> > > CONFIG_PID_IN_CONTEXTIDR default to y?
> > 
> > Good point.  I remembered before we had discussed for making
> > CONFIG_PID_IN_CONTEXTIDR to 'y', but this approach is not always valid,
> > especially when the profiling process runs in non-root PID namespace,
> > in this case, hardware tracing data (e.g. Arm SPE or CoreSight) cannot
> > trust the PID values from tracing since the PID conflicts between
> > different PID namespaces.
> > 
> > So this patchset is to add the fundamental mechanism for dynamically
> > enabling and disable PID tracing into CONTEXTIDR.  Based on it, we can
> > use helpers to dynamically enable PID tracing _only_ when process runs
> > in root PID namespace.
> 
> I don't think your approach fully works. Let's say you are tracing two
> processes, one in the root PID namespace, the other not. Since the
> former enables PID in CONTEXTIDR, you automatically get some PID in
> CONTEXTIDR for the latter whether you requested it explicitly or not.

The key point is kernel always sets a PID number from the root PID
namespace into the system register CONTEXTIDR.

Let's see a case:

  # unshare --fork --pid perf record -e cs_etm// -m 64K,64K -a -o perf_test.data -- ./multi_threads_test

In this case, with command "unshare --fork --pid", perf tool and the
profiled program multi_threads_test run in the created PID namespace.
We can see the dumped PID values:

   <idle>-0       [000] d..2.   331.751681: __switch_to: contextidr_thread_switch: task perf-exec pid 840 ns_pid 2
   <idle>-0       [002] d..2.   331.755930: __switch_to: contextidr_thread_switch: task multi_threads_t pid 842 ns_pid 4
   <idle>-0       [003] d..2.   331.755993: __switch_to: contextidr_thread_switch: task multi_threads_t pid 843 ns_pid 5
  sugov:0-129     [005] d..2.   332.323469: __switch_to: contextidr_thread_switch: task perf pid 841 ns_pid 3

We can see processes have two different PIDs values, say task
'perf-exec', 840 is its PID number from the root PID namespace, and 2
is its PID from the active namespace the task is belonging to.

But the register CONTEXTIDR is _only_ used to trace PIDs from the root
PID namespace and ignores PID values from any other PID namespace.

Come back to your question, if there have two tracing sessions, one
session runs in the root PID namespace, and another runs in the
non-root PID namespace.  Since we can gather the PIDs in the root namespace,
the session in the root PID namespace can work perfectly, and we can
ensure the PID infos matching well between kernel's PID tracing and user
space tool (note: perf needs gather process info with process' pid/tid for
post parsing).

For the later session in non-root PID name space, we doesn't capture
any PID tracing data.  This results in the profiled result doesn't
contain any PID info, although it's painful that the PID info is
incomplete but we don't deliver any wrong info for users studying
profiling result and PID is always '-1'.

I think the purpose for this patchset is to allow us to dynamically
enable PID tracing when the tracing session runs in root namespace.

Furthermore, please see the patch [1], you could see we also need an
extra checking in SPE driver for setting SPE's control register to
disable PID packets so can avoid PID polutions if the session runs in
non-root PID namespace:

  // Only enable PID packets when run in root namespace
  if (perfmon_capable() && (task_active_pid_ns(current) == &init_pid_ns))
 		reg |= BIT(SYS_PMSCR_EL1_CX_SHIFT);

[1] https://lore.kernel.org/lkml/20211021134530.206216-5-leo.yan@linaro.org/

> I wonder whether it makes more sense to turn this on per thread. You set
> some TIF flag and set the PID in contextidr_thread_switch() only if the
> flag is set.

Good point.  IIRC, before Suzuki brought up this suggestion as well.

I considered it and these two days I looked into the perf code, it's feasible
to apply the TIF flag for perf's 'per-thread' mode or process mode.  In these
modes, we can set the thread flags when profiled task/thread is forked.
But the difficult thing is how we can handle for CPU wide profiling.

In CPU wide mode, the perf tool opens the tracing events on several
or all CPUs, the running tasks running on the CPU are not forked
from perf session, in the below diagram, there have three tasks (T1/2/3),
we can see tasks T2/T3 even were created before perf tool is
lanched; in this case, it's hard for perf tool to set TIF flag for all
tasks in the system, and it's possible to see the tasks migrate from
one CPU to another CPU.

            Perf enables event                  Perf disables event
               on the CPU                          on the CPU
                  |                                   |
         ##T2##   V    ###T1###  ##T3##  ####T2#####  V
  CPU0  ------------------------------------------------->
                  |                                   |
            #T3#  V    #######T2######                V
  CPU1  ------------------------------------------------->

                                        ##T1##
  CPU2  ------------------------------------------------->
              ^
              ` The event is not enabled on CPU2 so CPU2 doesn't
                capture any trace data.

> You could also check there if the PID is in the root
> namespace and avoid setting CONTEXTIDR (or write 0).

This could introduce mess.  Writing 0 can lead the decoder to take it
as idle thread; if skip setting CONTEXTIDR, the tracer might use a
stale stale PID number (the previous one ID number).

Alternatively, if you accept to always set PID to CONTEXTIDR in
contextidr_thread_switch(), it would be fine for me and we can only
need to control PID packets in SPE and CoreSight drivers.

Please let me know your opinion, thanks!

Leo
Leo Yan Dec. 17, 2021, 7:58 a.m. UTC | #14
On Fri, Dec 10, 2021 at 03:59:18PM +0800, Leo Yan wrote:

[...]

> > You could also check there if the PID is in the root
> > namespace and avoid setting CONTEXTIDR (or write 0).
> 
> This could introduce mess.  Writing 0 can lead the decoder to take it
> as idle thread; if skip setting CONTEXTIDR, the tracer might use a
> stale stale PID number (the previous one ID number).
> 
> Alternatively, if you accept to always set PID to CONTEXTIDR in
> contextidr_thread_switch(), it would be fine for me and we can only
> need to control PID packets in SPE and CoreSight drivers.
> 
> Please let me know your opinion, thanks!

Gentle ping, Catalin.  If anything is blur and you want me to clarify,
please let me know.  Sorry if you are in holiday and if so we can
delay after holiday.

Thanks,
Leo
Catalin Marinas Jan. 17, 2022, 6:48 p.m. UTC | #15
Hi Leo,

(sorry for the delay, holidays, merging window)

On Fri, Dec 10, 2021 at 03:59:18PM +0800, Leo Yan wrote:
> On Wed, Dec 08, 2021 at 05:29:41PM +0000, Catalin Marinas wrote:
> > On Tue, Dec 07, 2021 at 08:31:18PM +0800, Leo Yan wrote:
> > > On Tue, Dec 07, 2021 at 11:48:00AM +0000, Catalin Marinas wrote:
> > > > On Sun, Dec 05, 2021 at 09:51:03PM +0800, Leo Yan wrote:
> > > > > On Fri, Dec 03, 2021 at 04:22:42PM +0000, Catalin Marinas wrote:
> > > > > > What's the cost of always enabling CONFIG_PID_IN_CONTEXTIDR? If it's
> > > > > > negligible, I'd not bother at all with any of the enabling/disabling.
> > > > > 
> > > > > Yes, I compared performance for PID tracing with always enabling and
> > > > > disabling CONFIG_PID_IN_CONTEXTIDR, and also compared with using
> > > > > static key for enabling/disabling PID tracing.  The result shows the
> > > > > cost is negligible based on the benchmark 'perf bench sched'.
> > > > > 
> > > > > Please see the detailed data in below link (note the testing results
> > > > > came from my Juno board):
> > > > > https://lore.kernel.org/lkml/20211021134530.206216-1-leo.yan@linaro.org/
> > > > 
> > > > The table wasn't entirely clear to me. So the dis/enb benchmarks are
> > > > without this patchset applied.
> > > 
> > > Yes, dis/enb metrics don't apply this patchset.
> > > 
> > > > There seems to be a minor drop but it's
> > > > probably noise. Anyway, do we need this patchset or we just make
> > > > CONFIG_PID_IN_CONTEXTIDR default to y?
> > > 
> > > Good point.  I remembered before we had discussed for making
> > > CONFIG_PID_IN_CONTEXTIDR to 'y', but this approach is not always valid,
> > > especially when the profiling process runs in non-root PID namespace,
> > > in this case, hardware tracing data (e.g. Arm SPE or CoreSight) cannot
> > > trust the PID values from tracing since the PID conflicts between
> > > different PID namespaces.
> > > 
> > > So this patchset is to add the fundamental mechanism for dynamically
> > > enabling and disable PID tracing into CONTEXTIDR.  Based on it, we can
> > > use helpers to dynamically enable PID tracing _only_ when process runs
> > > in root PID namespace.
> > 
> > I don't think your approach fully works. Let's say you are tracing two
> > processes, one in the root PID namespace, the other not. Since the
> > former enables PID in CONTEXTIDR, you automatically get some PID in
> > CONTEXTIDR for the latter whether you requested it explicitly or not.
> 
> The key point is kernel always sets a PID number from the root PID
> namespace into the system register CONTEXTIDR.

So earlier you mentioned that CONFIG_PID_IN_CONTEXTIDR=y is not always
right since a profiled task may run in a non-root PID namespace. But
this series introduces a single knob to turn on PID in CONTEXTIDR for
all CPUs, irrespective of whether they run a task in the root PID
namespace or not. The CPU running a task in the root ns may call
contextidr_enable() but the other CPUs may run tasks in non-root ns.

IIUC, with this patch, to avoid the root ns PID for a non-root ns task,
the SPE driver only sets the SYS_PMSCR_EL1_CX_SHIFT bit for the root ns
tasks. So, in this case, for a non-root ns task, does it matter that
CONTEXTIDR always contain the root ns PID? If it matters, see above why
a single knob is already doing this.

> Let's see a case:
> 
>   # unshare --fork --pid perf record -e cs_etm// -m 64K,64K -a -o perf_test.data -- ./multi_threads_test
> 
> In this case, with command "unshare --fork --pid", perf tool and the
> profiled program multi_threads_test run in the created PID namespace.
> We can see the dumped PID values:
> 
>    <idle>-0       [000] d..2.   331.751681: __switch_to: contextidr_thread_switch: task perf-exec pid 840 ns_pid 2
>    <idle>-0       [002] d..2.   331.755930: __switch_to: contextidr_thread_switch: task multi_threads_t pid 842 ns_pid 4
>    <idle>-0       [003] d..2.   331.755993: __switch_to: contextidr_thread_switch: task multi_threads_t pid 843 ns_pid 5
>   sugov:0-129     [005] d..2.   332.323469: __switch_to: contextidr_thread_switch: task perf pid 841 ns_pid 3
> 
> We can see processes have two different PIDs values, say task
> 'perf-exec', 840 is its PID number from the root PID namespace, and 2
> is its PID from the active namespace the task is belonging to.
> 
> But the register CONTEXTIDR is _only_ used to trace PIDs from the root
> PID namespace and ignores PID values from any other PID namespace.

Would it help if we used task_pid_nr_ns() instead for setting
CONTEXTIDR?

> Come back to your question, if there have two tracing sessions, one
> session runs in the root PID namespace, and another runs in the
> non-root PID namespace.  Since we can gather the PIDs in the root namespace,
> the session in the root PID namespace can work perfectly, and we can
> ensure the PID infos matching well between kernel's PID tracing and user
> space tool (note: perf needs gather process info with process' pid/tid for
> post parsing).
> 
> For the later session in non-root PID name space, we doesn't capture
> any PID tracing data.  This results in the profiled result doesn't
> contain any PID info, although it's painful that the PID info is
> incomplete but we don't deliver any wrong info for users studying
> profiling result and PID is always '-1'.

So what's actually disabling the capture of PID tracing data? Is it the
PMSCR_EL1.CX bit being 0?

> I think the purpose for this patchset is to allow us to dynamically
> enable PID tracing when the tracing session runs in root namespace.

But it doesn't do this with a single global knob for all CPUs. You only
need the SPE patch together with the CONTEXTIDR config set to y.

> Alternatively, if you accept to always set PID to CONTEXTIDR in
> contextidr_thread_switch(), it would be fine for me and we can only
> need to control PID packets in SPE and CoreSight drivers.

Well, we have the config option and infrastructure already, it's up to
Linux distros to turn it on. It doesn't necessarily need to be in
defconfig.

Now, if we removed the CONTEXTIDR setting altogether, is there a way for
perf tools to coordinate the traces with the scheduling events? This
would be a simpler approach from the kernel perspective (i.e. no work).
Alternatively, could we move the CONTEXTIDR setting to a perf driver.
I'm not too familiar with perf but I guess we could add some scheduling
event hooks.
Leo Yan Feb. 1, 2022, 1:02 p.m. UTC | #16
Hi Catalin,

On Mon, Jan 17, 2022 at 06:48:10PM +0000, Catalin Marinas wrote:
> Hi Leo,
> 
> (sorry for the delay, holidays, merging window)

I am sorry too for the delay.  I read your email but have no sufficient
time to reply it in the past two weeks.

[...]

> > > I don't think your approach fully works. Let's say you are tracing two
> > > processes, one in the root PID namespace, the other not. Since the
> > > former enables PID in CONTEXTIDR, you automatically get some PID in
> > > CONTEXTIDR for the latter whether you requested it explicitly or not.
> > 
> > The key point is kernel always sets a PID number from the root PID
> > namespace into the system register CONTEXTIDR.
> 
> So earlier you mentioned that CONFIG_PID_IN_CONTEXTIDR=y is not always
> right since a profiled task may run in a non-root PID namespace. But
> this series introduces a single knob to turn on PID in CONTEXTIDR for
> all CPUs, irrespective of whether they run a task in the root PID
> namespace or not. The CPU running a task in the root ns may call
> contextidr_enable() but the other CPUs may run tasks in non-root ns.

Sorry I introduced confusion.  Let me to clarify:

CONTEXTIDR should be always used to trace root ns PIDs, this can give us
a consistent semantics.  Especially, when we record trace data with
system wide mode (e.g. use perf command with option "-a" for tracing all
CPUs), CONTEXTIDR always contains root ns PIDs for all tasks.

On the flip side when perf and profiled program run in non-root PID
namespace, it must not rely on CONTEXTIDR anymore for PID tracing.  In
some implementations (E.g. Arm SPE, x86 Intel-PT), perf tool can use
the software trace events to retrieve PID numbers, and heavily based on
the timestamp correlation between software trace events and hardware
trace events.

> IIUC, with this patch, to avoid the root ns PID for a non-root ns task,
> the SPE driver only sets the SYS_PMSCR_EL1_CX_SHIFT bit for the root ns
> tasks. So, in this case, for a non-root ns task, does it matter that
> CONTEXTIDR always contain the root ns PID? If it matters, see above why
> a single knob is already doing this.

You are right, even CONTEXTIDR always contains the root ns PID, we
still can control the bit SYS_PMSCR_EL1_CX_SHIFT in SPE driver:
when the profiled program runs in non-root ns, Arm SPE driver clears
SYS_PMSCR_EL1_CX_SHIFT bit; and if the profiled program runs in root
ns, ARM SPE driver sets SYS_PMSCR_EL1_CX_SHIFT bit alternatively.

  Diagram 1: Perf/profiled program run in root namespace
  +------------+    +---------+    +-----------------+
  | CONTEXTIDR | -> | Arm SPE | -> | Perf Trace Data |
  +------------+    +---------+    +-----------------+
                         ^                 ^
                         |                 ` Contains context packet
                         ` Set SYS_PMSCR_EL1_CX_SHIFT bit

  Diagram 2: Perf/profiled program run in non-root namespace
  +------------+    +---------+    +-----------------+
  | CONTEXTIDR | -> | Arm SPE | -> | Perf Trace Data |
  +------------+    +---------+    +-----------------+
                         ^                 ^
                         |                 ` Doesn't contain any context packet
                         ` Clear SYS_PMSCR_EL1_CX_SHIFT bit

As shown in diagram 2, when Arm SPE driver detects if the session runs
in non-root PID namespace, it clears SYS_PMSCR_EL1_CX_SHIFT bit.  As a
result, the SPE tracing data in perf file doesn't contain any context
packets, thus perf tool will rollback to use software events to retrieve
PID numbers and assign them to samples.

> > Let's see a case:
> > 
> >   # unshare --fork --pid perf record -e cs_etm// -m 64K,64K -a -o perf_test.data -- ./multi_threads_test
> > 
> > In this case, with command "unshare --fork --pid", perf tool and the
> > profiled program multi_threads_test run in the created PID namespace.
> > We can see the dumped PID values:
> > 
> >    <idle>-0       [000] d..2.   331.751681: __switch_to: contextidr_thread_switch: task perf-exec pid 840 ns_pid 2
> >    <idle>-0       [002] d..2.   331.755930: __switch_to: contextidr_thread_switch: task multi_threads_t pid 842 ns_pid 4
> >    <idle>-0       [003] d..2.   331.755993: __switch_to: contextidr_thread_switch: task multi_threads_t pid 843 ns_pid 5
> >   sugov:0-129     [005] d..2.   332.323469: __switch_to: contextidr_thread_switch: task perf pid 841 ns_pid 3
> > 
> > We can see processes have two different PIDs values, say task
> > 'perf-exec', 840 is its PID number from the root PID namespace, and 2
> > is its PID from the active namespace the task is belonging to.
> > 
> > But the register CONTEXTIDR is _only_ used to trace PIDs from the root
> > PID namespace and ignores PID values from any other PID namespace.
> 
> Would it help if we used task_pid_nr_ns() instead for setting
> CONTEXTIDR?

No, I don't think using task_pid_nr_ns() is a good idea for setting
CONTEXTIDR; this will introduce mess for the system wide mode in the
perf docoding phase.

E.g. for below perf command:

  # perf record -e arm_spe_0// -a -- sleep 1

In this case, perf tool gathers all processes info from the root
PID namespace via proc FS in the recording phase.

If we store task_pid_nr_ns() into CONTEXTIDR, it will lead to confusion
for perf tool, since the PID numbers come from different namespaces,
during the decoding phase perf tool will wrongly match PIDs (from
different PID namespace) with the recorded PIDs from root namespace.

> > Come back to your question, if there have two tracing sessions, one
> > session runs in the root PID namespace, and another runs in the
> > non-root PID namespace.  Since we can gather the PIDs in the root namespace,
> > the session in the root PID namespace can work perfectly, and we can
> > ensure the PID infos matching well between kernel's PID tracing and user
> > space tool (note: perf needs gather process info with process' pid/tid for
> > post parsing).
> > 
> > For the later session in non-root PID name space, we doesn't capture
> > any PID tracing data.  This results in the profiled result doesn't
> > contain any PID info, although it's painful that the PID info is
> > incomplete but we don't deliver any wrong info for users studying
> > profiling result and PID is always '-1'.
> 
> So what's actually disabling the capture of PID tracing data? Is it the
> PMSCR_EL1.CX bit being 0?

Yes.  I agree it does make sense to clear PMSCR_EL1.CX bit for session
in non-root ns.

Sorry before I didn't make clear for this.

> > I think the purpose for this patchset is to allow us to dynamically
> > enable PID tracing when the tracing session runs in root namespace.
> 
> But it doesn't do this with a single global knob for all CPUs. You only
> need the SPE patch together with the CONTEXTIDR config set to y.

Yeah.

> > Alternatively, if you accept to always set PID to CONTEXTIDR in
> > contextidr_thread_switch(), it would be fine for me and we can only
> > need to control PID packets in SPE and CoreSight drivers.
> 
> Well, we have the config option and infrastructure already, it's up to
> Linux distros to turn it on. It doesn't necessarily need to be in
> defconfig.
> 
> Now, if we removed the CONTEXTIDR setting altogether, is there a way for
> perf tools to coordinate the traces with the scheduling events? This
> would be a simpler approach from the kernel perspective (i.e. no work).

Indeed, we can correlate between software scheduling events and
hardware tracing events, which can be sorted with timestamps.  Arm SPE
has supported this mode, please refer to the patch [1] for understanding
the implementation in perf tool.

But using the software scheduling events might introduce inaccuracy
for the statistics, the reason is there have gap between scheduling
events and task context switching, so this is why we still want to
support context packets (by using CONTEXTIDR) for root PID namespace.
this can give us more accurate results.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/tools/perf/util/arm-spe.c?id=9dc9855f18ba25d2bc536ea5ba6682855e385d66

> Alternatively, could we move the CONTEXTIDR setting to a perf driver.
> I'm not too familiar with perf but I guess we could add some scheduling
> event hooks.

It would introduce trouble if we move the CONTEXTIDR setting to a perf
driver, I can think out two reasons:

- The first reason is we might have not only one perf driver to use
  CONTEXTIDR, e.g. Arm SPE and Arm CoreSight both need to use
  CONTEXTIDR for PID tracing.  This means we either need to set
  CONTEXTIDR in both drivers so we will introduce redundant codes.
- Furthermore, for the perf system wide mode, perf only enables
  hardware event when start the session and disables hardware event
  when stop the session.  In this mode, perf driver is not notified
  by any scheduling event during the session.

Seems to me, so far we need to come back to apply below change:

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index e1a0c44bc686..a678562253dc 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -127,6 +127,7 @@ config XGENE_PMU
 config ARM_SPE_PMU
        tristate "Enable support for the ARMv8.2 Statistical Profiling Extension"
        depends on ARM64
+       select PID_IN_CONTEXTIDR
        help
          Enable perf support for the ARMv8.2 Statistical Profiling
          Extension, which provides periodic sampling of operations in

Thanks,
Leo
diff mbox series

Patch

diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c
index d44bcc29d99c..935343cdcb39 100644
--- a/drivers/perf/arm_spe_pmu.c
+++ b/drivers/perf/arm_spe_pmu.c
@@ -23,6 +23,7 @@ 
 #include <linux/irq.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
+#include <linux/mmu_context.h>
 #include <linux/module.h>
 #include <linux/of_address.h>
 #include <linux/of_device.h>
@@ -272,7 +273,7 @@  static u64 arm_spe_event_to_pmscr(struct perf_event *event)
 	if (!attr->exclude_kernel)
 		reg |= BIT(SYS_PMSCR_EL1_E1SPE_SHIFT);
 
-	if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR) && perfmon_capable())
+	if (perfmon_capable() && (task_active_pid_ns(current) == &init_pid_ns))
 		reg |= BIT(SYS_PMSCR_EL1_CX_SHIFT);
 
 	return reg;
@@ -731,6 +732,13 @@  static void arm_spe_pmu_start(struct perf_event *event, int flags)
 	if (hwc->state)
 		return;
 
+	/*
+	 * Enable tracing PID to contextidr if profiling program runs in
+	 * root PID namespace.
+	 */
+	if (perfmon_capable() && (task_active_pid_ns(current) == &init_pid_ns))
+		contextidr_enable();
+
 	reg = arm_spe_event_to_pmsfcr(event);
 	write_sysreg_s(reg, SYS_PMSFCR_EL1);
 
@@ -792,6 +800,9 @@  static void arm_spe_pmu_stop(struct perf_event *event, int flags)
 	}
 
 	hwc->state |= PERF_HES_STOPPED;
+
+	if (perfmon_capable() && (task_active_pid_ns(current) == &init_pid_ns))
+		contextidr_disable();
 }
 
 static int arm_spe_pmu_add(struct perf_event *event, int flags)