diff mbox series

[3/6] tracing: Add tp_pstore cmdline to have tracepoints go to pstore

Message ID f65e9fa6e047b9b14d4f64362f9fe1f89b306050.1536430404.git.saiprakash.ranjan@codeaurora.org (mailing list archive)
State New, archived
Headers show
Series Tracing register accesses with pstore and dynamic debug | expand

Commit Message

Sai Prakash Ranjan Sept. 8, 2018, 8:27 p.m. UTC
Add the kernel command line tp_pstore option that will have
tracepoints go to persistent ram buffer as well as to the
trace buffer for further debugging. This is similar to tp_printk
cmdline option of ftrace.

Pstore support for event tracing is already added and we enable
logging to pstore only if cmdline is specified.

Passing "tp_pstore" will activate logging to pstore. To turn it
off, the sysctl /proc/sys/kernel/tracepoint_pstore can have '0'
echoed into it. Note, this only works if the cmdline option is
used. Echoing 1 into the sysctl file without the cmdline option
will have no effect.

Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
---
 .../admin-guide/kernel-parameters.txt         | 21 ++++++++
 include/linux/ftrace.h                        |  6 ++-
 kernel/sysctl.c                               |  7 +++
 kernel/trace/Kconfig                          | 22 +++++++-
 kernel/trace/trace.c                          | 51 +++++++++++++++++++
 kernel/trace/trace.h                          |  7 +++
 6 files changed, 112 insertions(+), 2 deletions(-)

Comments

Joel Fernandes Sept. 25, 2018, 9:25 p.m. UTC | #1
On Sat, Sep 8, 2018 at 1:28 PM Sai Prakash Ranjan
<saiprakash.ranjan@codeaurora.org> wrote:
>
> Add the kernel command line tp_pstore option that will have
> tracepoints go to persistent ram buffer as well as to the
> trace buffer for further debugging. This is similar to tp_printk
> cmdline option of ftrace.
>
> Pstore support for event tracing is already added and we enable
> logging to pstore only if cmdline is specified.
>
> Passing "tp_pstore" will activate logging to pstore. To turn it
> off, the sysctl /proc/sys/kernel/tracepoint_pstore can have '0'
> echoed into it. Note, this only works if the cmdline option is
> used. Echoing 1 into the sysctl file without the cmdline option
> will have no affect.
>
> Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
> ---
>  .../admin-guide/kernel-parameters.txt         | 21 ++++++++
>  include/linux/ftrace.h                        |  6 ++-
>  kernel/sysctl.c                               |  7 +++
>  kernel/trace/Kconfig                          | 22 +++++++-
>  kernel/trace/trace.c                          | 51 +++++++++++++++++++
>  kernel/trace/trace.h                          |  7 +++
>  6 files changed, 112 insertions(+), 2 deletions(-)
>
[...]
>  config GCOV_PROFILE_FTRACE
>         bool "Enable GCOV profiling on ftrace subsystem"
>         depends on GCOV_KERNEL
> @@ -789,4 +810,3 @@ config GCOV_PROFILE_FTRACE
>  endif # FTRACE
>
>  endif # TRACING_SUPPORT
> -
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index bf6f1d70484d..018cbbefb769 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -73,6 +73,11 @@ struct trace_iterator *tracepoint_print_iter;
>  int tracepoint_printk;
>  static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);
>
> +/* Pipe tracepoints to pstore */
> +struct trace_iterator *tracepoint_pstore_iter;
> +int tracepoint_pstore;
> +static DEFINE_STATIC_KEY_FALSE(tracepoint_pstore_key);
> +
>  /* For tracers that don't implement custom flags */
>  static struct tracer_opt dummy_tracer_opt[] = {
>         { }
> @@ -238,6 +243,14 @@ static int __init set_tracepoint_printk(char *str)
>  }
>  __setup("tp_printk", set_tracepoint_printk);
>
> +static int __init set_tracepoint_pstore(char *str)
> +{
> +       if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
> +               tracepoint_pstore = 1;
> +       return 1;
> +}
> +__setup("tp_pstore", set_tracepoint_pstore);
> +
>  unsigned long long ns2usecs(u64 nsec)
>  {
>         nsec += 500;
> @@ -2376,11 +2389,45 @@ int tracepoint_printk_sysctl(struct ctl_table *table, int write,
>         return ret;
>  }
>
> +static DEFINE_MUTEX(tracepoint_pstore_mutex);
> +
> +int tracepoint_pstore_sysctl(struct ctl_table *table, int write,
> +                            void __user *buffer, size_t *lenp,
> +                            loff_t *ppos)
> +{
> +       int save_tracepoint_pstore;
> +       int ret;
> +
> +       mutex_lock(&tracepoint_pstore_mutex);
> +       save_tracepoint_pstore = tracepoint_pstore;
> +
> +       ret = proc_dointvec(table, write, buffer, lenp, ppos);
> +
> +       if (!tracepoint_pstore_iter)
> +               tracepoint_pstore = 0;
> +
> +       if (save_tracepoint_pstore == tracepoint_pstore)
> +               goto out;
> +
> +       if (tracepoint_pstore)
> +               static_key_enable(&tracepoint_pstore_key.key);
> +       else
> +               static_key_disable(&tracepoint_pstore_key.key);
> +
> + out:
> +       mutex_unlock(&tracepoint_pstore_mutex);
> +
> +       return ret;
> +}
> +
>  void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
>  {
>         if (static_key_false(&tracepoint_printk_key.key))
>                 output_printk(fbuffer);
>
> +       if (static_key_false(&tracepoint_pstore_key.key))
> +               pstore_event_call(fbuffer);

Can you not find a way to pass the size of the event record here, to
pstore? Then you can directly allocate and store the binary record in
pstore itself instead of rendering and storing the text in pstore
which will be more space (and I think time) efficient. I also think if
you do this, then you will not need to use the spinlock in the pstore
(which AIUI is preventing the warning you're seeing in the
event_call->event.funcs->trace() call).

- Joel
Sai Prakash Ranjan Sept. 26, 2018, 9:46 a.m. UTC | #2
On 9/26/2018 2:55 AM, Joel Fernandes wrote:
> On Sat, Sep 8, 2018 at 1:28 PM Sai Prakash Ranjan
> <saiprakash.ranjan@codeaurora.org> wrote:
>>
>> Add the kernel command line tp_pstore option that will have
>> tracepoints go to persistent ram buffer as well as to the
>> trace buffer for further debugging. This is similar to tp_printk
>> cmdline option of ftrace.
>>
>> Pstore support for event tracing is already added and we enable
>> logging to pstore only if cmdline is specified.
>>
>> Passing "tp_pstore" will activate logging to pstore. To turn it
>> off, the sysctl /proc/sys/kernel/tracepoint_pstore can have '0'
>> echoed into it. Note, this only works if the cmdline option is
>> used. Echoing 1 into the sysctl file without the cmdline option
>> will have no affect.
>>
>> Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
>> ---
>>   .../admin-guide/kernel-parameters.txt         | 21 ++++++++
>>   include/linux/ftrace.h                        |  6 ++-
>>   kernel/sysctl.c                               |  7 +++
>>   kernel/trace/Kconfig                          | 22 +++++++-
>>   kernel/trace/trace.c                          | 51 +++++++++++++++++++
>>   kernel/trace/trace.h                          |  7 +++
>>   6 files changed, 112 insertions(+), 2 deletions(-)
>>
> [...]
>>   config GCOV_PROFILE_FTRACE
>>          bool "Enable GCOV profiling on ftrace subsystem"
>>          depends on GCOV_KERNEL
>> @@ -789,4 +810,3 @@ config GCOV_PROFILE_FTRACE
>>   endif # FTRACE
>>
>>   endif # TRACING_SUPPORT
>> -
>> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
>> index bf6f1d70484d..018cbbefb769 100644
>> --- a/kernel/trace/trace.c
>> +++ b/kernel/trace/trace.c
>> @@ -73,6 +73,11 @@ struct trace_iterator *tracepoint_print_iter;
>>   int tracepoint_printk;
>>   static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);
>>
>> +/* Pipe tracepoints to pstore */
>> +struct trace_iterator *tracepoint_pstore_iter;
>> +int tracepoint_pstore;
>> +static DEFINE_STATIC_KEY_FALSE(tracepoint_pstore_key);
>> +
>>   /* For tracers that don't implement custom flags */
>>   static struct tracer_opt dummy_tracer_opt[] = {
>>          { }
>> @@ -238,6 +243,14 @@ static int __init set_tracepoint_printk(char *str)
>>   }
>>   __setup("tp_printk", set_tracepoint_printk);
>>
>> +static int __init set_tracepoint_pstore(char *str)
>> +{
>> +       if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
>> +               tracepoint_pstore = 1;
>> +       return 1;
>> +}
>> +__setup("tp_pstore", set_tracepoint_pstore);
>> +
>>   unsigned long long ns2usecs(u64 nsec)
>>   {
>>          nsec += 500;
>> @@ -2376,11 +2389,45 @@ int tracepoint_printk_sysctl(struct ctl_table *table, int write,
>>          return ret;
>>   }
>>
>> +static DEFINE_MUTEX(tracepoint_pstore_mutex);
>> +
>> +int tracepoint_pstore_sysctl(struct ctl_table *table, int write,
>> +                            void __user *buffer, size_t *lenp,
>> +                            loff_t *ppos)
>> +{
>> +       int save_tracepoint_pstore;
>> +       int ret;
>> +
>> +       mutex_lock(&tracepoint_pstore_mutex);
>> +       save_tracepoint_pstore = tracepoint_pstore;
>> +
>> +       ret = proc_dointvec(table, write, buffer, lenp, ppos);
>> +
>> +       if (!tracepoint_pstore_iter)
>> +               tracepoint_pstore = 0;
>> +
>> +       if (save_tracepoint_pstore == tracepoint_pstore)
>> +               goto out;
>> +
>> +       if (tracepoint_pstore)
>> +               static_key_enable(&tracepoint_pstore_key.key);
>> +       else
>> +               static_key_disable(&tracepoint_pstore_key.key);
>> +
>> + out:
>> +       mutex_unlock(&tracepoint_pstore_mutex);
>> +
>> +       return ret;
>> +}
>> +
>>   void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
>>   {
>>          if (static_key_false(&tracepoint_printk_key.key))
>>                  output_printk(fbuffer);
>>
>> +       if (static_key_false(&tracepoint_pstore_key.key))
>> +               pstore_event_call(fbuffer);
> 
> Can you not find a way to pass the size of the even record here, to
> pstore? Then you can directly allocate and store the binary record in
> pstore itself instead of rendering and storing the text in pstore
> which will be more space (and I think time) efficient. I also think if
> you do this, then you will not need to use the spinlock in the pstore
> (which AIUI is preventing the warning you're seeing in the
> event_call->event.funcs->trace() call).
> 

Right, I can check this out.

Thanks,
Sai
Sai Prakash Ranjan Oct. 8, 2018, 2:16 p.m. UTC | #3
On 9/26/2018 3:16 PM, Sai Prakash Ranjan wrote:
> On 9/26/2018 2:55 AM, Joel Fernandes wrote:
>> On Sat, Sep 8, 2018 at 1:28 PM Sai Prakash Ranjan
>> <saiprakash.ranjan@codeaurora.org> wrote:
>>>
>>> Add the kernel command line tp_pstore option that will have
>>> tracepoints go to persistent ram buffer as well as to the
>>> trace buffer for further debugging. This is similar to tp_printk
>>> cmdline option of ftrace.
>>>
>>> Pstore support for event tracing is already added and we enable
>>> logging to pstore only if cmdline is specified.
>>>
>>> Passing "tp_pstore" will activate logging to pstore. To turn it
>>> off, the sysctl /proc/sys/kernel/tracepoint_pstore can have '0'
>>> echoed into it. Note, this only works if the cmdline option is
>>> used. Echoing 1 into the sysctl file without the cmdline option
>>> will have no affect.
>>>
>>> Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
>>> ---
>>>   .../admin-guide/kernel-parameters.txt         | 21 ++++++++
>>>   include/linux/ftrace.h                        |  6 ++-
>>>   kernel/sysctl.c                               |  7 +++
>>>   kernel/trace/Kconfig                          | 22 +++++++-
>>>   kernel/trace/trace.c                          | 51 +++++++++++++++++++
>>>   kernel/trace/trace.h                          |  7 +++
>>>   6 files changed, 112 insertions(+), 2 deletions(-)
>>>
>> [...]
>>>   config GCOV_PROFILE_FTRACE
>>>          bool "Enable GCOV profiling on ftrace subsystem"
>>>          depends on GCOV_KERNEL
>>> @@ -789,4 +810,3 @@ config GCOV_PROFILE_FTRACE
>>>   endif # FTRACE
>>>
>>>   endif # TRACING_SUPPORT
>>> -
>>> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
>>> index bf6f1d70484d..018cbbefb769 100644
>>> --- a/kernel/trace/trace.c
>>> +++ b/kernel/trace/trace.c
>>> @@ -73,6 +73,11 @@ struct trace_iterator *tracepoint_print_iter;
>>>   int tracepoint_printk;
>>>   static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);
>>>
>>> +/* Pipe tracepoints to pstore */
>>> +struct trace_iterator *tracepoint_pstore_iter;
>>> +int tracepoint_pstore;
>>> +static DEFINE_STATIC_KEY_FALSE(tracepoint_pstore_key);
>>> +
>>>   /* For tracers that don't implement custom flags */
>>>   static struct tracer_opt dummy_tracer_opt[] = {
>>>          { }
>>> @@ -238,6 +243,14 @@ static int __init set_tracepoint_printk(char *str)
>>>   }
>>>   __setup("tp_printk", set_tracepoint_printk);
>>>
>>> +static int __init set_tracepoint_pstore(char *str)
>>> +{
>>> +       if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
>>> +               tracepoint_pstore = 1;
>>> +       return 1;
>>> +}
>>> +__setup("tp_pstore", set_tracepoint_pstore);
>>> +
>>>   unsigned long long ns2usecs(u64 nsec)
>>>   {
>>>          nsec += 500;
>>> @@ -2376,11 +2389,45 @@ int tracepoint_printk_sysctl(struct ctl_table 
>>> *table, int write,
>>>          return ret;
>>>   }
>>>
>>> +static DEFINE_MUTEX(tracepoint_pstore_mutex);
>>> +
>>> +int tracepoint_pstore_sysctl(struct ctl_table *table, int write,
>>> +                            void __user *buffer, size_t *lenp,
>>> +                            loff_t *ppos)
>>> +{
>>> +       int save_tracepoint_pstore;
>>> +       int ret;
>>> +
>>> +       mutex_lock(&tracepoint_pstore_mutex);
>>> +       save_tracepoint_pstore = tracepoint_pstore;
>>> +
>>> +       ret = proc_dointvec(table, write, buffer, lenp, ppos);
>>> +
>>> +       if (!tracepoint_pstore_iter)
>>> +               tracepoint_pstore = 0;
>>> +
>>> +       if (save_tracepoint_pstore == tracepoint_pstore)
>>> +               goto out;
>>> +
>>> +       if (tracepoint_pstore)
>>> +               static_key_enable(&tracepoint_pstore_key.key);
>>> +       else
>>> +               static_key_disable(&tracepoint_pstore_key.key);
>>> +
>>> + out:
>>> +       mutex_unlock(&tracepoint_pstore_mutex);
>>> +
>>> +       return ret;
>>> +}
>>> +
>>>   void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
>>>   {
>>>          if (static_key_false(&tracepoint_printk_key.key))
>>>                  output_printk(fbuffer);
>>>
>>> +       if (static_key_false(&tracepoint_pstore_key.key))
>>> +               pstore_event_call(fbuffer);
>>
>> Can you not find a way to pass the size of the even record here, to
>> pstore? Then you can directly allocate and store the binary record in
>> pstore itself instead of rendering and storing the text in pstore
>> which will be more space (and I think time) efficient. I also think if
>> you do this, then you will not need to use the spinlock in the pstore
>> (which AIUI is preventing the warning you're seeing in the
>> event_call->event.funcs->trace() call).
>>
> 
> Right, I can check this out.
> 

Hi Joel,

Sorry for the long delay in updating this thread.
But I just observed one weird behaviour in ftrace-ramoops when I was 
trying to use binary record instead of rendering text for event-ramoops.

Even though we set the ftrace-size in device tree for ramoops, the 
actual ftrace-ramoops record seems to have more records than specified size.
Is this expected or some bug?

Below is the ftrace-ramoops size passed in dtsi for db410c and we can 
see that the ftrace record is more.

# cat /sys/module/ramoops/parameters/ftrace_size
131072
#
# cat /sys/fs/pstore/ftrace-ramoops-0 | wc -c
560888
#
# cat /sys/fs/pstore/ftrace-ramoops-0 | grep CPU:0 | wc -c
137758
#
# cat /sys/fs/pstore/ftrace-ramoops-0 | grep CPU:1 | wc -c
140560
#
# cat /sys/fs/pstore/ftrace-ramoops-0 | grep CPU:2 | wc -c
141174
#
# cat /sys/fs/pstore/ftrace-ramoops-0 | grep CPU:3 | wc -c
141396
#

I don't see this in console or dmesg ramoops and also with the 
event-ramoops which I have posted since we don't use binary record, only 
ftrace-ramoops uses binary record to store trace data.

Also regarding the warning on "event_call->event.funcs->trace()" call,
I see it every time without spinlock. Also we see output_printk using 
spinlock when making this call. I could not find a way to pass event 
buffer size and allocate in pstore. Steven can give some hints with this 
I guess.

Steven Rostedt gave some reviews about using raw_spinlocks for this call 
earlier in this thread. So is it right to not use spinlocks for trace 
events?

One difference I see in ftrace-ramoops and event-ramoops is that 
ftrace-ramoops  is not started on boot and event-ramoops logging can be 
enabled from boot (late initcall however).

Do let me know if you have any way to avoid this warning which is a race 
I think without spinlock.

Thanks,
Sai
Steven Rostedt Oct. 8, 2018, 2:36 p.m. UTC | #4
On Mon, 8 Oct 2018 19:46:15 +0530
Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org> wrote:

> Hi Joel,
> 
> Sorry for the long delay in updating this thread.
> But I just observed one weird behaviour in ftrace-ramoops when I was 
> trying to use binary record instead of rendering text for event-ramoops.
> 
> Even though we set the ftrace-size in device tree for ramoops, the 
> actual ftrace-ramoops record seems to have more records than specified size.
> Is this expected or some bug?

If ftrace-ramoops is using logic similar to the ftrace ring buffer,
then yes, it will be able to store more events than specified. The
ftrace ring buffer is broken up into pages, and everything is rounded
up to the nearest page (note, the data may be smaller than a page,
because each page also contains a header).

> 
> Below is the ftrace-ramoops size passed in dtsi for db410c and we can 
> see that the ftrace record is more.
> 
> # cat /sys/module/ramoops/parameters/ftrace_size
> 131072

I'm assuming this too is like the ftrace ring buffer, where the size is
per cpu (and why you do a search per cpu below).

> #
> # cat /sys/fs/pstore/ftrace-ramoops-0 | wc -c
> 560888
> #
> # cat /sys/fs/pstore/ftrace-ramoops-0 | grep CPU:0 | wc -c
> 137758
> #
> # cat /sys/fs/pstore/ftrace-ramoops-0 | grep CPU:1 | wc -c
> 140560
> #
> # cat /sys/fs/pstore/ftrace-ramoops-0 | grep CPU:2 | wc -c
> 141174
> #
> # cat /sys/fs/pstore/ftrace-ramoops-0 | grep CPU:3 | wc -c
> 141396
> #
> 

If you are using binary record, isn't this what you want? The
ftrace_size is the size of how much binary data is stored. When you
translate the binary into human text, the text should be bigger.

> I don't see this in console or dmesg ramoops and also with the 
> event-ramoops which I have posted since we don't use binary record, only 
> ftrace-ramoops uses binary record to store trace data.
> 
> Also regarding the warning on "event_call->event.funcs->trace()" call,
> I see it everytime without spinlock. Also we see output_printk using 
> spinlock when making this call. I could not find a way to pass event 
> buffer size and allocate in pstore. Steven can give some hints with this 
> I guess.
> 
> Steven Rostedt gave some reviews about using raw_spinlocks for this call 
> earlier in this thread. So is it right to not use spinlocks for trace 
> events?

Correct.

> 
> One difference I see in ftrace-ramoops and event-ramoops is that 
> ftrace-ramoops  is not started on boot and event-ramoops logging can be 
> enabled from boot (late initcall however).
> 
> Do let me know if you have any way to avoid this warning which is a race 
> I think without spinlock.

I guess I need to look at that code again too.

-- Steve
Joel Fernandes Oct. 8, 2018, 10:40 p.m. UTC | #5
On Mon, Oct 08, 2018 at 10:36:59AM -0400, Steven Rostedt wrote:
> On Mon, 8 Oct 2018 19:46:15 +0530
> Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org> wrote:
> 
> > Hi Joel,
> > 
> > Sorry for the long delay in updating this thread.
> > But I just observed one weird behaviour in ftrace-ramoops when I was 
> > trying to use binary record instead of rendering text for event-ramoops.
> > 
> > Even though we set the ftrace-size in device tree for ramoops, the 
> > actual ftrace-ramoops record seems to have more records than specified size.
> > Is this expected or some bug?
> 
> If ftrace-ramoops is using logic similar to the ftrace ring buffer,
> then yes, it will be able to store more events than specified. The
> ftrace ring buffer is broken up into pages, and everything is rounded
> up to the nearest page (note, the data may be smaller than a page,
> because each page also contains a header).

In the pstore code, the pages are contiguous. The platform drivers for that
platform configure 'ftrace-size' which is in bytes. That is further divided
by the number of CPUs. The records from all the buffers are then merged at read time.

Each function trace record is of type:
struct pstore_ftrace_record {
        unsigned long ip;
        unsigned long parent_ip;
        u64 ts;
};

which should be 24 bytes.

But there is an ECC block (with ECC data and ECC header) added to each CPU in
this case of the backing store of the pstore being RAM (pstore backing stores
can be other media types too).

> > Below is the ftrace-ramoops size passed in dtsi for db410c and we can 
> > see that the ftrace record is more.
> > 
> > # cat /sys/module/ramoops/parameters/ftrace_size
> > 131072
> 
> I'm assuming this too is like the ftrace ring buffer, where the size is
> per cpu (and why you do a search per cpu below).
> 
> > #
> > # cat /sys/fs/pstore/ftrace-ramoops-0 | wc -c
> > 560888
> > #
> > # cat /sys/fs/pstore/ftrace-ramoops-0 | grep CPU:0 | wc -c
> > 137758
> > #
> > # cat /sys/fs/pstore/ftrace-ramoops-0 | grep CPU:1 | wc -c
> > 140560
> > #
> > # cat /sys/fs/pstore/ftrace-ramoops-0 | grep CPU:2 | wc -c
> > 141174
> > #
> > # cat /sys/fs/pstore/ftrace-ramoops-0 | grep CPU:3 | wc -c
> > 141396
> > #

That could be because you're counting rendered text characters.
You need to count the binary size.

> If you are using binary record, isn't this what you want? The
> ftrace_size is the size of how much binary data is stored. When you
> translate the binary into human text, the text should be bigger.
> 
> > I don't see this in console or dmesg ramoops and also with the 
> > event-ramoops which I have posted since we don't use binary record, only 
> > ftrace-ramoops uses binary record to store trace data.
> > 
> > Also regarding the warning on "event_call->event.funcs->trace()" call,
> > I see it everytime without spinlock. Also we see output_printk using 
> > spinlock when making this call. I could not find a way to pass event 
> > buffer size and allocate in pstore. Steven can give some hints with this 
> > I guess.

The spinlock warning you're talking about is this one correct?

[    1.389382] WARNING: CPU: 2 PID: 2 at kernel/trace/trace_output.c:289 trace_raw_output_prep+0x18/0xa0
[    1.389416] Modules linked in:
which you reported here:
https://lkml.org/lkml/2018/9/22/319

This warning happens I think because you're trying to format the events while
the trace events are being generated. You said you got this warning when you
didn't use the spinlock. I believe the spinlocking prevents such races, but
if you didn't need to format the events into text in the first
place, then you wouldn't need such locking at all. 

I believe ftrace doesn't have such issues because such locking is taken care
off when the trace events are formatted from the trace buffer and displayed,
but I could be wrong about that.. I'll let Steven provide more inputs about
how this warning can occur.

As a suggestion, please always provide references to the warnings you're
referring to, such as previous LKML posts or at least the warning message, so
folks know which warning you're talking about.

thanks,

 - Joel
Sai Prakash Ranjan Oct. 9, 2018, 6:22 p.m. UTC | #6
On 10/9/2018 4:10 AM, Joel Fernandes wrote:
> On Mon, Oct 08, 2018 at 10:36:59AM -0400, Steven Rostedt wrote:
>> On Mon, 8 Oct 2018 19:46:15 +0530
>> Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org> wrote:
>>
>>> Hi Joel,
>>>
>>> Sorry for the long delay in updating this thread.
>>> But I just observed one weird behaviour in ftrace-ramoops when I was
>>> trying to use binary record instead of rendering text for event-ramoops.
>>>
>>> Even though we set the ftrace-size in device tree for ramoops, the
>>> actual ftrace-ramoops record seems to have more records than specified size.
>>> Is this expected or some bug?
>>
>> If ftrace-ramoops is using logic similar to the ftrace ring buffer,
>> then yes, it will be able to store more events than specified. The
>> ftrace ring buffer is broken up into pages, and everything is rounded
>> up to the nearest page (note, the data may be smaller than a page,
>> because each page also contains a header).
> 
> In the pstore code, the pages are contiguous. The platform drivers for that
> platform configure 'ftrace-size' which is in bytes. That is further divided
> by the number of CPUs. The records from all the buffers are then merged at read time.
> 
> Each function trace record is of type:
> struct pstore_ftrace_record {
>          unsigned long ip;
>          unsigned long parent_ip;
>          u64 ts;
> };
> 
> which should be 24 bytes.
> 
> But there is an ECC block (with ECC data and ECC header) added to each CPU in
> this case of the backing store of the pstore being RAM (pstore backing stores
> can be other media types too).
> 

Thanks for this info.

>>> Below is the ftrace-ramoops size passed in dtsi for db410c and we can
>>> see that the ftrace record is more.
>>>
>>> # cat /sys/module/ramoops/parameters/ftrace_size
>>> 131072
>>
>> I'm assuming this too is like the ftrace ring buffer, where the size is
>> per cpu (and why you do a search per cpu below).
>>
>>> #
>>> # cat /sys/fs/pstore/ftrace-ramoops-0 | wc -c
>>> 560888
>>> #
>>> # cat /sys/fs/pstore/ftrace-ramoops-0 | grep CPU:0 | wc -c
>>> 137758
>>> #
>>> # cat /sys/fs/pstore/ftrace-ramoops-0 | grep CPU:1 | wc -c
>>> 140560
>>> #
>>> # cat /sys/fs/pstore/ftrace-ramoops-0 | grep CPU:2 | wc -c
>>> 141174
>>> #
>>> # cat /sys/fs/pstore/ftrace-ramoops-0 | grep CPU:3 | wc -c
>>> 141396
>>> #
> 
> That could be because you're counting text characters when you're counting.
> You need to count the binary size.
> 

Right thanks.

>> If you are using binary record, isn't this what you want? The
>> ftrace_size is the size of how much binary data is stored. When you
>> translate the binary into human text, the text should be bigger.
>>
>>> I don't see this in console or dmesg ramoops and also with the
>>> event-ramoops which I have posted since we don't use binary record, only
>>> ftrace-ramoops uses binary record to store trace data.
>>>
>>> Also regarding the warning on "event_call->event.funcs->trace()" call,
>>> I see it everytime without spinlock. Also we see output_printk using
>>> spinlock when making this call. I could not find a way to pass event
>>> buffer size and allocate in pstore. Steven can give some hints with this
>>> I guess.
> 
> The spinlock warning you're talking about is this one correct?
> 
> [    1.389382] WARNING: CPU: 2 PID: 2 at kernel/trace/trace_output.c:289 trace_raw_output_prep+0x18/0xa0
> [    1.389416] Modules linked in:
> which you reported here:
> https://lkml.org/lkml/2018/9/22/319
> 
> This warning happens I think because you're trying to format the events while
> the trace events are being generated. You said you got this warning when you
> didn't use the spinlock. I believe the spinlocking prevents such races, but
> if you didn't need to format the events into text into text in the first
> place, then you wouldn't need such locking at all.
> 
> I believe ftrace doesn't have such issues because such locking is taken care
> off when the trace events are formatted from the trace buffer and displayed,
> but I could be wrong about that.. I'll let Steven provide more inputs about
> how this warning can occur.
Yes Steven would have more insight on this warning.

> 
> As a suggestion, please always provide references to the warnings you're
> referring to, such as previous LKML posts or atleast the warning message so
> folks know which warning you're talking about.
> 

Yes sure.

Thanks,
Sai
Steven Rostedt Oct. 10, 2018, 7:37 p.m. UTC | #7
On Tue, 9 Oct 2018 23:52:14 +0530
Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org> wrote:

> >>> Also regarding the warning on "event_call->event.funcs->trace()" call,
> >>> I see it everytime without spinlock. Also we see output_printk using
> >>> spinlock when making this call. I could not find a way to pass event
> >>> buffer size and allocate in pstore. Steven can give some hints with this
> >>> I guess.  
> > 
> > The spinlock warning you're talking about is this one correct?
> > 
> > [    1.389382] WARNING: CPU: 2 PID: 2 at kernel/trace/trace_output.c:289 trace_raw_output_prep+0x18/0xa0
> > [    1.389416] Modules linked in:
> > which you reported here:
> > https://lkml.org/lkml/2018/9/22/319
> > 
> > This warning happens I think because you're trying to format the events while
> > the trace events are being generated. You said you got this warning when you
> > didn't use the spinlock. I believe the spinlocking prevents such races, but
> > if you didn't need to format the events into text into text in the first
> > place, then you wouldn't need such locking at all.
> > 
> > I believe ftrace doesn't have such issues because such locking is taken care
> > off when the trace events are formatted from the trace buffer and displayed,
> > but I could be wrong about that.. I'll let Steven provide more inputs about
> > how this warning can occur.  
> Yes Steven would have more insight on this warning.

Correct. Formatting should not be done at the location of recording
(it's way too much overhead). Even trace_printk() doesn't format string
at the execution path, but does so at the presentation path (reading
the buffer, via reads of the trace file, or in trace-cmd post
processing of the trace.dat file).

trace_printk() uses vbin_printf() which doesn't do printf()
conversions, but instead only saves the format string and the raw
values into the buffer. Then in uses bstr_printf() that takes the
format string and raw values to produce the "printf()" output you know
and love.

-- Steve
diff mbox series

Patch

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9871e649ffef..622cf64d4e5b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4519,6 +4519,27 @@ 
 			frequency tracepoints such as irq or sched, can cause
 			the system to live lock.
 
+	tp_pstore[FTRACE]
+			Have the tracepoints sent to persistent ram buffer for
+			debugging. This is useful for debugging early boot up
+			and other kernel issues where the system hangs or
+			reboots due to some unclocked access or some buggy
+			driver. Instead of spamming the console with unwanted
+			logs, we can send the logs to pstore buffer for further
+			debugging.
+
+			Last occurred events in the pstore log will be helpful
+			in identifying the reset cause.
+
+			For example, to trace sched event, add to the command
+			line:
+				trace_event=sched tp_pstore
+
+			To turn off having tracepoints sent to pstore,
+				echo 0 > /proc/sys/kernel/tracepoint_pstore
+			Note, echoing 1 into this file without the
+			tracepoint_pstore kernel cmdline option has no effect.
+
 	traceoff_on_warning
 			[FTRACE] enable this option to disable tracing when a
 			warning is hit. This turns off "tracing_on". Tracing can
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index a397907e8d72..7c3074e7ec6b 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -900,6 +900,7 @@  enum ftrace_dump_mode;
 
 extern enum ftrace_dump_mode ftrace_dump_on_oops;
 extern int tracepoint_printk;
+extern int tracepoint_pstore;
 
 extern void disable_trace_on_warning(void);
 extern int __disable_trace_on_warning;
@@ -907,9 +908,12 @@  extern int __disable_trace_on_warning;
 int tracepoint_printk_sysctl(struct ctl_table *table, int write,
 			     void __user *buffer, size_t *lenp,
 			     loff_t *ppos);
+int tracepoint_pstore_sysctl(struct ctl_table *table, int write,
+			     void __user *buffer, size_t *lenp,
+			     loff_t *ppos);
 
 #else /* CONFIG_TRACING */
-static inline void  disable_trace_on_warning(void) { }
+static inline void disable_trace_on_warning(void) { }
 #endif /* CONFIG_TRACING */
 
 #ifdef CONFIG_FTRACE_SYSCALLS
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index cc02050fd0c4..3cc1223b8955 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -653,6 +653,13 @@  static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= tracepoint_printk_sysctl,
 	},
+	{
+		.procname	= "tracepoint_pstore",
+		.data		= &tracepoint_pstore,
+		.maxlen		= sizeof(tracepoint_pstore),
+		.mode		= 0644,
+		.proc_handler	= tracepoint_pstore_sysctl,
+	},
 #endif
 #ifdef CONFIG_KEXEC_CORE
 	{
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5e3de28c7677..d0eed268ee85 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -774,6 +774,27 @@  config TRACING_EVENTS_GPIO
 	help
 	  Enable tracing events for gpio subsystem
 
+config TRACING_EVENTS_IO
+	bool "Trace IO read/write events"
+	help
+	  Enable tracing events for IO read/write operations.
+	  This is useful for debugging random hangs or resets
+	  caused due to unclocked access or some buggy driver.
+
+	  Output of this trace event can be overwhelming and hence
+	  dynamic filtering option based on dynamic debug library
+	  is provided to narrow down the issues.
+
+	  Eg: Trace all register read/write of mmc subsystem with
+	  below command line:
+
+	    dyndbg="file drivers/mmc/* +p" trace_event=io
+
+	  See Documentation/admin-guide/dynamic-debug-howto.rst for
+	  more info on using dynamic debug.
+
+	  If unsure, say N.
+
 config GCOV_PROFILE_FTRACE
 	bool "Enable GCOV profiling on ftrace subsystem"
 	depends on GCOV_KERNEL
@@ -789,4 +810,3 @@  config GCOV_PROFILE_FTRACE
 endif # FTRACE
 
 endif # TRACING_SUPPORT
-
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bf6f1d70484d..018cbbefb769 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -73,6 +73,11 @@  struct trace_iterator *tracepoint_print_iter;
 int tracepoint_printk;
 static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);
 
+/* Pipe tracepoints to pstore */
+struct trace_iterator *tracepoint_pstore_iter;
+int tracepoint_pstore;
+static DEFINE_STATIC_KEY_FALSE(tracepoint_pstore_key);
+
 /* For tracers that don't implement custom flags */
 static struct tracer_opt dummy_tracer_opt[] = {
 	{ }
@@ -238,6 +243,14 @@  static int __init set_tracepoint_printk(char *str)
 }
 __setup("tp_printk", set_tracepoint_printk);
 
+static int __init set_tracepoint_pstore(char *str)
+{
+	if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
+		tracepoint_pstore = 1;
+	return 1;
+}
+__setup("tp_pstore", set_tracepoint_pstore);
+
 unsigned long long ns2usecs(u64 nsec)
 {
 	nsec += 500;
@@ -2376,11 +2389,45 @@  int tracepoint_printk_sysctl(struct ctl_table *table, int write,
 	return ret;
 }
 
+static DEFINE_MUTEX(tracepoint_pstore_mutex);
+
+int tracepoint_pstore_sysctl(struct ctl_table *table, int write,
+			     void __user *buffer, size_t *lenp,
+			     loff_t *ppos)
+{
+	int save_tracepoint_pstore;
+	int ret;
+
+	mutex_lock(&tracepoint_pstore_mutex);
+	save_tracepoint_pstore = tracepoint_pstore;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+	if (!tracepoint_pstore_iter)
+		tracepoint_pstore = 0;
+
+	if (save_tracepoint_pstore == tracepoint_pstore)
+		goto out;
+
+	if (tracepoint_pstore)
+		static_key_enable(&tracepoint_pstore_key.key);
+	else
+		static_key_disable(&tracepoint_pstore_key.key);
+
+ out:
+	mutex_unlock(&tracepoint_pstore_mutex);
+
+	return ret;
+}
+
 void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
 {
 	if (static_key_false(&tracepoint_printk_key.key))
 		output_printk(fbuffer);
 
+	if (static_key_false(&tracepoint_pstore_key.key))
+		pstore_event_call(fbuffer);
+
 	event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer,
 				    fbuffer->event, fbuffer->entry,
 				    fbuffer->flags, fbuffer->pc);
@@ -8596,6 +8643,10 @@  void __init early_trace_init(void)
 		else
 			static_key_enable(&tracepoint_printk_key.key);
 	}
+
+	if (tracepoint_pstore)
+		static_key_enable(&tracepoint_pstore_key.key);
+
 	tracer_alloc_buffers();
 }
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3b8c0e24ab30..edc28d0c27eb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1843,5 +1843,12 @@  static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { }
 #endif
 
 extern struct trace_iterator *tracepoint_print_iter;
+extern struct trace_iterator *tracepoint_pstore_iter;
+
+#ifdef CONFIG_PSTORE_FTRACE
+extern void pstore_event_call(struct trace_event_buffer *fbuffer);
+#else
+static inline void pstore_event_call(struct trace_event_buffer *fbuffer) { }
+#endif
 
 #endif /* _LINUX_KERNEL_TRACE_H */