diff mbox series

[2/6] pstore: Add event tracing support

Message ID b9b60ff2864a17fb05481aa33f90114ab051d6ce.1536430404.git.saiprakash.ranjan@codeaurora.org (mailing list archive)
State New, archived
Headers show
Series Tracing register accesses with pstore and dynamic debug | expand

Commit Message

Sai Prakash Ranjan Sept. 8, 2018, 8:27 p.m. UTC
Currently pstore has function trace support which can be
used to get the function call chain with limited data.
Event tracing has extra data which is useful to debug wide
variety of issues and is heavily used across the kernel.

Adding this support to pstore can be very helpful to debug
different subsystems since almost all of them have trace
events already available. It is also useful for debugging
unknown resets or crashes, since we can get a lot more info
from event tracing by viewing the most recently occurred events.

High frequency tracepoints such as sched and irq have also
been tested. This implementation is similar to the "tp_printk"
command line feature of ftrace by Steven.

For example, sample pstore output of tracing sched events
after reboot:

  # mount -t pstore pstore /sys/fs/pstore/
  # tail /sys/fs/pstore/event-ramoops-0
  sched_switch: prev_comm=swapper/1 prev_pid=0 prev_prio=120 prev_state=S ==> next_comm=rcu_preempt next_pid=10 next_prio=120
  sched_switch: prev_comm=rcu_preempt prev_pid=10 prev_prio=120 prev_state=R+ ==> next_comm=swapper/1 next_pid=0 next_prio=120
  sched_waking: comm=rcu_sched pid=11 prio=120 target_cpu=002
  sched_wakeup: comm=rcu_sched pid=11 prio=120 target_cpu=002
  sched_switch: prev_comm=swapper/2 prev_pid=0 prev_prio=120 prev_state=S ==> next_comm=rcu_sched next_pid=11 next_prio=120
  sched_switch: prev_comm=rcu_sched prev_pid=11 prev_prio=120 prev_state=R+ ==> next_comm=swapper/2 next_pid=0 next_prio=120
  sched_waking: comm=reboot pid=1867 prio=120 target_cpu=000
  sched_wakeup: comm=reboot pid=1867 prio=120 target_cpu=000
  sched_switch: prev_comm=swapper/0 prev_pid=0 prev_prio=120 prev_state=S ==> next_comm=reboot next_pid=1867 next_prio=120

Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
---
 fs/pstore/Kconfig          |  2 +-
 fs/pstore/ftrace.c         | 55 ++++++++++++++++++++++++++++++++++++++
 fs/pstore/inode.c          |  4 +++
 fs/pstore/ram.c            | 44 +++++++++++++++++++++++++++---
 include/linux/pstore.h     |  2 ++
 include/linux/pstore_ram.h |  1 +
 6 files changed, 104 insertions(+), 4 deletions(-)

Comments

Sai Prakash Ranjan Sept. 11, 2018, 10:46 a.m. UTC | #1
On 9/9/2018 1:57 AM, Sai Prakash Ranjan wrote:
> +void notrace pstore_event_call(struct trace_event_buffer *fbuffer)
> +{
> +	struct trace_iterator *iter;
> +	struct trace_seq *s;
> +	struct trace_event_call *event_call;
> +	struct pstore_record record;
> +	struct trace_event *event;
> +	struct seq_buf *seq;
> +	unsigned long flags;
> +
> +	if (!psinfo)
> +		return;
> +
> +	if (unlikely(oops_in_progress))
> +		return;
> +
> +	pstore_record_init(&record, psinfo);
> +	record.type = PSTORE_TYPE_EVENT;
> +
> +	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
> +	if (!iter)
> +		return;
> +
> +	event_call = fbuffer->trace_file->event_call;
> +	if (!event_call || !event_call->event.funcs ||
> +	    !event_call->event.funcs->trace)
> +		goto fail_event;
> +
> +	event = &fbuffer->trace_file->event_call->event;
> +
> +	spin_lock_irqsave(&psinfo->buf_lock, flags);
> +
> +	trace_seq_init(&iter->seq);
> +	iter->ent = fbuffer->entry;
> +	event_call->event.funcs->trace(iter, 0, event);
> +	trace_seq_putc(&iter->seq, 0);
> +
> +	if (seq->size > psinfo->bufsize)
> +		seq->size = psinfo->bufsize;
> +
> +	s = &iter->seq;
> +	seq = &s->seq;
> +
> +	record.buf = (char *)(seq->buffer);
> +	record.size = seq->len;
> +	psinfo->write(&record);
> +
> +	spin_unlock_irqrestore(&psinfo->buf_lock, flags);
> +
> +fail_event:
> +	kfree(iter);
> +}
> +
> 

When tracing sched events on sdm845 mtp, I hit the below bug repeatedly.
It seems pstore_event_call can be called in atomic context.
I will respin the below fix in the next version of the patch.
Reviews on other parts would be appreciated, thanks.

diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index d47dc93ac098..a497cf782ee8 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -73,6 +73,7 @@ void notrace pstore_event_call(struct 
trace_event_buffer *fbuffer)
         struct trace_event *event;
         struct seq_buf *seq;
         unsigned long flags;
+       gfp_t gfpflags;

         if (!psinfo)
                 return;
@@ -83,7 +84,9 @@ void notrace pstore_event_call(struct 
trace_event_buffer *fbuffer)
         pstore_record_init(&record, psinfo);
         record.type = PSTORE_TYPE_EVENT;

-       iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+       gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : 
GFP_KERNEL;
+
+       iter = kmalloc(sizeof(*iter), gfpflags);
         if (!iter)
                 return;

Backtrace:

[    0.433204] clocksource: Switched to clocksource arch_sys_counter
[    0.440035] pstore: using zlib compression
[    0.444265] BUG: sleeping function called from invalid context at 
/mnt/host/source/src/third_party/kernel/v4.14/mm/page_alloc.c:4134
[    0.444268] in_atomic(): 1, irqs_disabled(): 128, pid: 1, name: swapper/0
[    0.444277] CPU: 2 PID: 1 Comm: swapper/0 Not tainted 4.14.65 #19
[    0.444286] Call trace:
[    0.444303] [<ffffff800808a91c>] dump_backtrace+0x0/0x254
[    0.444309] [<ffffff800808ab90>] show_stack+0x20/0x28
[    0.444320] [<ffffff8008913794>] dump_stack+0x90/0xb0
[    0.444329] [<ffffff80080d9080>] ___might_sleep+0x110/0x128
[    0.444332] [<ffffff80080d9110>] __might_sleep+0x78/0x88
[    0.444343] [<ffffff80081b83c4>] __alloc_pages_nodemask+0xe4/0xef4
[    0.444351] [<ffffff80081da1ac>] kmalloc_order+0x34/0x5c
[    0.444355] [<ffffff80081da210>] kmalloc_order_trace+0x3c/0x10c
[    0.444366] [<ffffff800832b5fc>] pstore_event_call+0x64/0x148
[    0.444374] [<ffffff800817b754>] trace_event_buffer_commit+0x230/0x25c
[    0.444379] [<ffffff80080d774c>] 
trace_event_raw_event_sched_stat_runtime+0x78/0xc8
[    0.444385] [<ffffff80080e4d5c>] update_curr+0x184/0x208
[    0.444388] [<ffffff80080e5fc0>] task_tick_fair+0x58/0x478
[    0.444394] [<ffffff80080dd834>] scheduler_tick+0x78/0xb4
[    0.444404] [<ffffff800811d714>] update_process_times+0x48/0x5c
[    0.444411] [<ffffff800812e198>] tick_sched_handle.isra.9+0x58/0x6c
[    0.444415] [<ffffff800812e1f4>] tick_sched_timer+0x48/0x84
[    0.444420] [<ffffff800811e218>] __hrtimer_run_queues+0x1f0/0x31c
[    0.444423] [<ffffff800811eb78>] hrtimer_interrupt+0xa8/0x1c8
[    0.444435] [<ffffff8008720bc8>] arch_timer_handler_phys+0x3c/0x48
[    0.444443] [<ffffff800810a750>] handle_percpu_devid_irq+0x124/0x27c
[    0.444448] [<ffffff8008104e48>] generic_handle_irq+0x30/0x44
[    0.444452] [<ffffff80081054dc>] __handle_domain_irq+0x90/0xbc
[    0.444457] [<ffffff8008081250>] gic_handle_irq+0x144/0x1ac
[    0.444460] Exception stack(0xffffff800805b5f0 to 0xffffff800805b730)
[    0.444465] b5e0:                                   0000000000000040 
ffffffc0fb2d8000
[    0.444468] b600: 00000040f7046000 ffffffffffffffff 0000000000000000 
0000000000000000
[    0.444472] b620: ffffffc0fb288f00 0720072007200720 0720072007200720 
0720072007200720
[    0.444475] b640: 0720072007200720 0720072007200720 0720072007200720 
0720072007200720
[    0.444479] b660: 0720072007200720 0000000000000010 0000000000000003 
0000000000004600
[    0.444482] b680: 000000000000000c ffffff8009152000 ffffff8009192000 
000000000000002e
[    0.444485] b6a0: ffffff8009192458 0000000000000000 0000000000000000 
ffffff8009031410
[    0.444490] b6c0: ffffff8008ec7018 ffffff8009031410 ffffff8009031000 
ffffff800805b730
[    0.444495] b6e0: ffffff80081030d4 ffffff800805b730 ffffff80081030dc 
0000000060c00049
[    0.444498] b700: 0000000000000000 ffffff80090916a0 ffffffffffffffff 
ffffff8008ec7018
[    0.444502] b720: ffffff800805b730 ffffff80081030dc
[    0.444506] [<ffffff8008082a30>] el1_irq+0xb0/0x124
[    0.444510] [<ffffff80081030dc>] console_unlock+0x27c/0x4e0
[    0.444512] [<ffffff80081035b0>] vprintk_emit+0x270/0x290
[    0.444515] [<ffffff80081037c0>] vprintk_default+0x7c/0x84
[    0.444518] [<ffffff8008104db8>] vprintk_func+0x154/0x180
[    0.444522] [<ffffff8008104448>] printk+0x80/0xa0
[    0.444525] [<ffffff800832a944>] pstore_register+0x148/0x1f8
[    0.444529] [<ffffff800832cb5c>] ramoops_probe+0x56c/0x6e8
[    0.444537] [<ffffff8008551d64>] platform_drv_probe+0x58/0xa4
[    0.444544] [<ffffff800854f9c4>] driver_probe_device+0x204/0x438
[    0.444547] [<ffffff800854fe44>] __device_attach_driver+0xf8/0x110
[    0.444552] [<ffffff800854d6dc>] bus_for_each_drv+0x98/0xc8
[    0.444556] [<ffffff800854f6ec>] __device_attach+0xac/0x150
[    0.444559] [<ffffff800854fe80>] device_initial_probe+0x24/0x30
[    0.444563] [<ffffff800854e96c>] bus_probe_device+0x38/0x98
[    0.444567] [<ffffff800854c478>] device_add+0x318/0x560
[    0.444575] [<ffffff8008737440>] of_device_add+0x44/0x50
[    0.444579] [<ffffff80087379f8>] 
of_platform_device_create_pdata+0x8c/0xc4
[    0.444582] [<ffffff8008737a64>] of_platform_device_create+0x34/0x40
[    0.444593] [<ffffff8008e3e9e8>] 
of_platform_default_populate_init+0x58/0xa8
[    0.444596] [<ffffff8008083558>] do_one_initcall+0xa0/0x138
[    0.444605] [<ffffff8008e00f10>] kernel_init_freeable+0x198/0x24c
[    0.444610] [<ffffff8008925a7c>] kernel_init+0x18/0x108
[    0.444616] [<ffffff800808477c>] ret_from_fork+0x10/0x18
[    0.856135] console [pstore-1] enabled
Sai Prakash Ranjan Sept. 16, 2018, 7:07 a.m. UTC | #2
On 9/9/2018 1:57 AM, Sai Prakash Ranjan wrote:
> Currently pstore has function trace support which can be
> used to get the function call chain with limited data.
> Event tracing has extra data which is useful to debug wide
> variety of issues and is heavily used across the kernel.
> 
> Adding this support to pstore can be very helpful to debug
> different subsystems since almost all of them have trace
> events already available. And also it is useful to debug
> unknown resets or crashes since we can get lot more info
> from event tracing by viewing the last occurred events.
> 
> High frequency tracepoints such as sched and irq has also
> been tested. This implementation is similar to "tp_printk"
> command line feature of ftrace by Steven.
> 
> For example, sample pstore output of tracing sched events
> after reboot:
> 
>    # mount -t pstore pstore /sys/fs/pstore/
>    # tail /sys/fs/pstore/event-ramoops-0
>    sched_switch: prev_comm=swapper/1 prev_pid=0 prev_prio=120 prev_state=S ==> next_comm=rcu_preempt next_pid=10 next_prio=120
>    sched_switch: prev_comm=rcu_preempt prev_pid=10 prev_prio=120 prev_state=R+ ==> next_comm=swapper/1 next_pid=0 next_prio=120
>    sched_waking: comm=rcu_sched pid=11 prio=120 target_cpu=002
>    sched_wakeup: comm=rcu_sched pid=11 prio=120 target_cpu=002
>    sched_switch: prev_comm=swapper/2 prev_pid=0 prev_prio=120 prev_state=S ==> next_comm=rcu_sched next_pid=11 next_prio=120
>    sched_switch: prev_comm=rcu_sched prev_pid=11 prev_prio=120 prev_state=R+ ==> next_comm=swapper/2 next_pid=0 next_prio=120
>    sched_waking: comm=reboot pid=1867 prio=120 target_cpu=000
>    sched_wakeup: comm=reboot pid=1867 prio=120 target_cpu=000
>    sched_switch: prev_comm=swapper/0 prev_pid=0 prev_prio=120 prev_state=S ==> next_comm=reboot next_pid=1867 next_prio=120
> 
> Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
> ---
>   fs/pstore/Kconfig          |  2 +-
>   fs/pstore/ftrace.c         | 55 ++++++++++++++++++++++++++++++++++++++
>   fs/pstore/inode.c          |  4 +++
>   fs/pstore/ram.c            | 44 +++++++++++++++++++++++++++---
>   include/linux/pstore.h     |  2 ++
>   include/linux/pstore_ram.h |  1 +
>   6 files changed, 104 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
> index 503086f7f7c1..6fe087b13a51 100644
> --- a/fs/pstore/Kconfig
> +++ b/fs/pstore/Kconfig
> @@ -126,7 +126,7 @@ config PSTORE_PMSG
>   
>   config PSTORE_FTRACE
>   	bool "Persistent function tracer"
> -	depends on PSTORE
> +	depends on PSTORE && PSTORE!=m
>   	depends on FUNCTION_TRACER
>   	depends on DEBUG_FS
>   	help
> diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
> index 06aab07b6bb7..d47dc93ac098 100644
> --- a/fs/pstore/ftrace.c
> +++ b/fs/pstore/ftrace.c
> @@ -24,6 +24,8 @@
>   #include <linux/debugfs.h>
>   #include <linux/err.h>
>   #include <linux/cache.h>
> +#include <linux/slab.h>
> +#include <linux/trace_events.h>
>   #include <asm/barrier.h>
>   #include "internal.h"
>   
> @@ -62,6 +64,59 @@ static struct ftrace_ops pstore_ftrace_ops __read_mostly = {
>   	.func	= pstore_ftrace_call,
>   };
>   
> +void notrace pstore_event_call(struct trace_event_buffer *fbuffer)
> +{
> +	struct trace_iterator *iter;
> +	struct trace_seq *s;
> +	struct trace_event_call *event_call;
> +	struct pstore_record record;
> +	struct trace_event *event;
> +	struct seq_buf *seq;
> +	unsigned long flags;
> +
> +	if (!psinfo)
> +		return;
> +
> +	if (unlikely(oops_in_progress))
> +		return;
> +
> +	pstore_record_init(&record, psinfo);
> +	record.type = PSTORE_TYPE_EVENT;
> +
> +	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
> +	if (!iter)
> +		return;
> +
> +	event_call = fbuffer->trace_file->event_call;
> +	if (!event_call || !event_call->event.funcs ||
> +	    !event_call->event.funcs->trace)
> +		goto fail_event;
> +
> +	event = &fbuffer->trace_file->event_call->event;
> +
> +	spin_lock_irqsave(&psinfo->buf_lock, flags);
> +
> +	trace_seq_init(&iter->seq);
> +	iter->ent = fbuffer->entry;
> +	event_call->event.funcs->trace(iter, 0, event);
> +	trace_seq_putc(&iter->seq, 0);
> +
> +	if (seq->size > psinfo->bufsize)
> +		seq->size = psinfo->bufsize;
> +
> +	s = &iter->seq;
> +	seq = &s->seq;
> +
> +	record.buf = (char *)(seq->buffer);
> +	record.size = seq->len;
> +	psinfo->write(&record);
> +
> +	spin_unlock_irqrestore(&psinfo->buf_lock, flags);
> +
> +fail_event:
> +	kfree(iter);
> +}
> +
>   static DEFINE_MUTEX(pstore_ftrace_lock);
>   static bool pstore_ftrace_enabled;
>   
> diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
> index 5fcb845b9fec..f099152abbbd 100644
> --- a/fs/pstore/inode.c
> +++ b/fs/pstore/inode.c
> @@ -345,6 +345,10 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
>   		scnprintf(name, sizeof(name), "console-%s-%llu",
>   			  record->psi->name, record->id);
>   		break;
> +	case PSTORE_TYPE_EVENT:
> +		scnprintf(name, sizeof(name), "event-%s-%llu",
> +			  record->psi->name, record->id);
> +		break;
>   	case PSTORE_TYPE_FTRACE:
>   		scnprintf(name, sizeof(name), "ftrace-%s-%llu",
>   			  record->psi->name, record->id);
> diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
> index bbd1e357c23d..f60d41c0309e 100644
> --- a/fs/pstore/ram.c
> +++ b/fs/pstore/ram.c
> @@ -48,6 +48,10 @@ static ulong ramoops_console_size = MIN_MEM_SIZE;
>   module_param_named(console_size, ramoops_console_size, ulong, 0400);
>   MODULE_PARM_DESC(console_size, "size of kernel console log");
>   
> +static ulong ramoops_event_size = MIN_MEM_SIZE;
> +module_param_named(event_size, ramoops_event_size, ulong, 0400);
> +MODULE_PARM_DESC(event_size, "size of event log");
> +
>   static ulong ramoops_ftrace_size = MIN_MEM_SIZE;
>   module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400);
>   MODULE_PARM_DESC(ftrace_size, "size of ftrace log");
> @@ -86,6 +90,7 @@ MODULE_PARM_DESC(ramoops_ecc,
>   struct ramoops_context {
>   	struct persistent_ram_zone **dprzs;	/* Oops dump zones */
>   	struct persistent_ram_zone *cprz;	/* Console zone */
> +	struct persistent_ram_zone *eprz;       /* Event zone */
>   	struct persistent_ram_zone **fprzs;	/* Ftrace zones */
>   	struct persistent_ram_zone *mprz;	/* PMSG zone */
>   	phys_addr_t phys_addr;
> @@ -93,6 +98,7 @@ struct ramoops_context {
>   	unsigned int memtype;
>   	size_t record_size;
>   	size_t console_size;
> +	size_t event_size;
>   	size_t ftrace_size;
>   	size_t pmsg_size;
>   	int dump_oops;
> @@ -103,6 +109,7 @@ struct ramoops_context {
>   	/* _read_cnt need clear on ramoops_pstore_open */
>   	unsigned int dump_read_cnt;
>   	unsigned int console_read_cnt;
> +	unsigned int event_read_cnt;
>   	unsigned int max_ftrace_cnt;
>   	unsigned int ftrace_read_cnt;
>   	unsigned int pmsg_read_cnt;
> @@ -118,6 +125,7 @@ static int ramoops_pstore_open(struct pstore_info *psi)
>   
>   	cxt->dump_read_cnt = 0;
>   	cxt->console_read_cnt = 0;
> +	cxt->event_read_cnt = 0;
>   	cxt->ftrace_read_cnt = 0;
>   	cxt->pmsg_read_cnt = 0;
>   	return 0;
> @@ -277,6 +285,11 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record)
>   					   1, &record->id, &record->type,
>   					   PSTORE_TYPE_CONSOLE, 0);
>   
> +	if (!prz_ok(prz))
> +		prz = ramoops_get_next_prz(&cxt->eprz, &cxt->event_read_cnt,
> +					   1, &record->id, &record->type,
> +					   PSTORE_TYPE_EVENT, 0);
> +
>   	if (!prz_ok(prz))
>   		prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
>   					   1, &record->id, &record->type,
> @@ -385,6 +398,11 @@ static int notrace ramoops_pstore_write(struct pstore_record *record)
>   			return -ENOMEM;
>   		persistent_ram_write(cxt->cprz, record->buf, record->size);
>   		return 0;
> +	} else if (record->type == PSTORE_TYPE_EVENT) {
> +		if (!cxt->eprz)
> +			return -ENOMEM;
> +		persistent_ram_write(cxt->eprz, record->buf, record->size);
> +		return 0;
>   	} else if (record->type == PSTORE_TYPE_FTRACE) {
>   		int zonenum;
>   
> @@ -475,6 +493,9 @@ static int ramoops_pstore_erase(struct pstore_record *record)
>   	case PSTORE_TYPE_CONSOLE:
>   		prz = cxt->cprz;
>   		break;
> +	case PSTORE_TYPE_EVENT:
> +		prz = cxt->eprz;
> +		break;
>   	case PSTORE_TYPE_FTRACE:
>   		if (record->id >= cxt->max_ftrace_cnt)
>   			return -EINVAL;
> @@ -699,6 +720,7 @@ static int ramoops_parse_dt(struct platform_device *pdev,
>   
>   	parse_size("record-size", pdata->record_size);
>   	parse_size("console-size", pdata->console_size);
> +	parse_size("event-size", pdata->event_size);
>   	parse_size("ftrace-size", pdata->ftrace_size);
>   	parse_size("pmsg-size", pdata->pmsg_size);
>   	parse_size("ecc-size", pdata->ecc_info.ecc_size);
> @@ -747,7 +769,8 @@ static int ramoops_probe(struct platform_device *pdev)
>   	}
>   
>   	if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size &&
> -			!pdata->ftrace_size && !pdata->pmsg_size)) {
> +			!pdata->event_size && !pdata->ftrace_size &&
> +			!pdata->pmsg_size)) {
>   		pr_err("The memory size and the record/console size must be "
>   			"non-zero\n");
>   		goto fail_out;
> @@ -757,6 +780,8 @@ static int ramoops_probe(struct platform_device *pdev)
>   		pdata->record_size = rounddown_pow_of_two(pdata->record_size);
>   	if (pdata->console_size && !is_power_of_2(pdata->console_size))
>   		pdata->console_size = rounddown_pow_of_two(pdata->console_size);
> +	if (pdata->event_size && !is_power_of_2(pdata->event_size))
> +		pdata->event_size = rounddown_pow_of_two(pdata->event_size);
>   	if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
>   		pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
>   	if (pdata->pmsg_size && !is_power_of_2(pdata->pmsg_size))
> @@ -767,6 +792,7 @@ static int ramoops_probe(struct platform_device *pdev)
>   	cxt->memtype = pdata->mem_type;
>   	cxt->record_size = pdata->record_size;
>   	cxt->console_size = pdata->console_size;
> +	cxt->event_size = pdata->event_size;
>   	cxt->ftrace_size = pdata->ftrace_size;
>   	cxt->pmsg_size = pdata->pmsg_size;
>   	cxt->dump_oops = pdata->dump_oops;
> @@ -775,8 +801,8 @@ static int ramoops_probe(struct platform_device *pdev)
>   
>   	paddr = cxt->phys_addr;
>   
> -	dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
> -			- cxt->pmsg_size;
> +	dump_mem_sz = cxt->size - cxt->console_size - cxt->event_size
> +			- cxt->ftrace_size - cxt->pmsg_size;
>   	err = ramoops_init_przs("dump", dev, cxt, &cxt->dprzs, &paddr,
>   				dump_mem_sz, cxt->record_size,
>   				&cxt->max_dump_cnt, 0, 0);
> @@ -788,6 +814,11 @@ static int ramoops_probe(struct platform_device *pdev)
>   	if (err)
>   		goto fail_init_cprz;
>   
> +	err = ramoops_init_prz("event", dev, cxt, &cxt->eprz, &paddr,
> +			       cxt->event_size, 0);
> +	if (err)
> +		goto fail_init_eprz;
> +
>   	cxt->max_ftrace_cnt = (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)
>   				? nr_cpu_ids
>   				: 1;
> @@ -825,6 +856,8 @@ static int ramoops_probe(struct platform_device *pdev)
>   	cxt->pstore.flags = PSTORE_FLAGS_DMESG;
>   	if (cxt->console_size)
>   		cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE;
> +	if (cxt->event_size)
> +		cxt->pstore.flags |= PSTORE_FLAGS_EVENT;
>   	if (cxt->ftrace_size)
>   		cxt->pstore.flags |= PSTORE_FLAGS_FTRACE;
>   	if (cxt->pmsg_size)
> @@ -845,6 +878,7 @@ static int ramoops_probe(struct platform_device *pdev)
>   	record_size = pdata->record_size;
>   	dump_oops = pdata->dump_oops;
>   	ramoops_console_size = pdata->console_size;
> +	ramoops_event_size = pdata->event_size;
>   	ramoops_pmsg_size = pdata->pmsg_size;
>   	ramoops_ftrace_size = pdata->ftrace_size;
>   
> @@ -858,6 +892,8 @@ static int ramoops_probe(struct platform_device *pdev)
>   	kfree(cxt->pstore.buf);
>   fail_clear:
>   	cxt->pstore.bufsize = 0;
> +	persistent_ram_free(cxt->eprz);
> +fail_init_eprz:
>   	persistent_ram_free(cxt->mprz);
>   fail_init_mprz:
>   fail_init_fprz:
> @@ -877,6 +913,7 @@ static int ramoops_remove(struct platform_device *pdev)
>   	kfree(cxt->pstore.buf);
>   	cxt->pstore.bufsize = 0;
>   
> +	persistent_ram_free(cxt->eprz);
>   	persistent_ram_free(cxt->mprz);
>   	persistent_ram_free(cxt->cprz);
>   	ramoops_free_przs(cxt);
> @@ -916,6 +953,7 @@ static void ramoops_register_dummy(void)
>   	dummy_data->mem_type = mem_type;
>   	dummy_data->record_size = record_size;
>   	dummy_data->console_size = ramoops_console_size;
> +	dummy_data->event_size = ramoops_event_size;
>   	dummy_data->ftrace_size = ramoops_ftrace_size;
>   	dummy_data->pmsg_size = ramoops_pmsg_size;
>   	dummy_data->dump_oops = dump_oops;
> diff --git a/include/linux/pstore.h b/include/linux/pstore.h
> index a15bc4d48752..c31bb9567b4a 100644
> --- a/include/linux/pstore.h
> +++ b/include/linux/pstore.h
> @@ -44,6 +44,7 @@ enum pstore_type_id {
>   	PSTORE_TYPE_PPC_COMMON	= 6,
>   	PSTORE_TYPE_PMSG	= 7,
>   	PSTORE_TYPE_PPC_OPAL	= 8,
> +	PSTORE_TYPE_EVENT	= 9,
>   	PSTORE_TYPE_UNKNOWN	= 255
>   };
>   
> @@ -193,6 +194,7 @@ struct pstore_info {
>   #define PSTORE_FLAGS_CONSOLE	(1 << 1)
>   #define PSTORE_FLAGS_FTRACE	(1 << 2)
>   #define PSTORE_FLAGS_PMSG	(1 << 3)
> +#define PSTORE_FLAGS_EVENT	(1 << 4)
>   
>   extern int pstore_register(struct pstore_info *);
>   extern void pstore_unregister(struct pstore_info *);
> diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h
> index e6d226464838..f01055c6cc40 100644
> --- a/include/linux/pstore_ram.h
> +++ b/include/linux/pstore_ram.h
> @@ -95,6 +95,7 @@ struct ramoops_platform_data {
>   	unsigned int	mem_type;
>   	unsigned long	record_size;
>   	unsigned long	console_size;
> +	unsigned long	event_size;
>   	unsigned long	ftrace_size;
>   	unsigned long	pmsg_size;
>   	int		dump_oops;
> 

Hi,

Anyone here?
Kees Cook Sept. 17, 2018, 2:54 p.m. UTC | #3
On Sun, Sep 16, 2018 at 6:55 AM, Joel Fernandes <joel@joelfernandes.org> wrote:
> On Sun, Sep 16, 2018, 12:08 AM Sai Prakash Ranjan
> <saiprakash.ranjan@codeaurora.org> wrote:
>>
>> On 9/9/2018 1:57 AM, Sai Prakash Ranjan wrote:
>> > Currently pstore has function trace support which can be
>> > used to get the function call chain with limited data.
>> > Event tracing has extra data which is useful to debug wide
>> > variety of issues and is heavily used across the kernel.
>> >
>> > Adding this support to pstore can be very helpful to debug
>> > different subsystems since almost all of them have trace
>> > events already available. And also it is useful to debug
>> > unknown resets or crashes since we can get lot more info
>> > from event tracing by viewing the last occurred events.
>>
>> Anyone here?
>
> Sorry for the top post. I've been wanting to do this as well for some time.
> It's quite useful. I am out of office this week and away from work machine.
> I will take a look at your patches next week once I'm back at work. Thanks.

If Steven agrees this shouldn't live in ftrace directly and Joel
reviews these patches, I think it should be fine. I'm travelling, but
I can review it hopefully later this week.

-Kees
Sai Prakash Ranjan Sept. 17, 2018, 5:13 p.m. UTC | #4
On 9/16/2018 7:25 PM, Joel Fernandes wrote:
> Sorry for the top post. I've been wanting to do this as well for some 
> time. It's quite useful. I am out of office this week and away from work 
> machine. I will take a look at your patches next week once I'm back at 
> work. Thanks.
> 
> Best,
> J
> 

Cool, thanks Joel.
Sai Prakash Ranjan Sept. 17, 2018, 5:17 p.m. UTC | #5
On 9/17/2018 8:24 PM, Kees Cook wrote:
> On Sun, Sep 16, 2018 at 6:55 AM, Joel Fernandes <joel@joelfernandes.org> wrote:
>> On Sun, Sep 16, 2018, 12:08 AM Sai Prakash Ranjan
>> <saiprakash.ranjan@codeaurora.org> wrote:
>>>
>>> On 9/9/2018 1:57 AM, Sai Prakash Ranjan wrote:
>>>> Currently pstore has function trace support which can be
>>>> used to get the function call chain with limited data.
>>>> Event tracing has extra data which is useful to debug wide
>>>> variety of issues and is heavily used across the kernel.
>>>>
>>>> Adding this support to pstore can be very helpful to debug
>>>> different subsystems since almost all of them have trace
>>>> events already available. And also it is useful to debug
>>>> unknown resets or crashes since we can get lot more info
>>>> from event tracing by viewing the last occurred events.
>>>
>>> Anyone here?
>>
>> Sorry for the top post. I've been wanting to do this as well for some time.
>> It's quite useful. I am out of office this week and away from work machine.
>> I will take a look at your patches next week once I'm back at work. Thanks.
> 
> If Steven agrees this shouldn't live in ftrace directly and Joel
> reviews these patches, I think it should be fine. I'm travelling, but
> I can review it hopefully later this week.
> 

Thank you Kees.

- Sai
Stephen Boyd Sept. 17, 2018, 5:38 p.m. UTC | #6
Quoting Sai Prakash Ranjan (2018-09-11 03:46:01)
> On 9/9/2018 1:57 AM, Sai Prakash Ranjan wrote:
> > +void notrace pstore_event_call(struct trace_event_buffer *fbuffer)
> > +{
> > +     struct trace_iterator *iter;
> > +     struct trace_seq *s;
> > +     struct trace_event_call *event_call;
> > +     struct pstore_record record;
> > +     struct trace_event *event;
> > +     struct seq_buf *seq;
> > +     unsigned long flags;
> > +
> > +     if (!psinfo)
> > +             return;
> > +
> > +     if (unlikely(oops_in_progress))
> > +             return;
> > +
> > +     pstore_record_init(&record, psinfo);
> > +     record.type = PSTORE_TYPE_EVENT;
> > +
> > +     iter = kmalloc(sizeof(*iter), GFP_KERNEL);
> > +     if (!iter)
> > +             return;
> > +
> > +     event_call = fbuffer->trace_file->event_call;
> > +     if (!event_call || !event_call->event.funcs ||
> > +         !event_call->event.funcs->trace)
> > +             goto fail_event;
> > +
> > +     event = &fbuffer->trace_file->event_call->event;
> > +
> > +     spin_lock_irqsave(&psinfo->buf_lock, flags);
> > +
> > +     trace_seq_init(&iter->seq);
> > +     iter->ent = fbuffer->entry;
> > +     event_call->event.funcs->trace(iter, 0, event);
> > +     trace_seq_putc(&iter->seq, 0);
> > +
> > +     if (seq->size > psinfo->bufsize)
> > +             seq->size = psinfo->bufsize;
> > +
> > +     s = &iter->seq;
> > +     seq = &s->seq;
> > +
> > +     record.buf = (char *)(seq->buffer);
> > +     record.size = seq->len;
> > +     psinfo->write(&record);
> > +
> > +     spin_unlock_irqrestore(&psinfo->buf_lock, flags);
> > +
> > +fail_event:
> > +     kfree(iter);
> > +}
> > +
> > 
> 
> When tracing sched events on sdm845 mtp, I hit below bug repeatedly.
> Seems like pstore_event_call can be called in atomic context.
> I will respin the below fix in next version of the patch.
> Reviews on other parts would be appreciated, thanks.
> 
> diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
> index d47dc93ac098..a497cf782ee8 100644
> --- a/fs/pstore/ftrace.c
> +++ b/fs/pstore/ftrace.c
> @@ -73,6 +73,7 @@ void notrace pstore_event_call(struct 
> trace_event_buffer *fbuffer)
>          struct trace_event *event;
>          struct seq_buf *seq;
>          unsigned long flags;
> +       gfp_t gfpflags;
> 
>          if (!psinfo)
>                  return;
> @@ -83,7 +84,9 @@ void notrace pstore_event_call(struct 
> trace_event_buffer *fbuffer)
>          pstore_record_init(&record, psinfo);
>          record.type = PSTORE_TYPE_EVENT;
> 
> -       iter = kmalloc(sizeof(*iter), GFP_KERNEL);
> +       gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : 
> GFP_KERNEL;
> +

Do you need to allocate at all? Can you throw the iter on the stack?

Using in_atomic() and irqs_disabled() to figure out if an atomic or a
non-atomic allocation should be used is not a good solution.
Sai Prakash Ranjan Sept. 17, 2018, 7:43 p.m. UTC | #7
On 9/17/2018 11:08 PM, Stephen Boyd wrote:
> Quoting Sai Prakash Ranjan (2018-09-11 03:46:01)
>> On 9/9/2018 1:57 AM, Sai Prakash Ranjan wrote:
>>> +void notrace pstore_event_call(struct trace_event_buffer *fbuffer)
>>> +{
>>> +     struct trace_iterator *iter;
>>> +     struct trace_seq *s;
>>> +     struct trace_event_call *event_call;
>>> +     struct pstore_record record;
>>> +     struct trace_event *event;
>>> +     struct seq_buf *seq;
>>> +     unsigned long flags;
>>> +
>>> +     if (!psinfo)
>>> +             return;
>>> +
>>> +     if (unlikely(oops_in_progress))
>>> +             return;
>>> +
>>> +     pstore_record_init(&record, psinfo);
>>> +     record.type = PSTORE_TYPE_EVENT;
>>> +
>>> +     iter = kmalloc(sizeof(*iter), GFP_KERNEL);
>>> +     if (!iter)
>>> +             return;
>>> +
>>> +     event_call = fbuffer->trace_file->event_call;
>>> +     if (!event_call || !event_call->event.funcs ||
>>> +         !event_call->event.funcs->trace)
>>> +             goto fail_event;
>>> +
>>> +     event = &fbuffer->trace_file->event_call->event;
>>> +
>>> +     spin_lock_irqsave(&psinfo->buf_lock, flags);
>>> +
>>> +     trace_seq_init(&iter->seq);
>>> +     iter->ent = fbuffer->entry;
>>> +     event_call->event.funcs->trace(iter, 0, event);
>>> +     trace_seq_putc(&iter->seq, 0);
>>> +
>>> +     if (seq->size > psinfo->bufsize)
>>> +             seq->size = psinfo->bufsize;
>>> +
>>> +     s = &iter->seq;
>>> +     seq = &s->seq;
>>> +
>>> +     record.buf = (char *)(seq->buffer);
>>> +     record.size = seq->len;
>>> +     psinfo->write(&record);
>>> +
>>> +     spin_unlock_irqrestore(&psinfo->buf_lock, flags);
>>> +
>>> +fail_event:
>>> +     kfree(iter);
>>> +}
>>> +
>>>
>>
>> When tracing sched events on sdm845 mtp, I hit below bug repeatedly.
>> Seems like pstore_event_call can be called in atomic context.
>> I will respin the below fix in next version of the patch.
>> Reviews on other parts would be appreciated, thanks.
>>
>> diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
>> index d47dc93ac098..a497cf782ee8 100644
>> --- a/fs/pstore/ftrace.c
>> +++ b/fs/pstore/ftrace.c
>> @@ -73,6 +73,7 @@ void notrace pstore_event_call(struct
>> trace_event_buffer *fbuffer)
>>           struct trace_event *event;
>>           struct seq_buf *seq;
>>           unsigned long flags;
>> +       gfp_t gfpflags;
>>
>>           if (!psinfo)
>>                   return;
>> @@ -83,7 +84,9 @@ void notrace pstore_event_call(struct
>> trace_event_buffer *fbuffer)
>>           pstore_record_init(&record, psinfo);
>>           record.type = PSTORE_TYPE_EVENT;
>>
>> -       iter = kmalloc(sizeof(*iter), GFP_KERNEL);
>> +       gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC :
>> GFP_KERNEL;
>> +
>

Hi Stephen

Thanks for the comments.

> Do you need to allocate at all? Can you throw the iter on the stack?
> 

Yes, since we need to copy the contents to the pstore buffer.

> Using in_atomic() and irqs_disabled() to figure out if an atomic or a
> non-atomic allocation should be used is not a good solution.
> 

I took reference from a similar use in graph_trace_open(), which can be 
called in atomic context via ftrace_dump(). I am open to correcting this 
if there is some other way.

Sai
Steven Rostedt Sept. 17, 2018, 11:04 p.m. UTC | #8
On Sun, 16 Sep 2018 12:37:52 +0530
Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org> wrote:

> Hi,
> 
> Anyone here?

You also just caught me from coming back from a trip. I'm looking at
your patches now.

-- Steve
Steven Rostedt Sept. 17, 2018, 11:34 p.m. UTC | #9
On Sun,  9 Sep 2018 01:57:03 +0530
Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org> wrote:


> Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
> ---
>  fs/pstore/Kconfig          |  2 +-
>  fs/pstore/ftrace.c         | 55 ++++++++++++++++++++++++++++++++++++++
>  fs/pstore/inode.c          |  4 +++
>  fs/pstore/ram.c            | 44 +++++++++++++++++++++++++++---
>  include/linux/pstore.h     |  2 ++
>  include/linux/pstore_ram.h |  1 +
>  6 files changed, 104 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
> index 503086f7f7c1..6fe087b13a51 100644
> --- a/fs/pstore/Kconfig
> +++ b/fs/pstore/Kconfig
> @@ -126,7 +126,7 @@ config PSTORE_PMSG
>  
>  config PSTORE_FTRACE
>  	bool "Persistent function tracer"
> -	depends on PSTORE
> +	depends on PSTORE && PSTORE!=m
>  	depends on FUNCTION_TRACER
>  	depends on DEBUG_FS
>  	help
> diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
> index 06aab07b6bb7..d47dc93ac098 100644
> --- a/fs/pstore/ftrace.c
> +++ b/fs/pstore/ftrace.c
> @@ -24,6 +24,8 @@
>  #include <linux/debugfs.h>
>  #include <linux/err.h>
>  #include <linux/cache.h>
> +#include <linux/slab.h>
> +#include <linux/trace_events.h>
>  #include <asm/barrier.h>
>  #include "internal.h"
>  
> @@ -62,6 +64,59 @@ static struct ftrace_ops pstore_ftrace_ops __read_mostly = {
>  	.func	= pstore_ftrace_call,
>  };
>  
> +void notrace pstore_event_call(struct trace_event_buffer *fbuffer)
> +{
> +	struct trace_iterator *iter;
> +	struct trace_seq *s;
> +	struct trace_event_call *event_call;
> +	struct pstore_record record;
> +	struct trace_event *event;
> +	struct seq_buf *seq;
> +	unsigned long flags;
> +
> +	if (!psinfo)
> +		return;
> +
> +	if (unlikely(oops_in_progress))
> +		return;
> +
> +	pstore_record_init(&record, psinfo);
> +	record.type = PSTORE_TYPE_EVENT;
> +
> +	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
> +	if (!iter)
> +		return;

It looks like pstore_event_call() gets called from a trace event. You
can't call kmalloc() from one. One thing is that kmalloc has
tracepoints itself. You trace those you just entered an infinite loop.


> +
> +	event_call = fbuffer->trace_file->event_call;
> +	if (!event_call || !event_call->event.funcs ||
> +	    !event_call->event.funcs->trace)
> +		goto fail_event;
> +
> +	event = &fbuffer->trace_file->event_call->event;
> +
> +	spin_lock_irqsave(&psinfo->buf_lock, flags);
> +
> +	trace_seq_init(&iter->seq);
> +	iter->ent = fbuffer->entry;

I guess what you are doing is needing to translate the raw data into
ascii output, and need the trace_iterator to do so.

You are already under a psinfo->buf_lock. Add a dummy iterator to that
and use it instead.

	trace_seq_init(&psinfo->iter->seq);

> +	event_call->event.funcs->trace(iter, 0, event);

				      (psinfo->iter, 0 , event);

etc.

> +	trace_seq_putc(&iter->seq, 0);
> +
> +	if (seq->size > psinfo->bufsize)
> +		seq->size = psinfo->bufsize;
> +
> +	s = &iter->seq;
> +	seq = &s->seq;
> +
> +	record.buf = (char *)(seq->buffer);
> +	record.size = seq->len;
> +	psinfo->write(&record);
> +
> +	spin_unlock_irqrestore(&psinfo->buf_lock, flags);

You may also need to convert these spin_locks into raw_spin_locks as
when PREEMPT_RT enters the kernel you don't want them to turn into
mutexes. 

But that can be another patch.

-- Steve

> +
> +fail_event:
> +	kfree(iter);
> +}
> +
>  static DEFINE_MUTEX(pstore_ftrace_lock);
>  static bool pstore_ftrace_enabled;
>  
> diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
> index 5fcb845b9fec..f099152abbbd 100644
> --- a/fs/pstore/inode.c
> +++ b/fs/pstore/inode.c
> @@ -345,6 +345,10 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
>  		scnprintf(name, sizeof(name), "console-%s-%llu",
>  			  record->psi->name, record->id);
>  		break;
> +	case PSTORE_TYPE_EVENT:
> +		scnprintf(name, sizeof(name), "event-%s-%llu",
> +			  record->psi->name, record->id);
> +		break;
>  	case PSTORE_TYPE_FTRACE:
>  		scnprintf(name, sizeof(name), "ftrace-%s-%llu",
>  			  record->psi->name, record->id);
> diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
> index bbd1e357c23d..f60d41c0309e 100644
> --- a/fs/pstore/ram.c
> +++ b/fs/pstore/ram.c
> @@ -48,6 +48,10 @@ static ulong ramoops_console_size = MIN_MEM_SIZE;
>  module_param_named(console_size, ramoops_console_size, ulong, 0400);
>  MODULE_PARM_DESC(console_size, "size of kernel console log");
>  
> +static ulong ramoops_event_size = MIN_MEM_SIZE;
> +module_param_named(event_size, ramoops_event_size, ulong, 0400);
> +MODULE_PARM_DESC(event_size, "size of event log");
> +
>  static ulong ramoops_ftrace_size = MIN_MEM_SIZE;
>  module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400);
>  MODULE_PARM_DESC(ftrace_size, "size of ftrace log");
> @@ -86,6 +90,7 @@ MODULE_PARM_DESC(ramoops_ecc,
>  struct ramoops_context {
>  	struct persistent_ram_zone **dprzs;	/* Oops dump zones */
>  	struct persistent_ram_zone *cprz;	/* Console zone */
> +	struct persistent_ram_zone *eprz;       /* Event zone */
>  	struct persistent_ram_zone **fprzs;	/* Ftrace zones */
>  	struct persistent_ram_zone *mprz;	/* PMSG zone */
>  	phys_addr_t phys_addr;
> @@ -93,6 +98,7 @@ struct ramoops_context {
>  	unsigned int memtype;
>  	size_t record_size;
>  	size_t console_size;
> +	size_t event_size;
>  	size_t ftrace_size;
>  	size_t pmsg_size;
>  	int dump_oops;
> @@ -103,6 +109,7 @@ struct ramoops_context {
>  	/* _read_cnt need clear on ramoops_pstore_open */
>  	unsigned int dump_read_cnt;
>  	unsigned int console_read_cnt;
> +	unsigned int event_read_cnt;
>  	unsigned int max_ftrace_cnt;
>  	unsigned int ftrace_read_cnt;
>  	unsigned int pmsg_read_cnt;
> @@ -118,6 +125,7 @@ static int ramoops_pstore_open(struct pstore_info *psi)
>  
>  	cxt->dump_read_cnt = 0;
>  	cxt->console_read_cnt = 0;
> +	cxt->event_read_cnt = 0;
>  	cxt->ftrace_read_cnt = 0;
>  	cxt->pmsg_read_cnt = 0;
>  	return 0;
> @@ -277,6 +285,11 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record)
>  					   1, &record->id, &record->type,
>  					   PSTORE_TYPE_CONSOLE, 0);
>  
> +	if (!prz_ok(prz))
> +		prz = ramoops_get_next_prz(&cxt->eprz, &cxt->event_read_cnt,
> +					   1, &record->id, &record->type,
> +					   PSTORE_TYPE_EVENT, 0);
> +
>  	if (!prz_ok(prz))
>  		prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
>  					   1, &record->id, &record->type,
> @@ -385,6 +398,11 @@ static int notrace ramoops_pstore_write(struct pstore_record *record)
>  			return -ENOMEM;
>  		persistent_ram_write(cxt->cprz, record->buf, record->size);
>  		return 0;
> +	} else if (record->type == PSTORE_TYPE_EVENT) {
> +		if (!cxt->eprz)
> +			return -ENOMEM;
> +		persistent_ram_write(cxt->eprz, record->buf, record->size);
> +		return 0;
>  	} else if (record->type == PSTORE_TYPE_FTRACE) {
>  		int zonenum;
>  
> @@ -475,6 +493,9 @@ static int ramoops_pstore_erase(struct pstore_record *record)
>  	case PSTORE_TYPE_CONSOLE:
>  		prz = cxt->cprz;
>  		break;
> +	case PSTORE_TYPE_EVENT:
> +		prz = cxt->eprz;
> +		break;
>  	case PSTORE_TYPE_FTRACE:
>  		if (record->id >= cxt->max_ftrace_cnt)
>  			return -EINVAL;
> @@ -699,6 +720,7 @@ static int ramoops_parse_dt(struct platform_device *pdev,
>  
>  	parse_size("record-size", pdata->record_size);
>  	parse_size("console-size", pdata->console_size);
> +	parse_size("event-size", pdata->event_size);
>  	parse_size("ftrace-size", pdata->ftrace_size);
>  	parse_size("pmsg-size", pdata->pmsg_size);
>  	parse_size("ecc-size", pdata->ecc_info.ecc_size);
> @@ -747,7 +769,8 @@ static int ramoops_probe(struct platform_device *pdev)
>  	}
>  
>  	if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size &&
> -			!pdata->ftrace_size && !pdata->pmsg_size)) {
> +			!pdata->event_size && !pdata->ftrace_size &&
> +			!pdata->pmsg_size)) {
>  		pr_err("The memory size and the record/console size must be "
>  			"non-zero\n");
>  		goto fail_out;
> @@ -757,6 +780,8 @@ static int ramoops_probe(struct platform_device *pdev)
>  		pdata->record_size = rounddown_pow_of_two(pdata->record_size);
>  	if (pdata->console_size && !is_power_of_2(pdata->console_size))
>  		pdata->console_size = rounddown_pow_of_two(pdata->console_size);
> +	if (pdata->event_size && !is_power_of_2(pdata->event_size))
> +		pdata->event_size = rounddown_pow_of_two(pdata->event_size);
>  	if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
>  		pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
>  	if (pdata->pmsg_size && !is_power_of_2(pdata->pmsg_size))
> @@ -767,6 +792,7 @@ static int ramoops_probe(struct platform_device *pdev)
>  	cxt->memtype = pdata->mem_type;
>  	cxt->record_size = pdata->record_size;
>  	cxt->console_size = pdata->console_size;
> +	cxt->event_size = pdata->event_size;
>  	cxt->ftrace_size = pdata->ftrace_size;
>  	cxt->pmsg_size = pdata->pmsg_size;
>  	cxt->dump_oops = pdata->dump_oops;
> @@ -775,8 +801,8 @@ static int ramoops_probe(struct platform_device *pdev)
>  
>  	paddr = cxt->phys_addr;
>  
> -	dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
> -			- cxt->pmsg_size;
> +	dump_mem_sz = cxt->size - cxt->console_size - cxt->event_size
> +			- cxt->ftrace_size - cxt->pmsg_size;
>  	err = ramoops_init_przs("dump", dev, cxt, &cxt->dprzs, &paddr,
>  				dump_mem_sz, cxt->record_size,
>  				&cxt->max_dump_cnt, 0, 0);
> @@ -788,6 +814,11 @@ static int ramoops_probe(struct platform_device *pdev)
>  	if (err)
>  		goto fail_init_cprz;
>  
> +	err = ramoops_init_prz("event", dev, cxt, &cxt->eprz, &paddr,
> +			       cxt->event_size, 0);
> +	if (err)
> +		goto fail_init_eprz;
> +
>  	cxt->max_ftrace_cnt = (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)
>  				? nr_cpu_ids
>  				: 1;
> @@ -825,6 +856,8 @@ static int ramoops_probe(struct platform_device *pdev)
>  	cxt->pstore.flags = PSTORE_FLAGS_DMESG;
>  	if (cxt->console_size)
>  		cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE;
> +	if (cxt->event_size)
> +		cxt->pstore.flags |= PSTORE_FLAGS_EVENT;
>  	if (cxt->ftrace_size)
>  		cxt->pstore.flags |= PSTORE_FLAGS_FTRACE;
>  	if (cxt->pmsg_size)
> @@ -845,6 +878,7 @@ static int ramoops_probe(struct platform_device *pdev)
>  	record_size = pdata->record_size;
>  	dump_oops = pdata->dump_oops;
>  	ramoops_console_size = pdata->console_size;
> +	ramoops_event_size = pdata->event_size;
>  	ramoops_pmsg_size = pdata->pmsg_size;
>  	ramoops_ftrace_size = pdata->ftrace_size;
>  
> @@ -858,6 +892,8 @@ static int ramoops_probe(struct platform_device *pdev)
>  	kfree(cxt->pstore.buf);
>  fail_clear:
>  	cxt->pstore.bufsize = 0;
> +	persistent_ram_free(cxt->eprz);
> +fail_init_eprz:
>  	persistent_ram_free(cxt->mprz);
>  fail_init_mprz:
>  fail_init_fprz:
> @@ -877,6 +913,7 @@ static int ramoops_remove(struct platform_device *pdev)
>  	kfree(cxt->pstore.buf);
>  	cxt->pstore.bufsize = 0;
>  
> +	persistent_ram_free(cxt->eprz);
>  	persistent_ram_free(cxt->mprz);
>  	persistent_ram_free(cxt->cprz);
>  	ramoops_free_przs(cxt);
> @@ -916,6 +953,7 @@ static void ramoops_register_dummy(void)
>  	dummy_data->mem_type = mem_type;
>  	dummy_data->record_size = record_size;
>  	dummy_data->console_size = ramoops_console_size;
> +	dummy_data->event_size = ramoops_event_size;
>  	dummy_data->ftrace_size = ramoops_ftrace_size;
>  	dummy_data->pmsg_size = ramoops_pmsg_size;
>  	dummy_data->dump_oops = dump_oops;
> diff --git a/include/linux/pstore.h b/include/linux/pstore.h
> index a15bc4d48752..c31bb9567b4a 100644
> --- a/include/linux/pstore.h
> +++ b/include/linux/pstore.h
> @@ -44,6 +44,7 @@ enum pstore_type_id {
>  	PSTORE_TYPE_PPC_COMMON	= 6,
>  	PSTORE_TYPE_PMSG	= 7,
>  	PSTORE_TYPE_PPC_OPAL	= 8,
> +	PSTORE_TYPE_EVENT	= 9,
>  	PSTORE_TYPE_UNKNOWN	= 255
>  };
>  
> @@ -193,6 +194,7 @@ struct pstore_info {
>  #define PSTORE_FLAGS_CONSOLE	(1 << 1)
>  #define PSTORE_FLAGS_FTRACE	(1 << 2)
>  #define PSTORE_FLAGS_PMSG	(1 << 3)
> +#define PSTORE_FLAGS_EVENT	(1 << 4)
>  
>  extern int pstore_register(struct pstore_info *);
>  extern void pstore_unregister(struct pstore_info *);
> diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h
> index e6d226464838..f01055c6cc40 100644
> --- a/include/linux/pstore_ram.h
> +++ b/include/linux/pstore_ram.h
> @@ -95,6 +95,7 @@ struct ramoops_platform_data {
>  	unsigned int	mem_type;
>  	unsigned long	record_size;
>  	unsigned long	console_size;
> +	unsigned long	event_size;
>  	unsigned long	ftrace_size;
>  	unsigned long	pmsg_size;
>  	int		dump_oops;
Sai Prakash Ranjan Sept. 18, 2018, 6:24 a.m. UTC | #10
On 9/18/2018 4:34 AM, Steven Rostedt wrote:
> On Sun, 16 Sep 2018 12:37:52 +0530
> Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org> wrote:
> 
>> Hi,
>>
>> Anyone here?
> 
> You also just caught me from coming back from a trip. I'm looking at
> your patches now.
> 
> -- Steve
> 

Thanks Steve, I just thought you guys might have missed the patch.
Sai Prakash Ranjan Sept. 18, 2018, 5:52 p.m. UTC | #11
On 9/18/2018 5:04 AM, Steven Rostedt wrote:
> 
> It looks like pstore_event_call() gets called from a trace event. You
> can't call kmalloc() from one. One thing is that kmalloc has
> tracepoints itself. You trace those you just entered an infinite loop.
> 
> 

Ok will remove it in v2. But any alternative way to do this?

>> +
>> +	event_call = fbuffer->trace_file->event_call;
>> +	if (!event_call || !event_call->event.funcs ||
>> +	    !event_call->event.funcs->trace)
>> +		goto fail_event;
>> +
>> +	event = &fbuffer->trace_file->event_call->event;
>> +
>> +	spin_lock_irqsave(&psinfo->buf_lock, flags);
>> +
>> +	trace_seq_init(&iter->seq);
>> +	iter->ent = fbuffer->entry;
> 
> I guess what you are doing is needing to translate the raw data into
> ascii output, and need the trace_iterator to do so.
> 
> You are already under a psinfo->buf_lock. Add a dummy iterator to that
> and use it instead.
> 
> 	trace_seq_init(&psinfo->iter->seq);
> 
>> +	event_call->event.funcs->trace(iter, 0, event);
> 
> 				      (psinfo->iter, 0 , event);
> 
> etc.
> 

Sure, will update in v2.

>> +	trace_seq_putc(&iter->seq, 0);
>> +
>> +	if (seq->size > psinfo->bufsize)
>> +		seq->size = psinfo->bufsize;
>> +
>> +	s = &iter->seq;
>> +	seq = &s->seq;
>> +
>> +	record.buf = (char *)(seq->buffer);
>> +	record.size = seq->len;
>> +	psinfo->write(&record);
>> +
>> +	spin_unlock_irqrestore(&psinfo->buf_lock, flags);
> 
> You may also need to convert these spin_locks into raw_spin_locks as
> when PREEMPT_RT enters the kernel you don't want them to turn into
> mutexes.
> 
> But that can be another patch.
> 

I will change this in v2, but can't we have it in the same patch?

Thanks,
Sai
Steven Rostedt Sept. 18, 2018, 8:44 p.m. UTC | #12
On Tue, 18 Sep 2018 23:22:48 +0530
Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org> wrote:

> On 9/18/2018 5:04 AM, Steven Rostedt wrote:
> > 
> > It looks like pstore_event_call() gets called from a trace event. You
> > can't call kmalloc() from one. One thing is that kmalloc has
> > tracepoints itself. You trace those you just entered an infinite loop.
> > 
> >   
> 
> Ok will remove it in v2. But any alternative way to do this?

I think I describe it below.

> 
> >> +
> >> +	event_call = fbuffer->trace_file->event_call;
> >> +	if (!event_call || !event_call->event.funcs ||
> >> +	    !event_call->event.funcs->trace)
> >> +		goto fail_event;
> >> +
> >> +	event = &fbuffer->trace_file->event_call->event;
> >> +
> >> +	spin_lock_irqsave(&psinfo->buf_lock, flags);
> >> +
> >> +	trace_seq_init(&iter->seq);
> >> +	iter->ent = fbuffer->entry;  
> > 
> > I guess what you are doing is needing to translate the raw data into
> > ascii output, and need the trace_iterator to do so.
> > 
> > You are already under a psinfo->buf_lock. Add a dummy iterator to that
> > and use it instead.
> > 
> > 	trace_seq_init(&psinfo->iter->seq);
> >   
> >> +	event_call->event.funcs->trace(iter, 0, event);  
> > 
> > 				      (psinfo->iter, 0 , event);
> > 
> > etc.
> >   
> 
> Sure, will update in v2.
> 
> >> +	trace_seq_putc(&iter->seq, 0);
> >> +
> >> +	if (seq->size > psinfo->bufsize)
> >> +		seq->size = psinfo->bufsize;
> >> +
> >> +	s = &iter->seq;
> >> +	seq = &s->seq;
> >> +
> >> +	record.buf = (char *)(seq->buffer);
> >> +	record.size = seq->len;
> >> +	psinfo->write(&record);
> >> +
> >> +	spin_unlock_irqrestore(&psinfo->buf_lock, flags);  
> > 
> > You may also need to convert these spin_locks into raw_spin_locks as
> > when PREEMPT_RT enters the kernel you don't want them to turn into
> > mutexes.
> > 
> > But that can be another patch.
> >   
> 
> I will change this in v2, but can't we have it in same patch?

I suggested a separate patch because buf_lock is used elsewhere.
Changing it to "raw_spin_lock" will affect more than just what this
patch series does. Thus, I recommend making it a separate patch to keep
this patch series from being more intrusive than it needs to be.

Thanks!

-- Steve
Sai Prakash Ranjan Sept. 18, 2018, 9:13 p.m. UTC | #13
On 9/19/2018 2:14 AM, Steven Rostedt wrote:
> On Tue, 18 Sep 2018 23:22:48 +0530
> Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org> wrote:
> 
>> On 9/18/2018 5:04 AM, Steven Rostedt wrote:
>>>
>>> It looks like pstore_event_call() gets called from a trace event. You
>>> can't call kmalloc() from one. One thing is that kmalloc has
>>> tracepoints itself. You trace those you just entered an infinite loop.
>>>
>>>    
>>
>> Ok will remove it in v2. But any alternative way to do this?
> 
> I think I describe it below.
> 

Ok got it, will change and post the 2nd version soon.

>>
>>>> +
>>>> +	event_call = fbuffer->trace_file->event_call;
>>>> +	if (!event_call || !event_call->event.funcs ||
>>>> +	    !event_call->event.funcs->trace)
>>>> +		goto fail_event;
>>>> +
>>>> +	event = &fbuffer->trace_file->event_call->event;
>>>> +
>>>> +	spin_lock_irqsave(&psinfo->buf_lock, flags);
>>>> +
>>>> +	trace_seq_init(&iter->seq);
>>>> +	iter->ent = fbuffer->entry;
>>>
>>> I guess what you are doing is needing to translate the raw data into
>>> ascii output, and need the trace_iterator to do so.
>>>
>>> You are already under a psinfo->buf_lock. Add a dummy iterator to that
>>> and use it instead.
>>>
>>> 	trace_seq_init(&psinfo->iter->seq);
>>>    
>>>> +	event_call->event.funcs->trace(iter, 0, event);
>>>
>>> 				      (psinfo->iter, 0 , event);
>>>
>>> etc.
>>>    
>>
>> Sure, will update in v2.
>>
>>>> +	trace_seq_putc(&iter->seq, 0);
>>>> +
>>>> +	if (seq->size > psinfo->bufsize)
>>>> +		seq->size = psinfo->bufsize;
>>>> +
>>>> +	s = &iter->seq;
>>>> +	seq = &s->seq;
>>>> +
>>>> +	record.buf = (char *)(seq->buffer);
>>>> +	record.size = seq->len;
>>>> +	psinfo->write(&record);
>>>> +
>>>> +	spin_unlock_irqrestore(&psinfo->buf_lock, flags);
>>>
>>> You may also need to convert these spin_locks into raw_spin_locks as
>>> when PREEMPT_RT enters the kernel you don't want them to turn into
>>> mutexes.
>>>
>>> But that can be another patch.
>>>    
>>
>> I will change this in v2, but can't we have it in same patch?
> 
> I suggested a separate patch because buf_lock is used elsewhere.
> Changing it to "raw_spin_lock" will affect more than just what this
> patch series does. Thus, I recommend making it a separate patch to keep
> this patch series from being more intrusive than it needs to be.
> 

Sure, thanks a lot.
Sai Prakash Ranjan Sept. 22, 2018, 6:48 a.m. UTC | #14
On 9/19/2018 2:43 AM, Sai Prakash Ranjan wrote:
> On 9/19/2018 2:14 AM, Steven Rostedt wrote:
>> On Tue, 18 Sep 2018 23:22:48 +0530
>> Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org> wrote:
>>
>>> On 9/18/2018 5:04 AM, Steven Rostedt wrote:
>>>>
>>>> It looks like pstore_event_call() gets called from a trace event. You
>>>> can't call kmalloc() from one. One thing is that kmalloc has
>>>> tracepoints itself. You trace those you just entered an infinite loop.
>>>>
>>>
>>> Ok will remove it in v2. But any alternative way to do this?
>>
>> I think I describe it below.
>>
> 
> Ok got it, will change and post the 2nd version soon.
> 

Hi Steven,

Instead of a dummy iterator, can't we have something like below? There 
won't be any infinite loop if we trace kmalloc in this case. This is 
the same as tp_printk.

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 018cbbefb769..271b0573f44a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8644,8 +8644,14 @@ void __init early_trace_init(void)
                         static_key_enable(&tracepoint_printk_key.key);
         }

-       if (tracepoint_pstore)
-               static_key_enable(&tracepoint_pstore_key.key);
+       if (tracepoint_pstore) {
+               tracepoint_pstore_iter =
+                       kmalloc(sizeof(*tracepoint_pstore_iter), 
GFP_KERNEL);
+               if (WARN_ON(!tracepoint_pstore_iter))
+                       tracepoint_pstore = 0;
+               else
+                       static_key_enable(&tracepoint_pstore_key.key);
+       }

         tracer_alloc_buffers();
  }
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index f5263b6fb96f..0534546aef6d 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -73,7 +73,6 @@ void notrace pstore_event_call(struct 
trace_event_buffer *fbuffer)
         struct trace_event *event;
         struct seq_buf *seq;
         unsigned long flags;
-       gfp_t gfpflags;

         if (!psinfo)

         if (!psinfo)
                 return;
@@ -81,20 +80,17 @@ void notrace pstore_event_call(struct 
trace_event_buffer *fbuffer)
         if (unlikely(oops_in_progress))
                 return;

-       pstore_record_init(&record, psinfo);
-       record.type = PSTORE_TYPE_EVENT;
-
-       /* Can be called in atomic context */
-       gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : 
GFP_KERNEL;
-
-       iter = kmalloc(sizeof(*iter), gfpflags);
+       iter = tracepoint_pstore_iter;
         if (!iter)
                 return;

+       pstore_record_init(&record, psinfo);
+       record.type = PSTORE_TYPE_EVENT;
+
         event_call = fbuffer->trace_file->event_call;
         if (!event_call || !event_call->event.funcs ||
             !event_call->event.funcs->trace)
-               goto fail_event;
+               return;

         event = &fbuffer->trace_file->event_call->event;

@@ -116,9 +112,6 @@ void notrace pstore_event_call(struct 
trace_event_buffer *fbuffer)
         psinfo->write(&record);

         spin_unlock_irqrestore(&psinfo->buf_lock, flags);
-
-fail_event:
-       kfree(iter);
  }

Thanks,
Sai
Joel Fernandes Sept. 22, 2018, 9:05 a.m. UTC | #15
On Sat, Sep 8, 2018 at 4:28 PM Sai Prakash Ranjan
<saiprakash.ranjan@codeaurora.org> wrote:
>
> Currently pstore has function trace support which can be
> used to get the function call chain with limited data.
> Event tracing has extra data which is useful to debug wide
> variety of issues and is heavily used across the kernel.
>
[...]
> Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
> ---
>  fs/pstore/Kconfig          |  2 +-
>  fs/pstore/ftrace.c         | 55 ++++++++++++++++++++++++++++++++++++++
>  fs/pstore/inode.c          |  4 +++
>  fs/pstore/ram.c            | 44 +++++++++++++++++++++++++++---
>  include/linux/pstore.h     |  2 ++
>  include/linux/pstore_ram.h |  1 +
>  6 files changed, 104 insertions(+), 4 deletions(-)
>
> diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
> index 503086f7f7c1..6fe087b13a51 100644
> --- a/fs/pstore/Kconfig
> +++ b/fs/pstore/Kconfig
> @@ -126,7 +126,7 @@ config PSTORE_PMSG
>
>  config PSTORE_FTRACE
>         bool "Persistent function tracer"
> -       depends on PSTORE
> +       depends on PSTORE && PSTORE!=m
>         depends on FUNCTION_TRACER
>         depends on DEBUG_FS
>         help
> diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
> index 06aab07b6bb7..d47dc93ac098 100644
> --- a/fs/pstore/ftrace.c
> +++ b/fs/pstore/ftrace.c
> @@ -24,6 +24,8 @@
>  #include <linux/debugfs.h>
>  #include <linux/err.h>
>  #include <linux/cache.h>
> +#include <linux/slab.h>
> +#include <linux/trace_events.h>
>  #include <asm/barrier.h>
>  #include "internal.h"
>
> @@ -62,6 +64,59 @@ static struct ftrace_ops pstore_ftrace_ops __read_mostly = {
>         .func   = pstore_ftrace_call,
>  };
>
> +void notrace pstore_event_call(struct trace_event_buffer *fbuffer)
> +{
> +       struct trace_iterator *iter;
> +       struct trace_seq *s;
> +       struct trace_event_call *event_call;
> +       struct pstore_record record;
> +       struct trace_event *event;
> +       struct seq_buf *seq;
> +       unsigned long flags;
> +
> +       if (!psinfo)
> +               return;
> +
> +       if (unlikely(oops_in_progress))
> +               return;
> +
> +       pstore_record_init(&record, psinfo);
> +       record.type = PSTORE_TYPE_EVENT;
> +
> +       iter = kmalloc(sizeof(*iter), GFP_KERNEL);
> +       if (!iter)
> +               return;
> +
> +       event_call = fbuffer->trace_file->event_call;
> +       if (!event_call || !event_call->event.funcs ||
> +           !event_call->event.funcs->trace)
> +               goto fail_event;
> +
> +       event = &fbuffer->trace_file->event_call->event;
> +
> +       spin_lock_irqsave(&psinfo->buf_lock, flags);

Could you just split the pstore space into a per-cpu event buffer like
we are doing for ftrace-on-pstore? Then you don't need to lock. I fear
the lock contention will be apparent. The pstore code already has
plumbing to split the store buffer per CPU.

Also I think this spinlock can be moved further down.

> +
> +       trace_seq_init(&iter->seq);
> +       iter->ent = fbuffer->entry;
> +       event_call->event.funcs->trace(iter, 0, event);
> +       trace_seq_putc(&iter->seq, 0);

Would it be possible to store the binary trace record in the pstore
buffer instead of outputting text? I suspect that will both be faster
and less space.

> +
> +       if (seq->size > psinfo->bufsize)
> +               seq->size = psinfo->bufsize;
> +
> +       s = &iter->seq;
> +       seq = &s->seq;
> +
> +       record.buf = (char *)(seq->buffer);
> +       record.size = seq->len;
> +       psinfo->write(&record);

thanks,

 - Joel

> +
> +       spin_unlock_irqrestore(&psinfo->buf_lock, flags);
> +
> +fail_event:
> +       kfree(iter);
> +}
> +
>  static DEFINE_MUTEX(pstore_ftrace_lock);
>  static bool pstore_ftrace_enabled;
>
> diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
> index 5fcb845b9fec..f099152abbbd 100644
> --- a/fs/pstore/inode.c
> +++ b/fs/pstore/inode.c
> @@ -345,6 +345,10 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
>                 scnprintf(name, sizeof(name), "console-%s-%llu",
>                           record->psi->name, record->id);
>                 break;
> +       case PSTORE_TYPE_EVENT:
> +               scnprintf(name, sizeof(name), "event-%s-%llu",
> +                         record->psi->name, record->id);
> +               break;
>         case PSTORE_TYPE_FTRACE:
>                 scnprintf(name, sizeof(name), "ftrace-%s-%llu",
>                           record->psi->name, record->id);
> diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
> index bbd1e357c23d..f60d41c0309e 100644
> --- a/fs/pstore/ram.c
> +++ b/fs/pstore/ram.c
> @@ -48,6 +48,10 @@ static ulong ramoops_console_size = MIN_MEM_SIZE;
>  module_param_named(console_size, ramoops_console_size, ulong, 0400);
>  MODULE_PARM_DESC(console_size, "size of kernel console log");
>
> +static ulong ramoops_event_size = MIN_MEM_SIZE;
> +module_param_named(event_size, ramoops_event_size, ulong, 0400);
> +MODULE_PARM_DESC(event_size, "size of event log");
> +
>  static ulong ramoops_ftrace_size = MIN_MEM_SIZE;
>  module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400);
>  MODULE_PARM_DESC(ftrace_size, "size of ftrace log");
> @@ -86,6 +90,7 @@ MODULE_PARM_DESC(ramoops_ecc,
>  struct ramoops_context {
>         struct persistent_ram_zone **dprzs;     /* Oops dump zones */
>         struct persistent_ram_zone *cprz;       /* Console zone */
> +       struct persistent_ram_zone *eprz;       /* Event zone */
>         struct persistent_ram_zone **fprzs;     /* Ftrace zones */
>         struct persistent_ram_zone *mprz;       /* PMSG zone */
>         phys_addr_t phys_addr;
> @@ -93,6 +98,7 @@ struct ramoops_context {
>         unsigned int memtype;
>         size_t record_size;
>         size_t console_size;
> +       size_t event_size;
>         size_t ftrace_size;
>         size_t pmsg_size;
>         int dump_oops;
> @@ -103,6 +109,7 @@ struct ramoops_context {
>         /* _read_cnt need clear on ramoops_pstore_open */
>         unsigned int dump_read_cnt;
>         unsigned int console_read_cnt;
> +       unsigned int event_read_cnt;
>         unsigned int max_ftrace_cnt;
>         unsigned int ftrace_read_cnt;
>         unsigned int pmsg_read_cnt;
> @@ -118,6 +125,7 @@ static int ramoops_pstore_open(struct pstore_info *psi)
>
>         cxt->dump_read_cnt = 0;
>         cxt->console_read_cnt = 0;
> +       cxt->event_read_cnt = 0;
>         cxt->ftrace_read_cnt = 0;
>         cxt->pmsg_read_cnt = 0;
>         return 0;
> @@ -277,6 +285,11 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record)
>                                            1, &record->id, &record->type,
>                                            PSTORE_TYPE_CONSOLE, 0);
>
> +       if (!prz_ok(prz))
> +               prz = ramoops_get_next_prz(&cxt->eprz, &cxt->event_read_cnt,
> +                                          1, &record->id, &record->type,
> +                                          PSTORE_TYPE_EVENT, 0);
> +
>         if (!prz_ok(prz))
>                 prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
>                                            1, &record->id, &record->type,
> @@ -385,6 +398,11 @@ static int notrace ramoops_pstore_write(struct pstore_record *record)
>                         return -ENOMEM;
>                 persistent_ram_write(cxt->cprz, record->buf, record->size);
>                 return 0;
> +       } else if (record->type == PSTORE_TYPE_EVENT) {
> +               if (!cxt->eprz)
> +                       return -ENOMEM;
> +               persistent_ram_write(cxt->eprz, record->buf, record->size);
> +               return 0;
>         } else if (record->type == PSTORE_TYPE_FTRACE) {
>                 int zonenum;
>
> @@ -475,6 +493,9 @@ static int ramoops_pstore_erase(struct pstore_record *record)
>         case PSTORE_TYPE_CONSOLE:
>                 prz = cxt->cprz;
>                 break;
> +       case PSTORE_TYPE_EVENT:
> +               prz = cxt->eprz;
> +               break;
>         case PSTORE_TYPE_FTRACE:
>                 if (record->id >= cxt->max_ftrace_cnt)
>                         return -EINVAL;
> @@ -699,6 +720,7 @@ static int ramoops_parse_dt(struct platform_device *pdev,
>
>         parse_size("record-size", pdata->record_size);
>         parse_size("console-size", pdata->console_size);
> +       parse_size("event-size", pdata->event_size);
>         parse_size("ftrace-size", pdata->ftrace_size);
>         parse_size("pmsg-size", pdata->pmsg_size);
>         parse_size("ecc-size", pdata->ecc_info.ecc_size);
> @@ -747,7 +769,8 @@ static int ramoops_probe(struct platform_device *pdev)
>         }
>
>         if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size &&
> -                       !pdata->ftrace_size && !pdata->pmsg_size)) {
> +                       !pdata->event_size && !pdata->ftrace_size &&
> +                       !pdata->pmsg_size)) {
>                 pr_err("The memory size and the record/console size must be "
>                         "non-zero\n");
>                 goto fail_out;
> @@ -757,6 +780,8 @@ static int ramoops_probe(struct platform_device *pdev)
>                 pdata->record_size = rounddown_pow_of_two(pdata->record_size);
>         if (pdata->console_size && !is_power_of_2(pdata->console_size))
>                 pdata->console_size = rounddown_pow_of_two(pdata->console_size);
> +       if (pdata->event_size && !is_power_of_2(pdata->event_size))
> +               pdata->event_size = rounddown_pow_of_two(pdata->event_size);
>         if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
>                 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
>         if (pdata->pmsg_size && !is_power_of_2(pdata->pmsg_size))
> @@ -767,6 +792,7 @@ static int ramoops_probe(struct platform_device *pdev)
>         cxt->memtype = pdata->mem_type;
>         cxt->record_size = pdata->record_size;
>         cxt->console_size = pdata->console_size;
> +       cxt->event_size = pdata->event_size;
>         cxt->ftrace_size = pdata->ftrace_size;
>         cxt->pmsg_size = pdata->pmsg_size;
>         cxt->dump_oops = pdata->dump_oops;
> @@ -775,8 +801,8 @@ static int ramoops_probe(struct platform_device *pdev)
>
>         paddr = cxt->phys_addr;
>
> -       dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
> -                       - cxt->pmsg_size;
> +       dump_mem_sz = cxt->size - cxt->console_size - cxt->event_size
> +                       - cxt->ftrace_size - cxt->pmsg_size;
>         err = ramoops_init_przs("dump", dev, cxt, &cxt->dprzs, &paddr,
>                                 dump_mem_sz, cxt->record_size,
>                                 &cxt->max_dump_cnt, 0, 0);
> @@ -788,6 +814,11 @@ static int ramoops_probe(struct platform_device *pdev)
>         if (err)
>                 goto fail_init_cprz;
>
> +       err = ramoops_init_prz("event", dev, cxt, &cxt->eprz, &paddr,
> +                              cxt->event_size, 0);
> +       if (err)
> +               goto fail_init_eprz;
> +
>         cxt->max_ftrace_cnt = (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)
>                                 ? nr_cpu_ids
>                                 : 1;
> @@ -825,6 +856,8 @@ static int ramoops_probe(struct platform_device *pdev)
>         cxt->pstore.flags = PSTORE_FLAGS_DMESG;
>         if (cxt->console_size)
>                 cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE;
> +       if (cxt->event_size)
> +               cxt->pstore.flags |= PSTORE_FLAGS_EVENT;
>         if (cxt->ftrace_size)
>                 cxt->pstore.flags |= PSTORE_FLAGS_FTRACE;
>         if (cxt->pmsg_size)
> @@ -845,6 +878,7 @@ static int ramoops_probe(struct platform_device *pdev)
>         record_size = pdata->record_size;
>         dump_oops = pdata->dump_oops;
>         ramoops_console_size = pdata->console_size;
> +       ramoops_event_size = pdata->event_size;
>         ramoops_pmsg_size = pdata->pmsg_size;
>         ramoops_ftrace_size = pdata->ftrace_size;
>
> @@ -858,6 +892,8 @@ static int ramoops_probe(struct platform_device *pdev)
>         kfree(cxt->pstore.buf);
>  fail_clear:
>         cxt->pstore.bufsize = 0;
> +       persistent_ram_free(cxt->eprz);
> +fail_init_eprz:
>         persistent_ram_free(cxt->mprz);
>  fail_init_mprz:
>  fail_init_fprz:
> @@ -877,6 +913,7 @@ static int ramoops_remove(struct platform_device *pdev)
>         kfree(cxt->pstore.buf);
>         cxt->pstore.bufsize = 0;
>
> +       persistent_ram_free(cxt->eprz);
>         persistent_ram_free(cxt->mprz);
>         persistent_ram_free(cxt->cprz);
>         ramoops_free_przs(cxt);
> @@ -916,6 +953,7 @@ static void ramoops_register_dummy(void)
>         dummy_data->mem_type = mem_type;
>         dummy_data->record_size = record_size;
>         dummy_data->console_size = ramoops_console_size;
> +       dummy_data->event_size = ramoops_event_size;
>         dummy_data->ftrace_size = ramoops_ftrace_size;
>         dummy_data->pmsg_size = ramoops_pmsg_size;
>         dummy_data->dump_oops = dump_oops;
> diff --git a/include/linux/pstore.h b/include/linux/pstore.h
> index a15bc4d48752..c31bb9567b4a 100644
> --- a/include/linux/pstore.h
> +++ b/include/linux/pstore.h
> @@ -44,6 +44,7 @@ enum pstore_type_id {
>         PSTORE_TYPE_PPC_COMMON  = 6,
>         PSTORE_TYPE_PMSG        = 7,
>         PSTORE_TYPE_PPC_OPAL    = 8,
> +       PSTORE_TYPE_EVENT       = 9,
>         PSTORE_TYPE_UNKNOWN     = 255
>  };
>
> @@ -193,6 +194,7 @@ struct pstore_info {
>  #define PSTORE_FLAGS_CONSOLE   (1 << 1)
>  #define PSTORE_FLAGS_FTRACE    (1 << 2)
>  #define PSTORE_FLAGS_PMSG      (1 << 3)
> +#define PSTORE_FLAGS_EVENT     (1 << 4)
>
>  extern int pstore_register(struct pstore_info *);
>  extern void pstore_unregister(struct pstore_info *);
> diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h
> index e6d226464838..f01055c6cc40 100644
> --- a/include/linux/pstore_ram.h
> +++ b/include/linux/pstore_ram.h
> @@ -95,6 +95,7 @@ struct ramoops_platform_data {
>         unsigned int    mem_type;
>         unsigned long   record_size;
>         unsigned long   console_size;
> +       unsigned long   event_size;
>         unsigned long   ftrace_size;
>         unsigned long   pmsg_size;
>         int             dump_oops;
> --
> QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
> of Code Aurora Forum, hosted by The Linux Foundation
>
Sai Prakash Ranjan Sept. 22, 2018, 4:37 p.m. UTC | #16
On 9/22/2018 2:35 PM, Joel Fernandes wrote:
> On Sat, Sep 8, 2018 at 4:28 PM Sai Prakash Ranjan
> <saiprakash.ranjan@codeaurora.org> wrote:
>>
> Could you just split the pstore space into a per-cpu event buffer like
> we are doing for ftrace-on-pstore? Then you don't need to lock. I fear
> the lock contention will be apparent. The pstore code already has
> plumbing to split the store buffer per CPU.
> 

Hi Joel,

Thanks a lot for reviewing.

I just looked at the per-cpu buffer for ftrace and it seems the pstore
per-cpu records will need to be merged into one record if we add this
support for events. Merging of ftrace logs is based on timestamp, but
for events we do not have a timestamp field (for this specific reason I
have added a timestamp field for IO event tracing so that at least we
can know the CPU number in pstore output). For example, the sched event
pstore output below has no timestamp field, so how do we merge per-cpu
logs?

# tail /sys/fs/pstore/event-ramoops-0
sched_waking: comm=rcu_sched pid=11 prio=120 target_cpu=002
sched_wakeup: comm=rcu_sched pid=11 prio=120 target_cpu=002

Also Pstore ftrace log format is fixed i.e.,(CPU:%d ts:%llu %08lx  %08lx 
  %pf <- %pF\n"), but different events will have different formats and 
we will not be able to add timestamp field like how pstore ftrace does 
using  pstore_ftrace_write_timestamp() and pstore_ftrace_read_timestamp().

Sorry if I am confusing you, I can explain better I guess.

> Also I think this spinlock can be moved further down.
> 

OK. Something like this would suffice?

{{{
spin_lock_irqsave(&psinfo->buf_lock, flags);

record.buf = (char *)(seq->buffer);
record.size = seq->len;
psinfo->write(&record);

spin_unlock_irqrestore(&psinfo->buf_lock, flags);
}}}

>> +
>> +       trace_seq_init(&iter->seq);
>> +       iter->ent = fbuffer->entry;
>> +       event_call->event.funcs->trace(iter, 0, event);
>> +       trace_seq_putc(&iter->seq, 0);
> 
> Would it be possible to store the binary trace record in the pstore
> buffer instead of outputting text? I suspect that will both be faster
> and less space.
> 

I will try this and come back.

Thanks,
Sai
Sai Prakash Ranjan Sept. 22, 2018, 5:32 p.m. UTC | #17
On 9/22/2018 10:07 PM, Sai Prakash Ranjan wrote:
> On 9/22/2018 2:35 PM, Joel Fernandes wrote:
>> On Sat, Sep 8, 2018 at 4:28 PM Sai Prakash Ranjan
>> <saiprakash.ranjan@codeaurora.org> wrote:
>>>
>> Could you just split the pstore space into a per-cpu event buffer like
>> we are doing for ftrace-on-pstore? Then you don't need to lock. I fear
>> the lock contention will be apparent. The pstore code already has
>> plumbing to split the store buffer per CPU.
>>
> 
> Hi Joel,
> 
> Thanks a lot for reviewing.
> 
> I just looked at per-cpu buffer for ftrace and itseems the pstore percpu 
> records will need to be merged into one record if we add this support 
> for events. Merging of ftrace logs is based on timestamp, but for events 
> we do not have timestamp field (for this specific reason I have added 
> timestamp field for IO event tracing so that atleast we can know the cpu 
> number in pstore output). For example, the sched event pstore output 
> below has no timestamp field, so how do we merge per-cpu logs?
> 
> # tail /sys/fs/pstore/event-ramoops-0
> sched_waking: comm=rcu_sched pid=11 prio=120 target_cpu=002
> sched_wakeup: comm=rcu_sched pid=11 prio=120 target_cpu=002
> 
> Also Pstore ftrace log format is fixed i.e.,(CPU:%d ts:%llu %08lx  %08lx 
>   %pf <- %pF\n"), but different events will have different formats and 
> we will not be able to add timestamp field like how pstore ftrace does 
> using  pstore_ftrace_write_timestamp() and pstore_ftrace_read_timestamp().
> 
> Sorry if I am confusing you, I can explain better I guess.
> 

To add to this on db410c board with the current patch I just measured 
average throughput of dd over 60 seconds with sched event tracing 
enabled (trace_event=sched tp_pstore) : 190 MB/s

# dd if=/dev/zero of=/dev/null status=progress
11402907648 bytes (11 GB, 11 GiB) copied, 60 s, 190 MB/s^C
22430312+0 records in
22430312+0 records out
11484319744 bytes (11 GB, 11 GiB) copied, 60.4277 s, 190 MB/s

It seems fine, or maybe I am testing it incorrectly? Anyway, let me know
if I can test any additional things in different ways.

Thanks,
Sai
Sai Prakash Ranjan Sept. 22, 2018, 5:45 p.m. UTC | #18
On 9/22/2018 10:07 PM, Sai Prakash Ranjan wrote:
> On 9/22/2018 2:35 PM, Joel Fernandes wrote:
>> On Sat, Sep 8, 2018 at 4:28 PM Sai Prakash Ranjan
>> <saiprakash.ranjan@codeaurora.org> wrote:
> 
> 
>> Also I think this spinlock can be moved further down.
>>
> 
> OK. Something like this would suffice?
> 
> {{{
> spin_lock_irqsave(&psinfo->buf_lock, flags);
> 
> record.buf = (char *)(seq->buffer);
> record.size = seq->len;
> psinfo->write(&record);
> 
> spin_unlock_irqrestore(&psinfo->buf_lock, flags);
> }}}
> 
> 

If I move the spinlock further down like above, I get this backtrace:

[    1.367708] console [pstore-1] enabled
[    1.368874] pstore: Registered ramoops as persistent store backend
[    1.372637] ramoops: attached 0x100000@0xbff00000, ecc: 0/0
[    1.389382] WARNING: CPU: 2 PID: 2 at kernel/trace/trace_output.c:289 
trace_raw_output_prep+0x18/0xa0
[    1.389416] Modules linked in:
[    1.397572] CPU: 2 PID: 2 Comm: kthreadd Not tainted 
4.19.0-rc4-00032-g71e30ed20433-dirty #9
[    1.409122] pstate: 80000005 (Nzcv daif -PAN -UAO)
[    1.415804] pc : trace_raw_output_prep+0x18/0xa0
[    1.420404] lr : trace_raw_output_sched_process_fork+0x1c/0x70
[    1.425176] sp : ffff00000803bc00
[    1.430815] x29: ffff00000803bc00 x28: ffff80003ca68d80
[    1.434204] x27: ffff00000810d000 x26: 0000000000000001
[    1.439586] x25: 0000000000000000 x24: ffff80000eb05090
[    1.444881] x23: ffff80003ca68d80 x22: ffff00000803bd58
[    1.450176] x21: ffff000009e63000 x20: ffff80003cbf8988
[    1.455470] x19: ffff80000eb04000 x18: ffffffffffffffff
[    1.460766] x17: 000000000006df2c x16: 0000000000000000
[    1.466061] x15: ffff000009b496c8 x14: 0000000000000040
[    1.471356] x13: ffffffffffffffff x12: 0000000000000000
[    1.476651] x11: ffff80003c49a6a8 x10: 00000000000005d8
[    1.481947] x9 : ffff80003c89f400 x8 : ffff00000803bc60
[    1.487242] x7 : 0000000000000034 x6 : 0000000005d97d4a
[    1.492536] x5 : 00ffffffffffffff x4 : 0000000000000050
[    1.497831] x3 : 0000000000000056 x2 : ffff000009b657a0
[    1.503127] x1 : ffff000009b657c0 x0 : ffff80000eb04000
[    1.508422] Call trace:
[    1.513716]  trace_raw_output_prep+0x18/0xa0
[    1.515890]  pstore_event_call+0xbc/0x130
[    1.520402]  trace_event_buffer_commit+0xf8/0x248
[    1.524308]  trace_event_raw_event_sched_process_fork+0xc0/0xf8
[    1.528999]  _do_fork+0x234/0x418
[    1.534724]  kernel_thread+0x48/0x58
[    1.538196]  kthreadd+0x178/0x1f0
[    1.541842]  ret_from_fork+0x10/0x1c
[    1.545052] ---[ end trace 97d58905d13e7aea ]---
[    1.560654] s1: supplied by regulator-dummy
[    1.560714] s1: Bringing 0uV into 375000-375000uV
Sai Prakash Ranjan Sept. 23, 2018, 3:33 p.m. UTC | #19
On 9/22/2018 10:07 PM, Sai Prakash Ranjan wrote:
> On 9/22/2018 2:35 PM, Joel Fernandes wrote:
>> On Sat, Sep 8, 2018 at 4:28 PM Sai Prakash Ranjan
>> <saiprakash.ranjan@codeaurora.org> wrote:
>>>
>>> +
>>> +       trace_seq_init(&iter->seq);
>>> +       iter->ent = fbuffer->entry;
>>> +       event_call->event.funcs->trace(iter, 0, event);
>>> +       trace_seq_putc(&iter->seq, 0);
>>
>> Would it be possible to store the binary trace record in the pstore
>> buffer instead of outputting text? I suspect that will both be faster
>> and less space.
>>
> 
> I will try this and come back.
> 

Hi Joel,

I removed trace_seq_putc and there is some improvement seen: 203 MB/s

# dd if=/dev/zero of=/dev/null status=progress
12207371264 bytes (12 GB, 11 GiB) copied, 60 s, 203 MB/s^C
24171926+0 records in
24171926+0 records out
12376026112 bytes (12 GB, 12 GiB) copied, 60.8282 s, 203 MB/s

This seems good when compared to 190 MB/s seen previously.
If this is Ok, then I will spin v2 with changes suggested.

Thanks,
Sai
Joel Fernandes Sept. 25, 2018, 8:37 p.m. UTC | #20
On Sun, Sep 23, 2018 at 8:33 AM, Sai Prakash Ranjan
<saiprakash.ranjan@codeaurora.org> wrote:
> On 9/22/2018 10:07 PM, Sai Prakash Ranjan wrote:
>>
>> On 9/22/2018 2:35 PM, Joel Fernandes wrote:
>>>
>>> On Sat, Sep 8, 2018 at 4:28 PM Sai Prakash Ranjan
>>> <saiprakash.ranjan@codeaurora.org> wrote:
>>>>
>>>>
>>>> +
>>>> +       trace_seq_init(&iter->seq);
>>>> +       iter->ent = fbuffer->entry;
>>>> +       event_call->event.funcs->trace(iter, 0, event);
>>>> +       trace_seq_putc(&iter->seq, 0);
>>>
>>>
>>> Would it be possible to store the binary trace record in the pstore
>>> buffer instead of outputting text? I suspect that will both be faster
>>> and less space.
>>>
>>
>> I will try this and come back.
>>
>
> Hi Joel,
>
> I removed trace_seq_putc and there is some improvement seen: 203 MB/s
>
> # dd if=/dev/zero of=/dev/null status=progress
> 12207371264 bytes (12 GB, 11 GiB) copied, 60 s, 203 MB/s^C
> 24171926+0 records in
> 24171926+0 records out
> 12376026112 bytes (12 GB, 12 GiB) copied, 60.8282 s, 203 MB/s
>
> This seems good when compared to 190 MB/s seen previously.
> If this is Ok, then I will spin v2 with changes suggested.

Sorry for slow reply, yes that sounds good and a worthwhile perf improvement.

thanks,

 - Joel
Joel Fernandes Sept. 25, 2018, 8:39 p.m. UTC | #21
On Tue, Sep 25, 2018 at 1:37 PM, Joel Fernandes <joelaf@google.com> wrote:
> On Sun, Sep 23, 2018 at 8:33 AM, Sai Prakash Ranjan
> <saiprakash.ranjan@codeaurora.org> wrote:
>> On 9/22/2018 10:07 PM, Sai Prakash Ranjan wrote:
>>>
>>> On 9/22/2018 2:35 PM, Joel Fernandes wrote:
>>>>
>>>> On Sat, Sep 8, 2018 at 4:28 PM Sai Prakash Ranjan
>>>> <saiprakash.ranjan@codeaurora.org> wrote:
>>>>>
>>>>>
>>>>> +
>>>>> +       trace_seq_init(&iter->seq);
>>>>> +       iter->ent = fbuffer->entry;
>>>>> +       event_call->event.funcs->trace(iter, 0, event);
>>>>> +       trace_seq_putc(&iter->seq, 0);
>>>>
>>>>
>>>> Would it be possible to store the binary trace record in the pstore
>>>> buffer instead of outputting text? I suspect that will both be faster
>>>> and less space.
>>>>
>>>
>>> I will try this and come back.
>>>
>>
>> Hi Joel,
>>
>> I removed trace_seq_putc and there is some improvement seen: 203 MB/s
>>
>> # dd if=/dev/zero of=/dev/null status=progress
>> 12207371264 bytes (12 GB, 11 GiB) copied, 60 s, 203 MB/s^C
>> 24171926+0 records in
>> 24171926+0 records out
>> 12376026112 bytes (12 GB, 12 GiB) copied, 60.8282 s, 203 MB/s
>>
>> This seems good when compared to 190 MB/s seen previously.
>> If this is Ok, then I will spin v2 with changes suggested.
>
> Sorry for slow reply, yes that sounds good and a worthwhile perf improvement.
>

Well so I think you should still not use spinlock to synchronize and
split the buffer. You could expand pstore_record to have a ts field or
introduce a new API like ->write_percpu instead of write, or
something. But I strongly feel you should lock. For ftrace function
trace, the perf reduction with locking was dramatic.

- Joel
Joel Fernandes Sept. 25, 2018, 8:40 p.m. UTC | #22
On Tue, Sep 25, 2018 at 1:39 PM, Joel Fernandes <joelaf@google.com> wrote:
> On Tue, Sep 25, 2018 at 1:37 PM, Joel Fernandes <joelaf@google.com> wrote:
>> On Sun, Sep 23, 2018 at 8:33 AM, Sai Prakash Ranjan
>> <saiprakash.ranjan@codeaurora.org> wrote:
>>> On 9/22/2018 10:07 PM, Sai Prakash Ranjan wrote:
>>>>
>>>> On 9/22/2018 2:35 PM, Joel Fernandes wrote:
>>>>>
>>>>> On Sat, Sep 8, 2018 at 4:28 PM Sai Prakash Ranjan
>>>>> <saiprakash.ranjan@codeaurora.org> wrote:
>>>>>>
>>>>>>
>>>>>> +
>>>>>> +       trace_seq_init(&iter->seq);
>>>>>> +       iter->ent = fbuffer->entry;
>>>>>> +       event_call->event.funcs->trace(iter, 0, event);
>>>>>> +       trace_seq_putc(&iter->seq, 0);
>>>>>
>>>>>
>>>>> Would it be possible to store the binary trace record in the pstore
>>>>> buffer instead of outputting text? I suspect that will both be faster
>>>>> and less space.
>>>>>
>>>>
>>>> I will try this and come back.
>>>>
>>>
>>> Hi Joel,
>>>
>>> I removed trace_seq_putc and there is some improvement seen: 203 MB/s
>>>
>>> # dd if=/dev/zero of=/dev/null status=progress
>>> 12207371264 bytes (12 GB, 11 GiB) copied, 60 s, 203 MB/s^C
>>> 24171926+0 records in
>>> 24171926+0 records out
>>> 12376026112 bytes (12 GB, 12 GiB) copied, 60.8282 s, 203 MB/s
>>>
>>> This seems good when compared to 190 MB/s seen previously.
>>> If this is Ok, then I will spin v2 with changes suggested.
>>
>> Sorry for slow reply, yes that sounds good and a worthwhile perf improvement.
>>
>
> Well so I think you should still not use spinlock to synchronize and
> split the buffer. You could expand pstore_record to have a ts field or
> introduce a new API like ->write_percpu instead of write, or
> something. But I strongly feel you should lock. For ftrace function

Aargh, I meant you should *not* lock :-)
Sai Prakash Ranjan Sept. 26, 2018, 9:52 a.m. UTC | #23
On 9/26/2018 2:10 AM, Joel Fernandes wrote:
> On Tue, Sep 25, 2018 at 1:39 PM, Joel Fernandes <joelaf@google.com> wrote:
>> On Tue, Sep 25, 2018 at 1:37 PM, Joel Fernandes <joelaf@google.com> wrote:
>>> On Sun, Sep 23, 2018 at 8:33 AM, Sai Prakash Ranjan
>>> <saiprakash.ranjan@codeaurora.org> wrote:
>>>> On 9/22/2018 10:07 PM, Sai Prakash Ranjan wrote:
>>>>>
>>>>> On 9/22/2018 2:35 PM, Joel Fernandes wrote:
>>>>>>
>>>>>> On Sat, Sep 8, 2018 at 4:28 PM Sai Prakash Ranjan
>>>>>> <saiprakash.ranjan@codeaurora.org> wrote:
>>>>>>>
>>>>>>>
>>>>>>> +
>>>>>>> +       trace_seq_init(&iter->seq);
>>>>>>> +       iter->ent = fbuffer->entry;
>>>>>>> +       event_call->event.funcs->trace(iter, 0, event);
>>>>>>> +       trace_seq_putc(&iter->seq, 0);
>>>>>>
>>>>>>
>>>>>> Would it be possible to store the binary trace record in the pstore
>>>>>> buffer instead of outputting text? I suspect that will both be faster
>>>>>> and less space.
>>>>>>
>>>>>
>>>>> I will try this and come back.
>>>>>
>>>>
>>>> Hi Joel,
>>>>
>>>> I removed trace_seq_putc and there is some improvement seen: 203 MB/s
>>>>
>>>> # dd if=/dev/zero of=/dev/null status=progress
>>>> 12207371264 bytes (12 GB, 11 GiB) copied, 60 s, 203 MB/s^C
>>>> 24171926+0 records in
>>>> 24171926+0 records out
>>>> 12376026112 bytes (12 GB, 12 GiB) copied, 60.8282 s, 203 MB/s
>>>>
>>>> This seems good when compared to 190 MB/s seen previously.
>>>> If this is Ok, then I will spin v2 with changes suggested.
>>>
>>> Sorry for slow reply, yes that sounds good and a worthwhile perf improvement.
>>>
>>
>> Well so I think you should still not use spinlock to synchronize and
>> split the buffer. You could expand pstore_record to have a ts field or
>> introduce a new API like ->write_percpu instead of write, or
>> something. But I strongly feel you should lock. For ftrace function
> 
> Aargh, I meant you should *not* lock :-)
> 

OK I can try this and will measure some perf difference.

BTW, I guess you missed my previous comment about not being able to
combine logs based on timestamp? Anyway, I think if I add some extra ts
field, then we should be able to do it.

Thanks,
Sai
diff mbox series

Patch

diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 503086f7f7c1..6fe087b13a51 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -126,7 +126,7 @@  config PSTORE_PMSG
 
 config PSTORE_FTRACE
 	bool "Persistent function tracer"
-	depends on PSTORE
+	depends on PSTORE && PSTORE!=m
 	depends on FUNCTION_TRACER
 	depends on DEBUG_FS
 	help
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 06aab07b6bb7..d47dc93ac098 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -24,6 +24,8 @@ 
 #include <linux/debugfs.h>
 #include <linux/err.h>
 #include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/trace_events.h>
 #include <asm/barrier.h>
 #include "internal.h"
 
@@ -62,6 +64,59 @@  static struct ftrace_ops pstore_ftrace_ops __read_mostly = {
 	.func	= pstore_ftrace_call,
 };
 
+/*
+ * pstore_event_call - persist one trace event record via pstore
+ * @fbuffer: event buffer handed in by trace_event_buffer_commit()
+ *
+ * Formats the event through its ->trace() output callback into the
+ * iterator's trace_seq and writes the resulting text as a
+ * PSTORE_TYPE_EVENT record.  May be invoked from atomic/IRQ context,
+ * so only non-sleeping allocations are used here.
+ */
+void notrace pstore_event_call(struct trace_event_buffer *fbuffer)
+{
+	struct trace_iterator *iter;
+	struct trace_seq *s;
+	struct trace_event_call *event_call;
+	struct pstore_record record;
+	struct trace_event *event;
+	struct seq_buf *seq;
+	unsigned long flags;
+
+	if (!psinfo)
+		return;
+
+	/* Don't compete with the oops/panic path for the pstore buffer. */
+	if (unlikely(oops_in_progress))
+		return;
+
+	pstore_record_init(&record, psinfo);
+	record.type = PSTORE_TYPE_EVENT;
+
+	/*
+	 * kzalloc, not kmalloc: the event output callbacks inspect
+	 * iterator fields (e.g. iter->tr) and WARN on garbage values.
+	 * GFP_ATOMIC, not GFP_KERNEL: event commit can happen in atomic
+	 * context where sleeping allocations are forbidden.
+	 */
+	iter = kzalloc(sizeof(*iter), GFP_ATOMIC);
+	if (!iter)
+		return;
+
+	event_call = fbuffer->trace_file->event_call;
+	if (!event_call || !event_call->event.funcs ||
+	    !event_call->event.funcs->trace)
+		goto fail_event;
+
+	event = &fbuffer->trace_file->event_call->event;
+
+	spin_lock_irqsave(&psinfo->buf_lock, flags);
+
+	trace_seq_init(&iter->seq);
+	iter->ent = fbuffer->entry;
+	event_call->event.funcs->trace(iter, 0, event);
+	trace_seq_putc(&iter->seq, 0);
+
+	/*
+	 * Point seq at the iterator's seq_buf *before* touching it;
+	 * it must not be dereferenced while still uninitialized.
+	 */
+	s = &iter->seq;
+	seq = &s->seq;
+
+	/* Cap the record at the back-end's buffer size. */
+	if (seq->len > psinfo->bufsize)
+		seq->len = psinfo->bufsize;
+
+	record.buf = (char *)(seq->buffer);
+	record.size = seq->len;
+	psinfo->write(&record);
+
+	spin_unlock_irqrestore(&psinfo->buf_lock, flags);
+
+fail_event:
+	kfree(iter);
+}
+
 static DEFINE_MUTEX(pstore_ftrace_lock);
 static bool pstore_ftrace_enabled;
 
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 5fcb845b9fec..f099152abbbd 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -345,6 +345,10 @@  int pstore_mkfile(struct dentry *root, struct pstore_record *record)
 		scnprintf(name, sizeof(name), "console-%s-%llu",
 			  record->psi->name, record->id);
 		break;
+	case PSTORE_TYPE_EVENT:
+		scnprintf(name, sizeof(name), "event-%s-%llu",
+			  record->psi->name, record->id);
+		break;
 	case PSTORE_TYPE_FTRACE:
 		scnprintf(name, sizeof(name), "ftrace-%s-%llu",
 			  record->psi->name, record->id);
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index bbd1e357c23d..f60d41c0309e 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -48,6 +48,10 @@  static ulong ramoops_console_size = MIN_MEM_SIZE;
 module_param_named(console_size, ramoops_console_size, ulong, 0400);
 MODULE_PARM_DESC(console_size, "size of kernel console log");
 
+static ulong ramoops_event_size = MIN_MEM_SIZE;
+module_param_named(event_size, ramoops_event_size, ulong, 0400);
+MODULE_PARM_DESC(event_size, "size of event log");
+
 static ulong ramoops_ftrace_size = MIN_MEM_SIZE;
 module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400);
 MODULE_PARM_DESC(ftrace_size, "size of ftrace log");
@@ -86,6 +90,7 @@  MODULE_PARM_DESC(ramoops_ecc,
 struct ramoops_context {
 	struct persistent_ram_zone **dprzs;	/* Oops dump zones */
 	struct persistent_ram_zone *cprz;	/* Console zone */
+	struct persistent_ram_zone *eprz;       /* Event zone */
 	struct persistent_ram_zone **fprzs;	/* Ftrace zones */
 	struct persistent_ram_zone *mprz;	/* PMSG zone */
 	phys_addr_t phys_addr;
@@ -93,6 +98,7 @@  struct ramoops_context {
 	unsigned int memtype;
 	size_t record_size;
 	size_t console_size;
+	size_t event_size;
 	size_t ftrace_size;
 	size_t pmsg_size;
 	int dump_oops;
@@ -103,6 +109,7 @@  struct ramoops_context {
 	/* _read_cnt need clear on ramoops_pstore_open */
 	unsigned int dump_read_cnt;
 	unsigned int console_read_cnt;
+	unsigned int event_read_cnt;
 	unsigned int max_ftrace_cnt;
 	unsigned int ftrace_read_cnt;
 	unsigned int pmsg_read_cnt;
@@ -118,6 +125,7 @@  static int ramoops_pstore_open(struct pstore_info *psi)
 
 	cxt->dump_read_cnt = 0;
 	cxt->console_read_cnt = 0;
+	cxt->event_read_cnt = 0;
 	cxt->ftrace_read_cnt = 0;
 	cxt->pmsg_read_cnt = 0;
 	return 0;
@@ -277,6 +285,11 @@  static ssize_t ramoops_pstore_read(struct pstore_record *record)
 					   1, &record->id, &record->type,
 					   PSTORE_TYPE_CONSOLE, 0);
 
+	if (!prz_ok(prz))
+		prz = ramoops_get_next_prz(&cxt->eprz, &cxt->event_read_cnt,
+					   1, &record->id, &record->type,
+					   PSTORE_TYPE_EVENT, 0);
+
 	if (!prz_ok(prz))
 		prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
 					   1, &record->id, &record->type,
@@ -385,6 +398,11 @@  static int notrace ramoops_pstore_write(struct pstore_record *record)
 			return -ENOMEM;
 		persistent_ram_write(cxt->cprz, record->buf, record->size);
 		return 0;
+	} else if (record->type == PSTORE_TYPE_EVENT) {
+		if (!cxt->eprz)
+			return -ENOMEM;
+		persistent_ram_write(cxt->eprz, record->buf, record->size);
+		return 0;
 	} else if (record->type == PSTORE_TYPE_FTRACE) {
 		int zonenum;
 
@@ -475,6 +493,9 @@  static int ramoops_pstore_erase(struct pstore_record *record)
 	case PSTORE_TYPE_CONSOLE:
 		prz = cxt->cprz;
 		break;
+	case PSTORE_TYPE_EVENT:
+		prz = cxt->eprz;
+		break;
 	case PSTORE_TYPE_FTRACE:
 		if (record->id >= cxt->max_ftrace_cnt)
 			return -EINVAL;
@@ -699,6 +720,7 @@  static int ramoops_parse_dt(struct platform_device *pdev,
 
 	parse_size("record-size", pdata->record_size);
 	parse_size("console-size", pdata->console_size);
+	parse_size("event-size", pdata->event_size);
 	parse_size("ftrace-size", pdata->ftrace_size);
 	parse_size("pmsg-size", pdata->pmsg_size);
 	parse_size("ecc-size", pdata->ecc_info.ecc_size);
@@ -747,7 +769,8 @@  static int ramoops_probe(struct platform_device *pdev)
 	}
 
 	if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size &&
-			!pdata->ftrace_size && !pdata->pmsg_size)) {
+			!pdata->event_size && !pdata->ftrace_size &&
+			!pdata->pmsg_size)) {
 		pr_err("The memory size and the record/console size must be "
 			"non-zero\n");
 		goto fail_out;
@@ -757,6 +780,8 @@  static int ramoops_probe(struct platform_device *pdev)
 		pdata->record_size = rounddown_pow_of_two(pdata->record_size);
 	if (pdata->console_size && !is_power_of_2(pdata->console_size))
 		pdata->console_size = rounddown_pow_of_two(pdata->console_size);
+	if (pdata->event_size && !is_power_of_2(pdata->event_size))
+		pdata->event_size = rounddown_pow_of_two(pdata->event_size);
 	if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
 		pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
 	if (pdata->pmsg_size && !is_power_of_2(pdata->pmsg_size))
@@ -767,6 +792,7 @@  static int ramoops_probe(struct platform_device *pdev)
 	cxt->memtype = pdata->mem_type;
 	cxt->record_size = pdata->record_size;
 	cxt->console_size = pdata->console_size;
+	cxt->event_size = pdata->event_size;
 	cxt->ftrace_size = pdata->ftrace_size;
 	cxt->pmsg_size = pdata->pmsg_size;
 	cxt->dump_oops = pdata->dump_oops;
@@ -775,8 +801,8 @@  static int ramoops_probe(struct platform_device *pdev)
 
 	paddr = cxt->phys_addr;
 
-	dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
-			- cxt->pmsg_size;
+	dump_mem_sz = cxt->size - cxt->console_size - cxt->event_size
+			- cxt->ftrace_size - cxt->pmsg_size;
 	err = ramoops_init_przs("dump", dev, cxt, &cxt->dprzs, &paddr,
 				dump_mem_sz, cxt->record_size,
 				&cxt->max_dump_cnt, 0, 0);
@@ -788,6 +814,11 @@  static int ramoops_probe(struct platform_device *pdev)
 	if (err)
 		goto fail_init_cprz;
 
+	err = ramoops_init_prz("event", dev, cxt, &cxt->eprz, &paddr,
+			       cxt->event_size, 0);
+	if (err)
+		goto fail_init_eprz;
+
 	cxt->max_ftrace_cnt = (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)
 				? nr_cpu_ids
 				: 1;
@@ -825,6 +856,8 @@  static int ramoops_probe(struct platform_device *pdev)
 	cxt->pstore.flags = PSTORE_FLAGS_DMESG;
 	if (cxt->console_size)
 		cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE;
+	if (cxt->event_size)
+		cxt->pstore.flags |= PSTORE_FLAGS_EVENT;
 	if (cxt->ftrace_size)
 		cxt->pstore.flags |= PSTORE_FLAGS_FTRACE;
 	if (cxt->pmsg_size)
@@ -845,6 +878,7 @@  static int ramoops_probe(struct platform_device *pdev)
 	record_size = pdata->record_size;
 	dump_oops = pdata->dump_oops;
 	ramoops_console_size = pdata->console_size;
+	ramoops_event_size = pdata->event_size;
 	ramoops_pmsg_size = pdata->pmsg_size;
 	ramoops_ftrace_size = pdata->ftrace_size;
 
@@ -858,6 +892,8 @@  static int ramoops_probe(struct platform_device *pdev)
 	kfree(cxt->pstore.buf);
 fail_clear:
 	cxt->pstore.bufsize = 0;
+	persistent_ram_free(cxt->eprz);
+fail_init_eprz:
 	persistent_ram_free(cxt->mprz);
 fail_init_mprz:
 fail_init_fprz:
@@ -877,6 +913,7 @@  static int ramoops_remove(struct platform_device *pdev)
 	kfree(cxt->pstore.buf);
 	cxt->pstore.bufsize = 0;
 
+	persistent_ram_free(cxt->eprz);
 	persistent_ram_free(cxt->mprz);
 	persistent_ram_free(cxt->cprz);
 	ramoops_free_przs(cxt);
@@ -916,6 +953,7 @@  static void ramoops_register_dummy(void)
 	dummy_data->mem_type = mem_type;
 	dummy_data->record_size = record_size;
 	dummy_data->console_size = ramoops_console_size;
+	dummy_data->event_size = ramoops_event_size;
 	dummy_data->ftrace_size = ramoops_ftrace_size;
 	dummy_data->pmsg_size = ramoops_pmsg_size;
 	dummy_data->dump_oops = dump_oops;
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index a15bc4d48752..c31bb9567b4a 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -44,6 +44,7 @@  enum pstore_type_id {
 	PSTORE_TYPE_PPC_COMMON	= 6,
 	PSTORE_TYPE_PMSG	= 7,
 	PSTORE_TYPE_PPC_OPAL	= 8,
+	PSTORE_TYPE_EVENT	= 9,
 	PSTORE_TYPE_UNKNOWN	= 255
 };
 
@@ -193,6 +194,7 @@  struct pstore_info {
 #define PSTORE_FLAGS_CONSOLE	(1 << 1)
 #define PSTORE_FLAGS_FTRACE	(1 << 2)
 #define PSTORE_FLAGS_PMSG	(1 << 3)
+#define PSTORE_FLAGS_EVENT	(1 << 4)
 
 extern int pstore_register(struct pstore_info *);
 extern void pstore_unregister(struct pstore_info *);
diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h
index e6d226464838..f01055c6cc40 100644
--- a/include/linux/pstore_ram.h
+++ b/include/linux/pstore_ram.h
@@ -95,6 +95,7 @@  struct ramoops_platform_data {
 	unsigned int	mem_type;
 	unsigned long	record_size;
 	unsigned long	console_size;
+	unsigned long	event_size;
 	unsigned long	ftrace_size;
 	unsigned long	pmsg_size;
 	int		dump_oops;