[v2] mm: emit tracepoint when RSS changes by threshold

Message ID 20190903200905.198642-1-joel@joelfernandes.org (mailing list archive)
State New, archived
Series [v2] mm: emit tracepoint when RSS changes by threshold

Commit Message

Joel Fernandes Sept. 3, 2019, 8:09 p.m. UTC
Useful to track how RSS is changing per TGID to detect spikes in RSS and
memory hogs. Several Android teams have been using this patch in various
kernel trees for half a year now. Many reported to me it is really
useful so I'm posting it upstream.

Initial patch developed by Tim Murray. Changes I made from original patch:
o Prevent any additional space consumed by mm_struct.
o Keep overhead low by checking if tracing is enabled.
o Add some noise reduction and lower overhead by emitting only on
  threshold changes.
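
As an illustration, with the event enabled the output in the trace looks
along these lines (values here are made up, not from a real capture):

  <...>-1234  [002] ....  100.000000: rss_stat: member=1 size=4587520B

where member is the rss_stat counter index (for example, 1 is
MM_ANONPAGES) and size is that counter in bytes, quantized by the emit
threshold.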

Co-developed-by: Tim Murray <timmurray@google.com>
Signed-off-by: Tim Murray <timmurray@google.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>

---

v1->v2: Added more commit message.

Cc: carmenjackson@google.com
Cc: mayankgupta@google.com
Cc: dancol@google.com
Cc: rostedt@goodmis.org
Cc: minchan@kernel.org
Cc: akpm@linux-foundation.org
Cc: kernel-team@android.com

 include/linux/mm.h          | 14 +++++++++++---
 include/trace/events/kmem.h | 21 +++++++++++++++++++++
 mm/memory.c                 | 20 ++++++++++++++++++++
 3 files changed, 52 insertions(+), 3 deletions(-)

Comments

Suren Baghdasaryan Sept. 4, 2019, 4:44 a.m. UTC | #1
On Tue, Sep 3, 2019 at 1:09 PM Joel Fernandes (Google)
<joel@joelfernandes.org> wrote:
>
> Useful to track how RSS is changing per TGID to detect spikes in RSS and
> memory hogs. Several Android teams have been using this patch in various
> kernel trees for half a year now. Many reported to me it is really
> useful so I'm posting it upstream.
>
> Initial patch developed by Tim Murray. Changes I made from original patch:
> o Prevent any additional space consumed by mm_struct.
> o Keep overhead low by checking if tracing is enabled.
> o Add some noise reduction and lower overhead by emitting only on
>   threshold changes.
>
> Co-developed-by: Tim Murray <timmurray@google.com>
> Signed-off-by: Tim Murray <timmurray@google.com>
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
>
> ---
>
> v1->v2: Added more commit message.
>
> Cc: carmenjackson@google.com
> Cc: mayankgupta@google.com
> Cc: dancol@google.com
> Cc: rostedt@goodmis.org
> Cc: minchan@kernel.org
> Cc: akpm@linux-foundation.org
> Cc: kernel-team@android.com
>
>  include/linux/mm.h          | 14 +++++++++++---
>  include/trace/events/kmem.h | 21 +++++++++++++++++++++
>  mm/memory.c                 | 20 ++++++++++++++++++++
>  3 files changed, 52 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 0334ca97c584..823aaf759bdb 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1671,19 +1671,27 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
>         return (unsigned long)val;
>  }
>
> +void mm_trace_rss_stat(int member, long count, long value);
> +
>  static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
>  {
> -       atomic_long_add(value, &mm->rss_stat.count[member]);
> +       long count = atomic_long_add_return(value, &mm->rss_stat.count[member]);
> +
> +       mm_trace_rss_stat(member, count, value);
>  }
>
>  static inline void inc_mm_counter(struct mm_struct *mm, int member)
>  {
> -       atomic_long_inc(&mm->rss_stat.count[member]);
> +       long count = atomic_long_inc_return(&mm->rss_stat.count[member]);
> +
> +       mm_trace_rss_stat(member, count, 1);
>  }
>
>  static inline void dec_mm_counter(struct mm_struct *mm, int member)
>  {
> -       atomic_long_dec(&mm->rss_stat.count[member]);
> +       long count = atomic_long_dec_return(&mm->rss_stat.count[member]);
> +
> +       mm_trace_rss_stat(member, count, -1);
>  }
>
>  /* Optimized variant when page is already known not to be PageAnon */
> diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
> index eb57e3037deb..8b88e04fafbf 100644
> --- a/include/trace/events/kmem.h
> +++ b/include/trace/events/kmem.h
> @@ -315,6 +315,27 @@ TRACE_EVENT(mm_page_alloc_extfrag,
>                 __entry->change_ownership)
>  );
>
> +TRACE_EVENT(rss_stat,
> +
> +       TP_PROTO(int member,
> +               long count),
> +
> +       TP_ARGS(member, count),
> +
> +       TP_STRUCT__entry(
> +               __field(int, member)
> +               __field(long, size)
> +       ),
> +
> +       TP_fast_assign(
> +               __entry->member = member;
> +               __entry->size = (count << PAGE_SHIFT);
> +       ),
> +
> +       TP_printk("member=%d size=%ldB",
> +               __entry->member,
> +               __entry->size)
> +       );
>  #endif /* _TRACE_KMEM_H */
>
>  /* This part must be outside protection */
> diff --git a/mm/memory.c b/mm/memory.c
> index e2bb51b6242e..9d81322c24a3 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -72,6 +72,8 @@
>  #include <linux/oom.h>
>  #include <linux/numa.h>
>
> +#include <trace/events/kmem.h>
> +
>  #include <asm/io.h>
>  #include <asm/mmu_context.h>
>  #include <asm/pgalloc.h>
> @@ -140,6 +142,24 @@ static int __init init_zero_pfn(void)
>  }
>  core_initcall(init_zero_pfn);
>
> +/*
> + * This threshold is the boundary in the value space, that the counter has to
> + * advance before we trace it. Should be a power of 2. It is to reduce unwanted
> + * trace overhead. The counter is in units of number of pages.
> + */
> +#define TRACE_MM_COUNTER_THRESHOLD 128

IIUC the counter has to change by 128 pages (512kB assuming 4kB pages)
before the change gets traced. Would it make sense to make this step
size configurable? For a system with limited memory, a change of
512kB might be considerable, while on systems with plenty of memory
it might be negligible. Not to mention possible differences in
page sizes. Maybe something like
/sys/kernel/debug/tracing/rss_step_order with
TRACE_MM_COUNTER_THRESHOLD=(1<<rss_step_order)?
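
A rough sketch of what I mean, purely illustrative (the knob and its
wiring below are hypothetical, not an existing interface):

	/* rss_step_order would be set through a hypothetical tracefs knob. */
	static unsigned int rss_step_order = 7;	/* 1 << 7 == 128 pages */

	void mm_trace_rss_stat(int member, long count, long value)
	{
		long thresh_mask = ~((1L << READ_ONCE(rss_step_order)) - 1);

		if (!trace_rss_stat_enabled())
			return;

		if ((count & thresh_mask) != ((count - value) & thresh_mask))
			trace_rss_stat(member, count);
	}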

> +
> +void mm_trace_rss_stat(int member, long count, long value)
> +{
> +       long thresh_mask = ~(TRACE_MM_COUNTER_THRESHOLD - 1);
> +
> +       if (!trace_rss_stat_enabled())
> +               return;
> +
> +       /* Threshold roll-over, trace it */
> +       if ((count & thresh_mask) != ((count - value) & thresh_mask))
> +               trace_rss_stat(member, count);
> +}
>
>  #if defined(SPLIT_RSS_COUNTING)
>
> --
> 2.23.0.187.g17f5b7556c-goog
Daniel Colascione Sept. 4, 2019, 4:51 a.m. UTC | #2
On Tue, Sep 3, 2019 at 9:45 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Tue, Sep 3, 2019 at 1:09 PM Joel Fernandes (Google)
> <joel@joelfernandes.org> wrote:
> >
> > Useful to track how RSS is changing per TGID to detect spikes in RSS and
> > memory hogs. Several Android teams have been using this patch in various
> > kernel trees for half a year now. Many reported to me it is really
> > useful so I'm posting it upstream.

It's also worth being able to turn off the per-task memory counter
caching, otherwise you'll have two levels of batching before the
counter gets updated, IIUC.
Joel Fernandes Sept. 4, 2019, 5:02 a.m. UTC | #3
On Tue, Sep 03, 2019 at 09:44:51PM -0700, Suren Baghdasaryan wrote:
> On Tue, Sep 3, 2019 at 1:09 PM Joel Fernandes (Google)
> <joel@joelfernandes.org> wrote:
> >
> > Useful to track how RSS is changing per TGID to detect spikes in RSS and
> > memory hogs. Several Android teams have been using this patch in various
> > kernel trees for half a year now. Many reported to me it is really
> > useful so I'm posting it upstream.
> >
> > [ ... full patch trimmed; the hunk under discussion follows ... ]
> >
> > +/*
> > + * This threshold is the boundary in the value space, that the counter has to
> > + * advance before we trace it. Should be a power of 2. It is to reduce unwanted
> > + * trace overhead. The counter is in units of number of pages.
> > + */
> > +#define TRACE_MM_COUNTER_THRESHOLD 128
> 
> IIUC the counter has to change by 128 pages (512kB assuming 4kB pages)
> before the change gets traced. Would it make sense to make this step
> size configurable? For a system with limited memory, a change of
> 512kB might be considerable, while on systems with plenty of memory
> it might be negligible. Not to mention possible differences in
> page sizes. Maybe something like
> /sys/kernel/debug/tracing/rss_step_order with
> TRACE_MM_COUNTER_THRESHOLD=(1<<rss_step_order)?

I would not want to complicate this more, to be honest. It is already a bit
complex, and I am not sure about the win in making it as configurable as you
seem to want. The "threshold" thing is just a slight improvement; it is not
aiming to be optimal. If this granularity becomes an issue in your tracing,
we can revisit it then.

thanks,

 - Joel



Joel Fernandes Sept. 4, 2019, 5:15 a.m. UTC | #4
On Tue, Sep 03, 2019 at 09:51:20PM -0700, Daniel Colascione wrote:
> On Tue, Sep 3, 2019 at 9:45 PM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Tue, Sep 3, 2019 at 1:09 PM Joel Fernandes (Google)
> > <joel@joelfernandes.org> wrote:
> > >
> > > Useful to track how RSS is changing per TGID to detect spikes in RSS and
> > > memory hogs. Several Android teams have been using this patch in various
> > > kernel trees for half a year now. Many reported to me it is really
> > > useful so I'm posting it upstream.
> 
> It's also worth being able to turn off the per-task memory counter
> caching, otherwise you'll have two levels of batching before the
> counter gets updated, IIUC.

I prefer to keep split RSS accounting turned on if it is available. I think
discussing split RSS accounting is a bit out of scope of this patch as well.
Any improvements on that front can be a follow-up.

Curious, has split RSS accounting shown you any issue with this patch?

thanks,

 - Joel
Suren Baghdasaryan Sept. 4, 2019, 5:38 a.m. UTC | #5
On Tue, Sep 3, 2019 at 10:02 PM Joel Fernandes <joel@joelfernandes.org> wrote:
>
> On Tue, Sep 03, 2019 at 09:44:51PM -0700, Suren Baghdasaryan wrote:
> > On Tue, Sep 3, 2019 at 1:09 PM Joel Fernandes (Google)
> > <joel@joelfernandes.org> wrote:
> > >
> > > Useful to track how RSS is changing per TGID to detect spikes in RSS and
> > > memory hogs. Several Android teams have been using this patch in various
> > > kernel trees for half a year now. Many reported to me it is really
> > > useful so I'm posting it upstream.
> > >
> > > [ ... full patch trimmed; the hunk under discussion follows ... ]
> > >
> > > +/*
> > > + * This threshold is the boundary in the value space, that the counter has to
> > > + * advance before we trace it. Should be a power of 2. It is to reduce unwanted
> > > + * trace overhead. The counter is in units of number of pages.
> > > + */
> > > +#define TRACE_MM_COUNTER_THRESHOLD 128
> >
> > IIUC the counter has to change by 128 pages (512kB assuming 4kB pages)
> > before the change gets traced. Would it make sense to make this step
> > size configurable? For a system with limited memory, a change of
> > 512kB might be considerable, while on systems with plenty of memory
> > it might be negligible. Not to mention possible differences in
> > page sizes. Maybe something like
> > /sys/kernel/debug/tracing/rss_step_order with
> > TRACE_MM_COUNTER_THRESHOLD=(1<<rss_step_order)?
>
> I would not want to complicate this more, to be honest. It is already a bit
> complex, and I am not sure about the win in making it as configurable as you
> seem to want. The "threshold" thing is just a slight improvement; it is not
> aiming to be optimal. If this granularity becomes an issue in your tracing,
> we can revisit it then.

I guess that can be done as a separate patch later on if necessary.

Daniel Colascione Sept. 4, 2019, 5:42 a.m. UTC | #6
On Tue, Sep 3, 2019 at 10:15 PM Joel Fernandes <joel@joelfernandes.org> wrote:
>
> On Tue, Sep 03, 2019 at 09:51:20PM -0700, Daniel Colascione wrote:
> > On Tue, Sep 3, 2019 at 9:45 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Tue, Sep 3, 2019 at 1:09 PM Joel Fernandes (Google)
> > > <joel@joelfernandes.org> wrote:
> > > >
> > > > Useful to track how RSS is changing per TGID to detect spikes in RSS and
> > > > memory hogs. Several Android teams have been using this patch in various
> > > > kernel trees for half a year now. Many reported to me it is really
> > > > useful so I'm posting it upstream.
> >
> > It's also worth being able to turn off the per-task memory counter
> > caching, otherwise you'll have two levels of batching before the
> > counter gets updated, IIUC.
>
> I prefer to keep split RSS accounting turned on if it is available.

Why? AFAIK, nobody's produced numbers showing that split accounting
has a real benefit.

> I think
> discussing split RSS accounting is a bit out of scope of this patch as well.

It's in-scope, because with split RSS accounting, allocated memory can
stay accumulated in task structs for an indefinite time without being
flushed to the mm. As a result, if you take the stream of virtual
memory management system calls that a program makes on one hand, and VM
counter values on the other, the two don't add up. For various kinds
of robustness (trace self-checking, say) it's important that various
sources of data add up.

If we're adding a configuration knob that controls how often VM
counters get reflected in system trace points, we should also have a
knob to control delayed VM counter operations. The whole point is for
users to be able to specify how precisely they want VM counter changes
reported to analysis tools.

> Any improvements on that front can be a follow-up.
>
> Curious, has split RSS accounting shown you any issue with this patch?

Split accounting has been a source of confusion for a while now: it
causes that numbers-don't-add-up problem even when sampling from
procfs instead of reading memory tracepoint data.
Michal Hocko Sept. 4, 2019, 8:45 a.m. UTC | #7
On Tue 03-09-19 16:09:05, Joel Fernandes (Google) wrote:
> Useful to track how RSS is changing per TGID to detect spikes in RSS and
> memory hogs. Several Android teams have been using this patch in various
> kernel trees for half a year now. Many reported to me it is really
> useful so I'm posting it upstream.
> 
> Initial patch developed by Tim Murray. Changes I made from original patch:
> o Prevent any additional space consumed by mm_struct.
> o Keep overhead low by checking if tracing is enabled.
> o Add some noise reduction and lower overhead by emitting only on
>   threshold changes.

Does this have any pre-requisite? I do not see trace_rss_stat_enabled in
the Linus tree (nor in linux-next). Besides that, why do we need batching
in the first place? Does this have a measurable overhead? How does it
differ from any other tracepoints that we have in other hotpaths (e.g.
page allocator doesn't do any checks).

Other than that this looks reasonable to me.

Joel Fernandes Sept. 4, 2019, 2:59 p.m. UTC | #8
On Tue, Sep 03, 2019 at 10:42:53PM -0700, Daniel Colascione wrote:
> On Tue, Sep 3, 2019 at 10:15 PM Joel Fernandes <joel@joelfernandes.org> wrote:
> >
> > On Tue, Sep 03, 2019 at 09:51:20PM -0700, Daniel Colascione wrote:
> > > On Tue, Sep 3, 2019 at 9:45 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > >
> > > > On Tue, Sep 3, 2019 at 1:09 PM Joel Fernandes (Google)
> > > > <joel@joelfernandes.org> wrote:
> > > > >
> > > > > Useful to track how RSS is changing per TGID to detect spikes in RSS and
> > > > > memory hogs. Several Android teams have been using this patch in various
> > > > > kernel trees for half a year now. Many reported to me it is really
> > > > > useful so I'm posting it upstream.
> > >
> > > It's also worth being able to turn off the per-task memory counter
> > > caching, otherwise you'll have two levels of batching before the
> > > counter gets updated, IIUC.
> >
> > I prefer to keep split RSS accounting turned on if it is available.
> 
> Why? AFAIK, nobody's produced numbers showing that split accounting
> has a real benefit.

I am not too sure. Have you checked the original patches that added this
stuff though? It seems to me the main win would be on big systems that have
to pay for atomic updates.

> > I think
> > discussing split RSS accounting is a bit out of scope of this patch as well.
> 
> It's in-scope, because with split RSS accounting, allocated memory can
> stay accumulated in task structs for an indefinite time without being
> flushed to the mm. As a result, if you take the stream of virtual
> memory management system calls that a program makes on one hand, and VM
> counter values on the other, the two don't add up. For various kinds
> of robustness (trace self-checking, say) it's important that various
> sources of data add up.
> 
> If we're adding a configuration knob that controls how often VM
> counters get reflected in system trace points, we should also have a
> knob to control delayed VM counter operations. The whole point is for
> users to be able to specify how precisely they want VM counter changes
> reported to analysis tools.

We're not adding more configuration knobs.

> > Any improvements on that front can be a follow-up.
> >
> > Curious, has split RSS accounting shown you any issue with this patch?
> 
> Split accounting has been a source of confusion for a while now: it
> causes that numbers-don't-add-up problem even when sampling from
> procfs instead of reading memory tracepoint data.

I think you can just disable split RSS accounting if it does not work well
for your configuration. It sounds like the problems you describe are common
to all existing ways of getting RSS accounting, not just this particular one,
hence I mentioned it is a bit out of scope.

Also AFAIU, every TASK_RSS_EVENTS_THRESH events the page fault code does sync
the counters, so they do not lurk indefinitely. The tracepoint's main intended
use is to detect spikes, which provides ample opportunity to sync the cache.
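
From memory, the split path looks roughly like this (paraphrased, not a
verbatim copy of mm/memory.c):

	/*
	 * Per-task deltas accumulate in task->rss_stat and are folded
	 * back into mm->rss_stat once enough counter events pile up.
	 */
	#define TASK_RSS_EVENTS_THRESH	(64)

	static void check_sync_rss_stat(struct task_struct *task)
	{
		if (unlikely(task != current))
			return;
		if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
			sync_mm_rss(task->mm);
	}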

You could reduce TASK_RSS_EVENTS_THRESH in your kernel, or even just disable
split RSS accounting if that suits you better. That would solve all the
issues you raised, not just the potential ones specific to this tracepoint.

thanks,

 - Joel
Joel Fernandes Sept. 4, 2019, 3:32 p.m. UTC | #9
On Wed, Sep 04, 2019 at 10:45:08AM +0200, Michal Hocko wrote:
> On Tue 03-09-19 16:09:05, Joel Fernandes (Google) wrote:
> > Useful to track how RSS is changing per TGID to detect spikes in RSS and
> > memory hogs. Several Android teams have been using this patch in various
> > kernel trees for half a year now. Many reported to me it is really
> > useful so I'm posting it upstream.
> > 
> > Initial patch developed by Tim Murray. Changes I made from original patch:
> > o Prevent any additional space consumed by mm_struct.
> > o Keep overhead low by checking if tracing is enabled.
> > o Add some noise reduction and lower overhead by emitting only on
> >   threshold changes.
> 
> Does this have any pre-requisite? I do not see trace_rss_stat_enabled in
> the Linus tree (nor in linux-next).

No, this is generated automatically by the tracepoint infrastructure when a
tracepoint is added.

> Besides that, why do we need batching in the first place? Does this have a
> measurable overhead? How does it differ from any other tracepoints that we
> have in other hotpaths (e.g.  page allocator doesn't do any checks).

We do need batching not only for overhead reduction, but also for reducing
tracing noise. Flooding the traces makes them less useful for long traces and
for post-processing. IOW, the overhead reduction is a bonus.
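
To make the roll-over concrete, here is a tiny userspace demo of the same
mask arithmetic (a sketch, not kernel code):

	#include <stdio.h>

	/* Same check as mm_trace_rss_stat(), with a 128-page threshold. */
	static int crossed(long count, long value)
	{
		long mask = ~(128L - 1);

		return (count & mask) != ((count - value) & mask);
	}

	int main(void)
	{
		printf("%d\n", crossed(127, 1));  /* 126 -> 127: 0, no event */
		printf("%d\n", crossed(128, 1));  /* 127 -> 128: 1, boundary crossed */
		printf("%d\n", crossed(127, -1)); /* 128 -> 127: 1, fires on the way down too */
		return 0;
	}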

I have not looked at the page allocator paths; we don't currently use those
for the purposes of this rss_stat tracepoint.

> Other than that this looks reasonable to me.

Thanks!

 - Joel


Michal Hocko Sept. 4, 2019, 3:37 p.m. UTC | #10
On Wed 04-09-19 11:32:58, Joel Fernandes wrote:
> On Wed, Sep 04, 2019 at 10:45:08AM +0200, Michal Hocko wrote:
> > On Tue 03-09-19 16:09:05, Joel Fernandes (Google) wrote:
> > > Useful to track how RSS is changing per TGID to detect spikes in RSS and
> > > memory hogs. Several Android teams have been using this patch in various
> > > kernel trees for half a year now. Many reported to me it is really
> > > useful so I'm posting it upstream.
> > > 
> > > Initial patch developed by Tim Murray. Changes I made from original patch:
> > > o Prevent any additional space consumed by mm_struct.
> > > o Keep overhead low by checking if tracing is enabled.
> > > o Add some noise reduction and lower overhead by emitting only on
> > >   threshold changes.
> > 
> > Does this have any pre-requisite? I do not see trace_rss_stat_enabled in
> > the Linus tree (nor in linux-next).
> 
> No, this is generated automatically by the tracepoint infrastructure when a
> tracepoint is added.

OK, I was not aware of that.

> > Besides that, why do we need batching in the first place? Does this have a
> > measurable overhead? How does it differ from any other tracepoints that we
> > have in other hotpaths (e.g.  page allocator doesn't do any checks).
> 
> We do need batching not only for overhead reduction,

What is the overhead?

> but also for reducing
> tracing noise. Flooding the traces makes them less useful for long traces and
> for post-processing. IOW, the overhead reduction is a bonus.

This is not really anything special for this tracepoint though.
Basically any tracepoint in a hot path is in the same situation and I do
not see a point why each of them should really invent its own way to
throttle. Maybe there is some way to do that in the tracing subsystem
directly.
Joel Fernandes Sept. 4, 2019, 4:28 p.m. UTC | #11
On Wed, Sep 4, 2019 at 11:38 AM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Wed 04-09-19 11:32:58, Joel Fernandes wrote:
> > On Wed, Sep 04, 2019 at 10:45:08AM +0200, Michal Hocko wrote:
> > > On Tue 03-09-19 16:09:05, Joel Fernandes (Google) wrote:
> > > > Useful to track how RSS is changing per TGID to detect spikes in RSS and
> > > > memory hogs. Several Android teams have been using this patch in various
> > > > kernel trees for half a year now. Many reported to me it is really
> > > > useful so I'm posting it upstream.
> > > >
> > > > Initial patch developed by Tim Murray. Changes I made from original patch:
> > > > o Prevent any additional space consumed by mm_struct.
> > > > o Keep overhead low by checking if tracing is enabled.
> > > > o Add some noise reduction and lower overhead by emitting only on
> > > >   threshold changes.
> > >
> > > Does this have any pre-requisite? I do not see trace_rss_stat_enabled in
> > > the Linus tree (nor in linux-next).
> >
> > No, this is generated automatically by the tracepoint infrastructure when a
> > tracepoint is added.
>
> OK, I was not aware of that.
>
> > > Besides that, why do we need batching in the first place? Does this have a
> > > measurable overhead? How does it differ from any other tracepoints that we
> > > have in other hotpaths (e.g.  page allocator doesn't do any checks).
> >
> > We do need batching not only for overhead reduction,
>
> What is the overhead?

The overhead is occasionally higher without the threshold (that is, if we
trace every counter change), but overall the performance difference is
within the noise.

For memset of 1GB data:

With threshold:
Total time for 1GB data: 684172499 nanoseconds.
Total time for 1GB data: 692379986 nanoseconds.
Total time for 1GB data: 760023463 nanoseconds.
Total time for 1GB data: 669291457 nanoseconds.
Total time for 1GB data: 729722783 nanoseconds.

Without threshold
Total time for 1GB data: 722505810 nanoseconds.
Total time for 1GB data: 648724292 nanoseconds.
Total time for 1GB data: 643102853 nanoseconds.
Total time for 1GB data: 641815282 nanoseconds.
Total time for 1GB data: 828561187 nanoseconds.  <-- outlier but it did happen.
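
(The numbers come from timing a fault-in of 1GB; a sketch of such a
harness, since the actual test program was not posted:)

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <time.h>

	#define SZ (1UL << 30)	/* 1GB */

	int main(void)
	{
		struct timespec a, b;
		char *buf = malloc(SZ);

		if (!buf)
			return 1;
		clock_gettime(CLOCK_MONOTONIC, &a);
		memset(buf, 1, SZ);	/* faults in 1GB, driving the RSS counters */
		clock_gettime(CLOCK_MONOTONIC, &b);
		printf("Total time for 1GB data: %ld nanoseconds.\n",
		       (b.tv_sec - a.tv_sec) * 1000000000L +
		       (b.tv_nsec - a.tv_nsec));
		free(buf);
		return 0;
	}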

> > but also for reducing
> > tracing noise. Flooding the traces makes them less useful for long traces and
> > for post-processing. IOW, the overhead reduction is a bonus.
>
> This is not really anything special for this tracepoint though.
> Basically any tracepoint in a hot path is in the same situation and I do
> not see a point why each of them should really invent its own way to
> throttle. Maybe there is some way to do that in the tracing subsystem
> directly.

I am not sure there is a way to do this easily. Add to that the fact that
you still have to call into the trace event. Why call into it at all if you
can filter in advance and have a sane default?

The bigger improvement with the threshold is that the number of trace records
is almost halved: in my test the record count went from 4.6K to 2.6K.

I don't see any drawbacks to using a threshold. There is no overhead either
way. For systems without split RSS accounting, the reduction in the number of
trace records would be even higher, significantly reducing the consumption of
the ftrace buffer and the noise that people have to deal with.

Hope you agree now?

thanks,

 - Joel
Daniel Colascione Sept. 4, 2019, 5:15 p.m. UTC | #12
On Wed, Sep 4, 2019 at 7:59 AM Joel Fernandes <joel@joelfernandes.org> wrote:
>
> On Tue, Sep 03, 2019 at 10:42:53PM -0700, Daniel Colascione wrote:
> > On Tue, Sep 3, 2019 at 10:15 PM Joel Fernandes <joel@joelfernandes.org> wrote:
> > >
> > > On Tue, Sep 03, 2019 at 09:51:20PM -0700, Daniel Colascione wrote:
> > > > On Tue, Sep 3, 2019 at 9:45 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > >
> > > > > On Tue, Sep 3, 2019 at 1:09 PM Joel Fernandes (Google)
> > > > > <joel@joelfernandes.org> wrote:
> > > > > >
> > > > > > Useful to track how RSS is changing per TGID to detect spikes in RSS and
> > > > > > memory hogs. Several Android teams have been using this patch in various
> > > > > > kernel trees for half a year now. Many reported to me it is really
> > > > > > useful so I'm posting it upstream.
> > > >
> > > > It's also worth being able to turn off the per-task memory counter
> > > > caching, otherwise you'll have two levels of batching before the
> > > > counter gets updated, IIUC.
> > >
> > > I prefer to keep split RSS accounting turned on if it is available.
> >
> > Why? AFAIK, nobody's produced numbers showing that split accounting
> > has a real benefit.
>
> I am not too sure. Have you checked the original patches that added this
> stuff though? It seems to me the main win would be on big systems that have
> to pay for atomic updates.

I looked into this issue the last time I mentioned split mm
accounting. See [1]. It's my sense that the original change was
inadequately justified; Michal Hocko seems to agree. I've tried
disabling split rss accounting locally on a variety of systems ---
Android, laptop, desktop --- and failed to notice any difference. It's
possible that some difference appears at a scale beyond that to which
I have access, but if the benefit of split rss accounting is limited
to these cases, split rss accounting shouldn't be on by default, since
it comes at a cost in consistency.

[1] https://lore.kernel.org/linux-mm/20180227100234.GF15357@dhcp22.suse.cz/

> > > I think
> > > discussing split RSS accounting is a bit out of scope of this patch as well.
> >
> > It's in-scope, because with split RSS accounting, allocated memory can
> > stay accumulated in task structs for an indefinite time without being
> > flushed to the mm. As a result, if you take the stream of virtual
> > memory management system calls that a program makes on one hand, and VM
> > counter values on the other, the two don't add up. For various kinds
> > of robustness (trace self-checking, say) it's important that various
> > sources of data add up.
> >
> > If we're adding a configuration knob that controls how often VM
> > counters get reflected in system trace points, we should also have a
> > knob to control delayed VM counter operations. The whole point is for
> > users to be able to specify how precisely they want VM counter changes
> > reported to analysis tools.
>
> We're not adding more configuration knobs.

This position doesn't seem to be the thread consensus yet.

> > > Any improvements on that front can be a follow-up.
> > >
> > > Curious, has split RSS accounting shown you any issue with this patch?
> >
> > Split accounting has been a source of confusion for a while now: it
> > causes that numbers-don't-add-up problem even when sampling from
> > procfs instead of reading memory tracepoint data.
>
> I think you can just disable split RSS accounting if it does not work well
> for your configuration.

There's no build-time configuration for split RSS accounting. It's not
reasonable to expect people to carry patches just to get their memory
usage numbers to add up.

> Also AFAIU, every TASK_RSS_EVENTS_THRESH events the page fault code does sync
> the counters, so they do not lurk indefinitely.

If a thread incurs TASK_RSS_EVENTS_THRESH - 1 page faults and then
sleeps for a week, all memory counters observable from userspace will
be wrong for a week. Multiply this potential error by the number of
threads on a typical system and you have to conclude that split RSS
accounting produces a lot of potential uncertainty. What are we
getting in exchange for this uncertainty?

> The tracepoint's main intended
> use is to detect spikes, which provides ample opportunity to sync the cache.

The intended use is measuring memory levels of various processes over
time, not just detecting "spikes". In order to make sense of the
resulting data series, we need to be able to place error bars on it.
The presence of split RSS accounting makes those error bars much
larger than they have to be.

> You could reduce TASK_RSS_EVENTS_THRESH in your kernel, or even just disable
> split RSS accounting if that suits you better. That would solve all the
> issues you raised, not just the potential ones specific to this tracepoint.

I think we should just delete the split RSS accounting code unless
someone can demonstrate that it's a measurable win on a typical
system. The first priority of any system should be correctness.
Consistency is a kind of correctness. Departures from correctness
should come only from quantitatively justifiable need.
Daniel Colascione Sept. 4, 2019, 5:17 p.m. UTC | #13
On Wed, Sep 4, 2019 at 8:38 AM Michal Hocko <mhocko@kernel.org> wrote:
> > but also for reducing
> > tracing noise. Flooding the traces makes them less useful for long traces and
> > for post-processing. IOW, the overhead reduction is a bonus.
>
> This is not really anything special for this tracepoint though.
> Basically any tracepoint in a hot path is in the same situation and I do
> not see a point why each of them should really invent its own way to
> throttle. Maybe there is some way to do that in the tracing subsystem
> directly.

I agree. I'd rather not special-case RSS in this way, especially not
with a hardcoded aggregation and thresholding configuration. It should
be possible to handle high-frequency trace data point aggregation in a
general way. Why shouldn't we be able to get a time series for, say,
dirty pages? Or various slab counts? IMHO, any counter on the system
ought to be observable in a uniform way.
Sandeep Patil Sept. 4, 2019, 11:59 p.m. UTC | #14
On Wed, Sep 04, 2019 at 10:15:10AM -0700, 'Daniel Colascione' via kernel-team wrote:
> On Wed, Sep 4, 2019 at 7:59 AM Joel Fernandes <joel@joelfernandes.org> wrote:
> >
> > On Tue, Sep 03, 2019 at 10:42:53PM -0700, Daniel Colascione wrote:
> > > On Tue, Sep 3, 2019 at 10:15 PM Joel Fernandes <joel@joelfernandes.org> wrote:
> > > >
> > > > On Tue, Sep 03, 2019 at 09:51:20PM -0700, Daniel Colascione wrote:
> > > > > On Tue, Sep 3, 2019 at 9:45 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > >
> > > > > > On Tue, Sep 3, 2019 at 1:09 PM Joel Fernandes (Google)
> > > > > > <joel@joelfernandes.org> wrote:
> > > > > > >
> > > > > > > Useful to track how RSS is changing per TGID to detect spikes in RSS and
> > > > > > > memory hogs. Several Android teams have been using this patch in various
> > > > > > > kernel trees for half a year now. Many reported to me it is really
> > > > > > > useful so I'm posting it upstream.
> > > > >
> > > > > It's also worth being able to turn off the per-task memory counter
> > > > > caching, otherwise you'll have two levels of batching before the
> > > > > counter gets updated, IIUC.
> > > >
> > > > I prefer to keep split RSS accounting turned on if it is available.
> > >
> > > Why? AFAIK, nobody's produced numbers showing that split accounting
> > > has a real benefit.
> >
> > I am not too sure. Have you checked the original patches that added this
> > stuff though? It seems to me the main win would be on big systems that have
> > to pay for atomic updates.
> 
> I looked into this issue the last time I mentioned split mm
> accounting. See [1]. It's my sense that the original change was
> inadequately justified; Michal Hocko seems to agree. I've tried
> disabling split rss accounting locally on a variety of systems ---
> Android, laptop, desktop --- and failed to notice any difference. It's
> possible that some difference appears at a scale beyond that to which
> I have access, but if the benefit of split rss accounting is limited
> to these cases, split rss accounting shouldn't be on by default, since
> it comes at a cost in consistency.
> 
> [1] https://lore.kernel.org/linux-mm/20180227100234.GF15357@dhcp22.suse.cz/
> 
> > > > I think
> > > > discussing split RSS accounting is a bit out of scope of this patch as well.
> > >
> > > It's in-scope, because with split RSS accounting, allocated memory can
> > > stay accumulated in task structs for an indefinite time without being
> > > flushed to the mm. As a result, if you take the stream of virtual
> > > memory management system calls that a program makes on one hand, and VM
> > > counter values on the other, the two don't add up. For various kinds
> > > of robustness (trace self-checking, say) it's important that various
> > > sources of data add up.
> > >
> > > If we're adding a configuration knob that controls how often VM
> > > counters get reflected in system trace points, we should also have a
> > > knob to control delayed VM counter operations. The whole point is for
> > > users to be able to specify how precisely they want VM counter changes
> > > reported to analysis tools.
> >
> > We're not adding more configuration knobs.
> 
> This position doesn't seem to be the thread consensus yet.
> 
> > > > Any improvements on that front can be a follow-up.
> > > >
> > > > Curious, has split RSS accounting shown you any issue with this patch?
> > >
> > > Split accounting has been a source of confusion for a while now: it
> > > causes that numbers-don't-add-up problem even when sampling from
> > > procfs instead of reading memory tracepoint data.
> >
> > I think you can just disable split RSS accounting if it does not work well
> > for your configuration.
> 
> There's no build-time configuration for split RSS accounting. It's not
> reasonable to expect people to carry patches just to get their memory
> usage numbers to add up.

Sure, maybe send a patch to add one in that case, or one deleting the split
RSS accounting code like you said below.

> 
> > Also AFAIU, every TASK_RSS_EVENTS_THRESH events the page fault code does sync
> > the counters, so they do not lurk indefinitely.
> 
> If a thread incurs TASK_RSS_EVENTS_THRESH - 1 page faults and then
> sleeps for a week, all memory counters observable from userspace will
> be wrong for a week. Multiply this potential error by the number of
> threads on a typical system and you have to conclude that split RSS
> accounting produces a lot of potential uncertainty. What are we
> getting in exchange for this uncertainty?
> 
> > The tracepoint's main intended
> > use is to detect spikes, which provides ample opportunity to sync the cache.
> 
> The intended use is measuring memory levels of various processes over
> time, not just detecting "spikes". In order to make sense of the
> resulting data series, we need to be able to place error bars on it.
> The presence of split RSS accounting makes those error bars much
> larger than they have to be.
> 
> > You could reduce TASK_RSS_EVENTS_THRESH in your kernel, or even just disable
> > split RSS accounting if that suits you better. That would solve all the
> > issues you raised, not just the potential ones specific to this tracepoint.
> 
> I think we should just delete the split RSS accounting code unless
> someone can demonstrate that it's a measurable win on a typical
> system. The first priority of any system should be correctness.
> Consistency is a kind of correctness. Departures from correctness
> should come only from quantitatively justifiable need.

I think you make some good points about correctness, but I still don't see
how all that relates to _this_ change. We currently do want the ability to see
these rss spikes in the traces (as the patch description shows).

You seem to be arguing about the correctness of split RSS accounting. I would
suggest sending a patch to delete the split RSS accounting code and taking
these *very valid* arguments there. I am struggling to see the point of
derailing this _specific_ change for that.

- ssp

Michal Hocko Sept. 5, 2019, 10:54 a.m. UTC | #15
On Wed 04-09-19 12:28:08, Joel Fernandes wrote:
> On Wed, Sep 4, 2019 at 11:38 AM Michal Hocko <mhocko@kernel.org> wrote:
> >
> > On Wed 04-09-19 11:32:58, Joel Fernandes wrote:
> > > On Wed, Sep 04, 2019 at 10:45:08AM +0200, Michal Hocko wrote:
> > > > On Tue 03-09-19 16:09:05, Joel Fernandes (Google) wrote:
> > > > > Useful to track how RSS is changing per TGID to detect spikes in RSS and
> > > > > memory hogs. Several Android teams have been using this patch in various
> > > > > kernel trees for half a year now. Many reported to me it is really
> > > > > useful so I'm posting it upstream.
> > > > >
> > > > > Initial patch developed by Tim Murray. Changes I made from original patch:
> > > > > o Prevent any additional space consumed by mm_struct.
> > > > > o Keep overhead low by checking if tracing is enabled.
> > > > > o Add some noise reduction and lower overhead by emitting only on
> > > > >   threshold changes.
> > > >
> > > > Does this have any pre-requisite? I do not see trace_rss_stat_enabled in
> > > > the Linus tree (nor in linux-next).
> > >
> > > No, this is generated automatically by the tracepoint infrastructure when a
> > > tracepoint is added.
> >
> > OK, I was not aware of that.
> >
> > > > Besides that, why do we need batching in the first place? Does this have a
> > > > measurable overhead? How does it differ from any other tracepoints that we
> > > > have in other hotpaths (e.g.  page allocator doesn't do any checks).
> > >
> > > We do need batching not only for overhead reduction,
> >
> > What is the overhead?
> 
> The overhead is occasionally higher without the threshold (that is, if we
> trace every counter change), but overall the performance difference is
> within the noise.

OK, so the additional code is not really justified.
Joel Fernandes Sept. 5, 2019, 2:14 p.m. UTC | #16
On Thu, Sep 05, 2019 at 12:54:24PM +0200, Michal Hocko wrote:
> On Wed 04-09-19 12:28:08, Joel Fernandes wrote:
> > On Wed, Sep 4, 2019 at 11:38 AM Michal Hocko <mhocko@kernel.org> wrote:
> > >
> > > On Wed 04-09-19 11:32:58, Joel Fernandes wrote:
> > > > On Wed, Sep 04, 2019 at 10:45:08AM +0200, Michal Hocko wrote:
> > > > > On Tue 03-09-19 16:09:05, Joel Fernandes (Google) wrote:
> > > > > > Useful to track how RSS is changing per TGID to detect spikes in RSS and
> > > > > > memory hogs. Several Android teams have been using this patch in various
> > > > > > kernel trees for half a year now. Many reported to me it is really
> > > > > > useful so I'm posting it upstream.
> > > > > >
> > > > > > Initial patch developed by Tim Murray. Changes I made from original patch:
> > > > > > o Prevent any additional space consumed by mm_struct.
> > > > > > o Keep overhead low by checking if tracing is enabled.
> > > > > > o Add some noise reduction and lower overhead by emitting only on
> > > > > >   threshold changes.
> > > > >
> > > > > Does this have any pre-requisite? I do not see trace_rss_stat_enabled in
> > > > > the Linus tree (nor in linux-next).
> > > >
> > > > No, this is generated automatically by the tracepoint infrastructure when a
> > > > tracepoint is added.
> > >
> > > OK, I was not aware of that.
> > >
> > > > > Besides that why do we need batching in the first place. Does this have a
> > > > > measurable overhead? How does it differ from any other tracepoints that we
> > > > > have in other hotpaths (e.g.  page allocator doesn't do any checks).
> > > >
> > > > We do need batching not only for overhead reduction,
> > >
> > > What is the overhead?
> > 
> > The overhead is occasionally higher without the threshold (that is if we
> > trace every counter change). I would classify performance benefit to be
> > almost the same and within the noise.
> 
> OK, so the additional code is not really justified.

It is really justified. Did you read the whole of the last email?

thanks,

 - Joel
Michal Hocko Sept. 5, 2019, 2:20 p.m. UTC | #17
On Thu 05-09-19 10:14:52, Joel Fernandes wrote:
> On Thu, Sep 05, 2019 at 12:54:24PM +0200, Michal Hocko wrote:
> > On Wed 04-09-19 12:28:08, Joel Fernandes wrote:
> > > On Wed, Sep 4, 2019 at 11:38 AM Michal Hocko <mhocko@kernel.org> wrote:
> > > >
> > > > On Wed 04-09-19 11:32:58, Joel Fernandes wrote:
> > > > > On Wed, Sep 04, 2019 at 10:45:08AM +0200, Michal Hocko wrote:
> > > > > > On Tue 03-09-19 16:09:05, Joel Fernandes (Google) wrote:
> > > > > > > Useful to track how RSS is changing per TGID to detect spikes in RSS and
> > > > > > > memory hogs. Several Android teams have been using this patch in various
> > > > > > > kernel trees for half a year now. Many reported to me it is really
> > > > > > > useful so I'm posting it upstream.
> > > > > > >
> > > > > > > Initial patch developed by Tim Murray. Changes I made from original patch:
> > > > > > > o Prevent any additional space consumed by mm_struct.
> > > > > > > o Keep overhead low by checking if tracing is enabled.
> > > > > > > o Add some noise reduction and lower overhead by emitting only on
> > > > > > >   threshold changes.
> > > > > >
> > > > > > Does this have any pre-requisite? I do not see trace_rss_stat_enabled in
> > > > > > the Linus tree (nor in linux-next).
> > > > >
> > > > > No, this is generated automatically by the tracepoint infrastructure when a
> > > > > tracepoint is added.
> > > >
> > > > OK, I was not aware of that.
> > > >
> > > > > > Besides that why do we need batching in the first place. Does this have a
> > > > > > measurable overhead? How does it differ from any other tracepoints that we
> > > > > > have in other hotpaths (e.g.  page allocator doesn't do any checks).
> > > > >
> > > > > We do need batching not only for overhead reduction,
> > > >
> > > > What is the overhead?
> > > 
> > > The overhead is occasionally higher without the threshold (that is if we
> > > trace every counter change). I would classify performance benefit to be
> > > almost the same and within the noise.
> > 
> > OK, so the additional code is not really justified.
> 
> It is really justified. Did you read the whole of the last email?

Of course I have. The information that the numbers are in the noise, with
some outliers (and without any details about the underlying reason), simply
shows that you are optimizing something that is probably not worth it.

I would recommend adding a simple tracepoint. That should be pretty
non-controversial. And if you want to add an optimization on top, then
provide data to justify it.
Joel Fernandes Sept. 5, 2019, 2:23 p.m. UTC | #18
On Thu, Sep 05, 2019 at 04:20:10PM +0200, Michal Hocko wrote:
> On Thu 05-09-19 10:14:52, Joel Fernandes wrote:
> > On Thu, Sep 05, 2019 at 12:54:24PM +0200, Michal Hocko wrote:
> > > On Wed 04-09-19 12:28:08, Joel Fernandes wrote:
> > > > On Wed, Sep 4, 2019 at 11:38 AM Michal Hocko <mhocko@kernel.org> wrote:
> > > > >
> > > > > On Wed 04-09-19 11:32:58, Joel Fernandes wrote:
> > > > > > On Wed, Sep 04, 2019 at 10:45:08AM +0200, Michal Hocko wrote:
> > > > > > > On Tue 03-09-19 16:09:05, Joel Fernandes (Google) wrote:
> > > > > > > > Useful to track how RSS is changing per TGID to detect spikes in RSS and
> > > > > > > > memory hogs. Several Android teams have been using this patch in various
> > > > > > > > kernel trees for half a year now. Many reported to me it is really
> > > > > > > > useful so I'm posting it upstream.
> > > > > > > >
> > > > > > > > Initial patch developed by Tim Murray. Changes I made from original patch:
> > > > > > > > o Prevent any additional space consumed by mm_struct.
> > > > > > > > o Keep overhead low by checking if tracing is enabled.
> > > > > > > > o Add some noise reduction and lower overhead by emitting only on
> > > > > > > >   threshold changes.
> > > > > > >
> > > > > > > Does this have any pre-requisite? I do not see trace_rss_stat_enabled in
> > > > > > > the Linus tree (nor in linux-next).
> > > > > >
> > > > > > No, this is generated automatically by the tracepoint infrastructure when a
> > > > > > tracepoint is added.
> > > > >
> > > > > OK, I was not aware of that.
> > > > >
> > > > > > > Besides that why do we need batching in the first place. Does this have a
> > > > > > > measurable overhead? How does it differ from any other tracepoints that we
> > > > > > > have in other hotpaths (e.g.  page allocator doesn't do any checks).
> > > > > >
> > > > > > We do need batching not only for overhead reduction,
> > > > >
> > > > > What is the overhead?
> > > > 
> > > > The overhead is occasionally higher without the threshold (that is if we
> > > > trace every counter change). I would classify performance benefit to be
> > > > almost the same and within the noise.
> > > 
> > > OK, so the additional code is not really justified.
> > 
> > It is really justified. Did you read the whole of the last email?
> 
> Of course I have. The information that the numbers are in the noise, with
> some outliers (and without any details about the underlying reason), simply
> shows that you are optimizing something that is probably not worth it.
> 
> I would recommend adding a simple tracepoint. That should be pretty
> non-controversial. And if you want to add an optimization on top, then
> provide data to justify it.

Did you read the point about trace sizes? We don't want the traces flooded,
and you are not making any good point about why we should not reduce that
flooding. I don't want to simplify the patch and lose the benefit; it is
already simple enough and non-controversial.

thanks,

 - Joel
Michal Hocko Sept. 5, 2019, 2:43 p.m. UTC | #19
[Add Steven]

On Wed 04-09-19 12:28:08, Joel Fernandes wrote:
> On Wed, Sep 4, 2019 at 11:38 AM Michal Hocko <mhocko@kernel.org> wrote:
> >
> > On Wed 04-09-19 11:32:58, Joel Fernandes wrote:
[...]
> > > but also for reducing
> > > tracing noise. Flooding the traces makes it less useful for long traces and
> > > post-processing of traces. IOW, the overhead reduction is a bonus.
> >
> > This is not really anything special for this tracepoint though.
> > Basically any tracepoint in a hot path is in the same situation and I do
> > not see a point why each of them should really invent its own way to
> > throttle. Maybe there is some way to do that in the tracing subsystem
> > directly.
> 
> I am not sure if there is a way to do this easily. Add to that, the fact that
> you still have to call into trace events. Why call into it at all, if you can
> filter in advance and have a sane filtering default?
> 
> The bigger improvement with the threshold is the number of trace records are
> almost halved by using a threshold. The number of records went from 4.6K to
> 2.6K.

Steven, would it be feasible to add a generic tracepoint throttling?
Suren Baghdasaryan Sept. 5, 2019, 4:03 p.m. UTC | #20
On Thu, Sep 5, 2019 at 7:43 AM Michal Hocko <mhocko@kernel.org> wrote:
>
> [Add Steven]
>
> On Wed 04-09-19 12:28:08, Joel Fernandes wrote:
> > On Wed, Sep 4, 2019 at 11:38 AM Michal Hocko <mhocko@kernel.org> wrote:
> > >
> > > On Wed 04-09-19 11:32:58, Joel Fernandes wrote:
> [...]
> > > > but also for reducing
> > > > tracing noise. Flooding the traces makes it less useful for long traces and
> > > > post-processing of traces. IOW, the overhead reduction is a bonus.
> > >
> > > This is not really anything special for this tracepoint though.
> > > Basically any tracepoint in a hot path is in the same situation and I do
> > > not see a point why each of them should really invent its own way to
> > > throttle. Maybe there is some way to do that in the tracing subsystem
> > > directly.
> >
> > I am not sure if there is a way to do this easily. Add to that, the fact that
> > you still have to call into trace events. Why call into it at all, if you can
> > filter in advance and have a sane filtering default?
> >
> > The bigger improvement with the threshold is the number of trace records are
> > almost halved by using a threshold. The number of records went from 4.6K to
> > 2.6K.
>
> Steven, would it be feasible to add a generic tracepoint throttling?

I might misunderstand this but is the issue here actually throttling
of the sheer number of trace records or tracing large enough changes
to RSS that user might care about? Small changes happen all the time
but we are likely not interested in those. Surely we could postprocess
the traces to extract changes large enough to be interesting but why
capture uninteresting information in the first place? IOW the
throttling here should be based not on the time between traces but on
the amount of change of the traced signal. Maybe a generic facility
like that would be a good idea?

> --
> Michal Hocko
> SUSE Labs
Steven Rostedt Sept. 5, 2019, 5:35 p.m. UTC | #21
[ Added Tom ]

On Thu, 5 Sep 2019 09:03:01 -0700
Suren Baghdasaryan <surenb@google.com> wrote:

> On Thu, Sep 5, 2019 at 7:43 AM Michal Hocko <mhocko@kernel.org> wrote:
> >
> > [Add Steven]
> >
> > On Wed 04-09-19 12:28:08, Joel Fernandes wrote:  
> > > On Wed, Sep 4, 2019 at 11:38 AM Michal Hocko <mhocko@kernel.org> wrote:  
> > > >
> > > > On Wed 04-09-19 11:32:58, Joel Fernandes wrote:  
> > [...]  
> > > > > but also for reducing
> > > > > tracing noise. Flooding the traces makes it less useful for long traces and
> > > > > post-processing of traces. IOW, the overhead reduction is a bonus.  
> > > >
> > > > This is not really anything special for this tracepoint though.
> > > > Basically any tracepoint in a hot path is in the same situation and I do
> > > > not see a point why each of them should really invent its own way to
> > > > throttle. Maybe there is some way to do that in the tracing subsystem
> > > > directly.  
> > >
> > > I am not sure if there is a way to do this easily. Add to that, the fact that
> > > you still have to call into trace events. Why call into it at all, if you can
> > > filter in advance and have a sane filtering default?
> > >
> > > The bigger improvement with the threshold is the number of trace records are
> > > almost halved by using a threshold. The number of records went from 4.6K to
> > > 2.6K.  
> >
> > Steven, would it be feasible to add a generic tracepoint throttling?  
> 
> I might misunderstand this but is the issue here actually throttling
> of the sheer number of trace records or tracing large enough changes
> to RSS that user might care about? Small changes happen all the time
> but we are likely not interested in those. Surely we could postprocess
> the traces to extract changes large enough to be interesting but why
> capture uninteresting information in the first place? IOW the
> throttling here should be based not on the time between traces but on
> the amount of change of the traced signal. Maybe a generic facility
> like that would be a good idea?

You mean like add a trigger (or filter) that only traces if a field has
changed since the last time the trace was hit? Hmm, I think we could
possibly do that. Perhaps even now with histogram triggers?

-- Steve
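
For reference, the closest existing facility is the histogram onchange()
handler, which fires on any change of a tracked variable. A sketch of
today's usage, with the event and field names taken from the sched_waking
event (and, as Tom notes further down, no way to attach a threshold):

  # Save the waking task's fields whenever its 'prio' value changes at
  # all; current onchange() has no notion of "changed by at least N".
  echo 'hist:keys=comm:curprio=prio:onchange($curprio).save(comm,prio)' \
       > /sys/kernel/tracing/events/sched/sched_waking/trigger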
Suren Baghdasaryan Sept. 5, 2019, 5:39 p.m. UTC | #22
On Thu, Sep 5, 2019 at 10:35 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
>
>
> [ Added Tom ]
>
> On Thu, 5 Sep 2019 09:03:01 -0700
> Suren Baghdasaryan <surenb@google.com> wrote:
>
> > On Thu, Sep 5, 2019 at 7:43 AM Michal Hocko <mhocko@kernel.org> wrote:
> > >
> > > [Add Steven]
> > >
> > > On Wed 04-09-19 12:28:08, Joel Fernandes wrote:
> > > > On Wed, Sep 4, 2019 at 11:38 AM Michal Hocko <mhocko@kernel.org> wrote:
> > > > >
> > > > > On Wed 04-09-19 11:32:58, Joel Fernandes wrote:
> > > [...]
> > > > > > but also for reducing
> > > > > > tracing noise. Flooding the traces makes it less useful for long traces and
> > > > > > post-processing of traces. IOW, the overhead reduction is a bonus.
> > > > >
> > > > > This is not really anything special for this tracepoint though.
> > > > > Basically any tracepoint in a hot path is in the same situation and I do
> > > > > not see a point why each of them should really invent its own way to
> > > > > throttle. Maybe there is some way to do that in the tracing subsystem
> > > > > directly.
> > > >
> > > > I am not sure if there is a way to do this easily. Add to that, the fact that
> > > > you still have to call into trace events. Why call into it at all, if you can
> > > > filter in advance and have a sane filtering default?
> > > >
> > > > The bigger improvement with the threshold is the number of trace records are
> > > > almost halved by using a threshold. The number of records went from 4.6K to
> > > > 2.6K.
> > >
> > > Steven, would it be feasible to add a generic tracepoint throttling?
> >
> > I might misunderstand this but is the issue here actually throttling
> > of the sheer number of trace records or tracing large enough changes
> > to RSS that user might care about? Small changes happen all the time
> > but we are likely not interested in those. Surely we could postprocess
> > the traces to extract changes large enough to be interesting but why
> > capture uninteresting information in the first place? IOW the
> > throttling here should be based not on the time between traces but on
> > the amount of change of the traced signal. Maybe a generic facility
> > like that would be a good idea?
>
> You mean like add a trigger (or filter) that only traces if a field has
> changed since the last time the trace was hit?

Almost... I mean emit a trace if a field has changed by more than X
amount since the last time the trace was hit.

> Hmm, I think we could
> possibly do that. Perhaps even now with histogram triggers?
>
> -- Steve
Tim Murray Sept. 5, 2019, 5:43 p.m. UTC | #23
On Thu, Sep 5, 2019 at 9:03 AM Suren Baghdasaryan <surenb@google.com> wrote:
> I might misunderstand this but is the issue here actually throttling
> of the sheer number of trace records or tracing large enough changes
> to RSS that user might care about? Small changes happen all the time
> but we are likely not interested in those. Surely we could postprocess
> the traces to extract changes large enough to be interesting but why
> capture uninteresting information in the first place? IOW the
> throttling here should be based not on the time between traces but on
> the amount of change of the traced signal. Maybe a generic facility
> like that would be a good idea?

You want two properties from the tracepoint:

- Small fluctuations in the value don't flood the trace buffer. If you
get a new trace event from a process every time kswapd reclaims a
single page from that process, you're going to need an enormous trace
buffer that will have significant side effects on overall system
performance.
- Any spike in memory consumption gets a trace event, regardless of
the duration of that spike. This tracepoint has been incredibly useful
in both understanding the causes of kswapd wakeups and
lowmemorykiller/lmkd kills and evaluating the impact of memory
management changes because it guarantees that any spike appears in the
trace output.

As a result, the RSS tracepoint in particular needs to be throttled
based on the delta of the value, not time. The very first prototype of
the patch emitted a trace event per RSS counter change, and IIRC the
RSS trace events consumed significantly more room in the buffer than
sched_switch (and Android has a lot of sched_switch events). It's not
practical to trace changes in RSS without throttling. If there's a
generic throttling approach that would work here, I'm all for it; like
Dan mentioned, there are many more counters that we would like to
trace in a similar way.
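
For what it's worth, the delta-based throttle Tim asks for can be quite
small on the counter-update path. A minimal sketch, assuming a power-of-two
page threshold; the helper and threshold names are illustrative stand-ins,
not a quote of the patch, while trace_rss_stat_enabled() is the static-key
check the tracepoint infrastructure generates (mentioned earlier in the
thread):

  #define RSS_TRACE_THRESHOLD 128   /* pages; must be a power of two */

  static void rss_stat_maybe_trace(int member, long count, long value)
  {
          long thresh_mask = ~(long)(RSS_TRACE_THRESHOLD - 1);

          /* Static-key check: near-zero cost while tracing is off. */
          if (!trace_rss_stat_enabled())
                  return;

          /*
           * Emit only when the counter crosses a 128-page bucket
           * boundary: jitter inside a bucket stays silent, but any
           * spike big enough to cross a boundary is guaranteed to
           * produce an event, which gives both properties above.
           */
          if ((count & thresh_mask) != ((count - value) & thresh_mask))
                  trace_rss_stat(member, count);
  }

A bucket-crossing rule like this also avoids storing a per-task "last
emitted" value, since the previous count is recoverable as count - value.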
Joel Fernandes Sept. 5, 2019, 5:47 p.m. UTC | #24
On Thu, Sep 05, 2019 at 01:35:07PM -0400, Steven Rostedt wrote:
> 
> 
> [ Added Tom ]
> 
> On Thu, 5 Sep 2019 09:03:01 -0700
> Suren Baghdasaryan <surenb@google.com> wrote:
> 
> > On Thu, Sep 5, 2019 at 7:43 AM Michal Hocko <mhocko@kernel.org> wrote:
> > >
> > > [Add Steven]
> > >
> > > On Wed 04-09-19 12:28:08, Joel Fernandes wrote:  
> > > > On Wed, Sep 4, 2019 at 11:38 AM Michal Hocko <mhocko@kernel.org> wrote:  
> > > > >
> > > > > On Wed 04-09-19 11:32:58, Joel Fernandes wrote:  
> > > [...]  
> > > > > > but also for reducing
> > > > > > tracing noise. Flooding the traces makes it less useful for long traces and
> > > > > > post-processing of traces. IOW, the overhead reduction is a bonus.  
> > > > >
> > > > > This is not really anything special for this tracepoint though.
> > > > > Basically any tracepoint in a hot path is in the same situation and I do
> > > > > not see a point why each of them should really invent its own way to
> > > > > throttle. Maybe there is some way to do that in the tracing subsystem
> > > > > directly.  
> > > >
> > > > I am not sure if there is a way to do this easily. Add to that, the fact that
> > > > you still have to call into trace events. Why call into it at all, if you can
> > > > filter in advance and have a sane filtering default?
> > > >
> > > > The bigger improvement with the threshold is the number of trace records are
> > > > almost halved by using a threshold. The number of records went from 4.6K to
> > > > 2.6K.  
> > >
> > > Steven, would it be feasible to add a generic tracepoint throttling?  
> > 
> > I might misunderstand this but is the issue here actually throttling
> > of the sheer number of trace records or tracing large enough changes
> > to RSS that user might care about? Small changes happen all the time
> > but we are likely not interested in those. Surely we could postprocess
> > the traces to extract changes large enough to be interesting but why
> > capture uninteresting information in the first place? IOW the
> > throttling here should be based not on the time between traces but on
> > the amount of change of the traced signal. Maybe a generic facility
> > like that would be a good idea?
> 
> You mean like add a trigger (or filter) that only traces if a field has
> changed since the last time the trace was hit? Hmm, I think we could
> possibly do that. Perhaps even now with histogram triggers?


Hey Steve,

Something like an analog to digitial coversion function where you lose the
granularity of the signal depending on how much trace data:
https://www.globalspec.com/ImageRepository/LearnMore/20142/9ee38d1a85d37fa23f86a14d3a9776ff67b0ec0f3b.gif

so like, if you had a counter incrementing with values after the increments
as:  1,3,4,8,12,14,30 and say 5 is the threshold at which to emit a trace,
then you would get 1,8,12,30.

So I guess what is need is a way to reduce the quantiy of trace data this
way. For this usecase, the user mostly cares about spikes in the counter
changing that accurate values of the different points.

thanks,

 - Joel
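
That quantization rule is easy to check in isolation. Below is a
self-contained userspace demo of "emit the first sample, then any sample at
least the threshold away from the last emitted one"; with a threshold of 5
applied to the sequence above, this exact rule prints 1, 8, 14, 30, the
same thinning effect give or take the rounding in the example:

  #include <stdio.h>
  #include <stdlib.h>

  int main(void)
  {
          long samples[] = { 1, 3, 4, 8, 12, 14, 30 };
          long threshold = 5, last = 0;
          int emitted = 0;

          for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                  /* Emit the first sample unconditionally, then only
                   * samples that moved at least 'threshold' away from
                   * the last emitted one; in-between jitter is dropped. */
                  if (!emitted || labs(samples[i] - last) >= threshold) {
                          printf("%ld\n", samples[i]);
                          last = samples[i];
                          emitted = 1;
                  }
          }
          return 0;
  }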
Daniel Colascione Sept. 5, 2019, 5:50 p.m. UTC | #25
On Thu, Sep 5, 2019 at 10:35 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> On Thu, 5 Sep 2019 09:03:01 -0700
> Suren Baghdasaryan <surenb@google.com> wrote:
>
> > On Thu, Sep 5, 2019 at 7:43 AM Michal Hocko <mhocko@kernel.org> wrote:
> > >
> > > [Add Steven]
> > >
> > > On Wed 04-09-19 12:28:08, Joel Fernandes wrote:
> > > > On Wed, Sep 4, 2019 at 11:38 AM Michal Hocko <mhocko@kernel.org> wrote:
> > > > >
> > > > > On Wed 04-09-19 11:32:58, Joel Fernandes wrote:
> > > [...]
> > > > > > but also for reducing
> > > > > > tracing noise. Flooding the traces makes it less useful for long traces and
> > > > > > post-processing of traces. IOW, the overhead reduction is a bonus.
> > > > >
> > > > > This is not really anything special for this tracepoint though.
> > > > > Basically any tracepoint in a hot path is in the same situation and I do
> > > > > not see a point why each of them should really invent its own way to
> > > > > throttle. Maybe there is some way to do that in the tracing subsystem
> > > > > directly.
> > > >
> > > > I am not sure if there is a way to do this easily. Add to that, the fact that
> > > > you still have to call into trace events. Why call into it at all, if you can
> > > > filter in advance and have a sane filtering default?
> > > >
> > > > The bigger improvement with the threshold is the number of trace records are
> > > > almost halved by using a threshold. The number of records went from 4.6K to
> > > > 2.6K.
> > >
> > > Steven, would it be feasible to add a generic tracepoint throttling?
> >
> > I might misunderstand this but is the issue here actually throttling
> > of the sheer number of trace records or tracing large enough changes
> > to RSS that user might care about? Small changes happen all the time
> > but we are likely not interested in those. Surely we could postprocess
> > the traces to extract changes large enough to be interesting but why
> > capture uninteresting information in the first place? IOW the
> > throttling here should be based not on the time between traces but on
> > the amount of change of the traced signal. Maybe a generic facility
> > like that would be a good idea?
>
> You mean like add a trigger (or filter) that only traces if a field has
> changed since the last time the trace was hit? Hmm, I think we could
> possibly do that. Perhaps even now with histogram triggers?

I was thinking along the same lines. The histogram subsystem seems
like a very good fit here. Histogram triggers already let users talk
about specific fields of trace events, aggregate them in configurable
ways, and (importantly, IMHO) create synthetic new trace events that
the kernel emits under configurable conditions.
Joel Fernandes Sept. 5, 2019, 5:51 p.m. UTC | #26
On Thu, Sep 05, 2019 at 01:47:05PM -0400, Joel Fernandes wrote:
> On Thu, Sep 05, 2019 at 01:35:07PM -0400, Steven Rostedt wrote:
> > 
> > 
> > [ Added Tom ]
> > 
> > On Thu, 5 Sep 2019 09:03:01 -0700
> > Suren Baghdasaryan <surenb@google.com> wrote:
> > 
> > > On Thu, Sep 5, 2019 at 7:43 AM Michal Hocko <mhocko@kernel.org> wrote:
> > > >
> > > > [Add Steven]
> > > >
> > > > On Wed 04-09-19 12:28:08, Joel Fernandes wrote:  
> > > > > On Wed, Sep 4, 2019 at 11:38 AM Michal Hocko <mhocko@kernel.org> wrote:  
> > > > > >
> > > > > > On Wed 04-09-19 11:32:58, Joel Fernandes wrote:  
> > > > [...]  
> > > > > > > but also for reducing
> > > > > > > tracing noise. Flooding the traces makes it less useful for long traces and
> > > > > > > post-processing of traces. IOW, the overhead reduction is a bonus.  
> > > > > >
> > > > > > This is not really anything special for this tracepoint though.
> > > > > > Basically any tracepoint in a hot path is in the same situation and I do
> > > > > > not see a point why each of them should really invent its own way to
> > > > > > throttle. Maybe there is some way to do that in the tracing subsystem
> > > > > > directly.  
> > > > >
> > > > > I am not sure if there is a way to do this easily. Add to that, the fact that
> > > > > you still have to call into trace events. Why call into it at all, if you can
> > > > > filter in advance and have a sane filtering default?
> > > > >
> > > > > The bigger improvement with the threshold is the number of trace records are
> > > > > almost halved by using a threshold. The number of records went from 4.6K to
> > > > > 2.6K.  
> > > >
> > > > Steven, would it be feasible to add a generic tracepoint throttling?  
> > > 
> > > I might misunderstand this but is the issue here actually throttling
> > > of the sheer number of trace records or tracing large enough changes
> > > to RSS that user might care about? Small changes happen all the time
> > > but we are likely not interested in those. Surely we could postprocess
> > > the traces to extract changes large enough to be interesting but why
> > > capture uninteresting information in the first place? IOW the
> > > throttling here should be based not on the time between traces but on
> > > the amount of change of the traced signal. Maybe a generic facility
> > > like that would be a good idea?
> > 
> > You mean like add a trigger (or filter) that only traces if a field has
> > changed since the last time the trace was hit? Hmm, I think we could
> > possibly do that. Perhaps even now with histogram triggers?
> 
> 
> Hey Steve,
> 
> Something like an analog to digitial coversion function where you lose the
> granularity of the signal depending on how much trace data:
> https://www.globalspec.com/ImageRepository/LearnMore/20142/9ee38d1a85d37fa23f86a14d3a9776ff67b0ec0f3b.gif

s/how much trace data/what the resolution is/

> so like, if you had a counter incrementing with values after the increments
> as:  1,3,4,8,12,14,30 and say 5 is the threshold at which to emit a trace,
> then you would get 1,8,12,30.
> 
> So I guess what is need is a way to reduce the quantiy of trace data this
> way. For this usecase, the user mostly cares about spikes in the counter
> changing that accurate values of the different points.

s/that accurate/than accurate/

I think Tim, Suren, Dan and Michal are all saying the same thing as well.

thanks,

 - Joel
Tom Zanussi Sept. 5, 2019, 7:56 p.m. UTC | #27
Hi,

On Thu, 2019-09-05 at 13:51 -0400, Joel Fernandes wrote:
> On Thu, Sep 05, 2019 at 01:47:05PM -0400, Joel Fernandes wrote:
> > On Thu, Sep 05, 2019 at 01:35:07PM -0400, Steven Rostedt wrote:
> > > 
> > > 
> > > [ Added Tom ]
> > > 
> > > On Thu, 5 Sep 2019 09:03:01 -0700
> > > Suren Baghdasaryan <surenb@google.com> wrote:
> > > 
> > > > On Thu, Sep 5, 2019 at 7:43 AM Michal Hocko <mhocko@kernel.org>
> > > > wrote:
> > > > > 
> > > > > [Add Steven]
> > > > > 
> > > > > On Wed 04-09-19 12:28:08, Joel Fernandes wrote:  
> > > > > > On Wed, Sep 4, 2019 at 11:38 AM Michal Hocko <mhocko@kernel
> > > > > > .org> wrote:  
> > > > > > > 
> > > > > > > On Wed 04-09-19 11:32:58, Joel Fernandes wrote:  
> > > > > 
> > > > > [...]  
> > > > > > > > but also for reducing
> > > > > > > > tracing noise. Flooding the traces makes it less useful
> > > > > > > > for long traces and
> > > > > > > > post-processing of traces. IOW, the overhead reduction
> > > > > > > > is a bonus.  
> > > > > > > 
> > > > > > > This is not really anything special for this tracepoint
> > > > > > > though.
> > > > > > > Basically any tracepoint in a hot path is in the same
> > > > > > > situation and I do
> > > > > > > not see a point why each of them should really invent its
> > > > > > > own way to
> > > > > > > throttle. Maybe there is some way to do that in the
> > > > > > > tracing subsystem
> > > > > > > directly.  
> > > > > > 
> > > > > > I am not sure if there is a way to do this easily. Add to
> > > > > > that, the fact that
> > > > > > you still have to call into trace events. Why call into it
> > > > > > at all, if you can
> > > > > > filter in advance and have a sane filtering default?
> > > > > > 
> > > > > > The bigger improvement with the threshold is the number of
> > > > > > trace records are
> > > > > > almost halved by using a threshold. The number of records
> > > > > > went from 4.6K to
> > > > > > 2.6K.  
> > > > > 
> > > > > Steven, would it be feasible to add a generic tracepoint
> > > > > throttling?  
> > > > 
> > > > I might misunderstand this but is the issue here actually
> > > > throttling
> > > > of the sheer number of trace records or tracing large enough
> > > > changes
> > > > to RSS that user might care about? Small changes happen all the
> > > > time
> > > > but we are likely not interested in those. Surely we could
> > > > postprocess
> > > > the traces to extract changes large enough to be interesting
> > > > but why
> > > > capture uninteresting information in the first place? IOW the
> > > > throttling here should be based not on the time between traces
> > > > but on
> > > > the amount of change of the traced signal. Maybe a generic
> > > > facility
> > > > like that would be a good idea?
> > > 
> > > You mean like add a trigger (or filter) that only traces if a
> > > field has
> > > changed since the last time the trace was hit? Hmm, I think we
> > > could
> > > possibly do that. Perhaps even now with histogram triggers?
> > 
> > 
> > Hey Steve,
> > 
> > Something like an analog to digitial coversion function where you
> > lose the
> > granularity of the signal depending on how much trace data:
> > https://www.globalspec.com/ImageRepository/LearnMore/20142/9ee38d1a
> > 85d37fa23f86a14d3a9776ff67b0ec0f3b.gif
> 
> s/how much trace data/what the resolution is/
> 
> > so like, if you had a counter incrementing with values after the
> > increments
> > as:  1,3,4,8,12,14,30 and say 5 is the threshold at which to emit a
> > trace,
> > then you would get 1,8,12,30.
> > 
> > So I guess what is need is a way to reduce the quantiy of trace
> > data this
> > way. For this usecase, the user mostly cares about spikes in the
> > counter
> > changing that accurate values of the different points.
> 
> s/that accurate/than accurate/
> 
> I think Tim, Suren, Dan and Michal are all saying the same thing as
> well.
> 

There's not a way to do this using existing triggers (histogram
triggers have an onchange() that fires on any change, but that doesn't
help here), and I wouldn't expect there to be - these sound like very
specific cases that would never have support in the simple trigger
'language'.

On the other hand, I have been working on something that should give
you the ability to do something like this, by writing a module that
hooks into arbitrary trace events, accessing their fields, building up
any needed state across events, and then generating synthetic events as
needed:

https://git.kernel.org/pub/scm/linux/kernel/git/zanussi/linux-trace.git/log/?h=ftrace/in-kernel-events-v0

The documentation is in the top commit, and I'll include it below, but
the basic idea is that you can set up a 'live handler' for any event,
and when that event is hit, you look at the data and save it however
you need, or modify some state machine, etc, and the event is then
discarded since it's in soft enable mode and doesn't end up in the
trace stream unless you enable it.  At any point you can decide to
generate a synthetic event of your own design which will end up in the
trace output.

I don't know much about the RSS threshold case, but it seems you could
just set up a live handler for the RSS tracepoint and monitor it in the
module.  Then whenever it hits your granularity threshold or so, just
generate a new 'RSS threshold' synthetic event at that point, which
would then be the only thing in the trace buffer.  Well, I'm probably
oversimplifying the problem, but hopefully close enough.

If this code looks like it might be useful in general, let me know and
I'll clean it up - at this point it's only prototype-quality and needs
plenty of work.

Thanks,

Tom

[PATCH] trace: Documentation for in-kernel trace event API

Add Documentation for creating and generating synthetic events from
modules, along with instructions for creating custom event handlers
for existing events.

Signed-off-by: Tom Zanussi <zanussi@kernel.org>
---
 Documentation/trace/events.rst | 200 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 200 insertions(+)

diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst
index f7e1fcc0953c..b7d69b894ec0 100644
--- a/Documentation/trace/events.rst
+++ b/Documentation/trace/events.rst
@@ -525,3 +525,203 @@ The following commands are supported:
   event counts (hitcount).
 
   See Documentation/trace/histogram.rst for details and examples.
+
+6.3 In-kernel trace event API
+-----------------------------
+
+In most cases, the command-line interface to trace events is more than
+sufficient.  Sometimes, however, applications might find the need for
+more complex relationships than can be expressed through a simple
+series of linked command-line expressions, or putting together sets of
+commands may be simply too cumbersome.  An example might be an
+application that needs to 'listen' to the trace stream in order to
+maintain an in-kernel state machine detecting, for instance, when an
+illegal kernel state occurs in the scheduler.
+
+The trace event subsystem provides an in-kernel API allowing modules
+or other kernel code to access the trace stream in real-time, and
+simultaneously generate user-defined 'synthetic' events at will, which
+can be used to either augment the existing trace stream and/or signal
+that a particularly important state has occurred.
+
+The API provided for these purposes is described below and allows the
+following:
+
+  - creating and defining custom handlers for existing trace events
+  - dynamically creating synthetic event definitions
+  - generating synthetic events from custom handlers
+
+6.3.1 Creating and defining custom handlers for existing trace events
+---------------------------------------------------------------------
+
+To create a live event handler for an existing event in a kernel
+module, the create_live_handler() function should be used.  The name
+of the event and its subsystem should be specified, along with an
+event_trigger_ops object.  The event_trigger_ops object specifies the
+name of the callback function that will be called on every hit of the
+named event.  For example, to have the 'sched_switch_event_trigger()'
+callback called on every sched_switch event, the following code could
+be used:
+
+  static struct event_trigger_ops event_live_trigger_ops = {
+      .func			= sched_switch_event_trigger,
+      .print			= event_live_trigger_print,
+      .init			= event_live_trigger_init,
+      .free			= event_live_trigger_free,
+  };
+
+  data = create_live_handler("sched", "sched_switch", &event_live_trigger_ops);
+
+In order to access the event's trace data on an event hit, field
+accessors can be used.  When the callback function is invoked, an
+array of live_field 'accessors' is passed in as the private_data of
+the event_trigger_data passed to the handler.  A given event field's
+data can be accessed via the live_field struct pointer corresponding
+to that field, specifically by calling the live_field->fn() member,
+which will return the field data as a u64 that can be cast into the
+actual field type (is_string_field() can be used to determine and deal
+with string field types).
+
+For example, the following code will print out each field value for
+various types:
+
+  for (i = 0; i < MAX_ACCESSORS; i++) {
+      field = live_accessors->accessors[i];
+      if (!field)
+          break;
+
+      val = field->fn(field->field, rbe, rec);
+
+      if (is_string_field(field->field))
+          printk("\tval=%s (%s)\n", (char *)(long)val, field->field->name);
+      else
+          printk("\tval=%llu (%s)\n", val, field->field->name);
+  }
+
+  val = prev_comm_field->fn(prev_comm_field->field, rbe, rec);
+  printk("prev_comm: %s\n", (char *)val);
+  val = prev_pid_field->fn(prev_pid_field->field, rbe, rec);
+  printk("prev_pid: %d\n", (pid_t)val);
+
+  print_timestamp(live_accessors->timestamp_accessor, rec, rbe);
+
+In order for this to work, a set of field accessors needs to be
+created before the event is registered.  This is best done in the
+event_trigger_ops.init() callback, which is called from the
+create_live_handler() code.
+
+create_live_field_accessors() automatically creates and adds a live
+field accessor for each field in a trace event.
+
+Each field will have an accessor created for it and added to the
+live_accessors array for the event.  This includes the common event
+fields, but doesn't include the common_timestamp (which can be added
+manually using add_live_field_accessor() if needed).
+
+If you don't need or want accessors for every field, you can add them
+individually using add_live_field_accessor() for each one.
+
+A given field's accessor, for use in the handler, can be retrieved
+using find_live_field_accessor() with the field's name.
+
+For example, this call would create a set of live field accessors, one
+for each field of the sched_switch event:
+
+  live_accessors = create_live_field_accessors("sched", "sched_switch");
+
+To access a couple of specific sched_switch fields (the same fields we
+used above in the handler):
+
+  prev_pid_field = find_live_field_accessor(live_accessors, "prev_pid");
+  prev_comm_field = find_live_field_accessor(live_accessors, "prev_comm");
+
+To add a common_timestamp accessor, use this call:
+
+  ret = add_live_field_accessor(live_accessors, "common_timestamp");
+
+Note that the common_timestamp accessor is special and unlike all
+other accessors, can be accessed using the dedicated
+live_accessors->timestamp_accessor member.
+
+6.3.2 Dynamically creating synthetic event definitions
+------------------------------------------------------
+
+To create a new synthetic event from a kernel module, an empty
+synthetic event should first be created using
+create_empty_synth_event().  The name of the event should be
+supplied to this function.  For example, to create a new "livetest"
+synthetic event:
+
+  struct synth_event *se = create_empty_synth_event("livetest");
+
+Once a synthetic event object has been created, it can then be
+populated with fields.  Fields are added one by one using
+add_synth_field(), supplying the new synthetic event object, a field
+type, and a field name.  For example, to add a new int field named
+"intfield", the following call should be made:
+
+  ret = add_synth_field(se, "int", "intfield");
+
+See synth_field_size() for available types. If field_name contains [n]
+the field is considered to be an array.
+
+Once all the fields have been added, the event should be finalized and
+registered by calling the finalize_synth_event() function:
+
+  ret = finalize_synth_event(se);
+
+At this point, the event object is ready to be used for generating new
+events.
+
+6.3.3 Generating synthetic events from custom handlers
+------------------------------------------------------
+
+To generate a synthetic event, the generate_synth_event() function is
+used.  It's passed the trace_event_file representing the synthetic
+event (which can be retrieved using find_event_file() using the
+synthetic event name and "synthetic" as the system name, along with
+top_trace_array() as the trace array), along with an array of u64, one
+for each synthetic event field.
+
+The 'vals' array is just an array of u64, the number of which must
+match the number of fields in the synthetic event, and which must be in
+the same order as the synthetic event fields.
+
+All vals should be cast to u64, and string vals are just pointers to
+strings, cast to u64.  Strings will be copied into space reserved in
+the event for the string, using these pointers.
+
+So for a synthetic event that was created using the following:
+
+  se = create_empty_synth_event("schedtest");
+  add_synth_field(se, "pid_t", "next_pid_field");
+  add_synth_field(se, "char[16]", "next_comm_field");
+  add_synth_field(se, "u64", "ts_ns");
+  add_synth_field(se, "u64", "ts_ms");
+  add_synth_field(se, "unsigned int", "cpu");
+  add_synth_field(se, "char[64]", "my_string_field");
+  add_synth_field(se, "int", "my_int_field");
+  finalize_synth_event(se);
+
+  schedtest_event_file = find_event_file(top_trace_array(),
+                                         "synthetic", "schedtest");
+
+Code to generate a synthetic event might then look like this:
+
+  u64 vals[7];
+
+  ts_ns = get_timestamp(live_accessors->timestamp_accessor, rec, rbe, true);
+  ts_ms = get_timestamp(live_accessors->timestamp_accessor, rec, rbe, false);
+  cpu = smp_processor_id();
+  next_comm_val = next_comm_field->fn(next_comm_field->field, rbe, rec);
+  next_pid_val = next_pid_field->fn(next_pid_field->field, rbe, rec);
+
+  vals[0] = next_pid_val;
+  vals[1] = next_comm_val;
+  vals[2] = ts_ns;
+  vals[3] = ts_ms;
+  vals[4] = cpu;
+  vals[5] = (u64)"thneed";
+  vals[6] = 398;
+
+  generate_synth_event(schedtest_event_file, vals, sizeof(vals));
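
To make the RSS suggestion above concrete, here is a rough sketch of what
such a module could look like on top of this prototype API. Everything in
it is hypothetical glue: the API is prototype-quality and may change, the
callback signature is assumed from struct event_trigger_ops, the rss_stat
tracepoint is assumed to live in the "kmem" system with a signed "size"
field (units as the field defines them), "rss_thresh" is a made-up
synthetic event, and module plus error-handling boilerplate are omitted:

  #define RSS_DELTA 128   /* assumed threshold, in the 'size' field's units */

  static struct trace_event_file *thresh_file;
  static struct live_accessors *accessors;     /* type name guessed */
  static struct live_field *size_field;
  static long last_size;

  /* Assumed callback signature, per struct event_trigger_ops. */
  static void rss_stat_trigger(struct event_trigger_data *data,
                               void *rec, struct ring_buffer_event *rbe)
  {
          long size = (long)size_field->fn(size_field->field, rbe, rec);
          u64 vals[1];

          /* Delta-based throttle: stay quiet until RSS has moved by at
           * least RSS_DELTA since the last synthetic event we emitted. */
          if (abs(size - last_size) < RSS_DELTA)
                  return;
          last_size = size;

          vals[0] = (u64)size;
          generate_synth_event(thresh_file, vals, sizeof(vals));
  }

  static struct event_trigger_ops rss_live_ops = {
          .func = rss_stat_trigger,
          /* .print/.init/.free boilerplate omitted */
  };

  static int __init rss_thresh_init(void)
  {
          struct synth_event *se;

          /* Define the synthetic event carrying the throttled samples. */
          se = create_empty_synth_event("rss_thresh");
          add_synth_field(se, "s64", "size");
          finalize_synth_event(se);
          thresh_file = find_event_file(top_trace_array(),
                                        "synthetic", "rss_thresh");

          /* Listen to every rss_stat hit; the raw events stay out of
           * the trace buffer since the handler runs in soft-enable
           * mode, so only rss_thresh events reach the trace stream. */
          accessors = create_live_field_accessors("kmem", "rss_stat");
          size_field = find_live_field_accessor(accessors, "size");
          create_live_handler("kmem", "rss_stat", &rss_live_ops);
          return 0;
  }

The interesting property is that the throttling policy lives entirely in
the module, while the tracepoint itself stays a plain, unconditional event.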
Daniel Colascione Sept. 5, 2019, 8:24 p.m. UTC | #28
On Thu, Sep 5, 2019 at 12:56 PM Tom Zanussi <zanussi@kernel.org> wrote:
> On Thu, 2019-09-05 at 13:51 -0400, Joel Fernandes wrote:
> > On Thu, Sep 05, 2019 at 01:47:05PM -0400, Joel Fernandes wrote:
> > > On Thu, Sep 05, 2019 at 01:35:07PM -0400, Steven Rostedt wrote:
> > > >
> > > >
> > > > [ Added Tom ]
> > > >
> > > > On Thu, 5 Sep 2019 09:03:01 -0700
> > > > Suren Baghdasaryan <surenb@google.com> wrote:
> > > >
> > > > > On Thu, Sep 5, 2019 at 7:43 AM Michal Hocko <mhocko@kernel.org>
> > > > > wrote:
> > > > > >
> > > > > > [Add Steven]
> > > > > >
> > > > > > On Wed 04-09-19 12:28:08, Joel Fernandes wrote:
> > > > > > > On Wed, Sep 4, 2019 at 11:38 AM Michal Hocko <mhocko@kernel
> > > > > > > .org> wrote:
> > > > > > > >
> > > > > > > > On Wed 04-09-19 11:32:58, Joel Fernandes wrote:
> > > > > >
> > > > > > [...]
> > > > > > > > > but also for reducing
> > > > > > > > > tracing noise. Flooding the traces makes it less useful
> > > > > > > > > for long traces and
> > > > > > > > > post-processing of traces. IOW, the overhead reduction
> > > > > > > > > is a bonus.
> > > > > > > >
> > > > > > > > This is not really anything special for this tracepoint
> > > > > > > > though.
> > > > > > > > Basically any tracepoint in a hot path is in the same
> > > > > > > > situation and I do
> > > > > > > > not see a point why each of them should really invent its
> > > > > > > > own way to
> > > > > > > > throttle. Maybe there is some way to do that in the
> > > > > > > > tracing subsystem
> > > > > > > > directly.
> > > > > > >
> > > > > > > I am not sure if there is a way to do this easily. Add to
> > > > > > > that, the fact that
> > > > > > > you still have to call into trace events. Why call into it
> > > > > > > at all, if you can
> > > > > > > filter in advance and have a sane filtering default?
> > > > > > >
> > > > > > > The bigger improvement with the threshold is the number of
> > > > > > > trace records are
> > > > > > > almost halved by using a threshold. The number of records
> > > > > > > went from 4.6K to
> > > > > > > 2.6K.
> > > > > >
> > > > > > Steven, would it be feasible to add a generic tracepoint
> > > > > > throttling?
> > > > >
> > > > > I might misunderstand this but is the issue here actually
> > > > > throttling
> > > > > of the sheer number of trace records or tracing large enough
> > > > > changes
> > > > > to RSS that user might care about? Small changes happen all the
> > > > > time
> > > > > but we are likely not interested in those. Surely we could
> > > > > postprocess
> > > > > the traces to extract changes large enough to be interesting
> > > > > but why
> > > > > capture uninteresting information in the first place? IOW the
> > > > > throttling here should be based not on the time between traces
> > > > > but on
> > > > > the amount of change of the traced signal. Maybe a generic
> > > > > facility
> > > > > like that would be a good idea?
> > > >
> > > > You mean like add a trigger (or filter) that only traces if a
> > > > field has
> > > > changed since the last time the trace was hit? Hmm, I think we
> > > > could
> > > > possibly do that. Perhaps even now with histogram triggers?
> > >
> > >
> > > Hey Steve,
> > >
> > > Something like an analog to digitial coversion function where you
> > > lose the
> > > granularity of the signal depending on how much trace data:
> > > https://www.globalspec.com/ImageRepository/LearnMore/20142/9ee38d1a
> > > 85d37fa23f86a14d3a9776ff67b0ec0f3b.gif
> >
> > s/how much trace data/what the resolution is/
> >
> > > so like, if you had a counter incrementing with values after the
> > > increments
> > > as:  1,3,4,8,12,14,30 and say 5 is the threshold at which to emit a
> > > trace,
> > > then you would get 1,8,12,30.
> > >
> > > So I guess what is need is a way to reduce the quantiy of trace
> > > data this
> > > way. For this usecase, the user mostly cares about spikes in the
> > > counter
> > > changing that accurate values of the different points.
> >
> > s/that accurate/than accurate/
> >
> > I think Tim, Suren, Dan and Michal are all saying the same thing as
> > well.
> >
>
> There's not a way to do this using existing triggers (histogram
> triggers have an onchange() that fires on any change, but that doesn't
> help here), and I wouldn't expect there to be - these sound like very
> specific cases that would never have support in the simple trigger
> 'language'.

I don't see the filtering under discussion as some "very specific"
esoteric need. You need this general kind of mechanism any time you
want to monitor at low frequency a thing that changes at high
frequency. The general pattern isn't specific to RSS or even memory in
general. One might imagine, say, wanting to trace large changes in TCP
window sizes. Any time something in the kernel has a "level" and that
level changes at high frequency and we want to learn about big swings
in that level, the mechanism we're talking about becomes useful. I
don't think it should be out of bounds for the histogram mechanism,
which is *almost* there right now. We already have the ability to
accumulate values derived from ftrace events into tables keyed on
various fields in these events and things like onmax().

> On the other hand, I have been working on something that should give
> you the ability to do something like this, by writing a module that
> hooks into arbitrary trace events, accessing their fields, building up
> any needed state across events, and then generating synthetic events as
> needed:

You might as well say we shouldn't have tracepoints at all and that
people should just write modules that kprobe what they need. :-) You
can reject *any* kernel interface by suggesting that people write a
module to do that thing. (You could also probably do something with
eBPF.) But there's a lot of value to having an easy-to-use
general-purpose mechanism that doesn't make people break out the
kernel headers and a C compiler.
Tom Zanussi Sept. 5, 2019, 8:32 p.m. UTC | #29
Hi,

On Thu, 2019-09-05 at 13:24 -0700, Daniel Colascione wrote:
> On Thu, Sep 5, 2019 at 12:56 PM Tom Zanussi <zanussi@kernel.org>
> wrote:
> > On Thu, 2019-09-05 at 13:51 -0400, Joel Fernandes wrote:
> > > On Thu, Sep 05, 2019 at 01:47:05PM -0400, Joel Fernandes wrote:
> > > > On Thu, Sep 05, 2019 at 01:35:07PM -0400, Steven Rostedt wrote:
> > > > > 
> > > > > 
> > > > > [ Added Tom ]
> > > > > 
> > > > > On Thu, 5 Sep 2019 09:03:01 -0700
> > > > > Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > 
> > > > > > On Thu, Sep 5, 2019 at 7:43 AM Michal Hocko <mhocko@kernel.
> > > > > > org>
> > > > > > wrote:
> > > > > > > 
> > > > > > > [Add Steven]
> > > > > > > 
> > > > > > > On Wed 04-09-19 12:28:08, Joel Fernandes wrote:
> > > > > > > > On Wed, Sep 4, 2019 at 11:38 AM Michal Hocko <mhocko@ke
> > > > > > > > rnel
> > > > > > > > .org> wrote:
> > > > > > > > > 
> > > > > > > > > On Wed 04-09-19 11:32:58, Joel Fernandes wrote:
> > > > > > > 
> > > > > > > [...]
> > > > > > > > > > but also for reducing
> > > > > > > > > > tracing noise. Flooding the traces makes it less
> > > > > > > > > > useful
> > > > > > > > > > for long traces and
> > > > > > > > > > post-processing of traces. IOW, the overhead
> > > > > > > > > > reduction
> > > > > > > > > > is a bonus.
> > > > > > > > > 
> > > > > > > > > This is not really anything special for this
> > > > > > > > > tracepoint
> > > > > > > > > though.
> > > > > > > > > Basically any tracepoint in a hot path is in the same
> > > > > > > > > situation and I do
> > > > > > > > > not see a point why each of them should really invent
> > > > > > > > > its
> > > > > > > > > own way to
> > > > > > > > > throttle. Maybe there is some way to do that in the
> > > > > > > > > tracing subsystem
> > > > > > > > > directly.
> > > > > > > > 
> > > > > > > > I am not sure if there is a way to do this easily. Add
> > > > > > > > to
> > > > > > > > that, the fact that
> > > > > > > > you still have to call into trace events. Why call into
> > > > > > > > it
> > > > > > > > at all, if you can
> > > > > > > > filter in advance and have a sane filtering default?
> > > > > > > > 
> > > > > > > > The bigger improvement with the threshold is the number
> > > > > > > > of
> > > > > > > > trace records are
> > > > > > > > almost halved by using a threshold. The number of
> > > > > > > > records
> > > > > > > > went from 4.6K to
> > > > > > > > 2.6K.
> > > > > > > 
> > > > > > > Steven, would it be feasible to add a generic tracepoint
> > > > > > > throttling?
> > > > > > 
> > > > > > I might misunderstand this but is the issue here actually
> > > > > > throttling
> > > > > > of the sheer number of trace records or tracing large
> > > > > > enough
> > > > > > changes
> > > > > > to RSS that user might care about? Small changes happen all
> > > > > > the
> > > > > > time
> > > > > > but we are likely not interested in those. Surely we could
> > > > > > postprocess
> > > > > > the traces to extract changes large enough to be
> > > > > > interesting
> > > > > > but why
> > > > > > capture uninteresting information in the first place? IOW
> > > > > > the
> > > > > > throttling here should be based not on the time between
> > > > > > traces
> > > > > > but on
> > > > > > the amount of change of the traced signal. Maybe a generic
> > > > > > facility
> > > > > > like that would be a good idea?
> > > > > 
> > > > > You mean like add a trigger (or filter) that only traces if a
> > > > > field has
> > > > > changed since the last time the trace was hit? Hmm, I think
> > > > > we
> > > > > could
> > > > > possibly do that. Perhaps even now with histogram triggers?
> > > > 
> > > > 
> > > > Hey Steve,
> > > > 
> > > > Something like an analog to digitial coversion function where
> > > > you
> > > > lose the
> > > > granularity of the signal depending on how much trace data:
> > > > https://www.globalspec.com/ImageRepository/LearnMore/20142/9ee3
> > > > 8d1a
> > > > 85d37fa23f86a14d3a9776ff67b0ec0f3b.gif
> > > 
> > > s/how much trace data/what the resolution is/
> > > 
> > > > so like, if you had a counter incrementing with values after
> > > > the
> > > > increments
> > > > as:  1,3,4,8,12,14,30 and say 5 is the threshold at which to
> > > > emit a
> > > > trace,
> > > > then you would get 1,8,12,30.
> > > > 
> > > > So I guess what is need is a way to reduce the quantiy of trace
> > > > data this
> > > > way. For this usecase, the user mostly cares about spikes in
> > > > the
> > > > counter
> > > > changing that accurate values of the different points.
> > > 
> > > s/that accurate/than accurate/
> > > 
> > > I think Tim, Suren, Dan and Michal are all saying the same thing
> > > as
> > > well.
> > > 
> > 
> > There's not a way to do this using existing triggers (histogram
> > triggers have an onchange() that fires on any change, but that
> > doesn't
> > help here), and I wouldn't expect there to be - these sound like
> > very
> > specific cases that would never have support in the simple trigger
> > 'language'.
> 
> I don't see the filtering under discussion as some "very specific"
> esoteric need. You need this general kind of mechanism any time you
> want to monitor at low frequency a thing that changes at high
> frequency. The general pattern isn't specific to RSS or even memory
> in
> general. One might imagine, say, wanting to trace large changes in
> TCP
> window sizes. Any time something in the kernel has a "level" and that
> level changes at high frequency and we want to learn about big swings
> in that level, the mechanism we're talking about becomes useful. I
> don't think it should be out of bounds for the histogram mechanism,
> which is *almost* there right now. We already have the ability to
> accumulate values derived from ftrace events into tables keyed on
> various fields in these events and things like onmax().
> 
> > On the other hand, I have been working on something that should
> > give
> > you the ability to do something like this, by writing a module that
> > hooks into arbitrary trace events, accessing their fields, building
> > up
> > any needed state across events, and then generating synthetic
> > events as
> > needed:
> 
> You might as well say we shouldn't have tracepoints at all and that
> people should just write modules that kprobe what they need. :-) You
> can reject *any* kernel interface by suggesting that people write a
> module to do that thing. (You could also probably do something with
> eBPF.) But there's a lot of value to having an easy-to-use
> general-purpose mechanism that doesn't make people break out the
> kernel headers and a C compiler.

Oh, I didn't mean to reject any interface - I guess I should go read
the whole thread then, and find the interface you're talking about.

Tom
Tom Zanussi Sept. 5, 2019, 9:14 p.m. UTC | #30
On Thu, 2019-09-05 at 13:24 -0700, Daniel Colascione wrote:
> On Thu, Sep 5, 2019 at 12:56 PM Tom Zanussi <zanussi@kernel.org>
> wrote:
> > [...]
> > 
> > There's not a way to do this using existing triggers (histogram
> > triggers have an onchange() that fires on any change, but that
> > doesn't help here), and I wouldn't expect there to be - these sound
> > like very specific cases that would never have support in the simple
> > trigger 'language'.
> 
> I don't see the filtering under discussion as some "very specific"
> esoteric need. You need this general kind of mechanism any time you
> want to monitor at low frequency a thing that changes at high
> frequency. The general pattern isn't specific to RSS or even memory in
> general. One might imagine, say, wanting to trace large changes in TCP
> window sizes. Any time something in the kernel has a "level" and that
> level changes at high frequency and we want to learn about big swings
> in that level, the mechanism we're talking about becomes useful. I
> don't think it should be out of bounds for the histogram mechanism,
> which is *almost* there right now. We already have the ability to
> accumulate values derived from ftrace events into tables keyed on
> various fields in these events and things like onmax().
> 

OK, so with the histograms we already have onchange(), which triggers
on any change.

Would it be sufficient to just add a 'threshold' param to that, i.e.
onchange(x) means trigger whenever the difference between the new value
and the previous value is >= x?
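Roughly, in C (names made up here; the real state would live per
histogram entry in the trigger's tracing map, not in a single static):

	#include <linux/kernel.h>	/* abs() */

	static long prev_value;

	/* One reading of 'previous value': the last value seen.  Another
	 * reading would compare against the last value that actually
	 * fired the trigger.
	 */
	static bool onchange_threshold(long new_value, long x)
	{
		bool fire = abs(new_value - prev_value) >= x;

		prev_value = new_value;
		return fire;
	}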

Tom
Daniel Colascione Sept. 5, 2019, 10:12 p.m. UTC | #31
On Thu, Sep 5, 2019 at 2:14 PM Tom Zanussi <zanussi@kernel.org> wrote:
>
> [...]
>
> OK, so with the histograms we already have onchange(), which triggers
> on any change.
>
> Would it be sufficient to just add a 'threshold' param to that, i.e.
> onchange(x) means trigger whenever the difference between the new value
> and the previous value is >= x?

By previous value, do you mean previously-reported value or the
previously-set value? If the former, that's good, but we may be able
to do better (see below). If the latter, I don't think that's quite
right, because then we could miss an arbitrarily-large change in
"level" so long as it occurred in sufficiently small steps.

Basically, what I have in mind is this:

1) attach a trigger to a tracepoint that contains some
absolutely-specified "level" (say, task private RSS),

2) in the trigger, take the absolute value of the difference between
the new "level" (some field in that tracepoint) and the last "level"
we have recorded for that value under configurable partitioning
criteria (e.g., pid, uid, cpu, NUMA node), yielding an absdelta,

3) accumulate absdelta values in some table partitioned on the same
fields as in #2, and

4) after updating accumulated absdelta, evaluate a filter expression,
and if the filter expression evaluates to true, emit a tracepoint
saying "the new value of $LEVEL for $PARTITION is $VALUE" and reset
the accumulated absdelta to zero.

I think this mechanism would give us what we wanted in a general and
powerful way, and we can dial the granularity up or down however we want.
The reason I want to accumulate absdelta values instead of just firing
when the previously-reported value differs sufficiently from the
last-set value is so we can tell whether a counter is fluctuating a
lot without its value actually changing. The trigger expression could
then allow any combination of conditions, e.g., "fire a tracepoint
when the accumulated change is greater than 2MB _OR_ a single change
is greater than 1MB _OR_ when we've gone 10 changes of this level
value without reporting the level's new value".
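
A minimal sketch of the per-key state I mean, in plain C (names made
up; in a real implementation this state would live in a histogram
table keyed on the partitioning fields, and the 2MB/1MB/10 numbers are
just the example thresholds above):

	#include <linux/kernel.h>	/* abs() */
	#include <linux/sizes.h>	/* SZ_1M, SZ_2M */

	struct level_state {
		long last_level;	/* last "level" seen for this key */
		long acc_absdelta;	/* accumulated |change| since last report */
		int changes;		/* updates since last report */
	};

	/* Returns true when we should emit the synthetic "new value of
	 * $LEVEL for $PARTITION is $VALUE" event; resets the accumulator
	 * when it fires.
	 */
	static bool level_update(struct level_state *s, long level)
	{
		long absdelta = abs(level - s->last_level);

		s->last_level = level;
		s->acc_absdelta += absdelta;
		s->changes++;

		if (s->acc_absdelta > SZ_2M || absdelta > SZ_1M ||
		    s->changes >= 10) {
			s->acc_absdelta = 0;
			s->changes = 0;
			return true;
		}
		return false;
	}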

Let me know if this description doesn't make sense.
Daniel Colascione Sept. 5, 2019, 10:51 p.m. UTC | #32
On Thu, Sep 5, 2019 at 3:12 PM Daniel Colascione <dancol@google.com> wrote:
> Basically, what I have in mind is this:

Actually --- I wonder whether there's already enough power in the
trigger mechanism to do this without any code changes to ftrace
histograms themselves. I'm trying to think of the minimum additional
kernel facility that we'd need to implement the scheme I described
above, and it might be that we don't need to do anything at all except
add the actual level tracepoints.
Joel Fernandes Sept. 6, 2019, 12:59 a.m. UTC | #33
On Thu, Sep 05, 2019 at 10:50:27AM -0700, Daniel Colascione wrote:
> On Thu, Sep 5, 2019 at 10:35 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> > [...]
> >
> > You mean like add a trigger (or filter) that only traces if a field has
> > changed since the last time the trace was hit? Hmm, I think we could
> > possibly do that. Perhaps even now with histogram triggers?
> 
> I was thinking along the same lines. The histogram subsystem seems
> like a very good fit here. Histogram triggers already let users talk
> about specific fields of trace events, aggregate them in configurable
> ways, and (importantly, IMHO) create synthetic new trace events that
> the kernel emits under configurable conditions.

Hmm, I think this tracing feature will be a good idea. But in order not to
gate this patch, can we agree on keeping a temporary threshold for this
patch? Once such an idea is implemented in the trace subsystem, we can
remove the temporary filter.

As Tim said, we don't want our traces flooded, and this is a very useful
tracepoint as proven by our internal usage at Android. The threshold filter
is just a few lines of code.

thanks,

 - Joel
Daniel Colascione Sept. 6, 2019, 1:15 a.m. UTC | #34
On Thu, Sep 5, 2019 at 5:59 PM Joel Fernandes <joel@joelfernandes.org> wrote:
> On Thu, Sep 05, 2019 at 10:50:27AM -0700, Daniel Colascione wrote:
> > On Thu, Sep 5, 2019 at 10:35 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> > > [...]
> > >
> > > You mean like add a trigger (or filter) that only traces if a field has
> > > changed since the last time the trace was hit? Hmm, I think we could
> > > possibly do that. Perhaps even now with histogram triggers?
> >
> > I was thinking along the same lines. The histogram subsystem seems
> > like a very good fit here. Histogram triggers already let users talk
> > about specific fields of trace events, aggregate them in configurable
> > ways, and (importantly, IMHO) create synthetic new trace events that
> > the kernel emits under configurable conditions.
>
> Hmm, I think this tracing feature will be a good idea. But in order not to
> gate this patch, can we agree on keeping a temporary threshold for this
> patch? Once such an idea is implemented in the trace subsystem, we can
> remove the temporary filter.
>
> As Tim said, we don't want our traces flooded, and this is a very useful
> tracepoint as proven by our internal usage at Android. The threshold filter
> is just a few lines of code.

I'm not sure the threshold filtering code you've added does the right
thing: we don't keep state, so if a counter constantly flips between
one "side" of the TRACE_MM_COUNTER_THRESHOLD and the other, we'll emit
ftrace events at high frequency. More generally, this filtering
couples the rate of counter logging to the *value* of the counter ---
that is, we log ftrace events at different times depending on how much
memory we happen to have used --- and that's not ideal from a
predictability POV.
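
Concretely, the check in the patch below is

	long thresh_mask = ~(TRACE_MM_COUNTER_THRESHOLD - 1);

	if ((count & thresh_mask) != ((count - value) & thresh_mask))
		trace_rss_stat(member, count);

so with the 128-page threshold, an mm whose counter bounces between
127 and 128 pages crosses the mask boundary on every single update and
traces every time, while an mm drifting anywhere between 1 and 127
pages never traces at all.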

All things being equal, I'd prefer that we get things upstream as fast
as possible. But in this case, I'd rather wait for a general-purpose
filtering facility (whether that facility is based on histogram, eBPF,
or something else) rather than hardcode one particular fixed filtering
strategy (which might be suboptimal) for one particular kind of event.
Is there some special urgency here?

How about we instead add non-filtered tracepoints for the mm counters?
These tracepoints will still be free when turned off.
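
(E.g., just the helper from the patch below with the threshold check
dropped -- the tracepoint itself is already behind a static key, so it
stays cheap when the event is off:

	void mm_trace_rss_stat(int member, long count)
	{
		trace_rss_stat(member, count);
	}

with callers no longer needing to pass the per-update delta.)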

Having added the basic tracepoints, we can discuss separately how to
do the rate limiting. Maybe instead of providing direct support for
the algorithm that I described above, we can just use a BPF program as
a yes/no predicate for whether to log to ftrace. That'd get us to the
same place as this patch, but more flexibly, right?
Joel Fernandes Sept. 6, 2019, 3:01 a.m. UTC | #35
On Thu, Sep 05, 2019 at 06:15:43PM -0700, Daniel Colascione wrote:
[snip]
> 
> I'm not sure the threshold filtering code you've added does the right
> thing: we don't keep state, so if a counter constantly flips between
> one "side" of the TRACE_MM_COUNTER_THRESHOLD and the other, we'll emit
> ftrace events at high frequency. More generally, this filtering
> couples the rate of counter logging to the *value* of the counter ---
> that is, we log ftrace events at different times depending on how much
> memory we happen to have used --- and that's not ideal from a
> predictability POV.
> 
> All things being equal, I'd prefer that we get things upstream as fast
> as possible. But in this case, I'd rather wait for a general-purpose
> filtering facility (whether that facility is based on histogram, eBPF,
> or something else) rather than hardcode one particular fixed filtering
> strategy (which might be suboptimal) for one particular kind of event.
> Is there some special urgency here?
> 
> How about we instead add non-filtered tracepoints for the mm counters?
> These tracepoints will still be free when turned off.
> 
> Having added the basic tracepoints, we can discuss separately how to
> do the rate limiting. Maybe instead of providing direct support for
> the algorithm that I described above, we can just use a BPF program as
> a yes/no predicate for whether to log to ftrace. That'd get us to the
> same place as this patch, but more flexibly, right?

Chatted with Daniel offline, and we agreed on removing the threshold --
which is also what Michal wants.

So I'll be resubmitting this patch with the threshold removed; we'll then
work on filtering through more generic mechanisms such as BPF.

thanks all!

 - Joel
diff mbox series

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0334ca97c584..823aaf759bdb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1671,19 +1671,27 @@  static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
 	return (unsigned long)val;
 }
 
+void mm_trace_rss_stat(int member, long count, long value);
+
 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
 {
-	atomic_long_add(value, &mm->rss_stat.count[member]);
+	long count = atomic_long_add_return(value, &mm->rss_stat.count[member]);
+
+	mm_trace_rss_stat(member, count, value);
 }
 
 static inline void inc_mm_counter(struct mm_struct *mm, int member)
 {
-	atomic_long_inc(&mm->rss_stat.count[member]);
+	long count = atomic_long_inc_return(&mm->rss_stat.count[member]);
+
+	mm_trace_rss_stat(member, count, 1);
 }
 
 static inline void dec_mm_counter(struct mm_struct *mm, int member)
 {
-	atomic_long_dec(&mm->rss_stat.count[member]);
+	long count = atomic_long_dec_return(&mm->rss_stat.count[member]);
+
+	mm_trace_rss_stat(member, count, -1);
 }
 
 /* Optimized variant when page is already known not to be PageAnon */
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index eb57e3037deb..8b88e04fafbf 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -315,6 +315,27 @@  TRACE_EVENT(mm_page_alloc_extfrag,
 		__entry->change_ownership)
 );
 
+TRACE_EVENT(rss_stat,
+
+	TP_PROTO(int member,
+		long count),
+
+	TP_ARGS(member, count),
+
+	TP_STRUCT__entry(
+		__field(int, member)
+		__field(long, size)
+	),
+
+	TP_fast_assign(
+		__entry->member = member;
+		__entry->size = (count << PAGE_SHIFT);
+	),
+
+	TP_printk("member=%d size=%ldB",
+		__entry->member,
+		__entry->size)
+	);
 #endif /* _TRACE_KMEM_H */
 
 /* This part must be outside protection */
diff --git a/mm/memory.c b/mm/memory.c
index e2bb51b6242e..9d81322c24a3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -72,6 +72,8 @@ 
 #include <linux/oom.h>
 #include <linux/numa.h>
 
+#include <trace/events/kmem.h>
+
 #include <asm/io.h>
 #include <asm/mmu_context.h>
 #include <asm/pgalloc.h>
@@ -140,6 +142,24 @@  static int __init init_zero_pfn(void)
 }
 core_initcall(init_zero_pfn);
 
+/*
+ * This threshold is the boundary in the value space that the counter has to
+ * advance before we trace it. Should be a power of 2. It is to reduce unwanted
+ * trace overhead. The counter is in units of number of pages.
+ */
+#define TRACE_MM_COUNTER_THRESHOLD 128
+
+void mm_trace_rss_stat(int member, long count, long value)
+{
+	long thresh_mask = ~(TRACE_MM_COUNTER_THRESHOLD - 1);
+
+	if (!trace_rss_stat_enabled())
+		return;
+
+	/* Threshold roll-over, trace it */
+	if ((count & thresh_mask) != ((count - value) & thresh_mask))
+		trace_rss_stat(member, count);
+}
 
 #if defined(SPLIT_RSS_COUNTING)