diff mbox series

memcg: add tracing for memcg stat updates

Message ID 20241010003550.3695245-1-shakeel.butt@linux.dev (mailing list archive)
State New
Headers show
Series memcg: add tracing for memcg stat updates | expand

Commit Message

Shakeel Butt Oct. 10, 2024, 12:35 a.m. UTC
The memcg stats are maintained in the rstat infrastructure, which provides
a very fast update side and a reasonable read side. However, memcg added
a plethora of stats and made the read side, which is the cgroup rstat
flush, very slow. To solve that, a threshold was added on the memcg stats
read side, i.e. there is no need to flush the stats if the updates are
within the threshold.

This threshold based improvement worked for some time, but more stats were
added to memcg, and the read codepath started getting triggered in
performance sensitive paths, which made the threshold based ratelimiting
ineffective. We need more visibility into the hot and cold stats, i.e.
stats with a lot of updates. Let's add tracing to get that visibility.

Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
 include/trace/events/memcg.h | 59 ++++++++++++++++++++++++++++++++++++
 mm/memcontrol.c              | 13 ++++++--
 2 files changed, 70 insertions(+), 2 deletions(-)
 create mode 100644 include/trace/events/memcg.h

Comments

Yosry Ahmed Oct. 10, 2024, 12:46 a.m. UTC | #1
On Wed, Oct 9, 2024 at 5:36 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> The memcg stats are maintained in rstat infrastructure which provides
> very fast updates side and reasonable read side. However memcg added
> plethora of stats and made the read side, which is cgroup rstat flush,
> very slow. To solve that, threshold was added in the memcg stats read
> side i.e. no need to flush the stats if updates are within the
> threshold.
>
> This threshold based improvement worked for sometime but more stats were
> added to memcg and also the read codepath was getting triggered in the
> performance sensitive paths which made threshold based ratelimiting
> ineffective. We need more visibility into the hot and cold stats i.e.
> stats with a lot of updates. Let's add trace to get that visibility.

Good idea, thanks for doing this.

>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> ---
>  include/trace/events/memcg.h | 59 ++++++++++++++++++++++++++++++++++++
>  mm/memcontrol.c              | 13 ++++++--
>  2 files changed, 70 insertions(+), 2 deletions(-)
>  create mode 100644 include/trace/events/memcg.h
>
> diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h
> new file mode 100644
> index 000000000000..913db9aba580
> --- /dev/null
> +++ b/include/trace/events/memcg.h
> @@ -0,0 +1,59 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM memcg
> +
> +#if !defined(_TRACE_MEMCG_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_MEMCG_H
> +
> +#include <linux/memcontrol.h>
> +#include <linux/tracepoint.h>
> +
> +
> +DECLARE_EVENT_CLASS(memcg_rstat,
> +
> +       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> +
> +       TP_ARGS(memcg, item, val),
> +
> +       TP_STRUCT__entry(
> +               __field(u64, id)
> +               __field(int, item)
> +               __field(int, val)
> +       ),
> +
> +       TP_fast_assign(
> +               __entry->id = cgroup_id(memcg->css.cgroup);
> +               __entry->item = item;
> +               __entry->val = val;
> +       ),
> +
> +       TP_printk("memcg_id=%llu item=%d val=%d",
> +                 __entry->id, __entry->item, __entry->val)
> +);
> +
> +DEFINE_EVENT(memcg_rstat, mod_memcg_state,
> +
> +       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> +
> +       TP_ARGS(memcg, item, val)
> +);
> +
> +DEFINE_EVENT(memcg_rstat, mod_memcg_lruvec_state,
> +
> +       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> +
> +       TP_ARGS(memcg, item, val)
> +);
> +
> +DEFINE_EVENT(memcg_rstat, count_memcg_events,
> +
> +       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> +
> +       TP_ARGS(memcg, item, val)
> +);
> +
> +
> +#endif /* _TRACE_MEMCG_H */
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index c098fd7f5c5e..17af08367c68 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -71,6 +71,10 @@
>
>  #include <linux/uaccess.h>
>
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/memcg.h>
> +#undef CREATE_TRACE_POINTS
> +
>  #include <trace/events/vmscan.h>
>
>  struct cgroup_subsys memory_cgrp_subsys __read_mostly;
> @@ -682,7 +686,9 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
>                 return;
>
>         __this_cpu_add(memcg->vmstats_percpu->state[i], val);
> -       memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
> +       val = memcg_state_val_in_pages(idx, val);
> +       memcg_rstat_updated(memcg, val);
> +       trace_mod_memcg_state(memcg, idx, val);

Is it too unreasonable to include the stat name?

The index has to be correlated with the kernel config and perhaps even
version. It's not a big deal, but if performance is not a concern when
tracing is enabled anyway, maybe we can lookup the name here (or in
TP_fast_assign()).

>  }
>
>  /* idx can be of type enum memcg_stat_item or node_stat_item. */
> @@ -741,7 +747,9 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
>         /* Update lruvec */
>         __this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
>
> -       memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
> +       val = memcg_state_val_in_pages(idx, val);
> +       memcg_rstat_updated(memcg, val);
> +       trace_mod_memcg_lruvec_state(memcg, idx, val);
>         memcg_stats_unlock();
>  }
>
> @@ -832,6 +840,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
>         memcg_stats_lock();
>         __this_cpu_add(memcg->vmstats_percpu->events[i], count);
>         memcg_rstat_updated(memcg, count);
> +       trace_count_memcg_events(memcg, idx, count);
>         memcg_stats_unlock();
>  }
>
> --
> 2.43.5
>
Roman Gushchin Oct. 10, 2024, 12:46 a.m. UTC | #2
On Wed, Oct 09, 2024 at 05:35:50PM -0700, Shakeel Butt wrote:
> The memcg stats are maintained in rstat infrastructure which provides
> very fast updates side and reasonable read side. However memcg added
> plethora of stats and made the read side, which is cgroup rstat flush,
> very slow. To solve that, threshold was added in the memcg stats read
> side i.e. no need to flush the stats if updates are within the
> threshold.
> 
> This threshold based improvement worked for sometime but more stats were
> added to memcg and also the read codepath was getting triggered in the
> performance sensitive paths which made threshold based ratelimiting
> ineffective. We need more visibility into the hot and cold stats i.e.
> stats with a lot of updates. Let's add trace to get that visibility.
> 
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>

Acked-by: Roman Gushchin <roman.gushchin@linux.dev>

Thanks!
Steven Rostedt Oct. 10, 2024, 1:08 a.m. UTC | #3
On Wed, 9 Oct 2024 17:46:22 -0700
Yosry Ahmed <yosryahmed@google.com> wrote:

> > +++ b/mm/memcontrol.c
> > @@ -71,6 +71,10 @@
> >
> >  #include <linux/uaccess.h>
> >
> > +#define CREATE_TRACE_POINTS
> > +#include <trace/events/memcg.h>
> > +#undef CREATE_TRACE_POINTS
> > +
> >  #include <trace/events/vmscan.h>
> >
> >  struct cgroup_subsys memory_cgrp_subsys __read_mostly;
> > @@ -682,7 +686,9 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
> >                 return;
> >
> >         __this_cpu_add(memcg->vmstats_percpu->state[i], val);
> > -       memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
> > +       val = memcg_state_val_in_pages(idx, val);
> > +       memcg_rstat_updated(memcg, val);
> > +       trace_mod_memcg_state(memcg, idx, val);  
> 
> Is it too unreasonable to include the stat name?
> 
> The index has to be correlated with the kernel config and perhaps even
> version. It's not a big deal, but if performance is not a concern when
> tracing is enabled anyway, maybe we can lookup the name here (or in
> TP_fast_assign()).

What name? Is it looked up from idx? If so, you can do it on the reading of
the trace event where performance is not an issue. See the __print_symbolic()
and friends in samples/trace_events/trace-events-sample.h

-- Steve
Yosry Ahmed Oct. 10, 2024, 1:24 a.m. UTC | #4
On Wed, Oct 9, 2024 at 6:08 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Wed, 9 Oct 2024 17:46:22 -0700
> Yosry Ahmed <yosryahmed@google.com> wrote:
>
> > > +++ b/mm/memcontrol.c
> > > @@ -71,6 +71,10 @@
> > >
> > >  #include <linux/uaccess.h>
> > >
> > > +#define CREATE_TRACE_POINTS
> > > +#include <trace/events/memcg.h>
> > > +#undef CREATE_TRACE_POINTS
> > > +
> > >  #include <trace/events/vmscan.h>
> > >
> > >  struct cgroup_subsys memory_cgrp_subsys __read_mostly;
> > > @@ -682,7 +686,9 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
> > >                 return;
> > >
> > >         __this_cpu_add(memcg->vmstats_percpu->state[i], val);
> > > -       memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
> > > +       val = memcg_state_val_in_pages(idx, val);
> > > +       memcg_rstat_updated(memcg, val);
> > > +       trace_mod_memcg_state(memcg, idx, val);
> >
> > Is it too unreasonable to include the stat name?
> >
> > The index has to be correlated with the kernel config and perhaps even
> > version. It's not a big deal, but if performance is not a concern when
> > tracing is enabled anyway, maybe we can lookup the name here (or in
> > TP_fast_assign()).
>
> What name? Is it looked up from idx? If so, you can do it on the reading of
> the trace event where performance is not an issue. See the __print_symbolic()
> and friends in samples/trace_events/trace-events-sample.h

Yeah they can be found using idx. Thanks for referring us to
__print_symbolic(), I suppose for this to work we need to construct an
array of {idx, name}. I think we can replace the existing memory_stats
and memcg1_stats/memcg1_stat_names arrays with something that we can
reuse for tracing, so we wouldn't need to consume extra space.

Shakeel, what do you think?
Shakeel Butt Oct. 10, 2024, 5:26 p.m. UTC | #5
On Wed, Oct 09, 2024 at 06:24:55PM GMT, Yosry Ahmed wrote:
> On Wed, Oct 9, 2024 at 6:08 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Wed, 9 Oct 2024 17:46:22 -0700
> > Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > > > +++ b/mm/memcontrol.c
> > > > @@ -71,6 +71,10 @@
> > > >
> > > >  #include <linux/uaccess.h>
> > > >
> > > > +#define CREATE_TRACE_POINTS
> > > > +#include <trace/events/memcg.h>
> > > > +#undef CREATE_TRACE_POINTS
> > > > +
> > > >  #include <trace/events/vmscan.h>
> > > >
> > > >  struct cgroup_subsys memory_cgrp_subsys __read_mostly;
> > > > @@ -682,7 +686,9 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
> > > >                 return;
> > > >
> > > >         __this_cpu_add(memcg->vmstats_percpu->state[i], val);
> > > > -       memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
> > > > +       val = memcg_state_val_in_pages(idx, val);
> > > > +       memcg_rstat_updated(memcg, val);
> > > > +       trace_mod_memcg_state(memcg, idx, val);
> > >
> > > Is it too unreasonable to include the stat name?
> > >
> > > The index has to be correlated with the kernel config and perhaps even
> > > version. It's not a big deal, but if performance is not a concern when
> > > tracing is enabled anyway, maybe we can lookup the name here (or in
> > > TP_fast_assign()).
> >
> > What name? Is it looked up from idx? If so, you can do it on the reading of

Does reading side mean the one reading /sys/kernel/tracing/trace will do
the translation from enums to string?

> > the trace event where performance is not an issue. See the __print_symbolic()
> > and friends in samples/trace_events/trace-events-sample.h
> 
> Yeah they can be found using idx. Thanks for referring us to
> __print_symbolic(), I suppose for this to work we need to construct an
> array of {idx, name}. I think we can replace the existing memory_stats
> and memcg1_stats/memcg1_stat_names arrays with something that we can
> reuse for tracing, so we wouldn't need to consume extra space.
> 
> Shakeel, what do you think?

Cc Daniel & Martin

I was planning to use bpftrace which can use dwarf/btf to convert the
raw int to its enum string. Martin provided the following command to
extract the translation from the kernel.

$ bpftool btf dump file /sys/kernel/btf/vmlinux | grep -A10 node_stat_item
[2264] ENUM 'node_stat_item' encoding=UNSIGNED size=4 vlen=46
        'NR_LRU_BASE' val=0
        'NR_INACTIVE_ANON' val=0
        'NR_ACTIVE_ANON' val=1
        'NR_INACTIVE_FILE' val=2
        'NR_ACTIVE_FILE' val=3
        'NR_UNEVICTABLE' val=4
        'NR_SLAB_RECLAIMABLE_B' val=5
        'NR_SLAB_UNRECLAIMABLE_B' val=6
        'NR_ISOLATED_ANON' val=7
        'NR_ISOLATED_FILE' val=8
...

My point is userspace tools can use existing infra to extract this
information.

However I am not against adding __print_symbolic() (but without any
duplication), so users reading /sys/kernel/tracing/trace directly can
see more useful information as well. Please post a follow up patch after
this one.

thanks for the review,
Shakeel
Shakeel Butt Oct. 10, 2024, 5:27 p.m. UTC | #6
On Thu, Oct 10, 2024 at 12:46:29AM GMT, Roman Gushchin wrote:
> On Wed, Oct 09, 2024 at 05:35:50PM -0700, Shakeel Butt wrote:
> > The memcg stats are maintained in rstat infrastructure which provides
> > very fast updates side and reasonable read side. However memcg added
> > plethora of stats and made the read side, which is cgroup rstat flush,
> > very slow. To solve that, threshold was added in the memcg stats read
> > side i.e. no need to flush the stats if updates are within the
> > threshold.
> > 
> > This threshold based improvement worked for sometime but more stats were
> > added to memcg and also the read codepath was getting triggered in the
> > performance sensitive paths which made threshold based ratelimiting
> > ineffective. We need more visibility into the hot and cold stats i.e.
> > stats with a lot of updates. Let's add trace to get that visibility.
> > 
> > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> 
> Acked-by: Roman Gushchin <roman.gushchin@linux.dev>

Thanks for the review.
Yosry Ahmed Oct. 15, 2024, 12:15 a.m. UTC | #7
On Thu, Oct 10, 2024 at 10:26 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Wed, Oct 09, 2024 at 06:24:55PM GMT, Yosry Ahmed wrote:
> > On Wed, Oct 9, 2024 at 6:08 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> > >
> > > On Wed, 9 Oct 2024 17:46:22 -0700
> > > Yosry Ahmed <yosryahmed@google.com> wrote:
> > >
> > > > > +++ b/mm/memcontrol.c
> > > > > @@ -71,6 +71,10 @@
> > > > >
> > > > >  #include <linux/uaccess.h>
> > > > >
> > > > > +#define CREATE_TRACE_POINTS
> > > > > +#include <trace/events/memcg.h>
> > > > > +#undef CREATE_TRACE_POINTS
> > > > > +
> > > > >  #include <trace/events/vmscan.h>
> > > > >
> > > > >  struct cgroup_subsys memory_cgrp_subsys __read_mostly;
> > > > > @@ -682,7 +686,9 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
> > > > >                 return;
> > > > >
> > > > >         __this_cpu_add(memcg->vmstats_percpu->state[i], val);
> > > > > -       memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
> > > > > +       val = memcg_state_val_in_pages(idx, val);
> > > > > +       memcg_rstat_updated(memcg, val);
> > > > > +       trace_mod_memcg_state(memcg, idx, val);
> > > >
> > > > Is it too unreasonable to include the stat name?
> > > >
> > > > The index has to be correlated with the kernel config and perhaps even
> > > > version. It's not a big deal, but if performance is not a concern when
> > > > tracing is enabled anyway, maybe we can lookup the name here (or in
> > > > TP_fast_assign()).
> > >
> > > What name? Is it looked up from idx? If so, you can do it on the reading of
>
> Does reading side mean the one reading /sys/kernel/tracing/trace will do
> the translation from enums to string?
>
> > > the trace event where performance is not an issue. See the __print_symbolic()
> > > and friends in samples/trace_events/trace-events-sample.h
> >
> > Yeah they can be found using idx. Thanks for referring us to
> > __print_symbolic(), I suppose for this to work we need to construct an
> > array of {idx, name}. I think we can replace the existing memory_stats
> > and memcg1_stats/memcg1_stat_names arrays with something that we can
> > reuse for tracing, so we wouldn't need to consume extra space.
> >
> > Shakeel, what do you think?
>
> Cc Daniel & Martin
>
> I was planning to use bpftrace which can use dwarf/btf to convert the
> raw int to its enum string. Martin provided the following command to
> extract the translation from the kernel.
>
> $ bpftool btf dump file /sys/kernel/btf/vmlinux | grep -A10 node_stat_item
> [2264] ENUM 'node_stat_item' encoding=UNSIGNED size=4 vlen=46
>         'NR_LRU_BASE' val=0
>         'NR_INACTIVE_ANON' val=0
>         'NR_ACTIVE_ANON' val=1
>         'NR_INACTIVE_FILE' val=2
>         'NR_ACTIVE_FILE' val=3
>         'NR_UNEVICTABLE' val=4
>         'NR_SLAB_RECLAIMABLE_B' val=5
>         'NR_SLAB_UNRECLAIMABLE_B' val=6
>         'NR_ISOLATED_ANON' val=7
>         'NR_ISOLATED_FILE' val=8
> ...
>
> My point is userspace tools can use existing infra to extract this
> information.
>
> However I am not against adding __print_symbolic() (but without any
> duplication), so users reading /sys/kernel/tracing/trace directly can
> see more useful information as well. Please post a follow up patch after
> this one.

I briefly looked into this and I think it would be annoying to have
this, unfortunately. Even if we rework the existing arrays with memcg
stat names to be in a format conforming to tracing, we would need to
move them out to a separate header to avoid a circular dependency.

Additionally, for __count_memcg_events() things will be more
complicated because the names are not in an array in memcontrol.c, but
we use vm_event_name() and the relevant names are part of a larger
array, vmstat_text, which we would need to rework similarly.

I think this would be easier to implement if we can somehow provide a
callback that returns the name based on the index, rather than an
array. But even then, we would need to specify a different callback
for each event, so it won't be as simple as specifying the callback in
the event class.

All in all, unless we realize there is something that is fundamentally
more difficult to do in userspace, I think it's not worth adding this
unfortunately :/
Daniel Xu Oct. 15, 2024, 12:23 a.m. UTC | #8
Hi Yosry,

On Mon, Oct 14, 2024 at 05:15:39PM GMT, Yosry Ahmed wrote:
> On Thu, Oct 10, 2024 at 10:26 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > On Wed, Oct 09, 2024 at 06:24:55PM GMT, Yosry Ahmed wrote:
> > > On Wed, Oct 9, 2024 at 6:08 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> > > >
> > > > On Wed, 9 Oct 2024 17:46:22 -0700
> > > > Yosry Ahmed <yosryahmed@google.com> wrote:
> > > >
> > > > > > +++ b/mm/memcontrol.c
> > > > > > @@ -71,6 +71,10 @@
> > > > > >
> > > > > >  #include <linux/uaccess.h>
> > > > > >
> > > > > > +#define CREATE_TRACE_POINTS
> > > > > > +#include <trace/events/memcg.h>
> > > > > > +#undef CREATE_TRACE_POINTS
> > > > > > +
> > > > > >  #include <trace/events/vmscan.h>
> > > > > >
> > > > > >  struct cgroup_subsys memory_cgrp_subsys __read_mostly;
> > > > > > @@ -682,7 +686,9 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
> > > > > >                 return;
> > > > > >
> > > > > >         __this_cpu_add(memcg->vmstats_percpu->state[i], val);
> > > > > > -       memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
> > > > > > +       val = memcg_state_val_in_pages(idx, val);
> > > > > > +       memcg_rstat_updated(memcg, val);
> > > > > > +       trace_mod_memcg_state(memcg, idx, val);
> > > > >
> > > > > Is it too unreasonable to include the stat name?
> > > > >
> > > > > The index has to be correlated with the kernel config and perhaps even
> > > > > version. It's not a big deal, but if performance is not a concern when
> > > > > tracing is enabled anyway, maybe we can lookup the name here (or in
> > > > > TP_fast_assign()).
> > > >
> > > > What name? Is it looked up from idx? If so, you can do it on the reading of
> >
> > Does reading side mean the one reading /sys/kernel/tracing/trace will do
> > the translation from enums to string?
> >
> > > > the trace event where performance is not an issue. See the __print_symbolic()
> > > > and friends in samples/trace_events/trace-events-sample.h
> > >
> > > Yeah they can be found using idx. Thanks for referring us to
> > > __print_symbolic(), I suppose for this to work we need to construct an
> > > array of {idx, name}. I think we can replace the existing memory_stats
> > > and memcg1_stats/memcg1_stat_names arrays with something that we can
> > > reuse for tracing, so we wouldn't need to consume extra space.
> > >
> > > Shakeel, what do you think?
> >
> > Cc Daniel & Martin
> >
> > I was planning to use bpftrace which can use dwarf/btf to convert the
> > raw int to its enum string. Martin provided the following command to
> > extract the translation from the kernel.
> >
> > $ bpftool btf dump file /sys/kernel/btf/vmlinux | grep -A10 node_stat_item
> > [2264] ENUM 'node_stat_item' encoding=UNSIGNED size=4 vlen=46
> >         'NR_LRU_BASE' val=0
> >         'NR_INACTIVE_ANON' val=0
> >         'NR_ACTIVE_ANON' val=1
> >         'NR_INACTIVE_FILE' val=2
> >         'NR_ACTIVE_FILE' val=3
> >         'NR_UNEVICTABLE' val=4
> >         'NR_SLAB_RECLAIMABLE_B' val=5
> >         'NR_SLAB_UNRECLAIMABLE_B' val=6
> >         'NR_ISOLATED_ANON' val=7
> >         'NR_ISOLATED_FILE' val=8
> > ...
> >
> > My point is userspace tools can use existing infra to extract this
> > information.
> >
> > However I am not against adding __print_symbolic() (but without any
> > duplication), so users reading /sys/kernel/tracing/trace directly can
> > see more useful information as well. Please post a follow up patch after
> > this one.
> 
> I briefly looked into this and I think it would be annoying to have
> this, unfortunately. Even if we rework the existing arrays with memcg
> stat names to be in a format conforming to tracing, we would need to
> move them out to a separate header to avoid a circular dependency.
> 
> Additionally, for __count_memcg_events() things will be more
> complicated because the names are not in an array in memcontrol.c, but
> we use vm_event_name() and the relevant names are part of a larger
> array, vmstat_text, which we would need to rework similarly.
> 
> I think this would be easier to implement if we can somehow provide a
> callback that returns the name based on the index, rather than an
> array. But even then, we would need to specify a different callback
> for each event, so it won't be as simple as specifying the callback in
> the event class.
> 
> All in all, unless we realize there is something that is fundamentally
> more difficult to do in userspace, I think it's not worth adding this
> unfortunately :/

Turned out to be quite straightforward to do in userspace:
https://github.com/bpftrace/bpftrace/pull/3515 .

A nice property is the resolution occurs out of line and saves the
kernel some cycles in the fast path.

Thanks,
Daniel
Yosry Ahmed Oct. 15, 2024, 8:02 a.m. UTC | #9
> > > > > > > @@ -682,7 +686,9 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
> > > > > > >                 return;
> > > > > > >
> > > > > > >         __this_cpu_add(memcg->vmstats_percpu->state[i], val);
> > > > > > > -       memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
> > > > > > > +       val = memcg_state_val_in_pages(idx, val);
> > > > > > > +       memcg_rstat_updated(memcg, val);
> > > > > > > +       trace_mod_memcg_state(memcg, idx, val);
> > > > > >
> > > > > > Is it too unreasonable to include the stat name?
> > > > > >
> > > > > > The index has to be correlated with the kernel config and perhaps even
> > > > > > version. It's not a big deal, but if performance is not a concern when
> > > > > > tracing is enabled anyway, maybe we can lookup the name here (or in
> > > > > > TP_fast_assign()).
> > > > >
> > > > > What name? Is it looked up from idx? If so, you can do it on the reading of
> > >
> > > Does reading side mean the one reading /sys/kernel/tracing/trace will do
> > > the translation from enums to string?
> > >
> > > > > the trace event where performance is not an issue. See the __print_symbolic()
> > > > > and friends in samples/trace_events/trace-events-sample.h
> > > >
> > > > Yeah they can be found using idx. Thanks for referring us to
> > > > __print_symbolic(), I suppose for this to work we need to construct an
> > > > array of {idx, name}. I think we can replace the existing memory_stats
> > > > and memcg1_stats/memcg1_stat_names arrays with something that we can
> > > > reuse for tracing, so we wouldn't need to consume extra space.
> > > >
> > > > Shakeel, what do you think?
> > >
> > > Cc Daniel & Martin
> > >
> > > I was planning to use bpftrace which can use dwarf/btf to convert the
> > > raw int to its enum string. Martin provided the following command to
> > > extract the translation from the kernel.
> > >
> > > $ bpftool btf dump file /sys/kernel/btf/vmlinux | grep -A10 node_stat_item
> > > [2264] ENUM 'node_stat_item' encoding=UNSIGNED size=4 vlen=46
> > >         'NR_LRU_BASE' val=0
> > >         'NR_INACTIVE_ANON' val=0
> > >         'NR_ACTIVE_ANON' val=1
> > >         'NR_INACTIVE_FILE' val=2
> > >         'NR_ACTIVE_FILE' val=3
> > >         'NR_UNEVICTABLE' val=4
> > >         'NR_SLAB_RECLAIMABLE_B' val=5
> > >         'NR_SLAB_UNRECLAIMABLE_B' val=6
> > >         'NR_ISOLATED_ANON' val=7
> > >         'NR_ISOLATED_FILE' val=8
> > > ...
> > >
> > > My point is userspace tools can use existing infra to extract this
> > > information.
> > >
> > > However I am not against adding __print_symbolic() (but without any
> > > duplication), so users reading /sys/kernel/tracing/trace directly can
> > > see more useful information as well. Please post a follow up patch after
> > > this one.
> >
> > I briefly looked into this and I think it would be annoying to have
> > this, unfortunately. Even if we rework the existing arrays with memcg
> > stat names to be in a format conforming to tracing, we would need to
> > move them out to a separate header to avoid a circular dependency.
> >
> > Additionally, for __count_memcg_events() things will be more
> > complicated because the names are not in an array in memcontrol.c, but
> > we use vm_event_name() and the relevant names are part of a larger
> > array, vmstat_text, which we would need to rework similarly.
> >
> > I think this would be easier to implement if we can somehow provide a
> > callback that returns the name based on the index, rather than an
> > array. But even then, we would need to specify a different callback
> > for each event, so it won't be as simple as specifying the callback in
> > the event class.
> >
> > All in all, unless we realize there is something that is fundamentally
> > more difficult to do in userspace, I think it's not worth adding this
> > unfortunately :/
>
> Turned out to be quite straightforward to do in userspace:
> https://github.com/bpftrace/bpftrace/pull/3515 .
>
> A nice property is the resolution occurs out of line and saves the
> kernel some cycles in the fast path.

This is really neat, thanks for doing this! Native support in bpftrace
is definitely better than manually correlating the values, especially
with large enums with 10s of values like the ones we are talking about
here.
Yosry Ahmed Oct. 15, 2024, 8:07 a.m. UTC | #10
On Wed, Oct 9, 2024 at 5:36 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> The memcg stats are maintained in rstat infrastructure which provides
> very fast updates side and reasonable read side. However memcg added
> plethora of stats and made the read side, which is cgroup rstat flush,
> very slow. To solve that, threshold was added in the memcg stats read
> side i.e. no need to flush the stats if updates are within the
> threshold.
>
> This threshold based improvement worked for sometime but more stats were
> added to memcg and also the read codepath was getting triggered in the
> performance sensitive paths which made threshold based ratelimiting
> ineffective. We need more visibility into the hot and cold stats i.e.
> stats with a lot of updates. Let's add trace to get that visibility.
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>

One question below, otherwise:

Reviewed-by: Yosry Ahmed <yosryahmed@google.com>

> ---
>  include/trace/events/memcg.h | 59 ++++++++++++++++++++++++++++++++++++
>  mm/memcontrol.c              | 13 ++++++--
>  2 files changed, 70 insertions(+), 2 deletions(-)
>  create mode 100644 include/trace/events/memcg.h
>
> diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h
> new file mode 100644
> index 000000000000..913db9aba580
> --- /dev/null
> +++ b/include/trace/events/memcg.h
> @@ -0,0 +1,59 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM memcg
> +
> +#if !defined(_TRACE_MEMCG_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_MEMCG_H
> +
> +#include <linux/memcontrol.h>
> +#include <linux/tracepoint.h>
> +
> +
> +DECLARE_EVENT_CLASS(memcg_rstat,
> +
> +       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> +
> +       TP_ARGS(memcg, item, val),
> +
> +       TP_STRUCT__entry(
> +               __field(u64, id)
> +               __field(int, item)
> +               __field(int, val)
> +       ),
> +
> +       TP_fast_assign(
> +               __entry->id = cgroup_id(memcg->css.cgroup);
> +               __entry->item = item;
> +               __entry->val = val;
> +       ),
> +
> +       TP_printk("memcg_id=%llu item=%d val=%d",
> +                 __entry->id, __entry->item, __entry->val)
> +);
> +
> +DEFINE_EVENT(memcg_rstat, mod_memcg_state,
> +
> +       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> +
> +       TP_ARGS(memcg, item, val)
> +);
> +
> +DEFINE_EVENT(memcg_rstat, mod_memcg_lruvec_state,
> +
> +       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> +
> +       TP_ARGS(memcg, item, val)
> +);
> +
> +DEFINE_EVENT(memcg_rstat, count_memcg_events,
> +
> +       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> +
> +       TP_ARGS(memcg, item, val)
> +);
> +
> +
> +#endif /* _TRACE_MEMCG_H */
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index c098fd7f5c5e..17af08367c68 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -71,6 +71,10 @@
>
>  #include <linux/uaccess.h>
>
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/memcg.h>
> +#undef CREATE_TRACE_POINTS
> +
>  #include <trace/events/vmscan.h>
>
>  struct cgroup_subsys memory_cgrp_subsys __read_mostly;
> @@ -682,7 +686,9 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
>                 return;
>
>         __this_cpu_add(memcg->vmstats_percpu->state[i], val);
> -       memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
> +       val = memcg_state_val_in_pages(idx, val);
> +       memcg_rstat_updated(memcg, val);
> +       trace_mod_memcg_state(memcg, idx, val);
>  }
>
>  /* idx can be of type enum memcg_stat_item or node_stat_item. */
> @@ -741,7 +747,9 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
>         /* Update lruvec */
>         __this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
>
> -       memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
> +       val = memcg_state_val_in_pages(idx, val);
> +       memcg_rstat_updated(memcg, val);
> +       trace_mod_memcg_lruvec_state(memcg, idx, val);
>         memcg_stats_unlock();
>  }
>
> @@ -832,6 +840,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
>         memcg_stats_lock();
>         __this_cpu_add(memcg->vmstats_percpu->events[i], count);
>         memcg_rstat_updated(memcg, count);
> +       trace_count_memcg_events(memcg, idx, count);

count here is an unsigned long, and we are casting it to int, right?

Would it be slightly better if the tracepoint uses a long instead of
int? It's still not ideal but probably better than int.

>         memcg_stats_unlock();
>  }
>
> --
> 2.43.5
>
Shakeel Butt Oct. 15, 2024, 6:39 p.m. UTC | #11
On Tue, Oct 15, 2024 at 01:07:30AM GMT, Yosry Ahmed wrote:
> On Wed, Oct 9, 2024 at 5:36 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > The memcg stats are maintained in rstat infrastructure which provides
> > very fast update side and a reasonable read side. However memcg added
> > a plethora of stats and made the read side, which is cgroup rstat flush,
> > very slow. To solve that, threshold was added in the memcg stats read
> > side i.e. no need to flush the stats if updates are within the
> > threshold.
> >
> > This threshold based improvement worked for some time but more stats were
> > added to memcg and also the read codepath was getting triggered in the
> > performance sensitive paths which made threshold based ratelimiting
> > ineffective. We need more visibility into the hot and cold stats i.e.
> > stats with a lot of updates. Let's add trace to get that visibility.
> >
> > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> 
> One question below, otherwise:
> 
> Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
> 
> > ---
> >  include/trace/events/memcg.h | 59 ++++++++++++++++++++++++++++++++++++
> >  mm/memcontrol.c              | 13 ++++++--
> >  2 files changed, 70 insertions(+), 2 deletions(-)
> >  create mode 100644 include/trace/events/memcg.h
> >
> > diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h
> > new file mode 100644
> > index 000000000000..913db9aba580
> > --- /dev/null
> > +++ b/include/trace/events/memcg.h
> > @@ -0,0 +1,59 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#undef TRACE_SYSTEM
> > +#define TRACE_SYSTEM memcg
> > +
> > +#if !defined(_TRACE_MEMCG_H) || defined(TRACE_HEADER_MULTI_READ)
> > +#define _TRACE_MEMCG_H
> > +
> > +#include <linux/memcontrol.h>
> > +#include <linux/tracepoint.h>
> > +
> > +
> > +DECLARE_EVENT_CLASS(memcg_rstat,
> > +
> > +       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> > +
> > +       TP_ARGS(memcg, item, val),
> > +
> > +       TP_STRUCT__entry(
> > +               __field(u64, id)
> > +               __field(int, item)
> > +               __field(int, val)
> > +       ),
> > +
> > +       TP_fast_assign(
> > +               __entry->id = cgroup_id(memcg->css.cgroup);
> > +               __entry->item = item;
> > +               __entry->val = val;
> > +       ),
> > +
> > +       TP_printk("memcg_id=%llu item=%d val=%d",
> > +                 __entry->id, __entry->item, __entry->val)
> > +);
> > +
> > +DEFINE_EVENT(memcg_rstat, mod_memcg_state,
> > +
> > +       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> > +
> > +       TP_ARGS(memcg, item, val)
> > +);
> > +
> > +DEFINE_EVENT(memcg_rstat, mod_memcg_lruvec_state,
> > +
> > +       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> > +
> > +       TP_ARGS(memcg, item, val)
> > +);
> > +
> > +DEFINE_EVENT(memcg_rstat, count_memcg_events,
> > +
> > +       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> > +
> > +       TP_ARGS(memcg, item, val)
> > +);
> > +
> > +
> > +#endif /* _TRACE_MEMCG_H */
> > +
> > +/* This part must be outside protection */
> > +#include <trace/define_trace.h>
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index c098fd7f5c5e..17af08367c68 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -71,6 +71,10 @@
> >
> >  #include <linux/uaccess.h>
> >
> > +#define CREATE_TRACE_POINTS
> > +#include <trace/events/memcg.h>
> > +#undef CREATE_TRACE_POINTS
> > +
> >  #include <trace/events/vmscan.h>
> >
> >  struct cgroup_subsys memory_cgrp_subsys __read_mostly;
> > @@ -682,7 +686,9 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
> >                 return;
> >
> >         __this_cpu_add(memcg->vmstats_percpu->state[i], val);
> > -       memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
> > +       val = memcg_state_val_in_pages(idx, val);
> > +       memcg_rstat_updated(memcg, val);
> > +       trace_mod_memcg_state(memcg, idx, val);
> >  }
> >
> >  /* idx can be of type enum memcg_stat_item or node_stat_item. */
> > @@ -741,7 +747,9 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
> >         /* Update lruvec */
> >         __this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
> >
> > -       memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
> > +       val = memcg_state_val_in_pages(idx, val);
> > +       memcg_rstat_updated(memcg, val);
> > +       trace_mod_memcg_lruvec_state(memcg, idx, val);
> >         memcg_stats_unlock();
> >  }
> >
> > @@ -832,6 +840,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
> >         memcg_stats_lock();
> >         __this_cpu_add(memcg->vmstats_percpu->events[i], count);
> >         memcg_rstat_updated(memcg, count);
> > +       trace_count_memcg_events(memcg, idx, count);
> 
> count here is an unsigned long, and we are casting it to int, right?
> 
> Would it be slightly better if the tracepoint uses a long instead of
> int? It's still not ideal but probably better than int.
> 

Do you mean something like the following? If this looks good to you then
we can ask Andrew to squash this in the patch.


diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h
index 913db9aba580..37812900acce 100644
--- a/include/trace/events/memcg.h
+++ b/include/trace/events/memcg.h
@@ -11,14 +11,14 @@
 
 DECLARE_EVENT_CLASS(memcg_rstat,
 
-	TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+	TP_PROTO(struct mem_cgroup *memcg, int item, long val),
 
 	TP_ARGS(memcg, item, val),
 
 	TP_STRUCT__entry(
 		__field(u64, id)
 		__field(int, item)
-		__field(int, val)
+		__field(long, val)
 	),
 
 	TP_fast_assign(
@@ -33,21 +33,21 @@ DECLARE_EVENT_CLASS(memcg_rstat,
 
 DEFINE_EVENT(memcg_rstat, mod_memcg_state,
 
-	TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+	TP_PROTO(struct mem_cgroup *memcg, int item, long val),
 
 	TP_ARGS(memcg, item, val)
 );
 
 DEFINE_EVENT(memcg_rstat, mod_memcg_lruvec_state,
 
-	TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+	TP_PROTO(struct mem_cgroup *memcg, int item, long val),
 
 	TP_ARGS(memcg, item, val)
 );
 
 DEFINE_EVENT(memcg_rstat, count_memcg_events,
 
-	TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+	TP_PROTO(struct mem_cgroup *memcg, int item, long val),
 
 	TP_ARGS(memcg, item, val)
 );
Yosry Ahmed Oct. 15, 2024, 6:47 p.m. UTC | #12
On Tue, Oct 15, 2024 at 11:39 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Tue, Oct 15, 2024 at 01:07:30AM GMT, Yosry Ahmed wrote:
> > On Wed, Oct 9, 2024 at 5:36 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > >
> > > The memcg stats are maintained in rstat infrastructure which provides
> > > very fast update side and a reasonable read side. However memcg added
> > > a plethora of stats and made the read side, which is cgroup rstat flush,
> > > very slow. To solve that, threshold was added in the memcg stats read
> > > side i.e. no need to flush the stats if updates are within the
> > > threshold.
> > >
> > > This threshold based improvement worked for some time but more stats were
> > > added to memcg and also the read codepath was getting triggered in the
> > > performance sensitive paths which made threshold based ratelimiting
> > > ineffective. We need more visibility into the hot and cold stats i.e.
> > > stats with a lot of updates. Let's add trace to get that visibility.
> > >
> > > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> >
> > One question below, otherwise:
> >
> > Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
> >
> > > ---
> > >  include/trace/events/memcg.h | 59 ++++++++++++++++++++++++++++++++++++
> > >  mm/memcontrol.c              | 13 ++++++--
> > >  2 files changed, 70 insertions(+), 2 deletions(-)
> > >  create mode 100644 include/trace/events/memcg.h
> > >
> > > diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h
> > > new file mode 100644
> > > index 000000000000..913db9aba580
> > > --- /dev/null
> > > +++ b/include/trace/events/memcg.h
> > > @@ -0,0 +1,59 @@
> > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > +#undef TRACE_SYSTEM
> > > +#define TRACE_SYSTEM memcg
> > > +
> > > +#if !defined(_TRACE_MEMCG_H) || defined(TRACE_HEADER_MULTI_READ)
> > > +#define _TRACE_MEMCG_H
> > > +
> > > +#include <linux/memcontrol.h>
> > > +#include <linux/tracepoint.h>
> > > +
> > > +
> > > +DECLARE_EVENT_CLASS(memcg_rstat,
> > > +
> > > +       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> > > +
> > > +       TP_ARGS(memcg, item, val),
> > > +
> > > +       TP_STRUCT__entry(
> > > +               __field(u64, id)
> > > +               __field(int, item)
> > > +               __field(int, val)
> > > +       ),
> > > +
> > > +       TP_fast_assign(
> > > +               __entry->id = cgroup_id(memcg->css.cgroup);
> > > +               __entry->item = item;
> > > +               __entry->val = val;
> > > +       ),
> > > +
> > > +       TP_printk("memcg_id=%llu item=%d val=%d",
> > > +                 __entry->id, __entry->item, __entry->val)
> > > +);
> > > +
> > > +DEFINE_EVENT(memcg_rstat, mod_memcg_state,
> > > +
> > > +       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> > > +
> > > +       TP_ARGS(memcg, item, val)
> > > +);
> > > +
> > > +DEFINE_EVENT(memcg_rstat, mod_memcg_lruvec_state,
> > > +
> > > +       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> > > +
> > > +       TP_ARGS(memcg, item, val)
> > > +);
> > > +
> > > +DEFINE_EVENT(memcg_rstat, count_memcg_events,
> > > +
> > > +       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> > > +
> > > +       TP_ARGS(memcg, item, val)
> > > +);
> > > +
> > > +
> > > +#endif /* _TRACE_MEMCG_H */
> > > +
> > > +/* This part must be outside protection */
> > > +#include <trace/define_trace.h>
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index c098fd7f5c5e..17af08367c68 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -71,6 +71,10 @@
> > >
> > >  #include <linux/uaccess.h>
> > >
> > > +#define CREATE_TRACE_POINTS
> > > +#include <trace/events/memcg.h>
> > > +#undef CREATE_TRACE_POINTS
> > > +
> > >  #include <trace/events/vmscan.h>
> > >
> > >  struct cgroup_subsys memory_cgrp_subsys __read_mostly;
> > > @@ -682,7 +686,9 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
> > >                 return;
> > >
> > >         __this_cpu_add(memcg->vmstats_percpu->state[i], val);
> > > -       memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
> > > +       val = memcg_state_val_in_pages(idx, val);
> > > +       memcg_rstat_updated(memcg, val);
> > > +       trace_mod_memcg_state(memcg, idx, val);
> > >  }
> > >
> > >  /* idx can be of type enum memcg_stat_item or node_stat_item. */
> > > @@ -741,7 +747,9 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
> > >         /* Update lruvec */
> > >         __this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
> > >
> > > -       memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
> > > +       val = memcg_state_val_in_pages(idx, val);
> > > +       memcg_rstat_updated(memcg, val);
> > > +       trace_mod_memcg_lruvec_state(memcg, idx, val);
> > >         memcg_stats_unlock();
> > >  }
> > >
> > > @@ -832,6 +840,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
> > >         memcg_stats_lock();
> > >         __this_cpu_add(memcg->vmstats_percpu->events[i], count);
> > >         memcg_rstat_updated(memcg, count);
> > > +       trace_count_memcg_events(memcg, idx, count);
> >
> > count here is an unsigned long, and we are casting it to int, right?
> >
> > Would it be slightly better if the tracepoint uses a long instead of
> > int? It's still not ideal but probably better than int.
> >
>
> Do you mean something like the following? If this looks good to you then
> we can ask Andrew to squash this in the patch.

Yes, unless you have a better way to also accommodate the unsigned
long value in __count_memcg_events().

>
>
> diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h
> index 913db9aba580..37812900acce 100644
> --- a/include/trace/events/memcg.h
> +++ b/include/trace/events/memcg.h
> @@ -11,14 +11,14 @@
>
>  DECLARE_EVENT_CLASS(memcg_rstat,
>
> -       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> +       TP_PROTO(struct mem_cgroup *memcg, int item, long val),
>
>         TP_ARGS(memcg, item, val),
>
>         TP_STRUCT__entry(
>                 __field(u64, id)
>                 __field(int, item)
> -               __field(int, val)
> +               __field(long, val)
>         ),
>
>         TP_fast_assign(
> @@ -33,21 +33,21 @@ DECLARE_EVENT_CLASS(memcg_rstat,
>
>  DEFINE_EVENT(memcg_rstat, mod_memcg_state,
>
> -       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> +       TP_PROTO(struct mem_cgroup *memcg, int item, long val),
>
>         TP_ARGS(memcg, item, val)
>  );
>
>  DEFINE_EVENT(memcg_rstat, mod_memcg_lruvec_state,
>
> -       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> +       TP_PROTO(struct mem_cgroup *memcg, int item, long val),
>
>         TP_ARGS(memcg, item, val)
>  );
>
>  DEFINE_EVENT(memcg_rstat, count_memcg_events,
>
> -       TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> +       TP_PROTO(struct mem_cgroup *memcg, int item, long val),
>
>         TP_ARGS(memcg, item, val)
>  );
>
diff mbox series

Patch

diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h
new file mode 100644
index 000000000000..913db9aba580
--- /dev/null
+++ b/include/trace/events/memcg.h
@@ -0,0 +1,59 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM memcg
+
+#if !defined(_TRACE_MEMCG_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MEMCG_H
+
+#include <linux/memcontrol.h>
+#include <linux/tracepoint.h>
+
+
+DECLARE_EVENT_CLASS(memcg_rstat,
+
+	TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+
+	TP_ARGS(memcg, item, val),
+
+	TP_STRUCT__entry(
+		__field(u64, id)
+		__field(int, item)
+		__field(int, val)
+	),
+
+	TP_fast_assign(
+		__entry->id = cgroup_id(memcg->css.cgroup);
+		__entry->item = item;
+		__entry->val = val;
+	),
+
+	TP_printk("memcg_id=%llu item=%d val=%d",
+		  __entry->id, __entry->item, __entry->val)
+);
+
+DEFINE_EVENT(memcg_rstat, mod_memcg_state,
+
+	TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+
+	TP_ARGS(memcg, item, val)
+);
+
+DEFINE_EVENT(memcg_rstat, mod_memcg_lruvec_state,
+
+	TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+
+	TP_ARGS(memcg, item, val)
+);
+
+DEFINE_EVENT(memcg_rstat, count_memcg_events,
+
+	TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+
+	TP_ARGS(memcg, item, val)
+);
+
+
+#endif /* _TRACE_MEMCG_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c098fd7f5c5e..17af08367c68 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -71,6 +71,10 @@ 
 
 #include <linux/uaccess.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/memcg.h>
+#undef CREATE_TRACE_POINTS
+
 #include <trace/events/vmscan.h>
 
 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
@@ -682,7 +686,9 @@  void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
 		return;
 
 	__this_cpu_add(memcg->vmstats_percpu->state[i], val);
-	memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
+	val = memcg_state_val_in_pages(idx, val);
+	memcg_rstat_updated(memcg, val);
+	trace_mod_memcg_state(memcg, idx, val);
 }
 
 /* idx can be of type enum memcg_stat_item or node_stat_item. */
@@ -741,7 +747,9 @@  static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
 	/* Update lruvec */
 	__this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
 
-	memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
+	val = memcg_state_val_in_pages(idx, val);
+	memcg_rstat_updated(memcg, val);
+	trace_mod_memcg_lruvec_state(memcg, idx, val);
 	memcg_stats_unlock();
 }
 
@@ -832,6 +840,7 @@  void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 	memcg_stats_lock();
 	__this_cpu_add(memcg->vmstats_percpu->events[i], count);
 	memcg_rstat_updated(memcg, count);
+	trace_count_memcg_events(memcg, idx, count);
 	memcg_stats_unlock();
 }