[v3,2/2] mm: memcg: introduce new event to trace shrink_memcg

Message ID 20231123193937.11628-3-ddrokosov@salutedevices.com (mailing list archive)
State Not Applicable
Series mm: memcg: improve vmscan tracepoints


Commit Message

Dmitry Rokosov Nov. 23, 2023, 7:39 p.m. UTC
The shrink_memcg flow plays a crucial role in memcg reclamation.
Currently, it is not possible to trace this point from non-direct
reclaim paths. However, direct reclaim has its own tracepoint, so there
is no issue there. In certain cases, when debugging memcg pressure,
developers may need to identify all potential requests for memcg
reclamation, including those coming from kswapd. This patchset
introduces the tracepoints mm_vmscan_memcg_shrink_{begin|end}() to
address this problem.

Example of output in the kswapd context (non-direct reclaim):
    kswapd0-39      [001] .....   240.356378: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
    kswapd0-39      [001] .....   240.356396: mm_vmscan_memcg_shrink_end: nr_reclaimed=0 memcg=16
    kswapd0-39      [001] .....   240.356420: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
    kswapd0-39      [001] .....   240.356454: mm_vmscan_memcg_shrink_end: nr_reclaimed=1 memcg=16
    kswapd0-39      [001] .....   240.356479: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
    kswapd0-39      [001] .....   240.356506: mm_vmscan_memcg_shrink_end: nr_reclaimed=4 memcg=16
    kswapd0-39      [001] .....   240.356525: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
    kswapd0-39      [001] .....   240.356593: mm_vmscan_memcg_shrink_end: nr_reclaimed=11 memcg=16
    kswapd0-39      [001] .....   240.356614: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
    kswapd0-39      [001] .....   240.356738: mm_vmscan_memcg_shrink_end: nr_reclaimed=25 memcg=16
    kswapd0-39      [001] .....   240.356790: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
    kswapd0-39      [001] .....   240.357125: mm_vmscan_memcg_shrink_end: nr_reclaimed=53 memcg=16

Signed-off-by: Dmitry Rokosov <ddrokosov@salutedevices.com>
---
 include/trace/events/vmscan.h | 22 ++++++++++++++++++++++
 mm/vmscan.c                   |  7 +++++++
 2 files changed, 29 insertions(+)

Comments

Shakeel Butt Nov. 25, 2023, 6:36 a.m. UTC | #1
On Thu, Nov 23, 2023 at 10:39:37PM +0300, Dmitry Rokosov wrote:
[...]
> @@ -6461,6 +6461,10 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
>  		 */
>  		cond_resched();
>  
> +		trace_mm_vmscan_memcg_shrink_begin(sc->order,
> +						   sc->gfp_mask,
> +						   memcg);
> +

If you place the start of the trace here, you may get only the begin
trace for memcgs whose usage is below their min or low limits. Is that
fine? Otherwise you can put it just before the shrink_lruvec() call.

>  		mem_cgroup_calculate_protection(target_memcg, memcg);
>  
>  		if (mem_cgroup_below_min(target_memcg, memcg)) {
[...]
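
For illustration, a minimal sketch of the alternative placement
suggested here (hypothetical, not part of the posted patch): begin()
would fire only after the protection checks, immediately before the
actual reclaim work in shrink_node_memcgs():

		reclaimed = sc->nr_reclaimed;
		scanned = sc->nr_scanned;

		/* fires only for memcgs that are actually about to be scanned */
		trace_mm_vmscan_memcg_shrink_begin(sc->order, sc->gfp_mask,
						   memcg);

		shrink_lruvec(lruvec, sc);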
Dmitry Rokosov Nov. 25, 2023, 8:01 a.m. UTC | #2
On Sat, Nov 25, 2023 at 06:36:16AM +0000, Shakeel Butt wrote:
> On Thu, Nov 23, 2023 at 10:39:37PM +0300, Dmitry Rokosov wrote:
[...]
> > +		trace_mm_vmscan_memcg_shrink_begin(sc->order,
> > +						   sc->gfp_mask,
> > +						   memcg);
> > +
> 
> If you place the start of the trace here, you may get only the begin
> trace for memcgs whose usage is below their min or low limits. Is that
> fine? Otherwise you can put it just before the shrink_lruvec() call.
> 

From my point of view, it's fine. For situations like the one you
described, when we only see the begin() tracepoint raised without the
end(), we understand that reclaim requests are being made but cannot be
satisfied due to certain conditions within the memcg (such as its
limits).

There may be some tracepoint spam in the trace pipe, which is a
disadvantage of this approach.

How important do you think it is to understand such situations? Or do
you suggest moving the begin() tracepoint after the memcg limit checks
and not caring about them?

Shakeel Butt Nov. 25, 2023, 5:38 p.m. UTC | #3
On Sat, Nov 25, 2023 at 11:01:37AM +0300, Dmitry Rokosov wrote:
[...]
> > > +		trace_mm_vmscan_memcg_shrink_begin(sc->order,
> > > +						   sc->gfp_mask,
> > > +						   memcg);
> > > +
> > 
> > If you place the start of the trace here, you may get only the begin
> > trace for memcgs whose usage is below their min or low limits. Is that
> > fine? Otherwise you can put it just before the shrink_lruvec() call.
> > 
> 
> From my point of view, it's fine. For situations like the one you
> described, when we only see the begin() tracepoint raised without the
> end(), we understand that reclaim requests are being made but cannot be
> satisfied due to certain conditions within the memcg (such as its limits).
> 
> There may be some tracepoint spam in the trace pipe, which is a
> disadvantage of this approach.
> 
> How important do you think it is to understand such situations? Or do
> you suggest moving the begin() tracepoint after the memcg limit checks
> and not caring about them?
> 

I was mainly wondering if that is intentional. It seems like you, as the
first user of this trace, need to know that reclaim for a given memcg
was triggered but that, due to the min/low limits, no reclaim was done.
This is a totally reasonable use-case.
Shakeel Butt Nov. 25, 2023, 5:47 p.m. UTC | #4
On Thu, Nov 23, 2023 at 10:39:37PM +0300, Dmitry Rokosov wrote:
[...]
> Signed-off-by: Dmitry Rokosov <ddrokosov@salutedevices.com>

Acked-by: Shakeel Butt <shakeelb@google.com>
Michal Hocko Nov. 27, 2023, 9:33 a.m. UTC | #5
On Thu 23-11-23 22:39:37, Dmitry Rokosov wrote:
[...]

In the previous version I asked why we need this specific tracepoint
when we already have trace_mm_vmscan_lru_shrink_{in}active, which
already gives you very good insight. That includes the number of
reclaimed pages but also more. I do see that we do not include the
memcg id of the reclaimed LRU, but that shouldn't be a big problem to
add, no?
Dmitry Rokosov Nov. 27, 2023, 11:36 a.m. UTC | #6
On Mon, Nov 27, 2023 at 10:33:49AM +0100, Michal Hocko wrote:
[...]
> In the previous version I asked why we need this specific tracepoint
> when we already have trace_mm_vmscan_lru_shrink_{in}active, which
> already gives you very good insight. That includes the number of
> reclaimed pages but also more. I do see that we do not include the
> memcg id of the reclaimed LRU, but that shouldn't be a big problem to
> add, no?

From my point of view, memcg reclaim consists of two parts: LRU shrink
and slab shrink, as can be seen in vmscan.c:


static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
...
		reclaimed = sc->nr_reclaimed;
		scanned = sc->nr_scanned;

		shrink_lruvec(lruvec, sc);

		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
			    sc->priority);
...

So, both of these operations are important for understanding whether
memcg reclaim was successful or not, as well as how effective it was. I
believe it would be beneficial to summarize them, which is why I have
created the new tracepoints.
Michal Hocko Nov. 27, 2023, 12:50 p.m. UTC | #7
On Mon 27-11-23 14:36:44, Dmitry Rokosov wrote:
[...]
> > In the previous version I asked why we need this specific tracepoint
> > when we already have trace_mm_vmscan_lru_shrink_{in}active, which
> > already gives you very good insight. That includes the number of
> > reclaimed pages but also more. I do see that we do not include the
> > memcg id of the reclaimed LRU, but that shouldn't be a big problem to
> > add, no?
> 
> From my point of view, memcg reclaim consists of two parts: LRU shrink
> and slab shrink, as can be seen in vmscan.c:
[...]
> So, both of these operations are important for understanding whether
> memcg reclaim was successful or not, as well as how effective it was. I
> believe it would be beneficial to summarize them, which is why I have
> created the new tracepoints.

This sounds like a nice-to-have rather than a must. To put it
differently: if you make the existing reclaim tracepoints memcg aware
(print the memcg id), what prevents you from doing the analysis you
need?
Dmitry Rokosov Nov. 27, 2023, 4:16 p.m. UTC | #8
On Mon, Nov 27, 2023 at 01:50:22PM +0100, Michal Hocko wrote:
[...]
> This sounds like a nice-to-have rather than a must. To put it
> differently: if you make the existing reclaim tracepoints memcg aware
> (print the memcg id), what prevents you from doing the analysis you
> need?

You are right, nothing prevents me from making this analysis... but...

This approach does have some disadvantages:
1) It requires more changes to vmscan. At the very least, the memcg
object would have to be forwarded to all subfunctions of the LRU and
slab shrinkers.

2) With this approach, we lose the ability to trace a situation where
the kernel requests reclaim for a specific memcg but, due to the
protection limits, cannot run it.

3) The LRU and slab shrinkers are too generic a place to handle
memcg-related logic. Additionally, memcg can be disabled in the kernel
configuration.
Michal Hocko Nov. 28, 2023, 9:32 a.m. UTC | #9
On Mon 27-11-23 19:16:37, Dmitry Rokosov wrote:
[...]
> > This sounds like a nice-to-have rather than a must. To put it
> > differently: if you make the existing reclaim tracepoints memcg aware
> > (print the memcg id), what prevents you from doing the analysis you
> > need?
> 
> You are right, nothing prevents me from making this analysis... but...
> 
> This approach does have some disadvantages:
> 1) It requires more changes to vmscan. At the very least, the memcg
> object would have to be forwarded to all subfunctions of the LRU and
> slab shrinkers.

We should have the lruvec or the memcg available. lruvec_memcg() could
be used to get the memcg from the lruvec. It might mean more places to
add the id, but arguably this would improve them by identifying where
the memory has been scanned/reclaimed from.
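
For illustration only, a sketch (not code from this thread) of such a
conversion as a small helper built on the existing lruvec_memcg() and
mem_cgroup_id() helpers; lruvec_memcg() returns NULL when CONFIG_MEMCG
is disabled, so the helper falls back to 0:

static unsigned short lruvec_memcg_id(struct lruvec *lruvec)
{
	struct mem_cgroup *memcg = lruvec_memcg(lruvec);

	/* memcg disabled or not associated: report id 0 */
	return memcg ? mem_cgroup_id(memcg) : 0;
}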
 
> 2) With this approach, we lose the ability to trace a situation where
> the kernel requests reclaim for a specific memcg but, due to the
> protection limits, cannot run it.

I do not follow. Could you be more specific please?

> 3) The LRU and slab shrinkers are too generic a place to handle
> memcg-related logic. Additionally, memcg can be disabled in the kernel
> configuration.

Right. This could all be hidden in the tracing code. You simply do not
print the memcg id when the controller is disabled, or you just print 0.
I do not really see any major problems with that.

I would really prefer to focus on that direction rather than adding
another begin/end tracepoint which overlaps with the existing begin/end
traces and provides much more limited information, because I would bet
we will have somebody complaining that mere nr_reclaimed is not
sufficient.
Dmitry Rokosov Nov. 29, 2023, 3:20 p.m. UTC | #10
On Tue, Nov 28, 2023 at 10:32:50AM +0100, Michal Hocko wrote:
[...]
> We should have the lruvec or the memcg available. lruvec_memcg() could
> be used to get the memcg from the lruvec. It might mean more places to
> add the id, but arguably this would improve them by identifying where
> the memory has been scanned/reclaimed from.
>  

Oh, thank you, didn't see this conversion function before...

> > 2) With this approach, we lose the ability to trace a situation where
> > the kernel requests reclaim for a specific memcg but, due to the
> > protection limits, cannot run it.
> 
> I do not follow. Could you be more specific please?
> 

I'm referring to a situation where kswapd or other kernel mm code
requests reclaim of some pages from a memcg, but the reclaim is skipped
by the protection checks. This occurs in the shrink_node_memcgs()
function.

===
		mem_cgroup_calculate_protection(target_memcg, memcg);

		if (mem_cgroup_below_min(target_memcg, memcg)) {
			/*
			 * Hard protection.
			 * If there is no reclaimable memory, OOM.
			 */
			continue;
		} else if (mem_cgroup_below_low(target_memcg, memcg)) {
			/*
			 * Soft protection.
			 * Respect the protection only as long as
			 * there is an unprotected supply
			 * of reclaimable memory from other cgroups.
			 */
			if (!sc->memcg_low_reclaim) {
				sc->memcg_low_skipped = 1;
				continue;
			}
			memcg_memory_event(memcg, MEMCG_LOW);
		}
===

With separate shrink begin()/end() tracepoints, we can detect such a
problem.


> > 3) The LRU and slab shrinkers are too generic a place to handle
> > memcg-related logic. Additionally, memcg can be disabled in the kernel
> > configuration.
> 
> Right. This could all be hidden in the tracing code. You simply do not
> print the memcg id when the controller is disabled, or you just print 0.
> I do not really see any major problems with that.
> 
> I would really prefer to focus on that direction rather than adding
> another begin/end tracepoint which overlaps with the existing begin/end
> traces and provides much more limited information, because I would bet
> we will have somebody complaining that mere nr_reclaimed is not
> sufficient.

Okay, I will try to prepare a new patch version with memcg printing in
the lruvec and slab tracepoints.

Then Andrew should drop the previous patchset, I suppose. Please advise
on the correct workflow steps here.
Dmitry Rokosov Nov. 29, 2023, 3:26 p.m. UTC | #11
On Wed, Nov 29, 2023 at 06:20:57PM +0300, Dmitry Rokosov wrote:
[...]
> Okay, I will try to prepare a new patch version with memcg printing in
> the lruvec and slab tracepoints.
> 
> Then Andrew should drop the previous patchset, I suppose. Please advise
> on the correct workflow steps here.

Actually, it has already been merged into linux-next... I just checked.
Maybe it would be better to prepare lruvec and slab memcg printing as a
separate patch series?

https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=0e7f0c52a76cb22c8633f21bff6e48fabff6016e
Michal Hocko Nov. 29, 2023, 4:06 p.m. UTC | #12
On Wed 29-11-23 18:20:57, Dmitry Rokosov wrote:
> On Tue, Nov 28, 2023 at 10:32:50AM +0100, Michal Hocko wrote:
> > On Mon 27-11-23 19:16:37, Dmitry Rokosov wrote:
[...]
> > > 2) With this approach, we lose the ability to trace a situation where
> > > the kernel requests reclaim for a specific memcg but, due to the
> > > protection limits, cannot run it.
> > 
> > I do not follow. Could you be more specific please?
> > 
> 
> I'm referring to a situation where kswapd or other kernel mm code
> requests reclaim of some pages from a memcg, but the reclaim is skipped
> by the protection checks. This occurs in the shrink_node_memcgs()
> function.

Ohh, you mean reclaim protection

[...]
> With separate shrink begin()/end() tracepoints, we can detect such a
> problem.

How? You are only reporting the number of reclaimed pages, and zero
reclaimed pages can be caused not just by the low/min limits but by
other reasons as well. You would also need to report the number of
scanned/isolated pages.
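
For illustration only, a purely hypothetical sketch (not something
proposed in this thread) of an end event that also reports scan
statistics; the field set and the use of mem_cgroup_id() here are
assumptions:

TRACE_EVENT(mm_vmscan_memcg_shrink_end,

	TP_PROTO(unsigned long nr_scanned, unsigned long nr_reclaimed,
		 const struct mem_cgroup *memcg),

	TP_ARGS(nr_scanned, nr_reclaimed, memcg),

	TP_STRUCT__entry(
		__field(unsigned long, nr_scanned)
		__field(unsigned long, nr_reclaimed)
		__field(unsigned short, memcg_id)
	),

	TP_fast_assign(
		__entry->nr_scanned	= nr_scanned;
		__entry->nr_reclaimed	= nr_reclaimed;
		__entry->memcg_id	= mem_cgroup_id((struct mem_cgroup *)memcg);
	),

	TP_printk("nr_scanned=%lu nr_reclaimed=%lu memcg=%u",
		  __entry->nr_scanned, __entry->nr_reclaimed,
		  __entry->memcg_id)
);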
 
[...]
> Okay, I will try to prepare a new patch version with memcg printing in
> the lruvec and slab tracepoints.
> 
> Then Andrew should drop the previous patchset, I suppose. Please advise
> on the correct workflow steps here.

Andrew usually just drops the patch from his tree, and it will disappear
from linux-next as well.
Dmitry Rokosov Nov. 29, 2023, 4:57 p.m. UTC | #13
On Wed, Nov 29, 2023 at 05:06:37PM +0100, Michal Hocko wrote:
[...]
> How? You are only reporting the number of reclaimed pages, and zero
> reclaimed pages can be caused not just by the low/min limits but by
> other reasons as well. You would also need to report the number of
> scanned/isolated pages.
>  

From my perspective, if memory control group (memcg) protection
restrictions kick in, we can identify them by the absence of an end()
paired with a begin(). For all other reasons, both tracepoints will be
raised.

[...]
> > Okay, I will try to prepare a new patch version with memcg printing in
> > the lruvec and slab tracepoints.
> > 
> > Then Andrew should drop the previous patchset, I suppose. Please advise
> > on the correct workflow steps here.
> 
> Andrew usually just drops the patch from his tree, and it will disappear
> from linux-next as well.

Okay, I understand, thank you!

Andrew, could you please take a look? I am planning to prepare a new
patch version based on Michal's suggestion, so the previous one should
be dropped.
Michal Hocko Nov. 29, 2023, 5:10 p.m. UTC | #14
On Wed 29-11-23 19:57:52, Dmitry Rokosov wrote:
[...]
> > How? You are only reporting the number of reclaimed pages, and zero
> > reclaimed pages can be caused not just by the low/min limits but by
> > other reasons as well. You would also need to report the number of
> > scanned/isolated pages.
> >  
> 
> From my perspective, if memory control group (memcg) protection
> restrictions kick in, we can identify them by the absence of an end()
> paired with a begin(). For all other reasons, both tracepoints will be
> raised.

That is not really a great way to detect that, TBH. Trace events could
be lost, and then you simply do not know what has happened.
Andrew Morton Nov. 29, 2023, 5:33 p.m. UTC | #15
On Wed, 29 Nov 2023 18:20:57 +0300 Dmitry Rokosov <ddrokosov@salutedevices.com> wrote:

> Okay, I will try to prepare a new patch version with memcg printing in
> the lruvec and slab tracepoints.
> 
> Then Andrew should drop the previous patchset, I suppose. Please advise
> on the correct workflow steps here.

This series is present in mm.git's mm-unstable branch.  Note
"unstable".  So dropping the v3 series and merging v4 is totally not a
problem.  It's why this branch exists - it rebases daily and is in high
flux.

When a patchset is considered stabilized and ready, I'll move it into
the mm-stable branch, which is (supposed to be) the non-rebasing tree
for the next merge window.

If you have small fixes then I prefer little fixup patches against what
is presently in mm-unstable.

If you send replacement patches then no problem, I'll check to see
whether I should turn them into little fixup deltas.

I prefer little fixups so that people can see what has changed, so I
can see which review/test issues were addressed and so that people
don't feel a need to re-review the whole patchset.

If generation of little fixups is impractical, I'll drop the old series
entirely and I'll merge the new one.

Each case is a judgement call, please send whatever you think makes
most sense given the above.
Steven Rostedt Nov. 29, 2023, 5:34 p.m. UTC | #16
On Wed, 29 Nov 2023 18:10:33 +0100
Michal Hocko <mhocko@suse.com> wrote:

> > > How? You are only reporting the number of reclaimed pages, and zero
> > > reclaimed pages can be caused not just by the low/min limits but by
> > > other reasons as well. You would also need to report the number of
> > > scanned/isolated pages.
> > >    
> > 
> > From my perspective, if memory control group (memcg) protection
> > restrictions kick in, we can identify them by the absence of an end()
> > paired with a begin(). For all other reasons, both tracepoints will be
> > raised.
> 
> That is not really a great way to detect that, TBH. Trace events could
> be lost, and then you simply do not know what has happened.

Note, you can detect dropped events. If there's a dropped event, you
can ignore a "missing end" after a begin. You could also make synthetic
events that pair an end event with a begin event (using the last begin
event found). Synthetic event creation is not affected by dropped
events.
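
For illustration only, a sketch of that approach using the documented
synthetic-event and hist-trigger syntax; it assumes the begin/end
events expose a 'memcg' field, as the example output above suggests:

  # cd /sys/kernel/tracing
  # echo 'memcg_shrink int memcg; u64 lat' >> synthetic_events
  # echo 'hist:keys=memcg:ts0=common_timestamp.usecs' >> \
        events/vmscan/mm_vmscan_memcg_shrink_begin/trigger
  # echo 'hist:keys=memcg:lat=common_timestamp.usecs-$ts0:onmatch(vmscan.mm_vmscan_memcg_shrink_begin).trace(memcg_shrink,memcg,$lat)' >> \
        events/vmscan/mm_vmscan_memcg_shrink_end/trigger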

There's a lot you can do to get information even with the prospect of
dropped events. I would not use that as a rationale for not using
events.

-- Steve
Dmitry Rokosov Nov. 29, 2023, 5:35 p.m. UTC | #17
On Wed, Nov 29, 2023 at 06:10:33PM +0100, Michal Hocko wrote:
[...]
> > From my perspective, if memory control group (memcg) protection
> > restrictions kick in, we can identify them by the absence of an end()
> > paired with a begin(). For all other reasons, both tracepoints will be
> > raised.
> 
> That is not really a great way to detect that, TBH. Trace events could
> be lost, and then you simply do not know what has happened.

I see, thank you very much for the detailed review! I will prepare a new
patchset with memcg names in the lruvec and slab paths and will be back
soon.
Dmitry Rokosov Nov. 29, 2023, 5:49 p.m. UTC | #18
On Wed, Nov 29, 2023 at 09:33:41AM -0800, Andrew Morton wrote:
> On Wed, 29 Nov 2023 18:20:57 +0300 Dmitry Rokosov <ddrokosov@salutedevices.com> wrote:
[...]
> Each case is a judgement call, please send whatever you think makes
> most sense given the above.

Thank you for the detailed explanation! It is now completely clear to
me! I will be sending the new patch series soon.

Patch

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index e9093fa1c924..a4686afe571d 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -180,6 +180,17 @@  DEFINE_EVENT(mm_vmscan_memcg_reclaim_begin_template, mm_vmscan_memcg_softlimit_r
 	TP_ARGS(order, gfp_flags, memcg)
 );
 
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_begin_template, mm_vmscan_memcg_shrink_begin,
+
+	TP_PROTO(int order, gfp_t gfp_flags, const struct mem_cgroup *memcg),
+
+	TP_ARGS(order, gfp_flags, memcg)
+);
+
+#else
+
+#define trace_mm_vmscan_memcg_shrink_begin(...)
+
 #endif /* CONFIG_MEMCG */
 
 DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template,
@@ -243,6 +254,17 @@  DEFINE_EVENT(mm_vmscan_memcg_reclaim_end_template, mm_vmscan_memcg_softlimit_rec
 	TP_ARGS(nr_reclaimed, memcg)
 );
 
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_end_template, mm_vmscan_memcg_shrink_end,
+
+	TP_PROTO(unsigned long nr_reclaimed, const struct mem_cgroup *memcg),
+
+	TP_ARGS(nr_reclaimed, memcg)
+);
+
+#else
+
+#define trace_mm_vmscan_memcg_shrink_end(...)
+
 #endif /* CONFIG_MEMCG */
 
 TRACE_EVENT(mm_shrink_slab_start,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 45780952f4b5..f7e3ddc5a7ad 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6461,6 +6461,10 @@  static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 		 */
 		cond_resched();
 
+		trace_mm_vmscan_memcg_shrink_begin(sc->order,
+						   sc->gfp_mask,
+						   memcg);
+
 		mem_cgroup_calculate_protection(target_memcg, memcg);
 
 		if (mem_cgroup_below_min(target_memcg, memcg)) {
@@ -6491,6 +6495,9 @@  static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
 			    sc->priority);
 
+		trace_mm_vmscan_memcg_shrink_end(sc->nr_reclaimed - reclaimed,
+						 memcg);
+
 		/* Record the group's reclaim efficiency */
 		if (!sc->proactive)
 			vmpressure(sc->gfp_mask, memcg, false,
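
For reference, once the patch is applied, the new events can be enabled
through the standard tracefs interface like any other vmscan
tracepoint, e.g.:

  # cd /sys/kernel/tracing
  # echo 1 > events/vmscan/mm_vmscan_memcg_shrink_begin/enable
  # echo 1 > events/vmscan/mm_vmscan_memcg_shrink_end/enable
  # cat trace_pipe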