diff mbox series

mm: vmscan: add tracepoints for node reclaim

Message ID 1551341664-13912-1-git-send-email-laoar.shao@gmail.com (mailing list archive)
State New, archived
Headers show
Series mm: vmscan: add tracepoints for node reclaim | expand

Commit Message

Yafang Shao Feb. 28, 2019, 8:14 a.m. UTC
In the page alloc fast path, it may do node reclaim, which may cause
a latency spike.
We should add tracepoint for this event, and also mesure the latency
it causes.

So the two tracepoints below are introduced:
	mm_vmscan_node_reclaim_begin
	mm_vmscan_node_reclaim_end

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 include/trace/events/vmscan.h | 48 +++++++++++++++++++++++++++++++++++++++++++
 mm/vmscan.c                   | 13 +++++++++++-
 2 files changed, 60 insertions(+), 1 deletion(-)

Comments

Souptick Joarder Feb. 28, 2019, 8:59 a.m. UTC | #1
On Thu, Feb 28, 2019 at 1:44 PM Yafang Shao <laoar.shao@gmail.com> wrote:
>
> In the page alloc fast path, it may do node reclaim, which may cause
> latency spike.
> We should add tracepoint for this event, and also mesure the latency
> it causes.

Minor typo: mesure -> measure.

>
> So bellow two tracepoints are introduced,
>         mm_vmscan_node_reclaim_begin
>         mm_vmscan_node_reclaim_end
>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> ---
>  include/trace/events/vmscan.h | 48 +++++++++++++++++++++++++++++++++++++++++++
>  mm/vmscan.c                   | 13 +++++++++++-
>  2 files changed, 60 insertions(+), 1 deletion(-)
>
> diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> index a1cb913..9310d5b 100644
> --- a/include/trace/events/vmscan.h
> +++ b/include/trace/events/vmscan.h
> @@ -465,6 +465,54 @@
>                 __entry->ratio,
>                 show_reclaim_flags(__entry->reclaim_flags))
>  );
> +
> +TRACE_EVENT(mm_vmscan_node_reclaim_begin,
> +
> +       TP_PROTO(int nid, int order, int may_writepage,
> +               gfp_t gfp_flags, int zid),
> +
> +       TP_ARGS(nid, order, may_writepage, gfp_flags, zid),
> +
> +       TP_STRUCT__entry(
> +               __field(int, nid)
> +               __field(int, order)
> +               __field(int, may_writepage)
> +               __field(gfp_t, gfp_flags)
> +               __field(int, zid)
> +       ),
> +
> +       TP_fast_assign(
> +               __entry->nid = nid;
> +               __entry->order = order;
> +               __entry->may_writepage = may_writepage;
> +               __entry->gfp_flags = gfp_flags;
> +               __entry->zid = zid;
> +       ),
> +
> +       TP_printk("nid=%d zid=%d order=%d may_writepage=%d gfp_flags=%s",
> +               __entry->nid,
> +               __entry->zid,
> +               __entry->order,
> +               __entry->may_writepage,
> +               show_gfp_flags(__entry->gfp_flags))
> +);
> +
> +TRACE_EVENT(mm_vmscan_node_reclaim_end,
> +
> +       TP_PROTO(int result),
> +
> +       TP_ARGS(result),
> +
> +       TP_STRUCT__entry(
> +               __field(int, result)
> +       ),
> +
> +       TP_fast_assign(
> +               __entry->result = result;
> +       ),
> +
> +       TP_printk("result=%d", __entry->result)
> +);
>  #endif /* _TRACE_VMSCAN_H */
>
>  /* This part must be outside protection */
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index ac4806f..01a0401 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -4240,6 +4240,12 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
>                 .may_swap = 1,
>                 .reclaim_idx = gfp_zone(gfp_mask),
>         };
> +       int result;

If it goes to v2, then
s/result/ret ?

> +
> +       trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
> +                                       sc.may_writepage,
> +                                       sc.gfp_mask,
> +                                       sc.reclaim_idx);
>
>         cond_resched();
>         fs_reclaim_acquire(sc.gfp_mask);
> @@ -4267,7 +4273,12 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
>         current->flags &= ~PF_SWAPWRITE;
>         memalloc_noreclaim_restore(noreclaim_flag);
>         fs_reclaim_release(sc.gfp_mask);
> -       return sc.nr_reclaimed >= nr_pages;
> +
> +       result = sc.nr_reclaimed >= nr_pages;
> +
> +       trace_mm_vmscan_node_reclaim_end(result);
> +
> +       return result;
>  }
>
>  int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
> --
> 1.8.3.1
>
Yafang Shao Feb. 28, 2019, 9:35 a.m. UTC | #2
On Thu, Feb 28, 2019 at 4:59 PM Souptick Joarder <jrdr.linux@gmail.com> wrote:
>
> On Thu, Feb 28, 2019 at 1:44 PM Yafang Shao <laoar.shao@gmail.com> wrote:
> >
> > In the page alloc fast path, it may do node reclaim, which may cause
> > latency spike.
> > We should add tracepoint for this event, and also mesure the latency
> > it causes.
>
> Minor typo : mesure ->measure.
>

Thanks for your correction.

> >
> > So bellow two tracepoints are introduced,
> >         mm_vmscan_node_reclaim_begin
> >         mm_vmscan_node_reclaim_end
> >
> > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> > ---
> >  include/trace/events/vmscan.h | 48 +++++++++++++++++++++++++++++++++++++++++++
> >  mm/vmscan.c                   | 13 +++++++++++-
> >  2 files changed, 60 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> > index a1cb913..9310d5b 100644
> > --- a/include/trace/events/vmscan.h
> > +++ b/include/trace/events/vmscan.h
> > @@ -465,6 +465,54 @@
> >                 __entry->ratio,
> >                 show_reclaim_flags(__entry->reclaim_flags))
> >  );
> > +
> > +TRACE_EVENT(mm_vmscan_node_reclaim_begin,
> > +
> > +       TP_PROTO(int nid, int order, int may_writepage,
> > +               gfp_t gfp_flags, int zid),
> > +
> > +       TP_ARGS(nid, order, may_writepage, gfp_flags, zid),
> > +
> > +       TP_STRUCT__entry(
> > +               __field(int, nid)
> > +               __field(int, order)
> > +               __field(int, may_writepage)
> > +               __field(gfp_t, gfp_flags)
> > +               __field(int, zid)
> > +       ),
> > +
> > +       TP_fast_assign(
> > +               __entry->nid = nid;
> > +               __entry->order = order;
> > +               __entry->may_writepage = may_writepage;
> > +               __entry->gfp_flags = gfp_flags;
> > +               __entry->zid = zid;
> > +       ),
> > +
> > +       TP_printk("nid=%d zid=%d order=%d may_writepage=%d gfp_flags=%s",
> > +               __entry->nid,
> > +               __entry->zid,
> > +               __entry->order,
> > +               __entry->may_writepage,
> > +               show_gfp_flags(__entry->gfp_flags))
> > +);
> > +
> > +TRACE_EVENT(mm_vmscan_node_reclaim_end,
> > +
> > +       TP_PROTO(int result),
> > +
> > +       TP_ARGS(result),
> > +
> > +       TP_STRUCT__entry(
> > +               __field(int, result)
> > +       ),
> > +
> > +       TP_fast_assign(
> > +               __entry->result = result;
> > +       ),
> > +
> > +       TP_printk("result=%d", __entry->result)
> > +);
> >  #endif /* _TRACE_VMSCAN_H */
> >
> >  /* This part must be outside protection */
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index ac4806f..01a0401 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -4240,6 +4240,12 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
> >                 .may_swap = 1,
> >                 .reclaim_idx = gfp_zone(gfp_mask),
> >         };
> > +       int result;
>
> If it goes to v2, then
> s/result/ret ?
>

Sure. Will change it.

> > +
> > +       trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
> > +                                       sc.may_writepage,
> > +                                       sc.gfp_mask,
> > +                                       sc.reclaim_idx);
> >
> >         cond_resched();
> >         fs_reclaim_acquire(sc.gfp_mask);
> > @@ -4267,7 +4273,12 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
> >         current->flags &= ~PF_SWAPWRITE;
> >         memalloc_noreclaim_restore(noreclaim_flag);
> >         fs_reclaim_release(sc.gfp_mask);
> > -       return sc.nr_reclaimed >= nr_pages;
> > +
> > +       result = sc.nr_reclaimed >= nr_pages;
> > +
> > +       trace_mm_vmscan_node_reclaim_end(result);
> > +
> > +       return result;
> >  }
> >
> >  int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
> > --
> > 1.8.3.1
> >

Thanks
Yafang
Michal Hocko Feb. 28, 2019, 10:17 a.m. UTC | #3
On Thu 28-02-19 16:14:24, Yafang Shao wrote:
> In the page alloc fast path, it may do node reclaim, which may cause
> latency spike.
> We should add tracepoint for this event, and also mesure the latency
> it causes.
> 
> So bellow two tracepoints are introduced,
> 	mm_vmscan_node_reclaim_begin
> 	mm_vmscan_node_reclaim_end

This makes some sense to me. Regular direct reclaim already does have
similar tracepoints. Is there any reason you haven't used
mm_vmscan_direct_reclaim_{begin,end}_template as all other direct reclaim
paths?

> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> ---
>  include/trace/events/vmscan.h | 48 +++++++++++++++++++++++++++++++++++++++++++
>  mm/vmscan.c                   | 13 +++++++++++-
>  2 files changed, 60 insertions(+), 1 deletion(-)
> 
> diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> index a1cb913..9310d5b 100644
> --- a/include/trace/events/vmscan.h
> +++ b/include/trace/events/vmscan.h
> @@ -465,6 +465,54 @@
>  		__entry->ratio,
>  		show_reclaim_flags(__entry->reclaim_flags))
>  );
> +
> +TRACE_EVENT(mm_vmscan_node_reclaim_begin,
> +
> +	TP_PROTO(int nid, int order, int may_writepage,
> +		gfp_t gfp_flags, int zid),
> +
> +	TP_ARGS(nid, order, may_writepage, gfp_flags, zid),
> +
> +	TP_STRUCT__entry(
> +		__field(int, nid)
> +		__field(int, order)
> +		__field(int, may_writepage)
> +		__field(gfp_t, gfp_flags)
> +		__field(int, zid)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->nid = nid;
> +		__entry->order = order;
> +		__entry->may_writepage = may_writepage;
> +		__entry->gfp_flags = gfp_flags;
> +		__entry->zid = zid;
> +	),
> +
> +	TP_printk("nid=%d zid=%d order=%d may_writepage=%d gfp_flags=%s",
> +		__entry->nid,
> +		__entry->zid,
> +		__entry->order,
> +		__entry->may_writepage,
> +		show_gfp_flags(__entry->gfp_flags))
> +);
> +
> +TRACE_EVENT(mm_vmscan_node_reclaim_end,
> +
> +	TP_PROTO(int result),
> +
> +	TP_ARGS(result),
> +
> +	TP_STRUCT__entry(
> +		__field(int, result)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->result = result;
> +	),
> +
> +	TP_printk("result=%d", __entry->result)
> +);
>  #endif /* _TRACE_VMSCAN_H */
>  
>  /* This part must be outside protection */
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index ac4806f..01a0401 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -4240,6 +4240,12 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
>  		.may_swap = 1,
>  		.reclaim_idx = gfp_zone(gfp_mask),
>  	};
> +	int result;
> +
> +	trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
> +					sc.may_writepage,
> +					sc.gfp_mask,
> +					sc.reclaim_idx);
>  
>  	cond_resched();
>  	fs_reclaim_acquire(sc.gfp_mask);
> @@ -4267,7 +4273,12 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
>  	current->flags &= ~PF_SWAPWRITE;
>  	memalloc_noreclaim_restore(noreclaim_flag);
>  	fs_reclaim_release(sc.gfp_mask);
> -	return sc.nr_reclaimed >= nr_pages;
> +
> +	result = sc.nr_reclaimed >= nr_pages;
> +
> +	trace_mm_vmscan_node_reclaim_end(result);
> +
> +	return result;
>  }
>  
>  int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
> -- 
> 1.8.3.1
Yafang Shao Feb. 28, 2019, 10:20 a.m. UTC | #4
On Thu, Feb 28, 2019 at 6:17 PM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Thu 28-02-19 16:14:24, Yafang Shao wrote:
> > In the page alloc fast path, it may do node reclaim, which may cause
> > latency spike.
> > We should add tracepoint for this event, and also mesure the latency
> > it causes.
> >
> > So bellow two tracepoints are introduced,
> >       mm_vmscan_node_reclaim_begin
> >       mm_vmscan_node_reclaim_end
>
> This makes some sense to me. Regular direct reclaim already does have
> similar tracepoints. Is there any reason you haven't used
> mm_vmscan_direct_reclaim_{begin,end}_template as all other direct reclaim
> paths?
>

Because I also want to know the node id, which is not shown in
mm_vmscan_direct_reclaim_{begin,end}_template.

Or should we modify mm_vmscan_direct_reclaim_{begin,end}_template to
show the node id as well?

Thanks
Yafang

> > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> > ---
> >  include/trace/events/vmscan.h | 48 +++++++++++++++++++++++++++++++++++++++++++
> >  mm/vmscan.c                   | 13 +++++++++++-
> >  2 files changed, 60 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> > index a1cb913..9310d5b 100644
> > --- a/include/trace/events/vmscan.h
> > +++ b/include/trace/events/vmscan.h
> > @@ -465,6 +465,54 @@
> >               __entry->ratio,
> >               show_reclaim_flags(__entry->reclaim_flags))
> >  );
> > +
> > +TRACE_EVENT(mm_vmscan_node_reclaim_begin,
> > +
> > +     TP_PROTO(int nid, int order, int may_writepage,
> > +             gfp_t gfp_flags, int zid),
> > +
> > +     TP_ARGS(nid, order, may_writepage, gfp_flags, zid),
> > +
> > +     TP_STRUCT__entry(
> > +             __field(int, nid)
> > +             __field(int, order)
> > +             __field(int, may_writepage)
> > +             __field(gfp_t, gfp_flags)
> > +             __field(int, zid)
> > +     ),
> > +
> > +     TP_fast_assign(
> > +             __entry->nid = nid;
> > +             __entry->order = order;
> > +             __entry->may_writepage = may_writepage;
> > +             __entry->gfp_flags = gfp_flags;
> > +             __entry->zid = zid;
> > +     ),
> > +
> > +     TP_printk("nid=%d zid=%d order=%d may_writepage=%d gfp_flags=%s",
> > +             __entry->nid,
> > +             __entry->zid,
> > +             __entry->order,
> > +             __entry->may_writepage,
> > +             show_gfp_flags(__entry->gfp_flags))
> > +);
> > +
> > +TRACE_EVENT(mm_vmscan_node_reclaim_end,
> > +
> > +     TP_PROTO(int result),
> > +
> > +     TP_ARGS(result),
> > +
> > +     TP_STRUCT__entry(
> > +             __field(int, result)
> > +     ),
> > +
> > +     TP_fast_assign(
> > +             __entry->result = result;
> > +     ),
> > +
> > +     TP_printk("result=%d", __entry->result)
> > +);
> >  #endif /* _TRACE_VMSCAN_H */
> >
> >  /* This part must be outside protection */
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index ac4806f..01a0401 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -4240,6 +4240,12 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
> >               .may_swap = 1,
> >               .reclaim_idx = gfp_zone(gfp_mask),
> >       };
> > +     int result;
> > +
> > +     trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
> > +                                     sc.may_writepage,
> > +                                     sc.gfp_mask,
> > +                                     sc.reclaim_idx);
> >
> >       cond_resched();
> >       fs_reclaim_acquire(sc.gfp_mask);
> > @@ -4267,7 +4273,12 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
> >       current->flags &= ~PF_SWAPWRITE;
> >       memalloc_noreclaim_restore(noreclaim_flag);
> >       fs_reclaim_release(sc.gfp_mask);
> > -     return sc.nr_reclaimed >= nr_pages;
> > +
> > +     result = sc.nr_reclaimed >= nr_pages;
> > +
> > +     trace_mm_vmscan_node_reclaim_end(result);
> > +
> > +     return result;
> >  }
> >
> >  int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
> > --
> > 1.8.3.1
>
> --
> Michal Hocko
> SUSE Labs
Vlastimil Babka Feb. 28, 2019, 10:21 a.m. UTC | #5
On 2/28/19 9:14 AM, Yafang Shao wrote:
> In the page alloc fast path, it may do node reclaim, which may cause
> latency spike.
> We should add tracepoint for this event, and also mesure the latency
> it causes.
> 
> So bellow two tracepoints are introduced,
> 	mm_vmscan_node_reclaim_begin
> 	mm_vmscan_node_reclaim_end
> 
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> ---
>  include/trace/events/vmscan.h | 48 +++++++++++++++++++++++++++++++++++++++++++
>  mm/vmscan.c                   | 13 +++++++++++-
>  2 files changed, 60 insertions(+), 1 deletion(-)
> 
> diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> index a1cb913..9310d5b 100644
> --- a/include/trace/events/vmscan.h
> +++ b/include/trace/events/vmscan.h
> @@ -465,6 +465,54 @@
>  		__entry->ratio,
>  		show_reclaim_flags(__entry->reclaim_flags))
>  );
> +
> +TRACE_EVENT(mm_vmscan_node_reclaim_begin,
> +
> +	TP_PROTO(int nid, int order, int may_writepage,
> +		gfp_t gfp_flags, int zid),
> +
> +	TP_ARGS(nid, order, may_writepage, gfp_flags, zid),
> +
> +	TP_STRUCT__entry(
> +		__field(int, nid)
> +		__field(int, order)
> +		__field(int, may_writepage)

For node reclaim may_writepage is statically set in node_reclaim_mode,
so I'm not sure it's worth including it.

> +		__field(gfp_t, gfp_flags)
> +		__field(int, zid)

zid seems wasteful and misleading as it's simply derived by
gfp_zone(gfp_mask), so I would drop it.

> +	),
> +
> +	TP_fast_assign(
> +		__entry->nid = nid;
> +		__entry->order = order;
> +		__entry->may_writepage = may_writepage;
> +		__entry->gfp_flags = gfp_flags;
> +		__entry->zid = zid;
> +	),
> +
> +	TP_printk("nid=%d zid=%d order=%d may_writepage=%d gfp_flags=%s",
> +		__entry->nid,
> +		__entry->zid,
> +		__entry->order,
> +		__entry->may_writepage,
> +		show_gfp_flags(__entry->gfp_flags))
> +);
> +
> +TRACE_EVENT(mm_vmscan_node_reclaim_end,
> +
> +	TP_PROTO(int result),
> +
> +	TP_ARGS(result),
> +
> +	TP_STRUCT__entry(
> +		__field(int, result)

Reporting sc.nr_reclaimed sounds more useful and in line with other
reclaim tracepoints. Result (sc.nr_reclaimed >= nr_pages) can then be
derived by postprocessing as the beginning tracepoint contains 'order'
thus we know nr_pages?

> +	),
> +
> +	TP_fast_assign(
> +		__entry->result = result;
> +	),
> +
> +	TP_printk("result=%d", __entry->result)
> +);
>  #endif /* _TRACE_VMSCAN_H */
>  
>  /* This part must be outside protection */
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index ac4806f..01a0401 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -4240,6 +4240,12 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
>  		.may_swap = 1,
>  		.reclaim_idx = gfp_zone(gfp_mask),
>  	};
> +	int result;
> +
> +	trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
> +					sc.may_writepage,
> +					sc.gfp_mask,
> +					sc.reclaim_idx);
>  
>  	cond_resched();
>  	fs_reclaim_acquire(sc.gfp_mask);
> @@ -4267,7 +4273,12 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
>  	current->flags &= ~PF_SWAPWRITE;
>  	memalloc_noreclaim_restore(noreclaim_flag);
>  	fs_reclaim_release(sc.gfp_mask);
> -	return sc.nr_reclaimed >= nr_pages;
> +
> +	result = sc.nr_reclaimed >= nr_pages;
> +
> +	trace_mm_vmscan_node_reclaim_end(result);
> +
> +	return result;
>  }
>  
>  int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
>
Michal Hocko Feb. 28, 2019, 10:28 a.m. UTC | #6
On Thu 28-02-19 18:20:16, Yafang Shao wrote:
> On Thu, Feb 28, 2019 at 6:17 PM Michal Hocko <mhocko@kernel.org> wrote:
> >
> > On Thu 28-02-19 16:14:24, Yafang Shao wrote:
> > > In the page alloc fast path, it may do node reclaim, which may cause
> > > latency spike.
> > > We should add tracepoint for this event, and also mesure the latency
> > > it causes.
> > >
> > > So bellow two tracepoints are introduced,
> > >       mm_vmscan_node_reclaim_begin
> > >       mm_vmscan_node_reclaim_end
> >
> > This makes some sense to me. Regular direct reclaim already does have
> > similar tracepoints. Is there any reason you haven't used
> > mm_vmscan_direct_reclaim_{begin,end}_template as all other direct reclaim
> > paths?
> >
> 
> Because I also want to know the node id, which is not show in
> mm_vmscan_direct_reclaim_{begin,end}_template.
> 
> Or should we modify mm_vmscan_direct_reclaim_{begin,end}_template to
> show the node id as well ?

OK, I see. I thought it was there, but it would make much less sense there
than for the node reclaim, for sure. A separate tracepoint makes more sense
then.
Yafang Shao Feb. 28, 2019, 10:34 a.m. UTC | #7
On Thu, Feb 28, 2019 at 6:21 PM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 2/28/19 9:14 AM, Yafang Shao wrote:
> > In the page alloc fast path, it may do node reclaim, which may cause
> > latency spike.
> > We should add tracepoint for this event, and also mesure the latency
> > it causes.
> >
> > So bellow two tracepoints are introduced,
> >       mm_vmscan_node_reclaim_begin
> >       mm_vmscan_node_reclaim_end
> >
> > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> > ---
> >  include/trace/events/vmscan.h | 48 +++++++++++++++++++++++++++++++++++++++++++
> >  mm/vmscan.c                   | 13 +++++++++++-
> >  2 files changed, 60 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> > index a1cb913..9310d5b 100644
> > --- a/include/trace/events/vmscan.h
> > +++ b/include/trace/events/vmscan.h
> > @@ -465,6 +465,54 @@
> >               __entry->ratio,
> >               show_reclaim_flags(__entry->reclaim_flags))
> >  );
> > +
> > +TRACE_EVENT(mm_vmscan_node_reclaim_begin,
> > +
> > +     TP_PROTO(int nid, int order, int may_writepage,
> > +             gfp_t gfp_flags, int zid),
> > +
> > +     TP_ARGS(nid, order, may_writepage, gfp_flags, zid),
> > +
> > +     TP_STRUCT__entry(
> > +             __field(int, nid)
> > +             __field(int, order)
> > +             __field(int, may_writepage)
>
> For node reclaim may_writepage is statically set in node_reclaim_mode,
> so I'm not sure it's worth including it.
>
> > +             __field(gfp_t, gfp_flags)
> > +             __field(int, zid)
>
> zid seems wasteful and misleading as it's simply derived by
> gfp_zone(gfp_mask), so I would drop it.
>

I agree with you that may_writepage and zid are wasteful, but I found
that they are in other tracepoints in this file,
so I placed them in this tracepoint as well.

Seems we'd better drop them from the other tracepoints as well?

> > +     ),
> > +
> > +     TP_fast_assign(
> > +             __entry->nid = nid;
> > +             __entry->order = order;
> > +             __entry->may_writepage = may_writepage;
> > +             __entry->gfp_flags = gfp_flags;
> > +             __entry->zid = zid;
> > +     ),
> > +
> > +     TP_printk("nid=%d zid=%d order=%d may_writepage=%d gfp_flags=%s",
> > +             __entry->nid,
> > +             __entry->zid,
> > +             __entry->order,
> > +             __entry->may_writepage,
> > +             show_gfp_flags(__entry->gfp_flags))
> > +);
> > +
> > +TRACE_EVENT(mm_vmscan_node_reclaim_end,
> > +
> > +     TP_PROTO(int result),
> > +
> > +     TP_ARGS(result),
> > +
> > +     TP_STRUCT__entry(
> > +             __field(int, result)
>
> Reporting sc.nr_reclaimed sounds more useful and in line with other
> reclaim tracepoints. Result (sc.nr_reclaimed >= nr_pages) can then be
> derived by postprocessing as the beginning tracepoint contains 'order'
> thus we know nr_pages?
>

Seems reasonable.
Will change it.

> > +     ),
> > +
> > +     TP_fast_assign(
> > +             __entry->result = result;
> > +     ),
> > +
> > +     TP_printk("result=%d", __entry->result)
> > +);
> >  #endif /* _TRACE_VMSCAN_H */
> >
> >  /* This part must be outside protection */
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index ac4806f..01a0401 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -4240,6 +4240,12 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
> >               .may_swap = 1,
> >               .reclaim_idx = gfp_zone(gfp_mask),
> >       };
> > +     int result;
> > +
> > +     trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
> > +                                     sc.may_writepage,
> > +                                     sc.gfp_mask,
> > +                                     sc.reclaim_idx);
> >
> >       cond_resched();
> >       fs_reclaim_acquire(sc.gfp_mask);
> > @@ -4267,7 +4273,12 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
> >       current->flags &= ~PF_SWAPWRITE;
> >       memalloc_noreclaim_restore(noreclaim_flag);
> >       fs_reclaim_release(sc.gfp_mask);
> > -     return sc.nr_reclaimed >= nr_pages;
> > +
> > +     result = sc.nr_reclaimed >= nr_pages;
> > +
> > +     trace_mm_vmscan_node_reclaim_end(result);
> > +
> > +     return result;
> >  }
> >
> >  int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
> >
>

Thanks
Yafang
Vlastimil Babka Feb. 28, 2019, 10:44 a.m. UTC | #8
On 2/28/19 11:34 AM, Yafang Shao wrote:
> On Thu, Feb 28, 2019 at 6:21 PM Vlastimil Babka <vbabka@suse.cz> wrote:
>>
>> On 2/28/19 9:14 AM, Yafang Shao wrote:
>>> In the page alloc fast path, it may do node reclaim, which may cause
>>> latency spike.
>>> We should add tracepoint for this event, and also mesure the latency
>>> it causes.
>>>
>>> So bellow two tracepoints are introduced,
>>>       mm_vmscan_node_reclaim_begin
>>>       mm_vmscan_node_reclaim_end
>>>
>>> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
>>> ---
>>>  include/trace/events/vmscan.h | 48 +++++++++++++++++++++++++++++++++++++++++++
>>>  mm/vmscan.c                   | 13 +++++++++++-
>>>  2 files changed, 60 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
>>> index a1cb913..9310d5b 100644
>>> --- a/include/trace/events/vmscan.h
>>> +++ b/include/trace/events/vmscan.h
>>> @@ -465,6 +465,54 @@
>>>               __entry->ratio,
>>>               show_reclaim_flags(__entry->reclaim_flags))
>>>  );
>>> +
>>> +TRACE_EVENT(mm_vmscan_node_reclaim_begin,
>>> +
>>> +     TP_PROTO(int nid, int order, int may_writepage,
>>> +             gfp_t gfp_flags, int zid),
>>> +
>>> +     TP_ARGS(nid, order, may_writepage, gfp_flags, zid),
>>> +
>>> +     TP_STRUCT__entry(
>>> +             __field(int, nid)
>>> +             __field(int, order)
>>> +             __field(int, may_writepage)
>>
>> For node reclaim may_writepage is statically set in node_reclaim_mode,
>> so I'm not sure it's worth including it.
>>
>>> +             __field(gfp_t, gfp_flags)
>>> +             __field(int, zid)
>>
>> zid seems wasteful and misleading as it's simply derived by
>> gfp_zone(gfp_mask), so I would drop it.
>>
> 
> I agree with you that may_writepage and zid is wasteful, but I found
> they are in other tracepoints in this file,
> so I place them in this tracepoint as well.

I see zid only in kswapd waking tracepoints? That's a different kind of
event.

> Seems we'd better drop them from other tracepoints as well ?

Hmm, it seems may_writepage in other tracepoints depends on laptop_mode
which is also a static setting. do_try_to_free_pages() can override it
due to priority, but that doesn't affect the tracepoints. If they are to
be dropped, it would be a separate patch though.
Yafang Shao Feb. 28, 2019, 10:48 a.m. UTC | #9
On Thu, Feb 28, 2019 at 6:44 PM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 2/28/19 11:34 AM, Yafang Shao wrote:
> > On Thu, Feb 28, 2019 at 6:21 PM Vlastimil Babka <vbabka@suse.cz> wrote:
> >>
> >> On 2/28/19 9:14 AM, Yafang Shao wrote:
> >>> In the page alloc fast path, it may do node reclaim, which may cause
> >>> latency spike.
> >>> We should add tracepoint for this event, and also mesure the latency
> >>> it causes.
> >>>
> >>> So bellow two tracepoints are introduced,
> >>>       mm_vmscan_node_reclaim_begin
> >>>       mm_vmscan_node_reclaim_end
> >>>
> >>> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> >>> ---
> >>>  include/trace/events/vmscan.h | 48 +++++++++++++++++++++++++++++++++++++++++++
> >>>  mm/vmscan.c                   | 13 +++++++++++-
> >>>  2 files changed, 60 insertions(+), 1 deletion(-)
> >>>
> >>> diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> >>> index a1cb913..9310d5b 100644
> >>> --- a/include/trace/events/vmscan.h
> >>> +++ b/include/trace/events/vmscan.h
> >>> @@ -465,6 +465,54 @@
> >>>               __entry->ratio,
> >>>               show_reclaim_flags(__entry->reclaim_flags))
> >>>  );
> >>> +
> >>> +TRACE_EVENT(mm_vmscan_node_reclaim_begin,
> >>> +
> >>> +     TP_PROTO(int nid, int order, int may_writepage,
> >>> +             gfp_t gfp_flags, int zid),
> >>> +
> >>> +     TP_ARGS(nid, order, may_writepage, gfp_flags, zid),
> >>> +
> >>> +     TP_STRUCT__entry(
> >>> +             __field(int, nid)
> >>> +             __field(int, order)
> >>> +             __field(int, may_writepage)
> >>
> >> For node reclaim may_writepage is statically set in node_reclaim_mode,
> >> so I'm not sure it's worth including it.
> >>
> >>> +             __field(gfp_t, gfp_flags)
> >>> +             __field(int, zid)
> >>
> >> zid seems wasteful and misleading as it's simply derived by
> >> gfp_zone(gfp_mask), so I would drop it.
> >>
> >
> > I agree with you that may_writepage and zid is wasteful, but I found
> > they are in other tracepoints in this file,
> > so I place them in this tracepoint as well.
>
> I see zid only in kswapd waking tracepoints? That's different kind of
> event.
>

Please see mm_vmscan_wakeup_kswapd and classzone_idx in
mm_vmscan_direct_reclaim_begin_template.

mm_vmscan_direct_reclaim_begin_template:
    "order=%d may_writepage=%d gfp_flags=%s classzone_idx=%d"

mm_vmscan_wakeup_kswapd:
    "nid=%d zid=%d order=%d gfp_flags=%s"








> > Seems we'd better drop them from other tracepoints as well ?
>
> Hmm seems may_writepage in other tracepoints depends on laptop_mode
> which is also a static setting. do_try_to_free_pages() can override it
> due to priority, but that doesn't affect the tracepoints. If they are to
> be dropped, it would be a separate patch though.

OK.

Thanks
Yafang
diff mbox series

Patch

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index a1cb913..9310d5b 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -465,6 +465,54 @@ 
 		__entry->ratio,
 		show_reclaim_flags(__entry->reclaim_flags))
 );
+
+TRACE_EVENT(mm_vmscan_node_reclaim_begin,
+
+	TP_PROTO(int nid, int order, int may_writepage,
+		gfp_t gfp_flags, int zid),
+
+	TP_ARGS(nid, order, may_writepage, gfp_flags, zid),
+
+	TP_STRUCT__entry(
+		__field(int, nid)
+		__field(int, order)
+		__field(int, may_writepage)
+		__field(gfp_t, gfp_flags)
+		__field(int, zid)
+	),
+
+	TP_fast_assign(
+		__entry->nid = nid;
+		__entry->order = order;
+		__entry->may_writepage = may_writepage;
+		__entry->gfp_flags = gfp_flags;
+		__entry->zid = zid;
+	),
+
+	TP_printk("nid=%d zid=%d order=%d may_writepage=%d gfp_flags=%s",
+		__entry->nid,
+		__entry->zid,
+		__entry->order,
+		__entry->may_writepage,
+		show_gfp_flags(__entry->gfp_flags))
+);
+
+TRACE_EVENT(mm_vmscan_node_reclaim_end,
+
+	TP_PROTO(int result),
+
+	TP_ARGS(result),
+
+	TP_STRUCT__entry(
+		__field(int, result)
+	),
+
+	TP_fast_assign(
+		__entry->result = result;
+	),
+
+	TP_printk("result=%d", __entry->result)
+);
 #endif /* _TRACE_VMSCAN_H */
 
 /* This part must be outside protection */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ac4806f..01a0401 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4240,6 +4240,12 @@  static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 		.may_swap = 1,
 		.reclaim_idx = gfp_zone(gfp_mask),
 	};
+	int result;
+
+	trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
+					sc.may_writepage,
+					sc.gfp_mask,
+					sc.reclaim_idx);
 
 	cond_resched();
 	fs_reclaim_acquire(sc.gfp_mask);
@@ -4267,7 +4273,12 @@  static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 	current->flags &= ~PF_SWAPWRITE;
 	memalloc_noreclaim_restore(noreclaim_flag);
 	fs_reclaim_release(sc.gfp_mask);
-	return sc.nr_reclaimed >= nr_pages;
+
+	result = sc.nr_reclaimed >= nr_pages;
+
+	trace_mm_vmscan_node_reclaim_end(result);
+
+	return result;
 }
 
 int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)