
[1/6] writeback: collect stats of all wb of bdi in bdi_debug_stats_show

Message ID: 20240320110222.6564-2-shikemeng@huaweicloud.com
State: New, archived
Series: Improve visibility of writeback

Commit Message

Kemeng Shi March 20, 2024, 11:02 a.m. UTC
/sys/kernel/debug/bdi/xxx/stats is supposed to show writeback information
for the whole bdi, but only the writeback information of the wb in the root
cgroup is collected, so writeback information from non-root cgroups is
missing. To be more specific, consider the following case:

/* create writeback cgroup */
cd /sys/fs/cgroup
echo "+memory +io" > cgroup.subtree_control
mkdir group1
cd group1
echo $$ > cgroup.procs
/* do writeback in cgroup */
fio -name test -filename=/dev/vdb ...
/* get writeback info of bdi */
cat /sys/kernel/debug/bdi/xxx/stats
The cat output unexpectedly implies that there is no writeback on the target
bdi.

Fix this by collecting stats from all wbs of the bdi instead of only the wb
in the root cgroup.

Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
---
 mm/backing-dev.c | 93 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 70 insertions(+), 23 deletions(-)

Comments

Brian Foster March 20, 2024, 1:21 p.m. UTC | #1
On Wed, Mar 20, 2024 at 07:02:17PM +0800, Kemeng Shi wrote:
> /sys/kernel/debug/bdi/xxx/stats is supposed to show writeback information
> of whole bdi, but only writeback information of bdi in root cgroup is
> collected. So writeback information in non-root cgroup are missing now.
> To be more specific, considering following case:
> 
> /* create writeback cgroup */
> cd /sys/fs/cgroup
> echo "+memory +io" > cgroup.subtree_control
> mkdir group1
> cd group1
> echo $$ > cgroup.procs
> /* do writeback in cgroup */
> fio -name test -filename=/dev/vdb ...
> /* get writeback info of bdi */
> cat /sys/kernel/debug/bdi/xxx/stats
> The cat result unexpectedly implies that there is no writeback on target
> bdi.
> 
> Fix this by collecting stats of all wb in bdi instead of only wb in
> root cgroup.
> 
> Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
> ---
>  mm/backing-dev.c | 93 ++++++++++++++++++++++++++++++++++++------------
>  1 file changed, 70 insertions(+), 23 deletions(-)
> 
> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> index 5f2be8c8df11..788702b6c5dd 100644
> --- a/mm/backing-dev.c
> +++ b/mm/backing-dev.c
...
> @@ -46,31 +59,65 @@ static void bdi_debug_init(void)
>  	bdi_debug_root = debugfs_create_dir("bdi", NULL);
>  }
>  
...
> +#ifdef CONFIG_CGROUP_WRITEBACK
> +static void bdi_collect_stats(struct backing_dev_info *bdi,
> +			      struct wb_stats *stats)
> +{
> +	struct bdi_writeback *wb;
> +
> +	/* protect wb from release */
> +	mutex_lock(&bdi->cgwb_release_mutex);
> +	list_for_each_entry(wb, &bdi->wb_list, bdi_node)
> +		collect_wb_stats(stats, wb);
> +	mutex_unlock(&bdi->cgwb_release_mutex);
> +}
> +#else
> +static void bdi_collect_stats(struct backing_dev_info *bdi,
> +			      struct wb_stats *stats)
> +{
> +	collect_wb_stats(stats, &bdi->wb);
> +}
> +#endif
> +

I'm not familiar enough with the cgwb code to say for sure (and I'd
probably wait for more high level feedback before worrying too much
about this), but do we need the ifdef here just to iterate ->wb_list?
From looking at the code, it appears bdi->wb ends up on the list while
the bdi is registered for both cases, so that distinction seems
unnecessary. WRT to wb release protection, I wonder if this could use a
combination of rcu_read_lock()/list_for_each_safe() and wb_tryget() on
each wb before collecting its stats..? See how bdi_split_work_to_wbs()
works, for example.
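
Roughly, I'm picturing something like the following (untested sketch only,
and assuming the collect_wb_stats() helper added by this patch):

static void bdi_collect_stats(struct backing_dev_info *bdi,
			      struct wb_stats *stats)
{
	struct bdi_writeback *wb;

	rcu_read_lock();
	list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
		/* tryget fails for a wb whose refcount has been killed */
		if (!wb_tryget(wb))
			continue;

		collect_wb_stats(stats, wb);
		wb_put(wb);
	}
	rcu_read_unlock();
}

IIRC wb_tryget()/wb_put() are no-ops for the root bdi->wb, so that one would
still always be counted.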

Also I see a patch conflict/compile error on patch 2 due to
__wb_calc_thresh() only taking one parameter in my tree. What's the
baseline commit for this series?

Brian

> +static int bdi_debug_stats_show(struct seq_file *m, void *v)
> +{
> +	struct backing_dev_info *bdi = m->private;
> +	unsigned long background_thresh;
> +	unsigned long dirty_thresh;
> +	struct wb_stats stats;
> +	unsigned long tot_bw;
> +
>  	global_dirty_limits(&background_thresh, &dirty_thresh);
> -	wb_thresh = wb_calc_thresh(wb, dirty_thresh);
> +
> +	memset(&stats, 0, sizeof(stats));
> +	stats.dirty_thresh = dirty_thresh;
> +	bdi_collect_stats(bdi, &stats);
> +
> +	tot_bw = atomic_long_read(&bdi->tot_write_bandwidth);
>  
>  	seq_printf(m,
>  		   "BdiWriteback:       %10lu kB\n"
> @@ -87,18 +134,18 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
>  		   "b_dirty_time:       %10lu\n"
>  		   "bdi_list:           %10u\n"
>  		   "state:              %10lx\n",
> -		   (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
> -		   (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
> -		   K(wb_thresh),
> +		   K(stats.nr_writeback),
> +		   K(stats.nr_reclaimable),
> +		   K(stats.wb_thresh),
>  		   K(dirty_thresh),
>  		   K(background_thresh),
> -		   (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
> -		   (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
> -		   (unsigned long) K(wb->write_bandwidth),
> -		   nr_dirty,
> -		   nr_io,
> -		   nr_more_io,
> -		   nr_dirty_time,
> +		   K(stats.nr_dirtied),
> +		   K(stats.nr_written),
> +		   K(tot_bw),
> +		   stats.nr_dirty,
> +		   stats.nr_io,
> +		   stats.nr_more_io,
> +		   stats.nr_dirty_time,
>  		   !list_empty(&bdi->bdi_list), bdi->wb.state);
>  
>  	return 0;
> -- 
> 2.30.0
> 
>
Kemeng Shi March 21, 2024, 3:44 a.m. UTC | #2
on 3/20/2024 9:21 PM, Brian Foster wrote:
> On Wed, Mar 20, 2024 at 07:02:17PM +0800, Kemeng Shi wrote:
>> /sys/kernel/debug/bdi/xxx/stats is supposed to show writeback information
>> of whole bdi, but only writeback information of bdi in root cgroup is
>> collected. So writeback information in non-root cgroup are missing now.
>> To be more specific, considering following case:
>>
>> /* create writeback cgroup */
>> cd /sys/fs/cgroup
>> echo "+memory +io" > cgroup.subtree_control
>> mkdir group1
>> cd group1
>> echo $$ > cgroup.procs
>> /* do writeback in cgroup */
>> fio -name test -filename=/dev/vdb ...
>> /* get writeback info of bdi */
>> cat /sys/kernel/debug/bdi/xxx/stats
>> The cat result unexpectedly implies that there is no writeback on target
>> bdi.
>>
>> Fix this by collecting stats of all wb in bdi instead of only wb in
>> root cgroup.
>>
>> Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
>> ---
>>  mm/backing-dev.c | 93 ++++++++++++++++++++++++++++++++++++------------
>>  1 file changed, 70 insertions(+), 23 deletions(-)
>>
>> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
>> index 5f2be8c8df11..788702b6c5dd 100644
>> --- a/mm/backing-dev.c
>> +++ b/mm/backing-dev.c
> ...
>> @@ -46,31 +59,65 @@ static void bdi_debug_init(void)
>>  	bdi_debug_root = debugfs_create_dir("bdi", NULL);
>>  }
>>  
> ...
>> +#ifdef CONFIG_CGROUP_WRITEBACK
>> +static void bdi_collect_stats(struct backing_dev_info *bdi,
>> +			      struct wb_stats *stats)
>> +{
>> +	struct bdi_writeback *wb;
>> +
>> +	/* protect wb from release */
>> +	mutex_lock(&bdi->cgwb_release_mutex);
>> +	list_for_each_entry(wb, &bdi->wb_list, bdi_node)
>> +		collect_wb_stats(stats, wb);
>> +	mutex_unlock(&bdi->cgwb_release_mutex);
>> +}
>> +#else
>> +static void bdi_collect_stats(struct backing_dev_info *bdi,
>> +			      struct wb_stats *stats)
>> +{
>> +	collect_wb_stats(stats, &bdi->wb);
>> +}
>> +#endif
>> +
> 
> I'm not familiar enough with the cgwb code to say for sure (and I'd
> probably wait for more high level feedback before worrying too much
> about this), but do we need the ifdef here just to iterate ->wb_list?
> From looking at the code, it appears bdi->wb ends up on the list while
> the bdi is registered for both cases, so that distinction seems
> unnecessary. WRT to wb release protection, I wonder if this could use a
Currently, we use ifdefs to avoid unnecessary cost when CONFIG_CGROUP_WRITEBACK
is not enabled; see the definitions of cgwb_bdi_register and
cgwb_remove_from_bdi_list for example. So I tried to define bdi_collect_stats
in a similar way.
> combination of rcu_read_lock()/list_for_each_safe() and wb_tryget() on
> each wb before collecting its stats..? See how bdi_split_work_to_wbs()
> works, for example.
The combination of rcu_read_lock()/list_for_each_safe() and wb_tryget()
should work fine.
With the ifdef, bdi_collect_stats incurs no extra cost when
CONFIG_CGROUP_WRITEBACK is not enabled and is consistent with the existing
code style, so I still prefer this way. That said, the extra cost is not a
big deal as it only exists in debug code, so it's acceptable to use the
suggested combination in the next version if you are still strongly against
this.

> 
> Also I see a patch conflict/compile error on patch 2 due to
> __wb_calc_thresh() only taking one parameter in my tree. What's the
> baseline commit for this series?
> 
Sorry for missing this; this series is based on another patchset [1] which is
still under review.
Looking forward to your reply!

Thanks,
Kemeng

[1] https://lore.kernel.org/lkml/20240123183332.876854-1-shikemeng@huaweicloud.com/T/#mc6455784a63d0f8aa1a2f5aff325abcdf9336b76

> Brian
> 
>> +static int bdi_debug_stats_show(struct seq_file *m, void *v)
>> +{
>> +	struct backing_dev_info *bdi = m->private;
>> +	unsigned long background_thresh;
>> +	unsigned long dirty_thresh;
>> +	struct wb_stats stats;
>> +	unsigned long tot_bw;
>> +
>>  	global_dirty_limits(&background_thresh, &dirty_thresh);
>> -	wb_thresh = wb_calc_thresh(wb, dirty_thresh);
>> +
>> +	memset(&stats, 0, sizeof(stats));
>> +	stats.dirty_thresh = dirty_thresh;
>> +	bdi_collect_stats(bdi, &stats);
>> +
>> +	tot_bw = atomic_long_read(&bdi->tot_write_bandwidth);
>>  
>>  	seq_printf(m,
>>  		   "BdiWriteback:       %10lu kB\n"
>> @@ -87,18 +134,18 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
>>  		   "b_dirty_time:       %10lu\n"
>>  		   "bdi_list:           %10u\n"
>>  		   "state:              %10lx\n",
>> -		   (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
>> -		   (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
>> -		   K(wb_thresh),
>> +		   K(stats.nr_writeback),
>> +		   K(stats.nr_reclaimable),
>> +		   K(stats.wb_thresh),
>>  		   K(dirty_thresh),
>>  		   K(background_thresh),
>> -		   (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
>> -		   (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
>> -		   (unsigned long) K(wb->write_bandwidth),
>> -		   nr_dirty,
>> -		   nr_io,
>> -		   nr_more_io,
>> -		   nr_dirty_time,
>> +		   K(stats.nr_dirtied),
>> +		   K(stats.nr_written),
>> +		   K(tot_bw),
>> +		   stats.nr_dirty,
>> +		   stats.nr_io,
>> +		   stats.nr_more_io,
>> +		   stats.nr_dirty_time,
>>  		   !list_empty(&bdi->bdi_list), bdi->wb.state);
>>  
>>  	return 0;
>> -- 
>> 2.30.0
>>
>>
> 
>
Brian Foster March 21, 2024, 12:10 p.m. UTC | #3
On Thu, Mar 21, 2024 at 11:44:40AM +0800, Kemeng Shi wrote:
> 
> 
> on 3/20/2024 9:21 PM, Brian Foster wrote:
> > On Wed, Mar 20, 2024 at 07:02:17PM +0800, Kemeng Shi wrote:
> >> /sys/kernel/debug/bdi/xxx/stats is supposed to show writeback information
> >> of whole bdi, but only writeback information of bdi in root cgroup is
> >> collected. So writeback information in non-root cgroup are missing now.
> >> To be more specific, considering following case:
> >>
> >> /* create writeback cgroup */
> >> cd /sys/fs/cgroup
> >> echo "+memory +io" > cgroup.subtree_control
> >> mkdir group1
> >> cd group1
> >> echo $$ > cgroup.procs
> >> /* do writeback in cgroup */
> >> fio -name test -filename=/dev/vdb ...
> >> /* get writeback info of bdi */
> >> cat /sys/kernel/debug/bdi/xxx/stats
> >> The cat result unexpectedly implies that there is no writeback on target
> >> bdi.
> >>
> >> Fix this by collecting stats of all wb in bdi instead of only wb in
> >> root cgroup.
> >>
> >> Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
> >> ---
> >>  mm/backing-dev.c | 93 ++++++++++++++++++++++++++++++++++++------------
> >>  1 file changed, 70 insertions(+), 23 deletions(-)
> >>
> >> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> >> index 5f2be8c8df11..788702b6c5dd 100644
> >> --- a/mm/backing-dev.c
> >> +++ b/mm/backing-dev.c
> > ...
> >> @@ -46,31 +59,65 @@ static void bdi_debug_init(void)
> >>  	bdi_debug_root = debugfs_create_dir("bdi", NULL);
> >>  }
> >>  
> > ...
> >> +#ifdef CONFIG_CGROUP_WRITEBACK
> >> +static void bdi_collect_stats(struct backing_dev_info *bdi,
> >> +			      struct wb_stats *stats)
> >> +{
> >> +	struct bdi_writeback *wb;
> >> +
> >> +	/* protect wb from release */
> >> +	mutex_lock(&bdi->cgwb_release_mutex);
> >> +	list_for_each_entry(wb, &bdi->wb_list, bdi_node)
> >> +		collect_wb_stats(stats, wb);
> >> +	mutex_unlock(&bdi->cgwb_release_mutex);
> >> +}
> >> +#else
> >> +static void bdi_collect_stats(struct backing_dev_info *bdi,
> >> +			      struct wb_stats *stats)
> >> +{
> >> +	collect_wb_stats(stats, &bdi->wb);
> >> +}
> >> +#endif
> >> +
> > 
> > I'm not familiar enough with the cgwb code to say for sure (and I'd
> > probably wait for more high level feedback before worrying too much
> > about this), but do we need the ifdef here just to iterate ->wb_list?
> > From looking at the code, it appears bdi->wb ends up on the list while
> > the bdi is registered for both cases, so that distinction seems
> > unnecessary. WRT to wb release protection, I wonder if this could use a
> Currently, we have ifdef trying to remove unnecessary cost when
> CONFIG_CGROUP_WRITEBACK is not enabled, see defination of cgwb_bdi_register
> and cgwb_remove_from_bdi_list for example. So I try to define bdi_collect_stats
> in similar way.
> > combination of rcu_read_lock()/list_for_each_safe() and wb_tryget() on
> > each wb before collecting its stats..? See how bdi_split_work_to_wbs()
> > works, for example.
> The combination of rcu_read_lock()/list_for_each_safe() and wb_tryget()
> should work fine.
> With ifdef, bdi_collect_stats takes no extra cost when CONFIG_CGROUP_WRITEBACK
> is not enabled and is consistent with existing code style, so I still prefer
> this way. Yes, The extra cost is not a big deal as it only exists in debug mode,
> so it's acceptable to use the suggested combination in next version if you are
> still strongly aganst this.
> 

Ok. I also previously missed that there are two implementations of
bdi_split_work_to_wbs() based on CGROUP_WRITEBACK. It seems reasonable
enough to me to follow that precedent for the !CGROUP_WRITEBACK case.

It still seems to make more sense to me to walk the list similar to how
bdi_split_work_to_wbs() does for the CGROUP_WRITEBACK enabled case. Do
you agree?

Brian

> > 
> > Also I see a patch conflict/compile error on patch 2 due to
> > __wb_calc_thresh() only taking one parameter in my tree. What's the
> > baseline commit for this series?
> > 
> Sorry for missing this, this seris is based on another patchset [1] which is still
> under review.
> Look forward to your reply!
> 
> Thansk
> Kemeng
> 
> [1] https://lore.kernel.org/lkml/20240123183332.876854-1-shikemeng@huaweicloud.com/T/#mc6455784a63d0f8aa1a2f5aff325abcdf9336b76
> 
> > Brian
> > 
> >> +static int bdi_debug_stats_show(struct seq_file *m, void *v)
> >> +{
> >> +	struct backing_dev_info *bdi = m->private;
> >> +	unsigned long background_thresh;
> >> +	unsigned long dirty_thresh;
> >> +	struct wb_stats stats;
> >> +	unsigned long tot_bw;
> >> +
> >>  	global_dirty_limits(&background_thresh, &dirty_thresh);
> >> -	wb_thresh = wb_calc_thresh(wb, dirty_thresh);
> >> +
> >> +	memset(&stats, 0, sizeof(stats));
> >> +	stats.dirty_thresh = dirty_thresh;
> >> +	bdi_collect_stats(bdi, &stats);
> >> +
> >> +	tot_bw = atomic_long_read(&bdi->tot_write_bandwidth);
> >>  
> >>  	seq_printf(m,
> >>  		   "BdiWriteback:       %10lu kB\n"
> >> @@ -87,18 +134,18 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
> >>  		   "b_dirty_time:       %10lu\n"
> >>  		   "bdi_list:           %10u\n"
> >>  		   "state:              %10lx\n",
> >> -		   (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
> >> -		   (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
> >> -		   K(wb_thresh),
> >> +		   K(stats.nr_writeback),
> >> +		   K(stats.nr_reclaimable),
> >> +		   K(stats.wb_thresh),
> >>  		   K(dirty_thresh),
> >>  		   K(background_thresh),
> >> -		   (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
> >> -		   (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
> >> -		   (unsigned long) K(wb->write_bandwidth),
> >> -		   nr_dirty,
> >> -		   nr_io,
> >> -		   nr_more_io,
> >> -		   nr_dirty_time,
> >> +		   K(stats.nr_dirtied),
> >> +		   K(stats.nr_written),
> >> +		   K(tot_bw),
> >> +		   stats.nr_dirty,
> >> +		   stats.nr_io,
> >> +		   stats.nr_more_io,
> >> +		   stats.nr_dirty_time,
> >>  		   !list_empty(&bdi->bdi_list), bdi->wb.state);
> >>  
> >>  	return 0;
> >> -- 
> >> 2.30.0
> >>
> >>
> > 
> > 
>
Jan Kara March 21, 2024, 6:06 p.m. UTC | #4
On Wed 20-03-24 19:02:17, Kemeng Shi wrote:
> /sys/kernel/debug/bdi/xxx/stats is supposed to show writeback information
> of whole bdi, but only writeback information of bdi in root cgroup is
> collected. So writeback information in non-root cgroup are missing now.
> To be more specific, considering following case:
> 
> /* create writeback cgroup */
> cd /sys/fs/cgroup
> echo "+memory +io" > cgroup.subtree_control
> mkdir group1
> cd group1
> echo $$ > cgroup.procs
> /* do writeback in cgroup */
> fio -name test -filename=/dev/vdb ...
> /* get writeback info of bdi */
> cat /sys/kernel/debug/bdi/xxx/stats
> The cat result unexpectedly implies that there is no writeback on target
> bdi.
> 
> Fix this by collecting stats of all wb in bdi instead of only wb in
> root cgroup.
> 
> Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>

Looks mostly good, one comment below:

> ---
>  mm/backing-dev.c | 93 ++++++++++++++++++++++++++++++++++++------------
>  1 file changed, 70 insertions(+), 23 deletions(-)
> 
> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> index 5f2be8c8df11..788702b6c5dd 100644
> --- a/mm/backing-dev.c
> +++ b/mm/backing-dev.c
> @@ -39,6 +39,19 @@ struct workqueue_struct *bdi_wq;
>  #include <linux/debugfs.h>
>  #include <linux/seq_file.h>
>  
> +struct wb_stats {
> +	unsigned long nr_dirty;
> +	unsigned long nr_io;
> +	unsigned long nr_more_io;
> +	unsigned long nr_dirty_time;
> +	unsigned long nr_writeback;
> +	unsigned long nr_reclaimable;
> +	unsigned long nr_dirtied;
> +	unsigned long nr_written;
> +	unsigned long dirty_thresh;
> +	unsigned long wb_thresh;
> +};
> +
>  static struct dentry *bdi_debug_root;
>  
>  static void bdi_debug_init(void)
> @@ -46,31 +59,65 @@ static void bdi_debug_init(void)
>  	bdi_debug_root = debugfs_create_dir("bdi", NULL);
>  }
>  
> -static int bdi_debug_stats_show(struct seq_file *m, void *v)
> +static void collect_wb_stats(struct wb_stats *stats,
> +			     struct bdi_writeback *wb)
>  {
> -	struct backing_dev_info *bdi = m->private;
> -	struct bdi_writeback *wb = &bdi->wb;
> -	unsigned long background_thresh;
> -	unsigned long dirty_thresh;
> -	unsigned long wb_thresh;
> -	unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
>  	struct inode *inode;
>  
> -	nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
>  	spin_lock(&wb->list_lock);
>  	list_for_each_entry(inode, &wb->b_dirty, i_io_list)
> -		nr_dirty++;
> +		stats->nr_dirty++;
>  	list_for_each_entry(inode, &wb->b_io, i_io_list)
> -		nr_io++;
> +		stats->nr_io++;
>  	list_for_each_entry(inode, &wb->b_more_io, i_io_list)
> -		nr_more_io++;
> +		stats->nr_more_io++;
>  	list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
>  		if (inode->i_state & I_DIRTY_TIME)
> -			nr_dirty_time++;
> +			stats->nr_dirty_time++;
>  	spin_unlock(&wb->list_lock);
>  
> +	stats->nr_writeback += wb_stat(wb, WB_WRITEBACK);
> +	stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE);
> +	stats->nr_dirtied += wb_stat(wb, WB_DIRTIED);
> +	stats->nr_written += wb_stat(wb, WB_WRITTEN);
> +	stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh);
> +}
> +
> +#ifdef CONFIG_CGROUP_WRITEBACK
> +static void bdi_collect_stats(struct backing_dev_info *bdi,
> +			      struct wb_stats *stats)
> +{
> +	struct bdi_writeback *wb;
> +
> +	/* protect wb from release */
> +	mutex_lock(&bdi->cgwb_release_mutex);
> +	list_for_each_entry(wb, &bdi->wb_list, bdi_node)
> +		collect_wb_stats(stats, wb);
> +	mutex_unlock(&bdi->cgwb_release_mutex);
> +}

So AFAICT this function can race against
  bdi_unregister() -> wb_shutdown(&bdi->wb)

because that doesn't take the cgwb_release_mutex. So we either need the RCU
protection as Brian suggested or cgwb_lock or something. But given
collect_wb_stats() can take a significant amount of time (traversing all
the lists etc.) I think we'll need something more clever.

								Honza
Kemeng Shi March 22, 2024, 7:32 a.m. UTC | #5
on 3/21/2024 8:10 PM, Brian Foster wrote:
> On Thu, Mar 21, 2024 at 11:44:40AM +0800, Kemeng Shi wrote:
>>
>>
>> on 3/20/2024 9:21 PM, Brian Foster wrote:
>>> On Wed, Mar 20, 2024 at 07:02:17PM +0800, Kemeng Shi wrote:
>>>> /sys/kernel/debug/bdi/xxx/stats is supposed to show writeback information
>>>> of whole bdi, but only writeback information of bdi in root cgroup is
>>>> collected. So writeback information in non-root cgroup are missing now.
>>>> To be more specific, considering following case:
>>>>
>>>> /* create writeback cgroup */
>>>> cd /sys/fs/cgroup
>>>> echo "+memory +io" > cgroup.subtree_control
>>>> mkdir group1
>>>> cd group1
>>>> echo $$ > cgroup.procs
>>>> /* do writeback in cgroup */
>>>> fio -name test -filename=/dev/vdb ...
>>>> /* get writeback info of bdi */
>>>> cat /sys/kernel/debug/bdi/xxx/stats
>>>> The cat result unexpectedly implies that there is no writeback on target
>>>> bdi.
>>>>
>>>> Fix this by collecting stats of all wb in bdi instead of only wb in
>>>> root cgroup.
>>>>
>>>> Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
>>>> ---
>>>>  mm/backing-dev.c | 93 ++++++++++++++++++++++++++++++++++++------------
>>>>  1 file changed, 70 insertions(+), 23 deletions(-)
>>>>
>>>> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
>>>> index 5f2be8c8df11..788702b6c5dd 100644
>>>> --- a/mm/backing-dev.c
>>>> +++ b/mm/backing-dev.c
>>> ...
>>>> @@ -46,31 +59,65 @@ static void bdi_debug_init(void)
>>>>  	bdi_debug_root = debugfs_create_dir("bdi", NULL);
>>>>  }
>>>>  
>>> ...
>>>> +#ifdef CONFIG_CGROUP_WRITEBACK
>>>> +static void bdi_collect_stats(struct backing_dev_info *bdi,
>>>> +			      struct wb_stats *stats)
>>>> +{
>>>> +	struct bdi_writeback *wb;
>>>> +
>>>> +	/* protect wb from release */
>>>> +	mutex_lock(&bdi->cgwb_release_mutex);
>>>> +	list_for_each_entry(wb, &bdi->wb_list, bdi_node)
>>>> +		collect_wb_stats(stats, wb);
>>>> +	mutex_unlock(&bdi->cgwb_release_mutex);
>>>> +}
>>>> +#else
>>>> +static void bdi_collect_stats(struct backing_dev_info *bdi,
>>>> +			      struct wb_stats *stats)
>>>> +{
>>>> +	collect_wb_stats(stats, &bdi->wb);
>>>> +}
>>>> +#endif
>>>> +
>>>
>>> I'm not familiar enough with the cgwb code to say for sure (and I'd
>>> probably wait for more high level feedback before worrying too much
>>> about this), but do we need the ifdef here just to iterate ->wb_list?
>>> From looking at the code, it appears bdi->wb ends up on the list while
>>> the bdi is registered for both cases, so that distinction seems
>>> unnecessary. WRT to wb release protection, I wonder if this could use a
>> Currently, we have ifdef trying to remove unnecessary cost when
>> CONFIG_CGROUP_WRITEBACK is not enabled, see defination of cgwb_bdi_register
>> and cgwb_remove_from_bdi_list for example. So I try to define bdi_collect_stats
>> in similar way.
>>> combination of rcu_read_lock()/list_for_each_safe() and wb_tryget() on
>>> each wb before collecting its stats..? See how bdi_split_work_to_wbs()
>>> works, for example.
>> The combination of rcu_read_lock()/list_for_each_safe() and wb_tryget()
>> should work fine.
>> With ifdef, bdi_collect_stats takes no extra cost when CONFIG_CGROUP_WRITEBACK
>> is not enabled and is consistent with existing code style, so I still prefer
>> this way. Yes, The extra cost is not a big deal as it only exists in debug mode,
>> so it's acceptable to use the suggested combination in next version if you are
>> still strongly aganst this.
>>
> 
> Ok. I also previously missed that there are two implementations of
> bdi_split_work_to_wbs() based on CGROUP_WRITEBACK. It seems reasonable
> enough to me to follow that precedent for the !CGROUP_WRITEBACK case.
> 
> It still seems to make more sense to me to walk the list similar to how
> bdi_split_work_to_wbs() does for the CGROUP_WRITEBACK enabled case. Do
> you agree?
Sure, I agree with this and will do it in the next version. Thanks!

Kemeng
> 
> Brian
> 
>>>
>>> Also I see a patch conflict/compile error on patch 2 due to
>>> __wb_calc_thresh() only taking one parameter in my tree. What's the
>>> baseline commit for this series?
>>>
>> Sorry for missing this, this seris is based on another patchset [1] which is still
>> under review.
>> Look forward to your reply!
>>
>> Thansk
>> Kemeng
>>
>> [1] https://lore.kernel.org/lkml/20240123183332.876854-1-shikemeng@huaweicloud.com/T/#mc6455784a63d0f8aa1a2f5aff325abcdf9336b76
>>
>>> Brian
>>>
>>>> +static int bdi_debug_stats_show(struct seq_file *m, void *v)
>>>> +{
>>>> +	struct backing_dev_info *bdi = m->private;
>>>> +	unsigned long background_thresh;
>>>> +	unsigned long dirty_thresh;
>>>> +	struct wb_stats stats;
>>>> +	unsigned long tot_bw;
>>>> +
>>>>  	global_dirty_limits(&background_thresh, &dirty_thresh);
>>>> -	wb_thresh = wb_calc_thresh(wb, dirty_thresh);
>>>> +
>>>> +	memset(&stats, 0, sizeof(stats));
>>>> +	stats.dirty_thresh = dirty_thresh;
>>>> +	bdi_collect_stats(bdi, &stats);
>>>> +
>>>> +	tot_bw = atomic_long_read(&bdi->tot_write_bandwidth);
>>>>  
>>>>  	seq_printf(m,
>>>>  		   "BdiWriteback:       %10lu kB\n"
>>>> @@ -87,18 +134,18 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
>>>>  		   "b_dirty_time:       %10lu\n"
>>>>  		   "bdi_list:           %10u\n"
>>>>  		   "state:              %10lx\n",
>>>> -		   (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
>>>> -		   (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
>>>> -		   K(wb_thresh),
>>>> +		   K(stats.nr_writeback),
>>>> +		   K(stats.nr_reclaimable),
>>>> +		   K(stats.wb_thresh),
>>>>  		   K(dirty_thresh),
>>>>  		   K(background_thresh),
>>>> -		   (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
>>>> -		   (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
>>>> -		   (unsigned long) K(wb->write_bandwidth),
>>>> -		   nr_dirty,
>>>> -		   nr_io,
>>>> -		   nr_more_io,
>>>> -		   nr_dirty_time,
>>>> +		   K(stats.nr_dirtied),
>>>> +		   K(stats.nr_written),
>>>> +		   K(tot_bw),
>>>> +		   stats.nr_dirty,
>>>> +		   stats.nr_io,
>>>> +		   stats.nr_more_io,
>>>> +		   stats.nr_dirty_time,
>>>>  		   !list_empty(&bdi->bdi_list), bdi->wb.state);
>>>>  
>>>>  	return 0;
>>>> -- 
>>>> 2.30.0
>>>>
>>>>
>>>
>>>
>>
> 
>
Kemeng Shi March 22, 2024, 7:51 a.m. UTC | #6
on 3/22/2024 2:06 AM, Jan Kara wrote:
> On Wed 20-03-24 19:02:17, Kemeng Shi wrote:
>> /sys/kernel/debug/bdi/xxx/stats is supposed to show writeback information
>> of whole bdi, but only writeback information of bdi in root cgroup is
>> collected. So writeback information in non-root cgroup are missing now.
>> To be more specific, considering following case:
>>
>> /* create writeback cgroup */
>> cd /sys/fs/cgroup
>> echo "+memory +io" > cgroup.subtree_control
>> mkdir group1
>> cd group1
>> echo $$ > cgroup.procs
>> /* do writeback in cgroup */
>> fio -name test -filename=/dev/vdb ...
>> /* get writeback info of bdi */
>> cat /sys/kernel/debug/bdi/xxx/stats
>> The cat result unexpectedly implies that there is no writeback on target
>> bdi.
>>
>> Fix this by collecting stats of all wb in bdi instead of only wb in
>> root cgroup.
>>
>> Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
> 
> Looks mostly good, one comment below:
> 
>> ---
>>  mm/backing-dev.c | 93 ++++++++++++++++++++++++++++++++++++------------
>>  1 file changed, 70 insertions(+), 23 deletions(-)
>>
>> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
>> index 5f2be8c8df11..788702b6c5dd 100644
>> --- a/mm/backing-dev.c
>> +++ b/mm/backing-dev.c
>> @@ -39,6 +39,19 @@ struct workqueue_struct *bdi_wq;
>>  #include <linux/debugfs.h>
>>  #include <linux/seq_file.h>
>>  
>> +struct wb_stats {
>> +	unsigned long nr_dirty;
>> +	unsigned long nr_io;
>> +	unsigned long nr_more_io;
>> +	unsigned long nr_dirty_time;
>> +	unsigned long nr_writeback;
>> +	unsigned long nr_reclaimable;
>> +	unsigned long nr_dirtied;
>> +	unsigned long nr_written;
>> +	unsigned long dirty_thresh;
>> +	unsigned long wb_thresh;
>> +};
>> +
>>  static struct dentry *bdi_debug_root;
>>  
>>  static void bdi_debug_init(void)
>> @@ -46,31 +59,65 @@ static void bdi_debug_init(void)
>>  	bdi_debug_root = debugfs_create_dir("bdi", NULL);
>>  }
>>  
>> -static int bdi_debug_stats_show(struct seq_file *m, void *v)
>> +static void collect_wb_stats(struct wb_stats *stats,
>> +			     struct bdi_writeback *wb)
>>  {
>> -	struct backing_dev_info *bdi = m->private;
>> -	struct bdi_writeback *wb = &bdi->wb;
>> -	unsigned long background_thresh;
>> -	unsigned long dirty_thresh;
>> -	unsigned long wb_thresh;
>> -	unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
>>  	struct inode *inode;
>>  
>> -	nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
>>  	spin_lock(&wb->list_lock);
>>  	list_for_each_entry(inode, &wb->b_dirty, i_io_list)
>> -		nr_dirty++;
>> +		stats->nr_dirty++;
>>  	list_for_each_entry(inode, &wb->b_io, i_io_list)
>> -		nr_io++;
>> +		stats->nr_io++;
>>  	list_for_each_entry(inode, &wb->b_more_io, i_io_list)
>> -		nr_more_io++;
>> +		stats->nr_more_io++;
>>  	list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
>>  		if (inode->i_state & I_DIRTY_TIME)
>> -			nr_dirty_time++;
>> +			stats->nr_dirty_time++;
>>  	spin_unlock(&wb->list_lock);
>>  
>> +	stats->nr_writeback += wb_stat(wb, WB_WRITEBACK);
>> +	stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE);
>> +	stats->nr_dirtied += wb_stat(wb, WB_DIRTIED);
>> +	stats->nr_written += wb_stat(wb, WB_WRITTEN);
>> +	stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh);
>> +}
>> +
>> +#ifdef CONFIG_CGROUP_WRITEBACK
>> +static void bdi_collect_stats(struct backing_dev_info *bdi,
>> +			      struct wb_stats *stats)
>> +{
>> +	struct bdi_writeback *wb;
>> +
>> +	/* protect wb from release */
>> +	mutex_lock(&bdi->cgwb_release_mutex);
>> +	list_for_each_entry(wb, &bdi->wb_list, bdi_node)
>> +		collect_wb_stats(stats, wb);
>> +	mutex_unlock(&bdi->cgwb_release_mutex);
>> +}
> 
> So AFAICT this function can race against
>   bdi_unregister() -> wb_shutdown(&bdi->wb)
> 
> because that doesn't take the cgwb_release_mutex. So we either need the RCU
> protection as Brian suggested or cgwb_lock or something. But given
> collect_wb_stats() can take a significant amount of time (traversing all
> the lists etc.) I think we'll need something more clever.
Sorry for missing this. I only paid attention to the wbs in cgroups, as there
is not much change to how we use bdi->wb.
It seems there has always been a race between the original
bdi_debug_stats_show and the release of the bdi, as follows:
cat /.../stats
bdi_debug_stats_show
			bdi_unregister
			bdi_put
			  release_bdi
			    kfree(bdi)
  use after free

I will fix this in the next version. Thanks!

> 
> 								Honza
>
Brian Foster March 22, 2024, 11:58 a.m. UTC | #7
On Fri, Mar 22, 2024 at 03:51:27PM +0800, Kemeng Shi wrote:
> 
> 
> on 3/22/2024 2:06 AM, Jan Kara wrote:
> > On Wed 20-03-24 19:02:17, Kemeng Shi wrote:
> >> /sys/kernel/debug/bdi/xxx/stats is supposed to show writeback information
> >> of whole bdi, but only writeback information of bdi in root cgroup is
> >> collected. So writeback information in non-root cgroup are missing now.
> >> To be more specific, considering following case:
> >>
> >> /* create writeback cgroup */
> >> cd /sys/fs/cgroup
> >> echo "+memory +io" > cgroup.subtree_control
> >> mkdir group1
> >> cd group1
> >> echo $$ > cgroup.procs
> >> /* do writeback in cgroup */
> >> fio -name test -filename=/dev/vdb ...
> >> /* get writeback info of bdi */
> >> cat /sys/kernel/debug/bdi/xxx/stats
> >> The cat result unexpectedly implies that there is no writeback on target
> >> bdi.
> >>
> >> Fix this by collecting stats of all wb in bdi instead of only wb in
> >> root cgroup.
> >>
> >> Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
> > 
> > Looks mostly good, one comment below:
> > 
> >> ---
> >>  mm/backing-dev.c | 93 ++++++++++++++++++++++++++++++++++++------------
> >>  1 file changed, 70 insertions(+), 23 deletions(-)
> >>
> >> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> >> index 5f2be8c8df11..788702b6c5dd 100644
> >> --- a/mm/backing-dev.c
> >> +++ b/mm/backing-dev.c
> >> @@ -39,6 +39,19 @@ struct workqueue_struct *bdi_wq;
> >>  #include <linux/debugfs.h>
> >>  #include <linux/seq_file.h>
> >>  
> >> +struct wb_stats {
> >> +	unsigned long nr_dirty;
> >> +	unsigned long nr_io;
> >> +	unsigned long nr_more_io;
> >> +	unsigned long nr_dirty_time;
> >> +	unsigned long nr_writeback;
> >> +	unsigned long nr_reclaimable;
> >> +	unsigned long nr_dirtied;
> >> +	unsigned long nr_written;
> >> +	unsigned long dirty_thresh;
> >> +	unsigned long wb_thresh;
> >> +};
> >> +
> >>  static struct dentry *bdi_debug_root;
> >>  
> >>  static void bdi_debug_init(void)
> >> @@ -46,31 +59,65 @@ static void bdi_debug_init(void)
> >>  	bdi_debug_root = debugfs_create_dir("bdi", NULL);
> >>  }
> >>  
> >> -static int bdi_debug_stats_show(struct seq_file *m, void *v)
> >> +static void collect_wb_stats(struct wb_stats *stats,
> >> +			     struct bdi_writeback *wb)
> >>  {
> >> -	struct backing_dev_info *bdi = m->private;
> >> -	struct bdi_writeback *wb = &bdi->wb;
> >> -	unsigned long background_thresh;
> >> -	unsigned long dirty_thresh;
> >> -	unsigned long wb_thresh;
> >> -	unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
> >>  	struct inode *inode;
> >>  
> >> -	nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
> >>  	spin_lock(&wb->list_lock);
> >>  	list_for_each_entry(inode, &wb->b_dirty, i_io_list)
> >> -		nr_dirty++;
> >> +		stats->nr_dirty++;
> >>  	list_for_each_entry(inode, &wb->b_io, i_io_list)
> >> -		nr_io++;
> >> +		stats->nr_io++;
> >>  	list_for_each_entry(inode, &wb->b_more_io, i_io_list)
> >> -		nr_more_io++;
> >> +		stats->nr_more_io++;
> >>  	list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
> >>  		if (inode->i_state & I_DIRTY_TIME)
> >> -			nr_dirty_time++;
> >> +			stats->nr_dirty_time++;
> >>  	spin_unlock(&wb->list_lock);
> >>  
> >> +	stats->nr_writeback += wb_stat(wb, WB_WRITEBACK);
> >> +	stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE);
> >> +	stats->nr_dirtied += wb_stat(wb, WB_DIRTIED);
> >> +	stats->nr_written += wb_stat(wb, WB_WRITTEN);
> >> +	stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh);
> >> +}
> >> +
> >> +#ifdef CONFIG_CGROUP_WRITEBACK
> >> +static void bdi_collect_stats(struct backing_dev_info *bdi,
> >> +			      struct wb_stats *stats)
> >> +{
> >> +	struct bdi_writeback *wb;
> >> +
> >> +	/* protect wb from release */
> >> +	mutex_lock(&bdi->cgwb_release_mutex);
> >> +	list_for_each_entry(wb, &bdi->wb_list, bdi_node)
> >> +		collect_wb_stats(stats, wb);
> >> +	mutex_unlock(&bdi->cgwb_release_mutex);
> >> +}
> > 
> > So AFAICT this function can race against
> >   bdi_unregister() -> wb_shutdown(&bdi->wb)
> > 
> > because that doesn't take the cgwb_release_mutex. So we either need the RCU
> > protection as Brian suggested or cgwb_lock or something. But given
> > collect_wb_stats() can take a significant amount of time (traversing all
> > the lists etc.) I think we'll need something more clever.
> Sorry for missing this. I only pay attention to wb in cgroup as there is no
> much change to how we use bdi->wb.
> It seems that there was always a race between orginal bdi_debug_stats_show and
> release of bdi as following
> cat /.../stats
> bdi_debug_stats_show
> 			bdi_unregister
> 			bdi_put
> 			  release_bdi
> 			    kfree(bdi)
>   use after free
> 
> I will fix this in next version. Thanks!
> 

BTW, I thought this was kind of the point of the tryget in the writeback
path. I.e., the higher level loop runs under rcu_read_lock(), but in the
case it needs to cycle the rcu lock it acquires a reference to the wb,
and then can use that wb to continue the loop once the rcu lock is
reacquired. IIUC, this works because the rcu list removal keeps the list
->next pointer sane and then the ref keeps the wb memory from being
freed. A tryget of any wb's that have been shutdown fails because the
percpu ref counter has been killed.

So I _think_ this means that for the stat collection use case, you could
protect the overall walk with rcu as Jan alludes above, but then maybe
use a combination of need_resched()/cond_resched_rcu() and wb_tryget()
to introduce resched points and avoid holding lock(s) for too long.

Personally, I wonder if since this is mainly debug code we could just
get away with doing the simple thing of trying for a ref on each wb
unconditionally rather than only in a need_resched() case, but maybe
there are reasons not to do that... hm?
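
To illustrate the unconditional variant (again an untested sketch, reusing
the collect_wb_stats() helper from this patch):

static void bdi_collect_stats(struct backing_dev_info *bdi,
			      struct wb_stats *stats)
{
	struct bdi_writeback *wb;

	rcu_read_lock();
	list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
		/* take a ref on each wb; fails once its refcount is killed */
		if (!wb_tryget(wb))
			continue;

		/*
		 * The ref pins wb, so drop the rcu read lock around the
		 * (potentially slow) stat collection and resume the list
		 * walk from wb afterwards, similar to how
		 * bdi_split_work_to_wbs() handles its fallback work.
		 */
		rcu_read_unlock();
		collect_wb_stats(stats, wb);
		cond_resched();
		rcu_read_lock();

		wb_put(wb);
	}
	rcu_read_unlock();
}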

Brian

> > 
> > 								Honza
> > 
>
Kemeng Shi March 26, 2024, 1:16 p.m. UTC | #8
on 3/22/2024 7:58 PM, Brian Foster wrote:
> On Fri, Mar 22, 2024 at 03:51:27PM +0800, Kemeng Shi wrote:
>>
>>
>> on 3/22/2024 2:06 AM, Jan Kara wrote:
>>> On Wed 20-03-24 19:02:17, Kemeng Shi wrote:
>>>> /sys/kernel/debug/bdi/xxx/stats is supposed to show writeback information
>>>> of whole bdi, but only writeback information of bdi in root cgroup is
>>>> collected. So writeback information in non-root cgroup are missing now.
>>>> To be more specific, considering following case:
>>>>
>>>> /* create writeback cgroup */
>>>> cd /sys/fs/cgroup
>>>> echo "+memory +io" > cgroup.subtree_control
>>>> mkdir group1
>>>> cd group1
>>>> echo $$ > cgroup.procs
>>>> /* do writeback in cgroup */
>>>> fio -name test -filename=/dev/vdb ...
>>>> /* get writeback info of bdi */
>>>> cat /sys/kernel/debug/bdi/xxx/stats
>>>> The cat result unexpectedly implies that there is no writeback on target
>>>> bdi.
>>>>
>>>> Fix this by collecting stats of all wb in bdi instead of only wb in
>>>> root cgroup.
>>>>
>>>> Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
>>>
>>> Looks mostly good, one comment below:
>>>
>>>> ---
>>>>  mm/backing-dev.c | 93 ++++++++++++++++++++++++++++++++++++------------
>>>>  1 file changed, 70 insertions(+), 23 deletions(-)
>>>>
>>>> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
>>>> index 5f2be8c8df11..788702b6c5dd 100644
>>>> --- a/mm/backing-dev.c
>>>> +++ b/mm/backing-dev.c
>>>> @@ -39,6 +39,19 @@ struct workqueue_struct *bdi_wq;
>>>>  #include <linux/debugfs.h>
>>>>  #include <linux/seq_file.h>
>>>>  
>>>> +struct wb_stats {
>>>> +	unsigned long nr_dirty;
>>>> +	unsigned long nr_io;
>>>> +	unsigned long nr_more_io;
>>>> +	unsigned long nr_dirty_time;
>>>> +	unsigned long nr_writeback;
>>>> +	unsigned long nr_reclaimable;
>>>> +	unsigned long nr_dirtied;
>>>> +	unsigned long nr_written;
>>>> +	unsigned long dirty_thresh;
>>>> +	unsigned long wb_thresh;
>>>> +};
>>>> +
>>>>  static struct dentry *bdi_debug_root;
>>>>  
>>>>  static void bdi_debug_init(void)
>>>> @@ -46,31 +59,65 @@ static void bdi_debug_init(void)
>>>>  	bdi_debug_root = debugfs_create_dir("bdi", NULL);
>>>>  }
>>>>  
>>>> -static int bdi_debug_stats_show(struct seq_file *m, void *v)
>>>> +static void collect_wb_stats(struct wb_stats *stats,
>>>> +			     struct bdi_writeback *wb)
>>>>  {
>>>> -	struct backing_dev_info *bdi = m->private;
>>>> -	struct bdi_writeback *wb = &bdi->wb;
>>>> -	unsigned long background_thresh;
>>>> -	unsigned long dirty_thresh;
>>>> -	unsigned long wb_thresh;
>>>> -	unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
>>>>  	struct inode *inode;
>>>>  
>>>> -	nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
>>>>  	spin_lock(&wb->list_lock);
>>>>  	list_for_each_entry(inode, &wb->b_dirty, i_io_list)
>>>> -		nr_dirty++;
>>>> +		stats->nr_dirty++;
>>>>  	list_for_each_entry(inode, &wb->b_io, i_io_list)
>>>> -		nr_io++;
>>>> +		stats->nr_io++;
>>>>  	list_for_each_entry(inode, &wb->b_more_io, i_io_list)
>>>> -		nr_more_io++;
>>>> +		stats->nr_more_io++;
>>>>  	list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
>>>>  		if (inode->i_state & I_DIRTY_TIME)
>>>> -			nr_dirty_time++;
>>>> +			stats->nr_dirty_time++;
>>>>  	spin_unlock(&wb->list_lock);
>>>>  
>>>> +	stats->nr_writeback += wb_stat(wb, WB_WRITEBACK);
>>>> +	stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE);
>>>> +	stats->nr_dirtied += wb_stat(wb, WB_DIRTIED);
>>>> +	stats->nr_written += wb_stat(wb, WB_WRITTEN);
>>>> +	stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh);
>>>> +}
>>>> +
>>>> +#ifdef CONFIG_CGROUP_WRITEBACK
>>>> +static void bdi_collect_stats(struct backing_dev_info *bdi,
>>>> +			      struct wb_stats *stats)
>>>> +{
>>>> +	struct bdi_writeback *wb;
>>>> +
>>>> +	/* protect wb from release */
>>>> +	mutex_lock(&bdi->cgwb_release_mutex);
>>>> +	list_for_each_entry(wb, &bdi->wb_list, bdi_node)
>>>> +		collect_wb_stats(stats, wb);
>>>> +	mutex_unlock(&bdi->cgwb_release_mutex);
>>>> +}
>>>
>>> So AFAICT this function can race against
>>>   bdi_unregister() -> wb_shutdown(&bdi->wb)
>>>
>>> because that doesn't take the cgwb_release_mutex. So we either need the RCU
>>> protection as Brian suggested or cgwb_lock or something. But given
>>> collect_wb_stats() can take a significant amount of time (traversing all
>>> the lists etc.) I think we'll need something more clever.
>> Sorry for missing this. I only pay attention to wb in cgroup as there is no
>> much change to how we use bdi->wb.
>> It seems that there was always a race between orginal bdi_debug_stats_show and
>> release of bdi as following
>> cat /.../stats
>> bdi_debug_stats_show
>> 			bdi_unregister
>> 			bdi_put
>> 			  release_bdi
>> 			    kfree(bdi)
>>   use after free
>>
>> I will fix this in next version. Thanks!
>>
> 
Hi Brian
> BTW, I thought this was kind of the point of the tryget in the writeback
> path. I.e., the higher level loop runs under rcu_read_lock(), but in the
> case it needs to cycle the rcu lock it acquires a reference to the wb,
> and then can use that wb to continue the loop once the rcu lock is
> reacquired. IIUC, this works because the rcu list removal keeps the list
> ->next pointer sane and then the ref keeps the wb memory from being
> freed. A tryget of any wb's that have been shutdown fails because the
> percpu ref counter has been killed.
For bdi->wb, tryget seems not helpful to protect against the race, as
wb_tryget does nothing for bdi->wb. For wb in cgroup, this works fine.
> 
> So I _think_ this means that for the stat collection use case, you could
> protect the overall walk with rcu as Jan alludes above, but then maybe
> use a combination of need_resched()/cond_resched_rcu() and wb_tryget()
> to introduce resched points and avoid holding lock(s) for too long.
Sure, I will protect against the race with RCU in the next version!
> 
> Personally, I wonder if since this is mainly debug code we could just
> get away with doing the simple thing of trying for a ref on each wb
> unconditionally rather than only in a need_resched() case, but maybe
> there are reasons not to do that... hm?
Agreed, I also prefer simple debug code with no need_resched. Will do it that
way unless someone is against it.

Thanks a lot for the helpful information!
Kemeng
> 
> Brian
> 
>>>
>>> 								Honza
>>>
>>
>

Patch

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 5f2be8c8df11..788702b6c5dd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -39,6 +39,19 @@  struct workqueue_struct *bdi_wq;
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
+struct wb_stats {
+	unsigned long nr_dirty;
+	unsigned long nr_io;
+	unsigned long nr_more_io;
+	unsigned long nr_dirty_time;
+	unsigned long nr_writeback;
+	unsigned long nr_reclaimable;
+	unsigned long nr_dirtied;
+	unsigned long nr_written;
+	unsigned long dirty_thresh;
+	unsigned long wb_thresh;
+};
+
 static struct dentry *bdi_debug_root;
 
 static void bdi_debug_init(void)
@@ -46,31 +59,65 @@  static void bdi_debug_init(void)
 	bdi_debug_root = debugfs_create_dir("bdi", NULL);
 }
 
-static int bdi_debug_stats_show(struct seq_file *m, void *v)
+static void collect_wb_stats(struct wb_stats *stats,
+			     struct bdi_writeback *wb)
 {
-	struct backing_dev_info *bdi = m->private;
-	struct bdi_writeback *wb = &bdi->wb;
-	unsigned long background_thresh;
-	unsigned long dirty_thresh;
-	unsigned long wb_thresh;
-	unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
 	struct inode *inode;
 
-	nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
 	spin_lock(&wb->list_lock);
 	list_for_each_entry(inode, &wb->b_dirty, i_io_list)
-		nr_dirty++;
+		stats->nr_dirty++;
 	list_for_each_entry(inode, &wb->b_io, i_io_list)
-		nr_io++;
+		stats->nr_io++;
 	list_for_each_entry(inode, &wb->b_more_io, i_io_list)
-		nr_more_io++;
+		stats->nr_more_io++;
 	list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
 		if (inode->i_state & I_DIRTY_TIME)
-			nr_dirty_time++;
+			stats->nr_dirty_time++;
 	spin_unlock(&wb->list_lock);
 
+	stats->nr_writeback += wb_stat(wb, WB_WRITEBACK);
+	stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE);
+	stats->nr_dirtied += wb_stat(wb, WB_DIRTIED);
+	stats->nr_written += wb_stat(wb, WB_WRITTEN);
+	stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh);
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+static void bdi_collect_stats(struct backing_dev_info *bdi,
+			      struct wb_stats *stats)
+{
+	struct bdi_writeback *wb;
+
+	/* protect wb from release */
+	mutex_lock(&bdi->cgwb_release_mutex);
+	list_for_each_entry(wb, &bdi->wb_list, bdi_node)
+		collect_wb_stats(stats, wb);
+	mutex_unlock(&bdi->cgwb_release_mutex);
+}
+#else
+static void bdi_collect_stats(struct backing_dev_info *bdi,
+			      struct wb_stats *stats)
+{
+	collect_wb_stats(stats, &bdi->wb);
+}
+#endif
+
+static int bdi_debug_stats_show(struct seq_file *m, void *v)
+{
+	struct backing_dev_info *bdi = m->private;
+	unsigned long background_thresh;
+	unsigned long dirty_thresh;
+	struct wb_stats stats;
+	unsigned long tot_bw;
+
 	global_dirty_limits(&background_thresh, &dirty_thresh);
-	wb_thresh = wb_calc_thresh(wb, dirty_thresh);
+
+	memset(&stats, 0, sizeof(stats));
+	stats.dirty_thresh = dirty_thresh;
+	bdi_collect_stats(bdi, &stats);
+
+	tot_bw = atomic_long_read(&bdi->tot_write_bandwidth);
 
 	seq_printf(m,
 		   "BdiWriteback:       %10lu kB\n"
@@ -87,18 +134,18 @@  static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   "b_dirty_time:       %10lu\n"
 		   "bdi_list:           %10u\n"
 		   "state:              %10lx\n",
-		   (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
-		   (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
-		   K(wb_thresh),
+		   K(stats.nr_writeback),
+		   K(stats.nr_reclaimable),
+		   K(stats.wb_thresh),
 		   K(dirty_thresh),
 		   K(background_thresh),
-		   (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
-		   (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
-		   (unsigned long) K(wb->write_bandwidth),
-		   nr_dirty,
-		   nr_io,
-		   nr_more_io,
-		   nr_dirty_time,
+		   K(stats.nr_dirtied),
+		   K(stats.nr_written),
+		   K(tot_bw),
+		   stats.nr_dirty,
+		   stats.nr_io,
+		   stats.nr_more_io,
+		   stats.nr_dirty_time,
 		   !list_empty(&bdi->bdi_list), bdi->wb.state);
 
 	return 0;