diff mbox series

[v2,3/6] writeback: support retrieving per group debug writeback stats of bdi

Message ID 20240327155751.3536-4-shikemeng@huaweicloud.com (mailing list archive)
State New
Headers show
Series Improve visibility of writeback | expand

Commit Message

Kemeng Shi March 27, 2024, 3:57 p.m. UTC
Add /sys/kernel/debug/bdi/xxx/wb_stats to show per group writeback stats
of bdi.

Following domain hierarchy is tested:
                global domain (320G)
                /                 \
        cgroup domain1(10G)     cgroup domain2(10G)
                |                 |
bdi            wb1               wb2

/* per wb writeback info of bdi is collected */
cat /sys/kernel/debug/bdi/252:16/wb_stats
WbCgIno:                    1
WbWriteback:                0 kB
WbReclaimable:              0 kB
WbDirtyThresh:              0 kB
WbDirtied:                  0 kB
WbWritten:                  0 kB
WbWriteBandwidth:      102400 kBps
b_dirty:                    0
b_io:                       0
b_more_io:                  0
b_dirty_time:               0
state:                      1
WbCgIno:                 4094
WbWriteback:            54432 kB
WbReclaimable:         766080 kB
WbDirtyThresh:        3094760 kB
WbDirtied:            1656480 kB
WbWritten:             837088 kB
WbWriteBandwidth:      132772 kBps
b_dirty:                    1
b_io:                       1
b_more_io:                  0
b_dirty_time:               0
state:                      7
WbCgIno:                 4135
WbWriteback:            15232 kB
WbReclaimable:         786688 kB
WbDirtyThresh:        2909984 kB
WbDirtied:            1482656 kB
WbWritten:             681408 kB
WbWriteBandwidth:      124848 kBps
b_dirty:                    0
b_io:                       1
b_more_io:                  0
b_dirty_time:               0
state:                      7

Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
---
 include/linux/writeback.h |  1 +
 mm/backing-dev.c          | 88 +++++++++++++++++++++++++++++++++++++++
 mm/page-writeback.c       | 19 +++++++++
 3 files changed, 108 insertions(+)

Comments

Brian Foster March 29, 2024, 1:10 p.m. UTC | #1
On Wed, Mar 27, 2024 at 11:57:48PM +0800, Kemeng Shi wrote:
> Add /sys/kernel/debug/bdi/xxx/wb_stats to show per group writeback stats
> of bdi.
> 

Hi Kemeng,

Just a few random thoughts/comments..

> Following domain hierarchy is tested:
>                 global domain (320G)
>                 /                 \
>         cgroup domain1(10G)     cgroup domain2(10G)
>                 |                 |
> bdi            wb1               wb2
> 
> /* per wb writeback info of bdi is collected */
> cat /sys/kernel/debug/bdi/252:16/wb_stats
> WbCgIno:                    1
> WbWriteback:                0 kB
> WbReclaimable:              0 kB
> WbDirtyThresh:              0 kB
> WbDirtied:                  0 kB
> WbWritten:                  0 kB
> WbWriteBandwidth:      102400 kBps
> b_dirty:                    0
> b_io:                       0
> b_more_io:                  0
> b_dirty_time:               0
> state:                      1

Maybe some whitespace or something between entries would improve
readability?

> WbCgIno:                 4094
> WbWriteback:            54432 kB
> WbReclaimable:         766080 kB
> WbDirtyThresh:        3094760 kB
> WbDirtied:            1656480 kB
> WbWritten:             837088 kB
> WbWriteBandwidth:      132772 kBps
> b_dirty:                    1
> b_io:                       1
> b_more_io:                  0
> b_dirty_time:               0
> state:                      7
> WbCgIno:                 4135
> WbWriteback:            15232 kB
> WbReclaimable:         786688 kB
> WbDirtyThresh:        2909984 kB
> WbDirtied:            1482656 kB
> WbWritten:             681408 kB
> WbWriteBandwidth:      124848 kBps
> b_dirty:                    0
> b_io:                       1
> b_more_io:                  0
> b_dirty_time:               0
> state:                      7
> 
> Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
> ---
>  include/linux/writeback.h |  1 +
>  mm/backing-dev.c          | 88 +++++++++++++++++++++++++++++++++++++++
>  mm/page-writeback.c       | 19 +++++++++
>  3 files changed, 108 insertions(+)
> 
...
> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> index 8daf950e6855..e3953db7d88d 100644
> --- a/mm/backing-dev.c
> +++ b/mm/backing-dev.c
> @@ -103,6 +103,91 @@ static void collect_wb_stats(struct wb_stats *stats,
>  }
>  
>  #ifdef CONFIG_CGROUP_WRITEBACK
...
> +static int cgwb_debug_stats_show(struct seq_file *m, void *v)
> +{
> +	struct backing_dev_info *bdi;
> +	unsigned long background_thresh;
> +	unsigned long dirty_thresh;
> +	struct bdi_writeback *wb;
> +	struct wb_stats stats;
> +
> +	rcu_read_lock();
> +	bdi = lookup_bdi(m);
> +	if (!bdi) {
> +		rcu_read_unlock();
> +		return -EEXIST;
> +	}
> +
> +	global_dirty_limits(&background_thresh, &dirty_thresh);
> +
> +	list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
> +		memset(&stats, 0, sizeof(stats));
> +		stats.dirty_thresh = dirty_thresh;

If you did something like the following here, wouldn't that also zero
the rest of the structure?

		struct wb_stats stats = { .dirty_thresh = dirty_thresh };

> +		collect_wb_stats(&stats, wb);
> +

Also, similar question as before on whether you'd want to check
WB_registered or something here..

> +		if (mem_cgroup_wb_domain(wb) == NULL) {
> +			wb_stats_show(m, wb, &stats);
> +			continue;
> +		}

Can you explain what this logic is about? Is the cgwb_calc_thresh()
thing not needed in this case? A comment might help for those less
familiar with the implementation details.

BTW, I'm also wondering if something like the following is correct
and/or roughly equivalent:
	
	list_for_each_*(wb, ...) {
		struct wb_stats stats = ...;

		if (!wb_tryget(wb))
			continue;

		collect_wb_stats(&stats, wb);

		/*
		 * Extra wb_thresh magic. Drop rcu lock because ... . We
		 * can do so here because we have a ref.
		 */
		if (mem_cgroup_wb_domain(wb)) {
			rcu_read_unlock();
			stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
			rcu_read_lock();
		}

		wb_stats_show(m, wb, &stats)
		wb_put(wb);
	}

> +
> +		/*
> +		 * cgwb_release will destroy wb->memcg_completions which
> +		 * will be ued in cgwb_calc_thresh. Use wb_tryget to prevent
> +		 * memcg_completions destruction from cgwb_release.
> +		 */
> +		if (!wb_tryget(wb))
> +			continue;
> +
> +		rcu_read_unlock();
> +		/* cgwb_calc_thresh may sleep in cgroup_rstat_flush */
> +		stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
> +		wb_stats_show(m, wb, &stats);
> +		rcu_read_lock();
> +		wb_put(wb);
> +	}
> +	rcu_read_unlock();
> +
> +	return 0;
> +}
> +DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats);
> +
> +static void cgwb_debug_register(struct backing_dev_info *bdi)
> +{
> +	debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi,
> +			    &cgwb_debug_stats_fops);
> +}
> +
>  static void bdi_collect_stats(struct backing_dev_info *bdi,
>  			      struct wb_stats *stats)
>  {
> @@ -117,6 +202,8 @@ static void bdi_collect_stats(struct backing_dev_info *bdi,
>  {
>  	collect_wb_stats(stats, &bdi->wb);
>  }
> +
> +static inline void cgwb_debug_register(struct backing_dev_info *bdi) { }

Could we just create the wb_stats file regardless of whether cgwb is
enabled? Obviously theres only one wb in the !CGWB case and it's
somewhat duplicative with the bdi stats file, but that seems harmless if
the same code can be reused..? Maybe there's also a small argument for
dropping the state info from the bdi stats file and moving it to
wb_stats.

Brian

>  #endif
>  
>  static int bdi_debug_stats_show(struct seq_file *m, void *v)
> @@ -182,6 +269,7 @@ static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
>  
>  	debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
>  			    &bdi_debug_stats_fops);
> +	cgwb_debug_register(bdi);
>  }
>  
>  static void bdi_debug_unregister(struct backing_dev_info *bdi)
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 0e20467367fe..3724c7525316 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -893,6 +893,25 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
>  	return __wb_calc_thresh(&gdtc, thresh);
>  }
>  
> +unsigned long cgwb_calc_thresh(struct bdi_writeback *wb)
> +{
> +	struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
> +	struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };
> +	unsigned long filepages, headroom, writeback;
> +
> +	gdtc.avail = global_dirtyable_memory();
> +	gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
> +		     global_node_page_state(NR_WRITEBACK);
> +
> +	mem_cgroup_wb_stats(wb, &filepages, &headroom,
> +			    &mdtc.dirty, &writeback);
> +	mdtc.dirty += writeback;
> +	mdtc_calc_avail(&mdtc, filepages, headroom);
> +	domain_dirty_limits(&mdtc);
> +
> +	return __wb_calc_thresh(&mdtc, mdtc.thresh);
> +}
> +
>  /*
>   *                           setpoint - dirty 3
>   *        f(dirty) := 1.0 + (----------------)
> -- 
> 2.30.0
>
Kemeng Shi April 3, 2024, 8:49 a.m. UTC | #2
on 3/29/2024 9:10 PM, Brian Foster wrote:
> On Wed, Mar 27, 2024 at 11:57:48PM +0800, Kemeng Shi wrote:
>> Add /sys/kernel/debug/bdi/xxx/wb_stats to show per group writeback stats
>> of bdi.
>>
> 
> Hi Kemeng,
Hello Brian,
> 
> Just a few random thoughts/comments..
> 
>> Following domain hierarchy is tested:
>>                 global domain (320G)
>>                 /                 \
>>         cgroup domain1(10G)     cgroup domain2(10G)
>>                 |                 |
>> bdi            wb1               wb2
>>
>> /* per wb writeback info of bdi is collected */
>> cat /sys/kernel/debug/bdi/252:16/wb_stats
>> WbCgIno:                    1
>> WbWriteback:                0 kB
>> WbReclaimable:              0 kB
>> WbDirtyThresh:              0 kB
>> WbDirtied:                  0 kB
>> WbWritten:                  0 kB
>> WbWriteBandwidth:      102400 kBps
>> b_dirty:                    0
>> b_io:                       0
>> b_more_io:                  0
>> b_dirty_time:               0
>> state:                      1
> 
> Maybe some whitespace or something between entries would improve
> readability?
Sure, I will add a whitespace in next version.
> 
>> WbCgIno:                 4094
>> WbWriteback:            54432 kB
>> WbReclaimable:         766080 kB
>> WbDirtyThresh:        3094760 kB
>> WbDirtied:            1656480 kB
>> WbWritten:             837088 kB
>> WbWriteBandwidth:      132772 kBps
>> b_dirty:                    1
>> b_io:                       1
>> b_more_io:                  0
>> b_dirty_time:               0
>> state:                      7
>> WbCgIno:                 4135
>> WbWriteback:            15232 kB
>> WbReclaimable:         786688 kB
>> WbDirtyThresh:        2909984 kB
>> WbDirtied:            1482656 kB
>> WbWritten:             681408 kB
>> WbWriteBandwidth:      124848 kBps
>> b_dirty:                    0
>> b_io:                       1
>> b_more_io:                  0
>> b_dirty_time:               0
>> state:                      7
>>
>> Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
>> ---
>>  include/linux/writeback.h |  1 +
>>  mm/backing-dev.c          | 88 +++++++++++++++++++++++++++++++++++++++
>>  mm/page-writeback.c       | 19 +++++++++
>>  3 files changed, 108 insertions(+)
>>
> ...
>> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
>> index 8daf950e6855..e3953db7d88d 100644
>> --- a/mm/backing-dev.c
>> +++ b/mm/backing-dev.c
>> @@ -103,6 +103,91 @@ static void collect_wb_stats(struct wb_stats *stats,
>>  }
>>  
>>  #ifdef CONFIG_CGROUP_WRITEBACK
> ...
>> +static int cgwb_debug_stats_show(struct seq_file *m, void *v)
>> +{
>> +	struct backing_dev_info *bdi;
>> +	unsigned long background_thresh;
>> +	unsigned long dirty_thresh;
>> +	struct bdi_writeback *wb;
>> +	struct wb_stats stats;
>> +
>> +	rcu_read_lock();
>> +	bdi = lookup_bdi(m);
>> +	if (!bdi) {
>> +		rcu_read_unlock();
>> +		return -EEXIST;
>> +	}
>> +
>> +	global_dirty_limits(&background_thresh, &dirty_thresh);
>> +
>> +	list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
>> +		memset(&stats, 0, sizeof(stats));
>> +		stats.dirty_thresh = dirty_thresh;
> 
> If you did something like the following here, wouldn't that also zero
> the rest of the structure?
> 
> 		struct wb_stats stats = { .dirty_thresh = dirty_thresh };
> 
Suer, will do it in next version.
>> +		collect_wb_stats(&stats, wb);
>> +
> 
> Also, similar question as before on whether you'd want to check
> WB_registered or something here..
Still prefer to keep full debug info and user could filter out on
demand.
> 
>> +		if (mem_cgroup_wb_domain(wb) == NULL) {
>> +			wb_stats_show(m, wb, &stats);
>> +			continue;
>> +		}
> 
> Can you explain what this logic is about? Is the cgwb_calc_thresh()
> thing not needed in this case? A comment might help for those less
> familiar with the implementation details.
If mem_cgroup_wb_domain(wb) is NULL, then it's bdi->wb, otherwise,
it's wb in cgroup. For bdi->wb, there is no need to do wb_tryget
and cgwb_calc_thresh. Will add some comment in next version.
> 
> BTW, I'm also wondering if something like the following is correct
> and/or roughly equivalent:
> 	
> 	list_for_each_*(wb, ...) {
> 		struct wb_stats stats = ...;
> 
> 		if (!wb_tryget(wb))
> 			continue;
> 
> 		collect_wb_stats(&stats, wb);
> 
> 		/*
> 		 * Extra wb_thresh magic. Drop rcu lock because ... . We
> 		 * can do so here because we have a ref.
> 		 */
> 		if (mem_cgroup_wb_domain(wb)) {
> 			rcu_read_unlock();
> 			stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
> 			rcu_read_lock();
> 		}
> 
> 		wb_stats_show(m, wb, &stats)
> 		wb_put(wb);
> 	}
It's correct as wb_tryget to bdi->wb has no harm. I have considered
to do it in this way, I change my mind to do it in new way for
two reason:
1. Put code handling wb in cgroup more tight which could be easier
to maintain.
2. Rmove extra wb_tryget/wb_put for wb in bdi.
Would this make sense to you?
> 
>> +
>> +		/*
>> +		 * cgwb_release will destroy wb->memcg_completions which
>> +		 * will be ued in cgwb_calc_thresh. Use wb_tryget to prevent
>> +		 * memcg_completions destruction from cgwb_release.
>> +		 */
>> +		if (!wb_tryget(wb))
>> +			continue;
>> +
>> +		rcu_read_unlock();
>> +		/* cgwb_calc_thresh may sleep in cgroup_rstat_flush */
>> +		stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
>> +		wb_stats_show(m, wb, &stats);
>> +		rcu_read_lock();
>> +		wb_put(wb);
>> +	}
>> +	rcu_read_unlock();
>> +
>> +	return 0;
>> +}
>> +DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats);
>> +
>> +static void cgwb_debug_register(struct backing_dev_info *bdi)
>> +{
>> +	debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi,
>> +			    &cgwb_debug_stats_fops);
>> +}
>> +
>>  static void bdi_collect_stats(struct backing_dev_info *bdi,
>>  			      struct wb_stats *stats)
>>  {
>> @@ -117,6 +202,8 @@ static void bdi_collect_stats(struct backing_dev_info *bdi,
>>  {
>>  	collect_wb_stats(stats, &bdi->wb);
>>  }
>> +
>> +static inline void cgwb_debug_register(struct backing_dev_info *bdi) { }
> 
> Could we just create the wb_stats file regardless of whether cgwb is
> enabled? Obviously theres only one wb in the !CGWB case and it's
> somewhat duplicative with the bdi stats file, but that seems harmless if
> the same code can be reused..? Maybe there's also a small argument for
> dropping the state info from the bdi stats file and moving it to
> wb_stats.In backing-dev.c, there are a lot "#ifdef CGWB .. #else .. #endif" to
avoid unneed extra cost when CGWB is not enabled.
I think it's better to avoid extra cost from wb_stats when CGWB is not
enabled. For now, we only save cpu cost to create and destroy wb_stats
and save memory cost to record debugfs file, we could save more in
future when wb_stats records more debug info.
Move state info from bdi stats to wb_stats make senses to me. The only
concern would be compatibility problem. I will add a new patch to this
to make this more noticeable and easier to revert.
Thanks a lot for review!

Kemeng
> 
> Brian
> 
>>  #endif
>>  
>>  static int bdi_debug_stats_show(struct seq_file *m, void *v)
>> @@ -182,6 +269,7 @@ static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
>>  
>>  	debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
>>  			    &bdi_debug_stats_fops);
>> +	cgwb_debug_register(bdi);
>>  }
>>  
>>  static void bdi_debug_unregister(struct backing_dev_info *bdi)
>> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
>> index 0e20467367fe..3724c7525316 100644
>> --- a/mm/page-writeback.c
>> +++ b/mm/page-writeback.c
>> @@ -893,6 +893,25 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
>>  	return __wb_calc_thresh(&gdtc, thresh);
>>  }
>>  
>> +unsigned long cgwb_calc_thresh(struct bdi_writeback *wb)
>> +{
>> +	struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
>> +	struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };
>> +	unsigned long filepages, headroom, writeback;
>> +
>> +	gdtc.avail = global_dirtyable_memory();
>> +	gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
>> +		     global_node_page_state(NR_WRITEBACK);
>> +
>> +	mem_cgroup_wb_stats(wb, &filepages, &headroom,
>> +			    &mdtc.dirty, &writeback);
>> +	mdtc.dirty += writeback;
>> +	mdtc_calc_avail(&mdtc, filepages, headroom);
>> +	domain_dirty_limits(&mdtc);
>> +
>> +	return __wb_calc_thresh(&mdtc, mdtc.thresh);
>> +}
>> +
>>  /*
>>   *                           setpoint - dirty 3
>>   *        f(dirty) := 1.0 + (----------------)
>> -- 
>> 2.30.0
>>
> 
>
Brian Foster April 3, 2024, 3:04 p.m. UTC | #3
On Wed, Apr 03, 2024 at 04:49:42PM +0800, Kemeng Shi wrote:
> 
> 
> on 3/29/2024 9:10 PM, Brian Foster wrote:
> > On Wed, Mar 27, 2024 at 11:57:48PM +0800, Kemeng Shi wrote:
> >> Add /sys/kernel/debug/bdi/xxx/wb_stats to show per group writeback stats
> >> of bdi.
> >>
> > 
> > Hi Kemeng,
> Hello Brian,
> > 
> > Just a few random thoughts/comments..
> > 
> >> Following domain hierarchy is tested:
> >>                 global domain (320G)
> >>                 /                 \
> >>         cgroup domain1(10G)     cgroup domain2(10G)
> >>                 |                 |
> >> bdi            wb1               wb2
> >>
> >> /* per wb writeback info of bdi is collected */
> >> cat /sys/kernel/debug/bdi/252:16/wb_stats
> >> WbCgIno:                    1
> >> WbWriteback:                0 kB
> >> WbReclaimable:              0 kB
> >> WbDirtyThresh:              0 kB
> >> WbDirtied:                  0 kB
> >> WbWritten:                  0 kB
> >> WbWriteBandwidth:      102400 kBps
> >> b_dirty:                    0
> >> b_io:                       0
> >> b_more_io:                  0
> >> b_dirty_time:               0
> >> state:                      1
> > 
> > Maybe some whitespace or something between entries would improve
> > readability?
> Sure, I will add a whitespace in next version.
> > 
> >> WbCgIno:                 4094
> >> WbWriteback:            54432 kB
> >> WbReclaimable:         766080 kB
> >> WbDirtyThresh:        3094760 kB
> >> WbDirtied:            1656480 kB
> >> WbWritten:             837088 kB
> >> WbWriteBandwidth:      132772 kBps
> >> b_dirty:                    1
> >> b_io:                       1
> >> b_more_io:                  0
> >> b_dirty_time:               0
> >> state:                      7
> >> WbCgIno:                 4135
> >> WbWriteback:            15232 kB
> >> WbReclaimable:         786688 kB
> >> WbDirtyThresh:        2909984 kB
> >> WbDirtied:            1482656 kB
> >> WbWritten:             681408 kB
> >> WbWriteBandwidth:      124848 kBps
> >> b_dirty:                    0
> >> b_io:                       1
> >> b_more_io:                  0
> >> b_dirty_time:               0
> >> state:                      7
> >>
> >> Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
> >> ---
> >>  include/linux/writeback.h |  1 +
> >>  mm/backing-dev.c          | 88 +++++++++++++++++++++++++++++++++++++++
> >>  mm/page-writeback.c       | 19 +++++++++
> >>  3 files changed, 108 insertions(+)
> >>
> > ...
> >> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> >> index 8daf950e6855..e3953db7d88d 100644
> >> --- a/mm/backing-dev.c
> >> +++ b/mm/backing-dev.c
> >> @@ -103,6 +103,91 @@ static void collect_wb_stats(struct wb_stats *stats,
> >>  }
> >>  
> >>  #ifdef CONFIG_CGROUP_WRITEBACK
> > ...
> >> +static int cgwb_debug_stats_show(struct seq_file *m, void *v)
> >> +{
> >> +	struct backing_dev_info *bdi;
> >> +	unsigned long background_thresh;
> >> +	unsigned long dirty_thresh;
> >> +	struct bdi_writeback *wb;
> >> +	struct wb_stats stats;
> >> +
> >> +	rcu_read_lock();
> >> +	bdi = lookup_bdi(m);
> >> +	if (!bdi) {
> >> +		rcu_read_unlock();
> >> +		return -EEXIST;
> >> +	}
> >> +
> >> +	global_dirty_limits(&background_thresh, &dirty_thresh);
> >> +
> >> +	list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
> >> +		memset(&stats, 0, sizeof(stats));
> >> +		stats.dirty_thresh = dirty_thresh;
> > 
> > If you did something like the following here, wouldn't that also zero
> > the rest of the structure?
> > 
> > 		struct wb_stats stats = { .dirty_thresh = dirty_thresh };
> > 
> Suer, will do it in next version.
> >> +		collect_wb_stats(&stats, wb);
> >> +
> > 
> > Also, similar question as before on whether you'd want to check
> > WB_registered or something here..
> Still prefer to keep full debug info and user could filter out on
> demand.

Ok. I was more wondering if that was needed for correctness. If not,
then that seems fair enough to me.

> > 
> >> +		if (mem_cgroup_wb_domain(wb) == NULL) {
> >> +			wb_stats_show(m, wb, &stats);
> >> +			continue;
> >> +		}
> > 
> > Can you explain what this logic is about? Is the cgwb_calc_thresh()
> > thing not needed in this case? A comment might help for those less
> > familiar with the implementation details.
> If mem_cgroup_wb_domain(wb) is NULL, then it's bdi->wb, otherwise,
> it's wb in cgroup. For bdi->wb, there is no need to do wb_tryget
> and cgwb_calc_thresh. Will add some comment in next version.
> > 
> > BTW, I'm also wondering if something like the following is correct
> > and/or roughly equivalent:
> > 	
> > 	list_for_each_*(wb, ...) {
> > 		struct wb_stats stats = ...;
> > 
> > 		if (!wb_tryget(wb))
> > 			continue;
> > 
> > 		collect_wb_stats(&stats, wb);
> > 
> > 		/*
> > 		 * Extra wb_thresh magic. Drop rcu lock because ... . We
> > 		 * can do so here because we have a ref.
> > 		 */
> > 		if (mem_cgroup_wb_domain(wb)) {
> > 			rcu_read_unlock();
> > 			stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
> > 			rcu_read_lock();
> > 		}
> > 
> > 		wb_stats_show(m, wb, &stats)
> > 		wb_put(wb);
> > 	}
> It's correct as wb_tryget to bdi->wb has no harm. I have considered
> to do it in this way, I change my mind to do it in new way for
> two reason:
> 1. Put code handling wb in cgroup more tight which could be easier
> to maintain.
> 2. Rmove extra wb_tryget/wb_put for wb in bdi.
> Would this make sense to you?

Ok, well assuming it is correct the above logic is a bit more simple and
readable to me. I think you'd just need to fill in the comment around
the wb_thresh thing rather than i.e. having to explain we don't need to
ref bdi->wb even though it doesn't seem to matter.

I kind of feel the same on the wb_stats file thing below just because it
seems more consistent and available if wb_stats eventually grows more
wb-specific data.

That said, this is subjective and not hugely important so I don't insist
on either point. Maybe wait a bit and see if Jan or Tejun or somebody
has any thoughts..? If nobody else expresses explicit preference then
I'm good with it either way.

Brian

> > 
> >> +
> >> +		/*
> >> +		 * cgwb_release will destroy wb->memcg_completions which
> >> +		 * will be ued in cgwb_calc_thresh. Use wb_tryget to prevent
> >> +		 * memcg_completions destruction from cgwb_release.
> >> +		 */
> >> +		if (!wb_tryget(wb))
> >> +			continue;
> >> +
> >> +		rcu_read_unlock();
> >> +		/* cgwb_calc_thresh may sleep in cgroup_rstat_flush */
> >> +		stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
> >> +		wb_stats_show(m, wb, &stats);
> >> +		rcu_read_lock();
> >> +		wb_put(wb);
> >> +	}
> >> +	rcu_read_unlock();
> >> +
> >> +	return 0;
> >> +}
> >> +DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats);
> >> +
> >> +static void cgwb_debug_register(struct backing_dev_info *bdi)
> >> +{
> >> +	debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi,
> >> +			    &cgwb_debug_stats_fops);
> >> +}
> >> +
> >>  static void bdi_collect_stats(struct backing_dev_info *bdi,
> >>  			      struct wb_stats *stats)
> >>  {
> >> @@ -117,6 +202,8 @@ static void bdi_collect_stats(struct backing_dev_info *bdi,
> >>  {
> >>  	collect_wb_stats(stats, &bdi->wb);
> >>  }
> >> +
> >> +static inline void cgwb_debug_register(struct backing_dev_info *bdi) { }
> > 
> > Could we just create the wb_stats file regardless of whether cgwb is
> > enabled? Obviously theres only one wb in the !CGWB case and it's
> > somewhat duplicative with the bdi stats file, but that seems harmless if
> > the same code can be reused..? Maybe there's also a small argument for
> > dropping the state info from the bdi stats file and moving it to
> > wb_stats.In backing-dev.c, there are a lot "#ifdef CGWB .. #else .. #endif" to
> avoid unneed extra cost when CGWB is not enabled.
> I think it's better to avoid extra cost from wb_stats when CGWB is not
> enabled. For now, we only save cpu cost to create and destroy wb_stats
> and save memory cost to record debugfs file, we could save more in
> future when wb_stats records more debug info.
> Move state info from bdi stats to wb_stats make senses to me. The only
> concern would be compatibility problem. I will add a new patch to this
> to make this more noticeable and easier to revert.
> Thanks a lot for review!
> 
> Kemeng
> > 
> > Brian
> > 
> >>  #endif
> >>  
> >>  static int bdi_debug_stats_show(struct seq_file *m, void *v)
> >> @@ -182,6 +269,7 @@ static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
> >>  
> >>  	debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
> >>  			    &bdi_debug_stats_fops);
> >> +	cgwb_debug_register(bdi);
> >>  }
> >>  
> >>  static void bdi_debug_unregister(struct backing_dev_info *bdi)
> >> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> >> index 0e20467367fe..3724c7525316 100644
> >> --- a/mm/page-writeback.c
> >> +++ b/mm/page-writeback.c
> >> @@ -893,6 +893,25 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
> >>  	return __wb_calc_thresh(&gdtc, thresh);
> >>  }
> >>  
> >> +unsigned long cgwb_calc_thresh(struct bdi_writeback *wb)
> >> +{
> >> +	struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
> >> +	struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };
> >> +	unsigned long filepages, headroom, writeback;
> >> +
> >> +	gdtc.avail = global_dirtyable_memory();
> >> +	gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
> >> +		     global_node_page_state(NR_WRITEBACK);
> >> +
> >> +	mem_cgroup_wb_stats(wb, &filepages, &headroom,
> >> +			    &mdtc.dirty, &writeback);
> >> +	mdtc.dirty += writeback;
> >> +	mdtc_calc_avail(&mdtc, filepages, headroom);
> >> +	domain_dirty_limits(&mdtc);
> >> +
> >> +	return __wb_calc_thresh(&mdtc, mdtc.thresh);
> >> +}
> >> +
> >>  /*
> >>   *                           setpoint - dirty 3
> >>   *        f(dirty) := 1.0 + (----------------)
> >> -- 
> >> 2.30.0
> >>
> > 
> > 
>
Jan Kara April 4, 2024, 9:07 a.m. UTC | #4
On Wed 03-04-24 11:04:58, Brian Foster wrote:
> On Wed, Apr 03, 2024 at 04:49:42PM +0800, Kemeng Shi wrote:
> > on 3/29/2024 9:10 PM, Brian Foster wrote:
> > > On Wed, Mar 27, 2024 at 11:57:48PM +0800, Kemeng Shi wrote:
> > >> +		collect_wb_stats(&stats, wb);
> > >> +
> > > 
> > > Also, similar question as before on whether you'd want to check
> > > WB_registered or something here..
> > Still prefer to keep full debug info and user could filter out on
> > demand.
> 
> Ok. I was more wondering if that was needed for correctness. If not,
> then that seems fair enough to me.
> 
> > >> +		if (mem_cgroup_wb_domain(wb) == NULL) {
> > >> +			wb_stats_show(m, wb, &stats);
> > >> +			continue;
> > >> +		}
> > > 
> > > Can you explain what this logic is about? Is the cgwb_calc_thresh()
> > > thing not needed in this case? A comment might help for those less
> > > familiar with the implementation details.
> > If mem_cgroup_wb_domain(wb) is NULL, then it's bdi->wb, otherwise,
> > it's wb in cgroup. For bdi->wb, there is no need to do wb_tryget
> > and cgwb_calc_thresh. Will add some comment in next version.
> > > 
> > > BTW, I'm also wondering if something like the following is correct
> > > and/or roughly equivalent:
> > > 	
> > > 	list_for_each_*(wb, ...) {
> > > 		struct wb_stats stats = ...;
> > > 
> > > 		if (!wb_tryget(wb))
> > > 			continue;
> > > 
> > > 		collect_wb_stats(&stats, wb);
> > > 
> > > 		/*
> > > 		 * Extra wb_thresh magic. Drop rcu lock because ... . We
> > > 		 * can do so here because we have a ref.
> > > 		 */
> > > 		if (mem_cgroup_wb_domain(wb)) {
> > > 			rcu_read_unlock();
> > > 			stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
> > > 			rcu_read_lock();
> > > 		}
> > > 
> > > 		wb_stats_show(m, wb, &stats)
> > > 		wb_put(wb);
> > > 	}
> > It's correct as wb_tryget to bdi->wb has no harm. I have considered
> > to do it in this way, I change my mind to do it in new way for
> > two reason:
> > 1. Put code handling wb in cgroup more tight which could be easier
> > to maintain.
> > 2. Rmove extra wb_tryget/wb_put for wb in bdi.
> > Would this make sense to you?
> 
> Ok, well assuming it is correct the above logic is a bit more simple and
> readable to me. I think you'd just need to fill in the comment around
> the wb_thresh thing rather than i.e. having to explain we don't need to
> ref bdi->wb even though it doesn't seem to matter.
> 
> I kind of feel the same on the wb_stats file thing below just because it
> seems more consistent and available if wb_stats eventually grows more
> wb-specific data.
> 
> That said, this is subjective and not hugely important so I don't insist
> on either point. Maybe wait a bit and see if Jan or Tejun or somebody
> has any thoughts..? If nobody else expresses explicit preference then
> I'm good with it either way.

No strong opinion from me really.

> > >> +static void cgwb_debug_register(struct backing_dev_info *bdi)
> > >> +{
> > >> +	debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi,
> > >> +			    &cgwb_debug_stats_fops);
> > >> +}
> > >> +
> > >>  static void bdi_collect_stats(struct backing_dev_info *bdi,
> > >>  			      struct wb_stats *stats)
> > >>  {
> > >> @@ -117,6 +202,8 @@ static void bdi_collect_stats(struct backing_dev_info *bdi,
> > >>  {
> > >>  	collect_wb_stats(stats, &bdi->wb);
> > >>  }
> > >> +
> > >> +static inline void cgwb_debug_register(struct backing_dev_info *bdi) { }
> > > 
> > > Could we just create the wb_stats file regardless of whether cgwb is
> > > enabled? Obviously theres only one wb in the !CGWB case and it's
> > > somewhat duplicative with the bdi stats file, but that seems harmless if
> > > the same code can be reused..? Maybe there's also a small argument for
> > > dropping the state info from the bdi stats file and moving it to
> > > wb_stats.In backing-dev.c, there are a lot "#ifdef CGWB .. #else .. #endif" to
> > avoid unneed extra cost when CGWB is not enabled.
> > I think it's better to avoid extra cost from wb_stats when CGWB is not
> > enabled. For now, we only save cpu cost to create and destroy wb_stats
> > and save memory cost to record debugfs file, we could save more in
> > future when wb_stats records more debug info.

Well, there's the other side that you don't have to think whether the
kernel has CGWB enabled or not when asking a customer to gather the
writeback debug info - you can always ask for wb_stats. Also if you move
the wb->state to wb_stats only it will become inaccessible with CGWB
disabled. So I agree with Brian that it is better to provide wb_stats also
with CGWB disabled (and we can just implement wb_stats for !CGWB case with
the same function as bdi_stats).

That being said all production kernels I have seen do have CGWB enabled so
I don't care that much about this...

> > Move state info from bdi stats to wb_stats make senses to me. The only
> > concern would be compatibility problem. I will add a new patch to this
> > to make this more noticeable and easier to revert.

Yeah, I don't think we care much about debugfs compatibility but I think
removing state from bdi_stats is not worth the inconsistency between
wb_stats and bdi_stats in the !CGWB case.

								Honza
Kemeng Shi April 7, 2024, 2:48 a.m. UTC | #5
on 4/3/2024 11:04 PM, Brian Foster wrote:
> On Wed, Apr 03, 2024 at 04:49:42PM +0800, Kemeng Shi wrote:
>>
>>
>> on 3/29/2024 9:10 PM, Brian Foster wrote:
>>> On Wed, Mar 27, 2024 at 11:57:48PM +0800, Kemeng Shi wrote:
>>>> Add /sys/kernel/debug/bdi/xxx/wb_stats to show per group writeback stats
>>>> of bdi.
>>>>
>>>
>>> Hi Kemeng,
>> Hello Brian,
>>>
>>> Just a few random thoughts/comments..
>>>
>>>> Following domain hierarchy is tested:
>>>>                 global domain (320G)
>>>>                 /                 \
>>>>         cgroup domain1(10G)     cgroup domain2(10G)
>>>>                 |                 |
>>>> bdi            wb1               wb2
>>>>
>>>> /* per wb writeback info of bdi is collected */
>>>> cat /sys/kernel/debug/bdi/252:16/wb_stats
>>>> WbCgIno:                    1
>>>> WbWriteback:                0 kB
>>>> WbReclaimable:              0 kB
>>>> WbDirtyThresh:              0 kB
>>>> WbDirtied:                  0 kB
>>>> WbWritten:                  0 kB
>>>> WbWriteBandwidth:      102400 kBps
>>>> b_dirty:                    0
>>>> b_io:                       0
>>>> b_more_io:                  0
>>>> b_dirty_time:               0
>>>> state:                      1
>>>
>>> Maybe some whitespace or something between entries would improve
>>> readability?
>> Sure, I will add a whitespace in next version.
>>>
>>>> WbCgIno:                 4094
>>>> WbWriteback:            54432 kB
>>>> WbReclaimable:         766080 kB
>>>> WbDirtyThresh:        3094760 kB
>>>> WbDirtied:            1656480 kB
>>>> WbWritten:             837088 kB
>>>> WbWriteBandwidth:      132772 kBps
>>>> b_dirty:                    1
>>>> b_io:                       1
>>>> b_more_io:                  0
>>>> b_dirty_time:               0
>>>> state:                      7
>>>> WbCgIno:                 4135
>>>> WbWriteback:            15232 kB
>>>> WbReclaimable:         786688 kB
>>>> WbDirtyThresh:        2909984 kB
>>>> WbDirtied:            1482656 kB
>>>> WbWritten:             681408 kB
>>>> WbWriteBandwidth:      124848 kBps
>>>> b_dirty:                    0
>>>> b_io:                       1
>>>> b_more_io:                  0
>>>> b_dirty_time:               0
>>>> state:                      7
>>>>
>>>> Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
>>>> ---
>>>>  include/linux/writeback.h |  1 +
>>>>  mm/backing-dev.c          | 88 +++++++++++++++++++++++++++++++++++++++
>>>>  mm/page-writeback.c       | 19 +++++++++
>>>>  3 files changed, 108 insertions(+)
>>>>
>>> ...
>>>> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
>>>> index 8daf950e6855..e3953db7d88d 100644
>>>> --- a/mm/backing-dev.c
>>>> +++ b/mm/backing-dev.c
>>>> @@ -103,6 +103,91 @@ static void collect_wb_stats(struct wb_stats *stats,
>>>>  }
>>>>  
>>>>  #ifdef CONFIG_CGROUP_WRITEBACK
>>> ...
>>>> +static int cgwb_debug_stats_show(struct seq_file *m, void *v)
>>>> +{
>>>> +	struct backing_dev_info *bdi;
>>>> +	unsigned long background_thresh;
>>>> +	unsigned long dirty_thresh;
>>>> +	struct bdi_writeback *wb;
>>>> +	struct wb_stats stats;
>>>> +
>>>> +	rcu_read_lock();
>>>> +	bdi = lookup_bdi(m);
>>>> +	if (!bdi) {
>>>> +		rcu_read_unlock();
>>>> +		return -EEXIST;
>>>> +	}
>>>> +
>>>> +	global_dirty_limits(&background_thresh, &dirty_thresh);
>>>> +
>>>> +	list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
>>>> +		memset(&stats, 0, sizeof(stats));
>>>> +		stats.dirty_thresh = dirty_thresh;
>>>
>>> If you did something like the following here, wouldn't that also zero
>>> the rest of the structure?
>>>
>>> 		struct wb_stats stats = { .dirty_thresh = dirty_thresh };
>>>
>> Suer, will do it in next version.
>>>> +		collect_wb_stats(&stats, wb);
>>>> +
>>>
>>> Also, similar question as before on whether you'd want to check
>>> WB_registered or something here..
>> Still prefer to keep full debug info and user could filter out on
>> demand.
> 
> Ok. I was more wondering if that was needed for correctness. If not,
> then that seems fair enough to me.
For bdi->wb, it's unavailable after release_bdi. As bdi_debug_unregister
will block bdi_unregister, then release_bdi must not be reached yet and
it's safe to collect bdi->wb info.
For wb in cgroup, it's unavailable after cgwb_release_workfn, we could
prevent this with wb_tryget before collection.
So it's correct for per-wb stats but we add a extra wb_trget in
bdi stats in patch 2 and will do it in next version.
> 
>>>
>>>> +		if (mem_cgroup_wb_domain(wb) == NULL) {
>>>> +			wb_stats_show(m, wb, &stats);
>>>> +			continue;
>>>> +		}
>>>
>>> Can you explain what this logic is about? Is the cgwb_calc_thresh()
>>> thing not needed in this case? A comment might help for those less
>>> familiar with the implementation details.
>> If mem_cgroup_wb_domain(wb) is NULL, then it's bdi->wb, otherwise,
>> it's wb in cgroup. For bdi->wb, there is no need to do wb_tryget
>> and cgwb_calc_thresh. Will add some comment in next version.
>>>
>>> BTW, I'm also wondering if something like the following is correct
>>> and/or roughly equivalent:
>>> 	
>>> 	list_for_each_*(wb, ...) {
>>> 		struct wb_stats stats = ...;
>>>
>>> 		if (!wb_tryget(wb))
>>> 			continue;
>>>
>>> 		collect_wb_stats(&stats, wb);
>>>
>>> 		/*
>>> 		 * Extra wb_thresh magic. Drop rcu lock because ... . We
>>> 		 * can do so here because we have a ref.
>>> 		 */
>>> 		if (mem_cgroup_wb_domain(wb)) {
>>> 			rcu_read_unlock();
>>> 			stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
>>> 			rcu_read_lock();
>>> 		}
>>>
>>> 		wb_stats_show(m, wb, &stats)
>>> 		wb_put(wb);
>>> 	}
>> It's correct as wb_tryget to bdi->wb has no harm. I have considered
>> to do it in this way, I change my mind to do it in new way for
>> two reason:
>> 1. Put code handling wb in cgroup more tight which could be easier
>> to maintain.
>> 2. Rmove extra wb_tryget/wb_put for wb in bdi.
>> Would this make sense to you?
> 
> Ok, well assuming it is correct the above logic is a bit more simple and
> readable to me. I think you'd just need to fill in the comment around
> the wb_thresh thing rather than i.e. having to explain we don't need to
> ref bdi->wb even though it doesn't seem to matter.
> 
> I kind of feel the same on the wb_stats file thing below just because it
> seems more consistent and available if wb_stats eventually grows more
> wb-specific data.
> 
> That said, this is subjective and not hugely important so I don't insist
> on either point. Maybe wait a bit and see if Jan or Tejun or somebody
> has any thoughts..? If nobody else expresses explicit preference then
> I'm good with it either way.
Sure, I will wait for someday and decide the way used in next version.

Thanks so much for all the advise.

Kemeng
> 
> Brian
> 
>>>
>>>> +
>>>> +		/*
>>>> +		 * cgwb_release will destroy wb->memcg_completions which
>>>> +		 * will be ued in cgwb_calc_thresh. Use wb_tryget to prevent
>>>> +		 * memcg_completions destruction from cgwb_release.
>>>> +		 */
>>>> +		if (!wb_tryget(wb))
>>>> +			continue;
>>>> +
>>>> +		rcu_read_unlock();
>>>> +		/* cgwb_calc_thresh may sleep in cgroup_rstat_flush */
>>>> +		stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
>>>> +		wb_stats_show(m, wb, &stats);
>>>> +		rcu_read_lock();
>>>> +		wb_put(wb);
>>>> +	}
>>>> +	rcu_read_unlock();
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats);
>>>> +
>>>> +static void cgwb_debug_register(struct backing_dev_info *bdi)
>>>> +{
>>>> +	debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi,
>>>> +			    &cgwb_debug_stats_fops);
>>>> +}
>>>> +
>>>>  static void bdi_collect_stats(struct backing_dev_info *bdi,
>>>>  			      struct wb_stats *stats)
>>>>  {
>>>> @@ -117,6 +202,8 @@ static void bdi_collect_stats(struct backing_dev_info *bdi,
>>>>  {
>>>>  	collect_wb_stats(stats, &bdi->wb);
>>>>  }
>>>> +
>>>> +static inline void cgwb_debug_register(struct backing_dev_info *bdi) { }
>>>
>>> Could we just create the wb_stats file regardless of whether cgwb is
>>> enabled? Obviously theres only one wb in the !CGWB case and it's
>>> somewhat duplicative with the bdi stats file, but that seems harmless if
>>> the same code can be reused..? Maybe there's also a small argument for
>>> dropping the state info from the bdi stats file and moving it to
>>> wb_stats.In backing-dev.c, there are a lot "#ifdef CGWB .. #else .. #endif" to
>> avoid unneed extra cost when CGWB is not enabled.
>> I think it's better to avoid extra cost from wb_stats when CGWB is not
>> enabled. For now, we only save cpu cost to create and destroy wb_stats
>> and save memory cost to record debugfs file, we could save more in
>> future when wb_stats records more debug info.
>> Move state info from bdi stats to wb_stats make senses to me. The only
>> concern would be compatibility problem. I will add a new patch to this
>> to make this more noticeable and easier to revert.
>> Thanks a lot for review!
>>
>> Kemeng
>>>
>>> Brian
>>>
>>>>  #endif
>>>>  
>>>>  static int bdi_debug_stats_show(struct seq_file *m, void *v)
>>>> @@ -182,6 +269,7 @@ static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
>>>>  
>>>>  	debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
>>>>  			    &bdi_debug_stats_fops);
>>>> +	cgwb_debug_register(bdi);
>>>>  }
>>>>  
>>>>  static void bdi_debug_unregister(struct backing_dev_info *bdi)
>>>> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
>>>> index 0e20467367fe..3724c7525316 100644
>>>> --- a/mm/page-writeback.c
>>>> +++ b/mm/page-writeback.c
>>>> @@ -893,6 +893,25 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
>>>>  	return __wb_calc_thresh(&gdtc, thresh);
>>>>  }
>>>>  
>>>> +unsigned long cgwb_calc_thresh(struct bdi_writeback *wb)
>>>> +{
>>>> +	struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
>>>> +	struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };
>>>> +	unsigned long filepages, headroom, writeback;
>>>> +
>>>> +	gdtc.avail = global_dirtyable_memory();
>>>> +	gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
>>>> +		     global_node_page_state(NR_WRITEBACK);
>>>> +
>>>> +	mem_cgroup_wb_stats(wb, &filepages, &headroom,
>>>> +			    &mdtc.dirty, &writeback);
>>>> +	mdtc.dirty += writeback;
>>>> +	mdtc_calc_avail(&mdtc, filepages, headroom);
>>>> +	domain_dirty_limits(&mdtc);
>>>> +
>>>> +	return __wb_calc_thresh(&mdtc, mdtc.thresh);
>>>> +}
>>>> +
>>>>  /*
>>>>   *                           setpoint - dirty 3
>>>>   *        f(dirty) := 1.0 + (----------------)
>>>> -- 
>>>> 2.30.0
>>>>
>>>
>>>
>>
> 
>
Kemeng Shi April 7, 2024, 3:13 a.m. UTC | #6
on 4/4/2024 5:07 PM, Jan Kara wrote:
> On Wed 03-04-24 11:04:58, Brian Foster wrote:
>> On Wed, Apr 03, 2024 at 04:49:42PM +0800, Kemeng Shi wrote:
>>> on 3/29/2024 9:10 PM, Brian Foster wrote:
>>>> On Wed, Mar 27, 2024 at 11:57:48PM +0800, Kemeng Shi wrote:
>>>>> +		collect_wb_stats(&stats, wb);
>>>>> +
>>>>
>>>> Also, similar question as before on whether you'd want to check
>>>> WB_registered or something here..
>>> Still prefer to keep full debug info and user could filter out on
>>> demand.
>>
>> Ok. I was more wondering if that was needed for correctness. If not,
>> then that seems fair enough to me.
>>
>>>>> +		if (mem_cgroup_wb_domain(wb) == NULL) {
>>>>> +			wb_stats_show(m, wb, &stats);
>>>>> +			continue;
>>>>> +		}
>>>>
>>>> Can you explain what this logic is about? Is the cgwb_calc_thresh()
>>>> thing not needed in this case? A comment might help for those less
>>>> familiar with the implementation details.
>>> If mem_cgroup_wb_domain(wb) is NULL, then it's bdi->wb, otherwise,
>>> it's wb in cgroup. For bdi->wb, there is no need to do wb_tryget
>>> and cgwb_calc_thresh. Will add some comment in next version.
>>>>
>>>> BTW, I'm also wondering if something like the following is correct
>>>> and/or roughly equivalent:
>>>> 	
>>>> 	list_for_each_*(wb, ...) {
>>>> 		struct wb_stats stats = ...;
>>>>
>>>> 		if (!wb_tryget(wb))
>>>> 			continue;
>>>>
>>>> 		collect_wb_stats(&stats, wb);
>>>>
>>>> 		/*
>>>> 		 * Extra wb_thresh magic. Drop rcu lock because ... . We
>>>> 		 * can do so here because we have a ref.
>>>> 		 */
>>>> 		if (mem_cgroup_wb_domain(wb)) {
>>>> 			rcu_read_unlock();
>>>> 			stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
>>>> 			rcu_read_lock();
>>>> 		}
>>>>
>>>> 		wb_stats_show(m, wb, &stats)
>>>> 		wb_put(wb);
>>>> 	}
>>> It's correct as wb_tryget to bdi->wb has no harm. I have considered
>>> to do it in this way, I change my mind to do it in new way for
>>> two reason:
>>> 1. Put code handling wb in cgroup more tight which could be easier
>>> to maintain.
>>> 2. Rmove extra wb_tryget/wb_put for wb in bdi.
>>> Would this make sense to you?
>>
>> Ok, well assuming it is correct the above logic is a bit more simple and
>> readable to me. I think you'd just need to fill in the comment around
>> the wb_thresh thing rather than i.e. having to explain we don't need to
>> ref bdi->wb even though it doesn't seem to matter.
>>
>> I kind of feel the same on the wb_stats file thing below just because it
>> seems more consistent and available if wb_stats eventually grows more
>> wb-specific data.
>>
>> That said, this is subjective and not hugely important so I don't insist
>> on either point. Maybe wait a bit and see if Jan or Tejun or somebody
>> has any thoughts..? If nobody else expresses explicit preference then
>> I'm good with it either way.
> 
> No strong opinion from me really.
> 
>>>>> +static void cgwb_debug_register(struct backing_dev_info *bdi)
>>>>> +{
>>>>> +	debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi,
>>>>> +			    &cgwb_debug_stats_fops);
>>>>> +}
>>>>> +
>>>>>  static void bdi_collect_stats(struct backing_dev_info *bdi,
>>>>>  			      struct wb_stats *stats)
>>>>>  {
>>>>> @@ -117,6 +202,8 @@ static void bdi_collect_stats(struct backing_dev_info *bdi,
>>>>>  {
>>>>>  	collect_wb_stats(stats, &bdi->wb);
>>>>>  }
>>>>> +
>>>>> +static inline void cgwb_debug_register(struct backing_dev_info *bdi) { }
>>>>
>>>> Could we just create the wb_stats file regardless of whether cgwb is
>>>> enabled? Obviously theres only one wb in the !CGWB case and it's
>>>> somewhat duplicative with the bdi stats file, but that seems harmless if
>>>> the same code can be reused..? Maybe there's also a small argument for
>>>> dropping the state info from the bdi stats file and moving it to
>>>> wb_stats.In backing-dev.c, there are a lot "#ifdef CGWB .. #else .. #endif" to
>>> avoid unneed extra cost when CGWB is not enabled.
>>> I think it's better to avoid extra cost from wb_stats when CGWB is not
>>> enabled. For now, we only save cpu cost to create and destroy wb_stats
>>> and save memory cost to record debugfs file, we could save more in
>>> future when wb_stats records more debug info.
> 
> Well, there's the other side that you don't have to think whether the
> kernel has CGWB enabled or not when asking a customer to gather the
> writeback debug info - you can always ask for wb_stats. Also if you move
> the wb->state to wb_stats only it will become inaccessible with CGWB
> disabled. So I agree with Brian that it is better to provide wb_stats also
> with CGWB disabled (and we can just implement wb_stats for !CGWB case with
> the same function as bdi_stats).
> 
> That being said all production kernels I have seen do have CGWB enabled so
> I don't care that much about this...
It's acceptable to me if the extra cost is tolerable.
> 
>>> Move state info from bdi stats to wb_stats make senses to me. The only
>>> concern would be compatibility problem. I will add a new patch to this
>>> to make this more noticeable and easier to revert.
> 
> Yeah, I don't think we care much about debugfs compatibility but I think
> removing state from bdi_stats is not worth the inconsistency between
> wb_stats and bdi_stats in the !CGWB case.
OK, I will simply keep wb_stats even CGWB is not enabled while keep state
in both bdi_stats and wb_stats if Braian doesn't against in recent dasy.

Kemeng
> 
> 								Honza
>
diff mbox series

Patch

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 9845cb62e40b..112d806ddbe4 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -355,6 +355,7 @@  int dirtytime_interval_handler(struct ctl_table *table, int write,
 
 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
 unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
+unsigned long cgwb_calc_thresh(struct bdi_writeback *wb);
 
 void wb_update_bandwidth(struct bdi_writeback *wb);
 
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8daf950e6855..e3953db7d88d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -103,6 +103,91 @@  static void collect_wb_stats(struct wb_stats *stats,
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK
+static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb,
+			  struct wb_stats *stats)
+{
+
+	seq_printf(m,
+		   "WbCgIno:           %10lu\n"
+		   "WbWriteback:       %10lu kB\n"
+		   "WbReclaimable:     %10lu kB\n"
+		   "WbDirtyThresh:     %10lu kB\n"
+		   "WbDirtied:         %10lu kB\n"
+		   "WbWritten:         %10lu kB\n"
+		   "WbWriteBandwidth:  %10lu kBps\n"
+		   "b_dirty:           %10lu\n"
+		   "b_io:              %10lu\n"
+		   "b_more_io:         %10lu\n"
+		   "b_dirty_time:      %10lu\n"
+		   "state:             %10lx\n",
+		   cgroup_ino(wb->memcg_css->cgroup),
+		   K(stats->nr_writeback),
+		   K(stats->nr_reclaimable),
+		   K(stats->wb_thresh),
+		   K(stats->nr_dirtied),
+		   K(stats->nr_written),
+		   K(wb->avg_write_bandwidth),
+		   stats->nr_dirty,
+		   stats->nr_io,
+		   stats->nr_more_io,
+		   stats->nr_dirty_time,
+		   wb->state);
+}
+
+static int cgwb_debug_stats_show(struct seq_file *m, void *v)
+{
+	struct backing_dev_info *bdi;
+	unsigned long background_thresh;
+	unsigned long dirty_thresh;
+	struct bdi_writeback *wb;
+	struct wb_stats stats;
+
+	rcu_read_lock();
+	bdi = lookup_bdi(m);
+	if (!bdi) {
+		rcu_read_unlock();
+		return -EEXIST;
+	}
+
+	global_dirty_limits(&background_thresh, &dirty_thresh);
+
+	list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
+		memset(&stats, 0, sizeof(stats));
+		stats.dirty_thresh = dirty_thresh;
+		collect_wb_stats(&stats, wb);
+
+		if (mem_cgroup_wb_domain(wb) == NULL) {
+			wb_stats_show(m, wb, &stats);
+			continue;
+		}
+
+		/*
+		 * cgwb_release will destroy wb->memcg_completions which
+		 * will be ued in cgwb_calc_thresh. Use wb_tryget to prevent
+		 * memcg_completions destruction from cgwb_release.
+		 */
+		if (!wb_tryget(wb))
+			continue;
+
+		rcu_read_unlock();
+		/* cgwb_calc_thresh may sleep in cgroup_rstat_flush */
+		stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
+		wb_stats_show(m, wb, &stats);
+		rcu_read_lock();
+		wb_put(wb);
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats);
+
+static void cgwb_debug_register(struct backing_dev_info *bdi)
+{
+	debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi,
+			    &cgwb_debug_stats_fops);
+}
+
 static void bdi_collect_stats(struct backing_dev_info *bdi,
 			      struct wb_stats *stats)
 {
@@ -117,6 +202,8 @@  static void bdi_collect_stats(struct backing_dev_info *bdi,
 {
 	collect_wb_stats(stats, &bdi->wb);
 }
+
+static inline void cgwb_debug_register(struct backing_dev_info *bdi) { }
 #endif
 
 static int bdi_debug_stats_show(struct seq_file *m, void *v)
@@ -182,6 +269,7 @@  static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
 
 	debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
 			    &bdi_debug_stats_fops);
+	cgwb_debug_register(bdi);
 }
 
 static void bdi_debug_unregister(struct backing_dev_info *bdi)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0e20467367fe..3724c7525316 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -893,6 +893,25 @@  unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
 	return __wb_calc_thresh(&gdtc, thresh);
 }
 
+unsigned long cgwb_calc_thresh(struct bdi_writeback *wb)
+{
+	struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
+	struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };
+	unsigned long filepages, headroom, writeback;
+
+	gdtc.avail = global_dirtyable_memory();
+	gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
+		     global_node_page_state(NR_WRITEBACK);
+
+	mem_cgroup_wb_stats(wb, &filepages, &headroom,
+			    &mdtc.dirty, &writeback);
+	mdtc.dirty += writeback;
+	mdtc_calc_avail(&mdtc, filepages, headroom);
+	domain_dirty_limits(&mdtc);
+
+	return __wb_calc_thresh(&mdtc, mdtc.thresh);
+}
+
 /*
  *                           setpoint - dirty 3
  *        f(dirty) := 1.0 + (----------------)