
[RFC] writeback: add elastic bdi in cgwb bdp

Message ID 20191012132740.12968-1-hdanton@sina.com (mailing list archive)
State New, archived
Series [RFC] writeback: add elastic bdi in cgwb bdp

Commit Message

Hillf Danton Oct. 12, 2019, 1:27 p.m. UTC
The behaviors of an elastic bdi (ebdi), as observed through the current
cgwb bandwidth measurement, include:

1, like spinning disks on the market, an ebdi can sustain ~128MB/s of IO
for consecutive minutes in a few scenarios, or higher like an SSD, or
lower like a USB key.

2, with an ebdi, a bdi_writeback, wb-A, is able to do 80MB/s of writeout
in the current 200ms time window, while it was 16MB/s in the previous one.

3, it may be either 100MB/s in the next time window if wb-B joins wb-A in
writing pages out, or 18MB/s if wb-C also decides to chime in.

With the bandwidth gauged as above, what is left in balancing dirty pages
(bdp) is to make wb-A's laundry speed catch up with its dirty speed in
every 200ms interval, without knowing what wb-B is doing.

No heuristic is added in this work because ebdi does bdp without it.
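
As a rough userspace sketch of the idea only (the figures and the 32-page
error margin below are made up, loosely based on the numbers above; the
patch itself compares the wb's cumulative WB_DIRTIED and WB_WRITTEN
counters rather than per-window deltas):

#include <stdbool.h>
#include <stdio.h>

/* toy model of the throttle decision, not the in-kernel code */
static bool should_throttle(unsigned long dirtied, unsigned long written,
			    unsigned long error_margin)
{
	return dirtied > written + error_margin;
}

int main(void)
{
	/*
	 * A hypothetical 200ms window with 4KiB pages: dirtying at 80MB/s
	 * adds ~4096 dirtied pages, writeback at 16MB/s retires ~819.
	 * The 32-page margin stands in for wb_stat_error().
	 */
	printf("%d\n", should_throttle(4096, 819, 32));	/* 1: throttle */
	printf("%d\n", should_throttle(819, 4096, 32));	/* 0: keep going */
	return 0;
}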

Cc: Roman Gushchin <guro@fb.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Hillf Danton <hdanton@sina.com>
---

--

Comments

Jan Kara Oct. 15, 2019, 10:22 a.m. UTC | #1
Hello,

On Sat 12-10-19 21:27:40, Hillf Danton wrote:
> The behaviors of an elastic bdi (ebdi), as observed through the current
> cgwb bandwidth measurement, include:
> 
> 1, like spinning disks on the market, an ebdi can sustain ~128MB/s of IO
> for consecutive minutes in a few scenarios, or higher like an SSD, or
> lower like a USB key.
> 
> 2, with an ebdi, a bdi_writeback, wb-A, is able to do 80MB/s of writeout
> in the current 200ms time window, while it was 16MB/s in the previous one.
> 
> 3, it may be either 100MB/s in the next time window if wb-B joins wb-A in
> writing pages out, or 18MB/s if wb-C also decides to chime in.
> 
> With the bandwidth gauged as above, what is left in balancing dirty pages
> (bdp) is to make wb-A's laundry speed catch up with its dirty speed in
> every 200ms interval, without knowing what wb-B is doing.
> 
> No heuristic is added in this work because ebdi does bdp without it.

Thanks for the patch but honestly, I have a hard time understanding what
the purpose of this patch is from the changelog. Some kind of writeback
throttling? And why is this needed? Also some high-level description of
what your solution is would be good...

								Honza
 
> Cc: Roman Gushchin <guro@fb.com>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Jan Kara <jack@suse.cz>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Shakeel Butt <shakeelb@google.com>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Signed-off-by: Hillf Danton <hdanton@sina.com>
> ---
> 
> --- a/include/linux/backing-dev-defs.h
> +++ b/include/linux/backing-dev-defs.h
> @@ -157,6 +157,9 @@ struct bdi_writeback {
>  	struct list_head memcg_node;	/* anchored at memcg->cgwb_list */
>  	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */
>  
> +#ifdef CONFIG_CGWB_BDP_WITH_EBDI
> +	struct wait_queue_head bdp_waitq;
> +#endif
>  	union {
>  		struct work_struct release_work;
>  		struct rcu_head rcu;
> --- a/mm/backing-dev.c
> +++ b/mm/backing-dev.c
> @@ -324,6 +324,10 @@ static int wb_init(struct bdi_writeback
>  			goto out_destroy_stat;
>  	}
>  
> +	if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
> +	    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
> +		init_waitqueue_head(&wb->bdp_waitq);
> +
>  	return 0;
>  
>  out_destroy_stat:
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -1551,6 +1551,45 @@ static inline void wb_dirty_limits(struc
>  	}
>  }
>  
> +#if defined(CONFIG_CGROUP_WRITEBACK) && defined(CONFIG_CGWB_BDP_WITH_EBDI)
> +static bool cgwb_bdp_should_throttle(struct bdi_writeback *wb)
> +{
> +	struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
> +
> +	if (fatal_signal_pending(current))
> +		return false;
> +
> +	gdtc.avail = global_dirtyable_memory();
> +
> +	domain_dirty_limits(&gdtc);
> +
> +	gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
> +			global_node_page_state(NR_UNSTABLE_NFS) +
> +			global_node_page_state(NR_WRITEBACK);
> +
> +	if (gdtc.dirty < gdtc.bg_thresh)
> +		return false;
> +
> +	if (!writeback_in_progress(wb))
> +		wb_start_background_writeback(wb);
> +
> +	/*
> +	 * throttle if laundry speed remarkably falls behind dirty speed
> +	 * in the current time window of 200ms
> +	 */
> +	return gdtc.dirty > gdtc.thresh &&
> +		wb_stat(wb, WB_DIRTIED) >
> +		wb_stat(wb, WB_WRITTEN) +
> +		wb_stat_error();
> +}
> +
> +static inline void cgwb_bdp(struct bdi_writeback *wb)
> +{
> +	wait_event_interruptible_timeout(wb->bdp_waitq,
> +			!cgwb_bdp_should_throttle(wb), HZ);
> +}
> +#endif
> +
>  /*
>   * balance_dirty_pages() must be called by processes which are generating dirty
>   * data.  It looks at the number of dirty pages in the machine and will force
> @@ -1910,7 +1949,11 @@ void balance_dirty_pages_ratelimited(str
>  	preempt_enable();
>  
>  	if (unlikely(current->nr_dirtied >= ratelimit))
> -		balance_dirty_pages(wb, current->nr_dirtied);
> +		if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
> +		    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
> +			cgwb_bdp(wb);
> +		else
> +			balance_dirty_pages(wb, current->nr_dirtied);
>  
>  	wb_put(wb);
>  }
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -632,6 +632,11 @@ void wbc_detach_inode(struct writeback_c
>  	if (!wb)
>  		return;
>  
> +	if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
> +	    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
> +		if (waitqueue_active(&wb->bdp_waitq))
> +			wake_up_all(&wb->bdp_waitq);
> +
>  	history = inode->i_wb_frn_history;
>  	avg_time = inode->i_wb_frn_avg_time;
>  
> @@ -811,6 +816,9 @@ static long wb_split_bdi_pages(struct bd
>  	if (nr_pages == LONG_MAX)
>  		return LONG_MAX;
>  
> +	if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
> +	    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
> +		return nr_pages;
>  	/*
>  	 * This may be called on clean wb's and proportional distribution
>  	 * may not make sense, just use the original @nr_pages in those
> @@ -1599,6 +1607,10 @@ static long writeback_chunk_size(struct
>  	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
>  		pages = LONG_MAX;
>  	else {
> +		if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
> +		    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
> +			return work->nr_pages;
> +
>  		pages = min(wb->avg_write_bandwidth / 2,
>  			    global_wb_domain.dirty_limit / DIRTY_SCOPE);
>  		pages = min(pages, work->nr_pages);
> --
>
Hillf Danton Oct. 15, 2019, 2:03 p.m. UTC | #2
Hey Jan

On Tue, 15 Oct 2019 12:22:10 +0200 Jan Kara wrote:
> Hello,
> 
> On Sat 12-10-19 21:27:40, Hillf Danton wrote:
> > 
> > The behaviors of an elastic bdi (ebdi), as observed through the current
> > cgwb bandwidth measurement, include:
> > 
> > 1, like spinning disks on the market, an ebdi can sustain ~128MB/s of IO
> > for consecutive minutes in a few scenarios, or higher like an SSD, or
> > lower like a USB key.
> > 
> > 2, with an ebdi, a bdi_writeback, wb-A, is able to do 80MB/s of writeout
> > in the current 200ms time window, while it was 16MB/s in the previous one.
> > 
> > 3, it may be either 100MB/s in the next time window if wb-B joins wb-A in
> > writing pages out, or 18MB/s if wb-C also decides to chime in.
> > 
> > With the bandwidth gauged as above, what is left in balancing dirty pages
> > (bdp) is to make wb-A's laundry speed catch up with its dirty speed in
> > every 200ms interval, without knowing what wb-B is doing.
> > 
> > No heuristic is added in this work because ebdi does bdp without it.
> 
> Thanks for the patch but honestly, I have a hard time understanding what
> the purpose of this patch is from the changelog.

My fault. I will try to make it as clear as I can.

Under the cover of "behaviors of an elastic bdi" I list the difficulties
in the current writeback bandwidth measurements, particularly in the case
with CONFIG_CGROUP_WRITEBACK enabled. The phrase ebdi is used to abstract
the attribute shared by hardware like spinning disks, SSDs and USB
storage: their physical bandwidth is a constant.

The difference between that constant and the bandwidth currently
measured comes, I think, from the IO pattern dispatched to the hardware
in the 200ms time interval. How much sense does it make to guide wb-A's
IO in the next 200ms without any idea of what the other wbs are doing?
What should be modeled and built on top of the measured bw value?
Hard to say.

What will bdp, OTOH, look like on top of an ebdi without the hard work
of measuring bw?
A name came up while I was typing this message, though it was not
available when I sent the RFC, and it is essentially that the ebdi paves
the way for applying the walk-dog method to bdp: let wb-A's laundry speed
walk its dirty speed, the same way pet owners in Paris, Prague and other
cities walk their dogs every day with a leash worth two dimes on average.

Is a $200 electronic walkmeter needed to have a good time walking a dog
in London?
Nope, I think, because it makes ant-eyelash-size sense to first gauge the
walker's speed with a gadget prone to glitches, then teach the dog to walk
at that speed, and to do more on top of it.

The only reason I have to do walk-dog in bdp is that laundry speed
remarkably falls behind dirty speed in every case of bdp, with no
exception. And a leash is supposed to do the job in the manner it
naturally should, even though laundry speed changes in every 200ms
interval and is usually hard to predict beforehand under real workloads,
with the two things below met in every 200ms (a toy sketch of point 1
follows the list):
1, dirty pages in the system are clamped near the threshold that is
configurable in userspace,
2, the dirty speed of every wb is glued as close to its laundry speed as
possible, in the long run.
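
A toy simulation of point 1 (all figures below are invented and are not
the kernel's accounting; it only shows how throttling above the threshold
while writeback continues underneath clamps the dirty count near the
threshold):

#include <stdio.h>

int main(void)
{
	const long dirty_rate = 4096;	/* pages dirtied per 200ms window */
	const long write_rate = 3072;	/* pages written back per window */
	const long thresh = 40960;	/* dirty threshold, in pages */
	long dirty = 0;
	int window;

	for (window = 0; window < 50; window++) {
		int throttled = dirty > thresh;

		/* a throttled writer dirties nothing in this window */
		if (!throttled)
			dirty += dirty_rate;
		dirty -= write_rate;
		if (dirty < 0)
			dirty = 0;

		printf("window %2d: dirty %6ld %s\n", window, dirty,
		       throttled ? "(throttled)" : "");
	}
	return 0;
}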

Should walk-dog be in place, we can then do cleanups of bw measurement
and the things dependent on it, one step after another.

Thanks
Hillf

> Some kind of writeback throttling?
> And why is this needed?
> Also some high-level description of what
> your solution is would be good...
> 
> 								Honza
>  
> > Cc: Roman Gushchin <guro@fb.com>
> > Cc: Tejun Heo <tj@kernel.org>
> > Cc: Jan Kara <jack@suse.cz>
> > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > Cc: Shakeel Butt <shakeelb@google.com>
> > Cc: Minchan Kim <minchan@kernel.org>
> > Cc: Mel Gorman <mgorman@suse.de>
> > Signed-off-by: Hillf Danton <hdanton@sina.com>
> > ---
> > 
> > --- a/include/linux/backing-dev-defs.h
> > +++ b/include/linux/backing-dev-defs.h
> > @@ -157,6 +157,9 @@ struct bdi_writeback {
> >  	struct list_head memcg_node;	/* anchored at memcg->cgwb_list */
> >  	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */
> >  
> > +#ifdef CONFIG_CGWB_BDP_WITH_EBDI
> > +	struct wait_queue_head bdp_waitq;
> > +#endif
> >  	union {
> >  		struct work_struct release_work;
> >  		struct rcu_head rcu;
> > --- a/mm/backing-dev.c
> > +++ b/mm/backing-dev.c
> > @@ -324,6 +324,10 @@ static int wb_init(struct bdi_writeback
> >  			goto out_destroy_stat;
> >  	}
> >  
> > +	if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
> > +	    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
> > +		init_waitqueue_head(&wb->bdp_waitq);
> > +
> >  	return 0;
> >  
> >  out_destroy_stat:
> > --- a/mm/page-writeback.c
> > +++ b/mm/page-writeback.c
> > @@ -1551,6 +1551,45 @@ static inline void wb_dirty_limits(struc
> >  	}
> >  }
> >  
> > +#if defined(CONFIG_CGROUP_WRITEBACK) && defined(CONFIG_CGWB_BDP_WITH_EBDI)
> > +static bool cgwb_bdp_should_throttle(struct bdi_writeback *wb)
> > +{
> > +	struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
> > +
> > +	if (fatal_signal_pending(current))
> > +		return false;
> > +
> > +	gdtc.avail = global_dirtyable_memory();
> > +
> > +	domain_dirty_limits(&gdtc);
> > +
> > +	gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
> > +			global_node_page_state(NR_UNSTABLE_NFS) +
> > +			global_node_page_state(NR_WRITEBACK);
> > +
> > +	if (gdtc.dirty < gdtc.bg_thresh)
> > +		return false;
> > +
> > +	if (!writeback_in_progress(wb))
> > +		wb_start_background_writeback(wb);
> > +
> > +	/*
> > +	 * throttle if laundry speed remarkably falls behind dirty speed
> > +	 * in the current time window of 200ms
> > +	 */
> > +	return gdtc.dirty > gdtc.thresh &&
> > +		wb_stat(wb, WB_DIRTIED) >
> > +		wb_stat(wb, WB_WRITTEN) +
> > +		wb_stat_error();
> > +}
> > +
> > +static inline void cgwb_bdp(struct bdi_writeback *wb)
> > +{
> > +	wait_event_interruptible_timeout(wb->bdp_waitq,
> > +			!cgwb_bdp_should_throttle(wb), HZ);
> > +}
> > +#endif
> > +
> >  /*
> >   * balance_dirty_pages() must be called by processes which are generating dirty
> >   * data.  It looks at the number of dirty pages in the machine and will force
> > @@ -1910,7 +1949,11 @@ void balance_dirty_pages_ratelimited(str
> >  	preempt_enable();
> >  
> >  	if (unlikely(current->nr_dirtied >= ratelimit))
> > -		balance_dirty_pages(wb, current->nr_dirtied);
> > +		if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
> > +		    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
> > +			cgwb_bdp(wb);
> > +		else
> > +			balance_dirty_pages(wb, current->nr_dirtied);
> >  
> >  	wb_put(wb);
> >  }
> > --- a/fs/fs-writeback.c
> > +++ b/fs/fs-writeback.c
> > @@ -632,6 +632,11 @@ void wbc_detach_inode(struct writeback_c
> >  	if (!wb)
> >  		return;
> >  
> > +	if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
> > +	    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
> > +		if (waitqueue_active(&wb->bdp_waitq))
> > +			wake_up_all(&wb->bdp_waitq);
> > +
> >  	history = inode->i_wb_frn_history;
> >  	avg_time = inode->i_wb_frn_avg_time;
> >  
> > @@ -811,6 +816,9 @@ static long wb_split_bdi_pages(struct bd
> >  	if (nr_pages == LONG_MAX)
> >  		return LONG_MAX;
> >  
> > +	if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
> > +	    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
> > +		return nr_pages;
> >  	/*
> >  	 * This may be called on clean wb's and proportional distribution
> >  	 * may not make sense, just use the original @nr_pages in those
> > @@ -1599,6 +1607,10 @@ static long writeback_chunk_size(struct
> >  	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
> >  		pages = LONG_MAX;
> >  	else {
> > +		if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
> > +		    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
> > +			return work->nr_pages;
> > +
> >  		pages = min(wb->avg_write_bandwidth / 2,
> >  			    global_wb_domain.dirty_limit / DIRTY_SCOPE);
> >  		pages = min(pages, work->nr_pages);
> > --
> > 
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
Tejun Heo Oct. 15, 2019, 2:37 p.m. UTC | #3
Hello, Hillf.

Do you have a test case which can demonstrate the problem you're
seeing in the existing code?

Thanks.
Hillf Danton Oct. 16, 2019, 2:26 a.m. UTC | #4
Hello Tejun

On Tue, 15 Oct 2019 07:37:31 -0700 Tejun Heo wrote:
> Hello, Hillf.
> 
> Do you have a test case which can demonstrate the problem you're
> seeing in the existing code?

I don't have such a test case because I see no problem in the current bw
measurements apart from the difficulties. For example, wb-A's bw may be
measured to be 66MB/s if wb-B joins it in dispatching IO, or 96MB/s if
wb-C also joins them, assuming it is a simple case.

It may be too difficult to be feasible, I am afraid, to get wb-A's bw
under data-center workloads without the churning of the non-A wbs.
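
To make the difficulty concrete with an entirely made-up model: if a
fixed-bandwidth device is split evenly among whatever wbs happen to be
active, any per-wb figure is a function of the sharers rather than of the
hardware (the 66MB/s and 96MB/s figures above are the example from the
mail, not this even split):

#include <stdio.h>

int main(void)
{
	const double device_bw = 128.0;	/* MB/s, a made-up constant ebdi */
	int active;

	/* the same wb-A, measured while 1, 2 or 3 wbs share the device */
	for (active = 1; active <= 3; active++)
		printf("%d active wb(s): wb-A measures about %.1f MB/s\n",
		       active, device_bw / active);
	return 0;
}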

Thanks
Hillf

Patch

--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -157,6 +157,9 @@  struct bdi_writeback {
 	struct list_head memcg_node;	/* anchored at memcg->cgwb_list */
 	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */
 
+#ifdef CONFIG_CGWB_BDP_WITH_EBDI
+	struct wait_queue_head bdp_waitq;
+#endif
 	union {
 		struct work_struct release_work;
 		struct rcu_head rcu;
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -324,6 +324,10 @@  static int wb_init(struct bdi_writeback
 			goto out_destroy_stat;
 	}
 
+	if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
+	    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
+		init_waitqueue_head(&wb->bdp_waitq);
+
 	return 0;
 
 out_destroy_stat:
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1551,6 +1551,45 @@  static inline void wb_dirty_limits(struc
 	}
 }
 
+#if defined(CONFIG_CGROUP_WRITEBACK) && defined(CONFIG_CGWB_BDP_WITH_EBDI)
+static bool cgwb_bdp_should_throttle(struct bdi_writeback *wb)
+{
+	struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
+
+	if (fatal_signal_pending(current))
+		return false;
+
+	gdtc.avail = global_dirtyable_memory();
+
+	domain_dirty_limits(&gdtc);
+
+	gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
+			global_node_page_state(NR_UNSTABLE_NFS) +
+			global_node_page_state(NR_WRITEBACK);
+
+	if (gdtc.dirty < gdtc.bg_thresh)
+		return false;
+
+	if (!writeback_in_progress(wb))
+		wb_start_background_writeback(wb);
+
+	/*
+	 * throttle if laundry speed remarkably falls behind dirty speed
+	 * in the current time window of 200ms
+	 */
+	return gdtc.dirty > gdtc.thresh &&
+		wb_stat(wb, WB_DIRTIED) >
+		wb_stat(wb, WB_WRITTEN) +
+		wb_stat_error();
+}
+
+static inline void cgwb_bdp(struct bdi_writeback *wb)
+{
+	wait_event_interruptible_timeout(wb->bdp_waitq,
+			!cgwb_bdp_should_throttle(wb), HZ);
+}
+#endif
+
 /*
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data.  It looks at the number of dirty pages in the machine and will force
@@ -1910,7 +1949,11 @@  void balance_dirty_pages_ratelimited(str
 	preempt_enable();
 
 	if (unlikely(current->nr_dirtied >= ratelimit))
-		balance_dirty_pages(wb, current->nr_dirtied);
+		if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
+		    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
+			cgwb_bdp(wb);
+		else
+			balance_dirty_pages(wb, current->nr_dirtied);
 
 	wb_put(wb);
 }
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -632,6 +632,11 @@  void wbc_detach_inode(struct writeback_c
 	if (!wb)
 		return;
 
+	if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
+	    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
+		if (waitqueue_active(&wb->bdp_waitq))
+			wake_up_all(&wb->bdp_waitq);
+
 	history = inode->i_wb_frn_history;
 	avg_time = inode->i_wb_frn_avg_time;
 
@@ -811,6 +816,9 @@  static long wb_split_bdi_pages(struct bd
 	if (nr_pages == LONG_MAX)
 		return LONG_MAX;
 
+	if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
+	    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
+		return nr_pages;
 	/*
 	 * This may be called on clean wb's and proportional distribution
 	 * may not make sense, just use the original @nr_pages in those
@@ -1599,6 +1607,10 @@  static long writeback_chunk_size(struct
 	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
 		pages = LONG_MAX;
 	else {
+		if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
+		    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
+			return work->nr_pages;
+
 		pages = min(wb->avg_write_bandwidth / 2,
 			    global_wb_domain.dirty_limit / DIRTY_SCOPE);
 		pages = min(pages, work->nr_pages);