diff mbox series

[RFC,v2,12/16] mm: factor out _balance_dirty_pages() from balance_dirty_pages()

Message ID 20220516164718.2419891-13-shr@fb.com (mailing list archive)
State New
Headers show
Series io-uring/xfs: support async buffered writes | expand

Commit Message

Stefan Roesch May 16, 2022, 4:47 p.m. UTC
This factors out the for loop in balance_dirty_pages() into a new
function called _balance_dirty_pages(). By factoring out this function
the async write code can determine if it has to wait to throttle writes
or not. The function _balance_dirty_pages() returns the sleep time.
If the sleep time is greater 0, then the async write code needs to throttle.

To maintain the context for consecutive calls of _balance_dirty_pages()
and maintain the current behavior a new data structure called bdp_ctx
has been introduced.

Signed-off-by: Stefan Roesch <shr@fb.com>
---
 mm/page-writeback.c | 452 +++++++++++++++++++++++---------------------
 1 file changed, 239 insertions(+), 213 deletions(-)

Comments

Jan Kara May 18, 2022, 11:07 a.m. UTC | #1
On Mon 16-05-22 09:47:14, Stefan Roesch wrote:
> This factors out the for loop in balance_dirty_pages() into a new
> function called _balance_dirty_pages(). By factoring out this function
> the async write code can determine if it has to wait to throttle writes
> or not. The function _balance_dirty_pages() returns the sleep time.
> If the sleep time is greater 0, then the async write code needs to throttle.
> 
> To maintain the context for consecutive calls of _balance_dirty_pages()
> and maintain the current behavior a new data structure called bdp_ctx
> has been introduced.
> 
> Signed-off-by: Stefan Roesch <shr@fb.com>

...

> ---
>  mm/page-writeback.c | 452 +++++++++++++++++++++++---------------------
>  1 file changed, 239 insertions(+), 213 deletions(-)
> 
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 7e2da284e427..cbb74c0666c6 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -140,6 +140,29 @@ struct dirty_throttle_control {
>  	unsigned long		pos_ratio;
>  };
>  
> +/* context for consecutive calls to _balance_dirty_pages() */
> +struct bdp_ctx {
> +	long			pause;
> +	unsigned long		now;
> +	unsigned long		start_time;
> +	unsigned long		task_ratelimit;
> +	unsigned long		dirty_ratelimit;
> +	unsigned long		nr_reclaimable;
> +	int			nr_dirtied_pause;
> +	bool			dirty_exceeded;
> +
> +	struct dirty_throttle_control gdtc_stor;
> +	struct dirty_throttle_control mdtc_stor;
> +	struct dirty_throttle_control *sdtc;
> +} bdp_ctx;

Looking at how much context you propagate into _balance_dirty_pages() I
don't think this suggestion was as great as I've hoped. I'm sorry for that.
We could actually significantly reduce the amount of context passed in/out
but some things would be difficult to get rid of and some interactions of
code in _balance_dirty_pages() and the caller are actually pretty subtle.

I think something like attached three patches should make things NOWAIT
support in balance_dirty_pages() reasonably readable.

								Honza

> +
> +/* initialize _balance_dirty_pages() context */
> +#define BDP_CTX_INIT(ctx, wb)				\
> +	.gdtc_stor = { GDTC_INIT(wb) },			\
> +	.mdtc_stor = { MDTC_INIT(wb, &ctx.gdtc_stor) },	\
> +	.start_time = jiffies,				\
> +	.dirty_exceeded = false
> +
>  /*
>   * Length of period for aging writeout fractions of bdis. This is an
>   * arbitrarily chosen number. The longer the period, the slower fractions will
> @@ -1538,261 +1561,264 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
>  	}
>  }
>  
> -/*
> - * balance_dirty_pages() must be called by processes which are generating dirty
> - * data.  It looks at the number of dirty pages in the machine and will force
> - * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
> - * If we're over `background_thresh' then the writeback threads are woken to
> - * perform some writeout.
> - */
> -static void balance_dirty_pages(struct bdi_writeback *wb,
> -				unsigned long pages_dirtied)
> +static inline int _balance_dirty_pages(struct bdi_writeback *wb,
> +					unsigned long pages_dirtied, struct bdp_ctx *ctx)
>  {
> -	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
> -	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
> -	struct dirty_throttle_control * const gdtc = &gdtc_stor;
> -	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
> -						     &mdtc_stor : NULL;
> -	struct dirty_throttle_control *sdtc;
> -	unsigned long nr_reclaimable;	/* = file_dirty */
> +	struct dirty_throttle_control * const gdtc = &ctx->gdtc_stor;
> +	struct dirty_throttle_control * const mdtc = mdtc_valid(&ctx->mdtc_stor) ?
> +						     &ctx->mdtc_stor : NULL;
>  	long period;
> -	long pause;
>  	long max_pause;
>  	long min_pause;
> -	int nr_dirtied_pause;
> -	bool dirty_exceeded = false;
> -	unsigned long task_ratelimit;
> -	unsigned long dirty_ratelimit;
>  	struct backing_dev_info *bdi = wb->bdi;
>  	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
> -	unsigned long start_time = jiffies;
>  
> -	for (;;) {
> -		unsigned long now = jiffies;
> -		unsigned long dirty, thresh, bg_thresh;
> -		unsigned long m_dirty = 0;	/* stop bogus uninit warnings */
> -		unsigned long m_thresh = 0;
> -		unsigned long m_bg_thresh = 0;
> -
> -		nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
> -		gdtc->avail = global_dirtyable_memory();
> -		gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
> +	unsigned long dirty, thresh, bg_thresh;
> +	unsigned long m_dirty = 0;	/* stop bogus uninit warnings */
> +	unsigned long m_thresh = 0;
> +	unsigned long m_bg_thresh = 0;
>  
> -		domain_dirty_limits(gdtc);
> +	ctx->now = jiffies;
> +	ctx->nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
> +	gdtc->avail = global_dirtyable_memory();
> +	gdtc->dirty = ctx->nr_reclaimable + global_node_page_state(NR_WRITEBACK);
>  
> -		if (unlikely(strictlimit)) {
> -			wb_dirty_limits(gdtc);
> +	domain_dirty_limits(gdtc);
>  
> -			dirty = gdtc->wb_dirty;
> -			thresh = gdtc->wb_thresh;
> -			bg_thresh = gdtc->wb_bg_thresh;
> -		} else {
> -			dirty = gdtc->dirty;
> -			thresh = gdtc->thresh;
> -			bg_thresh = gdtc->bg_thresh;
> -		}
> +	if (unlikely(strictlimit)) {
> +		wb_dirty_limits(gdtc);
>  
> -		if (mdtc) {
> -			unsigned long filepages, headroom, writeback;
> +		dirty = gdtc->wb_dirty;
> +		thresh = gdtc->wb_thresh;
> +		bg_thresh = gdtc->wb_bg_thresh;
> +	} else {
> +		dirty = gdtc->dirty;
> +		thresh = gdtc->thresh;
> +		bg_thresh = gdtc->bg_thresh;
> +	}
>  
> -			/*
> -			 * If @wb belongs to !root memcg, repeat the same
> -			 * basic calculations for the memcg domain.
> -			 */
> -			mem_cgroup_wb_stats(wb, &filepages, &headroom,
> -					    &mdtc->dirty, &writeback);
> -			mdtc->dirty += writeback;
> -			mdtc_calc_avail(mdtc, filepages, headroom);
> -
> -			domain_dirty_limits(mdtc);
> -
> -			if (unlikely(strictlimit)) {
> -				wb_dirty_limits(mdtc);
> -				m_dirty = mdtc->wb_dirty;
> -				m_thresh = mdtc->wb_thresh;
> -				m_bg_thresh = mdtc->wb_bg_thresh;
> -			} else {
> -				m_dirty = mdtc->dirty;
> -				m_thresh = mdtc->thresh;
> -				m_bg_thresh = mdtc->bg_thresh;
> -			}
> -		}
> +	if (mdtc) {
> +		unsigned long filepages, headroom, writeback;
>  
>  		/*
> -		 * Throttle it only when the background writeback cannot
> -		 * catch-up. This avoids (excessively) small writeouts
> -		 * when the wb limits are ramping up in case of !strictlimit.
> -		 *
> -		 * In strictlimit case make decision based on the wb counters
> -		 * and limits. Small writeouts when the wb limits are ramping
> -		 * up are the price we consciously pay for strictlimit-ing.
> -		 *
> -		 * If memcg domain is in effect, @dirty should be under
> -		 * both global and memcg freerun ceilings.
> +		 * If @wb belongs to !root memcg, repeat the same
> +		 * basic calculations for the memcg domain.
>  		 */
> -		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
> -		    (!mdtc ||
> -		     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
> -			unsigned long intv;
> -			unsigned long m_intv;
> +		mem_cgroup_wb_stats(wb, &filepages, &headroom,
> +				    &mdtc->dirty, &writeback);
> +		mdtc->dirty += writeback;
> +		mdtc_calc_avail(mdtc, filepages, headroom);
>  
> -free_running:
> -			intv = dirty_poll_interval(dirty, thresh);
> -			m_intv = ULONG_MAX;
> +		domain_dirty_limits(mdtc);
>  
> -			current->dirty_paused_when = now;
> -			current->nr_dirtied = 0;
> -			if (mdtc)
> -				m_intv = dirty_poll_interval(m_dirty, m_thresh);
> -			current->nr_dirtied_pause = min(intv, m_intv);
> -			break;
> +		if (unlikely(strictlimit)) {
> +			wb_dirty_limits(mdtc);
> +			m_dirty = mdtc->wb_dirty;
> +			m_thresh = mdtc->wb_thresh;
> +			m_bg_thresh = mdtc->wb_bg_thresh;
> +		} else {
> +			m_dirty = mdtc->dirty;
> +			m_thresh = mdtc->thresh;
> +			m_bg_thresh = mdtc->bg_thresh;
>  		}
> +	}
>  
> -		if (unlikely(!writeback_in_progress(wb)))
> -			wb_start_background_writeback(wb);
> +	/*
> +	 * Throttle it only when the background writeback cannot
> +	 * catch-up. This avoids (excessively) small writeouts
> +	 * when the wb limits are ramping up in case of !strictlimit.
> +	 *
> +	 * In strictlimit case make decision based on the wb counters
> +	 * and limits. Small writeouts when the wb limits are ramping
> +	 * up are the price we consciously pay for strictlimit-ing.
> +	 *
> +	 * If memcg domain is in effect, @dirty should be under
> +	 * both global and memcg freerun ceilings.
> +	 */
> +	if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
> +	    (!mdtc ||
> +	     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
> +		unsigned long intv;
> +		unsigned long m_intv;
>  
> -		mem_cgroup_flush_foreign(wb);
> +free_running:
> +		intv = dirty_poll_interval(dirty, thresh);
> +		m_intv = ULONG_MAX;
> +
> +		current->dirty_paused_when = ctx->now;
> +		current->nr_dirtied = 0;
> +		if (mdtc)
> +			m_intv = dirty_poll_interval(m_dirty, m_thresh);
> +		current->nr_dirtied_pause = min(intv, m_intv);
> +		return 0;
> +	}
> +
> +	if (unlikely(!writeback_in_progress(wb)))
> +		wb_start_background_writeback(wb);
>  
> +	mem_cgroup_flush_foreign(wb);
> +
> +	/*
> +	 * Calculate global domain's pos_ratio and select the
> +	 * global dtc by default.
> +	 */
> +	if (!strictlimit) {
> +		wb_dirty_limits(gdtc);
> +
> +		if ((current->flags & PF_LOCAL_THROTTLE) &&
> +		    gdtc->wb_dirty <
> +		    dirty_freerun_ceiling(gdtc->wb_thresh,
> +					  gdtc->wb_bg_thresh))
> +			/*
> +			 * LOCAL_THROTTLE tasks must not be throttled
> +			 * when below the per-wb freerun ceiling.
> +			 */
> +			goto free_running;
> +	}
> +
> +	ctx->dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
> +		((gdtc->dirty > gdtc->thresh) || strictlimit);
> +
> +	wb_position_ratio(gdtc);
> +	ctx->sdtc = gdtc;
> +
> +	if (mdtc) {
>  		/*
> -		 * Calculate global domain's pos_ratio and select the
> -		 * global dtc by default.
> +		 * If memcg domain is in effect, calculate its
> +		 * pos_ratio.  @wb should satisfy constraints from
> +		 * both global and memcg domains.  Choose the one
> +		 * w/ lower pos_ratio.
>  		 */
>  		if (!strictlimit) {
> -			wb_dirty_limits(gdtc);
> +			wb_dirty_limits(mdtc);
>  
>  			if ((current->flags & PF_LOCAL_THROTTLE) &&
> -			    gdtc->wb_dirty <
> -			    dirty_freerun_ceiling(gdtc->wb_thresh,
> -						  gdtc->wb_bg_thresh))
> +			    mdtc->wb_dirty <
> +			    dirty_freerun_ceiling(mdtc->wb_thresh,
> +						  mdtc->wb_bg_thresh))
>  				/*
> -				 * LOCAL_THROTTLE tasks must not be throttled
> -				 * when below the per-wb freerun ceiling.
> +				 * LOCAL_THROTTLE tasks must not be
> +				 * throttled when below the per-wb
> +				 * freerun ceiling.
>  				 */
>  				goto free_running;
>  		}
> +		ctx->dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
> +			((mdtc->dirty > mdtc->thresh) || strictlimit);
>  
> -		dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
> -			((gdtc->dirty > gdtc->thresh) || strictlimit);
> +		wb_position_ratio(mdtc);
> +		if (mdtc->pos_ratio < gdtc->pos_ratio)
> +			ctx->sdtc = mdtc;
> +	}
>  
> -		wb_position_ratio(gdtc);
> -		sdtc = gdtc;
> +	if (ctx->dirty_exceeded && !wb->dirty_exceeded)
> +		wb->dirty_exceeded = 1;
>  
> -		if (mdtc) {
> -			/*
> -			 * If memcg domain is in effect, calculate its
> -			 * pos_ratio.  @wb should satisfy constraints from
> -			 * both global and memcg domains.  Choose the one
> -			 * w/ lower pos_ratio.
> -			 */
> -			if (!strictlimit) {
> -				wb_dirty_limits(mdtc);
> -
> -				if ((current->flags & PF_LOCAL_THROTTLE) &&
> -				    mdtc->wb_dirty <
> -				    dirty_freerun_ceiling(mdtc->wb_thresh,
> -							  mdtc->wb_bg_thresh))
> -					/*
> -					 * LOCAL_THROTTLE tasks must not be
> -					 * throttled when below the per-wb
> -					 * freerun ceiling.
> -					 */
> -					goto free_running;
> -			}
> -			dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
> -				((mdtc->dirty > mdtc->thresh) || strictlimit);
> +	if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
> +				   BANDWIDTH_INTERVAL))
> +		__wb_update_bandwidth(gdtc, mdtc, true);
> +
> +	/* throttle according to the chosen dtc */
> +	ctx->dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
> +	ctx->task_ratelimit = ((u64)ctx->dirty_ratelimit * ctx->sdtc->pos_ratio) >>
> +						RATELIMIT_CALC_SHIFT;
> +	max_pause = wb_max_pause(wb, ctx->sdtc->wb_dirty);
> +	min_pause = wb_min_pause(wb, max_pause,
> +				 ctx->task_ratelimit, ctx->dirty_ratelimit,
> +				 &ctx->nr_dirtied_pause);
> +
> +	if (unlikely(ctx->task_ratelimit == 0)) {
> +		period = max_pause;
> +		ctx->pause = max_pause;
> +		goto pause;
> +	}
> +	period = HZ * pages_dirtied / ctx->task_ratelimit;
> +	ctx->pause = period;
> +	if (current->dirty_paused_when)
> +		ctx->pause -= ctx->now - current->dirty_paused_when;
> +	/*
> +	 * For less than 1s think time (ext3/4 may block the dirtier
> +	 * for up to 800ms from time to time on 1-HDD; so does xfs,
> +	 * however at much less frequency), try to compensate it in
> +	 * future periods by updating the virtual time; otherwise just
> +	 * do a reset, as it may be a light dirtier.
> +	 */
> +	if (ctx->pause < min_pause) {
> +		trace_balance_dirty_pages(wb,
> +					  ctx->sdtc->thresh,
> +					  ctx->sdtc->bg_thresh,
> +					  ctx->sdtc->dirty,
> +					  ctx->sdtc->wb_thresh,
> +					  ctx->sdtc->wb_dirty,
> +					  ctx->dirty_ratelimit,
> +					  ctx->task_ratelimit,
> +					  pages_dirtied,
> +					  period,
> +					  min(ctx->pause, 0L),
> +					  ctx->start_time);
> +		if (ctx->pause < -HZ) {
> +			current->dirty_paused_when = ctx->now;
> +			current->nr_dirtied = 0;
> +		} else if (period) {
> +			current->dirty_paused_when += period;
> +			current->nr_dirtied = 0;
> +		} else if (current->nr_dirtied_pause <= pages_dirtied)
> +			current->nr_dirtied_pause += pages_dirtied;
> +		return 0;
> +	}
> +	if (unlikely(ctx->pause > max_pause)) {
> +		/* for occasional dropped task_ratelimit */
> +		ctx->now += min(ctx->pause - max_pause, max_pause);
> +		ctx->pause = max_pause;
> +	}
>  
> -			wb_position_ratio(mdtc);
> -			if (mdtc->pos_ratio < gdtc->pos_ratio)
> -				sdtc = mdtc;
> -		}
> +pause:
> +	trace_balance_dirty_pages(wb,
> +				  ctx->sdtc->thresh,
> +				  ctx->sdtc->bg_thresh,
> +				  ctx->sdtc->dirty,
> +				  ctx->sdtc->wb_thresh,
> +				  ctx->sdtc->wb_dirty,
> +				  ctx->dirty_ratelimit,
> +				  ctx->task_ratelimit,
> +				  pages_dirtied,
> +				  period,
> +				  ctx->pause,
> +				  ctx->start_time);
> +
> +	return ctx->pause;
> +}
>  
> -		if (dirty_exceeded && !wb->dirty_exceeded)
> -			wb->dirty_exceeded = 1;
> -
> -		if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
> -					   BANDWIDTH_INTERVAL))
> -			__wb_update_bandwidth(gdtc, mdtc, true);
> -
> -		/* throttle according to the chosen dtc */
> -		dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
> -		task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
> -							RATELIMIT_CALC_SHIFT;
> -		max_pause = wb_max_pause(wb, sdtc->wb_dirty);
> -		min_pause = wb_min_pause(wb, max_pause,
> -					 task_ratelimit, dirty_ratelimit,
> -					 &nr_dirtied_pause);
> -
> -		if (unlikely(task_ratelimit == 0)) {
> -			period = max_pause;
> -			pause = max_pause;
> -			goto pause;
> -		}
> -		period = HZ * pages_dirtied / task_ratelimit;
> -		pause = period;
> -		if (current->dirty_paused_when)
> -			pause -= now - current->dirty_paused_when;
> -		/*
> -		 * For less than 1s think time (ext3/4 may block the dirtier
> -		 * for up to 800ms from time to time on 1-HDD; so does xfs,
> -		 * however at much less frequency), try to compensate it in
> -		 * future periods by updating the virtual time; otherwise just
> -		 * do a reset, as it may be a light dirtier.
> -		 */
> -		if (pause < min_pause) {
> -			trace_balance_dirty_pages(wb,
> -						  sdtc->thresh,
> -						  sdtc->bg_thresh,
> -						  sdtc->dirty,
> -						  sdtc->wb_thresh,
> -						  sdtc->wb_dirty,
> -						  dirty_ratelimit,
> -						  task_ratelimit,
> -						  pages_dirtied,
> -						  period,
> -						  min(pause, 0L),
> -						  start_time);
> -			if (pause < -HZ) {
> -				current->dirty_paused_when = now;
> -				current->nr_dirtied = 0;
> -			} else if (period) {
> -				current->dirty_paused_when += period;
> -				current->nr_dirtied = 0;
> -			} else if (current->nr_dirtied_pause <= pages_dirtied)
> -				current->nr_dirtied_pause += pages_dirtied;
> +/*
> + * balance_dirty_pages() must be called by processes which are generating dirty
> + * data.  It looks at the number of dirty pages in the machine and will force
> + * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
> + * If we're over `background_thresh' then the writeback threads are woken to
> + * perform some writeout.
> + */
> +static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long pages_dirtied)
> +{
> +	int ret;
> +	struct bdp_ctx ctx = { BDP_CTX_INIT(ctx, wb) };
> +
> +	for (;;) {
> +		ret = _balance_dirty_pages(wb, current->nr_dirtied, &ctx);
> +		if (!ret)
>  			break;
> -		}
> -		if (unlikely(pause > max_pause)) {
> -			/* for occasional dropped task_ratelimit */
> -			now += min(pause - max_pause, max_pause);
> -			pause = max_pause;
> -		}
>  
> -pause:
> -		trace_balance_dirty_pages(wb,
> -					  sdtc->thresh,
> -					  sdtc->bg_thresh,
> -					  sdtc->dirty,
> -					  sdtc->wb_thresh,
> -					  sdtc->wb_dirty,
> -					  dirty_ratelimit,
> -					  task_ratelimit,
> -					  pages_dirtied,
> -					  period,
> -					  pause,
> -					  start_time);
>  		__set_current_state(TASK_KILLABLE);
> -		wb->dirty_sleep = now;
> -		io_schedule_timeout(pause);
> +		wb->dirty_sleep = ctx.now;
> +		io_schedule_timeout(ctx.pause);
>  
> -		current->dirty_paused_when = now + pause;
> +		current->dirty_paused_when = ctx.now + ctx.pause;
>  		current->nr_dirtied = 0;
> -		current->nr_dirtied_pause = nr_dirtied_pause;
> +		current->nr_dirtied_pause = ctx.nr_dirtied_pause;
>  
>  		/*
>  		 * This is typically equal to (dirty < thresh) and can also
>  		 * keep "1000+ dd on a slow USB stick" under control.
>  		 */
> -		if (task_ratelimit)
> +		if (ctx.task_ratelimit)
>  			break;
>  
>  		/*
> @@ -1805,14 +1831,14 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
>  		 * more page. However wb_dirty has accounting errors.  So use
>  		 * the larger and more IO friendly wb_stat_error.
>  		 */
> -		if (sdtc->wb_dirty <= wb_stat_error())
> +		if (ctx.sdtc->wb_dirty <= wb_stat_error())
>  			break;
>  
>  		if (fatal_signal_pending(current))
>  			break;
>  	}
>  
> -	if (!dirty_exceeded && wb->dirty_exceeded)
> +	if (!ctx.dirty_exceeded && wb->dirty_exceeded)
>  		wb->dirty_exceeded = 0;
>  
>  	if (writeback_in_progress(wb))
> @@ -1829,7 +1855,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
>  	if (laptop_mode)
>  		return;
>  
> -	if (nr_reclaimable > gdtc->bg_thresh)
> +	if (ctx.nr_reclaimable > ctx.gdtc_stor.bg_thresh)
>  		wb_start_background_writeback(wb);
>  }
>  
> -- 
> 2.30.2
>
Stefan Roesch May 18, 2022, 11:31 p.m. UTC | #2
On 5/18/22 4:07 AM, Jan Kara wrote:
> On Mon 16-05-22 09:47:14, Stefan Roesch wrote:
>> This factors out the for loop in balance_dirty_pages() into a new
>> function called _balance_dirty_pages(). By factoring out this function
>> the async write code can determine if it has to wait to throttle writes
>> or not. The function _balance_dirty_pages() returns the sleep time.
>> If the sleep time is greater 0, then the async write code needs to throttle.
>>
>> To maintain the context for consecutive calls of _balance_dirty_pages()
>> and maintain the current behavior a new data structure called bdp_ctx
>> has been introduced.
>>
>> Signed-off-by: Stefan Roesch <shr@fb.com>
> 
> ...
> 
>> ---
>>  mm/page-writeback.c | 452 +++++++++++++++++++++++---------------------
>>  1 file changed, 239 insertions(+), 213 deletions(-)
>>
>> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
>> index 7e2da284e427..cbb74c0666c6 100644
>> --- a/mm/page-writeback.c
>> +++ b/mm/page-writeback.c
>> @@ -140,6 +140,29 @@ struct dirty_throttle_control {
>>  	unsigned long		pos_ratio;
>>  };
>>  
>> +/* context for consecutive calls to _balance_dirty_pages() */
>> +struct bdp_ctx {
>> +	long			pause;
>> +	unsigned long		now;
>> +	unsigned long		start_time;
>> +	unsigned long		task_ratelimit;
>> +	unsigned long		dirty_ratelimit;
>> +	unsigned long		nr_reclaimable;
>> +	int			nr_dirtied_pause;
>> +	bool			dirty_exceeded;
>> +
>> +	struct dirty_throttle_control gdtc_stor;
>> +	struct dirty_throttle_control mdtc_stor;
>> +	struct dirty_throttle_control *sdtc;
>> +} bdp_ctx;
> 
> Looking at how much context you propagate into _balance_dirty_pages() I
> don't think this suggestion was as great as I've hoped. I'm sorry for that.
> We could actually significantly reduce the amount of context passed in/out
> but some things would be difficult to get rid of and some interactions of
> code in _balance_dirty_pages() and the caller are actually pretty subtle.
> 
> I think something like attached three patches should make things NOWAIT
> support in balance_dirty_pages() reasonably readable.
> 

Thanks a lot Jan for the three patches. I integrated them in the patch series
and changed the callers accordingly.

> 								Honza
> 
>> +
>> +/* initialize _balance_dirty_pages() context */
>> +#define BDP_CTX_INIT(ctx, wb)				\
>> +	.gdtc_stor = { GDTC_INIT(wb) },			\
>> +	.mdtc_stor = { MDTC_INIT(wb, &ctx.gdtc_stor) },	\
>> +	.start_time = jiffies,				\
>> +	.dirty_exceeded = false
>> +
>>  /*
>>   * Length of period for aging writeout fractions of bdis. This is an
>>   * arbitrarily chosen number. The longer the period, the slower fractions will
>> @@ -1538,261 +1561,264 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
>>  	}
>>  }
>>  
>> -/*
>> - * balance_dirty_pages() must be called by processes which are generating dirty
>> - * data.  It looks at the number of dirty pages in the machine and will force
>> - * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
>> - * If we're over `background_thresh' then the writeback threads are woken to
>> - * perform some writeout.
>> - */
>> -static void balance_dirty_pages(struct bdi_writeback *wb,
>> -				unsigned long pages_dirtied)
>> +static inline int _balance_dirty_pages(struct bdi_writeback *wb,
>> +					unsigned long pages_dirtied, struct bdp_ctx *ctx)
>>  {
>> -	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
>> -	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
>> -	struct dirty_throttle_control * const gdtc = &gdtc_stor;
>> -	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
>> -						     &mdtc_stor : NULL;
>> -	struct dirty_throttle_control *sdtc;
>> -	unsigned long nr_reclaimable;	/* = file_dirty */
>> +	struct dirty_throttle_control * const gdtc = &ctx->gdtc_stor;
>> +	struct dirty_throttle_control * const mdtc = mdtc_valid(&ctx->mdtc_stor) ?
>> +						     &ctx->mdtc_stor : NULL;
>>  	long period;
>> -	long pause;
>>  	long max_pause;
>>  	long min_pause;
>> -	int nr_dirtied_pause;
>> -	bool dirty_exceeded = false;
>> -	unsigned long task_ratelimit;
>> -	unsigned long dirty_ratelimit;
>>  	struct backing_dev_info *bdi = wb->bdi;
>>  	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
>> -	unsigned long start_time = jiffies;
>>  
>> -	for (;;) {
>> -		unsigned long now = jiffies;
>> -		unsigned long dirty, thresh, bg_thresh;
>> -		unsigned long m_dirty = 0;	/* stop bogus uninit warnings */
>> -		unsigned long m_thresh = 0;
>> -		unsigned long m_bg_thresh = 0;
>> -
>> -		nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
>> -		gdtc->avail = global_dirtyable_memory();
>> -		gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
>> +	unsigned long dirty, thresh, bg_thresh;
>> +	unsigned long m_dirty = 0;	/* stop bogus uninit warnings */
>> +	unsigned long m_thresh = 0;
>> +	unsigned long m_bg_thresh = 0;
>>  
>> -		domain_dirty_limits(gdtc);
>> +	ctx->now = jiffies;
>> +	ctx->nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
>> +	gdtc->avail = global_dirtyable_memory();
>> +	gdtc->dirty = ctx->nr_reclaimable + global_node_page_state(NR_WRITEBACK);
>>  
>> -		if (unlikely(strictlimit)) {
>> -			wb_dirty_limits(gdtc);
>> +	domain_dirty_limits(gdtc);
>>  
>> -			dirty = gdtc->wb_dirty;
>> -			thresh = gdtc->wb_thresh;
>> -			bg_thresh = gdtc->wb_bg_thresh;
>> -		} else {
>> -			dirty = gdtc->dirty;
>> -			thresh = gdtc->thresh;
>> -			bg_thresh = gdtc->bg_thresh;
>> -		}
>> +	if (unlikely(strictlimit)) {
>> +		wb_dirty_limits(gdtc);
>>  
>> -		if (mdtc) {
>> -			unsigned long filepages, headroom, writeback;
>> +		dirty = gdtc->wb_dirty;
>> +		thresh = gdtc->wb_thresh;
>> +		bg_thresh = gdtc->wb_bg_thresh;
>> +	} else {
>> +		dirty = gdtc->dirty;
>> +		thresh = gdtc->thresh;
>> +		bg_thresh = gdtc->bg_thresh;
>> +	}
>>  
>> -			/*
>> -			 * If @wb belongs to !root memcg, repeat the same
>> -			 * basic calculations for the memcg domain.
>> -			 */
>> -			mem_cgroup_wb_stats(wb, &filepages, &headroom,
>> -					    &mdtc->dirty, &writeback);
>> -			mdtc->dirty += writeback;
>> -			mdtc_calc_avail(mdtc, filepages, headroom);
>> -
>> -			domain_dirty_limits(mdtc);
>> -
>> -			if (unlikely(strictlimit)) {
>> -				wb_dirty_limits(mdtc);
>> -				m_dirty = mdtc->wb_dirty;
>> -				m_thresh = mdtc->wb_thresh;
>> -				m_bg_thresh = mdtc->wb_bg_thresh;
>> -			} else {
>> -				m_dirty = mdtc->dirty;
>> -				m_thresh = mdtc->thresh;
>> -				m_bg_thresh = mdtc->bg_thresh;
>> -			}
>> -		}
>> +	if (mdtc) {
>> +		unsigned long filepages, headroom, writeback;
>>  
>>  		/*
>> -		 * Throttle it only when the background writeback cannot
>> -		 * catch-up. This avoids (excessively) small writeouts
>> -		 * when the wb limits are ramping up in case of !strictlimit.
>> -		 *
>> -		 * In strictlimit case make decision based on the wb counters
>> -		 * and limits. Small writeouts when the wb limits are ramping
>> -		 * up are the price we consciously pay for strictlimit-ing.
>> -		 *
>> -		 * If memcg domain is in effect, @dirty should be under
>> -		 * both global and memcg freerun ceilings.
>> +		 * If @wb belongs to !root memcg, repeat the same
>> +		 * basic calculations for the memcg domain.
>>  		 */
>> -		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
>> -		    (!mdtc ||
>> -		     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
>> -			unsigned long intv;
>> -			unsigned long m_intv;
>> +		mem_cgroup_wb_stats(wb, &filepages, &headroom,
>> +				    &mdtc->dirty, &writeback);
>> +		mdtc->dirty += writeback;
>> +		mdtc_calc_avail(mdtc, filepages, headroom);
>>  
>> -free_running:
>> -			intv = dirty_poll_interval(dirty, thresh);
>> -			m_intv = ULONG_MAX;
>> +		domain_dirty_limits(mdtc);
>>  
>> -			current->dirty_paused_when = now;
>> -			current->nr_dirtied = 0;
>> -			if (mdtc)
>> -				m_intv = dirty_poll_interval(m_dirty, m_thresh);
>> -			current->nr_dirtied_pause = min(intv, m_intv);
>> -			break;
>> +		if (unlikely(strictlimit)) {
>> +			wb_dirty_limits(mdtc);
>> +			m_dirty = mdtc->wb_dirty;
>> +			m_thresh = mdtc->wb_thresh;
>> +			m_bg_thresh = mdtc->wb_bg_thresh;
>> +		} else {
>> +			m_dirty = mdtc->dirty;
>> +			m_thresh = mdtc->thresh;
>> +			m_bg_thresh = mdtc->bg_thresh;
>>  		}
>> +	}
>>  
>> -		if (unlikely(!writeback_in_progress(wb)))
>> -			wb_start_background_writeback(wb);
>> +	/*
>> +	 * Throttle it only when the background writeback cannot
>> +	 * catch-up. This avoids (excessively) small writeouts
>> +	 * when the wb limits are ramping up in case of !strictlimit.
>> +	 *
>> +	 * In strictlimit case make decision based on the wb counters
>> +	 * and limits. Small writeouts when the wb limits are ramping
>> +	 * up are the price we consciously pay for strictlimit-ing.
>> +	 *
>> +	 * If memcg domain is in effect, @dirty should be under
>> +	 * both global and memcg freerun ceilings.
>> +	 */
>> +	if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
>> +	    (!mdtc ||
>> +	     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
>> +		unsigned long intv;
>> +		unsigned long m_intv;
>>  
>> -		mem_cgroup_flush_foreign(wb);
>> +free_running:
>> +		intv = dirty_poll_interval(dirty, thresh);
>> +		m_intv = ULONG_MAX;
>> +
>> +		current->dirty_paused_when = ctx->now;
>> +		current->nr_dirtied = 0;
>> +		if (mdtc)
>> +			m_intv = dirty_poll_interval(m_dirty, m_thresh);
>> +		current->nr_dirtied_pause = min(intv, m_intv);
>> +		return 0;
>> +	}
>> +
>> +	if (unlikely(!writeback_in_progress(wb)))
>> +		wb_start_background_writeback(wb);
>>  
>> +	mem_cgroup_flush_foreign(wb);
>> +
>> +	/*
>> +	 * Calculate global domain's pos_ratio and select the
>> +	 * global dtc by default.
>> +	 */
>> +	if (!strictlimit) {
>> +		wb_dirty_limits(gdtc);
>> +
>> +		if ((current->flags & PF_LOCAL_THROTTLE) &&
>> +		    gdtc->wb_dirty <
>> +		    dirty_freerun_ceiling(gdtc->wb_thresh,
>> +					  gdtc->wb_bg_thresh))
>> +			/*
>> +			 * LOCAL_THROTTLE tasks must not be throttled
>> +			 * when below the per-wb freerun ceiling.
>> +			 */
>> +			goto free_running;
>> +	}
>> +
>> +	ctx->dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
>> +		((gdtc->dirty > gdtc->thresh) || strictlimit);
>> +
>> +	wb_position_ratio(gdtc);
>> +	ctx->sdtc = gdtc;
>> +
>> +	if (mdtc) {
>>  		/*
>> -		 * Calculate global domain's pos_ratio and select the
>> -		 * global dtc by default.
>> +		 * If memcg domain is in effect, calculate its
>> +		 * pos_ratio.  @wb should satisfy constraints from
>> +		 * both global and memcg domains.  Choose the one
>> +		 * w/ lower pos_ratio.
>>  		 */
>>  		if (!strictlimit) {
>> -			wb_dirty_limits(gdtc);
>> +			wb_dirty_limits(mdtc);
>>  
>>  			if ((current->flags & PF_LOCAL_THROTTLE) &&
>> -			    gdtc->wb_dirty <
>> -			    dirty_freerun_ceiling(gdtc->wb_thresh,
>> -						  gdtc->wb_bg_thresh))
>> +			    mdtc->wb_dirty <
>> +			    dirty_freerun_ceiling(mdtc->wb_thresh,
>> +						  mdtc->wb_bg_thresh))
>>  				/*
>> -				 * LOCAL_THROTTLE tasks must not be throttled
>> -				 * when below the per-wb freerun ceiling.
>> +				 * LOCAL_THROTTLE tasks must not be
>> +				 * throttled when below the per-wb
>> +				 * freerun ceiling.
>>  				 */
>>  				goto free_running;
>>  		}
>> +		ctx->dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
>> +			((mdtc->dirty > mdtc->thresh) || strictlimit);
>>  
>> -		dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
>> -			((gdtc->dirty > gdtc->thresh) || strictlimit);
>> +		wb_position_ratio(mdtc);
>> +		if (mdtc->pos_ratio < gdtc->pos_ratio)
>> +			ctx->sdtc = mdtc;
>> +	}
>>  
>> -		wb_position_ratio(gdtc);
>> -		sdtc = gdtc;
>> +	if (ctx->dirty_exceeded && !wb->dirty_exceeded)
>> +		wb->dirty_exceeded = 1;
>>  
>> -		if (mdtc) {
>> -			/*
>> -			 * If memcg domain is in effect, calculate its
>> -			 * pos_ratio.  @wb should satisfy constraints from
>> -			 * both global and memcg domains.  Choose the one
>> -			 * w/ lower pos_ratio.
>> -			 */
>> -			if (!strictlimit) {
>> -				wb_dirty_limits(mdtc);
>> -
>> -				if ((current->flags & PF_LOCAL_THROTTLE) &&
>> -				    mdtc->wb_dirty <
>> -				    dirty_freerun_ceiling(mdtc->wb_thresh,
>> -							  mdtc->wb_bg_thresh))
>> -					/*
>> -					 * LOCAL_THROTTLE tasks must not be
>> -					 * throttled when below the per-wb
>> -					 * freerun ceiling.
>> -					 */
>> -					goto free_running;
>> -			}
>> -			dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
>> -				((mdtc->dirty > mdtc->thresh) || strictlimit);
>> +	if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
>> +				   BANDWIDTH_INTERVAL))
>> +		__wb_update_bandwidth(gdtc, mdtc, true);
>> +
>> +	/* throttle according to the chosen dtc */
>> +	ctx->dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
>> +	ctx->task_ratelimit = ((u64)ctx->dirty_ratelimit * ctx->sdtc->pos_ratio) >>
>> +						RATELIMIT_CALC_SHIFT;
>> +	max_pause = wb_max_pause(wb, ctx->sdtc->wb_dirty);
>> +	min_pause = wb_min_pause(wb, max_pause,
>> +				 ctx->task_ratelimit, ctx->dirty_ratelimit,
>> +				 &ctx->nr_dirtied_pause);
>> +
>> +	if (unlikely(ctx->task_ratelimit == 0)) {
>> +		period = max_pause;
>> +		ctx->pause = max_pause;
>> +		goto pause;
>> +	}
>> +	period = HZ * pages_dirtied / ctx->task_ratelimit;
>> +	ctx->pause = period;
>> +	if (current->dirty_paused_when)
>> +		ctx->pause -= ctx->now - current->dirty_paused_when;
>> +	/*
>> +	 * For less than 1s think time (ext3/4 may block the dirtier
>> +	 * for up to 800ms from time to time on 1-HDD; so does xfs,
>> +	 * however at much less frequency), try to compensate it in
>> +	 * future periods by updating the virtual time; otherwise just
>> +	 * do a reset, as it may be a light dirtier.
>> +	 */
>> +	if (ctx->pause < min_pause) {
>> +		trace_balance_dirty_pages(wb,
>> +					  ctx->sdtc->thresh,
>> +					  ctx->sdtc->bg_thresh,
>> +					  ctx->sdtc->dirty,
>> +					  ctx->sdtc->wb_thresh,
>> +					  ctx->sdtc->wb_dirty,
>> +					  ctx->dirty_ratelimit,
>> +					  ctx->task_ratelimit,
>> +					  pages_dirtied,
>> +					  period,
>> +					  min(ctx->pause, 0L),
>> +					  ctx->start_time);
>> +		if (ctx->pause < -HZ) {
>> +			current->dirty_paused_when = ctx->now;
>> +			current->nr_dirtied = 0;
>> +		} else if (period) {
>> +			current->dirty_paused_when += period;
>> +			current->nr_dirtied = 0;
>> +		} else if (current->nr_dirtied_pause <= pages_dirtied)
>> +			current->nr_dirtied_pause += pages_dirtied;
>> +		return 0;
>> +	}
>> +	if (unlikely(ctx->pause > max_pause)) {
>> +		/* for occasional dropped task_ratelimit */
>> +		ctx->now += min(ctx->pause - max_pause, max_pause);
>> +		ctx->pause = max_pause;
>> +	}
>>  
>> -			wb_position_ratio(mdtc);
>> -			if (mdtc->pos_ratio < gdtc->pos_ratio)
>> -				sdtc = mdtc;
>> -		}
>> +pause:
>> +	trace_balance_dirty_pages(wb,
>> +				  ctx->sdtc->thresh,
>> +				  ctx->sdtc->bg_thresh,
>> +				  ctx->sdtc->dirty,
>> +				  ctx->sdtc->wb_thresh,
>> +				  ctx->sdtc->wb_dirty,
>> +				  ctx->dirty_ratelimit,
>> +				  ctx->task_ratelimit,
>> +				  pages_dirtied,
>> +				  period,
>> +				  ctx->pause,
>> +				  ctx->start_time);
>> +
>> +	return ctx->pause;
>> +}
>>  
>> -		if (dirty_exceeded && !wb->dirty_exceeded)
>> -			wb->dirty_exceeded = 1;
>> -
>> -		if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
>> -					   BANDWIDTH_INTERVAL))
>> -			__wb_update_bandwidth(gdtc, mdtc, true);
>> -
>> -		/* throttle according to the chosen dtc */
>> -		dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
>> -		task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
>> -							RATELIMIT_CALC_SHIFT;
>> -		max_pause = wb_max_pause(wb, sdtc->wb_dirty);
>> -		min_pause = wb_min_pause(wb, max_pause,
>> -					 task_ratelimit, dirty_ratelimit,
>> -					 &nr_dirtied_pause);
>> -
>> -		if (unlikely(task_ratelimit == 0)) {
>> -			period = max_pause;
>> -			pause = max_pause;
>> -			goto pause;
>> -		}
>> -		period = HZ * pages_dirtied / task_ratelimit;
>> -		pause = period;
>> -		if (current->dirty_paused_when)
>> -			pause -= now - current->dirty_paused_when;
>> -		/*
>> -		 * For less than 1s think time (ext3/4 may block the dirtier
>> -		 * for up to 800ms from time to time on 1-HDD; so does xfs,
>> -		 * however at much less frequency), try to compensate it in
>> -		 * future periods by updating the virtual time; otherwise just
>> -		 * do a reset, as it may be a light dirtier.
>> -		 */
>> -		if (pause < min_pause) {
>> -			trace_balance_dirty_pages(wb,
>> -						  sdtc->thresh,
>> -						  sdtc->bg_thresh,
>> -						  sdtc->dirty,
>> -						  sdtc->wb_thresh,
>> -						  sdtc->wb_dirty,
>> -						  dirty_ratelimit,
>> -						  task_ratelimit,
>> -						  pages_dirtied,
>> -						  period,
>> -						  min(pause, 0L),
>> -						  start_time);
>> -			if (pause < -HZ) {
>> -				current->dirty_paused_when = now;
>> -				current->nr_dirtied = 0;
>> -			} else if (period) {
>> -				current->dirty_paused_when += period;
>> -				current->nr_dirtied = 0;
>> -			} else if (current->nr_dirtied_pause <= pages_dirtied)
>> -				current->nr_dirtied_pause += pages_dirtied;
>> +/*
>> + * balance_dirty_pages() must be called by processes which are generating dirty
>> + * data.  It looks at the number of dirty pages in the machine and will force
>> + * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
>> + * If we're over `background_thresh' then the writeback threads are woken to
>> + * perform some writeout.
>> + */
>> +static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long pages_dirtied)
>> +{
>> +	int ret;
>> +	struct bdp_ctx ctx = { BDP_CTX_INIT(ctx, wb) };
>> +
>> +	for (;;) {
>> +		ret = _balance_dirty_pages(wb, current->nr_dirtied, &ctx);
>> +		if (!ret)
>>  			break;
>> -		}
>> -		if (unlikely(pause > max_pause)) {
>> -			/* for occasional dropped task_ratelimit */
>> -			now += min(pause - max_pause, max_pause);
>> -			pause = max_pause;
>> -		}
>>  
>> -pause:
>> -		trace_balance_dirty_pages(wb,
>> -					  sdtc->thresh,
>> -					  sdtc->bg_thresh,
>> -					  sdtc->dirty,
>> -					  sdtc->wb_thresh,
>> -					  sdtc->wb_dirty,
>> -					  dirty_ratelimit,
>> -					  task_ratelimit,
>> -					  pages_dirtied,
>> -					  period,
>> -					  pause,
>> -					  start_time);
>>  		__set_current_state(TASK_KILLABLE);
>> -		wb->dirty_sleep = now;
>> -		io_schedule_timeout(pause);
>> +		wb->dirty_sleep = ctx.now;
>> +		io_schedule_timeout(ctx.pause);
>>  
>> -		current->dirty_paused_when = now + pause;
>> +		current->dirty_paused_when = ctx.now + ctx.pause;
>>  		current->nr_dirtied = 0;
>> -		current->nr_dirtied_pause = nr_dirtied_pause;
>> +		current->nr_dirtied_pause = ctx.nr_dirtied_pause;
>>  
>>  		/*
>>  		 * This is typically equal to (dirty < thresh) and can also
>>  		 * keep "1000+ dd on a slow USB stick" under control.
>>  		 */
>> -		if (task_ratelimit)
>> +		if (ctx.task_ratelimit)
>>  			break;
>>  
>>  		/*
>> @@ -1805,14 +1831,14 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
>>  		 * more page. However wb_dirty has accounting errors.  So use
>>  		 * the larger and more IO friendly wb_stat_error.
>>  		 */
>> -		if (sdtc->wb_dirty <= wb_stat_error())
>> +		if (ctx.sdtc->wb_dirty <= wb_stat_error())
>>  			break;
>>  
>>  		if (fatal_signal_pending(current))
>>  			break;
>>  	}
>>  
>> -	if (!dirty_exceeded && wb->dirty_exceeded)
>> +	if (!ctx.dirty_exceeded && wb->dirty_exceeded)
>>  		wb->dirty_exceeded = 0;
>>  
>>  	if (writeback_in_progress(wb))
>> @@ -1829,7 +1855,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
>>  	if (laptop_mode)
>>  		return;
>>  
>> -	if (nr_reclaimable > gdtc->bg_thresh)
>> +	if (ctx.nr_reclaimable > ctx.gdtc_stor.bg_thresh)
>>  		wb_start_background_writeback(wb);
>>  }
>>  
>> -- 
>> 2.30.2
>>
diff mbox series

Patch

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7e2da284e427..cbb74c0666c6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -140,6 +140,29 @@  struct dirty_throttle_control {
 	unsigned long		pos_ratio;
 };
 
+/* context for consecutive calls to _balance_dirty_pages() */
+struct bdp_ctx {
+	long			pause;
+	unsigned long		now;
+	unsigned long		start_time;
+	unsigned long		task_ratelimit;
+	unsigned long		dirty_ratelimit;
+	unsigned long		nr_reclaimable;
+	int			nr_dirtied_pause;
+	bool			dirty_exceeded;
+
+	struct dirty_throttle_control gdtc_stor;
+	struct dirty_throttle_control mdtc_stor;
+	struct dirty_throttle_control *sdtc;
+} bdp_ctx;
+
+/* initialize _balance_dirty_pages() context */
+#define BDP_CTX_INIT(ctx, wb)				\
+	.gdtc_stor = { GDTC_INIT(wb) },			\
+	.mdtc_stor = { MDTC_INIT(wb, &ctx.gdtc_stor) },	\
+	.start_time = jiffies,				\
+	.dirty_exceeded = false
+
 /*
  * Length of period for aging writeout fractions of bdis. This is an
  * arbitrarily chosen number. The longer the period, the slower fractions will
@@ -1538,261 +1561,264 @@  static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
 	}
 }
 
-/*
- * balance_dirty_pages() must be called by processes which are generating dirty
- * data.  It looks at the number of dirty pages in the machine and will force
- * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
- * If we're over `background_thresh' then the writeback threads are woken to
- * perform some writeout.
- */
-static void balance_dirty_pages(struct bdi_writeback *wb,
-				unsigned long pages_dirtied)
+static inline int _balance_dirty_pages(struct bdi_writeback *wb,
+					unsigned long pages_dirtied, struct bdp_ctx *ctx)
 {
-	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
-	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
-	struct dirty_throttle_control * const gdtc = &gdtc_stor;
-	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
-						     &mdtc_stor : NULL;
-	struct dirty_throttle_control *sdtc;
-	unsigned long nr_reclaimable;	/* = file_dirty */
+	struct dirty_throttle_control * const gdtc = &ctx->gdtc_stor;
+	struct dirty_throttle_control * const mdtc = mdtc_valid(&ctx->mdtc_stor) ?
+						     &ctx->mdtc_stor : NULL;
 	long period;
-	long pause;
 	long max_pause;
 	long min_pause;
-	int nr_dirtied_pause;
-	bool dirty_exceeded = false;
-	unsigned long task_ratelimit;
-	unsigned long dirty_ratelimit;
 	struct backing_dev_info *bdi = wb->bdi;
 	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
-	unsigned long start_time = jiffies;
 
-	for (;;) {
-		unsigned long now = jiffies;
-		unsigned long dirty, thresh, bg_thresh;
-		unsigned long m_dirty = 0;	/* stop bogus uninit warnings */
-		unsigned long m_thresh = 0;
-		unsigned long m_bg_thresh = 0;
-
-		nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
-		gdtc->avail = global_dirtyable_memory();
-		gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
+	unsigned long dirty, thresh, bg_thresh;
+	unsigned long m_dirty = 0;	/* stop bogus uninit warnings */
+	unsigned long m_thresh = 0;
+	unsigned long m_bg_thresh = 0;
 
-		domain_dirty_limits(gdtc);
+	ctx->now = jiffies;
+	ctx->nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
+	gdtc->avail = global_dirtyable_memory();
+	gdtc->dirty = ctx->nr_reclaimable + global_node_page_state(NR_WRITEBACK);
 
-		if (unlikely(strictlimit)) {
-			wb_dirty_limits(gdtc);
+	domain_dirty_limits(gdtc);
 
-			dirty = gdtc->wb_dirty;
-			thresh = gdtc->wb_thresh;
-			bg_thresh = gdtc->wb_bg_thresh;
-		} else {
-			dirty = gdtc->dirty;
-			thresh = gdtc->thresh;
-			bg_thresh = gdtc->bg_thresh;
-		}
+	if (unlikely(strictlimit)) {
+		wb_dirty_limits(gdtc);
 
-		if (mdtc) {
-			unsigned long filepages, headroom, writeback;
+		dirty = gdtc->wb_dirty;
+		thresh = gdtc->wb_thresh;
+		bg_thresh = gdtc->wb_bg_thresh;
+	} else {
+		dirty = gdtc->dirty;
+		thresh = gdtc->thresh;
+		bg_thresh = gdtc->bg_thresh;
+	}
 
-			/*
-			 * If @wb belongs to !root memcg, repeat the same
-			 * basic calculations for the memcg domain.
-			 */
-			mem_cgroup_wb_stats(wb, &filepages, &headroom,
-					    &mdtc->dirty, &writeback);
-			mdtc->dirty += writeback;
-			mdtc_calc_avail(mdtc, filepages, headroom);
-
-			domain_dirty_limits(mdtc);
-
-			if (unlikely(strictlimit)) {
-				wb_dirty_limits(mdtc);
-				m_dirty = mdtc->wb_dirty;
-				m_thresh = mdtc->wb_thresh;
-				m_bg_thresh = mdtc->wb_bg_thresh;
-			} else {
-				m_dirty = mdtc->dirty;
-				m_thresh = mdtc->thresh;
-				m_bg_thresh = mdtc->bg_thresh;
-			}
-		}
+	if (mdtc) {
+		unsigned long filepages, headroom, writeback;
 
 		/*
-		 * Throttle it only when the background writeback cannot
-		 * catch-up. This avoids (excessively) small writeouts
-		 * when the wb limits are ramping up in case of !strictlimit.
-		 *
-		 * In strictlimit case make decision based on the wb counters
-		 * and limits. Small writeouts when the wb limits are ramping
-		 * up are the price we consciously pay for strictlimit-ing.
-		 *
-		 * If memcg domain is in effect, @dirty should be under
-		 * both global and memcg freerun ceilings.
+		 * If @wb belongs to !root memcg, repeat the same
+		 * basic calculations for the memcg domain.
 		 */
-		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
-		    (!mdtc ||
-		     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
-			unsigned long intv;
-			unsigned long m_intv;
+		mem_cgroup_wb_stats(wb, &filepages, &headroom,
+				    &mdtc->dirty, &writeback);
+		mdtc->dirty += writeback;
+		mdtc_calc_avail(mdtc, filepages, headroom);
 
-free_running:
-			intv = dirty_poll_interval(dirty, thresh);
-			m_intv = ULONG_MAX;
+		domain_dirty_limits(mdtc);
 
-			current->dirty_paused_when = now;
-			current->nr_dirtied = 0;
-			if (mdtc)
-				m_intv = dirty_poll_interval(m_dirty, m_thresh);
-			current->nr_dirtied_pause = min(intv, m_intv);
-			break;
+		if (unlikely(strictlimit)) {
+			wb_dirty_limits(mdtc);
+			m_dirty = mdtc->wb_dirty;
+			m_thresh = mdtc->wb_thresh;
+			m_bg_thresh = mdtc->wb_bg_thresh;
+		} else {
+			m_dirty = mdtc->dirty;
+			m_thresh = mdtc->thresh;
+			m_bg_thresh = mdtc->bg_thresh;
 		}
+	}
 
-		if (unlikely(!writeback_in_progress(wb)))
-			wb_start_background_writeback(wb);
+	/*
+	 * Throttle it only when the background writeback cannot
+	 * catch-up. This avoids (excessively) small writeouts
+	 * when the wb limits are ramping up in case of !strictlimit.
+	 *
+	 * In strictlimit case make decision based on the wb counters
+	 * and limits. Small writeouts when the wb limits are ramping
+	 * up are the price we consciously pay for strictlimit-ing.
+	 *
+	 * If memcg domain is in effect, @dirty should be under
+	 * both global and memcg freerun ceilings.
+	 */
+	if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
+	    (!mdtc ||
+	     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
+		unsigned long intv;
+		unsigned long m_intv;
 
-		mem_cgroup_flush_foreign(wb);
+free_running:
+		intv = dirty_poll_interval(dirty, thresh);
+		m_intv = ULONG_MAX;
+
+		current->dirty_paused_when = ctx->now;
+		current->nr_dirtied = 0;
+		if (mdtc)
+			m_intv = dirty_poll_interval(m_dirty, m_thresh);
+		current->nr_dirtied_pause = min(intv, m_intv);
+		return 0;
+	}
+
+	if (unlikely(!writeback_in_progress(wb)))
+		wb_start_background_writeback(wb);
 
+	mem_cgroup_flush_foreign(wb);
+
+	/*
+	 * Calculate global domain's pos_ratio and select the
+	 * global dtc by default.
+	 */
+	if (!strictlimit) {
+		wb_dirty_limits(gdtc);
+
+		if ((current->flags & PF_LOCAL_THROTTLE) &&
+		    gdtc->wb_dirty <
+		    dirty_freerun_ceiling(gdtc->wb_thresh,
+					  gdtc->wb_bg_thresh))
+			/*
+			 * LOCAL_THROTTLE tasks must not be throttled
+			 * when below the per-wb freerun ceiling.
+			 */
+			goto free_running;
+	}
+
+	ctx->dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
+		((gdtc->dirty > gdtc->thresh) || strictlimit);
+
+	wb_position_ratio(gdtc);
+	ctx->sdtc = gdtc;
+
+	if (mdtc) {
 		/*
-		 * Calculate global domain's pos_ratio and select the
-		 * global dtc by default.
+		 * If memcg domain is in effect, calculate its
+		 * pos_ratio.  @wb should satisfy constraints from
+		 * both global and memcg domains.  Choose the one
+		 * w/ lower pos_ratio.
 		 */
 		if (!strictlimit) {
-			wb_dirty_limits(gdtc);
+			wb_dirty_limits(mdtc);
 
 			if ((current->flags & PF_LOCAL_THROTTLE) &&
-			    gdtc->wb_dirty <
-			    dirty_freerun_ceiling(gdtc->wb_thresh,
-						  gdtc->wb_bg_thresh))
+			    mdtc->wb_dirty <
+			    dirty_freerun_ceiling(mdtc->wb_thresh,
+						  mdtc->wb_bg_thresh))
 				/*
-				 * LOCAL_THROTTLE tasks must not be throttled
-				 * when below the per-wb freerun ceiling.
+				 * LOCAL_THROTTLE tasks must not be
+				 * throttled when below the per-wb
+				 * freerun ceiling.
 				 */
 				goto free_running;
 		}
+		ctx->dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
+			((mdtc->dirty > mdtc->thresh) || strictlimit);
 
-		dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
-			((gdtc->dirty > gdtc->thresh) || strictlimit);
+		wb_position_ratio(mdtc);
+		if (mdtc->pos_ratio < gdtc->pos_ratio)
+			ctx->sdtc = mdtc;
+	}
 
-		wb_position_ratio(gdtc);
-		sdtc = gdtc;
+	if (ctx->dirty_exceeded && !wb->dirty_exceeded)
+		wb->dirty_exceeded = 1;
 
-		if (mdtc) {
-			/*
-			 * If memcg domain is in effect, calculate its
-			 * pos_ratio.  @wb should satisfy constraints from
-			 * both global and memcg domains.  Choose the one
-			 * w/ lower pos_ratio.
-			 */
-			if (!strictlimit) {
-				wb_dirty_limits(mdtc);
-
-				if ((current->flags & PF_LOCAL_THROTTLE) &&
-				    mdtc->wb_dirty <
-				    dirty_freerun_ceiling(mdtc->wb_thresh,
-							  mdtc->wb_bg_thresh))
-					/*
-					 * LOCAL_THROTTLE tasks must not be
-					 * throttled when below the per-wb
-					 * freerun ceiling.
-					 */
-					goto free_running;
-			}
-			dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
-				((mdtc->dirty > mdtc->thresh) || strictlimit);
+	if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
+				   BANDWIDTH_INTERVAL))
+		__wb_update_bandwidth(gdtc, mdtc, true);
+
+	/* throttle according to the chosen dtc */
+	ctx->dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
+	ctx->task_ratelimit = ((u64)ctx->dirty_ratelimit * ctx->sdtc->pos_ratio) >>
+						RATELIMIT_CALC_SHIFT;
+	max_pause = wb_max_pause(wb, ctx->sdtc->wb_dirty);
+	min_pause = wb_min_pause(wb, max_pause,
+				 ctx->task_ratelimit, ctx->dirty_ratelimit,
+				 &ctx->nr_dirtied_pause);
+
+	if (unlikely(ctx->task_ratelimit == 0)) {
+		period = max_pause;
+		ctx->pause = max_pause;
+		goto pause;
+	}
+	period = HZ * pages_dirtied / ctx->task_ratelimit;
+	ctx->pause = period;
+	if (current->dirty_paused_when)
+		ctx->pause -= ctx->now - current->dirty_paused_when;
+	/*
+	 * For less than 1s think time (ext3/4 may block the dirtier
+	 * for up to 800ms from time to time on 1-HDD; so does xfs,
+	 * however at much less frequency), try to compensate it in
+	 * future periods by updating the virtual time; otherwise just
+	 * do a reset, as it may be a light dirtier.
+	 */
+	if (ctx->pause < min_pause) {
+		trace_balance_dirty_pages(wb,
+					  ctx->sdtc->thresh,
+					  ctx->sdtc->bg_thresh,
+					  ctx->sdtc->dirty,
+					  ctx->sdtc->wb_thresh,
+					  ctx->sdtc->wb_dirty,
+					  ctx->dirty_ratelimit,
+					  ctx->task_ratelimit,
+					  pages_dirtied,
+					  period,
+					  min(ctx->pause, 0L),
+					  ctx->start_time);
+		if (ctx->pause < -HZ) {
+			current->dirty_paused_when = ctx->now;
+			current->nr_dirtied = 0;
+		} else if (period) {
+			current->dirty_paused_when += period;
+			current->nr_dirtied = 0;
+		} else if (current->nr_dirtied_pause <= pages_dirtied)
+			current->nr_dirtied_pause += pages_dirtied;
+		return 0;
+	}
+	if (unlikely(ctx->pause > max_pause)) {
+		/* for occasional dropped task_ratelimit */
+		ctx->now += min(ctx->pause - max_pause, max_pause);
+		ctx->pause = max_pause;
+	}
 
-			wb_position_ratio(mdtc);
-			if (mdtc->pos_ratio < gdtc->pos_ratio)
-				sdtc = mdtc;
-		}
+pause:
+	trace_balance_dirty_pages(wb,
+				  ctx->sdtc->thresh,
+				  ctx->sdtc->bg_thresh,
+				  ctx->sdtc->dirty,
+				  ctx->sdtc->wb_thresh,
+				  ctx->sdtc->wb_dirty,
+				  ctx->dirty_ratelimit,
+				  ctx->task_ratelimit,
+				  pages_dirtied,
+				  period,
+				  ctx->pause,
+				  ctx->start_time);
+
+	return ctx->pause;
+}
 
-		if (dirty_exceeded && !wb->dirty_exceeded)
-			wb->dirty_exceeded = 1;
-
-		if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
-					   BANDWIDTH_INTERVAL))
-			__wb_update_bandwidth(gdtc, mdtc, true);
-
-		/* throttle according to the chosen dtc */
-		dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
-		task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
-							RATELIMIT_CALC_SHIFT;
-		max_pause = wb_max_pause(wb, sdtc->wb_dirty);
-		min_pause = wb_min_pause(wb, max_pause,
-					 task_ratelimit, dirty_ratelimit,
-					 &nr_dirtied_pause);
-
-		if (unlikely(task_ratelimit == 0)) {
-			period = max_pause;
-			pause = max_pause;
-			goto pause;
-		}
-		period = HZ * pages_dirtied / task_ratelimit;
-		pause = period;
-		if (current->dirty_paused_when)
-			pause -= now - current->dirty_paused_when;
-		/*
-		 * For less than 1s think time (ext3/4 may block the dirtier
-		 * for up to 800ms from time to time on 1-HDD; so does xfs,
-		 * however at much less frequency), try to compensate it in
-		 * future periods by updating the virtual time; otherwise just
-		 * do a reset, as it may be a light dirtier.
-		 */
-		if (pause < min_pause) {
-			trace_balance_dirty_pages(wb,
-						  sdtc->thresh,
-						  sdtc->bg_thresh,
-						  sdtc->dirty,
-						  sdtc->wb_thresh,
-						  sdtc->wb_dirty,
-						  dirty_ratelimit,
-						  task_ratelimit,
-						  pages_dirtied,
-						  period,
-						  min(pause, 0L),
-						  start_time);
-			if (pause < -HZ) {
-				current->dirty_paused_when = now;
-				current->nr_dirtied = 0;
-			} else if (period) {
-				current->dirty_paused_when += period;
-				current->nr_dirtied = 0;
-			} else if (current->nr_dirtied_pause <= pages_dirtied)
-				current->nr_dirtied_pause += pages_dirtied;
+/*
+ * balance_dirty_pages() must be called by processes which are generating dirty
+ * data.  It looks at the number of dirty pages in the machine and will force
+ * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
+ * If we're over `background_thresh' then the writeback threads are woken to
+ * perform some writeout.
+ */
+static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long pages_dirtied)
+{
+	int ret;
+	struct bdp_ctx ctx = { BDP_CTX_INIT(ctx, wb) };
+
+	for (;;) {
+		ret = _balance_dirty_pages(wb, current->nr_dirtied, &ctx);
+		if (!ret)
 			break;
-		}
-		if (unlikely(pause > max_pause)) {
-			/* for occasional dropped task_ratelimit */
-			now += min(pause - max_pause, max_pause);
-			pause = max_pause;
-		}
 
-pause:
-		trace_balance_dirty_pages(wb,
-					  sdtc->thresh,
-					  sdtc->bg_thresh,
-					  sdtc->dirty,
-					  sdtc->wb_thresh,
-					  sdtc->wb_dirty,
-					  dirty_ratelimit,
-					  task_ratelimit,
-					  pages_dirtied,
-					  period,
-					  pause,
-					  start_time);
 		__set_current_state(TASK_KILLABLE);
-		wb->dirty_sleep = now;
-		io_schedule_timeout(pause);
+		wb->dirty_sleep = ctx.now;
+		io_schedule_timeout(ctx.pause);
 
-		current->dirty_paused_when = now + pause;
+		current->dirty_paused_when = ctx.now + ctx.pause;
 		current->nr_dirtied = 0;
-		current->nr_dirtied_pause = nr_dirtied_pause;
+		current->nr_dirtied_pause = ctx.nr_dirtied_pause;
 
 		/*
 		 * This is typically equal to (dirty < thresh) and can also
 		 * keep "1000+ dd on a slow USB stick" under control.
 		 */
-		if (task_ratelimit)
+		if (ctx.task_ratelimit)
 			break;
 
 		/*
@@ -1805,14 +1831,14 @@  static void balance_dirty_pages(struct bdi_writeback *wb,
 		 * more page. However wb_dirty has accounting errors.  So use
 		 * the larger and more IO friendly wb_stat_error.
 		 */
-		if (sdtc->wb_dirty <= wb_stat_error())
+		if (ctx.sdtc->wb_dirty <= wb_stat_error())
 			break;
 
 		if (fatal_signal_pending(current))
 			break;
 	}
 
-	if (!dirty_exceeded && wb->dirty_exceeded)
+	if (!ctx.dirty_exceeded && wb->dirty_exceeded)
 		wb->dirty_exceeded = 0;
 
 	if (writeback_in_progress(wb))
@@ -1829,7 +1855,7 @@  static void balance_dirty_pages(struct bdi_writeback *wb,
 	if (laptop_mode)
 		return;
 
-	if (nr_reclaimable > gdtc->bg_thresh)
+	if (ctx.nr_reclaimable > ctx.gdtc_stor.bg_thresh)
 		wb_start_background_writeback(wb);
 }