diff mbox series

blk-wbt: Fix detection of dirty-throttled tasks

Message ID 20240123175826.21452-1-jack@suse.cz (mailing list archive)
State New, archived
Headers show
Series blk-wbt: Fix detection of dirty-throttled tasks | expand

Commit Message

Jan Kara Jan. 23, 2024, 5:58 p.m. UTC
The detection of dirty-throttled tasks in blk-wbt has been subtly broken
since its beginning in 2016. Namely if we are doing cgroup writeback and
the throttled task is not in the root cgroup, balance_dirty_pages() will
set dirty_sleep for the non-root bdi_writeback structure. However
blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
structure. Thus detection of recently throttled tasks is not working in
this case (we noticed this when we switched to cgroup v2 and suddenly
writeback was slow).

Since blk-wbt has no easy way to get to proper bdi_writeback and
furthermore its intention has always been to work on the whole device
rather than on individual cgroups, just move the dirty_sleep timestamp
from bdi_writeback to backing_dev_info. That fixes the checking for
recently throttled tasks and saves memory for everybody as a bonus.

CC: stable@vger.kernel.org
Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
Signed-off-by: Jan Kara <jack@suse.cz>
---
 block/blk-wbt.c                  | 4 ++--
 include/linux/backing-dev-defs.h | 7 +++++--
 mm/backing-dev.c                 | 2 +-
 mm/page-writeback.c              | 2 +-
 4 files changed, 9 insertions(+), 6 deletions(-)

Comments

Jan Kara Feb. 6, 2024, 9:04 a.m. UTC | #1
On Tue 23-01-24 18:58:26, Jan Kara wrote:
> The detection of dirty-throttled tasks in blk-wbt has been subtly broken
> since its beginning in 2016. Namely if we are doing cgroup writeback and
> the throttled task is not in the root cgroup, balance_dirty_pages() will
> set dirty_sleep for the non-root bdi_writeback structure. However
> blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
> structure. Thus detection of recently throttled tasks is not working in
> this case (we noticed this when we switched to cgroup v2 and suddenly
> writeback was slow).
> 
> Since blk-wbt has no easy way to get to proper bdi_writeback and
> furthermore its intention has always been to work on the whole device
> rather than on individual cgroups, just move the dirty_sleep timestamp
> from bdi_writeback to backing_dev_info. That fixes the checking for
> recently throttled tasks and saves memory for everybody as a bonus.
> 
> CC: stable@vger.kernel.org
> Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
> Signed-off-by: Jan Kara <jack@suse.cz>

Ping Jens?

								Honza

> ---
>  block/blk-wbt.c                  | 4 ++--
>  include/linux/backing-dev-defs.h | 7 +++++--
>  mm/backing-dev.c                 | 2 +-
>  mm/page-writeback.c              | 2 +-
>  4 files changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/block/blk-wbt.c b/block/blk-wbt.c
> index 5ba3cd574eac..0c0e270a8265 100644
> --- a/block/blk-wbt.c
> +++ b/block/blk-wbt.c
> @@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
>   */
>  static bool wb_recent_wait(struct rq_wb *rwb)
>  {
> -	struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
> +	struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
>  
> -	return time_before(jiffies, wb->dirty_sleep + HZ);
> +	return time_before(jiffies, bdi->last_bdp_sleep + HZ);
>  }
>  
>  static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
> diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
> index ae12696ec492..ad17739a2e72 100644
> --- a/include/linux/backing-dev-defs.h
> +++ b/include/linux/backing-dev-defs.h
> @@ -141,8 +141,6 @@ struct bdi_writeback {
>  	struct delayed_work dwork;	/* work item used for writeback */
>  	struct delayed_work bw_dwork;	/* work item used for bandwidth estimate */
>  
> -	unsigned long dirty_sleep;	/* last wait */
> -
>  	struct list_head bdi_node;	/* anchored at bdi->wb_list */
>  
>  #ifdef CONFIG_CGROUP_WRITEBACK
> @@ -179,6 +177,11 @@ struct backing_dev_info {
>  	 * any dirty wbs, which is depended upon by bdi_has_dirty().
>  	 */
>  	atomic_long_t tot_write_bandwidth;
> +	/*
> +	 * Jiffies when last process was dirty throttled on this bdi. Used by
> + 	 * blk-wbt.
> +	*/
> +	unsigned long last_bdp_sleep;
>  
>  	struct bdi_writeback wb;  /* the root writeback info for this bdi */
>  	struct list_head wb_list; /* list of all wbs */
> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> index 1e3447bccdb1..e039d05304dd 100644
> --- a/mm/backing-dev.c
> +++ b/mm/backing-dev.c
> @@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
>  	INIT_LIST_HEAD(&wb->work_list);
>  	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
>  	INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
> -	wb->dirty_sleep = jiffies;
>  
>  	err = fprop_local_init_percpu(&wb->completions, gfp);
>  	if (err)
> @@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi)
>  	INIT_LIST_HEAD(&bdi->bdi_list);
>  	INIT_LIST_HEAD(&bdi->wb_list);
>  	init_waitqueue_head(&bdi->wb_waitq);
> +	bdi->last_bdp_sleep = jiffies;
>  
>  	return cgwb_bdi_init(bdi);
>  }
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index cd4e4ae77c40..cc37fa7f3364 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -1921,7 +1921,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
>  			break;
>  		}
>  		__set_current_state(TASK_KILLABLE);
> -		wb->dirty_sleep = now;
> +		bdi->last_bdp_sleep = jiffies;
>  		io_schedule_timeout(pause);
>  
>  		current->dirty_paused_when = now + pause;
> -- 
> 2.35.3
>
Jens Axboe Feb. 6, 2024, 4:44 p.m. UTC | #2
On Tue, 23 Jan 2024 18:58:26 +0100, Jan Kara wrote:
> The detection of dirty-throttled tasks in blk-wbt has been subtly broken
> since its beginning in 2016. Namely if we are doing cgroup writeback and
> the throttled task is not in the root cgroup, balance_dirty_pages() will
> set dirty_sleep for the non-root bdi_writeback structure. However
> blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
> structure. Thus detection of recently throttled tasks is not working in
> this case (we noticed this when we switched to cgroup v2 and suddenly
> writeback was slow).
> 
> [...]

Applied, thanks!

[1/1] blk-wbt: Fix detection of dirty-throttled tasks
      commit: f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5

Best regards,
diff mbox series

Patch

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5ba3cd574eac..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -163,9 +163,9 @@  static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
  */
 static bool wb_recent_wait(struct rq_wb *rwb)
 {
-	struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+	struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
 
-	return time_before(jiffies, wb->dirty_sleep + HZ);
+	return time_before(jiffies, bdi->last_bdp_sleep + HZ);
 }
 
 static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ae12696ec492..ad17739a2e72 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -141,8 +141,6 @@  struct bdi_writeback {
 	struct delayed_work dwork;	/* work item used for writeback */
 	struct delayed_work bw_dwork;	/* work item used for bandwidth estimate */
 
-	unsigned long dirty_sleep;	/* last wait */
-
 	struct list_head bdi_node;	/* anchored at bdi->wb_list */
 
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -179,6 +177,11 @@  struct backing_dev_info {
 	 * any dirty wbs, which is depended upon by bdi_has_dirty().
 	 */
 	atomic_long_t tot_write_bandwidth;
+	/*
+	 * Jiffies when last process was dirty throttled on this bdi. Used by
+ 	 * blk-wbt.
+	*/
+	unsigned long last_bdp_sleep;
 
 	struct bdi_writeback wb;  /* the root writeback info for this bdi */
 	struct list_head wb_list; /* list of all wbs */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb1..e039d05304dd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -436,7 +436,6 @@  static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
 	INIT_LIST_HEAD(&wb->work_list);
 	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
 	INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
-	wb->dirty_sleep = jiffies;
 
 	err = fprop_local_init_percpu(&wb->completions, gfp);
 	if (err)
@@ -921,6 +920,7 @@  int bdi_init(struct backing_dev_info *bdi)
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	INIT_LIST_HEAD(&bdi->wb_list);
 	init_waitqueue_head(&bdi->wb_waitq);
+	bdi->last_bdp_sleep = jiffies;
 
 	return cgwb_bdi_init(bdi);
 }
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..cc37fa7f3364 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1921,7 +1921,7 @@  static int balance_dirty_pages(struct bdi_writeback *wb,
 			break;
 		}
 		__set_current_state(TASK_KILLABLE);
-		wb->dirty_sleep = now;
+		bdi->last_bdp_sleep = jiffies;
 		io_schedule_timeout(pause);
 
 		current->dirty_paused_when = now + pause;