@@ -1635,6 +1635,75 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
 	}
 }
 
+/**
+ * blkcg_wb_waiters_on_bdi - check for writeback waiters on a block device
+ * @bdi: block device to check
+ *
+ * Return true if any other blkcg is waiting for writeback on the target block
+ * device, false otherwise.
+ */
+bool blkcg_wb_waiters_on_bdi(struct backing_dev_info *bdi)
+{
+	struct blkcg *blkcg, *curr_blkcg;
+	bool ret = false;
+
+	if (unlikely(!bdi))
+		return false;
+
+	rcu_read_lock();
+	curr_blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
+	list_for_each_entry_rcu(blkcg, &bdi->cgwb_waiters, cgwb_wait_node)
+		if (blkcg != curr_blkcg) {
+			ret = true;
+			break;
+		}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/**
+ * blkcg_start_wb_wait_on_bdi - add current blkcg to writeback waiters list
+ * @bdi: target block device
+ *
+ * Add current blkcg to the list of writeback waiters on target block device.
+ */
+void blkcg_start_wb_wait_on_bdi(struct backing_dev_info *bdi)
+{
+	struct blkcg *blkcg;
+
+	rcu_read_lock();
+	blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
+	if (blkcg) {
+		spin_lock(&bdi->cgwb_waiters_lock);
+		list_add_rcu(&blkcg->cgwb_wait_node, &bdi->cgwb_waiters);
+		spin_unlock(&bdi->cgwb_waiters_lock);
+	}
+	rcu_read_unlock();
+}
+
+/**
+ * blkcg_stop_wb_wait_on_bdi - remove current blkcg from writeback waiters list
+ * @bdi: target block device
+ *
+ * Remove current blkcg from the list of writeback waiters on target block
+ * device.
+ */
+void blkcg_stop_wb_wait_on_bdi(struct backing_dev_info *bdi)
+{
+	struct blkcg *blkcg;
+
+	rcu_read_lock();
+	blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
+	if (blkcg) {
+		spin_lock(&bdi->cgwb_waiters_lock);
+		list_del_rcu(&blkcg->cgwb_wait_node);
+		spin_unlock(&bdi->cgwb_waiters_lock);
+	}
+	rcu_read_unlock();
+	synchronize_rcu();
+}
+
 /*
  * This is called when we want to actually walk up the hierarchy and check to
  * see if we need to throttle, and then actually throttle if there is some
@@ -970,9 +970,12 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 {
 	bool rw = bio_data_dir(bio);
 	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
+	struct throtl_data *td = tg->td;
+	struct request_queue *q = td->queue;
+	struct backing_dev_info *bdi = q->backing_dev_info;
 
 	/*
 	 * Currently whole state machine of group depends on first bio
 	 * queued in the group bio list. So one should not be calling
 	 * this function with a different bio if there are other bios
 	 * queued.
@@ -981,8 +984,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
 
 	/* If tg->bps = -1, then BW is unlimited */
-	if (tg_bps_limit(tg, rw) == U64_MAX &&
-	    tg_iops_limit(tg, rw) == UINT_MAX) {
+	if (blkcg_wb_waiters_on_bdi(bdi) ||
+	    (tg_bps_limit(tg, rw) == U64_MAX &&
+	     tg_iops_limit(tg, rw) == UINT_MAX)) {
 		if (wait)
 			*wait = 0;
 		return true;
@@ -2446,6 +2446,8 @@ void sync_inodes_sb(struct super_block *sb)
 		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
+	blkcg_start_wb_wait_on_bdi(bdi);
+
 	/* protect against inode wb switch, see inode_switch_wbs_work_fn() */
 	bdi_down_write_wb_switch_rwsem(bdi);
 	bdi_split_work_to_wbs(bdi, &work, false);
@@ -2453,6 +2455,8 @@ void sync_inodes_sb(struct super_block *sb)
 	bdi_up_write_wb_switch_rwsem(bdi);
 
 	wait_sb_inodes(sb);
+
+	blkcg_stop_wb_wait_on_bdi(bdi);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
@@ -191,6 +191,8 @@ struct backing_dev_info {
 	struct rb_root cgwb_congested_tree; /* their congested states */
 	struct mutex cgwb_release_mutex; /* protect shutdown of wb structs */
 	struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
+	struct list_head cgwb_waiters; /* list of all waiters for writeback */
+	spinlock_t cgwb_waiters_lock; /* protect cgwb_waiters list */
 #else
 	struct bdi_writeback_congested *wb_congested;
 #endif
@@ -56,6 +56,7 @@ struct blkcg {
 	struct list_head all_blkcgs_node;
 #ifdef CONFIG_CGROUP_WRITEBACK
+	struct list_head cgwb_wait_node;
 	struct list_head cgwb_list;
 	refcount_t cgwb_refcnt;
 #endif
@@ -454,6 +455,10 @@ static inline void blkcg_cgwb_put(struct blkcg *blkcg)
 		blkcg_destroy_blkgs(blkcg);
 }
 
+bool blkcg_wb_waiters_on_bdi(struct backing_dev_info *bdi);
+void blkcg_start_wb_wait_on_bdi(struct backing_dev_info *bdi);
+void blkcg_stop_wb_wait_on_bdi(struct backing_dev_info *bdi);
+
 #else
 
 static inline void blkcg_cgwb_get(struct blkcg *blkcg) { }
@@ -464,6 +469,13 @@ static inline void blkcg_cgwb_put(struct blkcg *blkcg)
 	blkcg_destroy_blkgs(blkcg);
 }
 
+static inline bool blkcg_wb_waiters_on_bdi(struct backing_dev_info *bdi)
+{
+	return false;
+}
+static inline void blkcg_start_wb_wait_on_bdi(struct backing_dev_info *bdi) { }
+static inline void blkcg_stop_wb_wait_on_bdi(struct backing_dev_info *bdi) { }
+
 #endif
 
 /**
@@ -772,6 +784,7 @@ static inline void blkcg_bio_issue_init(struct bio *bio)
 static inline bool blkcg_bio_issue_check(struct request_queue *q,
 					 struct bio *bio)
 {
+	struct backing_dev_info *bdi = q->backing_dev_info;
 	struct blkcg_gq *blkg;
 	bool throtl = false;
 
@@ -785,6 +798,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 			  bio_devname(bio, b));
 		bio_associate_blkg(bio);
 	}
 
+	if (blkcg_wb_waiters_on_bdi(bdi))
+		bio_set_flag(bio, BIO_THROTTLED);
 
 	blkg = bio->bi_blkg;
@@ -686,10 +686,12 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
 {
 	int ret;
 
+	INIT_LIST_HEAD(&bdi->cgwb_waiters);
 	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
 	bdi->cgwb_congested_tree = RB_ROOT;
 	mutex_init(&bdi->cgwb_release_mutex);
 	init_rwsem(&bdi->wb_switch_rwsem);
+	spin_lock_init(&bdi->cgwb_waiters_lock);
 
 	ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
 	if (!ret) {
This is an attempt to mitigate the priority inversion problem of a
high-priority blkcg issuing a sync() and being forced to wait for the
completion of all the writeback I/O generated by any other low-priority
blkcg, causing massive latencies to processes that shouldn't be
I/O-throttled at all.

The idea is to save a list of blkcg's that are waiting for writeback:
every time a sync() is executed the current blkcg is added to the list.

Then, when I/O is throttled, if there's a blkcg waiting for writeback
different from the current blkcg, no throttling is applied (we can
probably refine this logic later, i.e., a better policy could be to
adjust the throttling rate using the blkcg with the highest speed from
the list of waiters - priority inheritance, kinda).

This topic has been discussed here:
https://lwn.net/ml/cgroups/20190118103127.325-1-righi.andrea@gmail.com/

But we didn't come up with any definitive solution. This patch is not a
definitive solution either, but it's an attempt to continue addressing
the issue and, hopefully, handle the priority inversion problem with
sync() in a better way.

Signed-off-by: Andrea Righi <righi.andrea@gmail.com>
---
 block/blk-cgroup.c               | 69 ++++++++++++++++++++++++++++++++
 block/blk-throttle.c             | 10 +++--
 fs/fs-writeback.c                |  4 ++
 include/linux/backing-dev-defs.h |  2 +
 include/linux/blk-cgroup.h       | 15 +++++++
 mm/backing-dev.c                 |  2 +
 6 files changed, 99 insertions(+), 3 deletions(-)
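For reference, below is a minimal userspace sketch of the scenario this
patch targets (not part of the patch itself). Everything in it is
illustrative: it assumes root privileges, cgroup v2 mounted at
/sys/fs/cgroup with the io controller enabled and CONFIG_CGROUP_WRITEBACK,
and that /tmp lives on device 8:0; the "slow" cgroup name, the 1MB/s
limit and the 512MB write are made-up values.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <time.h>
#include <sys/stat.h>
#include <sys/wait.h>

static void write_str(const char *path, const char *buf)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, buf, strlen(buf)) < 0)
		perror(path);
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	static char chunk[1 << 20];	/* 1MB of zeroes */
	struct timespec t0, t1;
	pid_t pid;
	int i, fd;

	/* create a blkcg limited to ~1MB/s of write bandwidth on 8:0 */
	mkdir("/sys/fs/cgroup/slow", 0755);
	write_str("/sys/fs/cgroup/slow/io.max", "8:0 wbps=1048576");

	pid = fork();
	if (pid == 0) {
		/* child: join "slow", then dirty ~512MB of page cache */
		write_str("/sys/fs/cgroup/slow/cgroup.procs", "0");
		fd = open("/tmp/dirty-data", O_WRONLY | O_CREAT, 0644);
		for (i = 0; i < 512; i++)
			write(fd, chunk, sizeof(chunk));
		close(fd);
		exit(0);
	}
	waitpid(pid, NULL, 0);

	/*
	 * parent: sync() from the unlimited root blkcg. Without this
	 * patch it stalls behind the slow blkcg's throttled writeback.
	 */
	clock_gettime(CLOCK_MONOTONIC, &t0);
	sync();
	clock_gettime(CLOCK_MONOTONIC, &t1);
	printf("sync() took %ld seconds\n", (long)(t1.tv_sec - t0.tv_sec));
	return 0;
}

With the patch applied, the sync() above puts the root blkcg on
bdi->cgwb_waiters via blkcg_start_wb_wait_on_bdi(), so
blkcg_wb_waiters_on_bdi() returns true in tg_may_dispatch() and the slow
group's writeback is dispatched unthrottled until
blkcg_stop_wb_wait_on_bdi() removes the waiter.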