diff mbox series

[block-5.17] block: don't merge across cgroup boundaries if iocost or iolatency is active

Message ID Yi71WZ3O9/YViHSb@slm.duckdns.org (mailing list archive)
State New, archived
Headers show
Series [block-5.17] block: don't merge across cgroup boundaries if iocost or iolatency is active | expand

Commit Message

Tejun Heo March 14, 2022, 7:57 a.m. UTC
blk-iocost and iolatency are cgroup aware rq-qos policies but, unlike
elevators, they didn't have a way to disable merges across different
cgroups. This obviously can lead to accounting and control errors but more
importantly to priority inversions - e.g. an IO which belongs to a higher
priority cgroup or IO class may end up getting throttled incorrectly because
it gets merged to an IO issued from a low priority cgroup.

Fix it by adding blk_cgroup_mergeable() which is called from merge paths to
test whether merges are acceptable from cgroup POV. When iocost or iolatency
is active, this rejects cross-cgroup and cross-issue_as_root merges.

While at it,

* Add WARN_ON_ONCE() on blkg mismatch in ioc_rqos_merge() so that we can
  easily notice similar failures in the future.

* Make sure iocost enable/disable transitions only happen when iocost is
  actually enabled / disabled.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: d70675121546 ("block: introduce blk-iolatency io controller")
Cc: stable@vger.kernel.org # v4.19+
Cc: Josef Bacik <josef@toxicpanda.com>
---
 block/blk-cgroup.c         |    2 +
 block/blk-iocost.c         |   20 ++++++++++++-------
 block/blk-iolatency.c      |    9 +++++---
 block/blk-merge.c          |   11 ++++++++++
 include/linux/blk-cgroup.h |   46 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h     |    4 ++-
 6 files changed, 81 insertions(+), 11 deletions(-)

Comments

Jens Axboe March 14, 2022, 8:12 p.m. UTC | #1
On 3/14/22 1:57 AM, Tejun Heo wrote:
> blk-iocost and iolatency are cgroup aware rq-qos policies but, unlike
> elevators, they didn't have a way to disable merges across different
> cgroups. This obviously can lead to accounting and control errors but more
> importantly to priority inversions - e.g. an IO which belongs to a higher
> priority cgroup or IO class may end up getting throttled incorrectly because
> it gets merged to an IO issued from a low priority cgroup.
> 
> Fix it by adding blk_cgroup_mergeable() which is called from merge paths to
> test whether merges are acceptable from cgroup POV. When iocost or iolatency
> is active, this rejects cross-cgroup and cross-issue_as_root merges.
> 
> While at it,
> 
> * Add WARN_ON_ONCE() on blkg mismatch in ioc_rqos_merge() so that we can
>   easily notice similar failures in the future.
> 
> * Make sure iocost enable/disable transitions only happen when iocost is
>   actually enabled / disabled.

Is there really no better way to do this than add a lot of expensive
checks to the fast path?

Even just inverting the checks so that

if (req->bio->bi_blkg != bio->bi_blkg)
	...

is checked first would seem a lot saner.

In any case, since this isn't a new regression, I'd feel a lot better
deferring it to 5.18.
Tejun Heo March 14, 2022, 8:41 p.m. UTC | #2
On Mon, Mar 14, 2022 at 02:12:34PM -0600, Jens Axboe wrote:
> > * Add WARN_ON_ONCE() on blkg mismatch in ioc_rqos_merge() so that we can
> >   easily notice similar failures in the future.
> > 
> > * Make sure iocost enable/disable transitions only happen when iocost is
> >   actually enabled / disabled.
> 
> Is there really no better way to do this than add a lot of expensive
> checks to the fast path?
> 
> Even just inverting the checks so that
> 
> if (req->bio->bi_blkg != bio->bi_blkg)
> 	...
> 
> is checked first would seem a lot saner.
> 
> In any case, since this isn't a new regression, I'd feel a lot better
> deferring it to 5.18.

As discussed, the only risk of removing the disable/enable switch and always
testing is ruining some merges for cases where blkcg is enabled but no
control is applied, which shouldn't be a big problem. I'll redo the patch.

Thanks.
diff mbox series

Patch

--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1176,6 +1176,8 @@  int blkcg_init_queue(struct request_queu
 	bool preloaded;
 	int ret;
 
+	atomic_set(&q->cgroup_no_cross_merges, 0);
+
 	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
 	if (!new_blkg)
 		return -ENOMEM;
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2703,6 +2703,8 @@  static void ioc_rqos_merge(struct rq_qos
 	if (!ioc->enabled || !iocg || !iocg->level)
 		return;
 
+	WARN_ON_ONCE(rq->bio->bi_blkg != bio->bi_blkg);
+
 	abs_cost = calc_vtime_cost(bio, iocg, true);
 	if (!abs_cost)
 		return;
@@ -3253,13 +3255,17 @@  static ssize_t ioc_qos_write(struct kern
 
 	spin_lock_irq(&ioc->lock);
 
-	if (enable) {
-		blk_stat_enable_accounting(ioc->rqos.q);
-		blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
-		ioc->enabled = true;
-	} else {
-		blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
-		ioc->enabled = false;
+	if (enable != ioc->enabled) {
+		if (enable) {
+			blk_stat_enable_accounting(ioc->rqos.q);
+			blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
+			blk_cgroup_disable_cross_merges(ioc->rqos.q);
+			ioc->enabled = true;
+		} else {
+			blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
+			blk_cgroup_enable_cross_merges(ioc->rqos.q);
+			ioc->enabled = false;
+		}
 	}
 
 	if (user) {
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -855,12 +855,15 @@  out:
 
 		blk_mq_freeze_queue(blkg->q);
 
-		if (enable == 1)
+		if (enable == 1) {
 			atomic_inc(&blkiolat->enabled);
-		else if (enable == -1)
+			blk_cgroup_disable_cross_merges(blkg->q);
+		} else if (enable == -1) {
 			atomic_dec(&blkiolat->enabled);
-		else
+			blk_cgroup_enable_cross_merges(blkg->q);
+		} else {
 			WARN_ON_ONCE(1);
+		}
 
 		blk_mq_unfreeze_queue(blkg->q);
 
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -9,6 +9,7 @@ 
 #include <linux/blk-integrity.h>
 #include <linux/scatterlist.h>
 #include <linux/part_stat.h>
+#include <linux/blk-cgroup.h>
 
 #include <trace/events/block.h>
 
@@ -600,6 +601,9 @@  static inline unsigned int blk_rq_get_ma
 static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
 		unsigned int nr_phys_segs)
 {
+	if (!blk_cgroup_mergeable(req, bio))
+		goto no_merge;
+
 	if (blk_integrity_merge_bio(req->q, req, bio) == false)
 		goto no_merge;
 
@@ -696,6 +700,9 @@  static int ll_merge_requests_fn(struct r
 	if (total_phys_segments > blk_rq_get_max_segments(req))
 		return 0;
 
+	if (!blk_cgroup_mergeable(req, next->bio))
+		return 0;
+
 	if (blk_integrity_merge_rq(q, req, next) == false)
 		return 0;
 
@@ -904,6 +911,10 @@  bool blk_rq_merge_ok(struct request *rq,
 	if (bio_data_dir(bio) != rq_data_dir(rq))
 		return false;
 
+	/* don't merge across cgroup boundaries */
+	if (!blk_cgroup_mergeable(rq, bio))
+		return false;
+
 	/* only merge integrity protected bio into ditto rq */
 	if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
 		return false;
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -21,6 +21,7 @@ 
 #include <linux/seq_file.h>
 #include <linux/radix-tree.h>
 #include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/atomic.h>
 #include <linux/kthread.h>
 #include <linux/fs.h>
@@ -604,6 +605,48 @@  static inline void blkcg_clear_delay(str
 		atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
 }
 
+/**
+ * blk_cgroup_disable_cross_merges - Disable cross-cgroup merges
+ * @q: target request_queue
+ *
+ * Increment @q->cgroup_no_cross_merges so that bios that belong to different
+ * cgroups aren't merged. Can be nested. Used by cgroup-aware rq-qos policies.
+ */
+static inline void blk_cgroup_disable_cross_merges(struct request_queue *q)
+{
+	atomic_inc(&q->cgroup_no_cross_merges);
+}
+
+/**
+ * blk_cgroup_enable_cross_merges - Enable cross-cgroup merges
+ * @q: target request_queue
+ *
+ * Undoes one blk_cgroup_disable_cross_merges(). Warns once on underflow.
+ */
+static inline void blk_cgroup_enable_cross_merges(struct request_queue *q)
+{
+	WARN_ON_ONCE(atomic_dec_return(&q->cgroup_no_cross_merges) < 0);
+}
+
+/**
+ * blk_cgroup_mergeable - Determine whether to allow or disallow merges
+ * @rq: request to merge into
+ * @bio: bio to merge
+ *
+ * Can @bio be merged into @rq? If cross merges are disallowed, the two should
+ * belong to the same cgroup and their issue_as_root should match. The latter is
+ * necessary as we don't want to throttle e.g. a metadata update because it
+ * happens to be next to a regular IO. Returns %true if @bio may be merged.
+ */
+static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio)
+{
+	if (!atomic_read(&rq->q->cgroup_no_cross_merges))
+		return true;
+
+	return rq->bio->bi_blkg == bio->bi_blkg &&
+		bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio);
+}
+
 void blk_cgroup_bio_start(struct bio *bio);
 void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
 void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay);
@@ -659,6 +702,9 @@  static inline void blkg_put(struct blkcg
 static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
 static inline void blkcg_bio_issue_init(struct bio *bio) { }
 static inline void blk_cgroup_bio_start(struct bio *bio) { }
+static inline void blk_cgroup_disable_cross_merges(struct request_queue *q) { }
+static inline void blk_cgroup_enable_cross_merges(struct request_queue *q) { }
+static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; }
 
 #define blk_queue_for_each_rl(rl, q)	\
 	for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -274,7 +274,9 @@  struct request_queue {
 	struct work_struct	timeout_work;
 
 	atomic_t		nr_active_requests_shared_tags;
-
+#ifdef CONFIG_BLK_CGROUP
+	atomic_t		cgroup_no_cross_merges;
+#endif
 	struct blk_mq_tags	*sched_shared_tags;
 
 	struct list_head	icq_list;