
[v8,3/3] blk-cgroup: Optimize blkcg_rstat_flush()

Message ID 20221004151748.293388-4-longman@redhat.com (mailing list archive)
State New, archived
Series blk-cgroup: Optimize blkcg_rstat_flush()

Commit Message

Waiman Long Oct. 4, 2022, 3:17 p.m. UTC
For a system with many CPUs and block devices, the time to do
blkcg_rstat_flush() from cgroup_rstat_flush() can be rather long. It
can be especially problematic as interrupts are disabled during the flush.
It was reported that it might take seconds to complete in some extreme
cases, leading to hard lockup messages.

As it is likely that not all the percpu blkg_iostat_set's have been
updated since the last flush, those stale blkg_iostat_set's don't need
to be flushed in this case. This patch optimizes blkcg_rstat_flush()
by keeping a lockless list of recently updated blkg_iostat_set's in a
newly added percpu blkcg->lhead pointer.

The blkg_iostat_set is added to a sentinel lockless list on the update
side in blk_cgroup_bio_start(). It is removed from the sentinel lockless
list when flushed in blkcg_rstat_flush(). Due to racing, it is possible
that blkg_iostat_set's in the lockless list may have no new IO stats to
be flushed, but that is OK.

To protect against destruction of the blkg, a percpu reference to it is
taken when it is put onto the lockless list and released when it is removed.

A blkg_iostat_set can determine if it is in a lockless list by checking
the content of its lnode.next pointer, which will be non-NULL when it is
on a sentinel lockless list.

When booting up an instrumented test kernel with this patch on a
2-socket 96-thread system with cgroup v2, 1788 of the 2051 calls to
cgroup_rstat_flush() after bootup returned immediately because of an
empty lockless list. After an all-CPU kernel build, the ratio became
6295424/6340513, i.e. more than 99%.

Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.c | 75 ++++++++++++++++++++++++++++++++++++++++++----
 block/blk-cgroup.h |  9 ++++++
 2 files changed, 78 insertions(+), 6 deletions(-)
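
To make the update/flush pairing concrete, here is a small, single-threaded
userspace sketch of the idea (illustrative only: stat_update()/stat_flush()
and the C11 atomics are stand-ins rather than kernel code; the real patch
below uses per-cpu lists, the series' sentinel llist helpers such as
init_sllist_head()/sllist_del_all(), and percpu_ref to pin the blkg):

#include <stdatomic.h>
#include <stdio.h>

struct lnode {
	struct lnode *next;
};

/* Stand-in for blkg_iostat_set: the stats plus the lockless-list hook. */
struct iostat_set {
	struct lnode lnode;	/* lnode.next != NULL  <=>  already queued */
	long ios;		/* updates accumulated since the last flush */
	const char *name;
};

static struct lnode sentinel;				/* terminator, so queued nodes never see NULL */
static _Atomic(struct lnode *) lhead = &sentinel;	/* the "recently updated" list */

/* Update side (cf. blk_cgroup_bio_start): bump the stat, queue if not queued yet. */
static void stat_update(struct iostat_set *bis)
{
	bis->ios++;
	if (bis->lnode.next)	/* already on the list, nothing more to do */
		return;

	/*
	 * Single-threaded check-then-push; in the patch each CPU has its own
	 * list and updates run pinned to that CPU, which makes this safe.
	 */
	struct lnode *old = atomic_load(&lhead);
	do {
		bis->lnode.next = old;	/* becomes non-NULL from here on */
	} while (!atomic_compare_exchange_weak(&lhead, &old, &bis->lnode));
}

/* Flush side (cf. blkcg_rstat_flush): detach the list, visit only queued entries. */
static void stat_flush(void)
{
	struct lnode *node = atomic_exchange(&lhead, &sentinel);

	while (node != &sentinel) {
		struct iostat_set *bis = (struct iostat_set *)node;	/* lnode is the first member */
		struct lnode *next = node->next;

		node->next = NULL;	/* mark "not queued" so a later update requeues it */
		printf("flush %s: %ld ios\n", bis->name, bis->ios);
		bis->ios = 0;
		node = next;
	}
}

int main(void)
{
	struct iostat_set sda = { .name = "sda" };
	struct iostat_set sdb = { .name = "sdb" };	/* never updated, so never visited */
	struct iostat_set sdc = { .name = "sdc" };

	(void)sdb;
	stat_update(&sda);
	stat_update(&sda);
	stat_update(&sdc);
	stat_flush();	/* visits only sdc and sda */
	stat_flush();	/* list already empty, returns immediately */
	return 0;
}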

Comments

Michal Koutný Oct. 4, 2022, 6:49 p.m. UTC | #1
Hello.

On Tue, Oct 04, 2022 at 11:17:48AM -0400, Waiman Long <longman@redhat.com> wrote:
> To protect against destruction of blkg, a percpu reference is gotten
> when putting into the lockless list and put back when removed.

Just to conclude my previous remark about the loop, let me try
explaining it more precisely:

blkcg->lhead via blkg_iostat_set holds reference to blkcg_gq
   (taken in blk_cgroup_bio_start)

blkcg_gq holds reference to its blkcg_gq->blkcg 
   (taken in blkg_create)

The cycle has two edges, the latter is broken in __blkg_release but
that's a release callback of the involved blkcg_gq->refcnt, so it won't
be called.

The first edge is broken in blkcg_rstat_flush, and that's more promising.
The current code does the final flush in css_release_work_fn.
The problem is that it's the release callback of blkcg->css, i.e. it's
also referenced on the cycle, therefore this final flush won't happen
before the cycle is broken.

Fortunately, any other caller of cgroup_rstat_flush comes to the rescue
-- the blkcg_rstat_flush on the stuck blkcg would decompose the lhead list
and the reference cycle would be broken.
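
As a picture (the same two edges, with where each reference is taken and
where it could be dropped; just my reading of the code):

    bisc queued on blkcg->lhead --[percpu_ref_get() in blk_cgroup_bio_start()]--> blkg->refcnt
    blkg                        --[css_get() in blkg_create()]------------------> blkcg->css

    __blkg_release (drops the css ref)         - can't run while lhead holds a blkg ref
    css_release_work_fn (does the final flush
    that would empty lhead)                    - can't run while blkg holds a css ref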

In summary, I think this adds a reference cycle, but its lifetime
is limited to the soonest cgroup_rstat_flush call, so it should not
cause practical trouble.

HTH,
Michal
Waiman Long Oct. 4, 2022, 10:53 p.m. UTC | #2
On 10/4/22 14:49, Michal Koutný wrote:
> Hello.
>
> On Tue, Oct 04, 2022 at 11:17:48AM -0400, Waiman Long <longman@redhat.com> wrote:
>> To protect against destruction of blkg, a percpu reference is gotten
>> when putting into the lockless list and put back when removed.
> Just to conclude my previous remark about the loop, let me try
> explaining it more precisely:
>
> blkcg->lhead via blkg_iostat_set holds reference to blkcg_gq
>     (taken in in blk_cgroup_bio_start)
>
> blkcg_gq holds reference to its blkcg_gq->blkcg
>     (taken in blkg_create)
>
> The cycle has two edges, the latter is broken in __blkg_release but
> that's a release callback of the involved blkcg_gq->refcnt, so it won't
> be called.
>
> The first edges is broken in blkcg_rstat_flush and that's more promising.
> The current code does the final flushes -- in css_release_work_fn.
> The problem is that it's the release callback of blkcg->css, i.e. it's
> also referenced on the cycle, therefore this final flush won't happen
> before cycle is broken.
>
> Fortunately, any other caller of cgroup_rstat_flush comes to the rescue
> -- the blkcg_rstat_flush on the stuck blkcg would decompose lhead list
> and the reference cycle is broken.
>
> In summary, I think this adds the reference cycle but its survival time
> is limited to the soonest cgroup_rstat_flush call, which should not
> cause practical troubles.

Thanks for the explanation. I now get what you are referring to. Yes, 
this delayed blkcg removal problem is annoying. I think the following 
patch should eliminate this issue. What do you think?

Cheers,
Longman

----------------8<-------------[ cut here ]------------------

  block/blk-cgroup.c     | 15 ++++++++++++++-
  include/linux/cgroup.h |  1 +
  kernel/cgroup/rstat.c  | 20 ++++++++++++++++++++
  3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 63569b05db0d..f896caef9947 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1122,10 +1122,12 @@ struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css)
   */
  static void blkcg_destroy_blkgs(struct blkcg *blkcg)
  {
+    int cpu;
+
      might_sleep();

+    css_get(&blkcg->css);
      spin_lock_irq(&blkcg->lock);
-
      while (!hlist_empty(&blkcg->blkg_list)) {
          struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
                          struct blkcg_gq, blkcg_node);
@@ -1148,6 +1150,17 @@ static void blkcg_destroy_blkgs(struct blkcg *blkcg)
      }

      spin_unlock_irq(&blkcg->lock);
+
+    /*
+     * Flush all the non-empty percpu lockless lists.
+     */
+    for_each_possible_cpu(cpu) {
+        struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+
+        if (!llist_empty(lhead))
+            cgroup_rstat_css_flush(&blkcg->css, cpu);
+    }
+    css_put(&blkcg->css);
  }

  /**
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ac5d0515680e..33e226a34073 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -763,6 +763,7 @@ void cgroup_rstat_flush(struct cgroup *cgrp);
  void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp);
  void cgroup_rstat_flush_hold(struct cgroup *cgrp);
  void cgroup_rstat_flush_release(void);
+void cgroup_rstat_css_flush(struct cgroup_subsys_state *css, int cpu);

  /*
   * Basic resource stats.
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index feb59380c896..a4e18d627b54 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -251,6 +251,26 @@ void cgroup_rstat_flush_release(void)
      spin_unlock_irq(&cgroup_rstat_lock);
  }

+/**
+ * cgroup_rstat_css_flush - flush stats for the given css and cpu
+ * @css: target css to be flushed
+ * @cpu: the cpu that holds the stats to be flushed
+ *
+ * A lightweight rstat flush operation for a given css and cpu.
+ * Only the cpu_lock is being held for mutual exclusion, the cgroup_rstat_lock
+ * isn't used.
+ */
+void cgroup_rstat_css_flush(struct cgroup_subsys_state *css, int cpu)
+{
+    raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
+
+    raw_spin_lock_irq(cpu_lock);
+    rcu_read_lock();
+    css->ss->css_rstat_flush(css, cpu);
+    rcu_read_unlock();
+    raw_spin_unlock_irq(cpu_lock);
+}
+
  int cgroup_rstat_init(struct cgroup *cgrp)
  {
      int cpu;
--

Patch

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 946592249795..63569b05db0d 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -59,6 +59,37 @@  static struct workqueue_struct *blkcg_punt_bio_wq;
 
 #define BLKG_DESTROY_BATCH_SIZE  64
 
+/*
+ * Lockless lists for tracking IO stats update
+ *
+ * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
+ * There are multiple blkg's (one for each block device) attached to each
+ * blkcg. The rstat code keeps track of which cpu has IO stats updated,
+ * but it doesn't know which blkg has the updated stats. If there are many
+ * block devices in a system, the cost of iterating all the blkg's to flush
+ * out the IO stats can be high. To reduce such overhead, a set of percpu
+ * lockless lists (lhead) per blkcg are used to track the set of recently
+ * updated iostat_cpu's since the last flush. An iostat_cpu will be put
+ * onto the lockless list on the update side [blk_cgroup_bio_start()] if
+ * not there yet and then removed when being flushed [blkcg_rstat_flush()].
+ * References to blkg are gotten and then put back in the process to
+ * protect against blkg removal.
+ *
+ * Return: 0 if successful or -ENOMEM if allocation fails.
+ */
+static int init_blkcg_llists(struct blkcg *blkcg)
+{
+	int cpu;
+
+	blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
+	if (!blkcg->lhead)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu)
+		init_sllist_head(per_cpu_ptr(blkcg->lhead, cpu));
+	return 0;
+}
+
 /**
  * blkcg_css - find the current css
  *
@@ -236,8 +267,10 @@  static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 	blkg->blkcg = blkcg;
 
 	u64_stats_init(&blkg->iostat.sync);
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
 		u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
+		per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
+	}
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
@@ -864,7 +897,9 @@  static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
 static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
-	struct blkcg_gq *blkg;
+	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+	struct llist_node *lnode;
+	struct blkg_iostat_set *bisc, *next_bisc;
 
 	/* Root-level stats are sourced from system-wide IO stats */
 	if (!cgroup_parent(css->cgroup))
@@ -872,12 +907,21 @@  static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 
 	rcu_read_lock();
 
-	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+	lnode = sllist_del_all(lhead);
+	if (!lnode)
+		goto out;
+
+	/*
+	 * Iterate only the iostat_cpu's queued in the lockless list.
+	 */
+	llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
+		struct blkcg_gq *blkg = bisc->blkg;
 		struct blkcg_gq *parent = blkg->parent;
-		struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
 		struct blkg_iostat cur;
 		unsigned int seq;
 
+		WRITE_ONCE(bisc->lnode.next, NULL);
+
 		/* fetch the current per-cpu values */
 		do {
 			seq = u64_stats_fetch_begin(&bisc->sync);
@@ -890,8 +934,10 @@  static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 		if (parent && parent->parent)
 			blkcg_iostat_update(parent, &blkg->iostat.cur,
 					    &blkg->iostat.last);
+		percpu_ref_put(&blkg->refcnt);
 	}
 
+out:
 	rcu_read_unlock();
 }
 
@@ -1170,6 +1216,7 @@  static void blkcg_css_free(struct cgroup_subsys_state *css)
 
 	mutex_unlock(&blkcg_pol_mutex);
 
+	free_percpu(blkcg->lhead);
 	kfree(blkcg);
 }
 
@@ -1189,6 +1236,9 @@  blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 			goto unlock;
 	}
 
+	if (init_blkcg_llists(blkcg))
+		goto free_blkcg;
+
 	for (i = 0; i < BLKCG_MAX_POLS ; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
 		struct blkcg_policy_data *cpd;
@@ -1229,7 +1279,8 @@  blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 	for (i--; i >= 0; i--)
 		if (blkcg->cpd[i])
 			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
-
+	free_percpu(blkcg->lhead);
+free_blkcg:
 	if (blkcg != &blkcg_root)
 		kfree(blkcg);
 unlock:
@@ -1990,6 +2041,7 @@  static int blk_cgroup_io_type(struct bio *bio)
 
 void blk_cgroup_bio_start(struct bio *bio)
 {
+	struct blkcg *blkcg = bio->bi_blkg->blkcg;
 	int rwd = blk_cgroup_io_type(bio), cpu;
 	struct blkg_iostat_set *bis;
 	unsigned long flags;
@@ -2008,9 +2060,20 @@  void blk_cgroup_bio_start(struct bio *bio)
 	}
 	bis->cur.ios[rwd]++;
 
+	/*
+	 * If the iostat_cpu isn't in a lockless list, put it into the
+	 * list to indicate that a stat update is pending.
+	 */
+	if (!READ_ONCE(bis->lnode.next)) {
+		struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
+
+		llist_add(&bis->lnode, lhead);
+		percpu_ref_get(&bis->blkg->refcnt);
+	}
+
 	u64_stats_update_end_irqrestore(&bis->sync, flags);
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys))
-		cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
+		cgroup_rstat_updated(blkcg->css.cgroup, cpu);
 	put_cpu();
 }
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index d2724d1dd7c9..0968b6c8ea12 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -18,6 +18,7 @@ 
 #include <linux/cgroup.h>
 #include <linux/kthread.h>
 #include <linux/blk-mq.h>
+#include <linux/llist.h>
 
 struct blkcg_gq;
 struct blkg_policy_data;
@@ -43,6 +44,8 @@  struct blkg_iostat {
 
 struct blkg_iostat_set {
 	struct u64_stats_sync		sync;
+	struct llist_node		lnode;
+	struct blkcg_gq		       *blkg;
 	struct blkg_iostat		cur;
 	struct blkg_iostat		last;
 };
@@ -97,6 +100,12 @@  struct blkcg {
 	struct blkcg_policy_data	*cpd[BLKCG_MAX_POLS];
 
 	struct list_head		all_blkcgs_node;
+
+	/*
+	 * List of updated percpu blkg_iostat_set's since the last flush.
+	 */
+	struct llist_head __percpu	*lhead;
+
 #ifdef CONFIG_BLK_CGROUP_FC_APPID
 	char                            fc_app_id[FC_APPID_LEN];
 #endif