@@ -59,6 +59,56 @@ static struct workqueue_struct *blkcg_punt_bio_wq;
#define BLKG_DESTROY_BATCH_SIZE 64
+/*
+ * lnode.next of the last entry in a lockless list is NULL. To enable us to
+ * use lnode.next as a boolean flag to indicate whether an entry is on a
+ * lockless list, it has to be non-NULL for every entry that is on a list.
+ * This is done by putting a sentinel node at the end of each lockless list.
+ * All the percpu lheads are initialized to point to that sentinel node,
+ * which marks them as empty.
+ */
+static struct llist_node llist_last;
+
+static bool blkcg_llist_empty(struct llist_head *lhead)
+{
+ return lhead->first == &llist_last;
+}
+
+static void init_blkcg_llists(struct blkcg *blkcg)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(blkcg->lhead, cpu)->first = &llist_last;
+}
+
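+/*
+ * Atomically detach all the entries on a percpu lockless list by swapping
+ * in the sentinel node, leaving the list empty for later updaters.
+ */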
+static struct llist_node *fetch_delete_blkcg_llist(struct llist_head *lhead)
+{
+ return xchg(&lhead->first, &llist_last);
+}
+
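+/*
+ * Mark the entry as no longer on a lockless list by clearing lnode->next,
+ * drop the blkg reference taken when it was added, and return the old
+ * next pointer so the caller can keep walking the detached list.
+ */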
+static struct llist_node *fetch_delete_lnode_next(struct llist_node *lnode)
+{
+ struct llist_node *next = READ_ONCE(lnode->next);
+ struct blkcg_gq *blkg = llist_entry(lnode, struct blkg_iostat_set,
+ lnode)->blkg;
+
+ WRITE_ONCE(lnode->next, NULL);
+ percpu_ref_put(&blkg->refcnt);
+ return next;
+}
+
+/*
+ * The retrieved blkg_iostat_set is immediately marked as no longer being
+ * on the lockless list by clearing its node->next pointer. A parallel
+ * update may put it back onto the list before the iostats are finally
+ * flushed, in which case that flush will most likely include the new
+ * update as well.
+ */
+#define blkcg_llist_for_each_entry_safe(pos, node, nxt) \
+ for (; (node != &llist_last) && \
+ (pos = llist_entry(node, struct blkg_iostat_set, lnode), \
+ nxt = fetch_delete_lnode_next(node), true); \
+ node = nxt)
+
/**
* blkcg_css - find the current css
*
@@ -236,8 +286,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
blkg->blkcg = blkcg;
u64_stats_init(&blkg->iostat.sync);
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
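+ /* Let each percpu iostat_set point back to its owning blkg */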
+ per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
+ }
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
@@ -852,17 +904,23 @@ static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct blkcg *blkcg = css_to_blkcg(css);
- struct blkcg_gq *blkg;
+ struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+ struct llist_node *lnode, *lnext;
+ struct blkg_iostat_set *bisc;
/* Root-level stats are sourced from system-wide IO stats */
if (!cgroup_parent(css->cgroup))
return;
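+ /* Nothing to do if no blkg_iostat_set was updated since the last flush */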
+ if (blkcg_llist_empty(lhead))
+ return;
+
rcu_read_lock();
- hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+ lnode = fetch_delete_blkcg_llist(lhead);
+ blkcg_llist_for_each_entry_safe(bisc, lnode, lnext) {
+ struct blkcg_gq *blkg = bisc->blkg;
struct blkcg_gq *parent = blkg->parent;
- struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
struct blkg_iostat cur, delta;
unsigned long flags;
unsigned int seq;
@@ -1192,6 +1250,11 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
}
}
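+ /*
+ * Allocate the percpu lockless list heads and point them all at the
+ * sentinel node so that every list starts out empty.
+ */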
+ blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
+ if (!blkcg->lhead)
+ goto free_blkcg;
+ init_blkcg_llists(blkcg);
+
for (i = 0; i < BLKCG_MAX_POLS ; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
struct blkcg_policy_data *cpd;
@@ -1233,7 +1296,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
for (i--; i >= 0; i--)
if (blkcg->cpd[i])
blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
-
+ free_percpu(blkcg->lhead);
+free_blkcg:
if (blkcg != &blkcg_root)
kfree(blkcg);
unlock:
@@ -1997,6 +2061,7 @@ static int blk_cgroup_io_type(struct bio *bio)
void blk_cgroup_bio_start(struct bio *bio)
{
+ struct blkcg *blkcg = bio->bi_blkg->blkcg;
int rwd = blk_cgroup_io_type(bio), cpu;
struct blkg_iostat_set *bis;
unsigned long flags;
@@ -2015,9 +2080,16 @@ void blk_cgroup_bio_start(struct bio *bio)
}
bis->cur.ios[rwd]++;
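+ /*
+ * Thanks to the sentinel node, lnode.next is NULL only when this
+ * blkg_iostat_set is not yet on the percpu lockless list. Add it at
+ * most once and take a blkg reference that is held until the pending
+ * stats are flushed.
+ */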
+ if (!READ_ONCE(bis->lnode.next)) {
+ struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+
+ llist_add(&bis->lnode, lhead);
+ percpu_ref_get(&bis->blkg->refcnt);
+ }
+
u64_stats_update_end_irqrestore(&bis->sync, flags);
if (cgroup_subsys_on_dfl(io_cgrp_subsys))
- cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
+ cgroup_rstat_updated(blkcg->css.cgroup, cpu);
put_cpu();
}
@@ -18,6 +18,7 @@
#include <linux/cgroup.h>
#include <linux/kthread.h>
#include <linux/blk-mq.h>
+#include <linux/llist.h>
struct blkcg_gq;
struct blkg_policy_data;
@@ -43,6 +44,8 @@ struct blkg_iostat {
struct blkg_iostat_set {
struct u64_stats_sync sync;
+ struct llist_node lnode;	/* entry on the percpu blkcg->lhead list */
+ struct blkcg_gq *blkg;		/* owning blkg, for use by the flush path */
struct blkg_iostat cur;
struct blkg_iostat last;
};
@@ -97,6 +100,12 @@ struct blkcg {
struct blkcg_policy_data *cpd[BLKCG_MAX_POLS];
struct list_head all_blkcgs_node;
+
+ /*
+ * Lockless lists, one per CPU, of percpu blkg_iostat_sets that have
+ * been updated since the last flush.
+ */
+ struct llist_head __percpu *lhead;
+
#ifdef CONFIG_BLK_CGROUP_FC_APPID
char fc_app_id[FC_APPID_LEN];
#endif