diff mbox series

[RFC,5/9,v2] cgroup: separate locking between base css and others

Message ID 20250103015020.78547-6-inwardvessel@gmail.com (mailing list archive)
State New
Headers show
Series cgroup: separate per-subsystem rstat trees | expand

Commit Message

JP Kobryn Jan. 3, 2025, 1:50 a.m. UTC
Separate locks can be used to eliminate contention between subsystems that make
use of rstat. The base stats also get their own lock. Where applicable, check
for the existence of a subsystem pointer to determine if the given
cgroup_subsys_state is the base css or not for deciding which lock to take.

Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
---
 include/linux/cgroup-defs.h |  2 +
 kernel/cgroup/rstat.c       | 92 +++++++++++++++++++++++++------------
 2 files changed, 65 insertions(+), 29 deletions(-)
diff mbox series

Patch

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1932f8ae7995..4d87519ff023 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -790,6 +790,8 @@  struct cgroup_subsys {
 	 * specifies the mask of subsystems that this one depends on.
 	 */
 	unsigned int depends_on;
+
+	spinlock_t rstat_lock;
 };
 
 extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 4381eb9ac426..958bdccf0359 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -9,8 +9,9 @@ 
 
 #include <trace/events/cgroup.h>
 
-static DEFINE_SPINLOCK(cgroup_rstat_lock);
-static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
+static DEFINE_SPINLOCK(cgroup_rstat_base_lock);
+static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_base_cpu_lock);
+static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock[CGROUP_SUBSYS_COUNT]);
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
 
@@ -86,7 +87,7 @@  void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
 __bpf_kfunc void cgroup_rstat_updated(struct cgroup_subsys_state *css, int cpu)
 {
 	struct cgroup *cgrp = css->cgroup;
-	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
+	raw_spinlock_t *cpu_lock;
 	unsigned long flags;
 
 	/*
@@ -100,6 +101,11 @@  __bpf_kfunc void cgroup_rstat_updated(struct cgroup_subsys_state *css, int cpu)
 	if (data_race(css_rstat_cpu(css, cpu)->updated_next))
 		return;
 
+	if (css->ss)
+		cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock[css->ss->id], cpu);
+	else
+		cpu_lock = per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu);
+
 	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
 
 	/* put @cgrp and all ancestors on the corresponding updated lists */
@@ -207,11 +213,16 @@  static struct cgroup_subsys_state *cgroup_rstat_push_children(
 static struct cgroup_subsys_state *cgroup_rstat_updated_list(
 				struct cgroup_subsys_state *root, int cpu)
 {
-	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
 	struct cgroup_rstat_cpu *rstatc = css_rstat_cpu(root, cpu);
 	struct cgroup_subsys_state *head = NULL, *parent, *child;
+	raw_spinlock_t *cpu_lock;
 	unsigned long flags;
 
+	if (root->ss)
+		cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock[root->ss->id], cpu);
+	else
+		cpu_lock = per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu);
+
 	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root->cgroup, false);
 
 	/* Return NULL if this subtree is not on-list */
@@ -285,37 +296,44 @@  __bpf_hook_end();
  * number processed last.
  */
 static inline void __cgroup_rstat_lock(struct cgroup_subsys_state *css,
-				int cpu_in_loop)
-	__acquires(&cgroup_rstat_lock)
+				spinlock_t *lock, int cpu_in_loop)
+	__acquires(lock)
 {
 	struct cgroup *cgrp = css->cgroup;
 	bool contended;
 
-	contended = !spin_trylock_irq(&cgroup_rstat_lock);
+	contended = !spin_trylock_irq(lock);
 	if (contended) {
 		trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
-		spin_lock_irq(&cgroup_rstat_lock);
+		spin_lock_irq(lock);
 	}
 	trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
 }
 
 static inline void __cgroup_rstat_unlock(struct cgroup_subsys_state *css,
-				int cpu_in_loop)
-	__releases(&cgroup_rstat_lock)
+				spinlock_t *lock, int cpu_in_loop)
+	__releases(lock)
 {
 	struct cgroup *cgrp = css->cgroup;
 
 	trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
-	spin_unlock_irq(&cgroup_rstat_lock);
+	spin_unlock_irq(lock);
 }
 
 /* see cgroup_rstat_flush() */
 static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css)
-	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
+	__releases(&css->ss->rstat_lock) __acquires(&css->ss->rstat_lock)
 {
+	spinlock_t *lock;
 	int cpu;
 
-	lockdep_assert_held(&cgroup_rstat_lock);
+	if (!css->ss) {
+		pr_warn("cannot use generic flush on base subsystem\n");
+		return;
+	}
+
+	lock = &css->ss->rstat_lock;
+	lockdep_assert_held(lock);
 
 	for_each_possible_cpu(cpu) {
 		struct cgroup_subsys_state *pos = cgroup_rstat_updated_list(css, cpu);
@@ -334,11 +352,11 @@  static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css)
 		}
 
 		/* play nice and yield if necessary */
-		if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
-			__cgroup_rstat_unlock(css, cpu);
+		if (need_resched() || spin_needbreak(lock)) {
+			__cgroup_rstat_unlock(css, lock, cpu);
 			if (!cond_resched())
 				cpu_relax();
-			__cgroup_rstat_lock(css, cpu);
+			__cgroup_rstat_lock(css, lock, cpu);
 		}
 	}
 }
@@ -358,11 +376,22 @@  static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css)
  */
 __bpf_kfunc void cgroup_rstat_flush(struct cgroup_subsys_state *css)
 {
+	spinlock_t *lock;
+
+	if (!css->ss) {
+		int cpu;
+
+		for_each_possible_cpu(cpu)
+			cgroup_base_stat_flush(css->cgroup, cpu);
+		return;
+	}
+
 	might_sleep();
 
-	__cgroup_rstat_lock(css, -1);
+	lock = &css->ss->rstat_lock;
+	__cgroup_rstat_lock(css, lock, -1);
 	cgroup_rstat_flush_locked(css);
-	__cgroup_rstat_unlock(css, -1);
+	__cgroup_rstat_unlock(css, lock, -1);
 }
 
 /**
@@ -374,11 +403,11 @@  __bpf_kfunc void cgroup_rstat_flush(struct cgroup_subsys_state *css)
  *
  * This function may block.
  */
-void cgroup_rstat_flush_hold(struct cgroup_subsys_state *css)
-	__acquires(&cgroup_rstat_lock)
+static void cgroup_rstat_base_flush_hold(struct cgroup_subsys_state *css)
+	__acquires(&cgroup_rstat_base_lock)
 {
 	might_sleep();
-	__cgroup_rstat_lock(css, -1);
+	__cgroup_rstat_lock(css, &cgroup_rstat_base_lock, -1);
 	cgroup_rstat_flush_locked(css);
 }
 
@@ -386,10 +415,10 @@  void cgroup_rstat_flush_hold(struct cgroup_subsys_state *css)
  * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
  * @cgrp: cgroup used by tracepoint
  */
-void cgroup_rstat_flush_release(struct cgroup_subsys_state *css)
-	__releases(&cgroup_rstat_lock)
+static void cgroup_rstat_base_flush_release(struct cgroup_subsys_state *css)
+	__releases(&cgroup_rstat_base_lock)
 {
-	__cgroup_rstat_unlock(css, -1);
+	__cgroup_rstat_unlock(css, &cgroup_rstat_base_lock, -1);
 }
 
 int cgroup_rstat_init(struct cgroup_subsys_state *css)
@@ -435,10 +464,15 @@  void cgroup_rstat_exit(struct cgroup_subsys_state *css)
 
 void __init cgroup_rstat_boot(void)
 {
-	int cpu;
+	struct cgroup_subsys *ss;
+	int cpu, ssid;
+
+	for_each_possible_cpu(cpu) {
+		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu));
 
-	for_each_possible_cpu(cpu)
-		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
+		for_each_subsys(ss, ssid)
+			raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock[ssid], cpu));
+	}
 }
 
 /*
@@ -629,12 +663,12 @@  void cgroup_base_stat_cputime_show(struct seq_file *seq)
 	u64 usage, utime, stime, ntime;
 
 	if (cgroup_parent(cgrp)) {
-		cgroup_rstat_flush_hold(css);
+		cgroup_rstat_base_flush_hold(css);
 		usage = cgrp->bstat.cputime.sum_exec_runtime;
 		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
 			       &utime, &stime);
 		ntime = cgrp->bstat.ntime;
-		cgroup_rstat_flush_release(css);
+		cgroup_rstat_base_flush_release(css);
 	} else {
 		/* cgrp->bstat of root is not actually used, reuse it */
 		root_cgroup_cputime(&cgrp->bstat);