@@ -690,7 +690,7 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
void cgroup_rstat_flush(struct cgroup *cgrp);
void cgroup_rstat_flush_hold(struct cgroup *cgrp);
-void cgroup_rstat_flush_release(void);
+void cgroup_rstat_flush_release(struct cgroup *cgrp);
/*
* Basic resource stats.
@@ -204,6 +204,54 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
TP_ARGS(cgrp, path, val)
);
+DECLARE_EVENT_CLASS(cgroup_rstat,
+
+ TP_PROTO(struct cgroup *cgrp, int cpu_in_loop, bool contended),
+
+ TP_ARGS(cgrp, cpu_in_loop, contended),
+
+ TP_STRUCT__entry(
+ __field( int, root )
+ __field( int, level )
+ __field( u64, id )
+ __field( int, cpu_in_loop )
+ __field( bool, contended )
+ ),
+
+ TP_fast_assign(
+ __entry->root = cgrp->root->hierarchy_id;
+ __entry->id = cgroup_id(cgrp);
+ __entry->level = cgrp->level;
+ __entry->cpu_in_loop = cpu_in_loop;
+ __entry->contended = contended;
+ ),
+
+ TP_printk("root=%d id=%llu level=%d cpu_in_loop=%d lock contended:%d",
+ __entry->root, __entry->id, __entry->level,
+ __entry->cpu_in_loop, __entry->contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
+
+ TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+ TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked,
+
+ TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+ TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
+
+ TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+ TP_ARGS(cgrp, cpu, contended)
+);
+
#endif /* _TRACE_CGROUP_H */
/* This part must be outside protection */
@@ -7,6 +7,8 @@
#include <linux/btf.h>
#include <linux/btf_ids.h>
+#include <trace/events/cgroup.h>
+
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
@@ -222,6 +224,35 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
__bpf_hook_end();
+/*
+ * Helper functions for locking cgroup_rstat_lock.
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments. The parameter @cpu_in_loop indicate lock
+ * was released and re-taken when collection data from the CPUs. The
+ * value -1 is used when obtaining the main lock else this is the CPU
+ * number processed last.
+ */
+static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
+ __acquires(&cgroup_rstat_lock)
+{
+ bool contended;
+
+ contended = !spin_trylock_irq(&cgroup_rstat_lock);
+ if (contended) {
+ trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
+ spin_lock_irq(&cgroup_rstat_lock);
+ }
+ trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
+}
+
+static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
+ __releases(&cgroup_rstat_lock)
+{
+ trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
+ spin_unlock_irq(&cgroup_rstat_lock);
+}
+
/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
@@ -248,10 +279,10 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
/* play nice and yield if necessary */
if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
- spin_unlock_irq(&cgroup_rstat_lock);
+ __cgroup_rstat_unlock(cgrp, cpu);
if (!cond_resched())
cpu_relax();
- spin_lock_irq(&cgroup_rstat_lock);
+ __cgroup_rstat_lock(cgrp, cpu);
}
}
}
@@ -273,9 +304,9 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
might_sleep();
- spin_lock_irq(&cgroup_rstat_lock);
+ __cgroup_rstat_lock(cgrp, -1);
cgroup_rstat_flush_locked(cgrp);
- spin_unlock_irq(&cgroup_rstat_lock);
+ __cgroup_rstat_unlock(cgrp, -1);
}
/**
@@ -291,17 +322,17 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp)
__acquires(&cgroup_rstat_lock)
{
might_sleep();
- spin_lock_irq(&cgroup_rstat_lock);
+ __cgroup_rstat_lock(cgrp, -1);
cgroup_rstat_flush_locked(cgrp);
}
/**
* cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
*/
-void cgroup_rstat_flush_release(void)
+void cgroup_rstat_flush_release(struct cgroup *cgrp)
__releases(&cgroup_rstat_lock)
{
- spin_unlock_irq(&cgroup_rstat_lock);
+ __cgroup_rstat_unlock(cgrp, -1);
}
int cgroup_rstat_init(struct cgroup *cgrp)
@@ -533,7 +564,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
#ifdef CONFIG_SCHED_CORE
forceidle_time = cgrp->bstat.forceidle_sum;
#endif
- cgroup_rstat_flush_release();
+ cgroup_rstat_flush_release(cgrp);
} else {
root_cgroup_cputime(&bstat);
usage = bstat.cputime.sum_exec_runtime;