diff mbox series

[v8,06/11] fs: add percpu counters for significant multigrain timestamp events

Message ID 20240914-mgtime-v8-6-5bd872330bed@kernel.org (mailing list archive)
State New, archived
Headers show
Series fs: multigrain timestamp redux | expand

Commit Message

Jeff Layton Sept. 14, 2024, 5:07 p.m. UTC
Add new percpu counters for counting various stats around mgtimes, and a new
debugfs file for displaying them when CONFIG_DEBUG_FS is enabled:

- number of attempted ctime updates
- number of successful i_ctime_nsec swaps
- number of fine-grained timestamp fetches
- number of coarse-grained floor swaps

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/inode.c                         | 76 ++++++++++++++++++++++++++++++++++++--
 include/linux/timekeeping.h        |  1 +
 kernel/time/timekeeping.c          |  3 +-
 kernel/time/timekeeping_debug.c    | 12 ++++++
 kernel/time/timekeeping_internal.h |  3 ++
 5 files changed, 90 insertions(+), 5 deletions(-)

Comments

Thomas Gleixner Sept. 16, 2024, 10:20 a.m. UTC | #1
On Sat, Sep 14 2024 at 13:07, Jeff Layton wrote:
>  fs/inode.c                         | 76 ++++++++++++++++++++++++++++++++++++--
>  include/linux/timekeeping.h        |  1 +
>  kernel/time/timekeeping.c          |  3 +-
>  kernel/time/timekeeping_debug.c    | 12 ++++++
>  kernel/time/timekeeping_internal.h |  3 ++

So the subject says 'fs:'. This is not how it works.

Provide the timekeeping changes in a separate patch and then add the fs
voodoo. Documentation is pretty clear about this, no?

> diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
> index b73e8850e58d..9a3792072762 100644
> --- a/kernel/time/timekeeping_debug.c
> +++ b/kernel/time/timekeeping_debug.c
> @@ -17,6 +17,9 @@
>  
>  #define NUM_BINS 32
>  
> +/* incremented every time mg_floor is updated */

Sentences start with an uppercase letter.

> +DEFINE_PER_CPU(long, mg_floor_swaps);

Why is this long? This is a counter which always counts up.

>  static unsigned int sleep_time_bin[NUM_BINS] = {0};
>  
>  static int tk_debug_sleep_time_show(struct seq_file *s, void *data)
> @@ -53,3 +56,12 @@ void tk_debug_account_sleep_time(const struct timespec64 *t)
>  			   (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
>  }
>  
> +long get_mg_floor_swaps(void)

Can we please have a proper subsystem prefix and not this get_*()
notation. It's horrible to grep for. timekeeping_mg_get_...() makes it
clear where this function belongs, no?

> +{
> +	int i;
> +	long sum = 0;

https://www.kernel.org/doc/html/latest/process/maintainer-tip.html#variable-declarations

Also please use 'cpu' instead of 'i'. Self explanatory variable names
have a value.

> +	for_each_possible_cpu(i)
> +		sum += per_cpu(mg_floor_swaps, i);

This needs annotation for kcsan as this is a racy access.

> +	return sum < 0 ? 0 : sum;
> +}
> diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
> index 4ca2787d1642..2b49332b45a5 100644
> --- a/kernel/time/timekeeping_internal.h
> +++ b/kernel/time/timekeeping_internal.h
> @@ -11,8 +11,11 @@
>   */
>  #ifdef CONFIG_DEBUG_FS
>  extern void tk_debug_account_sleep_time(const struct timespec64 *t);
> +DECLARE_PER_CPU(long, mg_floor_swaps);
> +#define mgtime_counter_inc(__var)	this_cpu_inc(__var)

Please use static inlines for this.

Thanks,

        tglx
diff mbox series

Patch

diff --git a/fs/inode.c b/fs/inode.c
index d7da9d06921f..1f0487104c71 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -21,6 +21,8 @@ 
 #include <linux/list_lru.h>
 #include <linux/iversion.h>
 #include <linux/rw_hint.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
 #include <trace/events/writeback.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/timestamp.h>
@@ -101,6 +103,70 @@  long get_nr_dirty_inodes(void)
 	return nr_dirty > 0 ? nr_dirty : 0;
 }
 
+#ifdef CONFIG_DEBUG_FS
+static DEFINE_PER_CPU(long, mg_ctime_updates);
+static DEFINE_PER_CPU(long, mg_fine_stamps);
+static DEFINE_PER_CPU(long, mg_ctime_swaps);
+
+static long get_mg_ctime_updates(void)
+{
+	int i;
+	long sum = 0;
+
+	for_each_possible_cpu(i)
+		sum += per_cpu(mg_ctime_updates, i);
+	return sum < 0 ? 0 : sum;
+}
+
+static long get_mg_fine_stamps(void)
+{
+	int i;
+	long sum = 0;
+
+	for_each_possible_cpu(i)
+		sum += per_cpu(mg_fine_stamps, i);
+	return sum < 0 ? 0 : sum;
+}
+
+static long get_mg_ctime_swaps(void)
+{
+	int i;
+	long sum = 0;
+
+	for_each_possible_cpu(i)
+		sum += per_cpu(mg_ctime_swaps, i);
+	return sum < 0 ? 0 : sum;
+}
+
+#define mgtime_counter_inc(__var)	this_cpu_inc(__var)
+
+static int mgts_show(struct seq_file *s, void *p)
+{
+	long ctime_updates = get_mg_ctime_updates();
+	long ctime_swaps = get_mg_ctime_swaps();
+	long fine_stamps = get_mg_fine_stamps();
+	long floor_swaps = get_mg_floor_swaps();
+
+	seq_printf(s, "%ld %ld %ld %ld\n",
+		   ctime_updates, ctime_swaps, fine_stamps, floor_swaps);
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(mgts);
+
+static int __init mg_debugfs_init(void)
+{
+	debugfs_create_file("multigrain_timestamps", S_IFREG | S_IRUGO, NULL, NULL, &mgts_fops);
+	return 0;
+}
+late_initcall(mg_debugfs_init);
+
+#else /* ! CONFIG_DEBUG_FS */
+
+#define mgtime_counter_inc(__var)	do { } while (0) /* no-op without debugfs */
+
+#endif /* CONFIG_DEBUG_FS */
+
 /*
  * Handle nr_inode sysctl
  */
@@ -2655,10 +2721,9 @@  EXPORT_SYMBOL(timestamp_truncate);
  *
  * If it is multigrain, then we first see if the coarse-grained timestamp is
  * distinct from what we have. If so, then we'll just use that. If we have to
- * get a fine-grained timestamp, then do so, and try to swap it into the floor.
- * We accept the new floor value regardless of the outcome of the cmpxchg.
- * After that, we try to swap the new value into i_ctime_nsec. Again, we take
- * the resulting ctime, regardless of the outcome of the swap.
+ * get a fine-grained timestamp, then do so. After that, we try to swap the new
+ * value into i_ctime_nsec. We take the resulting ctime, regardless of the
+ * outcome of the swap.
  */
 struct timespec64 inode_set_ctime_current(struct inode *inode)
 {
@@ -2687,8 +2752,10 @@  struct timespec64 inode_set_ctime_current(struct inode *inode)
 		if (timespec64_compare(&now, &ctime) <= 0) {
 			ktime_get_real_ts64_mg(&now);
 			now = timestamp_truncate(now, inode);
+			mgtime_counter_inc(mg_fine_stamps);
 		}
 	}
+	mgtime_counter_inc(mg_ctime_updates);
 
 	/* No need to cmpxchg if it's exactly the same */
 	if (cns == now.tv_nsec && inode->i_ctime_sec == now.tv_sec) {
@@ -2702,6 +2769,7 @@  struct timespec64 inode_set_ctime_current(struct inode *inode)
 		/* If swap occurred, then we're (mostly) done */
 		inode->i_ctime_sec = now.tv_sec;
 		trace_ctime_ns_xchg(inode, cns, now.tv_nsec, cur);
+		mgtime_counter_inc(mg_ctime_swaps);
 	} else {
 		/*
 		 * Was the change due to someone marking the old ctime QUERIED?
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 7aa85246c183..b9c8c597a073 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -48,6 +48,7 @@  extern void ktime_get_coarse_real_ts64(struct timespec64 *ts);
 /* Multigrain timestamp interfaces */
 extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts);
 extern void ktime_get_real_ts64_mg(struct timespec64 *ts);
+extern long get_mg_floor_swaps(void);
 
 void getboottime64(struct timespec64 *ts);
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 16937242b904..94b0219955a2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2440,7 +2440,7 @@  EXPORT_SYMBOL_GPL(ktime_get_coarse_real_ts64_mg);
  * regardless of the outcome of the swap. Note that this is a filesystem
  * specific interface and should be avoided outside of that context.
  */
-void ktime_get_real_ts64_mg(struct timespec64 *ts, u64 cookie)
+void ktime_get_real_ts64_mg(struct timespec64 *ts)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	ktime_t old = atomic64_read(&mg_floor);
@@ -2464,6 +2464,7 @@  void ktime_get_real_ts64_mg(struct timespec64 *ts, u64 cookie)
 	if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
 		ts->tv_nsec = 0;
 		timespec64_add_ns(ts, nsecs);
+		mgtime_counter_inc(mg_floor_swaps);
 	} else {
 		/*
 		 * Something has changed mg_floor since "old" was
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index b73e8850e58d..9a3792072762 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -17,6 +17,9 @@ 
 
 #define NUM_BINS 32
 
+/* incremented every time mg_floor is updated */
+DEFINE_PER_CPU(long, mg_floor_swaps);
+
 static unsigned int sleep_time_bin[NUM_BINS] = {0};
 
 static int tk_debug_sleep_time_show(struct seq_file *s, void *data)
@@ -53,3 +56,12 @@  void tk_debug_account_sleep_time(const struct timespec64 *t)
 			   (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
 }
 
+long get_mg_floor_swaps(void)
+{
+	int i;
+	long sum = 0;
+
+	for_each_possible_cpu(i)
+		sum += per_cpu(mg_floor_swaps, i);
+	return sum < 0 ? 0 : sum;
+}
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 4ca2787d1642..2b49332b45a5 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -11,8 +11,11 @@ 
  */
 #ifdef CONFIG_DEBUG_FS
 extern void tk_debug_account_sleep_time(const struct timespec64 *t);
+DECLARE_PER_CPU(long, mg_floor_swaps);
+#define mgtime_counter_inc(__var)	this_cpu_inc(__var)
 #else
 #define tk_debug_account_sleep_time(x)
+#define mgtime_counter_inc(__var)	do { } while (0) /* no-op without debugfs */
 #endif
 
 #ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE